Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "../../include/fpdfapi/fpdf_page.h"
      8 #include "../../include/fpdfapi/fpdf_pageobj.h"
      9 #include "../../include/fpdftext/fpdf_text.h"
     10 #include "txtproc.h"
     11 #include "text_int.h"
     12 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR);
     13 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar)
     14 {
     15     if (destcp == 0) {
     16         if (unicode < 0x80) {
     17             return CFX_ByteString((char)unicode);
     18         }
     19         FX_LPCSTR altstr = FCS_GetAltStr(unicode);
     20         if (altstr) {
     21             return CFX_ByteString(altstr, -1);
     22         }
     23         return CFX_ByteString(defchar, -1);
     24     }
     25     FX_BOOL bDef = FALSE;
     26     char buf[10];
     27     int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
     28     if (ret && !bDef) {
     29         return CFX_ByteString(buf, ret);
     30     }
     31     FX_LPCSTR altstr = FCS_GetAltStr(unicode);
     32     if (altstr) {
     33         return CFX_ByteString(altstr, -1);
     34     }
     35     return CFX_ByteString(defchar, -1);
     36 }
     37 CTextPage::CTextPage()
     38 {
     39 }
     40 CTextPage::~CTextPage()
     41 {
     42     int i;
     43     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
     44         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
     45         delete pBaseLine;
     46     }
     47     for (i = 0; i < m_TextColumns.GetSize(); i ++) {
     48         CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
     49         delete pTextColumn;
     50     }
     51 }
     52 void CTextPage::ProcessObject(CPDF_PageObject* pObject)
     53 {
     54     if (pObject->m_Type != PDFPAGE_TEXT) {
     55         return;
     56     }
     57     CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
     58     CPDF_Font* pFont = pText->m_TextState.GetFont();
     59     int count = pText->CountItems();
     60     FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2);
     61     pText->CalcCharPos(pPosArray);
     62 
     63     FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
     64     FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
     65     FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
     66     FX_FLOAT spacew = 0;
     67     if (space_charcode != -1) {
     68         spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
     69     }
     70     if (spacew == 0) {
     71         spacew = fontsize_h / 4;
     72     }
     73     if (pText->m_TextState.GetBaselineAngle() != 0) {
     74         int cc = 0;
     75         CFX_AffineMatrix matrix;
     76         pText->GetTextMatrix(&matrix);
     77         for (int i = 0; i < pText->m_nChars; i ++) {
     78             FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
     79             if (charcode == (FX_DWORD) - 1) {
     80                 continue;
     81             }
     82             FX_RECT char_box;
     83             pFont->GetCharBBox(charcode, char_box);
     84             FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000;
     85             FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000;
     86             FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000;
     87             FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
     88             cc ++;
     89             FX_FLOAT char_origx, char_origy;
     90             matrix.Transform(char_left, 0, char_origx, char_origy);
     91             matrix.TransformRect(char_left, char_right, char_top, char_bottom);
     92             CFX_ByteString str;
     93             pFont->AppendChar(str, charcode);
     94             InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
     95                           char_bottom, spacew, fontsize_v, str, pFont);
     96         }
     97         if (pPosArray) {
     98             FX_Free(pPosArray);
     99         }
    100         return;
    101     }
    102     FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
    103     for (int ii = 0; ii < count * 2; ii ++) {
    104         pPosArray[ii] *= ratio_h;
    105     }
    106     FX_FLOAT baseline = pText->m_PosY;
    107     CTextBaseLine* pBaseLine = NULL;
    108     FX_FLOAT topy = pText->m_Top;
    109     FX_FLOAT bottomy = pText->m_Bottom;
    110     FX_FLOAT leftx = pText->m_Left;
    111     int cc = 0;
    112     CFX_ByteString segment;
    113     int space_count = 0;
    114     FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
    115     for (int i = 0; i < pText->m_nChars; i ++) {
    116         FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
    117         if (charcode == (FX_DWORD) - 1) {
    118             continue;
    119         }
    120         FX_FLOAT char_left = pPosArray[cc * 2];
    121         FX_FLOAT char_right = pPosArray[cc * 2 + 1];
    122         cc ++;
    123         if (char_left < last_left || (char_left - last_right) > spacew / 2) {
    124             pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
    125                                       topy, bottomy, spacew, fontsize_v, segment, pFont);
    126             segment_left = char_left;
    127             segment = "";
    128         }
    129         if (space_count > 1) {
    130             pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
    131                                       topy, bottomy, spacew, fontsize_v, segment, pFont);
    132             segment = "";
    133         } else if (space_count == 1) {
    134             pFont->AppendChar(segment, ' ');
    135         }
    136         if (segment.GetLength() == 0) {
    137             segment_left = char_left;
    138         }
    139         segment_right = char_right;
    140         pFont->AppendChar(segment, charcode);
    141         space_count = 0;
    142         last_left = char_left;
    143         last_right = char_right;
    144     }
    145     if (segment.GetLength())
    146         pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
    147                                   topy, bottomy, spacew, fontsize_v, segment, pFont);
    148     FX_Free(pPosArray);
    149 }
    150 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx,
    151                                         FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v,
    152                                         CFX_ByteString& str, CPDF_Font* pFont)
    153 {
    154     if (str.GetLength() == 0) {
    155         return NULL;
    156     }
    157     if (pBaseLine == NULL) {
    158         int i;
    159         for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    160             CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    161             if (pExistLine->m_BaseLine == basey) {
    162                 pBaseLine = pExistLine;
    163                 break;
    164             }
    165             if (pExistLine->m_BaseLine < basey) {
    166                 break;
    167             }
    168         }
    169         if (pBaseLine == NULL) {
    170             pBaseLine = new CTextBaseLine;
    171             pBaseLine->m_BaseLine = basey;
    172             m_BaseLines.InsertAt(i, pBaseLine);
    173         }
    174     }
    175     CFX_WideString text;
    176     FX_LPCSTR pStr = str;
    177     int len = str.GetLength(), offset = 0;
    178     while (offset < len) {
    179         FX_DWORD ch = pFont->GetNextChar(pStr, len, offset);
    180         CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
    181         if (unicode_str.IsEmpty()) {
    182             text += (FX_WCHAR)ch;
    183         }
    184         else {
    185             text += unicode_str;
    186         }
    187     }
    188     pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text);
    189     return pBaseLine;
    190 }
    191 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth)
    192 {
    193     FX_FLOAT lastheight = -1;
    194     FX_FLOAT lastbaseline = -1;
    195     FX_FLOAT MinLeftX = 1000000;
    196     FX_FLOAT MaxRightX = 0;
    197     int i;
    198     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    199         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    200         FX_FLOAT leftx, rightx;
    201         if (pBaseLine->GetWidth(leftx, rightx)) {
    202             if (leftx < MinLeftX) {
    203                 MinLeftX = leftx;
    204             }
    205             if (rightx > MaxRightX) {
    206                 MaxRightX = rightx;
    207             }
    208         }
    209     }
    210     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    211         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    212         pBaseLine->MergeBoxes();
    213     }
    214     for (i = 1; i < m_BaseLines.GetSize(); i ++) {
    215         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    216         CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
    217         if (pBaseLine->CanMerge(pPrevLine)) {
    218             pPrevLine->Merge(pBaseLine);
    219             delete pBaseLine;
    220             m_BaseLines.RemoveAt(i);
    221             i --;
    222         }
    223     }
    224     if (m_bAutoWidth) {
    225         int* widths = FX_Alloc(int, m_BaseLines.GetSize());
    226         for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    227             widths[i] = 0;
    228             CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    229             int TotalChars = 0;
    230             FX_FLOAT TotalWidth = 0;
    231             int minchars;
    232             pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
    233             if (TotalChars) {
    234                 FX_FLOAT charwidth = TotalWidth / TotalChars;
    235                 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
    236             }
    237             if (widths[i] > 1000) {
    238                 widths[i] = 1000;
    239             }
    240             if (widths[i] < minchars) {
    241                 widths[i] = minchars;
    242             }
    243         }
    244         int AvgWidth = 0, widthcount = 0;
    245         for (i = 0; i < m_BaseLines.GetSize(); i ++)
    246             if (widths[i]) {
    247                 AvgWidth += widths[i];
    248                 widthcount ++;
    249             }
    250         AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
    251         int MaxWidth = 0;
    252         for (i = 0; i < m_BaseLines.GetSize(); i ++)
    253             if (MaxWidth < widths[i]) {
    254                 MaxWidth = widths[i];
    255             }
    256         if (MaxWidth > AvgWidth * 6 / 5) {
    257             MaxWidth = AvgWidth * 6 / 5;
    258         }
    259         FX_Free(widths);
    260         if (iMinWidth < MaxWidth) {
    261             iMinWidth = MaxWidth;
    262         }
    263     }
    264     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    265         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    266         pBaseLine->MergeBoxes();
    267     }
    268     if (m_bKeepColumn) {
    269         FindColumns();
    270     }
    271     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    272         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    273         if (lastheight >= 0) {
    274             FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
    275             if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
    276                 lines.Add(L"");
    277             }
    278         }
    279         lastheight = pBaseLine->m_MaxFontSizeV;
    280         lastbaseline = pBaseLine->m_BaseLine;
    281         CFX_WideString str;
    282         pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
    283         lines.Add(str);
    284     }
    285 }
    286 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest)
    287 {
    288     wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
    289     FX_LPWSTR pDst = NULL;
    290     FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
    291     if (nCount < 1 ) {
    292         sDest += wChar;
    293         return;
    294     }
    295     pDst = new FX_WCHAR[nCount];
    296     FX_Unicode_GetNormalization(wChar, pDst);
    297     for (int nIndex = 0; nIndex < nCount; nIndex++) {
    298         sDest += pDst[nIndex];
    299     }
    300     delete[] pDst;
    301 }
    302 void NormalizeString(CFX_WideString& str)
    303 {
    304     if (str.GetLength() <= 0) {
    305         return;
    306     }
    307     CFX_WideString sBuffer;
    308     IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
    309     if (NULL == BidiChar)	{
    310         return;
    311     }
    312     CFX_WordArray order;
    313     FX_BOOL bR2L = FALSE;
    314     FX_INT32 start = 0, count = 0, i = 0;
    315     int nR2L = 0, nL2R = 0;
    316     for (i = 0; i < str.GetLength(); i++) {
    317         if(BidiChar->AppendChar(str.GetAt(i))) {
    318             FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
    319             order.Add(start);
    320             order.Add(count);
    321             order.Add(ret);
    322             if(!bR2L) {
    323                 if(ret == 2) {
    324                     nR2L++;
    325                 } else if (ret == 1) {
    326                     nL2R++;
    327                 }
    328             }
    329         }
    330     }
    331     if(BidiChar->EndChar()) {
    332         FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
    333         order.Add(start);
    334         order.Add(count);
    335         order.Add(ret);
    336         if(!bR2L) {
    337             if(ret == 2) {
    338                 nR2L++;
    339             } else if(ret == 1) {
    340                 nL2R++;
    341             }
    342         }
    343     }
    344     if(nR2L > 0 && nR2L >= nL2R) {
    345         bR2L = TRUE;
    346     }
    347     if(bR2L) {
    348         int count = order.GetSize();
    349         for(int j = count - 1; j > 0; j -= 3) {
    350             int ret = order.GetAt(j);
    351             int start = order.GetAt(j - 2);
    352             int count1 = order.GetAt(j - 1);
    353             if(ret == 2 || ret == 0) {
    354                 for(int i = start + count1 - 1; i >= start; i--) {
    355                     NormalizeCompositeChar(str[i], sBuffer);
    356                 }
    357             } else {
    358                 i = j;
    359                 FX_BOOL bSymbol = FALSE;
    360                 while(i > 0 && order.GetAt(i) != 2) {
    361                     bSymbol = !order.GetAt(i);
    362                     i -= 3;
    363                 }
    364                 int end = start + count1 ;
    365                 int n = 0;
    366                 if(bSymbol) {
    367                     n = i + 6;
    368                 } else {
    369                     n = i + 3;
    370                 }
    371                 if(n >= j) {
    372                     for(int m = start; m < end; m++) {
    373                         sBuffer += str[m];
    374                     }
    375                 } else {
    376                     i = j;
    377                     j = n;
    378                     for(; n <= i; n += 3) {
    379                         int start = order.GetAt(n - 2);
    380                         int count1 = order.GetAt(n - 1);
    381                         int end = start + count1 ;
    382                         for(int m = start; m < end; m++) {
    383                             sBuffer += str[m];
    384                         }
    385                     }
    386                 }
    387             }
    388         }
    389     } else {
    390         int count = order.GetSize();
    391         FX_BOOL bL2R = FALSE;
    392         for(int j = 0; j < count; j += 3) {
    393             int ret = order.GetAt(j + 2);
    394             int start = order.GetAt(j);
    395             int count1 = order.GetAt(j + 1);
    396             if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
    397                 int i = j + 3;
    398                 while(bR2L && i < count) {
    399                     if(order.GetAt(i + 2) == 1) {
    400                         break;
    401                     } else {
    402                         i += 3;
    403                     }
    404                 }
    405                 if(i == 3) {
    406                     j = -3;
    407                     bL2R = TRUE;
    408                     continue;
    409                 }
    410                 int end = str.GetLength() - 1;
    411                 if(i < count) {
    412                     end = order.GetAt(i) - 1;
    413                 }
    414                 j = i - 3;
    415                 for(int n = end; n >= start; n--) {
    416                     NormalizeCompositeChar(str[i], sBuffer);
    417                 }
    418             } else {
    419                 int end = start + count1 ;
    420                 for(int i = start; i < end; i++) {
    421                     sBuffer += str[i];
    422                 }
    423             }
    424         }
    425     }
    426     str.Empty();
    427     str += sBuffer;
    428     BidiChar->Release();
    429 }
    430 static FX_BOOL IsNumber(CFX_WideString& str)
    431 {
    432     for (int i = 0; i < str.GetLength(); i ++) {
    433         FX_WCHAR ch = str[i];
    434         if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
    435             return FALSE;
    436         }
    437     }
    438     return TRUE;
    439 }
    440 void CTextPage::FindColumns()
    441 {
    442     int i;
    443     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    444         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    445         for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
    446             CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
    447             CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
    448             if (pColumn == NULL) {
    449                 pColumn = new CTextColumn;
    450                 pColumn->m_Count = 1;
    451                 pColumn->m_AvgPos = pTextBox->m_Right;
    452                 pColumn->m_TextPos = -1;
    453                 m_TextColumns.Add(pColumn);
    454             } else {
    455                 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
    456                                     (pColumn->m_Count + 1);
    457                 pColumn->m_Count ++;
    458             }
    459         }
    460     }
    461     int mincount = m_BaseLines.GetSize() / 4;
    462     for (i = 0; i < m_TextColumns.GetSize(); i ++) {
    463         CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
    464         if (pTextColumn->m_Count >= mincount) {
    465             continue;
    466         }
    467         delete pTextColumn;
    468         m_TextColumns.RemoveAt(i);
    469         i --;
    470     }
    471     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    472         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    473         for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
    474             CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
    475             if (IsNumber(pTextBox->m_Text)) {
    476                 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
    477             }
    478         }
    479     }
    480 }
    481 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
    482 {
    483     for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
    484         CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
    485         if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
    486             return pColumn;
    487         }
    488     }
    489     return NULL;
    490 }
    491 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
    492 {
    493 }
    494 CTextBaseLine::CTextBaseLine()
    495 {
    496     m_Top = -100000;
    497     m_Bottom = 100000;
    498     m_MaxFontSizeV = 0;
    499 }
    500 CTextBaseLine::~CTextBaseLine()
    501 {
    502     for (int i = 0; i < m_TextList.GetSize(); i ++) {
    503         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    504         delete pText;
    505     }
    506 }
    507 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy,
    508                                   FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text)
    509 {
    510     if (m_Top < topy) {
    511         m_Top = topy;
    512     }
    513     if (m_Bottom > bottomy) {
    514         m_Bottom = bottomy;
    515     }
    516     if (m_MaxFontSizeV < fontsize_v) {
    517         m_MaxFontSizeV = fontsize_v;
    518     }
    519     int i;
    520     for (i = 0; i < m_TextList.GetSize(); i ++) {
    521         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    522         if (pText->m_Left > leftx) {
    523             break;
    524         }
    525     }
    526     CTextBox* pText = new CTextBox;
    527     pText->m_Text = text;
    528     pText->m_Left = leftx;
    529     pText->m_Right = rightx;
    530     pText->m_Top = topy;
    531     pText->m_Bottom = bottomy;
    532     pText->m_SpaceWidth = spacew;
    533     pText->m_FontSizeV = fontsize_v;
    534     pText->m_pColumn = NULL;
    535     m_TextList.InsertAt(i, pText);
    536 }
    537 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2,
    538                         FX_FLOAT& interlow, FX_FLOAT& interhigh);
    539 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
    540 {
    541     FX_FLOAT inter_top, inter_bottom;
    542     if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
    543                          inter_bottom, inter_top)) {
    544         return FALSE;
    545     }
    546     FX_FLOAT inter_h = inter_top - inter_bottom;
    547     if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
    548         return FALSE;
    549     }
    550     FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
    551     for (int i = 0; i < m_TextList.GetSize(); i ++) {
    552         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    553         for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
    554             CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
    555             FX_FLOAT inter_left, inter_right;
    556             if (!GetIntersection(pText->m_Left, pText->m_Right,
    557                                  pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) {
    558                 continue;
    559             }
    560             FX_FLOAT inter_w = inter_right - inter_left;
    561             if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) {
    562                 continue;
    563             }
    564             if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
    565                     dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
    566                 return FALSE;
    567             }
    568         }
    569     }
    570     return TRUE;
    571 }
    572 void CTextBaseLine::Merge(CTextBaseLine* pOther)
    573 {
    574     for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
    575         CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
    576         InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
    577                       pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
    578     }
    579 }
    580 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
    581 {
    582     int i;
    583     for (i = 0; i < m_TextList.GetSize(); i ++) {
    584         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    585         if (pText->m_Text != L" ") {
    586             break;
    587         }
    588     }
    589     if (i == m_TextList.GetSize()) {
    590         return FALSE;
    591     }
    592     CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    593     leftx = pText->m_Left;
    594     for (i = m_TextList.GetSize() - 1; i >= 0; i --) {
    595         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    596         if (pText->m_Text != L" ") {
    597             break;
    598         }
    599     }
    600     pText = (CTextBox*)m_TextList.GetAt(i);
    601     rightx = pText->m_Right;
    602     return TRUE;
    603 }
    604 void CTextBaseLine::MergeBoxes()
    605 {
    606     int i = 0;
    607     while (1) {
    608         if (i >= m_TextList.GetSize() - 1) {
    609             break;
    610         }
    611         CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
    612         CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
    613         FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
    614         FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ?
    615                           pNextText->m_SpaceWidth : pThisText->m_SpaceWidth;
    616         if (spacew > 0.0 && dx < spacew * 2) {
    617             pThisText->m_Right = pNextText->m_Right;
    618             if (dx > spacew * 1.5) {
    619                 pThisText->m_Text += L"  ";
    620             } else if (dx > spacew / 3) {
    621                 pThisText->m_Text += L' ';
    622             }
    623             pThisText->m_Text += pNextText->m_Text;
    624             pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ?
    625                                       spacew : pNextText->m_SpaceWidth;
    626             m_TextList.RemoveAt(i + 1);
    627             delete pNextText;
    628         } else {
    629             i ++;
    630         }
    631     }
    632 }
    633 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth,
    634                                 int iTextWidth)
    635 {
    636     int lastpos = -1;
    637     for (int i = 0; i < m_TextList.GetSize(); i ++) {
    638         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    639         int xpos;
    640         if (pText->m_pColumn) {
    641             xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5);
    642             xpos -= pText->m_Text.GetLength();
    643         } else {
    644             xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
    645         }
    646         if (xpos <= lastpos) {
    647             xpos = lastpos + 1;
    648         }
    649         for (int j = lastpos + 1; j < xpos; j ++) {
    650             str += ' ';
    651         }
    652         CFX_WideString sSrc(pText->m_Text);
    653         NormalizeString(sSrc);
    654         str += sSrc;
    655         str += ' ';
    656         lastpos = xpos + pText->m_Text.GetLength();
    657     }
    658 }
    659 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
    660 {
    661     minchars = 0;
    662     for (int i = 0; i < m_TextList.GetSize(); i ++) {
    663         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    664         if (pText->m_Right - pText->m_Left < 0.002) {
    665             continue;
    666         }
    667         count += pText->m_Text.GetLength();
    668         width += pText->m_Right - pText->m_Left;
    669         minchars += pText->m_Text.GetLength() + 1;
    670     }
    671 }
    672 #define PI 3.1415926535897932384626433832795
    673 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
    674 {
    675     int total_count = 0, rotated_count[3] = {0, 0, 0};
    676     FX_POSITION pos = page.GetFirstObjectPosition();
    677     while (pos) {
    678         CPDF_PageObject* pObj = page.GetNextObject(pos);
    679         if (pObj->m_Type != PDFPAGE_TEXT) {
    680             continue;
    681         }
    682         total_count ++;
    683         CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
    684         FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
    685         if (angle == 0.0) {
    686             continue;
    687         }
    688         int degree = (int)(angle * 180 / PI + 0.5);
    689         if (degree % 90) {
    690             continue;
    691         }
    692         if (degree < 0) {
    693             degree += 360;
    694         }
    695         int index = degree / 90 % 3 - 1;
    696         if (index < 0) {
    697             continue;
    698         }
    699         rotated_count[index] ++;
    700     }
    701     if (total_count == 0) {
    702         return;
    703     }
    704     CFX_AffineMatrix matrix;
    705     if (rotated_count[0] > total_count * 2 / 3) {
    706         matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
    707     } else if (rotated_count[1] > total_count * 2 / 3) {
    708         matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
    709     } else if (rotated_count[2] > total_count * 2 / 3) {
    710         matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
    711     } else {
    712         return;
    713     }
    714     page.Transform(matrix);
    715     page_bbox.Transform(&matrix);
    716 }
    717 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
    718                              int iMinWidth, FX_DWORD flags)
    719 {
    720     lines.RemoveAll();
    721     if (pPage == NULL) {
    722         return;
    723     }
    724     CPDF_Page page;
    725     page.Load(pDoc, pPage);
    726     CPDF_ParseOptions options;
    727     options.m_bTextOnly = TRUE;
    728     options.m_bSeparateForm = FALSE;
    729     page.ParseContent(&options);
    730     CFX_FloatRect page_bbox = page.GetPageBBox();
    731     if (flags & PDF2TXT_AUTO_ROTATE) {
    732         CheckRotate(page, page_bbox);
    733     }
    734     CTextPage texts;
    735     texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
    736     texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
    737     texts.m_bBreakSpace = TRUE;
    738     FX_POSITION pos = page.GetFirstObjectPosition();
    739     while (pos) {
    740         CPDF_PageObject* pObject = page.GetNextObject(pos);
    741         if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
    742             CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top);
    743             if (!page_bbox.Contains(rect)) {
    744                 continue;
    745             }
    746         }
    747         texts.ProcessObject(pObject);
    748     }
    749     texts.WriteOutput(lines, iMinWidth);
    750 }
    751 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
    752                      int iMinWidth, FX_DWORD flags)
    753 {
    754     lines.RemoveAll();
    755     CFX_WideStringArray wlines;
    756     PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
    757     for (int i = 0; i < wlines.GetSize(); i ++) {
    758         CFX_WideString wstr = wlines[i];
    759         CFX_ByteString str;
    760         for (int c = 0; c < wstr.GetLength(); c ++) {
    761             str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
    762         }
    763         lines.Add(str);
    764     }
    765 }
    766 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
    767                                        CFX_PtrArray* pObjArray);
    768 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags)
    769 {
    770     buffer.EstimateSize(0, 10240);
    771     CPDF_Page page;
    772     page.Load(pDoc, pPage);
    773     CPDF_ParseOptions options;
    774     options.m_bTextOnly = TRUE;
    775     options.m_bSeparateForm = FALSE;
    776     page.ParseContent(&options);
    777     _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);
    778 }
    779