Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "../../include/fpdfapi/fpdf_page.h"
      8 #include "../../include/fpdfapi/fpdf_pageobj.h"
      9 #include "../../include/fpdftext/fpdf_text.h"
     10 #include "txtproc.h"
     11 #include "text_int.h"
     12 #if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_)
     13 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR);
     14 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar)
     15 {
     16     if (destcp == 0) {
     17         if (unicode < 0x80) {
     18             return CFX_ByteString((char)unicode);
     19         }
     20         FX_LPCSTR altstr = FCS_GetAltStr(unicode);
     21         if (altstr) {
     22             return CFX_ByteString(altstr, -1);
     23         }
     24         return CFX_ByteString(defchar, -1);
     25     }
     26     FX_BOOL bDef = FALSE;
     27     char buf[10];
     28     int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
     29     if (ret && !bDef) {
     30         return CFX_ByteString(buf, ret);
     31     }
     32     FX_LPCSTR altstr = FCS_GetAltStr(unicode);
     33     if (altstr) {
     34         return CFX_ByteString(altstr, -1);
     35     }
     36     return CFX_ByteString(defchar, -1);
     37 }
     38 CTextPage::CTextPage()
     39 {
     40 }
     41 CTextPage::~CTextPage()
     42 {
     43     int i;
     44     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
     45         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
     46         delete pBaseLine;
     47     }
     48     for (i = 0; i < m_TextColumns.GetSize(); i ++) {
     49         CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
     50         delete pTextColumn;
     51     }
     52 }
     53 void CTextPage::ProcessObject(CPDF_PageObject* pObject)
     54 {
     55     if (pObject->m_Type != PDFPAGE_TEXT) {
     56         return;
     57     }
     58     CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
     59     CPDF_Font* pFont = pText->m_TextState.GetFont();
     60     int count = pText->CountItems();
     61     FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2);
     62     if (pPosArray) {
     63         pText->CalcCharPos(pPosArray);
     64     }
     65     FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
     66     FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
     67     FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
     68     FX_FLOAT spacew = 0;
     69     if (space_charcode != -1) {
     70         spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
     71     }
     72     if (spacew == 0) {
     73         spacew = fontsize_h / 4;
     74     }
     75     if (pText->m_TextState.GetBaselineAngle() != 0) {
     76         int cc = 0;
     77         CFX_AffineMatrix matrix;
     78         pText->GetTextMatrix(&matrix);
     79         for (int i = 0; i < pText->m_nChars; i ++) {
     80             FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
     81             if (charcode == (FX_DWORD) - 1) {
     82                 continue;
     83             }
     84             FX_RECT char_box;
     85             pFont->GetCharBBox(charcode, char_box);
     86             FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000;
     87             FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000;
     88             FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000;
     89             FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
     90             cc ++;
     91             FX_FLOAT char_origx, char_origy;
     92             matrix.Transform(char_left, 0, char_origx, char_origy);
     93             matrix.TransformRect(char_left, char_right, char_top, char_bottom);
     94             CFX_ByteString str;
     95             pFont->AppendChar(str, charcode);
     96             InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
     97                           char_bottom, spacew, fontsize_v, str, pFont);
     98         }
     99         if (pPosArray) {
    100             FX_Free(pPosArray);
    101         }
    102         return;
    103     }
    104     FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
    105     for (int ii = 0; ii < count * 2; ii ++) {
    106         pPosArray[ii] *= ratio_h;
    107     }
    108     FX_FLOAT baseline = pText->m_PosY;
    109     CTextBaseLine* pBaseLine = NULL;
    110     FX_FLOAT topy = pText->m_Top;
    111     FX_FLOAT bottomy = pText->m_Bottom;
    112     FX_FLOAT leftx = pText->m_Left;
    113     int cc = 0;
    114     CFX_ByteString segment;
    115     int space_count = 0;
    116     FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
    117     for (int i = 0; i < pText->m_nChars; i ++) {
    118         FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
    119         if (charcode == (FX_DWORD) - 1) {
    120             continue;
    121         }
    122         FX_FLOAT char_left = pPosArray[cc * 2];
    123         FX_FLOAT char_right = pPosArray[cc * 2 + 1];
    124         cc ++;
    125         if (char_left < last_left || (char_left - last_right) > spacew / 2) {
    126             pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
    127                                       topy, bottomy, spacew, fontsize_v, segment, pFont);
    128             segment_left = char_left;
    129             segment = "";
    130         }
    131         CFX_WideString wCh = pText->GetFont()->UnicodeFromCharCode(charcode);
    132         FX_DWORD ch = wCh.GetLength() > 0 ? wCh.GetAt(0) : charcode;
    133         if (space_count > 1) {
    134             pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
    135                                       topy, bottomy, spacew, fontsize_v, segment, pFont);
    136             segment = "";
    137         } else if (space_count == 1) {
    138             pFont->AppendChar(segment, ' ');
    139         }
    140         if (segment.GetLength() == 0) {
    141             segment_left = char_left;
    142         }
    143         segment_right = char_right;
    144         pFont->AppendChar(segment, charcode);
    145         space_count = 0;
    146         last_left = char_left;
    147         last_right = char_right;
    148     }
    149     if (segment.GetLength())
    150         pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
    151                                   topy, bottomy, spacew, fontsize_v, segment, pFont);
    152     FX_Free(pPosArray);
    153 }
    154 static void ConvertPDFString(CFX_ByteString& result, CFX_ByteString& src, CPDF_Font* pFont);
    155 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx,
    156                                         FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v,
    157                                         CFX_ByteString& str, CPDF_Font* pFont)
    158 {
    159     if (str.GetLength() == 0) {
    160         return NULL;
    161     }
    162     if (pBaseLine == NULL) {
    163         int i;
    164         for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    165             CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    166             if (pExistLine->m_BaseLine == basey) {
    167                 pBaseLine = pExistLine;
    168                 break;
    169             }
    170             if (pExistLine->m_BaseLine < basey) {
    171                 break;
    172             }
    173         }
    174         if (pBaseLine == NULL) {
    175             pBaseLine = FX_NEW CTextBaseLine;
    176             if (NULL == pBaseLine) {
    177                 return NULL;
    178             }
    179             pBaseLine->m_BaseLine = basey;
    180             m_BaseLines.InsertAt(i, pBaseLine);
    181         }
    182     }
    183     CFX_WideString text;
    184     FX_LPCSTR pStr = str;
    185     int len = str.GetLength(), offset = 0;
    186     while (offset < len) {
    187         FX_DWORD ch = pFont->GetNextChar(pStr, offset);
    188         CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
    189         text += unicode_str;
    190     }
    191     pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text);
    192     return pBaseLine;
    193 }
    194 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth)
    195 {
    196     FX_FLOAT lastheight = -1;
    197     FX_FLOAT lastbaseline = -1;
    198     FX_FLOAT MinLeftX = 1000000;
    199     FX_FLOAT MaxRightX = 0;
    200     int i;
    201     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    202         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    203         FX_FLOAT leftx, rightx;
    204         if (pBaseLine->GetWidth(leftx, rightx)) {
    205             if (leftx < MinLeftX) {
    206                 MinLeftX = leftx;
    207             }
    208             if (rightx > MaxRightX) {
    209                 MaxRightX = rightx;
    210             }
    211         }
    212     }
    213     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    214         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    215         pBaseLine->MergeBoxes();
    216     }
    217     for (i = 1; i < m_BaseLines.GetSize(); i ++) {
    218         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    219         CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
    220         if (pBaseLine->CanMerge(pPrevLine)) {
    221             pPrevLine->Merge(pBaseLine);
    222             delete pBaseLine;
    223             m_BaseLines.RemoveAt(i);
    224             i --;
    225         }
    226     }
    227     if (m_bAutoWidth) {
    228         int* widths = FX_Alloc(int, m_BaseLines.GetSize());
    229         if (widths) {
    230             for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    231                 widths[i] = 0;
    232                 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    233                 int TotalChars = 0;
    234                 FX_FLOAT TotalWidth = 0;
    235                 int minchars;
    236                 pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
    237                 if (TotalChars) {
    238                     FX_FLOAT charwidth = TotalWidth / TotalChars;
    239                     widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
    240                 }
    241                 if (widths[i] > 1000) {
    242                     widths[i] = 1000;
    243                 }
    244                 if (widths[i] < minchars) {
    245                     widths[i] = minchars;
    246                 }
    247             }
    248             int AvgWidth = 0, widthcount = 0;
    249             for (i = 0; i < m_BaseLines.GetSize(); i ++)
    250                 if (widths[i]) {
    251                     AvgWidth += widths[i];
    252                     widthcount ++;
    253                 }
    254             AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
    255             int MaxWidth = 0;
    256             for (i = 0; i < m_BaseLines.GetSize(); i ++)
    257                 if (MaxWidth < widths[i]) {
    258                     MaxWidth = widths[i];
    259                 }
    260             if (MaxWidth > AvgWidth * 6 / 5) {
    261                 MaxWidth = AvgWidth * 6 / 5;
    262             }
    263             FX_Free(widths);
    264             if (iMinWidth < MaxWidth) {
    265                 iMinWidth = MaxWidth;
    266             }
    267         }
    268     }
    269     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    270         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    271         pBaseLine->MergeBoxes();
    272     }
    273     if (m_bKeepColumn) {
    274         FindColumns();
    275     }
    276     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    277         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    278         if (lastheight >= 0) {
    279             FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
    280             if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
    281                 lines.Add(L"");
    282             }
    283         }
    284         lastheight = pBaseLine->m_MaxFontSizeV;
    285         lastbaseline = pBaseLine->m_BaseLine;
    286         CFX_WideString str;
    287         pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
    288         lines.Add(str);
    289     }
    290 }
    291 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest)
    292 {
    293     wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
    294     FX_LPWSTR pDst = NULL;
    295     FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
    296     if (nCount < 1 ) {
    297         sDest += wChar;
    298         return;
    299     }
    300     pDst = new FX_WCHAR[nCount];
    301     FX_Unicode_GetNormalization(wChar, pDst);
    302     for (int nIndex = 0; nIndex < nCount; nIndex++) {
    303         sDest += pDst[nIndex];
    304     }
    305     delete[] pDst;
    306 }
    307 void NormalizeString(CFX_WideString& str)
    308 {
    309     if (str.GetLength() <= 0) {
    310         return;
    311     }
    312     CFX_WideString sBuffer;
    313     IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
    314     if (NULL == BidiChar)	{
    315         return;
    316     }
    317     CFX_WordArray order;
    318     FX_BOOL bR2L = FALSE;
    319     FX_INT32 start = 0, count = 0, i = 0;
    320     int nR2L = 0, nL2R = 0;
    321     for (i = 0; i < str.GetLength(); i++) {
    322         if(BidiChar->AppendChar(str.GetAt(i))) {
    323             FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
    324             order.Add(start);
    325             order.Add(count);
    326             order.Add(ret);
    327             if(!bR2L) {
    328                 if(ret == 2) {
    329                     nR2L++;
    330                 } else if (ret == 1) {
    331                     nL2R++;
    332                 }
    333             }
    334         }
    335     }
    336     if(BidiChar->EndChar()) {
    337         FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
    338         order.Add(start);
    339         order.Add(count);
    340         order.Add(ret);
    341         if(!bR2L) {
    342             if(ret == 2) {
    343                 nR2L++;
    344             } else if(ret == 1) {
    345                 nL2R++;
    346             }
    347         }
    348     }
    349     if(nR2L > 0 && nR2L >= nL2R) {
    350         bR2L = TRUE;
    351     }
    352     if(bR2L) {
    353         int count = order.GetSize();
    354         for(int j = count - 1; j > 0; j -= 3) {
    355             int ret = order.GetAt(j);
    356             int start = order.GetAt(j - 2);
    357             int count1 = order.GetAt(j - 1);
    358             if(ret == 2 || ret == 0) {
    359                 for(int i = start + count1 - 1; i >= start; i--) {
    360                     NormalizeCompositeChar(str[i], sBuffer);
    361                 }
    362             } else {
    363                 i = j;
    364                 FX_BOOL bSymbol = FALSE;
    365                 while(i > 0 && order.GetAt(i) != 2) {
    366                     bSymbol = !order.GetAt(i);
    367                     i -= 3;
    368                 }
    369                 int end = start + count1 ;
    370                 int n = 0;
    371                 if(bSymbol) {
    372                     n = i + 6;
    373                 } else {
    374                     n = i + 3;
    375                 }
    376                 if(n >= j) {
    377                     for(int m = start; m < end; m++) {
    378                         sBuffer += str[m];
    379                     }
    380                 } else {
    381                     i = j;
    382                     j = n;
    383                     for(; n <= i; n += 3) {
    384                         int ret = order.GetAt(n);
    385                         int start = order.GetAt(n - 2);
    386                         int count1 = order.GetAt(n - 1);
    387                         int end = start + count1 ;
    388                         for(int m = start; m < end; m++) {
    389                             sBuffer += str[m];
    390                         }
    391                     }
    392                 }
    393             }
    394         }
    395     } else {
    396         int count = order.GetSize();
    397         FX_BOOL bL2R = FALSE;
    398         for(int j = 0; j < count; j += 3) {
    399             int ret = order.GetAt(j + 2);
    400             int start = order.GetAt(j);
    401             int count1 = order.GetAt(j + 1);
    402             if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
    403                 int i = j + 3;
    404                 while(bR2L && i < count) {
    405                     if(order.GetAt(i + 2) == 1) {
    406                         break;
    407                     } else {
    408                         i += 3;
    409                     }
    410                 }
    411                 if(i == 3) {
    412                     j = -3;
    413                     bL2R = TRUE;
    414                     continue;
    415                 }
    416                 int end = str.GetLength() - 1;
    417                 if(i < count) {
    418                     end = order.GetAt(i) - 1;
    419                 }
    420                 j = i - 3;
    421                 for(int n = end; n >= start; n--) {
    422                     NormalizeCompositeChar(str[i], sBuffer);
    423                 }
    424             } else {
    425                 int end = start + count1 ;
    426                 for(int i = start; i < end; i++) {
    427                     sBuffer += str[i];
    428                 }
    429             }
    430         }
    431     }
    432     str.Empty();
    433     str += sBuffer;
    434     BidiChar->Release();
    435 }
    436 static FX_BOOL IsNumber(CFX_WideString& str)
    437 {
    438     for (int i = 0; i < str.GetLength(); i ++) {
    439         FX_WCHAR ch = str[i];
    440         if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
    441             return FALSE;
    442         }
    443     }
    444     return TRUE;
    445 }
    446 void CTextPage::FindColumns()
    447 {
    448     int i;
    449     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    450         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    451         for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
    452             CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
    453             CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
    454             if (pColumn == NULL) {
    455                 pColumn = FX_NEW CTextColumn;
    456                 if (pColumn) {
    457                     pColumn->m_Count = 1;
    458                     pColumn->m_AvgPos = pTextBox->m_Right;
    459                     pColumn->m_TextPos = -1;
    460                     m_TextColumns.Add(pColumn);
    461                 }
    462             } else {
    463                 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
    464                                     (pColumn->m_Count + 1);
    465                 pColumn->m_Count ++;
    466             }
    467         }
    468     }
    469     int mincount = m_BaseLines.GetSize() / 4;
    470     for (i = 0; i < m_TextColumns.GetSize(); i ++) {
    471         CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
    472         if (pTextColumn->m_Count >= mincount) {
    473             continue;
    474         }
    475         delete pTextColumn;
    476         m_TextColumns.RemoveAt(i);
    477         i --;
    478     }
    479     for (i = 0; i < m_BaseLines.GetSize(); i ++) {
    480         CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
    481         for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
    482             CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
    483             if (IsNumber(pTextBox->m_Text)) {
    484                 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
    485             }
    486         }
    487     }
    488 }
    489 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
    490 {
    491     for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
    492         CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
    493         if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
    494             return pColumn;
    495         }
    496     }
    497     return NULL;
    498 }
    499 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
    500 {
    501 }
    502 CTextBaseLine::CTextBaseLine()
    503 {
    504     m_Top = -100000;
    505     m_Bottom = 100000;
    506     m_MaxFontSizeV = 0;
    507 }
    508 CTextBaseLine::~CTextBaseLine()
    509 {
    510     for (int i = 0; i < m_TextList.GetSize(); i ++) {
    511         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    512         delete pText;
    513     }
    514 }
    515 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy,
    516                                   FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text)
    517 {
    518     if (m_Top < topy) {
    519         m_Top = topy;
    520     }
    521     if (m_Bottom > bottomy) {
    522         m_Bottom = bottomy;
    523     }
    524     if (m_MaxFontSizeV < fontsize_v) {
    525         m_MaxFontSizeV = fontsize_v;
    526     }
    527     int i;
    528     for (i = 0; i < m_TextList.GetSize(); i ++) {
    529         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    530         if (pText->m_Left > leftx) {
    531             break;
    532         }
    533     }
    534     CTextBox* pText = FX_NEW CTextBox;
    535     if (NULL == pText) {
    536         return;
    537     }
    538     pText->m_Text = text;
    539     pText->m_Left = leftx;
    540     pText->m_Right = rightx;
    541     pText->m_Top = topy;
    542     pText->m_Bottom = bottomy;
    543     pText->m_SpaceWidth = spacew;
    544     pText->m_FontSizeV = fontsize_v;
    545     pText->m_pColumn = NULL;
    546     m_TextList.InsertAt(i, pText);
    547 }
    548 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2,
    549                         FX_FLOAT& interlow, FX_FLOAT& interhigh);
    550 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
    551 {
    552     FX_FLOAT inter_top, inter_bottom;
    553     if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
    554                          inter_bottom, inter_top)) {
    555         return FALSE;
    556     }
    557     FX_FLOAT inter_h = inter_top - inter_bottom;
    558     if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
    559         return FALSE;
    560     }
    561     FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
    562     for (int i = 0; i < m_TextList.GetSize(); i ++) {
    563         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    564         FX_FLOAT width = pText->m_Right - pText->m_Left;
    565         for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
    566             CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
    567             FX_FLOAT inter_left, inter_right;
    568             if (!GetIntersection(pText->m_Left, pText->m_Right,
    569                                  pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) {
    570                 continue;
    571             }
    572             FX_FLOAT inter_w = inter_right - inter_left;
    573             if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) {
    574                 continue;
    575             }
    576             if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
    577                     dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
    578                 return FALSE;
    579             }
    580         }
    581     }
    582     return TRUE;
    583 }
    584 void CTextBaseLine::Merge(CTextBaseLine* pOther)
    585 {
    586     for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
    587         CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
    588         InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
    589                       pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
    590     }
    591 }
    592 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
    593 {
    594     int i;
    595     for (i = 0; i < m_TextList.GetSize(); i ++) {
    596         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    597         if (pText->m_Text != L" ") {
    598             break;
    599         }
    600     }
    601     if (i == m_TextList.GetSize()) {
    602         return FALSE;
    603     }
    604     CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    605     leftx = pText->m_Left;
    606     for (i = m_TextList.GetSize() - 1; i >= 0; i --) {
    607         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    608         if (pText->m_Text != L" ") {
    609             break;
    610         }
    611     }
    612     pText = (CTextBox*)m_TextList.GetAt(i);
    613     rightx = pText->m_Right;
    614     return TRUE;
    615 }
    616 void CTextBaseLine::MergeBoxes()
    617 {
    618     int i = 0;
    619     while (1) {
    620         if (i >= m_TextList.GetSize() - 1) {
    621             break;
    622         }
    623         CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
    624         CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
    625         FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
    626         FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ?
    627                           pNextText->m_SpaceWidth : pThisText->m_SpaceWidth;
    628         if (spacew > 0.0 && dx < spacew * 2) {
    629             pThisText->m_Right = pNextText->m_Right;
    630             if (dx > spacew * 1.5) {
    631                 pThisText->m_Text += L"  ";
    632             } else if (dx > spacew / 3) {
    633                 pThisText->m_Text += L' ';
    634             }
    635             pThisText->m_Text += pNextText->m_Text;
    636             pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ?
    637                                       spacew : pNextText->m_SpaceWidth;
    638             m_TextList.RemoveAt(i + 1);
    639             delete pNextText;
    640         } else {
    641             i ++;
    642         }
    643     }
    644 }
    645 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth,
    646                                 int iTextWidth)
    647 {
    648     int lastpos = -1;
    649     for (int i = 0; i < m_TextList.GetSize(); i ++) {
    650         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    651         int xpos;
    652         if (pText->m_pColumn) {
    653             xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5);
    654             xpos -= pText->m_Text.GetLength();
    655         } else {
    656             xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
    657         }
    658         if (xpos <= lastpos) {
    659             xpos = lastpos + 1;
    660         }
    661         for (int j = lastpos + 1; j < xpos; j ++) {
    662             str += ' ';
    663         }
    664         CFX_WideString sSrc(pText->m_Text);
    665         NormalizeString(sSrc);
    666         str += sSrc;
    667         str += ' ';
    668         lastpos = xpos + pText->m_Text.GetLength();
    669     }
    670 }
    671 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
    672 {
    673     minchars = 0;
    674     for (int i = 0; i < m_TextList.GetSize(); i ++) {
    675         CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
    676         if (pText->m_Right - pText->m_Left < 0.002) {
    677             continue;
    678         }
    679         count += pText->m_Text.GetLength();
    680         width += pText->m_Right - pText->m_Left;
    681         minchars += pText->m_Text.GetLength() + 1;
    682     }
    683 }
    684 #define PI 3.1415926535897932384626433832795
    685 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
    686 {
    687     int total_count = 0, rotated_count[3] = {0, 0, 0};
    688     FX_POSITION pos = page.GetFirstObjectPosition();
    689     while (pos) {
    690         CPDF_PageObject* pObj = page.GetNextObject(pos);
    691         if (pObj->m_Type != PDFPAGE_TEXT) {
    692             continue;
    693         }
    694         total_count ++;
    695         CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
    696         FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
    697         if (angle == 0.0) {
    698             continue;
    699         }
    700         int degree = (int)(angle * 180 / PI + 0.5);
    701         if (degree % 90) {
    702             continue;
    703         }
    704         if (degree < 0) {
    705             degree += 360;
    706         }
    707         int index = degree / 90 % 3 - 1;
    708         if (index < 0) {
    709             continue;
    710         }
    711         rotated_count[index] ++;
    712     }
    713     if (total_count == 0) {
    714         return;
    715     }
    716     CFX_AffineMatrix matrix;
    717     if (rotated_count[0] > total_count * 2 / 3) {
    718         matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
    719     } else if (rotated_count[1] > total_count * 2 / 3) {
    720         matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
    721     } else if (rotated_count[2] > total_count * 2 / 3) {
    722         matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
    723     } else {
    724         return;
    725     }
    726     page.Transform(matrix);
    727     page_bbox.Transform(&matrix);
    728 }
    729 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
    730                              int iMinWidth, FX_DWORD flags)
    731 {
    732     lines.RemoveAll();
    733     if (pPage == NULL) {
    734         return;
    735     }
    736     CPDF_Page page;
    737     page.Load(pDoc, pPage);
    738     CPDF_ParseOptions options;
    739     options.m_bTextOnly = TRUE;
    740     options.m_bSeparateForm = FALSE;
    741     page.ParseContent(&options);
    742     CFX_FloatRect page_bbox = page.GetPageBBox();
    743     if (flags & PDF2TXT_AUTO_ROTATE) {
    744         CheckRotate(page, page_bbox);
    745     }
    746     CTextPage texts;
    747     texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
    748     texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
    749     texts.m_bBreakSpace = TRUE;
    750     FX_POSITION pos = page.GetFirstObjectPosition();
    751     while (pos) {
    752         CPDF_PageObject* pObject = page.GetNextObject(pos);
    753         if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
    754             CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top);
    755             if (!page_bbox.Contains(rect)) {
    756                 continue;
    757             }
    758         }
    759         texts.ProcessObject(pObject);
    760     }
    761     texts.WriteOutput(lines, iMinWidth);
    762 }
    763 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
    764                      int iMinWidth, FX_DWORD flags)
    765 {
    766     lines.RemoveAll();
    767     CFX_WideStringArray wlines;
    768     PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
    769     for (int i = 0; i < wlines.GetSize(); i ++) {
    770         CFX_WideString wstr = wlines[i];
    771         CFX_ByteString str;
    772         for (int c = 0; c < wstr.GetLength(); c ++) {
    773             str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
    774         }
    775         lines.Add(str);
    776     }
    777 }
    778 #endif
    779 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
    780                                        CFX_PtrArray* pObjArray);
    781 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags)
    782 {
    783     buffer.EstimateSize(0, 10240);
    784     CPDF_Page page;
    785     page.Load(pDoc, pPage);
    786     CPDF_ParseOptions options;
    787     options.m_bTextOnly = TRUE;
    788     options.m_bSeparateForm = FALSE;
    789     page.ParseContent(&options);
    790     _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);
    791 }
    792