Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include <cctype>
      8 #include <cwctype>
      9 #include <memory>
     10 
     11 #include "core/include/fpdfapi/fpdf_page.h"
     12 #include "core/include/fpdfapi/fpdf_pageobj.h"
     13 #include "core/include/fpdfapi/fpdf_resource.h"
     14 #include "core/include/fpdftext/fpdf_text.h"
     15 #include "core/include/fxcrt/fx_bidi.h"
     16 #include "core/include/fxcrt/fx_ucd.h"
     17 #include "text_int.h"
     18 #include "txtproc.h"
     19 
     20 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode,
     21                                   int destcp,
     22                                   const FX_CHAR* defchar) {
     23   if (destcp == 0) {
     24     if (unicode < 0x80) {
     25       return CFX_ByteString((char)unicode);
     26     }
     27     const FX_CHAR* altstr = FCS_GetAltStr(unicode);
     28     return CFX_ByteString(altstr ? altstr : defchar);
     29   }
     30   char buf[10];
     31   int iDef = 0;
     32   int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10,
     33                                       NULL, &iDef);
     34   if (ret && !iDef) {
     35     return CFX_ByteString(buf, ret);
     36   }
     37   const FX_CHAR* altstr = FCS_GetAltStr(unicode);
     38   return CFX_ByteString(altstr ? altstr : defchar);
     39 }
     40 CTextPage::CTextPage() {}
     41 CTextPage::~CTextPage() {
     42   int i;
     43   for (i = 0; i < m_BaseLines.GetSize(); i++) {
     44     delete m_BaseLines.GetAt(i);
     45   }
     46   for (i = 0; i < m_TextColumns.GetSize(); i++) {
     47     delete m_TextColumns.GetAt(i);
     48   }
     49 }
     50 void CTextPage::ProcessObject(CPDF_PageObject* pObject) {
     51   if (pObject->m_Type != PDFPAGE_TEXT) {
     52     return;
     53   }
     54   CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
     55   CPDF_Font* pFont = pText->m_TextState.GetFont();
     56   int count = pText->CountItems();
     57   FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2);
     58   pText->CalcCharPos(pPosArray);
     59 
     60   FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
     61   FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
     62   FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
     63   FX_FLOAT spacew = 0;
     64   if (space_charcode != -1) {
     65     spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
     66   }
     67   if (spacew == 0) {
     68     spacew = fontsize_h / 4;
     69   }
     70   if (pText->m_TextState.GetBaselineAngle() != 0) {
     71     int cc = 0;
     72     CFX_Matrix matrix;
     73     pText->GetTextMatrix(&matrix);
     74     for (int i = 0; i < pText->m_nChars; i++) {
     75       FX_DWORD charcode = pText->m_nChars == 1
     76                               ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes
     77                               : pText->m_pCharCodes[i];
     78       if (charcode == (FX_DWORD)-1) {
     79         continue;
     80       }
     81       FX_RECT char_box;
     82       pFont->GetCharBBox(charcode, char_box);
     83       FX_FLOAT char_left =
     84           pPosArray ? pPosArray[cc * 2]
     85                     : char_box.left * pText->m_TextState.GetFontSize() / 1000;
     86       FX_FLOAT char_right =
     87           pPosArray ? pPosArray[cc * 2 + 1]
     88                     : char_box.right * pText->m_TextState.GetFontSize() / 1000;
     89       FX_FLOAT char_top =
     90           char_box.top * pText->m_TextState.GetFontSize() / 1000;
     91       FX_FLOAT char_bottom =
     92           char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
     93       cc++;
     94       FX_FLOAT char_origx, char_origy;
     95       matrix.Transform(char_left, 0, char_origx, char_origy);
     96       matrix.TransformRect(char_left, char_right, char_top, char_bottom);
     97       CFX_ByteString str;
     98       pFont->AppendChar(str, charcode);
     99       InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
    100                     char_bottom, spacew, fontsize_v, str, pFont);
    101     }
    102     FX_Free(pPosArray);
    103     return;
    104   }
    105   FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
    106   for (int ii = 0; ii < count * 2; ii++) {
    107     pPosArray[ii] *= ratio_h;
    108   }
    109   FX_FLOAT baseline = pText->m_PosY;
    110   CTextBaseLine* pBaseLine = NULL;
    111   FX_FLOAT topy = pText->m_Top;
    112   FX_FLOAT bottomy = pText->m_Bottom;
    113   FX_FLOAT leftx = pText->m_Left;
    114   int cc = 0;
    115   CFX_ByteString segment;
    116   int space_count = 0;
    117   FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
    118   for (int i = 0; i < pText->m_nChars; i++) {
    119     FX_DWORD charcode = pText->m_nChars == 1
    120                             ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes
    121                             : pText->m_pCharCodes[i];
    122     if (charcode == (FX_DWORD)-1) {
    123       continue;
    124     }
    125     FX_FLOAT char_left = pPosArray[cc * 2];
    126     FX_FLOAT char_right = pPosArray[cc * 2 + 1];
    127     cc++;
    128     if (char_left < last_left || (char_left - last_right) > spacew / 2) {
    129       pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
    130                                 leftx + segment_right, topy, bottomy, spacew,
    131                                 fontsize_v, segment, pFont);
    132       segment_left = char_left;
    133       segment = "";
    134     }
    135     if (space_count > 1) {
    136       pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
    137                                 leftx + segment_right, topy, bottomy, spacew,
    138                                 fontsize_v, segment, pFont);
    139       segment = "";
    140     } else if (space_count == 1) {
    141       pFont->AppendChar(segment, ' ');
    142     }
    143     if (segment.GetLength() == 0) {
    144       segment_left = char_left;
    145     }
    146     segment_right = char_right;
    147     pFont->AppendChar(segment, charcode);
    148     space_count = 0;
    149     last_left = char_left;
    150     last_right = char_right;
    151   }
    152   if (segment.GetLength())
    153     pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
    154                               leftx + segment_right, topy, bottomy, spacew,
    155                               fontsize_v, segment, pFont);
    156   FX_Free(pPosArray);
    157 }
    158 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine,
    159                                         FX_FLOAT basey,
    160                                         FX_FLOAT leftx,
    161                                         FX_FLOAT rightx,
    162                                         FX_FLOAT topy,
    163                                         FX_FLOAT bottomy,
    164                                         FX_FLOAT spacew,
    165                                         FX_FLOAT fontsize_v,
    166                                         CFX_ByteString& str,
    167                                         CPDF_Font* pFont) {
    168   if (str.GetLength() == 0) {
    169     return NULL;
    170   }
    171   if (!pBaseLine) {
    172     int i;
    173     for (i = 0; i < m_BaseLines.GetSize(); i++) {
    174       CTextBaseLine* pExistLine = m_BaseLines.GetAt(i);
    175       if (pExistLine->m_BaseLine == basey) {
    176         pBaseLine = pExistLine;
    177         break;
    178       }
    179       if (pExistLine->m_BaseLine < basey) {
    180         break;
    181       }
    182     }
    183     if (!pBaseLine) {
    184       pBaseLine = new CTextBaseLine;
    185       pBaseLine->m_BaseLine = basey;
    186       m_BaseLines.InsertAt(i, pBaseLine);
    187     }
    188   }
    189   CFX_WideString text;
    190   const FX_CHAR* pStr = str;
    191   int len = str.GetLength(), offset = 0;
    192   while (offset < len) {
    193     FX_DWORD ch = pFont->GetNextChar(pStr, len, offset);
    194     CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
    195     if (unicode_str.IsEmpty()) {
    196       text += (FX_WCHAR)ch;
    197     } else {
    198       text += unicode_str;
    199     }
    200   }
    201   pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v,
    202                            text);
    203   return pBaseLine;
    204 }
    205 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) {
    206   FX_FLOAT lastheight = -1;
    207   FX_FLOAT lastbaseline = -1;
    208   FX_FLOAT MinLeftX = 1000000;
    209   FX_FLOAT MaxRightX = 0;
    210   int i;
    211   for (i = 0; i < m_BaseLines.GetSize(); i++) {
    212     CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    213     FX_FLOAT leftx, rightx;
    214     if (pBaseLine->GetWidth(leftx, rightx)) {
    215       if (leftx < MinLeftX) {
    216         MinLeftX = leftx;
    217       }
    218       if (rightx > MaxRightX) {
    219         MaxRightX = rightx;
    220       }
    221     }
    222   }
    223   for (i = 0; i < m_BaseLines.GetSize(); i++) {
    224     m_BaseLines.GetAt(i)->MergeBoxes();
    225   }
    226   for (i = 1; i < m_BaseLines.GetSize(); i++) {
    227     CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    228     CTextBaseLine* pPrevLine = m_BaseLines.GetAt(i - 1);
    229     if (pBaseLine->CanMerge(pPrevLine)) {
    230       pPrevLine->Merge(pBaseLine);
    231       delete pBaseLine;
    232       m_BaseLines.RemoveAt(i);
    233       i--;
    234     }
    235   }
    236   if (m_bAutoWidth) {
    237     int* widths = FX_Alloc(int, m_BaseLines.GetSize());
    238     for (i = 0; i < m_BaseLines.GetSize(); i++) {
    239       widths[i] = 0;
    240       CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    241       int TotalChars = 0;
    242       FX_FLOAT TotalWidth = 0;
    243       int minchars;
    244       pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
    245       if (TotalChars) {
    246         FX_FLOAT charwidth = TotalWidth / TotalChars;
    247         widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
    248       }
    249       if (widths[i] > 1000) {
    250         widths[i] = 1000;
    251       }
    252       if (widths[i] < minchars) {
    253         widths[i] = minchars;
    254       }
    255     }
    256     int AvgWidth = 0, widthcount = 0;
    257     for (i = 0; i < m_BaseLines.GetSize(); i++)
    258       if (widths[i]) {
    259         AvgWidth += widths[i];
    260         widthcount++;
    261       }
    262     AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
    263     int MaxWidth = 0;
    264     for (i = 0; i < m_BaseLines.GetSize(); i++)
    265       if (MaxWidth < widths[i]) {
    266         MaxWidth = widths[i];
    267       }
    268     if (MaxWidth > AvgWidth * 6 / 5) {
    269       MaxWidth = AvgWidth * 6 / 5;
    270     }
    271     FX_Free(widths);
    272     if (iMinWidth < MaxWidth) {
    273       iMinWidth = MaxWidth;
    274     }
    275   }
    276   for (i = 0; i < m_BaseLines.GetSize(); i++) {
    277     m_BaseLines.GetAt(i)->MergeBoxes();
    278   }
    279   if (m_bKeepColumn) {
    280     FindColumns();
    281   }
    282   for (i = 0; i < m_BaseLines.GetSize(); i++) {
    283     CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    284     if (lastheight >= 0) {
    285       FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
    286       if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
    287         lines.Add(L"");
    288       }
    289     }
    290     lastheight = pBaseLine->m_MaxFontSizeV;
    291     lastbaseline = pBaseLine->m_BaseLine;
    292     CFX_WideString str;
    293     pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
    294     lines.Add(str);
    295   }
    296 }
    297 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) {
    298   wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
    299   FX_WCHAR* pDst = NULL;
    300   FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
    301   if (nCount < 1) {
    302     sDest += wChar;
    303     return;
    304   }
    305   pDst = new FX_WCHAR[nCount];
    306   FX_Unicode_GetNormalization(wChar, pDst);
    307   for (int nIndex = 0; nIndex < nCount; nIndex++) {
    308     sDest += pDst[nIndex];
    309   }
    310   delete[] pDst;
    311 }
    312 void NormalizeString(CFX_WideString& str) {
    313   if (str.GetLength() <= 0) {
    314     return;
    315   }
    316   CFX_WideString sBuffer;
    317   std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
    318   CFX_WordArray order;
    319   FX_BOOL bR2L = FALSE;
    320   int32_t start = 0, count = 0, i = 0;
    321   int nR2L = 0, nL2R = 0;
    322   for (i = 0; i < str.GetLength(); i++) {
    323     if (pBidiChar->AppendChar(str.GetAt(i))) {
    324       CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
    325       order.Add(start);
    326       order.Add(count);
    327       order.Add(ret);
    328       if (!bR2L) {
    329         if (ret == CFX_BidiChar::RIGHT) {
    330           nR2L++;
    331         } else if (ret == CFX_BidiChar::LEFT) {
    332           nL2R++;
    333         }
    334       }
    335     }
    336   }
    337   if (pBidiChar->EndChar()) {
    338     CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
    339     order.Add(start);
    340     order.Add(count);
    341     order.Add(ret);
    342     if (!bR2L) {
    343       if (ret == CFX_BidiChar::RIGHT) {
    344         nR2L++;
    345       } else if (ret == CFX_BidiChar::LEFT) {
    346         nL2R++;
    347       }
    348     }
    349   }
    350   if (nR2L > 0 && nR2L >= nL2R) {
    351     bR2L = TRUE;
    352   }
    353   if (bR2L) {
    354     int count = order.GetSize();
    355     for (int j = count - 1; j > 0; j -= 3) {
    356       int ret = order.GetAt(j);
    357       int start = order.GetAt(j - 2);
    358       int count1 = order.GetAt(j - 1);
    359       if (ret == 2 || ret == 0) {
    360         for (int i = start + count1 - 1; i >= start; i--) {
    361           NormalizeCompositeChar(str[i], sBuffer);
    362         }
    363       } else {
    364         i = j;
    365         FX_BOOL bSymbol = FALSE;
    366         while (i > 0 && order.GetAt(i) != 2) {
    367           bSymbol = !order.GetAt(i);
    368           i -= 3;
    369         }
    370         int end = start + count1;
    371         int n = 0;
    372         if (bSymbol) {
    373           n = i + 6;
    374         } else {
    375           n = i + 3;
    376         }
    377         if (n >= j) {
    378           for (int m = start; m < end; m++) {
    379             sBuffer += str[m];
    380           }
    381         } else {
    382           i = j;
    383           j = n;
    384           for (; n <= i; n += 3) {
    385             int start = order.GetAt(n - 2);
    386             int count1 = order.GetAt(n - 1);
    387             int end = start + count1;
    388             for (int m = start; m < end; m++) {
    389               sBuffer += str[m];
    390             }
    391           }
    392         }
    393       }
    394     }
    395   } else {
    396     int count = order.GetSize();
    397     FX_BOOL bL2R = FALSE;
    398     for (int j = 0; j < count; j += 3) {
    399       int ret = order.GetAt(j + 2);
    400       int start = order.GetAt(j);
    401       int count1 = order.GetAt(j + 1);
    402       if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
    403         int i = j + 3;
    404         while (bR2L && i < count) {
    405           if (order.GetAt(i + 2) == 1) {
    406             break;
    407           } else {
    408             i += 3;
    409           }
    410         }
    411         if (i == 3) {
    412           j = -3;
    413           bL2R = TRUE;
    414           continue;
    415         }
    416         int end = str.GetLength() - 1;
    417         if (i < count) {
    418           end = order.GetAt(i) - 1;
    419         }
    420         j = i - 3;
    421         for (int n = end; n >= start; n--) {
    422           NormalizeCompositeChar(str[i], sBuffer);
    423         }
    424       } else {
    425         int end = start + count1;
    426         for (int i = start; i < end; i++) {
    427           sBuffer += str[i];
    428         }
    429       }
    430     }
    431   }
    432   str.Empty();
    433   str += sBuffer;
    434 }
    435 static FX_BOOL IsNumber(CFX_WideString& str) {
    436   for (int i = 0; i < str.GetLength(); i++) {
    437     FX_WCHAR ch = str[i];
    438     // TODO(dsinclair): --.+ +.-- should probably not be a number.
    439     if (!std::iswdigit(ch) && ch != '-' && ch != '+' && ch != '.' && ch != ' ')
    440       return FALSE;
    441   }
    442   return TRUE;
    443 }
    444 void CTextPage::FindColumns() {
    445   int i;
    446   for (i = 0; i < m_BaseLines.GetSize(); i++) {
    447     CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    448     for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
    449       CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j);
    450       CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
    451       if (pColumn) {
    452         pColumn->m_AvgPos =
    453             (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
    454             (pColumn->m_Count + 1);
    455         pColumn->m_Count++;
    456       } else {
    457         pColumn = new CTextColumn;
    458         pColumn->m_Count = 1;
    459         pColumn->m_AvgPos = pTextBox->m_Right;
    460         pColumn->m_TextPos = -1;
    461         m_TextColumns.Add(pColumn);
    462       }
    463     }
    464   }
    465   int mincount = m_BaseLines.GetSize() / 4;
    466   for (i = 0; i < m_TextColumns.GetSize(); i++) {
    467     CTextColumn* pTextColumn = m_TextColumns.GetAt(i);
    468     if (pTextColumn->m_Count >= mincount) {
    469       continue;
    470     }
    471     delete pTextColumn;
    472     m_TextColumns.RemoveAt(i);
    473     i--;
    474   }
    475   for (i = 0; i < m_BaseLines.GetSize(); i++) {
    476     CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i);
    477     for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
    478       CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j);
    479       if (IsNumber(pTextBox->m_Text)) {
    480         pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
    481       }
    482     }
    483   }
    484 }
    485 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) {
    486   for (int i = 0; i < m_TextColumns.GetSize(); i++) {
    487     CTextColumn* pColumn = m_TextColumns.GetAt(i);
    488     if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
    489       return pColumn;
    490     }
    491   }
    492   return NULL;
    493 }
    494 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {}
    495 CTextBaseLine::CTextBaseLine() {
    496   m_Top = -100000;
    497   m_Bottom = 100000;
    498   m_MaxFontSizeV = 0;
    499 }
    500 CTextBaseLine::~CTextBaseLine() {
    501   for (int i = 0; i < m_TextList.GetSize(); i++) {
    502     delete m_TextList.GetAt(i);
    503   }
    504 }
    505 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx,
    506                                   FX_FLOAT rightx,
    507                                   FX_FLOAT topy,
    508                                   FX_FLOAT bottomy,
    509                                   FX_FLOAT spacew,
    510                                   FX_FLOAT fontsize_v,
    511                                   const CFX_WideString& text) {
    512   if (m_Top < topy) {
    513     m_Top = topy;
    514   }
    515   if (m_Bottom > bottomy) {
    516     m_Bottom = bottomy;
    517   }
    518   if (m_MaxFontSizeV < fontsize_v) {
    519     m_MaxFontSizeV = fontsize_v;
    520   }
    521   int i;
    522   for (i = 0; i < m_TextList.GetSize(); i++) {
    523     CTextBox* pText = m_TextList.GetAt(i);
    524     if (pText->m_Left > leftx) {
    525       break;
    526     }
    527   }
    528   CTextBox* pText = new CTextBox;
    529   pText->m_Text = text;
    530   pText->m_Left = leftx;
    531   pText->m_Right = rightx;
    532   pText->m_Top = topy;
    533   pText->m_Bottom = bottomy;
    534   pText->m_SpaceWidth = spacew;
    535   pText->m_FontSizeV = fontsize_v;
    536   pText->m_pColumn = NULL;
    537   m_TextList.InsertAt(i, pText);
    538 }
    539 FX_BOOL GetIntersection(FX_FLOAT low1,
    540                         FX_FLOAT high1,
    541                         FX_FLOAT low2,
    542                         FX_FLOAT high2,
    543                         FX_FLOAT& interlow,
    544                         FX_FLOAT& interhigh);
    545 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) {
    546   FX_FLOAT inter_top, inter_bottom;
    547   if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
    548                        inter_bottom, inter_top)) {
    549     return FALSE;
    550   }
    551   FX_FLOAT inter_h = inter_top - inter_bottom;
    552   if (inter_h < (m_Top - m_Bottom) / 2 &&
    553       inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
    554     return FALSE;
    555   }
    556   FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
    557   for (int i = 0; i < m_TextList.GetSize(); i++) {
    558     CTextBox* pText = m_TextList.GetAt(i);
    559     for (int j = 0; j < pOther->m_TextList.GetSize(); j++) {
    560       CTextBox* pOtherText = pOther->m_TextList.GetAt(j);
    561       FX_FLOAT inter_left, inter_right;
    562       if (!GetIntersection(pText->m_Left, pText->m_Right, pOtherText->m_Left,
    563                            pOtherText->m_Right, inter_left, inter_right)) {
    564         continue;
    565       }
    566       FX_FLOAT inter_w = inter_right - inter_left;
    567       if (inter_w < pText->m_SpaceWidth / 2 &&
    568           inter_w < pOtherText->m_SpaceWidth / 2) {
    569         continue;
    570       }
    571       if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
    572           dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
    573         return FALSE;
    574       }
    575     }
    576   }
    577   return TRUE;
    578 }
    579 void CTextBaseLine::Merge(CTextBaseLine* pOther) {
    580   for (int i = 0; i < pOther->m_TextList.GetSize(); i++) {
    581     CTextBox* pText = pOther->m_TextList.GetAt(i);
    582     InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
    583                   pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
    584   }
    585 }
    586 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) {
    587   int i;
    588   for (i = 0; i < m_TextList.GetSize(); i++) {
    589     CTextBox* pText = m_TextList.GetAt(i);
    590     if (pText->m_Text != L" ") {
    591       break;
    592     }
    593   }
    594   if (i == m_TextList.GetSize()) {
    595     return FALSE;
    596   }
    597   CTextBox* pText = m_TextList.GetAt(i);
    598   leftx = pText->m_Left;
    599   for (i = m_TextList.GetSize() - 1; i >= 0; i--) {
    600     CTextBox* pText = m_TextList.GetAt(i);
    601     if (pText->m_Text != L" ") {
    602       break;
    603     }
    604   }
    605   pText = m_TextList.GetAt(i);
    606   rightx = pText->m_Right;
    607   return TRUE;
    608 }
    609 void CTextBaseLine::MergeBoxes() {
    610   int i = 0;
    611   while (1) {
    612     if (i >= m_TextList.GetSize() - 1) {
    613       break;
    614     }
    615     CTextBox* pThisText = m_TextList.GetAt(i);
    616     CTextBox* pNextText = m_TextList.GetAt(i + 1);
    617     FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
    618     FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0)
    619                           ? pNextText->m_SpaceWidth
    620                           : pThisText->m_SpaceWidth;
    621     if (spacew > 0.0 && dx < spacew * 2) {
    622       pThisText->m_Right = pNextText->m_Right;
    623       if (dx > spacew * 1.5) {
    624         pThisText->m_Text += L"  ";
    625       } else if (dx > spacew / 3) {
    626         pThisText->m_Text += L' ';
    627       }
    628       pThisText->m_Text += pNextText->m_Text;
    629       pThisText->m_SpaceWidth =
    630           pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth;
    631       m_TextList.RemoveAt(i + 1);
    632       delete pNextText;
    633     } else {
    634       i++;
    635     }
    636   }
    637 }
    638 void CTextBaseLine::WriteOutput(CFX_WideString& str,
    639                                 FX_FLOAT leftx,
    640                                 FX_FLOAT pagewidth,
    641                                 int iTextWidth) {
    642   int lastpos = -1;
    643   for (int i = 0; i < m_TextList.GetSize(); i++) {
    644     CTextBox* pText = m_TextList.GetAt(i);
    645     int xpos;
    646     if (pText->m_pColumn) {
    647       xpos =
    648           (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth +
    649                 0.5);
    650       xpos -= pText->m_Text.GetLength();
    651     } else {
    652       xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
    653     }
    654     if (xpos <= lastpos) {
    655       xpos = lastpos + 1;
    656     }
    657     for (int j = lastpos + 1; j < xpos; j++) {
    658       str += ' ';
    659     }
    660     CFX_WideString sSrc(pText->m_Text);
    661     NormalizeString(sSrc);
    662     str += sSrc;
    663     str += ' ';
    664     lastpos = xpos + pText->m_Text.GetLength();
    665   }
    666 }
    667 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) {
    668   minchars = 0;
    669   for (int i = 0; i < m_TextList.GetSize(); i++) {
    670     CTextBox* pText = m_TextList.GetAt(i);
    671     if (pText->m_Right - pText->m_Left < 0.002) {
    672       continue;
    673     }
    674     count += pText->m_Text.GetLength();
    675     width += pText->m_Right - pText->m_Left;
    676     minchars += pText->m_Text.GetLength() + 1;
    677   }
    678 }
    679 #define PI 3.1415926535897932384626433832795
    680 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) {
    681   int total_count = 0, rotated_count[3] = {0, 0, 0};
    682   FX_POSITION pos = page.GetFirstObjectPosition();
    683   while (pos) {
    684     CPDF_PageObject* pObj = page.GetNextObject(pos);
    685     if (pObj->m_Type != PDFPAGE_TEXT) {
    686       continue;
    687     }
    688     total_count++;
    689     CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
    690     FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
    691     if (angle == 0.0) {
    692       continue;
    693     }
    694     int degree = (int)(angle * 180 / PI + 0.5);
    695     if (degree % 90) {
    696       continue;
    697     }
    698     if (degree < 0) {
    699       degree += 360;
    700     }
    701     int index = degree / 90 % 3 - 1;
    702     if (index < 0) {
    703       continue;
    704     }
    705     rotated_count[index]++;
    706   }
    707   if (total_count == 0) {
    708     return;
    709   }
    710   CFX_Matrix matrix;
    711   if (rotated_count[0] > total_count * 2 / 3) {
    712     matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
    713   } else if (rotated_count[1] > total_count * 2 / 3) {
    714     matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
    715   } else if (rotated_count[2] > total_count * 2 / 3) {
    716     matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
    717   } else {
    718     return;
    719   }
    720   page.Transform(matrix);
    721   page_bbox.Transform(&matrix);
    722 }
    723 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines,
    724                              CPDF_Document* pDoc,
    725                              CPDF_Dictionary* pPage,
    726                              int iMinWidth,
    727                              FX_DWORD flags) {
    728   lines.RemoveAll();
    729   if (!pPage) {
    730     return;
    731   }
    732   CPDF_Page page;
    733   page.Load(pDoc, pPage);
    734   CPDF_ParseOptions options;
    735   options.m_bTextOnly = TRUE;
    736   options.m_bSeparateForm = FALSE;
    737   page.ParseContent(&options);
    738   CFX_FloatRect page_bbox = page.GetPageBBox();
    739   if (flags & PDF2TXT_AUTO_ROTATE) {
    740     CheckRotate(page, page_bbox);
    741   }
    742   CTextPage texts;
    743   texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
    744   texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
    745   texts.m_bBreakSpace = TRUE;
    746   FX_POSITION pos = page.GetFirstObjectPosition();
    747   while (pos) {
    748     CPDF_PageObject* pObject = page.GetNextObject(pos);
    749     if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
    750       CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right,
    751                          pObject->m_Top);
    752       if (!page_bbox.Contains(rect)) {
    753         continue;
    754       }
    755     }
    756     texts.ProcessObject(pObject);
    757   }
    758   texts.WriteOutput(lines, iMinWidth);
    759 }
    760 void PDF_GetPageText(CFX_ByteStringArray& lines,
    761                      CPDF_Document* pDoc,
    762                      CPDF_Dictionary* pPage,
    763                      int iMinWidth,
    764                      FX_DWORD flags) {
    765   lines.RemoveAll();
    766   CFX_WideStringArray wlines;
    767   PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
    768   for (int i = 0; i < wlines.GetSize(); i++) {
    769     CFX_WideString wstr = wlines[i];
    770     CFX_ByteString str;
    771     for (int c = 0; c < wstr.GetLength(); c++) {
    772       str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
    773     }
    774     lines.Add(str);
    775   }
    776 }
    777 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
    778                                CPDF_Document* pDoc,
    779                                CPDF_Dictionary* pPage,
    780                                FX_DWORD flags) {
    781   buffer.EstimateSize(0, 10240);
    782   CPDF_Page page;
    783   page.Load(pDoc, pPage);
    784   CPDF_ParseOptions options;
    785   options.m_bTextOnly = TRUE;
    786   options.m_bSeparateForm = FALSE;
    787   page.ParseContent(&options);
    788   GetTextStream_Unicode(buffer, &page, TRUE, NULL);
    789 }
    790