Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include <algorithm>
      8 #include <cctype>
      9 #include <cwctype>
     10 #include <memory>
     11 
     12 #include "core/include/fpdfapi/fpdf_module.h"
     13 #include "core/include/fpdfapi/fpdf_page.h"
     14 #include "core/include/fpdfapi/fpdf_pageobj.h"
     15 #include "core/include/fpdfapi/fpdf_resource.h"
     16 #include "core/include/fpdftext/fpdf_text.h"
     17 #include "core/include/fxcrt/fx_bidi.h"
     18 #include "core/include/fxcrt/fx_ext.h"
     19 #include "core/include/fxcrt/fx_ucd.h"
     20 #include "text_int.h"
     21 
     22 namespace {
     23 
     24 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
     25   if (curChar < 255) {
     26     return FALSE;
     27   }
     28   if ((curChar >= 0x0600 && curChar <= 0x06FF) ||
     29       (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
     30       (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
     31       (curChar >= 0x0400 && curChar <= 0x04FF) ||
     32       (curChar >= 0x0500 && curChar <= 0x052F) ||
     33       (curChar >= 0xA640 && curChar <= 0xA69F) ||
     34       (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
     35       (curChar >= 0x2000 && curChar <= 0x206F)) {
     36     return FALSE;
     37   }
     38   return TRUE;
     39 }
     40 
     41 FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) {
     42   if (threshold < 300) {
     43     return threshold / 2.0f;
     44   }
     45   if (threshold < 500) {
     46     return threshold / 4.0f;
     47   }
     48   if (threshold < 700) {
     49     return threshold / 5.0f;
     50   }
     51   return threshold / 6.0f;
     52 }
     53 
     54 FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj,
     55                              const CFX_Matrix& matrix) {
     56   FX_FLOAT baseSpace = 0.0;
     57   const int nItems = pTextObj->CountItems();
     58   if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) {
     59     FX_BOOL bAllChar = TRUE;
     60     FX_FLOAT spacing = matrix.TransformDistance(
     61         pTextObj->m_TextState.GetObject()->m_CharSpace);
     62     baseSpace = spacing;
     63     for (int i = 0; i < nItems; i++) {
     64       CPDF_TextObjectItem item;
     65       pTextObj->GetItemInfo(i, &item);
     66       if (item.m_CharCode == (FX_DWORD)-1) {
     67         FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
     68         FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000;
     69         baseSpace = std::min(baseSpace, kerning + spacing);
     70         bAllChar = FALSE;
     71       }
     72     }
     73     if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) {
     74       baseSpace = 0.0;
     75     }
     76   }
     77   return baseSpace;
     78 }
     79 
      // Font size that CPDF_TextPage::GetCharInfo() reports for a character that
      // has no text object or whose text object has no font.
      80 const FX_FLOAT kDefaultFontSize = 1.0f;
     81 
     82 }  // namespace
     83 
     84 CPDFText_ParseOptions::CPDFText_ParseOptions()
     85     : m_bGetCharCodeOnly(FALSE),
     86       m_bNormalizeObjs(TRUE),
     87       m_bOutputHyphen(FALSE) {}
     88 
     89 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage,
     90                                              int flags) {
     91   return new CPDF_TextPage(pPage, flags);
     92 }
     93 
     94 IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(
     95     const IPDF_TextPage* pTextPage) {
     96   return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr;
     97 }
     98 
     99 IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() {
    100   return new CPDF_LinkExtract();
    101 }
    102 
     // Shorthand macros for wide-character literals (*_CHAR) and wide-string
     // literals (the rest). Their uses lie outside the part of the file shown
     // here.
     103 #define TEXT_BLANK_CHAR L' '
     104 #define TEXT_LINEFEED_CHAR L'\n'
     105 #define TEXT_RETURN_CHAR L'\r'
     106 #define TEXT_EMPTY L""
     107 #define TEXT_BLANK L" "
     108 #define TEXT_RETURN_LINEFEED L"\r\n"
     109 #define TEXT_LINEFEED L"\n"
     // NOTE(review): by its name this looks like a char-ratio gap tolerance;
     // confirm against the code that uses it.
     110 #define TEXT_CHARRATIO_GAPDELTA 0.070
    111 
    112 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags)
    113     : m_pPage(pPage),
    114       m_charList(512),
    115       m_TempCharList(50),
    116       m_parserflag(flags),
    117       m_pPreTextObj(nullptr),
    118       m_bIsParsed(false),
    119       m_TextlineDir(-1),
    120       m_CurlineRect(0, 0, 0, 0) {
    121   m_TextBuf.EstimateSize(0, 10240);
    122   pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(),
    123                           (int)pPage->GetPageHeight(), 0);
    124 }
    125 
    126 void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) {
    127   m_ParseOptions.m_bNormalizeObjs = bNormalize;
    128 }
    129 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
    130   switch (charInfo.m_Unicode) {
    131     case 0x2:
    132     case 0x3:
    133     case 0x93:
    134     case 0x94:
    135     case 0x96:
    136     case 0x97:
    137     case 0x98:
    138     case 0xfffe:
    139       return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
    140     default:
    141       return false;
    142   }
    143 }
    144 FX_BOOL CPDF_TextPage::ParseTextPage() {
    145   m_bIsParsed = false;
    146   if (!m_pPage)
    147     return FALSE;
    148 
    149   m_TextBuf.Clear();
    150   m_charList.RemoveAll();
    151   m_pPreTextObj = NULL;
    152   ProcessObject();
    153   m_bIsParsed = true;
    154   if (!m_ParseOptions.m_bGetCharCodeOnly) {
    155     m_CharIndex.RemoveAll();
    156     int nCount = m_charList.GetSize();
    157     if (nCount) {
    158       m_CharIndex.Add(0);
    159     }
    160     for (int i = 0; i < nCount; i++) {
    161       int indexSize = m_CharIndex.GetSize();
    162       FX_BOOL bNormal = FALSE;
    163       PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i);
    164       if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
    165         bNormal = TRUE;
    166       } else if (charinfo.m_Unicode == 0 || IsControlChar(charinfo)) {
    167         bNormal = FALSE;
    168       } else {
    169         bNormal = TRUE;
    170       }
    171       if (bNormal) {
    172         if (indexSize % 2) {
    173           m_CharIndex.Add(1);
    174         } else {
    175           if (indexSize <= 0) {
    176             continue;
    177           }
    178           m_CharIndex.SetAt(indexSize - 1,
    179                             m_CharIndex.GetAt(indexSize - 1) + 1);
    180         }
    181       } else {
    182         if (indexSize % 2) {
    183           if (indexSize <= 0) {
    184             continue;
    185           }
    186           m_CharIndex.SetAt(indexSize - 1, i + 1);
    187         } else {
    188           m_CharIndex.Add(i + 1);
    189         }
    190       }
    191     }
    192     int indexSize = m_CharIndex.GetSize();
    193     if (indexSize % 2) {
    194       m_CharIndex.RemoveAt(indexSize - 1);
    195     }
    196   }
    197   return TRUE;
    198 }
    199 int CPDF_TextPage::CountChars() const {
    200   if (m_ParseOptions.m_bGetCharCodeOnly) {
    201     return m_TextBuf.GetSize();
    202   }
    203   return m_charList.GetSize();
    204 }
    205 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
    206   int indexSize = m_CharIndex.GetSize();
    207   int count = 0;
    208   for (int i = 0; i < indexSize; i += 2) {
    209     count += m_CharIndex.GetAt(i + 1);
    210     if (count > TextIndex) {
    211       return TextIndex - count + m_CharIndex.GetAt(i + 1) +
    212              m_CharIndex.GetAt(i);
    213     }
    214   }
    215   return -1;
    216 }
    217 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
    218   int indexSize = m_CharIndex.GetSize();
    219   int count = 0;
    220   for (int i = 0; i < indexSize; i += 2) {
    221     count += m_CharIndex.GetAt(i + 1);
    222     if (m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) {
    223       if (CharIndex - m_CharIndex.GetAt(i) < 0) {
    224         return -1;
    225       }
    226       return CharIndex - m_CharIndex.GetAt(i) + count -
    227              m_CharIndex.GetAt(i + 1);
    228     }
    229   }
    230   return -1;
    231 }
    232 void CPDF_TextPage::GetRectArray(int start,
    233                                  int nCount,
    234                                  CFX_RectArray& rectArray) const {
    235   if (m_ParseOptions.m_bGetCharCodeOnly) {
    236     return;
    237   }
    238   if (start < 0 || nCount == 0) {
    239     return;
    240   }
    241   if (!m_bIsParsed) {
    242     return;
    243   }
    244   PAGECHAR_INFO info_curchar;
    245   CPDF_TextObject* pCurObj = NULL;
    246   CFX_FloatRect rect;
    247   int curPos = start;
    248   FX_BOOL flagNewRect = TRUE;
    249   if (nCount + start > m_charList.GetSize() || nCount == -1) {
    250     nCount = m_charList.GetSize() - start;
    251   }
    252   while (nCount--) {
    253     info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++);
    254     if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
    255       continue;
    256     }
    257     if (info_curchar.m_CharBox.Width() < 0.01 ||
    258         info_curchar.m_CharBox.Height() < 0.01) {
    259       continue;
    260     }
    261     if (!pCurObj) {
    262       pCurObj = info_curchar.m_pTextObj;
    263     }
    264     if (pCurObj != info_curchar.m_pTextObj) {
    265       rectArray.Add(rect);
    266       pCurObj = info_curchar.m_pTextObj;
    267       flagNewRect = TRUE;
    268     }
    269     if (flagNewRect) {
    270       FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY;
    271       CFX_Matrix matrix, matrix_reverse;
    272       info_curchar.m_pTextObj->GetTextMatrix(&matrix);
    273       matrix.Concat(info_curchar.m_Matrix);
    274       matrix_reverse.SetReverse(matrix);
    275       matrix_reverse.Transform(orgX, orgY);
    276       rect.left = info_curchar.m_CharBox.left;
    277       rect.right = info_curchar.m_CharBox.right;
    278       if (pCurObj->GetFont()->GetTypeDescent()) {
    279         rect.bottom = orgY +
    280                       pCurObj->GetFont()->GetTypeDescent() *
    281                           pCurObj->GetFontSize() / 1000;
    282         FX_FLOAT xPosTemp = orgX;
    283         matrix.Transform(xPosTemp, rect.bottom);
    284       } else {
    285         rect.bottom = info_curchar.m_CharBox.bottom;
    286       }
    287       if (pCurObj->GetFont()->GetTypeAscent()) {
    288         rect.top =
    289             orgY +
    290             pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
    291         FX_FLOAT xPosTemp =
    292             orgX +
    293             GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
    294                 pCurObj->GetFontSize() / 1000;
    295         matrix.Transform(xPosTemp, rect.top);
    296       } else {
    297         rect.top = info_curchar.m_CharBox.top;
    298       }
    299       flagNewRect = FALSE;
    300       rect = info_curchar.m_CharBox;
    301       rect.Normalize();
    302     } else {
    303       info_curchar.m_CharBox.Normalize();
    304       if (rect.left > info_curchar.m_CharBox.left) {
    305         rect.left = info_curchar.m_CharBox.left;
    306       }
    307       if (rect.right < info_curchar.m_CharBox.right) {
    308         rect.right = info_curchar.m_CharBox.right;
    309       }
    310       if (rect.top < info_curchar.m_CharBox.top) {
    311         rect.top = info_curchar.m_CharBox.top;
    312       }
    313       if (rect.bottom > info_curchar.m_CharBox.bottom) {
    314         rect.bottom = info_curchar.m_CharBox.bottom;
    315       }
    316     }
    317   }
    318   rectArray.Add(rect);
    319   return;
    320 }
    321 int CPDF_TextPage::GetIndexAtPos(CPDF_Point point,
    322                                  FX_FLOAT xTolerance,
    323                                  FX_FLOAT yTolerance) const {
    324   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
    325     return -3;
    326 
    327   int pos = 0;
    328   int NearPos = -1;
    329   double xdif = 5000, ydif = 5000;
    330   while (pos < m_charList.GetSize()) {
    331     PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos));
    332     CFX_FloatRect charrect = charinfo.m_CharBox;
    333     if (charrect.Contains(point.x, point.y)) {
    334       break;
    335     }
    336     if (xTolerance > 0 || yTolerance > 0) {
    337       CFX_FloatRect charRectExt;
    338       charrect.Normalize();
    339       charRectExt.left = charrect.left - xTolerance / 2;
    340       charRectExt.right = charrect.right + xTolerance / 2;
    341       charRectExt.top = charrect.top + yTolerance / 2;
    342       charRectExt.bottom = charrect.bottom - yTolerance / 2;
    343       if (charRectExt.Contains(point.x, point.y)) {
    344         double curXdif, curYdif;
    345         curXdif = FXSYS_fabs(point.x - charrect.left) <
    346                           FXSYS_fabs(point.x - charrect.right)
    347                       ? FXSYS_fabs(point.x - charrect.left)
    348                       : FXSYS_fabs(point.x - charrect.right);
    349         curYdif = FXSYS_fabs(point.y - charrect.bottom) <
    350                           FXSYS_fabs(point.y - charrect.top)
    351                       ? FXSYS_fabs(point.y - charrect.bottom)
    352                       : FXSYS_fabs(point.y - charrect.top);
    353         if (curYdif + curXdif < xdif + ydif) {
    354           ydif = curYdif;
    355           xdif = curXdif;
    356           NearPos = pos;
    357         }
    358       }
    359     }
    360     ++pos;
    361   }
    362   if (pos >= m_charList.GetSize()) {
    363     pos = NearPos;
    364   }
    365   return pos;
    366 }
    367 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
    368   CFX_WideString strText;
    369   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
    370     return strText;
    371 
    372   int nCount = m_charList.GetSize();
    373   int pos = 0;
    374   FX_FLOAT posy = 0;
    375   FX_BOOL IsContainPreChar = FALSE;
    376   FX_BOOL ISAddLineFeed = FALSE;
    377   while (pos < nCount) {
    378     PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
    379     if (IsRectIntersect(rect, charinfo.m_CharBox)) {
    380       if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar &&
    381           ISAddLineFeed) {
    382         posy = charinfo.m_OriginY;
    383         if (strText.GetLength() > 0) {
    384           strText += L"\r\n";
    385         }
    386       }
    387       IsContainPreChar = TRUE;
    388       ISAddLineFeed = FALSE;
    389       if (charinfo.m_Unicode) {
    390         strText += charinfo.m_Unicode;
    391       }
    392     } else if (charinfo.m_Unicode == 32) {
    393       if (IsContainPreChar && charinfo.m_Unicode) {
    394         strText += charinfo.m_Unicode;
    395         IsContainPreChar = FALSE;
    396         ISAddLineFeed = FALSE;
    397       }
    398     } else {
    399       IsContainPreChar = FALSE;
    400       ISAddLineFeed = TRUE;
    401     }
    402   }
    403   return strText;
    404 }
    405 void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect,
    406                                         CFX_RectArray& resRectArray) const {
    407   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
    408     return;
    409 
    410   CFX_FloatRect curRect;
    411   FX_BOOL flagNewRect = TRUE;
    412   CPDF_TextObject* pCurObj = NULL;
    413   int nCount = m_charList.GetSize();
    414   int pos = 0;
    415   while (pos < nCount) {
    416     PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
    417     if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
    418       continue;
    419     }
    420     if (IsRectIntersect(rect, info_curchar.m_CharBox)) {
    421       if (!pCurObj) {
    422         pCurObj = info_curchar.m_pTextObj;
    423       }
    424       if (pCurObj != info_curchar.m_pTextObj) {
    425         resRectArray.Add(curRect);
    426         pCurObj = info_curchar.m_pTextObj;
    427         flagNewRect = TRUE;
    428       }
    429       if (flagNewRect) {
    430         curRect = info_curchar.m_CharBox;
    431         flagNewRect = FALSE;
    432         curRect.Normalize();
    433       } else {
    434         info_curchar.m_CharBox.Normalize();
    435         if (curRect.left > info_curchar.m_CharBox.left) {
    436           curRect.left = info_curchar.m_CharBox.left;
    437         }
    438         if (curRect.right < info_curchar.m_CharBox.right) {
    439           curRect.right = info_curchar.m_CharBox.right;
    440         }
    441         if (curRect.top < info_curchar.m_CharBox.top) {
    442           curRect.top = info_curchar.m_CharBox.top;
    443         }
    444         if (curRect.bottom > info_curchar.m_CharBox.bottom) {
    445           curRect.bottom = info_curchar.m_CharBox.bottom;
    446         }
    447       }
    448     }
    449   }
    450   resRectArray.Add(curRect);
    451   return;
    452 }
    453 int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x,
    454                                  FX_FLOAT y,
    455                                  FX_FLOAT xTolerance,
    456                                  FX_FLOAT yTolerance) const {
    457   if (m_ParseOptions.m_bGetCharCodeOnly) {
    458     return -3;
    459   }
    460   CPDF_Point point(x, y);
    461   return GetIndexAtPos(point, xTolerance, yTolerance);
    462 }
    463 
    464 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
    465   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
    466     return;
    467 
    468   if (index < 0 || index >= m_charList.GetSize())
    469     return;
    470 
    471   const PAGECHAR_INFO* charinfo =
    472       static_cast<PAGECHAR_INFO*>(m_charList.GetAt(index));
    473   info->m_Charcode = charinfo->m_CharCode;
    474   info->m_OriginX = charinfo->m_OriginX;
    475   info->m_OriginY = charinfo->m_OriginY;
    476   info->m_Unicode = charinfo->m_Unicode;
    477   info->m_Flag = charinfo->m_Flag;
    478   info->m_CharBox = charinfo->m_CharBox;
    479   info->m_pTextObj = charinfo->m_pTextObj;
    480   if (charinfo->m_pTextObj && charinfo->m_pTextObj->GetFont()) {
    481     info->m_FontSize = charinfo->m_pTextObj->GetFontSize();
    482   } else {
    483     info->m_FontSize = kDefaultFontSize;
    484   }
    485   info->m_Matrix.Copy(charinfo->m_Matrix);
    486 }
    487 
    488 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
    489                                              int32_t& nCount) const {
    490   PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
    491   PAGECHAR_INFO charinfo2 =
    492       *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
    493   if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
    494       FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
    495     return;
    496   }
    497   if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
    498     PAGECHAR_INFO charinfo1 = charinfo;
    499     int startIndex = start;
    500     while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
    501            charinfo1.m_Index == charinfo.m_Index) {
    502       startIndex--;
    503       if (startIndex < 0) {
    504         break;
    505       }
    506       charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex);
    507     }
    508     startIndex++;
    509     start = startIndex;
    510   }
    511   if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
    512     PAGECHAR_INFO charinfo3 = charinfo2;
    513     int endIndex = start + nCount - 1;
    514     while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
    515            charinfo3.m_Index == charinfo2.m_Index) {
    516       endIndex++;
    517       if (endIndex >= m_charList.GetSize()) {
    518         break;
    519       }
    520       charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex);
    521     }
    522     endIndex--;
    523     nCount = endIndex - start + 1;
    524   }
    525 }
    526 CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
    527   if (!m_bIsParsed || nCount == 0)
    528     return L"";
    529 
    530   if (start < 0)
    531     start = 0;
    532 
    533   if (nCount == -1) {
    534     nCount = m_charList.GetSize() - start;
    535     return m_TextBuf.GetWideString().Mid(start,
    536                                          m_TextBuf.GetWideString().GetLength());
    537   }
    538   if (nCount <= 0 || m_charList.GetSize() <= 0) {
    539     return L"";
    540   }
    541   if (nCount + start > m_charList.GetSize() - 1) {
    542     nCount = m_charList.GetSize() - start;
    543   }
    544   if (nCount <= 0) {
    545     return L"";
    546   }
    547   CheckMarkedContentObject(start, nCount);
    548   int startindex = 0;
    549   PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
    550   int startOffset = 0;
    551   while (charinfo.m_Index == -1) {
    552     startOffset++;
    553     if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) {
    554       return L"";
    555     }
    556     charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset);
    557   }
    558   startindex = charinfo.m_Index;
    559   charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
    560   int nCountOffset = 0;
    561   while (charinfo.m_Index == -1) {
    562     nCountOffset++;
    563     if (nCountOffset >= nCount) {
    564       return L"";
    565     }
    566     charinfo =
    567         *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1);
    568   }
    569   nCount = start + nCount - nCountOffset - startindex;
    570   if (nCount <= 0) {
    571     return L"";
    572   }
    573   return m_TextBuf.GetWideString().Mid(startindex, nCount);
    574 }
    575 int CPDF_TextPage::CountRects(int start, int nCount) {
    576   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed || start < 0)
    577     return -1;
    578 
    579   if (nCount == -1 || nCount + start > m_charList.GetSize()) {
    580     nCount = m_charList.GetSize() - start;
    581   }
    582   m_SelRects.RemoveAll();
    583   GetRectArray(start, nCount, m_SelRects);
    584   return m_SelRects.GetSize();
    585 }
    586 void CPDF_TextPage::GetRect(int rectIndex,
    587                             FX_FLOAT& left,
    588                             FX_FLOAT& top,
    589                             FX_FLOAT& right,
    590                             FX_FLOAT& bottom) const {
    591   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
    592     return;
    593 
    594   if (rectIndex < 0 || rectIndex >= m_SelRects.GetSize())
    595     return;
    596 
    597   left = m_SelRects.GetAt(rectIndex).left;
    598   top = m_SelRects.GetAt(rectIndex).top;
    599   right = m_SelRects.GetAt(rectIndex).right;
    600   bottom = m_SelRects.GetAt(rectIndex).bottom;
    601 }
    602 
    603 FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate) {
    604   if (m_ParseOptions.m_bGetCharCodeOnly) {
    605     return FALSE;
    606   }
    607   if (end == start) {
    608     return FALSE;
    609   }
    610   FPDF_CHAR_INFO info_start;
    611   FPDF_CHAR_INFO info_end;
    612   GetCharInfo(start, &info_start);
    613   GetCharInfo(end, &info_end);
    614   while (info_end.m_CharBox.Width() == 0 || info_end.m_CharBox.Height() == 0) {
    615     if (--end <= start)
    616       return FALSE;
    617 
    618     GetCharInfo(end, &info_end);
    619   }
    620   FX_FLOAT dx = (info_end.m_OriginX - info_start.m_OriginX);
    621   FX_FLOAT dy = (info_end.m_OriginY - info_start.m_OriginY);
    622   if (dx == 0) {
    623     if (dy > 0) {
    624       Rotate = 90;
    625     } else if (dy < 0) {
    626       Rotate = 270;
    627     } else {
    628       Rotate = 0;
    629     }
    630   } else {
    631     float a = FXSYS_atan2(dy, dx);
    632     Rotate = (int)(a * 180 / FX_PI + 0.5);
    633   }
    634   if (Rotate < 0) {
    635     Rotate = -Rotate;
    636   } else if (Rotate > 0) {
    637     Rotate = 360 - Rotate;
    638   }
    639   return TRUE;
    640 }
    641 
    642 FX_BOOL CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect,
    643                                          int& Rotate) {
    644   if (m_ParseOptions.m_bGetCharCodeOnly) {
    645     return FALSE;
    646   }
    647   int start, end, count,
    648       n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom,
    649                                TRUE);
    650   if (n < 1) {
    651     return FALSE;
    652   }
    653   if (n > 1) {
    654     GetBoundedSegment(n - 1, start, count);
    655     end = start + count - 1;
    656     GetBoundedSegment(0, start, count);
    657   } else {
    658     GetBoundedSegment(0, start, count);
    659     end = start + count - 1;
    660   }
    661   return GetBaselineRotate(start, end, Rotate);
    662 }
    663 FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) {
    664   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
    665     return FALSE;
    666 
    667   if (rectIndex < 0 || rectIndex > m_SelRects.GetSize())
    668     return FALSE;
    669 
    670   CFX_FloatRect rect = m_SelRects.GetAt(rectIndex);
    671   return GetBaselineRotate(rect, Rotate);
    672 }
    673 int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left,
    674                                         FX_FLOAT top,
    675                                         FX_FLOAT right,
    676                                         FX_FLOAT bottom,
    677                                         FX_BOOL bContains) {
    678   if (m_ParseOptions.m_bGetCharCodeOnly)
    679     return -1;
    680 
    681   m_Segment.RemoveAll();
    682   if (!m_bIsParsed)
    683     return -1;
    684 
    685   CFX_FloatRect rect(left, bottom, right, top);
    686   rect.Normalize();
    687   int nCount = m_charList.GetSize();
    688   int pos = 0;
    689   FPDF_SEGMENT segment;
    690   segment.m_Start = 0;
    691   segment.m_nCount = 0;
    692   int segmentStatus = 0;
    693   FX_BOOL IsContainPreChar = FALSE;
    694   while (pos < nCount) {
    695     PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos);
    696     if (bContains && rect.Contains(charinfo.m_CharBox)) {
    697       if (segmentStatus == 0 || segmentStatus == 2) {
    698         segment.m_Start = pos;
    699         segment.m_nCount = 1;
    700         segmentStatus = 1;
    701       } else if (segmentStatus == 1) {
    702         segment.m_nCount++;
    703       }
    704       IsContainPreChar = TRUE;
    705     } else if (!bContains &&
    706                (IsRectIntersect(rect, charinfo.m_CharBox) ||
    707                 rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) {
    708       if (segmentStatus == 0 || segmentStatus == 2) {
    709         segment.m_Start = pos;
    710         segment.m_nCount = 1;
    711         segmentStatus = 1;
    712       } else if (segmentStatus == 1) {
    713         segment.m_nCount++;
    714       }
    715       IsContainPreChar = TRUE;
    716     } else if (charinfo.m_Unicode == 32) {
    717       if (IsContainPreChar == TRUE) {
    718         if (segmentStatus == 0 || segmentStatus == 2) {
    719           segment.m_Start = pos;
    720           segment.m_nCount = 1;
    721           segmentStatus = 1;
    722         } else if (segmentStatus == 1) {
    723           segment.m_nCount++;
    724         }
    725         IsContainPreChar = FALSE;
    726       } else {
    727         if (segmentStatus == 1) {
    728           segmentStatus = 2;
    729           m_Segment.Add(segment);
    730           segment.m_Start = 0;
    731           segment.m_nCount = 0;
    732         }
    733       }
    734     } else {
    735       if (segmentStatus == 1) {
    736         segmentStatus = 2;
    737         m_Segment.Add(segment);
    738         segment.m_Start = 0;
    739         segment.m_nCount = 0;
    740       }
    741       IsContainPreChar = FALSE;
    742     }
    743     pos++;
    744   }
    745   if (segmentStatus == 1) {
    746     segmentStatus = 2;
    747     m_Segment.Add(segment);
    748     segment.m_Start = 0;
    749     segment.m_nCount = 0;
    750   }
    751   return m_Segment.GetSize();
    752 }
    753 void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const {
    754   if (m_ParseOptions.m_bGetCharCodeOnly) {
    755     return;
    756   }
    757   if (index < 0 || index >= m_Segment.GetSize()) {
    758     return;
    759   }
    760   start = m_Segment.GetAt(index).m_Start;
    761   count = m_Segment.GetAt(index).m_nCount;
    762 }
    763 int CPDF_TextPage::GetWordBreak(int index, int direction) const {
    764   if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed)
    765     return -1;
    766 
    767   if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT)
    768     return -1;
    769 
    770   if (index < 0 || index >= m_charList.GetSize())
    771     return -1;
    772 
    773   PAGECHAR_INFO charinfo;
    774   charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
    775   if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
    776     return index;
    777   }
    778   if (!IsLetter(charinfo.m_Unicode)) {
    779     return index;
    780   }
    781   int breakPos = index;
    782   if (direction == FPDFTEXT_LEFT) {
    783     while (--breakPos > 0) {
    784       charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
    785       if (!IsLetter(charinfo.m_Unicode)) {
    786         return breakPos;
    787       }
    788     }
    789   } else if (direction == FPDFTEXT_RIGHT) {
    790     while (++breakPos < m_charList.GetSize()) {
    791       charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
    792       if (!IsLetter(charinfo.m_Unicode)) {
    793         return breakPos;
    794       }
    795     }
    796   }
    797   return breakPos;
    798 }
    799 int32_t CPDF_TextPage::FindTextlineFlowDirection() {
    800   if (!m_pPage) {
    801     return -1;
    802   }
    803   const int32_t nPageWidth = (int32_t)((CPDF_Page*)m_pPage)->GetPageWidth();
    804   const int32_t nPageHeight = (int32_t)((CPDF_Page*)m_pPage)->GetPageHeight();
    805   CFX_ByteArray nHorizontalMask;
    806   if (!nHorizontalMask.SetSize(nPageWidth)) {
    807     return -1;
    808   }
    809   uint8_t* pDataH = nHorizontalMask.GetData();
    810   CFX_ByteArray nVerticalMask;
    811   if (!nVerticalMask.SetSize(nPageHeight)) {
    812     return -1;
    813   }
    814   uint8_t* pDataV = nVerticalMask.GetData();
    815   int32_t index = 0;
    816   FX_FLOAT fLineHeight = 0.0f;
    817   CPDF_PageObject* pPageObj = NULL;
    818   FX_POSITION pos = NULL;
    819   pos = m_pPage->GetFirstObjectPosition();
    820   if (!pos) {
    821     return -1;
    822   }
    823   while (pos) {
    824     pPageObj = m_pPage->GetNextObject(pos);
    825     if (NULL == pPageObj) {
    826       continue;
    827     }
    828     if (PDFPAGE_TEXT != pPageObj->m_Type) {
    829       continue;
    830     }
    831     int32_t minH =
    832         (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left;
    833     int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth
    834                        ? nPageWidth
    835                        : (int32_t)pPageObj->m_Right;
    836     int32_t minV =
    837         (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj->m_Bottom;
    838     int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight
    839                        ? nPageHeight
    840                        : (int32_t)pPageObj->m_Top;
    841     if (minH >= maxH || minV >= maxV) {
    842       continue;
    843     }
    844     FXSYS_memset(pDataH + minH, 1, maxH - minH);
    845     FXSYS_memset(pDataV + minV, 1, maxV - minV);
    846     if (fLineHeight <= 0.0f) {
    847       fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
    848     }
    849     pPageObj = NULL;
    850   }
    851   int32_t nStartH = 0;
    852   int32_t nEndH = 0;
    853   FX_FLOAT nSumH = 0.0f;
    854   for (index = 0; index < nPageWidth; index++)
    855     if (1 == nHorizontalMask[index]) {
    856       break;
    857     }
    858   nStartH = index;
    859   for (index = nPageWidth; index > 0; index--)
    860     if (1 == nHorizontalMask[index - 1]) {
    861       break;
    862     }
    863   nEndH = index;
    864   for (index = nStartH; index < nEndH; index++) {
    865     nSumH += nHorizontalMask[index];
    866   }
    867   nSumH /= nEndH - nStartH;
    868   int32_t nStartV = 0;
    869   int32_t nEndV = 0;
    870   FX_FLOAT nSumV = 0.0f;
    871   for (index = 0; index < nPageHeight; index++)
    872     if (1 == nVerticalMask[index]) {
    873       break;
    874     }
    875   nStartV = index;
    876   for (index = nPageHeight; index > 0; index--)
    877     if (1 == nVerticalMask[index - 1]) {
    878       break;
    879     }
    880   nEndV = index;
    881   for (index = nStartV; index < nEndV; index++) {
    882     nSumV += nVerticalMask[index];
    883   }
    884   nSumV /= nEndV - nStartV;
    885   if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) {
    886     return 0;
    887   }
    888   if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) {
    889     return 1;
    890   }
    891   if (nSumH > 0.8f) {
    892     return 0;
    893   }
    894   if (nSumH - nSumV > 0.0f) {
    895     return 0;
    896   }
    897   if (nSumV - nSumH > 0.0f) {
    898     return 1;
    899   }
    900   return -1;
    901 }
    902 void CPDF_TextPage::ProcessObject() {
    903   CPDF_PageObject* pPageObj = NULL;
    904   if (!m_pPage) {
    905     return;
    906   }
    907   FX_POSITION pos;
    908   pos = m_pPage->GetFirstObjectPosition();
    909   if (!pos) {
    910     return;
    911   }
    912   m_TextlineDir = FindTextlineFlowDirection();
    913   int nCount = 0;
    914   while (pos) {
    915     pPageObj = m_pPage->GetNextObject(pos);
    916     if (pPageObj) {
    917       if (pPageObj->m_Type == PDFPAGE_TEXT) {
    918         CFX_Matrix matrix;
    919         ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos);
    920         nCount++;
    921       } else if (pPageObj->m_Type == PDFPAGE_FORM) {
    922         CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0);
    923         ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix);
    924       }
    925     }
    926     pPageObj = NULL;
    927   }
    928   int count = m_LineObj.GetSize();
    929   for (int i = 0; i < count; i++) {
    930     ProcessTextObject(m_LineObj.GetAt(i));
    931   }
    932   m_LineObj.RemoveAll();
    933   CloseTempLine();
    934 }
    935 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
    936                                       const CFX_Matrix& formMatrix) {
    937   CPDF_PageObject* pPageObj = NULL;
    938   FX_POSITION pos;
    939   if (!pFormObj) {
    940     return;
    941   }
    942   pos = pFormObj->m_pForm->GetFirstObjectPosition();
    943   if (!pos) {
    944     return;
    945   }
    946   CFX_Matrix curFormMatrix;
    947   curFormMatrix.Copy(pFormObj->m_FormMatrix);
    948   curFormMatrix.Concat(formMatrix);
    949   while (pos) {
    950     pPageObj = pFormObj->m_pForm->GetNextObject(pos);
    951     if (pPageObj) {
    952       if (pPageObj->m_Type == PDFPAGE_TEXT) {
    953         ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos);
    954       } else if (pPageObj->m_Type == PDFPAGE_FORM) {
    955         ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix);
    956       }
    957     }
    958     pPageObj = NULL;
    959   }
    960 }
    961 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const {
    962   if (charCode == -1) {
    963     return 0;
    964   }
    965   int w = pFont->GetCharWidthF(charCode);
    966   if (w == 0) {
    967     CFX_ByteString str;
    968     pFont->AppendChar(str, charCode);
    969     w = pFont->GetStringWidth(str, 1);
    970     if (w == 0) {
    971       FX_RECT BBox;
    972       pFont->GetCharBBox(charCode, BBox);
    973       w = BBox.right - BBox.left;
    974     }
    975   }
    976   return w;
    977 }
    978 void CPDF_TextPage::OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str) {
    979   int32_t start, count;
    980   CFX_BidiChar::Direction ret = pBidi->GetBidiInfo(&start, &count);
    981   if (ret == CFX_BidiChar::RIGHT) {
    982     for (int i = start + count - 1; i >= start; i--) {
    983       m_TextBuf.AppendChar(str.GetAt(i));
    984       m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
    985     }
    986   } else {
    987     int end = start + count;
    988     for (int i = start; i < end; i++) {
    989       m_TextBuf.AppendChar(str.GetAt(i));
    990       m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
    991     }
    992   }
    993 }
    994 void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i) {
    995   PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
    996   FX_WCHAR wChar = str.GetAt(i);
    997   if (!IsControlChar(Info)) {
    998     Info.m_Index = m_TextBuf.GetLength();
    999     if (wChar >= 0xFB00 && wChar <= 0xFB06) {
   1000       FX_WCHAR* pDst = NULL;
   1001       FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
   1002       if (nCount >= 1) {
   1003         pDst = FX_Alloc(FX_WCHAR, nCount);
   1004         FX_Unicode_GetNormalization(wChar, pDst);
   1005         for (int nIndex = 0; nIndex < nCount; nIndex++) {
   1006           PAGECHAR_INFO Info2 = Info;
   1007           Info2.m_Unicode = pDst[nIndex];
   1008           Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
   1009           m_TextBuf.AppendChar(Info2.m_Unicode);
   1010           if (!m_ParseOptions.m_bGetCharCodeOnly) {
   1011             m_charList.Add(Info2);
   1012           }
   1013         }
   1014         FX_Free(pDst);
   1015         return;
   1016       }
   1017     }
   1018     m_TextBuf.AppendChar(wChar);
   1019   } else {
   1020     Info.m_Index = -1;
   1021   }
   1022   if (!m_ParseOptions.m_bGetCharCodeOnly) {
   1023     m_charList.Add(Info);
   1024   }
   1025 }
   1026 void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i) {
   1027   PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
   1028   if (!IsControlChar(Info)) {
   1029     Info.m_Index = m_TextBuf.GetLength();
   1030     FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE);
   1031     FX_WCHAR* pDst = NULL;
   1032     FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
   1033     if (nCount >= 1) {
   1034       pDst = FX_Alloc(FX_WCHAR, nCount);
   1035       FX_Unicode_GetNormalization(wChar, pDst);
   1036       for (int nIndex = 0; nIndex < nCount; nIndex++) {
   1037         PAGECHAR_INFO Info2 = Info;
   1038         Info2.m_Unicode = pDst[nIndex];
   1039         Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
   1040         m_TextBuf.AppendChar(Info2.m_Unicode);
   1041         if (!m_ParseOptions.m_bGetCharCodeOnly) {
   1042           m_charList.Add(Info2);
   1043         }
   1044       }
   1045       FX_Free(pDst);
   1046       return;
   1047     }
   1048     Info.m_Unicode = wChar;
   1049     m_TextBuf.AppendChar(Info.m_Unicode);
   1050   } else {
   1051     Info.m_Index = -1;
   1052   }
   1053   if (!m_ParseOptions.m_bGetCharCodeOnly) {
   1054     m_charList.Add(Info);
   1055   }
   1056 }
   1057 void CPDF_TextPage::CloseTempLine() {
   1058   int count1 = m_TempCharList.GetSize();
   1059   if (count1 <= 0) {
   1060     return;
   1061   }
   1062   std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
   1063   CFX_WideString str = m_TempTextBuf.GetWideString();
   1064   CFX_WordArray order;
   1065   FX_BOOL bR2L = FALSE;
   1066   int32_t start = 0, count = 0;
   1067   int nR2L = 0, nL2R = 0;
   1068   FX_BOOL bPrevSpace = FALSE;
   1069   for (int i = 0; i < str.GetLength(); i++) {
   1070     if (str.GetAt(i) == 32) {
   1071       if (bPrevSpace) {
   1072         m_TempTextBuf.Delete(i, 1);
   1073         m_TempCharList.Delete(i);
   1074         str.Delete(i);
   1075         count1--;
   1076         i--;
   1077         continue;
   1078       }
   1079       bPrevSpace = TRUE;
   1080     } else {
   1081       bPrevSpace = FALSE;
   1082     }
   1083     if (pBidiChar->AppendChar(str.GetAt(i))) {
   1084       CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
   1085       order.Add(start);
   1086       order.Add(count);
   1087       order.Add(ret);
   1088       if (!bR2L) {
   1089         if (ret == CFX_BidiChar::RIGHT) {
   1090           nR2L++;
   1091         } else if (ret == CFX_BidiChar::LEFT) {
   1092           nL2R++;
   1093         }
   1094       }
   1095     }
   1096   }
   1097   if (pBidiChar->EndChar()) {
   1098     CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
   1099     order.Add(start);
   1100     order.Add(count);
   1101     order.Add(ret);
   1102     if (!bR2L) {
   1103       if (ret == CFX_BidiChar::RIGHT) {
   1104         nR2L++;
   1105       } else if (ret == CFX_BidiChar::LEFT) {
   1106         nL2R++;
   1107       }
   1108     }
   1109   }
   1110   if (nR2L > 0 && nR2L >= nL2R) {
   1111     bR2L = TRUE;
   1112   }
   1113   if (m_parserflag == FPDFTEXT_RLTB || bR2L) {
   1114     int count = order.GetSize();
   1115     for (int i = count - 1; i > 0; i -= 3) {
   1116       int ret = order.GetAt(i);
   1117       int start = order.GetAt(i - 2);
   1118       int count1 = order.GetAt(i - 1);
   1119       if (ret == 2 || ret == 0) {
   1120         for (int j = start + count1 - 1; j >= start; j--) {
   1121           AddCharInfoByRLDirection(str, j);
   1122         }
   1123       } else {
   1124         int j = i;
   1125         FX_BOOL bSymbol = FALSE;
   1126         while (j > 0 && order.GetAt(j) != 2) {
   1127           bSymbol = !order.GetAt(j);
   1128           j -= 3;
   1129         }
   1130         int end = start + count1;
   1131         int n = 0;
   1132         if (bSymbol) {
   1133           n = j + 6;
   1134         } else {
   1135           n = j + 3;
   1136         }
   1137         if (n >= i) {
   1138           for (int m = start; m < end; m++) {
   1139             AddCharInfoByLRDirection(str, m);
   1140           }
   1141         } else {
   1142           j = i;
   1143           i = n;
   1144           for (; n <= j; n += 3) {
   1145             int start = order.GetAt(n - 2);
   1146             int count1 = order.GetAt(n - 1);
   1147             int end = start + count1;
   1148             for (int m = start; m < end; m++) {
   1149               AddCharInfoByLRDirection(str, m);
   1150             }
   1151           }
   1152         }
   1153       }
   1154     }
   1155   } else {
   1156     int count = order.GetSize();
   1157     FX_BOOL bL2R = FALSE;
   1158     for (int i = 0; i < count; i += 3) {
   1159       int ret = order.GetAt(i + 2);
   1160       int start = order.GetAt(i);
   1161       int count1 = order.GetAt(i + 1);
   1162       if (ret == 2 || (i == 0 && ret == 0 && !bL2R)) {
   1163         int j = i + 3;
   1164         while (bR2L && j < count) {
   1165           if (order.GetAt(j + 2) == 1) {
   1166             break;
   1167           } else {
   1168             j += 3;
   1169           }
   1170         }
   1171         if (j == 3) {
   1172           i = -3;
   1173           bL2R = TRUE;
   1174           continue;
   1175         }
   1176         int end = m_TempCharList.GetSize() - 1;
   1177         if (j < count) {
   1178           end = order.GetAt(j) - 1;
   1179         }
   1180         i = j - 3;
   1181         for (int n = end; n >= start; n--) {
   1182           AddCharInfoByRLDirection(str, n);
   1183         }
   1184       } else {
   1185         int end = start + count1;
   1186         for (int n = start; n < end; n++) {
   1187           AddCharInfoByLRDirection(str, n);
   1188         }
   1189       }
   1190     }
   1191   }
   1192   order.RemoveAll();
   1193   m_TempCharList.RemoveAll();
   1194   m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
   1195 }
   1196 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj,
   1197                                       const CFX_Matrix& formMatrix,
   1198                                       FX_POSITION ObjPos) {
   1199   CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right,
   1200                    pTextObj->m_Top);
   1201   if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
   1202     return;
   1203   }
   1204   int count = m_LineObj.GetSize();
   1205   PDFTEXT_Obj Obj;
   1206   Obj.m_pTextObj = pTextObj;
   1207   Obj.m_formMatrix = formMatrix;
   1208   if (count == 0) {
   1209     m_LineObj.Add(Obj);
   1210     return;
   1211   }
   1212   if (IsSameAsPreTextObject(pTextObj, ObjPos)) {
   1213     return;
   1214   }
   1215   PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1);
   1216   CPDF_TextObjectItem item;
   1217   int nItem = prev_Obj.m_pTextObj->CountItems();
   1218   prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
   1219   FX_FLOAT prev_width =
   1220       GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
   1221       prev_Obj.m_pTextObj->GetFontSize() / 1000;
   1222   CFX_Matrix prev_matrix;
   1223   prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
   1224   prev_width = FXSYS_fabs(prev_width);
   1225   prev_matrix.Concat(prev_Obj.m_formMatrix);
   1226   prev_width = prev_matrix.TransformDistance(prev_width);
   1227   pTextObj->GetItemInfo(0, &item);
   1228   FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
   1229                         pTextObj->GetFontSize() / 1000;
   1230   this_width = FXSYS_fabs(this_width);
   1231   CFX_Matrix this_matrix;
   1232   pTextObj->GetTextMatrix(&this_matrix);
   1233   this_width = FXSYS_fabs(this_width);
   1234   this_matrix.Concat(formMatrix);
   1235   this_width = this_matrix.TransformDistance(this_width);
   1236   FX_FLOAT threshold =
   1237       prev_width > this_width ? prev_width / 4 : this_width / 4;
   1238   FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(),
   1239            prev_y = prev_Obj.m_pTextObj->GetPosY();
   1240   prev_Obj.m_formMatrix.Transform(prev_x, prev_y);
   1241   m_DisplayMatrix.Transform(prev_x, prev_y);
   1242   FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY();
   1243   formMatrix.Transform(this_x, this_y);
   1244   m_DisplayMatrix.Transform(this_x, this_y);
   1245   if (FXSYS_fabs(this_y - prev_y) > threshold * 2) {
   1246     for (int i = 0; i < count; i++) {
   1247       ProcessTextObject(m_LineObj.GetAt(i));
   1248     }
   1249     m_LineObj.RemoveAll();
   1250     m_LineObj.Add(Obj);
   1251     return;
   1252   }
   1253   int i = 0;
   1254   if (m_ParseOptions.m_bNormalizeObjs) {
   1255     for (i = count - 1; i >= 0; i--) {
   1256       PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i);
   1257       CFX_Matrix prev_matrix;
   1258       prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
   1259       FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(),
   1260                Prev_y = prev_Obj.m_pTextObj->GetPosY();
   1261       prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y);
   1262       m_DisplayMatrix.Transform(Prev_x, Prev_y);
   1263       if (this_x >= Prev_x) {
   1264         if (i == count - 1) {
   1265           m_LineObj.Add(Obj);
   1266         } else {
   1267           m_LineObj.InsertAt(i + 1, Obj);
   1268         }
   1269         break;
   1270       }
   1271     }
   1272     if (i < 0) {
   1273       m_LineObj.InsertAt(0, Obj);
   1274     }
   1275   } else {
   1276     m_LineObj.Add(Obj);
   1277   }
   1278 }
   1279 int32_t CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
   1280   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
   1281   CPDF_ContentMarkData* pMarkData =
   1282       (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
   1283   if (!pMarkData) {
   1284     return FPDFTEXT_MC_PASS;
   1285   }
   1286   int nContentMark = pMarkData->CountItems();
   1287   if (nContentMark < 1) {
   1288     return FPDFTEXT_MC_PASS;
   1289   }
   1290   CFX_WideString actText;
   1291   FX_BOOL bExist = FALSE;
   1292   CPDF_Dictionary* pDict = NULL;
   1293   int n = 0;
   1294   for (n = 0; n < nContentMark; n++) {
   1295     CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
   1296     CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
   1297     pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam()));
   1298     CPDF_String* temp =
   1299         ToString(pDict ? pDict->GetElement("ActualText") : nullptr);
   1300     if (temp) {
   1301       bExist = TRUE;
   1302       actText = temp->GetUnicodeText();
   1303     }
   1304   }
   1305   if (!bExist) {
   1306     return FPDFTEXT_MC_PASS;
   1307   }
   1308   if (m_pPreTextObj) {
   1309     if (CPDF_ContentMarkData* pPreMarkData =
   1310             (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) {
   1311       if (pPreMarkData->CountItems() == n) {
   1312         CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1);
   1313         if (pDict == item.GetParam()) {
   1314           return FPDFTEXT_MC_DONE;
   1315         }
   1316       }
   1317     }
   1318   }
   1319   CPDF_Font* pFont = pTextObj->GetFont();
   1320   FX_STRSIZE nItems = actText.GetLength();
   1321   if (nItems < 1) {
   1322     return FPDFTEXT_MC_PASS;
   1323   }
   1324   bExist = FALSE;
   1325   for (FX_STRSIZE i = 0; i < nItems; i++) {
   1326     FX_WCHAR wChar = actText.GetAt(i);
   1327     if (-1 == pFont->CharCodeFromUnicode(wChar)) {
   1328       continue;
   1329     } else {
   1330       bExist = TRUE;
   1331       break;
   1332     }
   1333   }
   1334   if (!bExist) {
   1335     return FPDFTEXT_MC_PASS;
   1336   }
   1337   bExist = FALSE;
   1338   for (FX_STRSIZE i = 0; i < nItems; i++) {
   1339     FX_WCHAR wChar = actText.GetAt(i);
   1340     if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
   1341       bExist = TRUE;
   1342       break;
   1343     }
   1344   }
   1345   if (!bExist) {
   1346     return FPDFTEXT_MC_DONE;
   1347   }
   1348   return FPDFTEXT_MC_DELAY;
   1349 }
   1350 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
   1351   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
   1352   CPDF_ContentMarkData* pMarkData =
   1353       (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
   1354   if (!pMarkData) {
   1355     return;
   1356   }
   1357   int nContentMark = pMarkData->CountItems();
   1358   if (nContentMark < 1) {
   1359     return;
   1360   }
   1361   CFX_WideString actText;
   1362   CPDF_Dictionary* pDict = NULL;
   1363   int n = 0;
   1364   for (n = 0; n < nContentMark; n++) {
   1365     CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
   1366     CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
   1367     pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam()));
   1368     CPDF_String* temp =
   1369         ToString(pDict ? pDict->GetElement("ActualText") : nullptr);
   1370     if (temp) {
   1371       actText = temp->GetUnicodeText();
   1372     }
   1373   }
   1374   FX_STRSIZE nItems = actText.GetLength();
   1375   if (nItems < 1) {
   1376     return;
   1377   }
   1378   CPDF_Font* pFont = pTextObj->GetFont();
   1379   CFX_Matrix formMatrix = Obj.m_formMatrix;
   1380   CFX_Matrix matrix;
   1381   pTextObj->GetTextMatrix(&matrix);
   1382   matrix.Concat(formMatrix);
   1383   FX_FLOAT fPosX = pTextObj->GetPosX();
   1384   FX_FLOAT fPosY = pTextObj->GetPosY();
   1385   int nCharInfoIndex = m_TextBuf.GetLength();
   1386   CFX_FloatRect charBox;
   1387   charBox.top = pTextObj->m_Top;
   1388   charBox.left = pTextObj->m_Left;
   1389   charBox.right = pTextObj->m_Right;
   1390   charBox.bottom = pTextObj->m_Bottom;
   1391   for (FX_STRSIZE k = 0; k < nItems; k++) {
   1392     FX_WCHAR wChar = actText.GetAt(k);
   1393     if (wChar <= 0x80 && !isprint(wChar)) {
   1394       wChar = 0x20;
   1395     }
   1396     if (wChar >= 0xFFFD) {
   1397       continue;
   1398     }
   1399     PAGECHAR_INFO charinfo;
   1400     charinfo.m_OriginX = fPosX;
   1401     charinfo.m_OriginY = fPosY;
   1402     charinfo.m_Index = nCharInfoIndex;
   1403     charinfo.m_Unicode = wChar;
   1404     charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
   1405     charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
   1406     charinfo.m_pTextObj = pTextObj;
   1407     charinfo.m_CharBox.top = charBox.top;
   1408     charinfo.m_CharBox.left = charBox.left;
   1409     charinfo.m_CharBox.right = charBox.right;
   1410     charinfo.m_CharBox.bottom = charBox.bottom;
   1411     charinfo.m_Matrix.Copy(matrix);
   1412     m_TempTextBuf.AppendChar(wChar);
   1413     m_TempCharList.Add(charinfo);
   1414   }
   1415 }
   1416 void CPDF_TextPage::FindPreviousTextObject(void) {
   1417   if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) {
   1418     return;
   1419   }
   1420   PAGECHAR_INFO preChar;
   1421   if (m_TempCharList.GetSize() >= 1) {
   1422     preChar =
   1423         *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
   1424   } else {
   1425     preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1);
   1426   }
   1427   if (preChar.m_pTextObj) {
   1428     m_pPreTextObj = preChar.m_pTextObj;
   1429   }
   1430 }
   1431 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
   1432                                     int32_t iBufStartAppend) {
   1433   int32_t i, j;
   1434   i = iCharListStartAppend;
   1435   j = m_TempCharList.GetSize() - 1;
   1436   for (; i < j; i++, j--) {
   1437     std::swap(m_TempCharList[i], m_TempCharList[j]);
   1438     std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
   1439   }
   1440   FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer();
   1441   i = iBufStartAppend;
   1442   j = m_TempTextBuf.GetLength() - 1;
   1443   for (; i < j; i++, j--) {
   1444     std::swap(pTempBuffer[i], pTempBuffer[j]);
   1445   }
   1446 }
   1447 FX_BOOL CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
   1448                                      const CPDF_Font* pFont,
   1449                                      int nItems) const {
   1450   std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar);
   1451   int32_t nR2L = 0;
   1452   int32_t nL2R = 0;
   1453   int32_t start = 0, count = 0;
   1454   CPDF_TextObjectItem item;
   1455   for (int32_t i = 0; i < nItems; i++) {
   1456     pTextObj->GetItemInfo(i, &item);
   1457     if (item.m_CharCode == (FX_DWORD)-1) {
   1458       continue;
   1459     }
   1460     CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
   1461     FX_WCHAR wChar = wstrItem.GetAt(0);
   1462     if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
   1463       wChar = (FX_WCHAR)item.m_CharCode;
   1464     }
   1465     if (!wChar) {
   1466       continue;
   1467     }
   1468     if (pBidiChar->AppendChar(wChar)) {
   1469       CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
   1470       if (ret == CFX_BidiChar::RIGHT) {
   1471         nR2L++;
   1472       } else if (ret == CFX_BidiChar::LEFT) {
   1473         nL2R++;
   1474       }
   1475     }
   1476   }
   1477   if (pBidiChar->EndChar()) {
   1478     CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count);
   1479     if (ret == CFX_BidiChar::RIGHT) {
   1480       nR2L++;
   1481     } else if (ret == CFX_BidiChar::LEFT) {
   1482       nL2R++;
   1483     }
   1484   }
   1485   return (nR2L > 0 && nR2L >= nL2R);
   1486 }
   1487 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
   1488   CPDF_TextObject* pTextObj = Obj.m_pTextObj;
   1489   if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
   1490     return;
   1491   }
   1492   CFX_Matrix formMatrix = Obj.m_formMatrix;
   1493   CPDF_Font* pFont = pTextObj->GetFont();
   1494   CFX_Matrix matrix;
   1495   pTextObj->GetTextMatrix(&matrix);
   1496   matrix.Concat(formMatrix);
   1497   int32_t bPreMKC = PreMarkedContent(Obj);
   1498   if (FPDFTEXT_MC_DONE == bPreMKC) {
   1499     m_pPreTextObj = pTextObj;
   1500     m_perMatrix.Copy(formMatrix);
   1501     return;
   1502   }
   1503   int result = 0;
   1504   if (m_pPreTextObj) {
   1505     result = ProcessInsertObject(pTextObj, formMatrix);
   1506     if (2 == result) {
   1507       m_CurlineRect =
   1508           CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
   1509                         Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
   1510     } else {
   1511       m_CurlineRect.Union(
   1512           CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
   1513                         Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top));
   1514     }
   1515     PAGECHAR_INFO generateChar;
   1516     if (result == 1) {
   1517       if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) {
   1518         if (!formMatrix.IsIdentity()) {
   1519           generateChar.m_Matrix.Copy(formMatrix);
   1520         }
   1521         m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
   1522         m_TempCharList.Add(generateChar);
   1523       }
   1524     } else if (result == 2) {
   1525       CloseTempLine();
   1526       if (m_TextBuf.GetSize()) {
   1527         if (m_ParseOptions.m_bGetCharCodeOnly) {
   1528           m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
   1529           m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
   1530         } else {
   1531           if (GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) {
   1532             m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
   1533             if (!formMatrix.IsIdentity()) {
   1534               generateChar.m_Matrix.Copy(formMatrix);
   1535             }
   1536             m_charList.Add(generateChar);
   1537           }
   1538           if (GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) {
   1539             m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
   1540             if (!formMatrix.IsIdentity()) {
   1541               generateChar.m_Matrix.Copy(formMatrix);
   1542             }
   1543             m_charList.Add(generateChar);
   1544           }
   1545         }
   1546       }
   1547     } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) {
   1548       int32_t nChars = pTextObj->CountChars();
   1549       if (nChars == 1) {
   1550         CPDF_TextObjectItem item;
   1551         pTextObj->GetCharInfo(0, &item);
   1552         CFX_WideString wstrItem =
   1553             pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
   1554         if (wstrItem.IsEmpty()) {
   1555           wstrItem += (FX_WCHAR)item.m_CharCode;
   1556         }
   1557         FX_WCHAR curChar = wstrItem.GetAt(0);
   1558         if (0x2D == curChar || 0xAD == curChar) {
   1559           return;
   1560         }
   1561       }
   1562       while (m_TempTextBuf.GetSize() > 0 &&
   1563              m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() -
   1564                                                  1) == 0x20) {
   1565         m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
   1566         m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
   1567       }
   1568       PAGECHAR_INFO* cha =
   1569           (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
   1570       m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
   1571       cha->m_Unicode = 0x2;
   1572       cha->m_Flag = FPDFTEXT_CHAR_HYPHEN;
   1573       m_TempTextBuf.AppendChar(0xfffe);
   1574     }
   1575   } else {
   1576     m_CurlineRect =
   1577         CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
   1578                       Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
   1579   }
   1580   if (FPDFTEXT_MC_DELAY == bPreMKC) {
   1581     ProcessMarkedContent(Obj);
   1582     m_pPreTextObj = pTextObj;
   1583     m_perMatrix.Copy(formMatrix);
   1584     return;
   1585   }
   1586   m_pPreTextObj = pTextObj;
   1587   m_perMatrix.Copy(formMatrix);
   1588   int nItems = pTextObj->CountItems();
   1589   FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix);
   1590 
   1591   const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems);
   1592   const FX_BOOL bIsBidiAndMirrorInverse =
   1593       bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
   1594   int32_t iBufStartAppend = m_TempTextBuf.GetLength();
   1595   int32_t iCharListStartAppend = m_TempCharList.GetSize();
   1596 
   1597   FX_FLOAT spacing = 0;
   1598   for (int i = 0; i < nItems; i++) {
   1599     CPDF_TextObjectItem item;
   1600     PAGECHAR_INFO charinfo;
   1601     charinfo.m_OriginX = 0;
   1602     charinfo.m_OriginY = 0;
   1603     pTextObj->GetItemInfo(i, &item);
   1604     if (item.m_CharCode == (FX_DWORD)-1) {
   1605       CFX_WideString str = m_TempTextBuf.GetWideString();
   1606       if (str.IsEmpty()) {
   1607         str = m_TextBuf.GetWideString();
   1608       }
   1609       if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
   1610         continue;
   1611       }
   1612       FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
   1613       spacing = -fontsize_h * item.m_OriginX / 1000;
   1614       continue;
   1615     }
   1616     FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace;
   1617     if (charSpace > 0.001) {
   1618       spacing += matrix.TransformDistance(charSpace);
   1619     } else if (charSpace < -0.001) {
   1620       spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
   1621     }
   1622     spacing -= baseSpace;
   1623     if (spacing && i > 0) {
   1624       int last_width = 0;
   1625       FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
   1626       FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
   1627       FX_FLOAT threshold = 0;
   1628       if (space_charcode != -1) {
   1629         threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
   1630       }
   1631       if (threshold > fontsize_h / 3) {
   1632         threshold = 0;
   1633       } else {
   1634         threshold /= 2;
   1635       }
   1636       if (threshold == 0) {
   1637         threshold = fontsize_h;
   1638         int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
   1639         threshold = this_width > last_width ? (FX_FLOAT)this_width
   1640                                             : (FX_FLOAT)last_width;
   1641         threshold = _NormalizeThreshold(threshold);
   1642         threshold = fontsize_h * threshold / 1000;
   1643       }
   1644       if (threshold && (spacing && spacing >= threshold)) {
   1645         charinfo.m_Unicode = TEXT_BLANK_CHAR;
   1646         charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
   1647         charinfo.m_pTextObj = pTextObj;
   1648         charinfo.m_Index = m_TextBuf.GetLength();
   1649         m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
   1650         charinfo.m_CharCode = -1;
   1651         charinfo.m_Matrix.Copy(formMatrix);
   1652         matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
   1653                          charinfo.m_OriginY);
   1654         charinfo.m_CharBox =
   1655             CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY,
   1656                           charinfo.m_OriginX, charinfo.m_OriginY);
   1657         m_TempCharList.Add(charinfo);
   1658       }
   1659       if (item.m_CharCode == (FX_DWORD)-1) {
   1660         continue;
   1661       }
   1662     }
   1663     spacing = 0;
   1664     CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
   1665     FX_BOOL bNoUnicode = FALSE;
   1666     FX_WCHAR wChar = wstrItem.GetAt(0);
   1667     if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
   1668       if (wstrItem.IsEmpty()) {
   1669         wstrItem += (FX_WCHAR)item.m_CharCode;
   1670       } else {
   1671         wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode);
   1672       }
   1673       bNoUnicode = TRUE;
   1674     }
   1675     charinfo.m_Index = -1;
   1676     charinfo.m_CharCode = item.m_CharCode;
   1677     if (bNoUnicode) {
   1678       charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
   1679     } else {
   1680       charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
   1681     }
   1682     charinfo.m_pTextObj = pTextObj;
   1683     charinfo.m_OriginX = 0, charinfo.m_OriginY = 0;
   1684     matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
   1685                      charinfo.m_OriginY);
   1686     FX_RECT rect(0, 0, 0, 0);
   1687     rect.Intersect(0, 0, 0, 0);
   1688     charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect);
   1689     charinfo.m_CharBox.top =
   1690         rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
   1691     charinfo.m_CharBox.left =
   1692         rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
   1693     charinfo.m_CharBox.right =
   1694         rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
   1695     charinfo.m_CharBox.bottom =
   1696         rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
   1697     if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
   1698       charinfo.m_CharBox.top =
   1699           charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
   1700     }
   1701     if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
   1702       charinfo.m_CharBox.right =
   1703           charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
   1704     }
   1705     matrix.TransformRect(charinfo.m_CharBox);
   1706     charinfo.m_Matrix.Copy(matrix);
   1707     if (wstrItem.IsEmpty()) {
   1708       charinfo.m_Unicode = 0;
   1709       m_TempCharList.Add(charinfo);
   1710       m_TempTextBuf.AppendChar(0xfffe);
   1711       continue;
   1712     } else {
   1713       int nTotal = wstrItem.GetLength();
   1714       FX_BOOL bDel = FALSE;
   1715       const int count = std::min(m_TempCharList.GetSize(), 7);
   1716       FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance(
   1717           (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
   1718       for (int n = m_TempCharList.GetSize();
   1719            n > m_TempCharList.GetSize() - count; n--) {
   1720         PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(n - 1);
   1721         if (charinfo1->m_CharCode == charinfo.m_CharCode &&
   1722             charinfo1->m_pTextObj->GetFont() ==
   1723                 charinfo.m_pTextObj->GetFont() &&
   1724             FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < threshold &&
   1725             FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < threshold) {
   1726           bDel = TRUE;
   1727           break;
   1728         }
   1729       }
   1730       if (!bDel) {
   1731         for (int nIndex = 0; nIndex < nTotal; nIndex++) {
   1732           charinfo.m_Unicode = wstrItem.GetAt(nIndex);
   1733           if (charinfo.m_Unicode) {
   1734             charinfo.m_Index = m_TextBuf.GetLength();
   1735             m_TempTextBuf.AppendChar(charinfo.m_Unicode);
   1736           } else {
   1737             m_TempTextBuf.AppendChar(0xfffe);
   1738           }
   1739           m_TempCharList.Add(charinfo);
   1740         }
   1741       } else if (i == 0) {
   1742         CFX_WideString str = m_TempTextBuf.GetWideString();
   1743         if (!str.IsEmpty() &&
   1744             str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
   1745           m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
   1746           m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
   1747         }
   1748       }
   1749     }
   1750   }
   1751   if (bIsBidiAndMirrorInverse) {
   1752     SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
   1753   }
   1754 }
   1755 int32_t CPDF_TextPage::GetTextObjectWritingMode(
   1756     const CPDF_TextObject* pTextObj) {
   1757   int32_t nChars = pTextObj->CountChars();
   1758   if (nChars == 1) {
   1759     return m_TextlineDir;
   1760   }
   1761   CPDF_TextObjectItem first, last;
   1762   pTextObj->GetCharInfo(0, &first);
   1763   pTextObj->GetCharInfo(nChars - 1, &last);
   1764   CFX_Matrix textMatrix;
   1765   pTextObj->GetTextMatrix(&textMatrix);
   1766   textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY);
   1767   textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY);
   1768   FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX);
   1769   FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY);
   1770   if (dX <= 0.0001f && dY <= 0.0001f) {
   1771     return -1;
   1772   }
   1773   CFX_VectorF v;
   1774   v.Set(dX, dY);
   1775   v.Normalize();
   1776   if (v.y <= 0.0872f) {
   1777     return v.x <= 0.0872f ? m_TextlineDir : 0;
   1778   }
   1779   if (v.x <= 0.0872f) {
   1780     return 1;
   1781   }
   1782   return m_TextlineDir;
   1783 }
   1784 FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) {
   1785   CFX_WideString strCurText = m_TempTextBuf.GetWideString();
   1786   if (strCurText.GetLength() == 0) {
   1787     strCurText = m_TextBuf.GetWideString();
   1788   }
   1789   FX_STRSIZE nCount = strCurText.GetLength();
   1790   int nIndex = nCount - 1;
   1791   FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
   1792   while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) {
   1793     wcTmp = strCurText.GetAt(--nIndex);
   1794   }
   1795   if (0x2D == wcTmp || 0xAD == wcTmp) {
   1796     if (--nIndex > 0) {
   1797       FX_WCHAR preChar = strCurText.GetAt((nIndex));
   1798       if (((preChar >= L'A' && preChar <= L'Z') ||
   1799            (preChar >= L'a' && preChar <= L'z')) &&
   1800           ((curChar >= L'A' && curChar <= L'Z') ||
   1801            (curChar >= L'a' && curChar <= L'z'))) {
   1802         return TRUE;
   1803       }
   1804     }
   1805     int size = m_TempCharList.GetSize();
   1806     PAGECHAR_INFO preChar;
   1807     if (size) {
   1808       preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
   1809     } else {
   1810       size = m_charList.GetSize();
   1811       if (size == 0) {
   1812         return FALSE;
   1813       }
   1814       preChar = (PAGECHAR_INFO)m_charList[size - 1];
   1815     }
   1816     if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag &&
   1817         (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode)) {
   1818       return TRUE;
   1819     }
   1820   }
   1821   return FALSE;
   1822 }
// Compares |pObj| with the previously processed text object (m_pPreTextObj,
// refreshed via FindPreviousTextObject()) and reports what kind of separator
// the geometry suggests between them.
//
// |formMatrix| is applied to the position of |pObj| before it is mapped into
// the previous object's text space.
//
// Return values, as produced by the code below:
//   0 - no separator condition was met;
//   1 - the horizontal gap between the end of the previous item and the
//       start of |pObj| exceeds the computed threshold (and neither
//       neighbouring character is a blank);
//   2 - the two objects are judged to be on different lines;
//   3 - as for 2 (or a lone hyphen object), and IsHyphen() reports a hyphen
//       at the end of the text collected so far.
// NOTE(review): what callers do with each code (insert blank / line break /
// join hyphenated word) is presumably decided in the caller - confirm there.
int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj,
                                       const CFX_Matrix& formMatrix) {
  FindPreviousTextObject();
  FX_BOOL bNewline = FALSE;
  // If the direction of |pObj| cannot be determined (-1, i.e. first and last
  // character origins coincide), use the direction of the previous object.
  int WritingMode = GetTextObjectWritingMode(pObj);
  if (WritingMode == -1) {
    WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
  }
  CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right,
                          pObj->m_Top);
  CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
                          m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
  // PrevItem: last item of the previous object; item: first item of |pObj|.
  CPDF_TextObjectItem PrevItem, item;
  int nItem = m_pPreTextObj->CountItems();
  m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
  pObj->GetItemInfo(0, &item);
  // Unicode of the first character of |pObj|; when the font has no mapping
  // the raw character code is used in its place.
  CFX_WideString wstrItem =
      pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
  if (wstrItem.IsEmpty()) {
    wstrItem += (FX_WCHAR)item.m_CharCode;
  }
  FX_WCHAR curChar = wstrItem.GetAt(0);
  if (WritingMode == 0) {
    // Mode 0 (displacement mostly along x): if both boxes are taller than
    // 4.5 units and their vertical extents do not overlap, it is a new line.
    if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
      FX_FLOAT top =
          this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
      FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
                                                            : prev_rect.bottom;
      if (bottom >= top) {
        if (IsHyphen(curChar)) {
          return 3;
        }
        return 2;
      }
    }
  } else if (WritingMode == 1) {
    // Mode 1 (displacement mostly along y): if both boxes are wider than a
    // tenth of their font size and |pObj| has no horizontal overlap with the
    // current line rectangle, it is a new line.
    if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
        prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
      FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left
                                                          : m_CurlineRect.left;
      FX_FLOAT right = this_rect.right < m_CurlineRect.right
                           ? this_rect.right
                           : m_CurlineRect.right;
      if (right <= left) {
        if (IsHyphen(curChar)) {
          return 3;
        }
        return 2;
      }
    }
  }
  // Widths of the two neighbouring glyphs, scaled by font size / 1000, and a
  // first threshold of one quarter of the larger width.
  FX_FLOAT last_pos = PrevItem.m_OriginX;
  int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
  FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
  last_width = FXSYS_fabs(last_width);
  int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
  FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
  this_width = FXSYS_fabs(this_width);
  FX_FLOAT threshold =
      last_width > this_width ? last_width / 4 : this_width / 4;
  // Map the position of |pObj| through |formMatrix| and then through the
  // inverse of the previous object's matrix, so that (x, y) is expressed
  // relative to the previous object's text space.
  CFX_Matrix prev_matrix, prev_reverse;
  m_pPreTextObj->GetTextMatrix(&prev_matrix);
  prev_matrix.Concat(m_perMatrix);
  prev_reverse.SetReverse(prev_matrix);
  FX_FLOAT x = pObj->GetPosX();
  FX_FLOAT y = pObj->GetPosY();
  formMatrix.Transform(x, y);
  prev_reverse.Transform(x, y);
  if (last_width < this_width) {
    threshold = prev_reverse.TransformDistance(threshold);
  }
  // rect1: previous object's horizontal extent with this object's vertical
  // extent; rect2: the previous object's box. After the Intersect call,
  // rect1 is empty when the two vertical extents do not overlap. rect3 keeps
  // the pre-intersection copy of rect1.
  CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
                      m_pPreTextObj->m_Right, pObj->m_Top);
  CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
                      m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
  CFX_FloatRect rect3 = rect1;
  rect1.Intersect(rect2);
  if (WritingMode == 0) {
    // A new line is assumed when the boxes do not overlap vertically (and
    // both are taller than 5), or when the vertical offset |y| leaves the
    // band [-3 * threshold, 2 * threshold]; for |y| < 1 the second test
    // additionally requires |x| < |y|.
    if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
        ((y > threshold * 2 || y < threshold * -3) &&
         (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) {
      bNewline = TRUE;
      if (nItem > 1) {
        CPDF_TextObjectItem tempItem;
        m_pPreTextObj->GetItemInfo(0, &tempItem);
        CFX_Matrix m;
        m_pPreTextObj->GetTextMatrix(&m);
        // Second chance: when the previous object advances to the right and
        // the display/text matrix components satisfy the bounds below, the
        // new-line verdict is withdrawn if either object's position lies
        // inside the other's vertical band (x range 0..1000).
        if (PrevItem.m_OriginX > tempItem.m_OriginX &&
            m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
            m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
            m.c < 0.1) {
          CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
                           m_pPreTextObj->m_Top);
          if (re.Contains(pObj->GetPosX(), pObj->GetPosY())) {
            bNewline = FALSE;
          } else {
            // Note: this |re| shadows the one declared just above.
            CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top);
            if (re.Contains(m_pPreTextObj->GetPosX(),
                            m_pPreTextObj->GetPosY())) {
              bNewline = FALSE;
            }
          }
        }
      }
    }
  }
  if (bNewline)
    return IsHyphen(curChar) ? 3 : 2;

  // A text object consisting of a single hyphen that follows a hyphen.
  int32_t nChars = pObj->CountChars();
  if (nChars == 1 && (0x2D == curChar || 0xAD == curChar) &&
      IsHyphen(curChar)) {
    return 3;
  }
  // Last Unicode character produced by the previous object's final item.
  CFX_WideString PrevStr =
      m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
  FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
  CFX_Matrix matrix;
  pObj->GetTextMatrix(&matrix);
  matrix.Concat(formMatrix);
  // Recompute the gap threshold from the larger raw glyph width:
  //   width <= 400 -> /2, (400, 700) -> /4, [700, 800] -> /5, > 800 -> /6.
  threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
  threshold = threshold > 400
                  ? (threshold < 700
                         ? threshold / 4
                         : (threshold > 800 ? threshold / 6 : threshold / 5))
                  : (threshold / 2);
  // Scale by the font size of whichever glyph was wider; when that is the
  // current glyph the distance is also carried into the previous object's
  // space.
  if (nLastWidth >= nThisWidth) {
    threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
  } else {
    threshold *= FXSYS_fabs(pObj->GetFontSize());
    threshold = matrix.TransformDistance(threshold);
    threshold = prev_reverse.TransformDistance(threshold);
  }
  threshold /= 1000;
  // NOTE(review): two very narrow threshold windows get a 1.5x boost; the
  // reason is not visible here - presumably a tuning for specific documents.
  if ((threshold < 1.4881 && threshold > 1.4879) ||
      (threshold < 1.39001 && threshold > 1.38999)) {
    threshold *= 1.5;
  }
  // Report a gap (1) when the distance between the end of the previous glyph
  // and |x| exceeds the threshold and neither neighbouring character is
  // already a blank.
  if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
      preChar != L' ') {
    // This inner test repeats the blank checks of the enclosing condition.
    if (curChar != L' ' && preChar != L' ') {
      if ((x - last_pos - last_width) > threshold ||
          (last_pos - x - last_width) > threshold) {
        return 1;
      }
      // The second operand here was already tested just above.
      if (x < 0 && (last_pos - x - last_width) > threshold) {
        return 1;
      }
      if ((x - last_pos - last_width) > this_width ||
          (x - last_pos - this_width) > last_width) {
        return 1;
      }
    }
  }
  return 0;
}
   1979 FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
   1980                                         CPDF_TextObject* pTextObj2) {
   1981   if (!pTextObj1 || !pTextObj2) {
   1982     return FALSE;
   1983   }
   1984   CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
   1985                          pTextObj2->m_Right, pTextObj2->m_Top);
   1986   CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
   1987                          pTextObj1->m_Right, pTextObj1->m_Top);
   1988   if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() &&
   1989       !m_ParseOptions.m_bGetCharCodeOnly) {
   1990     FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
   1991     int nCount = m_charList.GetSize();
   1992     if (nCount >= 2) {
   1993       PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2];
   1994       FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
   1995       if (dbXdif > dbSpace) {
   1996         return FALSE;
   1997       }
   1998     }
   1999   }
   2000   if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
   2001     rcPreObj.Intersect(rcCurObj);
   2002     if (rcPreObj.IsEmpty()) {
   2003       return FALSE;
   2004     }
   2005     if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
   2006         rcCurObj.Width() / 2) {
   2007       return FALSE;
   2008     }
   2009     if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
   2010       return FALSE;
   2011     }
   2012   }
   2013   int nPreCount = pTextObj2->CountItems();
   2014   int nCurCount = pTextObj1->CountItems();
   2015   if (nPreCount != nCurCount) {
   2016     return FALSE;
   2017   }
   2018   CPDF_TextObjectItem itemPer, itemCur;
   2019   for (int i = 0; i < nPreCount; i++) {
   2020     pTextObj2->GetItemInfo(i, &itemPer);
   2021     pTextObj1->GetItemInfo(i, &itemCur);
   2022     if (itemCur.m_CharCode != itemPer.m_CharCode) {
   2023       return FALSE;
   2024     }
   2025   }
   2026   if (FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) >
   2027           GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()) *
   2028               pTextObj2->GetFontSize() / 1000 * 0.9 ||
   2029       FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) >
   2030           std::max(std::max(rcPreObj.Height(), rcPreObj.Width()),
   2031                    pTextObj2->GetFontSize()) /
   2032               8) {
   2033     return FALSE;
   2034   }
   2035   return TRUE;
   2036 }
   2037 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
   2038                                              FX_POSITION ObjPos) {
   2039   if (!pTextObj) {
   2040     return FALSE;
   2041   }
   2042   int i = 0;
   2043   if (!ObjPos) {
   2044     ObjPos = m_pPage->GetLastObjectPosition();
   2045   }
   2046   CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos);
   2047   while (i < 5 && ObjPos) {
   2048     pObj = m_pPage->GetPrevObject(ObjPos);
   2049     if (pObj == pTextObj) {
   2050       continue;
   2051     }
   2052     if (pObj->m_Type != PDFPAGE_TEXT) {
   2053       continue;
   2054     }
   2055     if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) {
   2056       return TRUE;
   2057     }
   2058     i++;
   2059   }
   2060   return FALSE;
   2061 }
   2062 
   2063 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) {
   2064   int size = m_TempCharList.GetSize();
   2065   PAGECHAR_INFO preChar;
   2066   if (size) {
   2067     preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
   2068   } else {
   2069     size = m_charList.GetSize();
   2070     if (size == 0) {
   2071       return FALSE;
   2072     }
   2073     preChar = (PAGECHAR_INFO)m_charList[size - 1];
   2074   }
   2075   info.m_Index = m_TextBuf.GetLength();
   2076   info.m_Unicode = unicode;
   2077   info.m_pTextObj = NULL;
   2078   info.m_CharCode = -1;
   2079   info.m_Flag = FPDFTEXT_CHAR_GENERATED;
   2080   int preWidth = 0;
   2081   if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD)-1)
   2082     preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont());
   2083 
   2084   FX_FLOAT fFontSize = preChar.m_pTextObj ? preChar.m_pTextObj->GetFontSize()
   2085                                           : preChar.m_CharBox.Height();
   2086   if (!fFontSize)
   2087     fFontSize = kDefaultFontSize;
   2088 
   2089   info.m_OriginX = preChar.m_OriginX + preWidth * (fFontSize) / 1000;
   2090   info.m_OriginY = preChar.m_OriginY;
   2091   info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX,
   2092                                  info.m_OriginY);
   2093   return TRUE;
   2094 }
   2095 
   2096 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
   2097                                        const CFX_FloatRect& rect2) {
   2098   CFX_FloatRect rect = rect1;
   2099   rect.Intersect(rect2);
   2100   return !rect.IsEmpty();
   2101 }
   2102 FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) {
   2103   if (unicode < L'A') {
   2104     return FALSE;
   2105   }
   2106   if (unicode > L'Z' && unicode < L'a') {
   2107     return FALSE;
   2108   }
   2109   if (unicode > L'z') {
   2110     return FALSE;
   2111   }
   2112   return TRUE;
   2113 }
   2114 CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage)
   2115     : m_pTextPage(pTextPage),
   2116       m_flags(0),
   2117       m_findNextStart(-1),
   2118       m_findPreStart(-1),
   2119       m_bMatchCase(FALSE),
   2120       m_bMatchWholeWord(FALSE),
   2121       m_resStart(0),
   2122       m_resEnd(-1),
   2123       m_IsFind(FALSE) {
   2124   m_strText = m_pTextPage->GetPageText();
   2125   int nCount = pTextPage->CountChars();
   2126   if (nCount) {
   2127     m_CharIndex.Add(0);
   2128   }
   2129   for (int i = 0; i < nCount; i++) {
   2130     FPDF_CHAR_INFO info;
   2131     pTextPage->GetCharInfo(i, &info);
   2132     int indexSize = m_CharIndex.GetSize();
   2133     if (info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) {
   2134       if (indexSize % 2) {
   2135         m_CharIndex.Add(1);
   2136       } else {
   2137         if (indexSize <= 0) {
   2138           continue;
   2139         }
   2140         m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1);
   2141       }
   2142     } else {
   2143       if (indexSize % 2) {
   2144         if (indexSize <= 0) {
   2145           continue;
   2146         }
   2147         m_CharIndex.SetAt(indexSize - 1, i + 1);
   2148       } else {
   2149         m_CharIndex.Add(i + 1);
   2150       }
   2151     }
   2152   }
   2153   int indexSize = m_CharIndex.GetSize();
   2154   if (indexSize % 2) {
   2155     m_CharIndex.RemoveAt(indexSize - 1);
   2156   }
   2157 }
   2158 int CPDF_TextPageFind::GetCharIndex(int index) const {
   2159   return m_pTextPage->CharIndexFromTextIndex(index);
   2160   int indexSize = m_CharIndex.GetSize();
   2161   int count = 0;
   2162   for (int i = 0; i < indexSize; i += 2) {
   2163     count += m_CharIndex.GetAt(i + 1);
   2164     if (count > index) {
   2165       return index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i);
   2166     }
   2167   }
   2168   return -1;
   2169 }
   2170 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
   2171                                      int flags,
   2172                                      int startPos) {
   2173   if (!m_pTextPage) {
   2174     return FALSE;
   2175   }
   2176   if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
   2177     m_strText = m_pTextPage->GetPageText();
   2178   }
   2179   CFX_WideString findwhatStr = findwhat;
   2180   m_findWhat = findwhatStr;
   2181   m_flags = flags;
   2182   m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
   2183   if (m_strText.IsEmpty()) {
   2184     m_IsFind = FALSE;
   2185     return TRUE;
   2186   }
   2187   FX_STRSIZE len = findwhatStr.GetLength();
   2188   if (!m_bMatchCase) {
   2189     findwhatStr.MakeLower();
   2190     m_strText.MakeLower();
   2191   }
   2192   m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
   2193   m_findNextStart = startPos;
   2194   if (startPos == -1) {
   2195     m_findPreStart = m_strText.GetLength() - 1;
   2196   } else {
   2197     m_findPreStart = startPos;
   2198   }
   2199   m_csFindWhatArray.RemoveAll();
   2200   int i = 0;
   2201   while (i < len) {
   2202     if (findwhatStr.GetAt(i) != ' ') {
   2203       break;
   2204     }
   2205     i++;
   2206   }
   2207   if (i < len) {
   2208     ExtractFindWhat(findwhatStr);
   2209   } else {
   2210     m_csFindWhatArray.Add(findwhatStr);
   2211   }
   2212   if (m_csFindWhatArray.GetSize() <= 0) {
   2213     return FALSE;
   2214   }
   2215   m_IsFind = TRUE;
   2216   m_resStart = 0;
   2217   m_resEnd = -1;
   2218   return TRUE;
   2219 }
// Searches forward from m_findNextStart for the next occurrence of the
// pattern prepared by FindFirst().
//
// The pattern is held in m_csFindWhatArray as a sequence of pieces. Each
// non-empty piece must be found in order; between consecutive pieces only
// line feed, blank, carriage return or code 160 (no-break space) may appear.
// On a mismatch the loop resets itself (iWord = -1) and retries from just
// past the previous attempt's first piece.
//
// On success: sets m_resStart / m_resEnd (text indices), fills m_resArray
// with the rectangles of the match, advances m_findNextStart /
// m_findPreStart, and returns TRUE. Returns FALSE (clearing m_IsFind in most
// paths) when there is no page, no text, no valid start, or no match.
FX_BOOL CPDF_TextPageFind::FindNext() {
  if (!m_pTextPage) {
    return FALSE;
  }
  m_resArray.RemoveAll();
  if (m_findNextStart == -1) {
    return FALSE;
  }
  if (m_strText.IsEmpty()) {
    m_IsFind = FALSE;
    return m_IsFind;
  }
  int strLen = m_strText.GetLength();
  if (m_findNextStart > strLen - 1) {
    m_IsFind = FALSE;
    return m_IsFind;
  }
  int nCount = m_csFindWhatArray.GetSize();
  int nResultPos = 0;
  int nStartPos = 0;
  nStartPos = m_findNextStart;
  // Set when the first piece is empty and is not the only piece.
  FX_BOOL bSpaceStart = FALSE;
  for (int iWord = 0; iWord < nCount; iWord++) {
    CFX_WideString csWord = m_csFindWhatArray[iWord];
    if (csWord.IsEmpty()) {
      if (iWord == nCount - 1) {
        // Empty final piece: the character at the current position must be
        // one of the accepted separators, otherwise start over.
        FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
        if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR ||
            strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
          nResultPos = nStartPos + 1;
          break;
        }
        iWord = -1;
      } else if (iWord == 0) {
        bSpaceStart = TRUE;
      }
      continue;
    }
    int endIndex;
    nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
    if (nResultPos == -1) {
      m_IsFind = FALSE;
      return m_IsFind;
    }
    endIndex = nResultPos + csWord.GetLength() - 1;
    if (iWord == 0) {
      m_resStart = nResultPos;
    }
    FX_BOOL bMatch = TRUE;
    if (iWord != 0 && !bSpaceStart) {
      // Validate what lies between the previous piece and this one.
      int PreResEndPos = nStartPos;
      int curChar = csWord.GetAt(0);
      CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
      int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
      // Pieces that touch directly are accepted only when at least one of
      // the adjoining characters is an "ignore space" character.
      if (nStartPos == nResultPos &&
          !(_IsIgnoreSpaceCharacter(lastChar) ||
            _IsIgnoreSpaceCharacter(curChar))) {
        bMatch = FALSE;
      }
      // Everything in the gap must be a separator.
      for (int d = PreResEndPos; d < nResultPos; d++) {
        FX_WCHAR strInsert = m_strText.GetAt(d);
        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
          bMatch = FALSE;
          break;
        }
      }
    } else if (bSpaceStart) {
      // Pattern began with an empty piece: the character before this hit
      // must be a separator, and the match then starts on that separator.
      if (nResultPos > 0) {
        FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
            strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
          bMatch = FALSE;
          m_resStart = nResultPos;
        } else {
          m_resStart = nResultPos - 1;
        }
      }
    }
    if (m_bMatchWholeWord && bMatch) {
      bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
    }
    nStartPos = endIndex + 1;
    if (!bMatch) {
      // Restart from the first piece, resuming the search just after the
      // span that the first real piece occupied at m_resStart.
      iWord = -1;
      if (bSpaceStart) {
        nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
      } else {
        nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
      }
    }
  }
  // The match ends with the last character of the final piece.
  m_resEnd = nResultPos +
             m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1;
  m_IsFind = TRUE;
  int resStart = GetCharIndex(m_resStart);
  int resEnd = GetCharIndex(m_resEnd);
  m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray);
  // With FPDFTEXT_CONSECUTIVE the next search resumes one character after
  // the start of this match; otherwise it resumes after the end of it.
  if (m_flags & FPDFTEXT_CONSECUTIVE) {
    m_findNextStart = m_resStart + 1;
    m_findPreStart = m_resEnd - 1;
  } else {
    m_findNextStart = m_resEnd + 1;
    m_findPreStart = m_resStart - 1;
  }
  return m_IsFind;
}
   2327 FX_BOOL CPDF_TextPageFind::FindPrev() {
   2328   if (!m_pTextPage) {
   2329     return FALSE;
   2330   }
   2331   m_resArray.RemoveAll();
   2332   if (m_strText.IsEmpty() || m_findPreStart < 0) {
   2333     m_IsFind = FALSE;
   2334     return m_IsFind;
   2335   }
   2336   CPDF_TextPageFind findEngine(m_pTextPage);
   2337   FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
   2338   if (!ret) {
   2339     m_IsFind = FALSE;
   2340     return m_IsFind;
   2341   }
   2342   int order = -1, MatchedCount = 0;
   2343   while (ret) {
   2344     ret = findEngine.FindNext();
   2345     if (ret) {
   2346       int order1 = findEngine.GetCurOrder();
   2347       int MatchedCount1 = findEngine.GetMatchedCount();
   2348       if (((order1 + MatchedCount1) - 1) > m_findPreStart) {
   2349         break;
   2350       }
   2351       order = order1;
   2352       MatchedCount = MatchedCount1;
   2353     }
   2354   }
   2355   if (order == -1) {
   2356     m_IsFind = FALSE;
   2357     return m_IsFind;
   2358   }
   2359   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
   2360   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
   2361   m_IsFind = TRUE;
   2362   m_pTextPage->GetRectArray(order, MatchedCount, m_resArray);
   2363   if (m_flags & FPDFTEXT_CONSECUTIVE) {
   2364     m_findNextStart = m_resStart + 1;
   2365     m_findPreStart = m_resEnd - 1;
   2366   } else {
   2367     m_findNextStart = m_resEnd + 1;
   2368     m_findPreStart = m_resStart - 1;
   2369   }
   2370   return m_IsFind;
   2371 }
   2372 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
   2373   if (findwhat.IsEmpty()) {
   2374     return;
   2375   }
   2376   int index = 0;
   2377   while (1) {
   2378     CFX_WideString csWord = TEXT_EMPTY;
   2379     int ret =
   2380         ExtractSubString(csWord, findwhat.c_str(), index, TEXT_BLANK_CHAR);
   2381     if (csWord.IsEmpty()) {
   2382       if (ret) {
   2383         m_csFindWhatArray.Add(CFX_WideString(L""));
   2384         index++;
   2385         continue;
   2386       } else {
   2387         break;
   2388       }
   2389     }
   2390     int pos = 0;
   2391     while (pos < csWord.GetLength()) {
   2392       CFX_WideString curStr = csWord.Mid(pos, 1);
   2393       FX_WCHAR curChar = csWord.GetAt(pos);
   2394       if (_IsIgnoreSpaceCharacter(curChar)) {
   2395         if (pos > 0 && curChar == 0x2019) {
   2396           pos++;
   2397           continue;
   2398         }
   2399         if (pos > 0) {
   2400           CFX_WideString preStr = csWord.Mid(0, pos);
   2401           m_csFindWhatArray.Add(preStr);
   2402         }
   2403         m_csFindWhatArray.Add(curStr);
   2404         if (pos == csWord.GetLength() - 1) {
   2405           csWord.Empty();
   2406           break;
   2407         }
   2408         csWord = csWord.Right(csWord.GetLength() - pos - 1);
   2409         pos = 0;
   2410         continue;
   2411       }
   2412       pos++;
   2413     }
   2414     if (!csWord.IsEmpty()) {
   2415       m_csFindWhatArray.Add(csWord);
   2416     }
   2417     index++;
   2418   }
   2419 }
   2420 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
   2421                                             int startPos,
   2422                                             int endPos) {
   2423   FX_WCHAR char_left = 0;
   2424   FX_WCHAR char_right = 0;
   2425   int char_count = endPos - startPos + 1;
   2426   if (char_count < 1) {
   2427     return FALSE;
   2428   }
   2429   if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
   2430     return TRUE;
   2431   }
   2432   if (startPos - 1 >= 0) {
   2433     char_left = csPageText.GetAt(startPos - 1);
   2434   }
   2435   if (startPos + char_count < csPageText.GetLength()) {
   2436     char_right = csPageText.GetAt(startPos + char_count);
   2437   }
   2438   if ((char_left > 'A' && char_left < 'a') ||
   2439       (char_left > 'a' && char_left < 'z') ||
   2440       (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
   2441       (char_right > 'A' && char_right < 'a') ||
   2442       (char_right > 'a' && char_right < 'z') ||
   2443       (char_right > 0xfb00 && char_right < 0xfb06) ||
   2444       std::iswdigit(char_right)) {
   2445     return FALSE;
   2446   }
   2447   if (!(('A' > char_left || char_left > 'Z') &&
   2448         ('a' > char_left || char_left > 'z') &&
   2449         ('A' > char_right || char_right > 'Z') &&
   2450         ('a' > char_right || char_right > 'z'))) {
   2451     return FALSE;
   2452   }
   2453   if (char_count > 0) {
   2454     if (csPageText.GetAt(startPos) >= L'0' &&
   2455         csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
   2456         char_left <= L'9') {
   2457       return FALSE;
   2458     }
   2459     if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
   2460         char_right >= L'0' && char_right <= L'9') {
   2461       return FALSE;
   2462     }
   2463   }
   2464   return TRUE;
   2465 }
   2466 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
   2467                                             const FX_WCHAR* lpszFullString,
   2468                                             int iSubString,
   2469                                             FX_WCHAR chSep) {
   2470   if (!lpszFullString) {
   2471     return FALSE;
   2472   }
   2473   while (iSubString--) {
   2474     lpszFullString = FXSYS_wcschr(lpszFullString, chSep);
   2475     if (!lpszFullString) {
   2476       rString.Empty();
   2477       return FALSE;
   2478     }
   2479     lpszFullString++;
   2480     while (*lpszFullString == chSep) {
   2481       lpszFullString++;
   2482     }
   2483   }
   2484   const FX_WCHAR* lpchEnd = FXSYS_wcschr(lpszFullString, chSep);
   2485   int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
   2486                      : (int)FXSYS_wcslen(lpszFullString);
   2487   ASSERT(nLen >= 0);
   2488   FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
   2489                nLen * sizeof(FX_WCHAR));
   2490   rString.ReleaseBuffer();
   2491   return TRUE;
   2492 }
   2493 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
   2494   CFX_WideString str2;
   2495   str2.Empty();
   2496   int nlen = str.GetLength();
   2497   for (int i = nlen - 1; i >= 0; i--) {
   2498     str2 += str.GetAt(i);
   2499   }
   2500   return str2;
   2501 }
   2502 void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const {
   2503   rects.Copy(m_resArray);
   2504 }
   2505 int CPDF_TextPageFind::GetCurOrder() const {
   2506   return GetCharIndex(m_resStart);
   2507 }
   2508 int CPDF_TextPageFind::GetMatchedCount() const {
   2509   int resStart = GetCharIndex(m_resStart);
   2510   int resEnd = GetCharIndex(m_resEnd);
   2511   return resEnd - resStart + 1;
   2512 }
   2513 
   2514 CPDF_LinkExtract::CPDF_LinkExtract()
   2515     : m_pTextPage(nullptr), m_bIsParsed(false) {
   2516 }
   2517 
   2518 CPDF_LinkExtract::~CPDF_LinkExtract() {
   2519   DeleteLinkList();
   2520 }
   2521 
   2522 FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) {
   2523   if (!pTextPage || !pTextPage->IsParsed())
   2524     return FALSE;
   2525 
   2526   m_pTextPage = (const CPDF_TextPage*)pTextPage;
   2527   m_strPageText = m_pTextPage->GetPageText(0, -1);
   2528   DeleteLinkList();
   2529   if (m_strPageText.IsEmpty()) {
   2530     return FALSE;
   2531   }
   2532   ParseLink();
   2533   m_bIsParsed = true;
   2534   return TRUE;
   2535 }
   2536 
   2537 void CPDF_LinkExtract::DeleteLinkList() {
   2538   while (m_LinkList.GetSize()) {
   2539     CPDF_LinkExt* linkinfo = NULL;
   2540     linkinfo = m_LinkList.GetAt(0);
   2541     m_LinkList.RemoveAt(0);
   2542     delete linkinfo;
   2543   }
   2544   m_LinkList.RemoveAll();
   2545 }
   2546 int CPDF_LinkExtract::CountLinks() const {
   2547   if (!m_bIsParsed) {
   2548     return -1;
   2549   }
   2550   return m_LinkList.GetSize();
   2551 }
   2552 void CPDF_LinkExtract::ParseLink() {
   2553   int start = 0, pos = 0;
   2554   int TotalChar = m_pTextPage->CountChars();
   2555   while (pos < TotalChar) {
   2556     FPDF_CHAR_INFO pageChar;
   2557     m_pTextPage->GetCharInfo(pos, &pageChar);
   2558     if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 ||
   2559         pos == TotalChar - 1) {
   2560       int nCount = pos - start;
   2561       if (pos == TotalChar - 1) {
   2562         nCount++;
   2563       }
   2564       CFX_WideString strBeCheck;
   2565       strBeCheck = m_pTextPage->GetPageText(start, nCount);
   2566       if (strBeCheck.GetLength() > 5) {
   2567         while (strBeCheck.GetLength() > 0) {
   2568           FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
   2569           if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
   2570             strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
   2571             nCount--;
   2572           } else {
   2573             break;
   2574           }
   2575         }
   2576         if (nCount > 5 &&
   2577             (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
   2578           AppendToLinkList(start, nCount, strBeCheck);
   2579         }
   2580       }
   2581       start = ++pos;
   2582     } else {
   2583       pos++;
   2584     }
   2585   }
   2586 }
   2587 FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
   2588   CFX_WideString str = strBeCheck;
   2589   str.MakeLower();
   2590   if (str.Find(L"http://www.") != -1) {
   2591     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
   2592     return TRUE;
   2593   }
   2594   if (str.Find(L"http://") != -1) {
   2595     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
   2596     return TRUE;
   2597   }
   2598   if (str.Find(L"https://www.") != -1) {
   2599     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
   2600     return TRUE;
   2601   }
   2602   if (str.Find(L"https://") != -1) {
   2603     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
   2604     return TRUE;
   2605   }
   2606   if (str.Find(L"www.") != -1) {
   2607     strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
   2608     strBeCheck = L"http://" + strBeCheck;
   2609     return TRUE;
   2610   }
   2611   return FALSE;
   2612 }
   2613 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
   2614   int aPos = str.Find(L'@');
   2615   // Invalid when no '@'.
   2616   if (aPos < 1) {
   2617     return FALSE;
   2618   }
   2619 
   2620   // Check the local part.
   2621   int pPos = aPos;  // Used to track the position of '@' or '.'.
   2622   for (int i = aPos - 1; i >= 0; i--) {
   2623     FX_WCHAR ch = str.GetAt(i);
   2624     if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {
   2625       continue;
   2626     }
   2627     if (ch != L'.' || i == pPos - 1 || i == 0) {
   2628       if (i == aPos - 1) {
   2629         // There is '.' or invalid char before '@'.
   2630         return FALSE;
   2631       }
   2632       // End extracting for other invalid chars, '.' at the beginning, or
   2633       // consecutive '.'.
   2634       int removed_len = i == pPos - 1 ? i + 2 : i + 1;
   2635       str = str.Right(str.GetLength() - removed_len);
   2636       break;
   2637     }
   2638     // Found a valid '.'.
   2639     pPos = i;
   2640   }
   2641 
   2642   // Check the domain name part.
   2643   aPos = str.Find(L'@');
   2644   if (aPos < 1) {
   2645     return FALSE;
   2646   }
   2647   str.TrimRight(L'.');
   2648   // At least one '.' in domain name, but not at the beginning.
   2649   // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
   2650   // Check whether we should remove this check.
   2651   int ePos = str.Find(L'.', aPos + 1);
   2652   if (ePos == -1 || ePos == aPos + 1) {
   2653     return FALSE;
   2654   }
   2655   // Validate all other chars in domain name.
   2656   int nLen = str.GetLength();
   2657   pPos = 0;  // Used to track the position of '.'.
   2658   for (int i = aPos + 1; i < nLen; i++) {
   2659     FX_WCHAR wch = str.GetAt(i);
   2660     if (wch == L'-' || FXSYS_iswalnum(wch)) {
   2661       continue;
   2662     }
   2663     if (wch != L'.' || i == pPos + 1) {
   2664       // Domain name should end before invalid char.
   2665       int host_end = i == pPos + 1 ? i - 2 : i - 1;
   2666       if (pPos > 0 && host_end - aPos >= 3) {
   2667         // Trim the ending invalid chars if there is at least one '.' and name.
   2668         str = str.Left(host_end + 1);
   2669         break;
   2670       }
   2671       return FALSE;
   2672     }
   2673     pPos = i;
   2674   }
   2675 
   2676   if (str.Find(L"mailto:") == -1) {
   2677     str = L"mailto:" + str;
   2678   }
   2679   return TRUE;
   2680 }
   2681 
   2682 void CPDF_LinkExtract::AppendToLinkList(int start,
   2683                                         int count,
   2684                                         const CFX_WideString& strUrl) {
   2685   CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
   2686   linkInfo->m_strUrl = strUrl;
   2687   linkInfo->m_Start = start;
   2688   linkInfo->m_Count = count;
   2689   m_LinkList.Add(linkInfo);
   2690 }
   2691 
   2692 CFX_WideString CPDF_LinkExtract::GetURL(int index) const {
   2693   if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
   2694     return L"";
   2695   }
   2696   CPDF_LinkExt* link = NULL;
   2697   link = m_LinkList.GetAt(index);
   2698   if (!link) {
   2699     return L"";
   2700   }
   2701   return link->m_strUrl;
   2702 }
   2703 void CPDF_LinkExtract::GetBoundedSegment(int index,
   2704                                          int& start,
   2705                                          int& count) const {
   2706   if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
   2707     return;
   2708   }
   2709   CPDF_LinkExt* link = NULL;
   2710   link = m_LinkList.GetAt(index);
   2711   if (!link) {
   2712     return;
   2713   }
   2714   start = link->m_Start;
   2715   count = link->m_Count;
   2716 }
   2717 void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const {
   2718   if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
   2719     return;
   2720   }
   2721   CPDF_LinkExt* link = NULL;
   2722   link = m_LinkList.GetAt(index);
   2723   if (!link) {
   2724     return;
   2725   }
   2726   m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
   2727 }
   2728