Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "core/fpdftext/cpdf_textpagefind.h"
      8 
      9 #include <cwchar>
     10 #include <cwctype>
     11 #include <vector>
     12 
     13 #include "core/fpdftext/cpdf_textpage.h"
     14 #include "core/fxcrt/fx_string.h"
     15 #include "core/fxcrt/fx_system.h"
     16 #include "third_party/base/stl_util.h"
     17 
     18 namespace {
     19 
     20 bool IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
     21   if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) ||
     22       (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
     23       (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
     24       (curChar >= 0x0400 && curChar <= 0x04FF) ||
     25       (curChar >= 0x0500 && curChar <= 0x052F) ||
     26       (curChar >= 0xA640 && curChar <= 0xA69F) ||
     27       (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
     28       (curChar >= 0x2000 && curChar <= 0x206F)) {
     29     return false;
     30   }
     31   return true;
     32 }
     33 
     34 }  // namespace
     35 
     36 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
     37     : m_pTextPage(pTextPage),
     38       m_flags(0),
     39       m_findNextStart(-1),
     40       m_findPreStart(-1),
     41       m_bMatchCase(false),
     42       m_bMatchWholeWord(false),
     43       m_resStart(0),
     44       m_resEnd(-1),
     45       m_IsFind(false) {
     46   m_strText = m_pTextPage->GetPageText();
     47   int nCount = pTextPage->CountChars();
     48   if (nCount)
     49     m_CharIndex.push_back(0);
     50   for (int i = 0; i < nCount; i++) {
     51     FPDF_CHAR_INFO info;
     52     pTextPage->GetCharInfo(i, &info);
     53     int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
     54     if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
     55         info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
     56       if (indexSize % 2) {
     57         m_CharIndex.push_back(1);
     58       } else {
     59         if (indexSize <= 0)
     60           continue;
     61         m_CharIndex[indexSize - 1] += 1;
     62       }
     63     } else {
     64       if (indexSize % 2) {
     65         if (indexSize <= 0)
     66           continue;
     67         m_CharIndex[indexSize - 1] = i + 1;
     68       } else {
     69         m_CharIndex.push_back(i + 1);
     70       }
     71     }
     72   }
     73   int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
     74   if (indexSize % 2)
     75     m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
     76 }
     77 
     78 CPDF_TextPageFind::~CPDF_TextPageFind() {}
     79 
     80 int CPDF_TextPageFind::GetCharIndex(int index) const {
     81   return m_pTextPage->CharIndexFromTextIndex(index);
     82 }
     83 
     84 bool CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
     85                                   int flags,
     86                                   int startPos) {
     87   if (!m_pTextPage)
     88     return false;
     89   if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
     90     m_strText = m_pTextPage->GetPageText();
     91   CFX_WideString findwhatStr = findwhat;
     92   m_findWhat = findwhatStr;
     93   m_flags = flags;
     94   m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
     95   if (m_strText.IsEmpty()) {
     96     m_IsFind = false;
     97     return true;
     98   }
     99   FX_STRSIZE len = findwhatStr.GetLength();
    100   if (!m_bMatchCase) {
    101     findwhatStr.MakeLower();
    102     m_strText.MakeLower();
    103   }
    104   m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
    105   m_findNextStart = startPos;
    106   if (startPos == -1)
    107     m_findPreStart = m_strText.GetLength() - 1;
    108   else
    109     m_findPreStart = startPos;
    110   m_csFindWhatArray.clear();
    111   int i = 0;
    112   while (i < len) {
    113     if (findwhatStr.GetAt(i) != ' ')
    114       break;
    115     i++;
    116   }
    117   if (i < len)
    118     ExtractFindWhat(findwhatStr);
    119   else
    120     m_csFindWhatArray.push_back(findwhatStr);
    121   if (m_csFindWhatArray.empty())
    122     return false;
    123   m_IsFind = true;
    124   m_resStart = 0;
    125   m_resEnd = -1;
    126   return true;
    127 }
    128 
    129 bool CPDF_TextPageFind::FindNext() {
    130   if (!m_pTextPage)
    131     return false;
    132   m_resArray.clear();
    133   if (m_findNextStart == -1)
    134     return false;
    135   if (m_strText.IsEmpty()) {
    136     m_IsFind = false;
    137     return m_IsFind;
    138   }
    139   int strLen = m_strText.GetLength();
    140   if (m_findNextStart > strLen - 1) {
    141     m_IsFind = false;
    142     return m_IsFind;
    143   }
    144   int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
    145   int nResultPos = 0;
    146   int nStartPos = 0;
    147   nStartPos = m_findNextStart;
    148   bool bSpaceStart = false;
    149   for (int iWord = 0; iWord < nCount; iWord++) {
    150     CFX_WideString csWord = m_csFindWhatArray[iWord];
    151     if (csWord.IsEmpty()) {
    152       if (iWord == nCount - 1) {
    153         FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
    154         if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
    155             strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
    156           nResultPos = nStartPos + 1;
    157           break;
    158         }
    159         iWord = -1;
    160       } else if (iWord == 0) {
    161         bSpaceStart = true;
    162       }
    163       continue;
    164     }
    165     int endIndex;
    166     nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
    167     if (nResultPos == -1) {
    168       m_IsFind = false;
    169       return m_IsFind;
    170     }
    171     endIndex = nResultPos + csWord.GetLength() - 1;
    172     if (iWord == 0)
    173       m_resStart = nResultPos;
    174     bool bMatch = true;
    175     if (iWord != 0 && !bSpaceStart) {
    176       int PreResEndPos = nStartPos;
    177       int curChar = csWord.GetAt(0);
    178       CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
    179       int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
    180       if (nStartPos == nResultPos &&
    181           !(IsIgnoreSpaceCharacter(lastChar) ||
    182             IsIgnoreSpaceCharacter(curChar))) {
    183         bMatch = false;
    184       }
    185       for (int d = PreResEndPos; d < nResultPos; d++) {
    186         FX_WCHAR strInsert = m_strText.GetAt(d);
    187         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
    188             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
    189           bMatch = false;
    190           break;
    191         }
    192       }
    193     } else if (bSpaceStart) {
    194       if (nResultPos > 0) {
    195         FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
    196         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
    197             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
    198           bMatch = false;
    199           m_resStart = nResultPos;
    200         } else {
    201           m_resStart = nResultPos - 1;
    202         }
    203       }
    204     }
    205     if (m_bMatchWholeWord && bMatch) {
    206       bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
    207     }
    208     nStartPos = endIndex + 1;
    209     if (!bMatch) {
    210       iWord = -1;
    211       if (bSpaceStart)
    212         nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
    213       else
    214         nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
    215     }
    216   }
    217   m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
    218   m_IsFind = true;
    219   int resStart = GetCharIndex(m_resStart);
    220   int resEnd = GetCharIndex(m_resEnd);
    221   m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
    222   if (m_flags & FPDFTEXT_CONSECUTIVE) {
    223     m_findNextStart = m_resStart + 1;
    224     m_findPreStart = m_resEnd - 1;
    225   } else {
    226     m_findNextStart = m_resEnd + 1;
    227     m_findPreStart = m_resStart - 1;
    228   }
    229   return m_IsFind;
    230 }
    231 
    232 bool CPDF_TextPageFind::FindPrev() {
    233   if (!m_pTextPage)
    234     return false;
    235   m_resArray.clear();
    236   if (m_strText.IsEmpty() || m_findPreStart < 0) {
    237     m_IsFind = false;
    238     return m_IsFind;
    239   }
    240   CPDF_TextPageFind findEngine(m_pTextPage);
    241   bool ret = findEngine.FindFirst(m_findWhat, m_flags);
    242   if (!ret) {
    243     m_IsFind = false;
    244     return m_IsFind;
    245   }
    246   int order = -1, MatchedCount = 0;
    247   while (ret) {
    248     ret = findEngine.FindNext();
    249     if (ret) {
    250       int order1 = findEngine.GetCurOrder();
    251       int MatchedCount1 = findEngine.GetMatchedCount();
    252       if (((order1 + MatchedCount1) - 1) > m_findPreStart)
    253         break;
    254       order = order1;
    255       MatchedCount = MatchedCount1;
    256     }
    257   }
    258   if (order == -1) {
    259     m_IsFind = false;
    260     return m_IsFind;
    261   }
    262   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
    263   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
    264   m_IsFind = true;
    265   m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
    266   if (m_flags & FPDFTEXT_CONSECUTIVE) {
    267     m_findNextStart = m_resStart + 1;
    268     m_findPreStart = m_resEnd - 1;
    269   } else {
    270     m_findNextStart = m_resEnd + 1;
    271     m_findPreStart = m_resStart - 1;
    272   }
    273   return m_IsFind;
    274 }
    275 
    276 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
    277   if (findwhat.IsEmpty())
    278     return;
    279   int index = 0;
    280   while (1) {
    281     CFX_WideString csWord = TEXT_EMPTY;
    282     int ret =
    283         ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
    284     if (csWord.IsEmpty()) {
    285       if (ret) {
    286         m_csFindWhatArray.push_back(L"");
    287         index++;
    288         continue;
    289       } else {
    290         break;
    291       }
    292     }
    293     int pos = 0;
    294     while (pos < csWord.GetLength()) {
    295       CFX_WideString curStr = csWord.Mid(pos, 1);
    296       FX_WCHAR curChar = csWord.GetAt(pos);
    297       if (IsIgnoreSpaceCharacter(curChar)) {
    298         if (pos > 0 && curChar == 0x2019) {
    299           pos++;
    300           continue;
    301         }
    302         if (pos > 0)
    303           m_csFindWhatArray.push_back(csWord.Mid(0, pos));
    304         m_csFindWhatArray.push_back(curStr);
    305         if (pos == csWord.GetLength() - 1) {
    306           csWord.clear();
    307           break;
    308         }
    309         csWord = csWord.Right(csWord.GetLength() - pos - 1);
    310         pos = 0;
    311         continue;
    312       }
    313       pos++;
    314     }
    315     if (!csWord.IsEmpty())
    316       m_csFindWhatArray.push_back(csWord);
    317     index++;
    318   }
    319 }
    320 
    321 bool CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
    322                                          int startPos,
    323                                          int endPos) {
    324   FX_WCHAR char_left = 0;
    325   FX_WCHAR char_right = 0;
    326   int char_count = endPos - startPos + 1;
    327   if (char_count < 1)
    328     return false;
    329   if (char_count == 1 && csPageText.GetAt(startPos) > 255)
    330     return true;
    331   if (startPos - 1 >= 0)
    332     char_left = csPageText.GetAt(startPos - 1);
    333   if (startPos + char_count < csPageText.GetLength())
    334     char_right = csPageText.GetAt(startPos + char_count);
    335   if ((char_left > 'A' && char_left < 'a') ||
    336       (char_left > 'a' && char_left < 'z') ||
    337       (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
    338       (char_right > 'A' && char_right < 'a') ||
    339       (char_right > 'a' && char_right < 'z') ||
    340       (char_right > 0xfb00 && char_right < 0xfb06) ||
    341       std::iswdigit(char_right)) {
    342     return false;
    343   }
    344   if (!(('A' > char_left || char_left > 'Z') &&
    345         ('a' > char_left || char_left > 'z') &&
    346         ('A' > char_right || char_right > 'Z') &&
    347         ('a' > char_right || char_right > 'z'))) {
    348     return false;
    349   }
    350   if (char_count > 0) {
    351     if (csPageText.GetAt(startPos) >= L'0' &&
    352         csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
    353         char_left <= L'9') {
    354       return false;
    355     }
    356     if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
    357         char_right >= L'0' && char_right <= L'9') {
    358       return false;
    359     }
    360   }
    361   return true;
    362 }
    363 
    364 bool CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
    365                                          const FX_WCHAR* lpszFullString,
    366                                          int iSubString,
    367                                          FX_WCHAR chSep) {
    368   if (!lpszFullString)
    369     return false;
    370   while (iSubString--) {
    371     lpszFullString = std::wcschr(lpszFullString, chSep);
    372     if (!lpszFullString) {
    373       rString.clear();
    374       return false;
    375     }
    376     lpszFullString++;
    377     while (*lpszFullString == chSep)
    378       lpszFullString++;
    379   }
    380   const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep);
    381   int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
    382                      : (int)FXSYS_wcslen(lpszFullString);
    383   ASSERT(nLen >= 0);
    384   FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
    385                nLen * sizeof(FX_WCHAR));
    386   rString.ReleaseBuffer();
    387   return true;
    388 }
    389 
    390 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
    391   CFX_WideString str2;
    392   str2.clear();
    393   int nlen = str.GetLength();
    394   for (int i = nlen - 1; i >= 0; i--)
    395     str2 += str.GetAt(i);
    396   return str2;
    397 }
    398 
    399 int CPDF_TextPageFind::GetCurOrder() const {
    400   return GetCharIndex(m_resStart);
    401 }
    402 
    403 int CPDF_TextPageFind::GetMatchedCount() const {
    404   int resStart = GetCharIndex(m_resStart);
    405   int resEnd = GetCharIndex(m_resEnd);
    406   return resEnd - resStart + 1;
    407 }
    408