Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "core/fpdftext/cpdf_textpagefind.h"
      8 
      9 #include <cwchar>
     10 #include <cwctype>
     11 #include <vector>
     12 
     13 #include "core/fpdftext/cpdf_textpage.h"
     14 #include "core/fxcrt/fx_string.h"
     15 #include "core/fxcrt/fx_system.h"
     16 #include "third_party/base/stl_util.h"
     17 
     18 namespace {
     19 
     20 bool IsIgnoreSpaceCharacter(wchar_t curChar) {
     21   if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) ||
     22       (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
     23       (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
     24       (curChar >= 0x0400 && curChar <= 0x04FF) ||
     25       (curChar >= 0x0500 && curChar <= 0x052F) ||
     26       (curChar >= 0xA640 && curChar <= 0xA69F) ||
     27       (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
     28       (curChar >= 0x2000 && curChar <= 0x206F)) {
     29     return false;
     30   }
     31   return true;
     32 }
     33 
     34 }  // namespace
     35 
     36 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
     37     : m_pTextPage(pTextPage),
     38       m_flags(0),
     39       m_bMatchCase(false),
     40       m_bMatchWholeWord(false),
     41       m_resStart(0),
     42       m_resEnd(-1),
     43       m_IsFind(false) {
     44   m_strText = m_pTextPage->GetAllPageText();
     45   int nCount = pTextPage->CountChars();
     46   if (nCount)
     47     m_CharIndex.push_back(0);
     48   for (int i = 0; i < nCount; i++) {
     49     FPDF_CHAR_INFO info;
     50     pTextPage->GetCharInfo(i, &info);
     51     int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
     52     if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
     53         info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
     54       if (indexSize % 2) {
     55         m_CharIndex.push_back(1);
     56       } else {
     57         if (indexSize <= 0)
     58           continue;
     59         m_CharIndex[indexSize - 1] += 1;
     60       }
     61     } else {
     62       if (indexSize % 2) {
     63         if (indexSize <= 0)
     64           continue;
     65         m_CharIndex[indexSize - 1] = i + 1;
     66       } else {
     67         m_CharIndex.push_back(i + 1);
     68       }
     69     }
     70   }
     71   int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
     72   if (indexSize % 2)
     73     m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
     74 }
     75 
     76 CPDF_TextPageFind::~CPDF_TextPageFind() {}
     77 
     78 int CPDF_TextPageFind::GetCharIndex(int index) const {
     79   return m_pTextPage->CharIndexFromTextIndex(index);
     80 }
     81 
     82 bool CPDF_TextPageFind::FindFirst(const WideString& findwhat,
     83                                   int flags,
     84                                   Optional<size_t> startPos) {
     85   if (!m_pTextPage)
     86     return false;
     87   if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
     88     m_strText = m_pTextPage->GetAllPageText();
     89   WideString findwhatStr = findwhat;
     90   m_findWhat = findwhatStr;
     91   m_flags = flags;
     92   m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
     93   if (m_strText.IsEmpty()) {
     94     m_IsFind = false;
     95     return true;
     96   }
     97   size_t len = findwhatStr.GetLength();
     98   if (!m_bMatchCase) {
     99     findwhatStr.MakeLower();
    100     m_strText.MakeLower();
    101   }
    102   m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD);
    103   m_findNextStart = startPos;
    104   if (!startPos.has_value()) {
    105     if (!m_strText.IsEmpty())
    106       m_findPreStart = m_strText.GetLength() - 1;
    107   } else {
    108     m_findPreStart = startPos;
    109   }
    110 
    111   m_csFindWhatArray.clear();
    112   size_t i = 0;
    113   for (i = 0; i < len; ++i)
    114     if (findwhatStr[i] != ' ')
    115       break;
    116   if (i < len)
    117     ExtractFindWhat(findwhatStr);
    118   else
    119     m_csFindWhatArray.push_back(findwhatStr);
    120   if (m_csFindWhatArray.empty())
    121     return false;
    122 
    123   m_IsFind = true;
    124   m_resStart = 0;
    125   m_resEnd = -1;
    126   return true;
    127 }
    128 
    129 bool CPDF_TextPageFind::FindNext() {
    130   if (!m_pTextPage)
    131     return false;
    132   m_resArray.clear();
    133   if (!m_findNextStart.has_value())
    134     return false;
    135   if (m_strText.IsEmpty()) {
    136     m_IsFind = false;
    137     return m_IsFind;
    138   }
    139   size_t strLen = m_strText.GetLength();
    140   if (m_findNextStart.value() > strLen - 1) {
    141     m_IsFind = false;
    142     return m_IsFind;
    143   }
    144   int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
    145   Optional<size_t> nResultPos = 0;
    146   size_t nStartPos = m_findNextStart.value();
    147   bool bSpaceStart = false;
    148   for (int iWord = 0; iWord < nCount; iWord++) {
    149     WideString csWord = m_csFindWhatArray[iWord];
    150     if (csWord.IsEmpty()) {
    151       if (iWord == nCount - 1) {
    152         wchar_t strInsert = m_strText[nStartPos];
    153         if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
    154             strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
    155           nResultPos = nStartPos + 1;
    156           break;
    157         }
    158         iWord = -1;
    159       } else if (iWord == 0) {
    160         bSpaceStart = true;
    161       }
    162       continue;
    163     }
    164     nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
    165     if (!nResultPos.has_value()) {
    166       m_IsFind = false;
    167       return m_IsFind;
    168     }
    169     size_t endIndex = nResultPos.value() + csWord.GetLength() - 1;
    170     if (iWord == 0)
    171       m_resStart = nResultPos.value();
    172     bool bMatch = true;
    173     if (iWord != 0 && !bSpaceStart) {
    174       size_t PreResEndPos = nStartPos;
    175       int curChar = csWord[0];
    176       WideString lastWord = m_csFindWhatArray[iWord - 1];
    177       int lastChar = lastWord[lastWord.GetLength() - 1];
    178       if (nStartPos == nResultPos.value() &&
    179           !(IsIgnoreSpaceCharacter(lastChar) ||
    180             IsIgnoreSpaceCharacter(curChar))) {
    181         bMatch = false;
    182       }
    183       for (size_t d = PreResEndPos; d < nResultPos.value(); d++) {
    184         wchar_t strInsert = m_strText[d];
    185         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
    186             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
    187           bMatch = false;
    188           break;
    189         }
    190       }
    191     } else if (bSpaceStart) {
    192       if (nResultPos.value() > 0) {
    193         wchar_t strInsert = m_strText[nResultPos.value() - 1];
    194         if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
    195             strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
    196           bMatch = false;
    197           m_resStart = nResultPos.value();
    198         } else {
    199           m_resStart = nResultPos.value() - 1;
    200         }
    201       }
    202     }
    203     if (m_bMatchWholeWord && bMatch) {
    204       bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex);
    205     }
    206     nStartPos = endIndex + 1;
    207     if (!bMatch) {
    208       iWord = -1;
    209       if (bSpaceStart)
    210         nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
    211       else
    212         nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
    213     }
    214   }
    215   m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1;
    216   m_IsFind = true;
    217   int resStart = GetCharIndex(m_resStart);
    218   int resEnd = GetCharIndex(m_resEnd);
    219   m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
    220   if (m_flags & FPDFTEXT_CONSECUTIVE) {
    221     m_findNextStart = m_resStart + 1;
    222     m_findPreStart = m_resEnd - 1;
    223   } else {
    224     m_findNextStart = m_resEnd + 1;
    225     m_findPreStart = m_resStart - 1;
    226   }
    227   return m_IsFind;
    228 }
    229 
    230 bool CPDF_TextPageFind::FindPrev() {
    231   if (!m_pTextPage)
    232     return false;
    233   m_resArray.clear();
    234   if (m_strText.IsEmpty() || !m_findPreStart.has_value()) {
    235     m_IsFind = false;
    236     return m_IsFind;
    237   }
    238   CPDF_TextPageFind findEngine(m_pTextPage.Get());
    239   bool ret = findEngine.FindFirst(m_findWhat, m_flags, Optional<size_t>(0));
    240   if (!ret) {
    241     m_IsFind = false;
    242     return m_IsFind;
    243   }
    244   int order = -1;
    245   int MatchedCount = 0;
    246   while (ret) {
    247     ret = findEngine.FindNext();
    248     if (ret) {
    249       int order1 = findEngine.GetCurOrder();
    250       int MatchedCount1 = findEngine.GetMatchedCount();
    251       int temp = order1 + MatchedCount1;
    252       if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1)
    253         break;
    254       order = order1;
    255       MatchedCount = MatchedCount1;
    256     }
    257   }
    258   if (order == -1) {
    259     m_IsFind = false;
    260     return m_IsFind;
    261   }
    262   m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
    263   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
    264   m_IsFind = true;
    265   m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
    266   if (m_flags & FPDFTEXT_CONSECUTIVE) {
    267     m_findNextStart = m_resStart + 1;
    268     m_findPreStart = m_resEnd - 1;
    269   } else {
    270     m_findNextStart = m_resEnd + 1;
    271     m_findPreStart = m_resStart - 1;
    272   }
    273   return m_IsFind;
    274 }
    275 
    276 void CPDF_TextPageFind::ExtractFindWhat(const WideString& findwhat) {
    277   if (findwhat.IsEmpty())
    278     return;
    279   int index = 0;
    280   while (1) {
    281     Optional<WideString> word =
    282         ExtractSubString(findwhat.c_str(), index, TEXT_SPACE_CHAR);
    283     if (!word)
    284       break;
    285 
    286     if (word->IsEmpty()) {
    287       m_csFindWhatArray.push_back(L"");
    288       index++;
    289       continue;
    290     }
    291 
    292     size_t pos = 0;
    293     while (pos < word->GetLength()) {
    294       WideString curStr = word->Mid(pos, 1);
    295       wchar_t curChar = word->operator[](pos);
    296       if (IsIgnoreSpaceCharacter(curChar)) {
    297         if (pos > 0 && curChar == 0x2019) {
    298           pos++;
    299           continue;
    300         }
    301         if (pos > 0)
    302           m_csFindWhatArray.push_back(word->Left(pos));
    303         m_csFindWhatArray.push_back(curStr);
    304         if (pos == word->GetLength() - 1) {
    305           word->clear();
    306           break;
    307         }
    308         word.emplace(word->Right(word->GetLength() - pos - 1));
    309         pos = 0;
    310         continue;
    311       }
    312       pos++;
    313     }
    314 
    315     if (!word->IsEmpty())
    316       m_csFindWhatArray.push_back(word.value());
    317     index++;
    318   }
    319 }
    320 
    321 bool CPDF_TextPageFind::IsMatchWholeWord(const WideString& csPageText,
    322                                          size_t startPos,
    323                                          size_t endPos) {
    324   if (startPos > endPos)
    325     return false;
    326   wchar_t char_left = 0;
    327   wchar_t char_right = 0;
    328   size_t char_count = endPos - startPos + 1;
    329   if (char_count == 0)
    330     return false;
    331   if (char_count == 1 && csPageText[startPos] > 255)
    332     return true;
    333   if (startPos >= 1)
    334     char_left = csPageText[startPos - 1];
    335   if (startPos + char_count < csPageText.GetLength())
    336     char_right = csPageText[startPos + char_count];
    337   if ((char_left > 'A' && char_left < 'a') ||
    338       (char_left > 'a' && char_left < 'z') ||
    339       (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
    340       (char_right > 'A' && char_right < 'a') ||
    341       (char_right > 'a' && char_right < 'z') ||
    342       (char_right > 0xfb00 && char_right < 0xfb06) ||
    343       std::iswdigit(char_right)) {
    344     return false;
    345   }
    346   if (!(('A' > char_left || char_left > 'Z') &&
    347         ('a' > char_left || char_left > 'z') &&
    348         ('A' > char_right || char_right > 'Z') &&
    349         ('a' > char_right || char_right > 'z'))) {
    350     return false;
    351   }
    352   if (char_count > 0) {
    353     if (std::iswdigit(char_left) && std::iswdigit(csPageText[startPos]))
    354       return false;
    355     if (std::iswdigit(char_right) && std::iswdigit(csPageText[endPos]))
    356       return false;
    357   }
    358   return true;
    359 }
    360 
    361 Optional<WideString> CPDF_TextPageFind::ExtractSubString(
    362     const wchar_t* lpszFullString,
    363     int iSubString,
    364     wchar_t chSep) {
    365   if (!lpszFullString)
    366     return {};
    367 
    368   while (iSubString--) {
    369     lpszFullString = std::wcschr(lpszFullString, chSep);
    370     if (!lpszFullString)
    371       return {};
    372 
    373     lpszFullString++;
    374     while (*lpszFullString == chSep)
    375       lpszFullString++;
    376   }
    377 
    378   const wchar_t* lpchEnd = std::wcschr(lpszFullString, chSep);
    379   int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString)
    380                      : static_cast<int>(wcslen(lpszFullString));
    381   if (nLen < 0)
    382     return {};
    383 
    384   return {WideString(lpszFullString, static_cast<size_t>(nLen))};
    385 }
    386 
    387 int CPDF_TextPageFind::GetCurOrder() const {
    388   return GetCharIndex(m_resStart);
    389 }
    390 
    391 int CPDF_TextPageFind::GetMatchedCount() const {
    392   int resStart = GetCharIndex(m_resStart);
    393   int resEnd = GetCharIndex(m_resEnd);
    394   return resEnd - resStart + 1;
    395 }
    396