1 // Copyright 2016 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "core/fpdftext/cpdf_textpagefind.h" 8 9 #include <cwchar> 10 #include <cwctype> 11 #include <vector> 12 13 #include "core/fpdftext/cpdf_textpage.h" 14 #include "core/fxcrt/fx_string.h" 15 #include "core/fxcrt/fx_system.h" 16 #include "third_party/base/stl_util.h" 17 18 namespace { 19 20 bool IsIgnoreSpaceCharacter(wchar_t curChar) { 21 if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) || 22 (curChar >= 0xFE70 && curChar <= 0xFEFF) || 23 (curChar >= 0xFB50 && curChar <= 0xFDFF) || 24 (curChar >= 0x0400 && curChar <= 0x04FF) || 25 (curChar >= 0x0500 && curChar <= 0x052F) || 26 (curChar >= 0xA640 && curChar <= 0xA69F) || 27 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 || 28 (curChar >= 0x2000 && curChar <= 0x206F)) { 29 return false; 30 } 31 return true; 32 } 33 34 } // namespace 35 36 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) 37 : m_pTextPage(pTextPage), 38 m_flags(0), 39 m_bMatchCase(false), 40 m_bMatchWholeWord(false), 41 m_resStart(0), 42 m_resEnd(-1), 43 m_IsFind(false) { 44 m_strText = m_pTextPage->GetAllPageText(); 45 int nCount = pTextPage->CountChars(); 46 if (nCount) 47 m_CharIndex.push_back(0); 48 for (int i = 0; i < nCount; i++) { 49 FPDF_CHAR_INFO info; 50 pTextPage->GetCharInfo(i, &info); 51 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); 52 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL || 53 info.m_Flag == FPDFTEXT_CHAR_GENERATED) { 54 if (indexSize % 2) { 55 m_CharIndex.push_back(1); 56 } else { 57 if (indexSize <= 0) 58 continue; 59 m_CharIndex[indexSize - 1] += 1; 60 } 61 } else { 62 if (indexSize % 2) { 63 if (indexSize <= 0) 64 continue; 65 m_CharIndex[indexSize - 1] = i + 1; 66 } else { 67 m_CharIndex.push_back(i + 1); 68 } 69 } 70 } 71 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); 72 if (indexSize % 2) 73 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1); 74 } 75 76 CPDF_TextPageFind::~CPDF_TextPageFind() {} 77 78 int CPDF_TextPageFind::GetCharIndex(int index) const { 79 return m_pTextPage->CharIndexFromTextIndex(index); 80 } 81 82 bool CPDF_TextPageFind::FindFirst(const WideString& findwhat, 83 int flags, 84 Optional<size_t> startPos) { 85 if (!m_pTextPage) 86 return false; 87 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) 88 m_strText = m_pTextPage->GetAllPageText(); 89 WideString findwhatStr = findwhat; 90 m_findWhat = findwhatStr; 91 m_flags = flags; 92 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; 93 if (m_strText.IsEmpty()) { 94 m_IsFind = false; 95 return true; 96 } 97 size_t len = findwhatStr.GetLength(); 98 if (!m_bMatchCase) { 99 findwhatStr.MakeLower(); 100 m_strText.MakeLower(); 101 } 102 m_bMatchWholeWord = !!(flags & FPDFTEXT_MATCHWHOLEWORD); 103 m_findNextStart = startPos; 104 if (!startPos.has_value()) { 105 if (!m_strText.IsEmpty()) 106 m_findPreStart = m_strText.GetLength() - 1; 107 } else { 108 m_findPreStart = startPos; 109 } 110 111 m_csFindWhatArray.clear(); 112 size_t i = 0; 113 for (i = 0; i < len; ++i) 114 if (findwhatStr[i] != ' ') 115 break; 116 if (i < len) 117 ExtractFindWhat(findwhatStr); 118 else 119 m_csFindWhatArray.push_back(findwhatStr); 120 if (m_csFindWhatArray.empty()) 121 return false; 122 123 m_IsFind = true; 124 m_resStart = 0; 125 m_resEnd = -1; 126 return true; 127 } 128 129 bool CPDF_TextPageFind::FindNext() { 130 if (!m_pTextPage) 131 return false; 132 m_resArray.clear(); 133 if (!m_findNextStart.has_value()) 134 return false; 135 if (m_strText.IsEmpty()) { 136 m_IsFind = false; 137 return m_IsFind; 138 } 139 size_t strLen = m_strText.GetLength(); 140 if (m_findNextStart.value() > strLen - 1) { 141 m_IsFind = false; 142 return m_IsFind; 143 } 144 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray); 145 Optional<size_t> nResultPos = 0; 146 size_t nStartPos = m_findNextStart.value(); 147 bool bSpaceStart = false; 148 for (int iWord = 0; iWord < nCount; iWord++) { 149 WideString csWord = m_csFindWhatArray[iWord]; 150 if (csWord.IsEmpty()) { 151 if (iWord == nCount - 1) { 152 wchar_t strInsert = m_strText[nStartPos]; 153 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR || 154 strInsert == TEXT_RETURN_CHAR || strInsert == 160) { 155 nResultPos = nStartPos + 1; 156 break; 157 } 158 iWord = -1; 159 } else if (iWord == 0) { 160 bSpaceStart = true; 161 } 162 continue; 163 } 164 nResultPos = m_strText.Find(csWord.c_str(), nStartPos); 165 if (!nResultPos.has_value()) { 166 m_IsFind = false; 167 return m_IsFind; 168 } 169 size_t endIndex = nResultPos.value() + csWord.GetLength() - 1; 170 if (iWord == 0) 171 m_resStart = nResultPos.value(); 172 bool bMatch = true; 173 if (iWord != 0 && !bSpaceStart) { 174 size_t PreResEndPos = nStartPos; 175 int curChar = csWord[0]; 176 WideString lastWord = m_csFindWhatArray[iWord - 1]; 177 int lastChar = lastWord[lastWord.GetLength() - 1]; 178 if (nStartPos == nResultPos.value() && 179 !(IsIgnoreSpaceCharacter(lastChar) || 180 IsIgnoreSpaceCharacter(curChar))) { 181 bMatch = false; 182 } 183 for (size_t d = PreResEndPos; d < nResultPos.value(); d++) { 184 wchar_t strInsert = m_strText[d]; 185 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && 186 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 187 bMatch = false; 188 break; 189 } 190 } 191 } else if (bSpaceStart) { 192 if (nResultPos.value() > 0) { 193 wchar_t strInsert = m_strText[nResultPos.value() - 1]; 194 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && 195 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 196 bMatch = false; 197 m_resStart = nResultPos.value(); 198 } else { 199 m_resStart = nResultPos.value() - 1; 200 } 201 } 202 } 203 if (m_bMatchWholeWord && bMatch) { 204 bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex); 205 } 206 nStartPos = endIndex + 1; 207 if (!bMatch) { 208 iWord = -1; 209 if (bSpaceStart) 210 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); 211 else 212 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); 213 } 214 } 215 m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1; 216 m_IsFind = true; 217 int resStart = GetCharIndex(m_resStart); 218 int resEnd = GetCharIndex(m_resEnd); 219 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1); 220 if (m_flags & FPDFTEXT_CONSECUTIVE) { 221 m_findNextStart = m_resStart + 1; 222 m_findPreStart = m_resEnd - 1; 223 } else { 224 m_findNextStart = m_resEnd + 1; 225 m_findPreStart = m_resStart - 1; 226 } 227 return m_IsFind; 228 } 229 230 bool CPDF_TextPageFind::FindPrev() { 231 if (!m_pTextPage) 232 return false; 233 m_resArray.clear(); 234 if (m_strText.IsEmpty() || !m_findPreStart.has_value()) { 235 m_IsFind = false; 236 return m_IsFind; 237 } 238 CPDF_TextPageFind findEngine(m_pTextPage.Get()); 239 bool ret = findEngine.FindFirst(m_findWhat, m_flags, Optional<size_t>(0)); 240 if (!ret) { 241 m_IsFind = false; 242 return m_IsFind; 243 } 244 int order = -1; 245 int MatchedCount = 0; 246 while (ret) { 247 ret = findEngine.FindNext(); 248 if (ret) { 249 int order1 = findEngine.GetCurOrder(); 250 int MatchedCount1 = findEngine.GetMatchedCount(); 251 int temp = order1 + MatchedCount1; 252 if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1) 253 break; 254 order = order1; 255 MatchedCount = MatchedCount1; 256 } 257 } 258 if (order == -1) { 259 m_IsFind = false; 260 return m_IsFind; 261 } 262 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); 263 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); 264 m_IsFind = true; 265 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount); 266 if (m_flags & FPDFTEXT_CONSECUTIVE) { 267 m_findNextStart = m_resStart + 1; 268 m_findPreStart = m_resEnd - 1; 269 } else { 270 m_findNextStart = m_resEnd + 1; 271 m_findPreStart = m_resStart - 1; 272 } 273 return m_IsFind; 274 } 275 276 void CPDF_TextPageFind::ExtractFindWhat(const WideString& findwhat) { 277 if (findwhat.IsEmpty()) 278 return; 279 int index = 0; 280 while (1) { 281 Optional<WideString> word = 282 ExtractSubString(findwhat.c_str(), index, TEXT_SPACE_CHAR); 283 if (!word) 284 break; 285 286 if (word->IsEmpty()) { 287 m_csFindWhatArray.push_back(L""); 288 index++; 289 continue; 290 } 291 292 size_t pos = 0; 293 while (pos < word->GetLength()) { 294 WideString curStr = word->Mid(pos, 1); 295 wchar_t curChar = word->operator[](pos); 296 if (IsIgnoreSpaceCharacter(curChar)) { 297 if (pos > 0 && curChar == 0x2019) { 298 pos++; 299 continue; 300 } 301 if (pos > 0) 302 m_csFindWhatArray.push_back(word->Left(pos)); 303 m_csFindWhatArray.push_back(curStr); 304 if (pos == word->GetLength() - 1) { 305 word->clear(); 306 break; 307 } 308 word.emplace(word->Right(word->GetLength() - pos - 1)); 309 pos = 0; 310 continue; 311 } 312 pos++; 313 } 314 315 if (!word->IsEmpty()) 316 m_csFindWhatArray.push_back(word.value()); 317 index++; 318 } 319 } 320 321 bool CPDF_TextPageFind::IsMatchWholeWord(const WideString& csPageText, 322 size_t startPos, 323 size_t endPos) { 324 if (startPos > endPos) 325 return false; 326 wchar_t char_left = 0; 327 wchar_t char_right = 0; 328 size_t char_count = endPos - startPos + 1; 329 if (char_count == 0) 330 return false; 331 if (char_count == 1 && csPageText[startPos] > 255) 332 return true; 333 if (startPos >= 1) 334 char_left = csPageText[startPos - 1]; 335 if (startPos + char_count < csPageText.GetLength()) 336 char_right = csPageText[startPos + char_count]; 337 if ((char_left > 'A' && char_left < 'a') || 338 (char_left > 'a' && char_left < 'z') || 339 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) || 340 (char_right > 'A' && char_right < 'a') || 341 (char_right > 'a' && char_right < 'z') || 342 (char_right > 0xfb00 && char_right < 0xfb06) || 343 std::iswdigit(char_right)) { 344 return false; 345 } 346 if (!(('A' > char_left || char_left > 'Z') && 347 ('a' > char_left || char_left > 'z') && 348 ('A' > char_right || char_right > 'Z') && 349 ('a' > char_right || char_right > 'z'))) { 350 return false; 351 } 352 if (char_count > 0) { 353 if (std::iswdigit(char_left) && std::iswdigit(csPageText[startPos])) 354 return false; 355 if (std::iswdigit(char_right) && std::iswdigit(csPageText[endPos])) 356 return false; 357 } 358 return true; 359 } 360 361 Optional<WideString> CPDF_TextPageFind::ExtractSubString( 362 const wchar_t* lpszFullString, 363 int iSubString, 364 wchar_t chSep) { 365 if (!lpszFullString) 366 return {}; 367 368 while (iSubString--) { 369 lpszFullString = std::wcschr(lpszFullString, chSep); 370 if (!lpszFullString) 371 return {}; 372 373 lpszFullString++; 374 while (*lpszFullString == chSep) 375 lpszFullString++; 376 } 377 378 const wchar_t* lpchEnd = std::wcschr(lpszFullString, chSep); 379 int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString) 380 : static_cast<int>(wcslen(lpszFullString)); 381 if (nLen < 0) 382 return {}; 383 384 return {WideString(lpszFullString, static_cast<size_t>(nLen))}; 385 } 386 387 int CPDF_TextPageFind::GetCurOrder() const { 388 return GetCharIndex(m_resStart); 389 } 390 391 int CPDF_TextPageFind::GetMatchedCount() const { 392 int resStart = GetCharIndex(m_resStart); 393 int resEnd = GetCharIndex(m_resEnd); 394 return resEnd - resStart + 1; 395 } 396