1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include <algorithm> 8 #include <cctype> 9 #include <cwctype> 10 #include <memory> 11 12 #include "core/include/fpdfapi/fpdf_module.h" 13 #include "core/include/fpdfapi/fpdf_page.h" 14 #include "core/include/fpdfapi/fpdf_pageobj.h" 15 #include "core/include/fpdfapi/fpdf_resource.h" 16 #include "core/include/fpdftext/fpdf_text.h" 17 #include "core/include/fxcrt/fx_bidi.h" 18 #include "core/include/fxcrt/fx_ext.h" 19 #include "core/include/fxcrt/fx_ucd.h" 20 #include "text_int.h" 21 22 namespace { 23 24 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { 25 if (curChar < 255) { 26 return FALSE; 27 } 28 if ((curChar >= 0x0600 && curChar <= 0x06FF) || 29 (curChar >= 0xFE70 && curChar <= 0xFEFF) || 30 (curChar >= 0xFB50 && curChar <= 0xFDFF) || 31 (curChar >= 0x0400 && curChar <= 0x04FF) || 32 (curChar >= 0x0500 && curChar <= 0x052F) || 33 (curChar >= 0xA640 && curChar <= 0xA69F) || 34 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 || 35 (curChar >= 0x2000 && curChar <= 0x206F)) { 36 return FALSE; 37 } 38 return TRUE; 39 } 40 41 FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) { 42 if (threshold < 300) { 43 return threshold / 2.0f; 44 } 45 if (threshold < 500) { 46 return threshold / 4.0f; 47 } 48 if (threshold < 700) { 49 return threshold / 5.0f; 50 } 51 return threshold / 6.0f; 52 } 53 54 FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, 55 const CFX_Matrix& matrix) { 56 FX_FLOAT baseSpace = 0.0; 57 const int nItems = pTextObj->CountItems(); 58 if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { 59 FX_BOOL bAllChar = TRUE; 60 FX_FLOAT spacing = matrix.TransformDistance( 61 pTextObj->m_TextState.GetObject()->m_CharSpace); 62 baseSpace = spacing; 63 for (int i = 0; i < nItems; i++) { 64 CPDF_TextObjectItem item; 65 pTextObj->GetItemInfo(i, &item); 66 if (item.m_CharCode == (FX_DWORD)-1) { 67 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 68 FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000; 69 baseSpace = std::min(baseSpace, kerning + spacing); 70 bAllChar = FALSE; 71 } 72 } 73 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) { 74 baseSpace = 0.0; 75 } 76 } 77 return baseSpace; 78 } 79 80 const FX_FLOAT kDefaultFontSize = 1.0f; 81 82 } // namespace 83 84 CPDFText_ParseOptions::CPDFText_ParseOptions() 85 : m_bGetCharCodeOnly(FALSE), 86 m_bNormalizeObjs(TRUE), 87 m_bOutputHyphen(FALSE) {} 88 89 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, 90 int flags) { 91 return new CPDF_TextPage(pPage, flags); 92 } 93 94 IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind( 95 const IPDF_TextPage* pTextPage) { 96 return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr; 97 } 98 99 IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() { 100 return new CPDF_LinkExtract(); 101 } 102 103 #define TEXT_BLANK_CHAR L' ' 104 #define TEXT_LINEFEED_CHAR L'\n' 105 #define TEXT_RETURN_CHAR L'\r' 106 #define TEXT_EMPTY L"" 107 #define TEXT_BLANK L" " 108 #define TEXT_RETURN_LINEFEED L"\r\n" 109 #define TEXT_LINEFEED L"\n" 110 #define TEXT_CHARRATIO_GAPDELTA 0.070 111 112 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags) 113 : m_pPage(pPage), 114 m_charList(512), 115 m_TempCharList(50), 116 m_parserflag(flags), 117 m_pPreTextObj(nullptr), 118 m_bIsParsed(false), 119 m_TextlineDir(-1), 120 m_CurlineRect(0, 0, 0, 0) { 121 m_TextBuf.EstimateSize(0, 10240); 122 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(), 123 (int)pPage->GetPageHeight(), 0); 124 } 125 126 void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) { 127 m_ParseOptions.m_bNormalizeObjs = bNormalize; 128 } 129 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) { 130 switch (charInfo.m_Unicode) { 131 case 0x2: 132 case 0x3: 133 case 0x93: 134 case 0x94: 135 case 0x96: 136 case 0x97: 137 case 0x98: 138 case 0xfffe: 139 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN; 140 default: 141 return false; 142 } 143 } 144 FX_BOOL CPDF_TextPage::ParseTextPage() { 145 m_bIsParsed = false; 146 if (!m_pPage) 147 return FALSE; 148 149 m_TextBuf.Clear(); 150 m_charList.RemoveAll(); 151 m_pPreTextObj = NULL; 152 ProcessObject(); 153 m_bIsParsed = true; 154 if (!m_ParseOptions.m_bGetCharCodeOnly) { 155 m_CharIndex.RemoveAll(); 156 int nCount = m_charList.GetSize(); 157 if (nCount) { 158 m_CharIndex.Add(0); 159 } 160 for (int i = 0; i < nCount; i++) { 161 int indexSize = m_CharIndex.GetSize(); 162 FX_BOOL bNormal = FALSE; 163 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i); 164 if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) { 165 bNormal = TRUE; 166 } else if (charinfo.m_Unicode == 0 || IsControlChar(charinfo)) { 167 bNormal = FALSE; 168 } else { 169 bNormal = TRUE; 170 } 171 if (bNormal) { 172 if (indexSize % 2) { 173 m_CharIndex.Add(1); 174 } else { 175 if (indexSize <= 0) { 176 continue; 177 } 178 m_CharIndex.SetAt(indexSize - 1, 179 m_CharIndex.GetAt(indexSize - 1) + 1); 180 } 181 } else { 182 if (indexSize % 2) { 183 if (indexSize <= 0) { 184 continue; 185 } 186 m_CharIndex.SetAt(indexSize - 1, i + 1); 187 } else { 188 m_CharIndex.Add(i + 1); 189 } 190 } 191 } 192 int indexSize = m_CharIndex.GetSize(); 193 if (indexSize % 2) { 194 m_CharIndex.RemoveAt(indexSize - 1); 195 } 196 } 197 return TRUE; 198 } 199 int CPDF_TextPage::CountChars() const { 200 if (m_ParseOptions.m_bGetCharCodeOnly) { 201 return m_TextBuf.GetSize(); 202 } 203 return m_charList.GetSize(); 204 } 205 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const { 206 int indexSize = m_CharIndex.GetSize(); 207 int count = 0; 208 for (int i = 0; i < indexSize; i += 2) { 209 count += m_CharIndex.GetAt(i + 1); 210 if (count > TextIndex) { 211 return TextIndex - count + m_CharIndex.GetAt(i + 1) + 212 m_CharIndex.GetAt(i); 213 } 214 } 215 return -1; 216 } 217 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const { 218 int indexSize = m_CharIndex.GetSize(); 219 int count = 0; 220 for (int i = 0; i < indexSize; i += 2) { 221 count += m_CharIndex.GetAt(i + 1); 222 if (m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) { 223 if (CharIndex - m_CharIndex.GetAt(i) < 0) { 224 return -1; 225 } 226 return CharIndex - m_CharIndex.GetAt(i) + count - 227 m_CharIndex.GetAt(i + 1); 228 } 229 } 230 return -1; 231 } 232 void CPDF_TextPage::GetRectArray(int start, 233 int nCount, 234 CFX_RectArray& rectArray) const { 235 if (m_ParseOptions.m_bGetCharCodeOnly) { 236 return; 237 } 238 if (start < 0 || nCount == 0) { 239 return; 240 } 241 if (!m_bIsParsed) { 242 return; 243 } 244 PAGECHAR_INFO info_curchar; 245 CPDF_TextObject* pCurObj = NULL; 246 CFX_FloatRect rect; 247 int curPos = start; 248 FX_BOOL flagNewRect = TRUE; 249 if (nCount + start > m_charList.GetSize() || nCount == -1) { 250 nCount = m_charList.GetSize() - start; 251 } 252 while (nCount--) { 253 info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++); 254 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) { 255 continue; 256 } 257 if (info_curchar.m_CharBox.Width() < 0.01 || 258 info_curchar.m_CharBox.Height() < 0.01) { 259 continue; 260 } 261 if (!pCurObj) { 262 pCurObj = info_curchar.m_pTextObj; 263 } 264 if (pCurObj != info_curchar.m_pTextObj) { 265 rectArray.Add(rect); 266 pCurObj = info_curchar.m_pTextObj; 267 flagNewRect = TRUE; 268 } 269 if (flagNewRect) { 270 FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY; 271 CFX_Matrix matrix, matrix_reverse; 272 info_curchar.m_pTextObj->GetTextMatrix(&matrix); 273 matrix.Concat(info_curchar.m_Matrix); 274 matrix_reverse.SetReverse(matrix); 275 matrix_reverse.Transform(orgX, orgY); 276 rect.left = info_curchar.m_CharBox.left; 277 rect.right = info_curchar.m_CharBox.right; 278 if (pCurObj->GetFont()->GetTypeDescent()) { 279 rect.bottom = orgY + 280 pCurObj->GetFont()->GetTypeDescent() * 281 pCurObj->GetFontSize() / 1000; 282 FX_FLOAT xPosTemp = orgX; 283 matrix.Transform(xPosTemp, rect.bottom); 284 } else { 285 rect.bottom = info_curchar.m_CharBox.bottom; 286 } 287 if (pCurObj->GetFont()->GetTypeAscent()) { 288 rect.top = 289 orgY + 290 pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000; 291 FX_FLOAT xPosTemp = 292 orgX + 293 GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) * 294 pCurObj->GetFontSize() / 1000; 295 matrix.Transform(xPosTemp, rect.top); 296 } else { 297 rect.top = info_curchar.m_CharBox.top; 298 } 299 flagNewRect = FALSE; 300 rect = info_curchar.m_CharBox; 301 rect.Normalize(); 302 } else { 303 info_curchar.m_CharBox.Normalize(); 304 if (rect.left > info_curchar.m_CharBox.left) { 305 rect.left = info_curchar.m_CharBox.left; 306 } 307 if (rect.right < info_curchar.m_CharBox.right) { 308 rect.right = info_curchar.m_CharBox.right; 309 } 310 if (rect.top < info_curchar.m_CharBox.top) { 311 rect.top = info_curchar.m_CharBox.top; 312 } 313 if (rect.bottom > info_curchar.m_CharBox.bottom) { 314 rect.bottom = info_curchar.m_CharBox.bottom; 315 } 316 } 317 } 318 rectArray.Add(rect); 319 return; 320 } 321 int CPDF_TextPage::GetIndexAtPos(CPDF_Point point, 322 FX_FLOAT xTolerance, 323 FX_FLOAT yTolerance) const { 324 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed) 325 return -3; 326 327 int pos = 0; 328 int NearPos = -1; 329 double xdif = 5000, ydif = 5000; 330 while (pos < m_charList.GetSize()) { 331 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos)); 332 CFX_FloatRect charrect = charinfo.m_CharBox; 333 if (charrect.Contains(point.x, point.y)) { 334 break; 335 } 336 if (xTolerance > 0 || yTolerance > 0) { 337 CFX_FloatRect charRectExt; 338 charrect.Normalize(); 339 charRectExt.left = charrect.left - xTolerance / 2; 340 charRectExt.right = charrect.right + xTolerance / 2; 341 charRectExt.top = charrect.top + yTolerance / 2; 342 charRectExt.bottom = charrect.bottom - yTolerance / 2; 343 if (charRectExt.Contains(point.x, point.y)) { 344 double curXdif, curYdif; 345 curXdif = FXSYS_fabs(point.x - charrect.left) < 346 FXSYS_fabs(point.x - charrect.right) 347 ? FXSYS_fabs(point.x - charrect.left) 348 : FXSYS_fabs(point.x - charrect.right); 349 curYdif = FXSYS_fabs(point.y - charrect.bottom) < 350 FXSYS_fabs(point.y - charrect.top) 351 ? FXSYS_fabs(point.y - charrect.bottom) 352 : FXSYS_fabs(point.y - charrect.top); 353 if (curYdif + curXdif < xdif + ydif) { 354 ydif = curYdif; 355 xdif = curXdif; 356 NearPos = pos; 357 } 358 } 359 } 360 ++pos; 361 } 362 if (pos >= m_charList.GetSize()) { 363 pos = NearPos; 364 } 365 return pos; 366 } 367 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const { 368 CFX_WideString strText; 369 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed) 370 return strText; 371 372 int nCount = m_charList.GetSize(); 373 int pos = 0; 374 FX_FLOAT posy = 0; 375 FX_BOOL IsContainPreChar = FALSE; 376 FX_BOOL ISAddLineFeed = FALSE; 377 while (pos < nCount) { 378 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++); 379 if (IsRectIntersect(rect, charinfo.m_CharBox)) { 380 if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar && 381 ISAddLineFeed) { 382 posy = charinfo.m_OriginY; 383 if (strText.GetLength() > 0) { 384 strText += L"\r\n"; 385 } 386 } 387 IsContainPreChar = TRUE; 388 ISAddLineFeed = FALSE; 389 if (charinfo.m_Unicode) { 390 strText += charinfo.m_Unicode; 391 } 392 } else if (charinfo.m_Unicode == 32) { 393 if (IsContainPreChar && charinfo.m_Unicode) { 394 strText += charinfo.m_Unicode; 395 IsContainPreChar = FALSE; 396 ISAddLineFeed = FALSE; 397 } 398 } else { 399 IsContainPreChar = FALSE; 400 ISAddLineFeed = TRUE; 401 } 402 } 403 return strText; 404 } 405 void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect, 406 CFX_RectArray& resRectArray) const { 407 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed) 408 return; 409 410 CFX_FloatRect curRect; 411 FX_BOOL flagNewRect = TRUE; 412 CPDF_TextObject* pCurObj = NULL; 413 int nCount = m_charList.GetSize(); 414 int pos = 0; 415 while (pos < nCount) { 416 PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++); 417 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) { 418 continue; 419 } 420 if (IsRectIntersect(rect, info_curchar.m_CharBox)) { 421 if (!pCurObj) { 422 pCurObj = info_curchar.m_pTextObj; 423 } 424 if (pCurObj != info_curchar.m_pTextObj) { 425 resRectArray.Add(curRect); 426 pCurObj = info_curchar.m_pTextObj; 427 flagNewRect = TRUE; 428 } 429 if (flagNewRect) { 430 curRect = info_curchar.m_CharBox; 431 flagNewRect = FALSE; 432 curRect.Normalize(); 433 } else { 434 info_curchar.m_CharBox.Normalize(); 435 if (curRect.left > info_curchar.m_CharBox.left) { 436 curRect.left = info_curchar.m_CharBox.left; 437 } 438 if (curRect.right < info_curchar.m_CharBox.right) { 439 curRect.right = info_curchar.m_CharBox.right; 440 } 441 if (curRect.top < info_curchar.m_CharBox.top) { 442 curRect.top = info_curchar.m_CharBox.top; 443 } 444 if (curRect.bottom > info_curchar.m_CharBox.bottom) { 445 curRect.bottom = info_curchar.m_CharBox.bottom; 446 } 447 } 448 } 449 } 450 resRectArray.Add(curRect); 451 return; 452 } 453 int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x, 454 FX_FLOAT y, 455 FX_FLOAT xTolerance, 456 FX_FLOAT yTolerance) const { 457 if (m_ParseOptions.m_bGetCharCodeOnly) { 458 return -3; 459 } 460 CPDF_Point point(x, y); 461 return GetIndexAtPos(point, xTolerance, yTolerance); 462 } 463 464 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const { 465 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed) 466 return; 467 468 if (index < 0 || index >= m_charList.GetSize()) 469 return; 470 471 const PAGECHAR_INFO* charinfo = 472 static_cast<PAGECHAR_INFO*>(m_charList.GetAt(index)); 473 info->m_Charcode = charinfo->m_CharCode; 474 info->m_OriginX = charinfo->m_OriginX; 475 info->m_OriginY = charinfo->m_OriginY; 476 info->m_Unicode = charinfo->m_Unicode; 477 info->m_Flag = charinfo->m_Flag; 478 info->m_CharBox = charinfo->m_CharBox; 479 info->m_pTextObj = charinfo->m_pTextObj; 480 if (charinfo->m_pTextObj && charinfo->m_pTextObj->GetFont()) { 481 info->m_FontSize = charinfo->m_pTextObj->GetFontSize(); 482 } else { 483 info->m_FontSize = kDefaultFontSize; 484 } 485 info->m_Matrix.Copy(charinfo->m_Matrix); 486 } 487 488 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start, 489 int32_t& nCount) const { 490 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start); 491 PAGECHAR_INFO charinfo2 = 492 *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1); 493 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag && 494 FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) { 495 return; 496 } 497 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) { 498 PAGECHAR_INFO charinfo1 = charinfo; 499 int startIndex = start; 500 while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag && 501 charinfo1.m_Index == charinfo.m_Index) { 502 startIndex--; 503 if (startIndex < 0) { 504 break; 505 } 506 charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex); 507 } 508 startIndex++; 509 start = startIndex; 510 } 511 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) { 512 PAGECHAR_INFO charinfo3 = charinfo2; 513 int endIndex = start + nCount - 1; 514 while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag && 515 charinfo3.m_Index == charinfo2.m_Index) { 516 endIndex++; 517 if (endIndex >= m_charList.GetSize()) { 518 break; 519 } 520 charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex); 521 } 522 endIndex--; 523 nCount = endIndex - start + 1; 524 } 525 } 526 CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const { 527 if (!m_bIsParsed || nCount == 0) 528 return L""; 529 530 if (start < 0) 531 start = 0; 532 533 if (nCount == -1) { 534 nCount = m_charList.GetSize() - start; 535 return m_TextBuf.GetWideString().Mid(start, 536 m_TextBuf.GetWideString().GetLength()); 537 } 538 if (nCount <= 0 || m_charList.GetSize() <= 0) { 539 return L""; 540 } 541 if (nCount + start > m_charList.GetSize() - 1) { 542 nCount = m_charList.GetSize() - start; 543 } 544 if (nCount <= 0) { 545 return L""; 546 } 547 CheckMarkedContentObject(start, nCount); 548 int startindex = 0; 549 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start); 550 int startOffset = 0; 551 while (charinfo.m_Index == -1) { 552 startOffset++; 553 if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) { 554 return L""; 555 } 556 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset); 557 } 558 startindex = charinfo.m_Index; 559 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1); 560 int nCountOffset = 0; 561 while (charinfo.m_Index == -1) { 562 nCountOffset++; 563 if (nCountOffset >= nCount) { 564 return L""; 565 } 566 charinfo = 567 *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1); 568 } 569 nCount = start + nCount - nCountOffset - startindex; 570 if (nCount <= 0) { 571 return L""; 572 } 573 return m_TextBuf.GetWideString().Mid(startindex, nCount); 574 } 575 int CPDF_TextPage::CountRects(int start, int nCount) { 576 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed || start < 0) 577 return -1; 578 579 if (nCount == -1 || nCount + start > m_charList.GetSize()) { 580 nCount = m_charList.GetSize() - start; 581 } 582 m_SelRects.RemoveAll(); 583 GetRectArray(start, nCount, m_SelRects); 584 return m_SelRects.GetSize(); 585 } 586 void CPDF_TextPage::GetRect(int rectIndex, 587 FX_FLOAT& left, 588 FX_FLOAT& top, 589 FX_FLOAT& right, 590 FX_FLOAT& bottom) const { 591 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed) 592 return; 593 594 if (rectIndex < 0 || rectIndex >= m_SelRects.GetSize()) 595 return; 596 597 left = m_SelRects.GetAt(rectIndex).left; 598 top = m_SelRects.GetAt(rectIndex).top; 599 right = m_SelRects.GetAt(rectIndex).right; 600 bottom = m_SelRects.GetAt(rectIndex).bottom; 601 } 602 603 FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate) { 604 if (m_ParseOptions.m_bGetCharCodeOnly) { 605 return FALSE; 606 } 607 if (end == start) { 608 return FALSE; 609 } 610 FPDF_CHAR_INFO info_start; 611 FPDF_CHAR_INFO info_end; 612 GetCharInfo(start, &info_start); 613 GetCharInfo(end, &info_end); 614 while (info_end.m_CharBox.Width() == 0 || info_end.m_CharBox.Height() == 0) { 615 if (--end <= start) 616 return FALSE; 617 618 GetCharInfo(end, &info_end); 619 } 620 FX_FLOAT dx = (info_end.m_OriginX - info_start.m_OriginX); 621 FX_FLOAT dy = (info_end.m_OriginY - info_start.m_OriginY); 622 if (dx == 0) { 623 if (dy > 0) { 624 Rotate = 90; 625 } else if (dy < 0) { 626 Rotate = 270; 627 } else { 628 Rotate = 0; 629 } 630 } else { 631 float a = FXSYS_atan2(dy, dx); 632 Rotate = (int)(a * 180 / FX_PI + 0.5); 633 } 634 if (Rotate < 0) { 635 Rotate = -Rotate; 636 } else if (Rotate > 0) { 637 Rotate = 360 - Rotate; 638 } 639 return TRUE; 640 } 641 642 FX_BOOL CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect, 643 int& Rotate) { 644 if (m_ParseOptions.m_bGetCharCodeOnly) { 645 return FALSE; 646 } 647 int start, end, count, 648 n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom, 649 TRUE); 650 if (n < 1) { 651 return FALSE; 652 } 653 if (n > 1) { 654 GetBoundedSegment(n - 1, start, count); 655 end = start + count - 1; 656 GetBoundedSegment(0, start, count); 657 } else { 658 GetBoundedSegment(0, start, count); 659 end = start + count - 1; 660 } 661 return GetBaselineRotate(start, end, Rotate); 662 } 663 FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) { 664 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed) 665 return FALSE; 666 667 if (rectIndex < 0 || rectIndex > m_SelRects.GetSize()) 668 return FALSE; 669 670 CFX_FloatRect rect = m_SelRects.GetAt(rectIndex); 671 return GetBaselineRotate(rect, Rotate); 672 } 673 int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left, 674 FX_FLOAT top, 675 FX_FLOAT right, 676 FX_FLOAT bottom, 677 FX_BOOL bContains) { 678 if (m_ParseOptions.m_bGetCharCodeOnly) 679 return -1; 680 681 m_Segment.RemoveAll(); 682 if (!m_bIsParsed) 683 return -1; 684 685 CFX_FloatRect rect(left, bottom, right, top); 686 rect.Normalize(); 687 int nCount = m_charList.GetSize(); 688 int pos = 0; 689 FPDF_SEGMENT segment; 690 segment.m_Start = 0; 691 segment.m_nCount = 0; 692 int segmentStatus = 0; 693 FX_BOOL IsContainPreChar = FALSE; 694 while (pos < nCount) { 695 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos); 696 if (bContains && rect.Contains(charinfo.m_CharBox)) { 697 if (segmentStatus == 0 || segmentStatus == 2) { 698 segment.m_Start = pos; 699 segment.m_nCount = 1; 700 segmentStatus = 1; 701 } else if (segmentStatus == 1) { 702 segment.m_nCount++; 703 } 704 IsContainPreChar = TRUE; 705 } else if (!bContains && 706 (IsRectIntersect(rect, charinfo.m_CharBox) || 707 rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) { 708 if (segmentStatus == 0 || segmentStatus == 2) { 709 segment.m_Start = pos; 710 segment.m_nCount = 1; 711 segmentStatus = 1; 712 } else if (segmentStatus == 1) { 713 segment.m_nCount++; 714 } 715 IsContainPreChar = TRUE; 716 } else if (charinfo.m_Unicode == 32) { 717 if (IsContainPreChar == TRUE) { 718 if (segmentStatus == 0 || segmentStatus == 2) { 719 segment.m_Start = pos; 720 segment.m_nCount = 1; 721 segmentStatus = 1; 722 } else if (segmentStatus == 1) { 723 segment.m_nCount++; 724 } 725 IsContainPreChar = FALSE; 726 } else { 727 if (segmentStatus == 1) { 728 segmentStatus = 2; 729 m_Segment.Add(segment); 730 segment.m_Start = 0; 731 segment.m_nCount = 0; 732 } 733 } 734 } else { 735 if (segmentStatus == 1) { 736 segmentStatus = 2; 737 m_Segment.Add(segment); 738 segment.m_Start = 0; 739 segment.m_nCount = 0; 740 } 741 IsContainPreChar = FALSE; 742 } 743 pos++; 744 } 745 if (segmentStatus == 1) { 746 segmentStatus = 2; 747 m_Segment.Add(segment); 748 segment.m_Start = 0; 749 segment.m_nCount = 0; 750 } 751 return m_Segment.GetSize(); 752 } 753 void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const { 754 if (m_ParseOptions.m_bGetCharCodeOnly) { 755 return; 756 } 757 if (index < 0 || index >= m_Segment.GetSize()) { 758 return; 759 } 760 start = m_Segment.GetAt(index).m_Start; 761 count = m_Segment.GetAt(index).m_nCount; 762 } 763 int CPDF_TextPage::GetWordBreak(int index, int direction) const { 764 if (m_ParseOptions.m_bGetCharCodeOnly || !m_bIsParsed) 765 return -1; 766 767 if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT) 768 return -1; 769 770 if (index < 0 || index >= m_charList.GetSize()) 771 return -1; 772 773 PAGECHAR_INFO charinfo; 774 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 775 if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) { 776 return index; 777 } 778 if (!IsLetter(charinfo.m_Unicode)) { 779 return index; 780 } 781 int breakPos = index; 782 if (direction == FPDFTEXT_LEFT) { 783 while (--breakPos > 0) { 784 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos); 785 if (!IsLetter(charinfo.m_Unicode)) { 786 return breakPos; 787 } 788 } 789 } else if (direction == FPDFTEXT_RIGHT) { 790 while (++breakPos < m_charList.GetSize()) { 791 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos); 792 if (!IsLetter(charinfo.m_Unicode)) { 793 return breakPos; 794 } 795 } 796 } 797 return breakPos; 798 } 799 int32_t CPDF_TextPage::FindTextlineFlowDirection() { 800 if (!m_pPage) { 801 return -1; 802 } 803 const int32_t nPageWidth = (int32_t)((CPDF_Page*)m_pPage)->GetPageWidth(); 804 const int32_t nPageHeight = (int32_t)((CPDF_Page*)m_pPage)->GetPageHeight(); 805 CFX_ByteArray nHorizontalMask; 806 if (!nHorizontalMask.SetSize(nPageWidth)) { 807 return -1; 808 } 809 uint8_t* pDataH = nHorizontalMask.GetData(); 810 CFX_ByteArray nVerticalMask; 811 if (!nVerticalMask.SetSize(nPageHeight)) { 812 return -1; 813 } 814 uint8_t* pDataV = nVerticalMask.GetData(); 815 int32_t index = 0; 816 FX_FLOAT fLineHeight = 0.0f; 817 CPDF_PageObject* pPageObj = NULL; 818 FX_POSITION pos = NULL; 819 pos = m_pPage->GetFirstObjectPosition(); 820 if (!pos) { 821 return -1; 822 } 823 while (pos) { 824 pPageObj = m_pPage->GetNextObject(pos); 825 if (NULL == pPageObj) { 826 continue; 827 } 828 if (PDFPAGE_TEXT != pPageObj->m_Type) { 829 continue; 830 } 831 int32_t minH = 832 (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left; 833 int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth 834 ? nPageWidth 835 : (int32_t)pPageObj->m_Right; 836 int32_t minV = 837 (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj->m_Bottom; 838 int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight 839 ? nPageHeight 840 : (int32_t)pPageObj->m_Top; 841 if (minH >= maxH || minV >= maxV) { 842 continue; 843 } 844 FXSYS_memset(pDataH + minH, 1, maxH - minH); 845 FXSYS_memset(pDataV + minV, 1, maxV - minV); 846 if (fLineHeight <= 0.0f) { 847 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom; 848 } 849 pPageObj = NULL; 850 } 851 int32_t nStartH = 0; 852 int32_t nEndH = 0; 853 FX_FLOAT nSumH = 0.0f; 854 for (index = 0; index < nPageWidth; index++) 855 if (1 == nHorizontalMask[index]) { 856 break; 857 } 858 nStartH = index; 859 for (index = nPageWidth; index > 0; index--) 860 if (1 == nHorizontalMask[index - 1]) { 861 break; 862 } 863 nEndH = index; 864 for (index = nStartH; index < nEndH; index++) { 865 nSumH += nHorizontalMask[index]; 866 } 867 nSumH /= nEndH - nStartH; 868 int32_t nStartV = 0; 869 int32_t nEndV = 0; 870 FX_FLOAT nSumV = 0.0f; 871 for (index = 0; index < nPageHeight; index++) 872 if (1 == nVerticalMask[index]) { 873 break; 874 } 875 nStartV = index; 876 for (index = nPageHeight; index > 0; index--) 877 if (1 == nVerticalMask[index - 1]) { 878 break; 879 } 880 nEndV = index; 881 for (index = nStartV; index < nEndV; index++) { 882 nSumV += nVerticalMask[index]; 883 } 884 nSumV /= nEndV - nStartV; 885 if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) { 886 return 0; 887 } 888 if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) { 889 return 1; 890 } 891 if (nSumH > 0.8f) { 892 return 0; 893 } 894 if (nSumH - nSumV > 0.0f) { 895 return 0; 896 } 897 if (nSumV - nSumH > 0.0f) { 898 return 1; 899 } 900 return -1; 901 } 902 void CPDF_TextPage::ProcessObject() { 903 CPDF_PageObject* pPageObj = NULL; 904 if (!m_pPage) { 905 return; 906 } 907 FX_POSITION pos; 908 pos = m_pPage->GetFirstObjectPosition(); 909 if (!pos) { 910 return; 911 } 912 m_TextlineDir = FindTextlineFlowDirection(); 913 int nCount = 0; 914 while (pos) { 915 pPageObj = m_pPage->GetNextObject(pos); 916 if (pPageObj) { 917 if (pPageObj->m_Type == PDFPAGE_TEXT) { 918 CFX_Matrix matrix; 919 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos); 920 nCount++; 921 } else if (pPageObj->m_Type == PDFPAGE_FORM) { 922 CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0); 923 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix); 924 } 925 } 926 pPageObj = NULL; 927 } 928 int count = m_LineObj.GetSize(); 929 for (int i = 0; i < count; i++) { 930 ProcessTextObject(m_LineObj.GetAt(i)); 931 } 932 m_LineObj.RemoveAll(); 933 CloseTempLine(); 934 } 935 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, 936 const CFX_Matrix& formMatrix) { 937 CPDF_PageObject* pPageObj = NULL; 938 FX_POSITION pos; 939 if (!pFormObj) { 940 return; 941 } 942 pos = pFormObj->m_pForm->GetFirstObjectPosition(); 943 if (!pos) { 944 return; 945 } 946 CFX_Matrix curFormMatrix; 947 curFormMatrix.Copy(pFormObj->m_FormMatrix); 948 curFormMatrix.Concat(formMatrix); 949 while (pos) { 950 pPageObj = pFormObj->m_pForm->GetNextObject(pos); 951 if (pPageObj) { 952 if (pPageObj->m_Type == PDFPAGE_TEXT) { 953 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos); 954 } else if (pPageObj->m_Type == PDFPAGE_FORM) { 955 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix); 956 } 957 } 958 pPageObj = NULL; 959 } 960 } 961 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const { 962 if (charCode == -1) { 963 return 0; 964 } 965 int w = pFont->GetCharWidthF(charCode); 966 if (w == 0) { 967 CFX_ByteString str; 968 pFont->AppendChar(str, charCode); 969 w = pFont->GetStringWidth(str, 1); 970 if (w == 0) { 971 FX_RECT BBox; 972 pFont->GetCharBBox(charCode, BBox); 973 w = BBox.right - BBox.left; 974 } 975 } 976 return w; 977 } 978 void CPDF_TextPage::OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str) { 979 int32_t start, count; 980 CFX_BidiChar::Direction ret = pBidi->GetBidiInfo(&start, &count); 981 if (ret == CFX_BidiChar::RIGHT) { 982 for (int i = start + count - 1; i >= start; i--) { 983 m_TextBuf.AppendChar(str.GetAt(i)); 984 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i)); 985 } 986 } else { 987 int end = start + count; 988 for (int i = start; i < end; i++) { 989 m_TextBuf.AppendChar(str.GetAt(i)); 990 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i)); 991 } 992 } 993 } 994 void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i) { 995 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i); 996 FX_WCHAR wChar = str.GetAt(i); 997 if (!IsControlChar(Info)) { 998 Info.m_Index = m_TextBuf.GetLength(); 999 if (wChar >= 0xFB00 && wChar <= 0xFB06) { 1000 FX_WCHAR* pDst = NULL; 1001 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 1002 if (nCount >= 1) { 1003 pDst = FX_Alloc(FX_WCHAR, nCount); 1004 FX_Unicode_GetNormalization(wChar, pDst); 1005 for (int nIndex = 0; nIndex < nCount; nIndex++) { 1006 PAGECHAR_INFO Info2 = Info; 1007 Info2.m_Unicode = pDst[nIndex]; 1008 Info2.m_Flag = FPDFTEXT_CHAR_PIECE; 1009 m_TextBuf.AppendChar(Info2.m_Unicode); 1010 if (!m_ParseOptions.m_bGetCharCodeOnly) { 1011 m_charList.Add(Info2); 1012 } 1013 } 1014 FX_Free(pDst); 1015 return; 1016 } 1017 } 1018 m_TextBuf.AppendChar(wChar); 1019 } else { 1020 Info.m_Index = -1; 1021 } 1022 if (!m_ParseOptions.m_bGetCharCodeOnly) { 1023 m_charList.Add(Info); 1024 } 1025 } 1026 void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i) { 1027 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i); 1028 if (!IsControlChar(Info)) { 1029 Info.m_Index = m_TextBuf.GetLength(); 1030 FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE); 1031 FX_WCHAR* pDst = NULL; 1032 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 1033 if (nCount >= 1) { 1034 pDst = FX_Alloc(FX_WCHAR, nCount); 1035 FX_Unicode_GetNormalization(wChar, pDst); 1036 for (int nIndex = 0; nIndex < nCount; nIndex++) { 1037 PAGECHAR_INFO Info2 = Info; 1038 Info2.m_Unicode = pDst[nIndex]; 1039 Info2.m_Flag = FPDFTEXT_CHAR_PIECE; 1040 m_TextBuf.AppendChar(Info2.m_Unicode); 1041 if (!m_ParseOptions.m_bGetCharCodeOnly) { 1042 m_charList.Add(Info2); 1043 } 1044 } 1045 FX_Free(pDst); 1046 return; 1047 } 1048 Info.m_Unicode = wChar; 1049 m_TextBuf.AppendChar(Info.m_Unicode); 1050 } else { 1051 Info.m_Index = -1; 1052 } 1053 if (!m_ParseOptions.m_bGetCharCodeOnly) { 1054 m_charList.Add(Info); 1055 } 1056 } 1057 void CPDF_TextPage::CloseTempLine() { 1058 int count1 = m_TempCharList.GetSize(); 1059 if (count1 <= 0) { 1060 return; 1061 } 1062 std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar); 1063 CFX_WideString str = m_TempTextBuf.GetWideString(); 1064 CFX_WordArray order; 1065 FX_BOOL bR2L = FALSE; 1066 int32_t start = 0, count = 0; 1067 int nR2L = 0, nL2R = 0; 1068 FX_BOOL bPrevSpace = FALSE; 1069 for (int i = 0; i < str.GetLength(); i++) { 1070 if (str.GetAt(i) == 32) { 1071 if (bPrevSpace) { 1072 m_TempTextBuf.Delete(i, 1); 1073 m_TempCharList.Delete(i); 1074 str.Delete(i); 1075 count1--; 1076 i--; 1077 continue; 1078 } 1079 bPrevSpace = TRUE; 1080 } else { 1081 bPrevSpace = FALSE; 1082 } 1083 if (pBidiChar->AppendChar(str.GetAt(i))) { 1084 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); 1085 order.Add(start); 1086 order.Add(count); 1087 order.Add(ret); 1088 if (!bR2L) { 1089 if (ret == CFX_BidiChar::RIGHT) { 1090 nR2L++; 1091 } else if (ret == CFX_BidiChar::LEFT) { 1092 nL2R++; 1093 } 1094 } 1095 } 1096 } 1097 if (pBidiChar->EndChar()) { 1098 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); 1099 order.Add(start); 1100 order.Add(count); 1101 order.Add(ret); 1102 if (!bR2L) { 1103 if (ret == CFX_BidiChar::RIGHT) { 1104 nR2L++; 1105 } else if (ret == CFX_BidiChar::LEFT) { 1106 nL2R++; 1107 } 1108 } 1109 } 1110 if (nR2L > 0 && nR2L >= nL2R) { 1111 bR2L = TRUE; 1112 } 1113 if (m_parserflag == FPDFTEXT_RLTB || bR2L) { 1114 int count = order.GetSize(); 1115 for (int i = count - 1; i > 0; i -= 3) { 1116 int ret = order.GetAt(i); 1117 int start = order.GetAt(i - 2); 1118 int count1 = order.GetAt(i - 1); 1119 if (ret == 2 || ret == 0) { 1120 for (int j = start + count1 - 1; j >= start; j--) { 1121 AddCharInfoByRLDirection(str, j); 1122 } 1123 } else { 1124 int j = i; 1125 FX_BOOL bSymbol = FALSE; 1126 while (j > 0 && order.GetAt(j) != 2) { 1127 bSymbol = !order.GetAt(j); 1128 j -= 3; 1129 } 1130 int end = start + count1; 1131 int n = 0; 1132 if (bSymbol) { 1133 n = j + 6; 1134 } else { 1135 n = j + 3; 1136 } 1137 if (n >= i) { 1138 for (int m = start; m < end; m++) { 1139 AddCharInfoByLRDirection(str, m); 1140 } 1141 } else { 1142 j = i; 1143 i = n; 1144 for (; n <= j; n += 3) { 1145 int start = order.GetAt(n - 2); 1146 int count1 = order.GetAt(n - 1); 1147 int end = start + count1; 1148 for (int m = start; m < end; m++) { 1149 AddCharInfoByLRDirection(str, m); 1150 } 1151 } 1152 } 1153 } 1154 } 1155 } else { 1156 int count = order.GetSize(); 1157 FX_BOOL bL2R = FALSE; 1158 for (int i = 0; i < count; i += 3) { 1159 int ret = order.GetAt(i + 2); 1160 int start = order.GetAt(i); 1161 int count1 = order.GetAt(i + 1); 1162 if (ret == 2 || (i == 0 && ret == 0 && !bL2R)) { 1163 int j = i + 3; 1164 while (bR2L && j < count) { 1165 if (order.GetAt(j + 2) == 1) { 1166 break; 1167 } else { 1168 j += 3; 1169 } 1170 } 1171 if (j == 3) { 1172 i = -3; 1173 bL2R = TRUE; 1174 continue; 1175 } 1176 int end = m_TempCharList.GetSize() - 1; 1177 if (j < count) { 1178 end = order.GetAt(j) - 1; 1179 } 1180 i = j - 3; 1181 for (int n = end; n >= start; n--) { 1182 AddCharInfoByRLDirection(str, n); 1183 } 1184 } else { 1185 int end = start + count1; 1186 for (int n = start; n < end; n++) { 1187 AddCharInfoByLRDirection(str, n); 1188 } 1189 } 1190 } 1191 } 1192 order.RemoveAll(); 1193 m_TempCharList.RemoveAll(); 1194 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength()); 1195 } 1196 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj, 1197 const CFX_Matrix& formMatrix, 1198 FX_POSITION ObjPos) { 1199 CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right, 1200 pTextObj->m_Top); 1201 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) { 1202 return; 1203 } 1204 int count = m_LineObj.GetSize(); 1205 PDFTEXT_Obj Obj; 1206 Obj.m_pTextObj = pTextObj; 1207 Obj.m_formMatrix = formMatrix; 1208 if (count == 0) { 1209 m_LineObj.Add(Obj); 1210 return; 1211 } 1212 if (IsSameAsPreTextObject(pTextObj, ObjPos)) { 1213 return; 1214 } 1215 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1); 1216 CPDF_TextObjectItem item; 1217 int nItem = prev_Obj.m_pTextObj->CountItems(); 1218 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item); 1219 FX_FLOAT prev_width = 1220 GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) * 1221 prev_Obj.m_pTextObj->GetFontSize() / 1000; 1222 CFX_Matrix prev_matrix; 1223 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix); 1224 prev_width = FXSYS_fabs(prev_width); 1225 prev_matrix.Concat(prev_Obj.m_formMatrix); 1226 prev_width = prev_matrix.TransformDistance(prev_width); 1227 pTextObj->GetItemInfo(0, &item); 1228 FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) * 1229 pTextObj->GetFontSize() / 1000; 1230 this_width = FXSYS_fabs(this_width); 1231 CFX_Matrix this_matrix; 1232 pTextObj->GetTextMatrix(&this_matrix); 1233 this_width = FXSYS_fabs(this_width); 1234 this_matrix.Concat(formMatrix); 1235 this_width = this_matrix.TransformDistance(this_width); 1236 FX_FLOAT threshold = 1237 prev_width > this_width ? prev_width / 4 : this_width / 4; 1238 FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(), 1239 prev_y = prev_Obj.m_pTextObj->GetPosY(); 1240 prev_Obj.m_formMatrix.Transform(prev_x, prev_y); 1241 m_DisplayMatrix.Transform(prev_x, prev_y); 1242 FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY(); 1243 formMatrix.Transform(this_x, this_y); 1244 m_DisplayMatrix.Transform(this_x, this_y); 1245 if (FXSYS_fabs(this_y - prev_y) > threshold * 2) { 1246 for (int i = 0; i < count; i++) { 1247 ProcessTextObject(m_LineObj.GetAt(i)); 1248 } 1249 m_LineObj.RemoveAll(); 1250 m_LineObj.Add(Obj); 1251 return; 1252 } 1253 int i = 0; 1254 if (m_ParseOptions.m_bNormalizeObjs) { 1255 for (i = count - 1; i >= 0; i--) { 1256 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i); 1257 CFX_Matrix prev_matrix; 1258 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix); 1259 FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(), 1260 Prev_y = prev_Obj.m_pTextObj->GetPosY(); 1261 prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y); 1262 m_DisplayMatrix.Transform(Prev_x, Prev_y); 1263 if (this_x >= Prev_x) { 1264 if (i == count - 1) { 1265 m_LineObj.Add(Obj); 1266 } else { 1267 m_LineObj.InsertAt(i + 1, Obj); 1268 } 1269 break; 1270 } 1271 } 1272 if (i < 0) { 1273 m_LineObj.InsertAt(0, Obj); 1274 } 1275 } else { 1276 m_LineObj.Add(Obj); 1277 } 1278 } 1279 int32_t CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) { 1280 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1281 CPDF_ContentMarkData* pMarkData = 1282 (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject(); 1283 if (!pMarkData) { 1284 return FPDFTEXT_MC_PASS; 1285 } 1286 int nContentMark = pMarkData->CountItems(); 1287 if (nContentMark < 1) { 1288 return FPDFTEXT_MC_PASS; 1289 } 1290 CFX_WideString actText; 1291 FX_BOOL bExist = FALSE; 1292 CPDF_Dictionary* pDict = NULL; 1293 int n = 0; 1294 for (n = 0; n < nContentMark; n++) { 1295 CPDF_ContentMarkItem& item = pMarkData->GetItem(n); 1296 CFX_ByteString tagStr = (CFX_ByteString)item.GetName(); 1297 pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam())); 1298 CPDF_String* temp = 1299 ToString(pDict ? pDict->GetElement("ActualText") : nullptr); 1300 if (temp) { 1301 bExist = TRUE; 1302 actText = temp->GetUnicodeText(); 1303 } 1304 } 1305 if (!bExist) { 1306 return FPDFTEXT_MC_PASS; 1307 } 1308 if (m_pPreTextObj) { 1309 if (CPDF_ContentMarkData* pPreMarkData = 1310 (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) { 1311 if (pPreMarkData->CountItems() == n) { 1312 CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1); 1313 if (pDict == item.GetParam()) { 1314 return FPDFTEXT_MC_DONE; 1315 } 1316 } 1317 } 1318 } 1319 CPDF_Font* pFont = pTextObj->GetFont(); 1320 FX_STRSIZE nItems = actText.GetLength(); 1321 if (nItems < 1) { 1322 return FPDFTEXT_MC_PASS; 1323 } 1324 bExist = FALSE; 1325 for (FX_STRSIZE i = 0; i < nItems; i++) { 1326 FX_WCHAR wChar = actText.GetAt(i); 1327 if (-1 == pFont->CharCodeFromUnicode(wChar)) { 1328 continue; 1329 } else { 1330 bExist = TRUE; 1331 break; 1332 } 1333 } 1334 if (!bExist) { 1335 return FPDFTEXT_MC_PASS; 1336 } 1337 bExist = FALSE; 1338 for (FX_STRSIZE i = 0; i < nItems; i++) { 1339 FX_WCHAR wChar = actText.GetAt(i); 1340 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) { 1341 bExist = TRUE; 1342 break; 1343 } 1344 } 1345 if (!bExist) { 1346 return FPDFTEXT_MC_DONE; 1347 } 1348 return FPDFTEXT_MC_DELAY; 1349 } 1350 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) { 1351 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1352 CPDF_ContentMarkData* pMarkData = 1353 (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject(); 1354 if (!pMarkData) { 1355 return; 1356 } 1357 int nContentMark = pMarkData->CountItems(); 1358 if (nContentMark < 1) { 1359 return; 1360 } 1361 CFX_WideString actText; 1362 CPDF_Dictionary* pDict = NULL; 1363 int n = 0; 1364 for (n = 0; n < nContentMark; n++) { 1365 CPDF_ContentMarkItem& item = pMarkData->GetItem(n); 1366 CFX_ByteString tagStr = (CFX_ByteString)item.GetName(); 1367 pDict = ToDictionary(static_cast<CPDF_Object*>(item.GetParam())); 1368 CPDF_String* temp = 1369 ToString(pDict ? pDict->GetElement("ActualText") : nullptr); 1370 if (temp) { 1371 actText = temp->GetUnicodeText(); 1372 } 1373 } 1374 FX_STRSIZE nItems = actText.GetLength(); 1375 if (nItems < 1) { 1376 return; 1377 } 1378 CPDF_Font* pFont = pTextObj->GetFont(); 1379 CFX_Matrix formMatrix = Obj.m_formMatrix; 1380 CFX_Matrix matrix; 1381 pTextObj->GetTextMatrix(&matrix); 1382 matrix.Concat(formMatrix); 1383 FX_FLOAT fPosX = pTextObj->GetPosX(); 1384 FX_FLOAT fPosY = pTextObj->GetPosY(); 1385 int nCharInfoIndex = m_TextBuf.GetLength(); 1386 CFX_FloatRect charBox; 1387 charBox.top = pTextObj->m_Top; 1388 charBox.left = pTextObj->m_Left; 1389 charBox.right = pTextObj->m_Right; 1390 charBox.bottom = pTextObj->m_Bottom; 1391 for (FX_STRSIZE k = 0; k < nItems; k++) { 1392 FX_WCHAR wChar = actText.GetAt(k); 1393 if (wChar <= 0x80 && !isprint(wChar)) { 1394 wChar = 0x20; 1395 } 1396 if (wChar >= 0xFFFD) { 1397 continue; 1398 } 1399 PAGECHAR_INFO charinfo; 1400 charinfo.m_OriginX = fPosX; 1401 charinfo.m_OriginY = fPosY; 1402 charinfo.m_Index = nCharInfoIndex; 1403 charinfo.m_Unicode = wChar; 1404 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar); 1405 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE; 1406 charinfo.m_pTextObj = pTextObj; 1407 charinfo.m_CharBox.top = charBox.top; 1408 charinfo.m_CharBox.left = charBox.left; 1409 charinfo.m_CharBox.right = charBox.right; 1410 charinfo.m_CharBox.bottom = charBox.bottom; 1411 charinfo.m_Matrix.Copy(matrix); 1412 m_TempTextBuf.AppendChar(wChar); 1413 m_TempCharList.Add(charinfo); 1414 } 1415 } 1416 void CPDF_TextPage::FindPreviousTextObject(void) { 1417 if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) { 1418 return; 1419 } 1420 PAGECHAR_INFO preChar; 1421 if (m_TempCharList.GetSize() >= 1) { 1422 preChar = 1423 *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1); 1424 } else { 1425 preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1); 1426 } 1427 if (preChar.m_pTextObj) { 1428 m_pPreTextObj = preChar.m_pTextObj; 1429 } 1430 } 1431 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend, 1432 int32_t iBufStartAppend) { 1433 int32_t i, j; 1434 i = iCharListStartAppend; 1435 j = m_TempCharList.GetSize() - 1; 1436 for (; i < j; i++, j--) { 1437 std::swap(m_TempCharList[i], m_TempCharList[j]); 1438 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index); 1439 } 1440 FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer(); 1441 i = iBufStartAppend; 1442 j = m_TempTextBuf.GetLength() - 1; 1443 for (; i < j; i++, j--) { 1444 std::swap(pTempBuffer[i], pTempBuffer[j]); 1445 } 1446 } 1447 FX_BOOL CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj, 1448 const CPDF_Font* pFont, 1449 int nItems) const { 1450 std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar); 1451 int32_t nR2L = 0; 1452 int32_t nL2R = 0; 1453 int32_t start = 0, count = 0; 1454 CPDF_TextObjectItem item; 1455 for (int32_t i = 0; i < nItems; i++) { 1456 pTextObj->GetItemInfo(i, &item); 1457 if (item.m_CharCode == (FX_DWORD)-1) { 1458 continue; 1459 } 1460 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode); 1461 FX_WCHAR wChar = wstrItem.GetAt(0); 1462 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) { 1463 wChar = (FX_WCHAR)item.m_CharCode; 1464 } 1465 if (!wChar) { 1466 continue; 1467 } 1468 if (pBidiChar->AppendChar(wChar)) { 1469 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); 1470 if (ret == CFX_BidiChar::RIGHT) { 1471 nR2L++; 1472 } else if (ret == CFX_BidiChar::LEFT) { 1473 nL2R++; 1474 } 1475 } 1476 } 1477 if (pBidiChar->EndChar()) { 1478 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); 1479 if (ret == CFX_BidiChar::RIGHT) { 1480 nR2L++; 1481 } else if (ret == CFX_BidiChar::LEFT) { 1482 nL2R++; 1483 } 1484 } 1485 return (nR2L > 0 && nR2L >= nL2R); 1486 } 1487 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) { 1488 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1489 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) { 1490 return; 1491 } 1492 CFX_Matrix formMatrix = Obj.m_formMatrix; 1493 CPDF_Font* pFont = pTextObj->GetFont(); 1494 CFX_Matrix matrix; 1495 pTextObj->GetTextMatrix(&matrix); 1496 matrix.Concat(formMatrix); 1497 int32_t bPreMKC = PreMarkedContent(Obj); 1498 if (FPDFTEXT_MC_DONE == bPreMKC) { 1499 m_pPreTextObj = pTextObj; 1500 m_perMatrix.Copy(formMatrix); 1501 return; 1502 } 1503 int result = 0; 1504 if (m_pPreTextObj) { 1505 result = ProcessInsertObject(pTextObj, formMatrix); 1506 if (2 == result) { 1507 m_CurlineRect = 1508 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, 1509 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top); 1510 } else { 1511 m_CurlineRect.Union( 1512 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, 1513 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top)); 1514 } 1515 PAGECHAR_INFO generateChar; 1516 if (result == 1) { 1517 if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) { 1518 if (!formMatrix.IsIdentity()) { 1519 generateChar.m_Matrix.Copy(formMatrix); 1520 } 1521 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR); 1522 m_TempCharList.Add(generateChar); 1523 } 1524 } else if (result == 2) { 1525 CloseTempLine(); 1526 if (m_TextBuf.GetSize()) { 1527 if (m_ParseOptions.m_bGetCharCodeOnly) { 1528 m_TextBuf.AppendChar(TEXT_RETURN_CHAR); 1529 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR); 1530 } else { 1531 if (GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) { 1532 m_TextBuf.AppendChar(TEXT_RETURN_CHAR); 1533 if (!formMatrix.IsIdentity()) { 1534 generateChar.m_Matrix.Copy(formMatrix); 1535 } 1536 m_charList.Add(generateChar); 1537 } 1538 if (GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) { 1539 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR); 1540 if (!formMatrix.IsIdentity()) { 1541 generateChar.m_Matrix.Copy(formMatrix); 1542 } 1543 m_charList.Add(generateChar); 1544 } 1545 } 1546 } 1547 } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) { 1548 int32_t nChars = pTextObj->CountChars(); 1549 if (nChars == 1) { 1550 CPDF_TextObjectItem item; 1551 pTextObj->GetCharInfo(0, &item); 1552 CFX_WideString wstrItem = 1553 pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 1554 if (wstrItem.IsEmpty()) { 1555 wstrItem += (FX_WCHAR)item.m_CharCode; 1556 } 1557 FX_WCHAR curChar = wstrItem.GetAt(0); 1558 if (0x2D == curChar || 0xAD == curChar) { 1559 return; 1560 } 1561 } 1562 while (m_TempTextBuf.GetSize() > 0 && 1563 m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() - 1564 1) == 0x20) { 1565 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1566 m_TempCharList.Delete(m_TempCharList.GetSize() - 1); 1567 } 1568 PAGECHAR_INFO* cha = 1569 (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1); 1570 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1571 cha->m_Unicode = 0x2; 1572 cha->m_Flag = FPDFTEXT_CHAR_HYPHEN; 1573 m_TempTextBuf.AppendChar(0xfffe); 1574 } 1575 } else { 1576 m_CurlineRect = 1577 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, 1578 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top); 1579 } 1580 if (FPDFTEXT_MC_DELAY == bPreMKC) { 1581 ProcessMarkedContent(Obj); 1582 m_pPreTextObj = pTextObj; 1583 m_perMatrix.Copy(formMatrix); 1584 return; 1585 } 1586 m_pPreTextObj = pTextObj; 1587 m_perMatrix.Copy(formMatrix); 1588 int nItems = pTextObj->CountItems(); 1589 FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix); 1590 1591 const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems); 1592 const FX_BOOL bIsBidiAndMirrorInverse = 1593 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0; 1594 int32_t iBufStartAppend = m_TempTextBuf.GetLength(); 1595 int32_t iCharListStartAppend = m_TempCharList.GetSize(); 1596 1597 FX_FLOAT spacing = 0; 1598 for (int i = 0; i < nItems; i++) { 1599 CPDF_TextObjectItem item; 1600 PAGECHAR_INFO charinfo; 1601 charinfo.m_OriginX = 0; 1602 charinfo.m_OriginY = 0; 1603 pTextObj->GetItemInfo(i, &item); 1604 if (item.m_CharCode == (FX_DWORD)-1) { 1605 CFX_WideString str = m_TempTextBuf.GetWideString(); 1606 if (str.IsEmpty()) { 1607 str = m_TextBuf.GetWideString(); 1608 } 1609 if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) { 1610 continue; 1611 } 1612 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1613 spacing = -fontsize_h * item.m_OriginX / 1000; 1614 continue; 1615 } 1616 FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace; 1617 if (charSpace > 0.001) { 1618 spacing += matrix.TransformDistance(charSpace); 1619 } else if (charSpace < -0.001) { 1620 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace)); 1621 } 1622 spacing -= baseSpace; 1623 if (spacing && i > 0) { 1624 int last_width = 0; 1625 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1626 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 1627 FX_FLOAT threshold = 0; 1628 if (space_charcode != -1) { 1629 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; 1630 } 1631 if (threshold > fontsize_h / 3) { 1632 threshold = 0; 1633 } else { 1634 threshold /= 2; 1635 } 1636 if (threshold == 0) { 1637 threshold = fontsize_h; 1638 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); 1639 threshold = this_width > last_width ? (FX_FLOAT)this_width 1640 : (FX_FLOAT)last_width; 1641 threshold = _NormalizeThreshold(threshold); 1642 threshold = fontsize_h * threshold / 1000; 1643 } 1644 if (threshold && (spacing && spacing >= threshold)) { 1645 charinfo.m_Unicode = TEXT_BLANK_CHAR; 1646 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED; 1647 charinfo.m_pTextObj = pTextObj; 1648 charinfo.m_Index = m_TextBuf.GetLength(); 1649 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR); 1650 charinfo.m_CharCode = -1; 1651 charinfo.m_Matrix.Copy(formMatrix); 1652 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, 1653 charinfo.m_OriginY); 1654 charinfo.m_CharBox = 1655 CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY, 1656 charinfo.m_OriginX, charinfo.m_OriginY); 1657 m_TempCharList.Add(charinfo); 1658 } 1659 if (item.m_CharCode == (FX_DWORD)-1) { 1660 continue; 1661 } 1662 } 1663 spacing = 0; 1664 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode); 1665 FX_BOOL bNoUnicode = FALSE; 1666 FX_WCHAR wChar = wstrItem.GetAt(0); 1667 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) { 1668 if (wstrItem.IsEmpty()) { 1669 wstrItem += (FX_WCHAR)item.m_CharCode; 1670 } else { 1671 wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode); 1672 } 1673 bNoUnicode = TRUE; 1674 } 1675 charinfo.m_Index = -1; 1676 charinfo.m_CharCode = item.m_CharCode; 1677 if (bNoUnicode) { 1678 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE; 1679 } else { 1680 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL; 1681 } 1682 charinfo.m_pTextObj = pTextObj; 1683 charinfo.m_OriginX = 0, charinfo.m_OriginY = 0; 1684 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, 1685 charinfo.m_OriginY); 1686 FX_RECT rect(0, 0, 0, 0); 1687 rect.Intersect(0, 0, 0, 0); 1688 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect); 1689 charinfo.m_CharBox.top = 1690 rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY; 1691 charinfo.m_CharBox.left = 1692 rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX; 1693 charinfo.m_CharBox.right = 1694 rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX; 1695 charinfo.m_CharBox.bottom = 1696 rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY; 1697 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) { 1698 charinfo.m_CharBox.top = 1699 charinfo.m_CharBox.bottom + pTextObj->GetFontSize(); 1700 } 1701 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) { 1702 charinfo.m_CharBox.right = 1703 charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode); 1704 } 1705 matrix.TransformRect(charinfo.m_CharBox); 1706 charinfo.m_Matrix.Copy(matrix); 1707 if (wstrItem.IsEmpty()) { 1708 charinfo.m_Unicode = 0; 1709 m_TempCharList.Add(charinfo); 1710 m_TempTextBuf.AppendChar(0xfffe); 1711 continue; 1712 } else { 1713 int nTotal = wstrItem.GetLength(); 1714 FX_BOOL bDel = FALSE; 1715 const int count = std::min(m_TempCharList.GetSize(), 7); 1716 FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance( 1717 (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize()); 1718 for (int n = m_TempCharList.GetSize(); 1719 n > m_TempCharList.GetSize() - count; n--) { 1720 PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(n - 1); 1721 if (charinfo1->m_CharCode == charinfo.m_CharCode && 1722 charinfo1->m_pTextObj->GetFont() == 1723 charinfo.m_pTextObj->GetFont() && 1724 FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < threshold && 1725 FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < threshold) { 1726 bDel = TRUE; 1727 break; 1728 } 1729 } 1730 if (!bDel) { 1731 for (int nIndex = 0; nIndex < nTotal; nIndex++) { 1732 charinfo.m_Unicode = wstrItem.GetAt(nIndex); 1733 if (charinfo.m_Unicode) { 1734 charinfo.m_Index = m_TextBuf.GetLength(); 1735 m_TempTextBuf.AppendChar(charinfo.m_Unicode); 1736 } else { 1737 m_TempTextBuf.AppendChar(0xfffe); 1738 } 1739 m_TempCharList.Add(charinfo); 1740 } 1741 } else if (i == 0) { 1742 CFX_WideString str = m_TempTextBuf.GetWideString(); 1743 if (!str.IsEmpty() && 1744 str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) { 1745 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1746 m_TempCharList.Delete(m_TempCharList.GetSize() - 1); 1747 } 1748 } 1749 } 1750 } 1751 if (bIsBidiAndMirrorInverse) { 1752 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend); 1753 } 1754 } 1755 int32_t CPDF_TextPage::GetTextObjectWritingMode( 1756 const CPDF_TextObject* pTextObj) { 1757 int32_t nChars = pTextObj->CountChars(); 1758 if (nChars == 1) { 1759 return m_TextlineDir; 1760 } 1761 CPDF_TextObjectItem first, last; 1762 pTextObj->GetCharInfo(0, &first); 1763 pTextObj->GetCharInfo(nChars - 1, &last); 1764 CFX_Matrix textMatrix; 1765 pTextObj->GetTextMatrix(&textMatrix); 1766 textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY); 1767 textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY); 1768 FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX); 1769 FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY); 1770 if (dX <= 0.0001f && dY <= 0.0001f) { 1771 return -1; 1772 } 1773 CFX_VectorF v; 1774 v.Set(dX, dY); 1775 v.Normalize(); 1776 if (v.y <= 0.0872f) { 1777 return v.x <= 0.0872f ? m_TextlineDir : 0; 1778 } 1779 if (v.x <= 0.0872f) { 1780 return 1; 1781 } 1782 return m_TextlineDir; 1783 } 1784 FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) { 1785 CFX_WideString strCurText = m_TempTextBuf.GetWideString(); 1786 if (strCurText.GetLength() == 0) { 1787 strCurText = m_TextBuf.GetWideString(); 1788 } 1789 FX_STRSIZE nCount = strCurText.GetLength(); 1790 int nIndex = nCount - 1; 1791 FX_WCHAR wcTmp = strCurText.GetAt(nIndex); 1792 while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) { 1793 wcTmp = strCurText.GetAt(--nIndex); 1794 } 1795 if (0x2D == wcTmp || 0xAD == wcTmp) { 1796 if (--nIndex > 0) { 1797 FX_WCHAR preChar = strCurText.GetAt((nIndex)); 1798 if (((preChar >= L'A' && preChar <= L'Z') || 1799 (preChar >= L'a' && preChar <= L'z')) && 1800 ((curChar >= L'A' && curChar <= L'Z') || 1801 (curChar >= L'a' && curChar <= L'z'))) { 1802 return TRUE; 1803 } 1804 } 1805 int size = m_TempCharList.GetSize(); 1806 PAGECHAR_INFO preChar; 1807 if (size) { 1808 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1]; 1809 } else { 1810 size = m_charList.GetSize(); 1811 if (size == 0) { 1812 return FALSE; 1813 } 1814 preChar = (PAGECHAR_INFO)m_charList[size - 1]; 1815 } 1816 if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag && 1817 (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode)) { 1818 return TRUE; 1819 } 1820 } 1821 return FALSE; 1822 } 1823 int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj, 1824 const CFX_Matrix& formMatrix) { 1825 FindPreviousTextObject(); 1826 FX_BOOL bNewline = FALSE; 1827 int WritingMode = GetTextObjectWritingMode(pObj); 1828 if (WritingMode == -1) { 1829 WritingMode = GetTextObjectWritingMode(m_pPreTextObj); 1830 } 1831 CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right, 1832 pObj->m_Top); 1833 CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, 1834 m_pPreTextObj->m_Right, m_pPreTextObj->m_Top); 1835 CPDF_TextObjectItem PrevItem, item; 1836 int nItem = m_pPreTextObj->CountItems(); 1837 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem); 1838 pObj->GetItemInfo(0, &item); 1839 CFX_WideString wstrItem = 1840 pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 1841 if (wstrItem.IsEmpty()) { 1842 wstrItem += (FX_WCHAR)item.m_CharCode; 1843 } 1844 FX_WCHAR curChar = wstrItem.GetAt(0); 1845 if (WritingMode == 0) { 1846 if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) { 1847 FX_FLOAT top = 1848 this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top; 1849 FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom 1850 : prev_rect.bottom; 1851 if (bottom >= top) { 1852 if (IsHyphen(curChar)) { 1853 return 3; 1854 } 1855 return 2; 1856 } 1857 } 1858 } else if (WritingMode == 1) { 1859 if (this_rect.Width() > pObj->GetFontSize() * 0.1f && 1860 prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) { 1861 FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left 1862 : m_CurlineRect.left; 1863 FX_FLOAT right = this_rect.right < m_CurlineRect.right 1864 ? this_rect.right 1865 : m_CurlineRect.right; 1866 if (right <= left) { 1867 if (IsHyphen(curChar)) { 1868 return 3; 1869 } 1870 return 2; 1871 } 1872 } 1873 } 1874 FX_FLOAT last_pos = PrevItem.m_OriginX; 1875 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont()); 1876 FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000; 1877 last_width = FXSYS_fabs(last_width); 1878 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont()); 1879 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000; 1880 this_width = FXSYS_fabs(this_width); 1881 FX_FLOAT threshold = 1882 last_width > this_width ? last_width / 4 : this_width / 4; 1883 CFX_Matrix prev_matrix, prev_reverse; 1884 m_pPreTextObj->GetTextMatrix(&prev_matrix); 1885 prev_matrix.Concat(m_perMatrix); 1886 prev_reverse.SetReverse(prev_matrix); 1887 FX_FLOAT x = pObj->GetPosX(); 1888 FX_FLOAT y = pObj->GetPosY(); 1889 formMatrix.Transform(x, y); 1890 prev_reverse.Transform(x, y); 1891 if (last_width < this_width) { 1892 threshold = prev_reverse.TransformDistance(threshold); 1893 } 1894 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom, 1895 m_pPreTextObj->m_Right, pObj->m_Top); 1896 CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, 1897 m_pPreTextObj->m_Right, m_pPreTextObj->m_Top); 1898 CFX_FloatRect rect3 = rect1; 1899 rect1.Intersect(rect2); 1900 if (WritingMode == 0) { 1901 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) || 1902 ((y > threshold * 2 || y < threshold * -3) && 1903 (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) { 1904 bNewline = TRUE; 1905 if (nItem > 1) { 1906 CPDF_TextObjectItem tempItem; 1907 m_pPreTextObj->GetItemInfo(0, &tempItem); 1908 CFX_Matrix m; 1909 m_pPreTextObj->GetTextMatrix(&m); 1910 if (PrevItem.m_OriginX > tempItem.m_OriginX && 1911 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 && 1912 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 && 1913 m.c < 0.1) { 1914 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000, 1915 m_pPreTextObj->m_Top); 1916 if (re.Contains(pObj->GetPosX(), pObj->GetPosY())) { 1917 bNewline = FALSE; 1918 } else { 1919 CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top); 1920 if (re.Contains(m_pPreTextObj->GetPosX(), 1921 m_pPreTextObj->GetPosY())) { 1922 bNewline = FALSE; 1923 } 1924 } 1925 } 1926 } 1927 } 1928 } 1929 if (bNewline) 1930 return IsHyphen(curChar) ? 3 : 2; 1931 1932 int32_t nChars = pObj->CountChars(); 1933 if (nChars == 1 && (0x2D == curChar || 0xAD == curChar) && 1934 IsHyphen(curChar)) { 1935 return 3; 1936 } 1937 CFX_WideString PrevStr = 1938 m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode); 1939 FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1); 1940 CFX_Matrix matrix; 1941 pObj->GetTextMatrix(&matrix); 1942 matrix.Concat(formMatrix); 1943 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth); 1944 threshold = threshold > 400 1945 ? (threshold < 700 1946 ? threshold / 4 1947 : (threshold > 800 ? threshold / 6 : threshold / 5)) 1948 : (threshold / 2); 1949 if (nLastWidth >= nThisWidth) { 1950 threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize()); 1951 } else { 1952 threshold *= FXSYS_fabs(pObj->GetFontSize()); 1953 threshold = matrix.TransformDistance(threshold); 1954 threshold = prev_reverse.TransformDistance(threshold); 1955 } 1956 threshold /= 1000; 1957 if ((threshold < 1.4881 && threshold > 1.4879) || 1958 (threshold < 1.39001 && threshold > 1.38999)) { 1959 threshold *= 1.5; 1960 } 1961 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && 1962 preChar != L' ') { 1963 if (curChar != L' ' && preChar != L' ') { 1964 if ((x - last_pos - last_width) > threshold || 1965 (last_pos - x - last_width) > threshold) { 1966 return 1; 1967 } 1968 if (x < 0 && (last_pos - x - last_width) > threshold) { 1969 return 1; 1970 } 1971 if ((x - last_pos - last_width) > this_width || 1972 (x - last_pos - this_width) > last_width) { 1973 return 1; 1974 } 1975 } 1976 } 1977 return 0; 1978 } 1979 FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1, 1980 CPDF_TextObject* pTextObj2) { 1981 if (!pTextObj1 || !pTextObj2) { 1982 return FALSE; 1983 } 1984 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, 1985 pTextObj2->m_Right, pTextObj2->m_Top); 1986 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, 1987 pTextObj1->m_Right, pTextObj1->m_Top); 1988 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() && 1989 !m_ParseOptions.m_bGetCharCodeOnly) { 1990 FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left); 1991 int nCount = m_charList.GetSize(); 1992 if (nCount >= 2) { 1993 PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2]; 1994 FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width(); 1995 if (dbXdif > dbSpace) { 1996 return FALSE; 1997 } 1998 } 1999 } 2000 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) { 2001 rcPreObj.Intersect(rcCurObj); 2002 if (rcPreObj.IsEmpty()) { 2003 return FALSE; 2004 } 2005 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > 2006 rcCurObj.Width() / 2) { 2007 return FALSE; 2008 } 2009 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) { 2010 return FALSE; 2011 } 2012 } 2013 int nPreCount = pTextObj2->CountItems(); 2014 int nCurCount = pTextObj1->CountItems(); 2015 if (nPreCount != nCurCount) { 2016 return FALSE; 2017 } 2018 CPDF_TextObjectItem itemPer, itemCur; 2019 for (int i = 0; i < nPreCount; i++) { 2020 pTextObj2->GetItemInfo(i, &itemPer); 2021 pTextObj1->GetItemInfo(i, &itemCur); 2022 if (itemCur.m_CharCode != itemPer.m_CharCode) { 2023 return FALSE; 2024 } 2025 } 2026 if (FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) > 2027 GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()) * 2028 pTextObj2->GetFontSize() / 1000 * 0.9 || 2029 FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) > 2030 std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), 2031 pTextObj2->GetFontSize()) / 2032 8) { 2033 return FALSE; 2034 } 2035 return TRUE; 2036 } 2037 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj, 2038 FX_POSITION ObjPos) { 2039 if (!pTextObj) { 2040 return FALSE; 2041 } 2042 int i = 0; 2043 if (!ObjPos) { 2044 ObjPos = m_pPage->GetLastObjectPosition(); 2045 } 2046 CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos); 2047 while (i < 5 && ObjPos) { 2048 pObj = m_pPage->GetPrevObject(ObjPos); 2049 if (pObj == pTextObj) { 2050 continue; 2051 } 2052 if (pObj->m_Type != PDFPAGE_TEXT) { 2053 continue; 2054 } 2055 if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) { 2056 return TRUE; 2057 } 2058 i++; 2059 } 2060 return FALSE; 2061 } 2062 2063 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) { 2064 int size = m_TempCharList.GetSize(); 2065 PAGECHAR_INFO preChar; 2066 if (size) { 2067 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1]; 2068 } else { 2069 size = m_charList.GetSize(); 2070 if (size == 0) { 2071 return FALSE; 2072 } 2073 preChar = (PAGECHAR_INFO)m_charList[size - 1]; 2074 } 2075 info.m_Index = m_TextBuf.GetLength(); 2076 info.m_Unicode = unicode; 2077 info.m_pTextObj = NULL; 2078 info.m_CharCode = -1; 2079 info.m_Flag = FPDFTEXT_CHAR_GENERATED; 2080 int preWidth = 0; 2081 if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD)-1) 2082 preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont()); 2083 2084 FX_FLOAT fFontSize = preChar.m_pTextObj ? preChar.m_pTextObj->GetFontSize() 2085 : preChar.m_CharBox.Height(); 2086 if (!fFontSize) 2087 fFontSize = kDefaultFontSize; 2088 2089 info.m_OriginX = preChar.m_OriginX + preWidth * (fFontSize) / 1000; 2090 info.m_OriginY = preChar.m_OriginY; 2091 info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX, 2092 info.m_OriginY); 2093 return TRUE; 2094 } 2095 2096 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, 2097 const CFX_FloatRect& rect2) { 2098 CFX_FloatRect rect = rect1; 2099 rect.Intersect(rect2); 2100 return !rect.IsEmpty(); 2101 } 2102 FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) { 2103 if (unicode < L'A') { 2104 return FALSE; 2105 } 2106 if (unicode > L'Z' && unicode < L'a') { 2107 return FALSE; 2108 } 2109 if (unicode > L'z') { 2110 return FALSE; 2111 } 2112 return TRUE; 2113 } 2114 CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage) 2115 : m_pTextPage(pTextPage), 2116 m_flags(0), 2117 m_findNextStart(-1), 2118 m_findPreStart(-1), 2119 m_bMatchCase(FALSE), 2120 m_bMatchWholeWord(FALSE), 2121 m_resStart(0), 2122 m_resEnd(-1), 2123 m_IsFind(FALSE) { 2124 m_strText = m_pTextPage->GetPageText(); 2125 int nCount = pTextPage->CountChars(); 2126 if (nCount) { 2127 m_CharIndex.Add(0); 2128 } 2129 for (int i = 0; i < nCount; i++) { 2130 FPDF_CHAR_INFO info; 2131 pTextPage->GetCharInfo(i, &info); 2132 int indexSize = m_CharIndex.GetSize(); 2133 if (info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) { 2134 if (indexSize % 2) { 2135 m_CharIndex.Add(1); 2136 } else { 2137 if (indexSize <= 0) { 2138 continue; 2139 } 2140 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1); 2141 } 2142 } else { 2143 if (indexSize % 2) { 2144 if (indexSize <= 0) { 2145 continue; 2146 } 2147 m_CharIndex.SetAt(indexSize - 1, i + 1); 2148 } else { 2149 m_CharIndex.Add(i + 1); 2150 } 2151 } 2152 } 2153 int indexSize = m_CharIndex.GetSize(); 2154 if (indexSize % 2) { 2155 m_CharIndex.RemoveAt(indexSize - 1); 2156 } 2157 } 2158 int CPDF_TextPageFind::GetCharIndex(int index) const { 2159 return m_pTextPage->CharIndexFromTextIndex(index); 2160 int indexSize = m_CharIndex.GetSize(); 2161 int count = 0; 2162 for (int i = 0; i < indexSize; i += 2) { 2163 count += m_CharIndex.GetAt(i + 1); 2164 if (count > index) { 2165 return index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i); 2166 } 2167 } 2168 return -1; 2169 } 2170 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, 2171 int flags, 2172 int startPos) { 2173 if (!m_pTextPage) { 2174 return FALSE; 2175 } 2176 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) { 2177 m_strText = m_pTextPage->GetPageText(); 2178 } 2179 CFX_WideString findwhatStr = findwhat; 2180 m_findWhat = findwhatStr; 2181 m_flags = flags; 2182 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; 2183 if (m_strText.IsEmpty()) { 2184 m_IsFind = FALSE; 2185 return TRUE; 2186 } 2187 FX_STRSIZE len = findwhatStr.GetLength(); 2188 if (!m_bMatchCase) { 2189 findwhatStr.MakeLower(); 2190 m_strText.MakeLower(); 2191 } 2192 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD; 2193 m_findNextStart = startPos; 2194 if (startPos == -1) { 2195 m_findPreStart = m_strText.GetLength() - 1; 2196 } else { 2197 m_findPreStart = startPos; 2198 } 2199 m_csFindWhatArray.RemoveAll(); 2200 int i = 0; 2201 while (i < len) { 2202 if (findwhatStr.GetAt(i) != ' ') { 2203 break; 2204 } 2205 i++; 2206 } 2207 if (i < len) { 2208 ExtractFindWhat(findwhatStr); 2209 } else { 2210 m_csFindWhatArray.Add(findwhatStr); 2211 } 2212 if (m_csFindWhatArray.GetSize() <= 0) { 2213 return FALSE; 2214 } 2215 m_IsFind = TRUE; 2216 m_resStart = 0; 2217 m_resEnd = -1; 2218 return TRUE; 2219 } 2220 FX_BOOL CPDF_TextPageFind::FindNext() { 2221 if (!m_pTextPage) { 2222 return FALSE; 2223 } 2224 m_resArray.RemoveAll(); 2225 if (m_findNextStart == -1) { 2226 return FALSE; 2227 } 2228 if (m_strText.IsEmpty()) { 2229 m_IsFind = FALSE; 2230 return m_IsFind; 2231 } 2232 int strLen = m_strText.GetLength(); 2233 if (m_findNextStart > strLen - 1) { 2234 m_IsFind = FALSE; 2235 return m_IsFind; 2236 } 2237 int nCount = m_csFindWhatArray.GetSize(); 2238 int nResultPos = 0; 2239 int nStartPos = 0; 2240 nStartPos = m_findNextStart; 2241 FX_BOOL bSpaceStart = FALSE; 2242 for (int iWord = 0; iWord < nCount; iWord++) { 2243 CFX_WideString csWord = m_csFindWhatArray[iWord]; 2244 if (csWord.IsEmpty()) { 2245 if (iWord == nCount - 1) { 2246 FX_WCHAR strInsert = m_strText.GetAt(nStartPos); 2247 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR || 2248 strInsert == TEXT_RETURN_CHAR || strInsert == 160) { 2249 nResultPos = nStartPos + 1; 2250 break; 2251 } 2252 iWord = -1; 2253 } else if (iWord == 0) { 2254 bSpaceStart = TRUE; 2255 } 2256 continue; 2257 } 2258 int endIndex; 2259 nResultPos = m_strText.Find(csWord.c_str(), nStartPos); 2260 if (nResultPos == -1) { 2261 m_IsFind = FALSE; 2262 return m_IsFind; 2263 } 2264 endIndex = nResultPos + csWord.GetLength() - 1; 2265 if (iWord == 0) { 2266 m_resStart = nResultPos; 2267 } 2268 FX_BOOL bMatch = TRUE; 2269 if (iWord != 0 && !bSpaceStart) { 2270 int PreResEndPos = nStartPos; 2271 int curChar = csWord.GetAt(0); 2272 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; 2273 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); 2274 if (nStartPos == nResultPos && 2275 !(_IsIgnoreSpaceCharacter(lastChar) || 2276 _IsIgnoreSpaceCharacter(curChar))) { 2277 bMatch = FALSE; 2278 } 2279 for (int d = PreResEndPos; d < nResultPos; d++) { 2280 FX_WCHAR strInsert = m_strText.GetAt(d); 2281 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR && 2282 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 2283 bMatch = FALSE; 2284 break; 2285 } 2286 } 2287 } else if (bSpaceStart) { 2288 if (nResultPos > 0) { 2289 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); 2290 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR && 2291 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 2292 bMatch = FALSE; 2293 m_resStart = nResultPos; 2294 } else { 2295 m_resStart = nResultPos - 1; 2296 } 2297 } 2298 } 2299 if (m_bMatchWholeWord && bMatch) { 2300 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); 2301 } 2302 nStartPos = endIndex + 1; 2303 if (!bMatch) { 2304 iWord = -1; 2305 if (bSpaceStart) { 2306 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); 2307 } else { 2308 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); 2309 } 2310 } 2311 } 2312 m_resEnd = nResultPos + 2313 m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1; 2314 m_IsFind = TRUE; 2315 int resStart = GetCharIndex(m_resStart); 2316 int resEnd = GetCharIndex(m_resEnd); 2317 m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray); 2318 if (m_flags & FPDFTEXT_CONSECUTIVE) { 2319 m_findNextStart = m_resStart + 1; 2320 m_findPreStart = m_resEnd - 1; 2321 } else { 2322 m_findNextStart = m_resEnd + 1; 2323 m_findPreStart = m_resStart - 1; 2324 } 2325 return m_IsFind; 2326 } 2327 FX_BOOL CPDF_TextPageFind::FindPrev() { 2328 if (!m_pTextPage) { 2329 return FALSE; 2330 } 2331 m_resArray.RemoveAll(); 2332 if (m_strText.IsEmpty() || m_findPreStart < 0) { 2333 m_IsFind = FALSE; 2334 return m_IsFind; 2335 } 2336 CPDF_TextPageFind findEngine(m_pTextPage); 2337 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags); 2338 if (!ret) { 2339 m_IsFind = FALSE; 2340 return m_IsFind; 2341 } 2342 int order = -1, MatchedCount = 0; 2343 while (ret) { 2344 ret = findEngine.FindNext(); 2345 if (ret) { 2346 int order1 = findEngine.GetCurOrder(); 2347 int MatchedCount1 = findEngine.GetMatchedCount(); 2348 if (((order1 + MatchedCount1) - 1) > m_findPreStart) { 2349 break; 2350 } 2351 order = order1; 2352 MatchedCount = MatchedCount1; 2353 } 2354 } 2355 if (order == -1) { 2356 m_IsFind = FALSE; 2357 return m_IsFind; 2358 } 2359 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); 2360 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); 2361 m_IsFind = TRUE; 2362 m_pTextPage->GetRectArray(order, MatchedCount, m_resArray); 2363 if (m_flags & FPDFTEXT_CONSECUTIVE) { 2364 m_findNextStart = m_resStart + 1; 2365 m_findPreStart = m_resEnd - 1; 2366 } else { 2367 m_findNextStart = m_resEnd + 1; 2368 m_findPreStart = m_resStart - 1; 2369 } 2370 return m_IsFind; 2371 } 2372 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { 2373 if (findwhat.IsEmpty()) { 2374 return; 2375 } 2376 int index = 0; 2377 while (1) { 2378 CFX_WideString csWord = TEXT_EMPTY; 2379 int ret = 2380 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_BLANK_CHAR); 2381 if (csWord.IsEmpty()) { 2382 if (ret) { 2383 m_csFindWhatArray.Add(CFX_WideString(L"")); 2384 index++; 2385 continue; 2386 } else { 2387 break; 2388 } 2389 } 2390 int pos = 0; 2391 while (pos < csWord.GetLength()) { 2392 CFX_WideString curStr = csWord.Mid(pos, 1); 2393 FX_WCHAR curChar = csWord.GetAt(pos); 2394 if (_IsIgnoreSpaceCharacter(curChar)) { 2395 if (pos > 0 && curChar == 0x2019) { 2396 pos++; 2397 continue; 2398 } 2399 if (pos > 0) { 2400 CFX_WideString preStr = csWord.Mid(0, pos); 2401 m_csFindWhatArray.Add(preStr); 2402 } 2403 m_csFindWhatArray.Add(curStr); 2404 if (pos == csWord.GetLength() - 1) { 2405 csWord.Empty(); 2406 break; 2407 } 2408 csWord = csWord.Right(csWord.GetLength() - pos - 1); 2409 pos = 0; 2410 continue; 2411 } 2412 pos++; 2413 } 2414 if (!csWord.IsEmpty()) { 2415 m_csFindWhatArray.Add(csWord); 2416 } 2417 index++; 2418 } 2419 } 2420 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, 2421 int startPos, 2422 int endPos) { 2423 FX_WCHAR char_left = 0; 2424 FX_WCHAR char_right = 0; 2425 int char_count = endPos - startPos + 1; 2426 if (char_count < 1) { 2427 return FALSE; 2428 } 2429 if (char_count == 1 && csPageText.GetAt(startPos) > 255) { 2430 return TRUE; 2431 } 2432 if (startPos - 1 >= 0) { 2433 char_left = csPageText.GetAt(startPos - 1); 2434 } 2435 if (startPos + char_count < csPageText.GetLength()) { 2436 char_right = csPageText.GetAt(startPos + char_count); 2437 } 2438 if ((char_left > 'A' && char_left < 'a') || 2439 (char_left > 'a' && char_left < 'z') || 2440 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) || 2441 (char_right > 'A' && char_right < 'a') || 2442 (char_right > 'a' && char_right < 'z') || 2443 (char_right > 0xfb00 && char_right < 0xfb06) || 2444 std::iswdigit(char_right)) { 2445 return FALSE; 2446 } 2447 if (!(('A' > char_left || char_left > 'Z') && 2448 ('a' > char_left || char_left > 'z') && 2449 ('A' > char_right || char_right > 'Z') && 2450 ('a' > char_right || char_right > 'z'))) { 2451 return FALSE; 2452 } 2453 if (char_count > 0) { 2454 if (csPageText.GetAt(startPos) >= L'0' && 2455 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && 2456 char_left <= L'9') { 2457 return FALSE; 2458 } 2459 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && 2460 char_right >= L'0' && char_right <= L'9') { 2461 return FALSE; 2462 } 2463 } 2464 return TRUE; 2465 } 2466 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, 2467 const FX_WCHAR* lpszFullString, 2468 int iSubString, 2469 FX_WCHAR chSep) { 2470 if (!lpszFullString) { 2471 return FALSE; 2472 } 2473 while (iSubString--) { 2474 lpszFullString = FXSYS_wcschr(lpszFullString, chSep); 2475 if (!lpszFullString) { 2476 rString.Empty(); 2477 return FALSE; 2478 } 2479 lpszFullString++; 2480 while (*lpszFullString == chSep) { 2481 lpszFullString++; 2482 } 2483 } 2484 const FX_WCHAR* lpchEnd = FXSYS_wcschr(lpszFullString, chSep); 2485 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString) 2486 : (int)FXSYS_wcslen(lpszFullString); 2487 ASSERT(nLen >= 0); 2488 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString, 2489 nLen * sizeof(FX_WCHAR)); 2490 rString.ReleaseBuffer(); 2491 return TRUE; 2492 } 2493 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) { 2494 CFX_WideString str2; 2495 str2.Empty(); 2496 int nlen = str.GetLength(); 2497 for (int i = nlen - 1; i >= 0; i--) { 2498 str2 += str.GetAt(i); 2499 } 2500 return str2; 2501 } 2502 void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const { 2503 rects.Copy(m_resArray); 2504 } 2505 int CPDF_TextPageFind::GetCurOrder() const { 2506 return GetCharIndex(m_resStart); 2507 } 2508 int CPDF_TextPageFind::GetMatchedCount() const { 2509 int resStart = GetCharIndex(m_resStart); 2510 int resEnd = GetCharIndex(m_resEnd); 2511 return resEnd - resStart + 1; 2512 } 2513 2514 CPDF_LinkExtract::CPDF_LinkExtract() 2515 : m_pTextPage(nullptr), m_bIsParsed(false) { 2516 } 2517 2518 CPDF_LinkExtract::~CPDF_LinkExtract() { 2519 DeleteLinkList(); 2520 } 2521 2522 FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) { 2523 if (!pTextPage || !pTextPage->IsParsed()) 2524 return FALSE; 2525 2526 m_pTextPage = (const CPDF_TextPage*)pTextPage; 2527 m_strPageText = m_pTextPage->GetPageText(0, -1); 2528 DeleteLinkList(); 2529 if (m_strPageText.IsEmpty()) { 2530 return FALSE; 2531 } 2532 ParseLink(); 2533 m_bIsParsed = true; 2534 return TRUE; 2535 } 2536 2537 void CPDF_LinkExtract::DeleteLinkList() { 2538 while (m_LinkList.GetSize()) { 2539 CPDF_LinkExt* linkinfo = NULL; 2540 linkinfo = m_LinkList.GetAt(0); 2541 m_LinkList.RemoveAt(0); 2542 delete linkinfo; 2543 } 2544 m_LinkList.RemoveAll(); 2545 } 2546 int CPDF_LinkExtract::CountLinks() const { 2547 if (!m_bIsParsed) { 2548 return -1; 2549 } 2550 return m_LinkList.GetSize(); 2551 } 2552 void CPDF_LinkExtract::ParseLink() { 2553 int start = 0, pos = 0; 2554 int TotalChar = m_pTextPage->CountChars(); 2555 while (pos < TotalChar) { 2556 FPDF_CHAR_INFO pageChar; 2557 m_pTextPage->GetCharInfo(pos, &pageChar); 2558 if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 || 2559 pos == TotalChar - 1) { 2560 int nCount = pos - start; 2561 if (pos == TotalChar - 1) { 2562 nCount++; 2563 } 2564 CFX_WideString strBeCheck; 2565 strBeCheck = m_pTextPage->GetPageText(start, nCount); 2566 if (strBeCheck.GetLength() > 5) { 2567 while (strBeCheck.GetLength() > 0) { 2568 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); 2569 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { 2570 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); 2571 nCount--; 2572 } else { 2573 break; 2574 } 2575 } 2576 if (nCount > 5 && 2577 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { 2578 AppendToLinkList(start, nCount, strBeCheck); 2579 } 2580 } 2581 start = ++pos; 2582 } else { 2583 pos++; 2584 } 2585 } 2586 } 2587 FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { 2588 CFX_WideString str = strBeCheck; 2589 str.MakeLower(); 2590 if (str.Find(L"http://www.") != -1) { 2591 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); 2592 return TRUE; 2593 } 2594 if (str.Find(L"http://") != -1) { 2595 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); 2596 return TRUE; 2597 } 2598 if (str.Find(L"https://www.") != -1) { 2599 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); 2600 return TRUE; 2601 } 2602 if (str.Find(L"https://") != -1) { 2603 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); 2604 return TRUE; 2605 } 2606 if (str.Find(L"www.") != -1) { 2607 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 2608 strBeCheck = L"http://" + strBeCheck; 2609 return TRUE; 2610 } 2611 return FALSE; 2612 } 2613 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { 2614 int aPos = str.Find(L'@'); 2615 // Invalid when no '@'. 2616 if (aPos < 1) { 2617 return FALSE; 2618 } 2619 2620 // Check the local part. 2621 int pPos = aPos; // Used to track the position of '@' or '.'. 2622 for (int i = aPos - 1; i >= 0; i--) { 2623 FX_WCHAR ch = str.GetAt(i); 2624 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) { 2625 continue; 2626 } 2627 if (ch != L'.' || i == pPos - 1 || i == 0) { 2628 if (i == aPos - 1) { 2629 // There is '.' or invalid char before '@'. 2630 return FALSE; 2631 } 2632 // End extracting for other invalid chars, '.' at the beginning, or 2633 // consecutive '.'. 2634 int removed_len = i == pPos - 1 ? i + 2 : i + 1; 2635 str = str.Right(str.GetLength() - removed_len); 2636 break; 2637 } 2638 // Found a valid '.'. 2639 pPos = i; 2640 } 2641 2642 // Check the domain name part. 2643 aPos = str.Find(L'@'); 2644 if (aPos < 1) { 2645 return FALSE; 2646 } 2647 str.TrimRight(L'.'); 2648 // At least one '.' in domain name, but not at the beginning. 2649 // TODO(weili): RFC5322 allows domain names to be a local name without '.'. 2650 // Check whether we should remove this check. 2651 int ePos = str.Find(L'.', aPos + 1); 2652 if (ePos == -1 || ePos == aPos + 1) { 2653 return FALSE; 2654 } 2655 // Validate all other chars in domain name. 2656 int nLen = str.GetLength(); 2657 pPos = 0; // Used to track the position of '.'. 2658 for (int i = aPos + 1; i < nLen; i++) { 2659 FX_WCHAR wch = str.GetAt(i); 2660 if (wch == L'-' || FXSYS_iswalnum(wch)) { 2661 continue; 2662 } 2663 if (wch != L'.' || i == pPos + 1) { 2664 // Domain name should end before invalid char. 2665 int host_end = i == pPos + 1 ? i - 2 : i - 1; 2666 if (pPos > 0 && host_end - aPos >= 3) { 2667 // Trim the ending invalid chars if there is at least one '.' and name. 2668 str = str.Left(host_end + 1); 2669 break; 2670 } 2671 return FALSE; 2672 } 2673 pPos = i; 2674 } 2675 2676 if (str.Find(L"mailto:") == -1) { 2677 str = L"mailto:" + str; 2678 } 2679 return TRUE; 2680 } 2681 2682 void CPDF_LinkExtract::AppendToLinkList(int start, 2683 int count, 2684 const CFX_WideString& strUrl) { 2685 CPDF_LinkExt* linkInfo = new CPDF_LinkExt; 2686 linkInfo->m_strUrl = strUrl; 2687 linkInfo->m_Start = start; 2688 linkInfo->m_Count = count; 2689 m_LinkList.Add(linkInfo); 2690 } 2691 2692 CFX_WideString CPDF_LinkExtract::GetURL(int index) const { 2693 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { 2694 return L""; 2695 } 2696 CPDF_LinkExt* link = NULL; 2697 link = m_LinkList.GetAt(index); 2698 if (!link) { 2699 return L""; 2700 } 2701 return link->m_strUrl; 2702 } 2703 void CPDF_LinkExtract::GetBoundedSegment(int index, 2704 int& start, 2705 int& count) const { 2706 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { 2707 return; 2708 } 2709 CPDF_LinkExt* link = NULL; 2710 link = m_LinkList.GetAt(index); 2711 if (!link) { 2712 return; 2713 } 2714 start = link->m_Start; 2715 count = link->m_Count; 2716 } 2717 void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const { 2718 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { 2719 return; 2720 } 2721 CPDF_LinkExt* link = NULL; 2722 link = m_LinkList.GetAt(index); 2723 if (!link) { 2724 return; 2725 } 2726 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); 2727 } 2728