1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "core/fpdftext/cpdf_textpage.h" 8 9 #include <algorithm> 10 #include <utility> 11 #include <vector> 12 13 #include "core/fpdfapi/font/cpdf_font.h" 14 #include "core/fpdfapi/page/cpdf_form.h" 15 #include "core/fpdfapi/page/cpdf_formobject.h" 16 #include "core/fpdfapi/page/cpdf_page.h" 17 #include "core/fpdfapi/page/cpdf_pageobject.h" 18 #include "core/fpdfapi/page/cpdf_textobject.h" 19 #include "core/fpdfapi/parser/cpdf_dictionary.h" 20 #include "core/fpdfapi/parser/cpdf_string.h" 21 #include "core/fpdftext/unicodenormalizationdata.h" 22 #include "core/fxcrt/fx_bidi.h" 23 #include "core/fxcrt/fx_extension.h" 24 #include "core/fxcrt/fx_unicode.h" 25 #include "third_party/base/stl_util.h" 26 27 namespace { 28 29 const float kDefaultFontSize = 1.0f; 30 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { 31 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, 32 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; 33 34 float NormalizeThreshold(float threshold) { 35 if (threshold < 300) 36 return threshold / 2.0f; 37 if (threshold < 500) 38 return threshold / 4.0f; 39 if (threshold < 700) 40 return threshold / 5.0f; 41 return threshold / 6.0f; 42 } 43 44 float CalculateBaseSpace(const CPDF_TextObject* pTextObj, 45 const CFX_Matrix& matrix) { 46 float baseSpace = 0.0; 47 const size_t nItems = pTextObj->CountItems(); 48 if (pTextObj->m_TextState.GetCharSpace() && nItems >= 3) { 49 bool bAllChar = true; 50 float spacing = 51 matrix.TransformDistance(pTextObj->m_TextState.GetCharSpace()); 52 baseSpace = spacing; 53 for (size_t i = 0; i < nItems; ++i) { 54 CPDF_TextObjectItem item; 55 pTextObj->GetItemInfo(i, &item); 56 if (item.m_CharCode == static_cast<uint32_t>(-1)) { 57 float fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 58 float kerning = -fontsize_h * item.m_Origin.x / 1000; 59 baseSpace = std::min(baseSpace, kerning + spacing); 60 bAllChar = false; 61 } 62 } 63 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) 64 baseSpace = 0.0; 65 } 66 return baseSpace; 67 } 68 69 size_t Unicode_GetNormalization(wchar_t wch, wchar_t* pDst) { 70 wch = wch & 0xFFFF; 71 wchar_t wFind = g_UnicodeData_Normalization[wch]; 72 if (!wFind) { 73 if (pDst) 74 *pDst = wch; 75 return 1; 76 } 77 if (wFind >= 0x8000) { 78 wch = wFind - 0x8000; 79 wFind = 1; 80 } else { 81 wch = wFind & 0x0FFF; 82 wFind >>= 12; 83 } 84 const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind]; 85 if (pMap == g_UnicodeData_Normalization_Map4) { 86 pMap = g_UnicodeData_Normalization_Map4 + wch; 87 wFind = (wchar_t)(*pMap++); 88 } else { 89 pMap += wch; 90 } 91 if (pDst) { 92 wchar_t n = wFind; 93 while (n--) 94 *pDst++ = *pMap++; 95 } 96 return static_cast<size_t>(wFind); 97 } 98 99 float MaskPercentFilled(const std::vector<bool>& mask, 100 int32_t start, 101 int32_t end) { 102 if (start >= end) 103 return 0; 104 float count = std::count_if(mask.begin() + start, mask.begin() + end, 105 [](bool r) { return r; }); 106 return count / (end - start); 107 } 108 109 bool IsHyphenCode(wchar_t c) { 110 return c == 0x2D || c == 0xAD; 111 } 112 113 } // namespace 114 115 PDFTEXT_Obj::PDFTEXT_Obj() {} 116 117 PDFTEXT_Obj::PDFTEXT_Obj(const PDFTEXT_Obj& that) = default; 118 119 PDFTEXT_Obj::~PDFTEXT_Obj() {} 120 121 FPDF_CHAR_INFO::FPDF_CHAR_INFO() 122 : m_Unicode(0), 123 m_Charcode(0), 124 m_Flag(0), 125 m_FontSize(0), 126 m_pTextObj(nullptr) {} 127 128 FPDF_CHAR_INFO::~FPDF_CHAR_INFO() {} 129 130 PAGECHAR_INFO::PAGECHAR_INFO() 131 : m_Index(0), m_CharCode(0), m_Unicode(0), m_Flag(0), m_pTextObj(nullptr) {} 132 133 PAGECHAR_INFO::PAGECHAR_INFO(const PAGECHAR_INFO&) = default; 134 135 PAGECHAR_INFO::~PAGECHAR_INFO() {} 136 137 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags) 138 : m_pPage(pPage), 139 m_parserflag(flags), 140 m_pPreTextObj(nullptr), 141 m_bIsParsed(false), 142 m_TextlineDir(TextOrientation::Unknown) { 143 m_TextBuf.EstimateSize(0, 10240); 144 m_DisplayMatrix = 145 pPage->GetDisplayMatrix(0, 0, static_cast<int>(pPage->GetPageWidth()), 146 static_cast<int>(pPage->GetPageHeight()), 0); 147 } 148 149 CPDF_TextPage::~CPDF_TextPage() {} 150 151 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) { 152 switch (charInfo.m_Unicode) { 153 case 0x2: 154 case 0x3: 155 case 0x93: 156 case 0x94: 157 case 0x96: 158 case 0x97: 159 case 0x98: 160 case 0xfffe: 161 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN; 162 default: 163 return false; 164 } 165 } 166 167 void CPDF_TextPage::ParseTextPage() { 168 m_bIsParsed = false; 169 m_TextBuf.Clear(); 170 m_CharList.clear(); 171 m_pPreTextObj = nullptr; 172 ProcessObject(); 173 174 m_bIsParsed = true; 175 m_CharIndex.clear(); 176 int nCount = pdfium::CollectionSize<int>(m_CharList); 177 if (nCount) 178 m_CharIndex.push_back(0); 179 180 for (int i = 0; i < nCount; i++) { 181 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); 182 const PAGECHAR_INFO& charinfo = m_CharList[i]; 183 if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED || 184 (charinfo.m_Unicode != 0 && !IsControlChar(charinfo))) { 185 if (indexSize % 2) { 186 m_CharIndex.push_back(1); 187 } else { 188 if (indexSize <= 0) 189 continue; 190 m_CharIndex[indexSize - 1] += 1; 191 } 192 } else { 193 if (indexSize % 2) { 194 if (indexSize <= 0) 195 continue; 196 m_CharIndex[indexSize - 1] = i + 1; 197 } else { 198 m_CharIndex.push_back(i + 1); 199 } 200 } 201 } 202 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); 203 if (indexSize % 2) 204 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1); 205 } 206 207 int CPDF_TextPage::CountChars() const { 208 return pdfium::CollectionSize<int>(m_CharList); 209 } 210 211 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const { 212 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); 213 int count = 0; 214 for (int i = 0; i < indexSize; i += 2) { 215 count += m_CharIndex[i + 1]; 216 if (count > TextIndex) 217 return TextIndex - count + m_CharIndex[i + 1] + m_CharIndex[i]; 218 } 219 return -1; 220 } 221 222 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const { 223 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); 224 int count = 0; 225 for (int i = 0; i < indexSize; i += 2) { 226 count += m_CharIndex[i + 1]; 227 if (m_CharIndex[i + 1] + m_CharIndex[i] > CharIndex) { 228 if (CharIndex - m_CharIndex[i] < 0) 229 return -1; 230 231 return CharIndex - m_CharIndex[i] + count - m_CharIndex[i + 1]; 232 } 233 } 234 return -1; 235 } 236 237 std::vector<CFX_FloatRect> CPDF_TextPage::GetRectArray(int start, 238 int nCount) const { 239 if (start < 0 || nCount == 0 || !m_bIsParsed) 240 return std::vector<CFX_FloatRect>(); 241 242 if (nCount + start > pdfium::CollectionSize<int>(m_CharList) || 243 nCount == -1) { 244 nCount = pdfium::CollectionSize<int>(m_CharList) - start; 245 } 246 247 std::vector<CFX_FloatRect> rectArray; 248 CPDF_TextObject* pCurObj = nullptr; 249 CFX_FloatRect rect; 250 int curPos = start; 251 bool bFlagNewRect = true; 252 while (nCount--) { 253 PAGECHAR_INFO info_curchar = m_CharList[curPos++]; 254 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) 255 continue; 256 if (info_curchar.m_CharBox.Width() < 0.01 || 257 info_curchar.m_CharBox.Height() < 0.01) { 258 continue; 259 } 260 if (!pCurObj) 261 pCurObj = info_curchar.m_pTextObj.Get(); 262 if (pCurObj != info_curchar.m_pTextObj) { 263 rectArray.push_back(rect); 264 pCurObj = info_curchar.m_pTextObj.Get(); 265 bFlagNewRect = true; 266 } 267 if (bFlagNewRect) { 268 CFX_Matrix matrix = info_curchar.m_pTextObj->GetTextMatrix(); 269 matrix.Concat(info_curchar.m_Matrix); 270 271 CFX_PointF origin = matrix.GetInverse().Transform(info_curchar.m_Origin); 272 rect.left = info_curchar.m_CharBox.left; 273 rect.right = info_curchar.m_CharBox.right; 274 if (pCurObj->GetFont()->GetTypeDescent()) { 275 rect.bottom = origin.y + 276 pCurObj->GetFont()->GetTypeDescent() * 277 pCurObj->GetFontSize() / 1000; 278 279 rect.bottom = matrix.Transform(CFX_PointF(origin.x, rect.bottom)).y; 280 } else { 281 rect.bottom = info_curchar.m_CharBox.bottom; 282 } 283 if (pCurObj->GetFont()->GetTypeAscent()) { 284 rect.top = 285 origin.y + 286 pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000; 287 float xPosTemp = 288 origin.x + 289 GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) * 290 pCurObj->GetFontSize() / 1000; 291 rect.top = matrix.Transform(CFX_PointF(xPosTemp, rect.top)).y; 292 } else { 293 rect.top = info_curchar.m_CharBox.top; 294 } 295 bFlagNewRect = false; 296 rect = info_curchar.m_CharBox; 297 rect.Normalize(); 298 } else { 299 info_curchar.m_CharBox.Normalize(); 300 rect.left = std::min(rect.left, info_curchar.m_CharBox.left); 301 rect.right = std::max(rect.right, info_curchar.m_CharBox.right); 302 rect.top = std::max(rect.top, info_curchar.m_CharBox.top); 303 rect.bottom = std::min(rect.bottom, info_curchar.m_CharBox.bottom); 304 } 305 } 306 rectArray.push_back(rect); 307 return rectArray; 308 } 309 310 int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point, 311 const CFX_SizeF& tolerance) const { 312 if (!m_bIsParsed) 313 return -3; 314 315 int pos = 0; 316 int NearPos = -1; 317 double xdif = 5000; 318 double ydif = 5000; 319 while (pos < pdfium::CollectionSize<int>(m_CharList)) { 320 PAGECHAR_INFO charinfo = m_CharList[pos]; 321 CFX_FloatRect charrect = charinfo.m_CharBox; 322 if (charrect.Contains(point)) 323 break; 324 if (tolerance.width > 0 || tolerance.height > 0) { 325 CFX_FloatRect charRectExt; 326 charrect.Normalize(); 327 charRectExt.left = charrect.left - tolerance.width / 2; 328 charRectExt.right = charrect.right + tolerance.width / 2; 329 charRectExt.top = charrect.top + tolerance.height / 2; 330 charRectExt.bottom = charrect.bottom - tolerance.height / 2; 331 if (charRectExt.Contains(point)) { 332 double curXdif, curYdif; 333 curXdif = fabs(point.x - charrect.left) < fabs(point.x - charrect.right) 334 ? fabs(point.x - charrect.left) 335 : fabs(point.x - charrect.right); 336 curYdif = fabs(point.y - charrect.bottom) < fabs(point.y - charrect.top) 337 ? fabs(point.y - charrect.bottom) 338 : fabs(point.y - charrect.top); 339 if (curYdif + curXdif < xdif + ydif) { 340 ydif = curYdif; 341 xdif = curXdif; 342 NearPos = pos; 343 } 344 } 345 } 346 ++pos; 347 } 348 return pos < pdfium::CollectionSize<int>(m_CharList) ? pos : NearPos; 349 } 350 351 WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const { 352 if (!m_bIsParsed) 353 return WideString(); 354 355 float posy = 0; 356 bool IsContainPreChar = false; 357 bool IsAddLineFeed = false; 358 WideString strText; 359 for (const auto& charinfo : m_CharList) { 360 if (IsRectIntersect(rect, charinfo.m_CharBox)) { 361 if (fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar && 362 IsAddLineFeed) { 363 posy = charinfo.m_Origin.y; 364 if (!strText.IsEmpty()) 365 strText += L"\r\n"; 366 } 367 IsContainPreChar = true; 368 IsAddLineFeed = false; 369 if (charinfo.m_Unicode) 370 strText += charinfo.m_Unicode; 371 } else if (charinfo.m_Unicode == 32) { 372 if (IsContainPreChar && charinfo.m_Unicode) { 373 strText += charinfo.m_Unicode; 374 IsContainPreChar = false; 375 IsAddLineFeed = false; 376 } 377 } else { 378 IsContainPreChar = false; 379 IsAddLineFeed = true; 380 } 381 } 382 return strText; 383 } 384 385 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const { 386 if (!m_bIsParsed || !pdfium::IndexInBounds(m_CharList, index)) 387 return; 388 389 const PAGECHAR_INFO& charinfo = m_CharList[index]; 390 info->m_Charcode = charinfo.m_CharCode; 391 info->m_Origin = charinfo.m_Origin; 392 info->m_Unicode = charinfo.m_Unicode; 393 info->m_Flag = charinfo.m_Flag; 394 info->m_CharBox = charinfo.m_CharBox; 395 info->m_pTextObj = charinfo.m_pTextObj; 396 if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont()) 397 info->m_FontSize = charinfo.m_pTextObj->GetFontSize(); 398 else 399 info->m_FontSize = kDefaultFontSize; 400 info->m_Matrix = charinfo.m_Matrix; 401 } 402 403 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start, 404 int32_t& nCount) const { 405 PAGECHAR_INFO charinfo = m_CharList[start]; 406 PAGECHAR_INFO charinfo2 = m_CharList[start + nCount - 1]; 407 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag && 408 FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) { 409 return; 410 } 411 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) { 412 PAGECHAR_INFO charinfo1 = charinfo; 413 int startIndex = start; 414 while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag && 415 charinfo1.m_Index == charinfo.m_Index) { 416 startIndex--; 417 if (startIndex < 0) 418 break; 419 charinfo1 = m_CharList[startIndex]; 420 } 421 startIndex++; 422 start = startIndex; 423 } 424 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) { 425 PAGECHAR_INFO charinfo3 = charinfo2; 426 int endIndex = start + nCount - 1; 427 while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag && 428 charinfo3.m_Index == charinfo2.m_Index) { 429 endIndex++; 430 if (endIndex >= pdfium::CollectionSize<int>(m_CharList)) 431 break; 432 charinfo3 = m_CharList[endIndex]; 433 } 434 endIndex--; 435 nCount = endIndex - start + 1; 436 } 437 } 438 439 WideString CPDF_TextPage::GetPageText(int start, int count) const { 440 if (start < 0 || start >= CountChars() || count <= 0 || !m_bIsParsed || 441 m_CharList.empty() || m_TextBuf.GetLength() == 0) { 442 return L""; 443 } 444 445 int text_start = TextIndexFromCharIndex(start); 446 if (text_start < 0) 447 return L""; 448 449 count = std::min(count, CountChars() - start); 450 451 int last = start + count - 1; 452 int text_last = TextIndexFromCharIndex(last); 453 if (text_last < 0 || text_last < text_start) 454 return L""; 455 456 int text_count = text_last - text_start + 1; 457 458 return WideString(m_TextBuf.AsStringView().Mid( 459 static_cast<size_t>(text_start), static_cast<size_t>(text_count))); 460 } 461 462 int CPDF_TextPage::CountRects(int start, int nCount) { 463 if (!m_bIsParsed || start < 0) 464 return -1; 465 466 if (nCount == -1 || 467 nCount + start > pdfium::CollectionSize<int>(m_CharList)) { 468 nCount = pdfium::CollectionSize<int>(m_CharList) - start; 469 } 470 m_SelRects = GetRectArray(start, nCount); 471 return pdfium::CollectionSize<int>(m_SelRects); 472 } 473 474 bool CPDF_TextPage::GetRect(int rectIndex, CFX_FloatRect* pRect) const { 475 if (!m_bIsParsed || !pdfium::IndexInBounds(m_SelRects, rectIndex)) 476 return false; 477 478 *pRect = m_SelRects[rectIndex]; 479 return true; 480 } 481 482 CPDF_TextPage::TextOrientation CPDF_TextPage::FindTextlineFlowOrientation() 483 const { 484 if (m_pPage->GetPageObjectList()->empty()) 485 return TextOrientation::Unknown; 486 487 const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth()); 488 const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight()); 489 if (nPageWidth <= 0 || nPageHeight <= 0) 490 return TextOrientation::Unknown; 491 492 std::vector<bool> nHorizontalMask(nPageWidth); 493 std::vector<bool> nVerticalMask(nPageHeight); 494 float fLineHeight = 0.0f; 495 int32_t nStartH = nPageWidth; 496 int32_t nEndH = 0; 497 int32_t nStartV = nPageHeight; 498 int32_t nEndV = 0; 499 for (const auto& pPageObj : *m_pPage->GetPageObjectList()) { 500 if (!pPageObj->IsText()) 501 continue; 502 503 int32_t minH = std::max(static_cast<int32_t>(pPageObj->m_Left), 0); 504 int32_t maxH = 505 std::min(static_cast<int32_t>(pPageObj->m_Right), nPageWidth); 506 int32_t minV = std::max(static_cast<int32_t>(pPageObj->m_Bottom), 0); 507 int32_t maxV = std::min(static_cast<int32_t>(pPageObj->m_Top), nPageHeight); 508 if (minH >= maxH || minV >= maxV) 509 continue; 510 511 for (int32_t i = minH; i < maxH; ++i) 512 nHorizontalMask[i] = true; 513 for (int32_t i = minV; i < maxV; ++i) 514 nVerticalMask[i] = true; 515 516 nStartH = std::min(nStartH, minH); 517 nEndH = std::max(nEndH, maxH); 518 nStartV = std::min(nStartV, minV); 519 nEndV = std::max(nEndV, maxV); 520 521 if (fLineHeight <= 0.0f) 522 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom; 523 } 524 const int32_t nDoubleLineHeight = 2 * fLineHeight; 525 if ((nEndV - nStartV) < nDoubleLineHeight) 526 return TextOrientation::Horizontal; 527 if ((nEndH - nStartH) < nDoubleLineHeight) 528 return TextOrientation::Vertical; 529 530 const float nSumH = MaskPercentFilled(nHorizontalMask, nStartH, nEndH); 531 if (nSumH > 0.8f) 532 return TextOrientation::Horizontal; 533 534 const float nSumV = MaskPercentFilled(nVerticalMask, nStartV, nEndV); 535 if (nSumH > nSumV) 536 return TextOrientation::Horizontal; 537 if (nSumH < nSumV) 538 return TextOrientation::Vertical; 539 return TextOrientation::Unknown; 540 } 541 542 void CPDF_TextPage::AppendGeneratedCharacter(wchar_t unicode, 543 const CFX_Matrix& formMatrix) { 544 PAGECHAR_INFO generateChar; 545 if (!GenerateCharInfo(unicode, generateChar)) 546 return; 547 548 m_TextBuf.AppendChar(unicode); 549 if (!formMatrix.IsIdentity()) 550 generateChar.m_Matrix = formMatrix; 551 m_CharList.push_back(generateChar); 552 } 553 554 void CPDF_TextPage::ProcessObject() { 555 if (m_pPage->GetPageObjectList()->empty()) 556 return; 557 558 m_TextlineDir = FindTextlineFlowOrientation(); 559 const CPDF_PageObjectList* pObjList = m_pPage->GetPageObjectList(); 560 for (auto it = pObjList->begin(); it != pObjList->end(); ++it) { 561 if (CPDF_PageObject* pObj = it->get()) { 562 if (pObj->IsText()) { 563 CFX_Matrix matrix; 564 ProcessTextObject(pObj->AsText(), matrix, pObjList, it); 565 } else if (pObj->IsForm()) { 566 CFX_Matrix formMatrix; 567 ProcessFormObject(pObj->AsForm(), formMatrix); 568 } 569 } 570 } 571 for (const auto& obj : m_LineObj) 572 ProcessTextObject(obj); 573 574 m_LineObj.clear(); 575 CloseTempLine(); 576 } 577 578 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, 579 const CFX_Matrix& formMatrix) { 580 const CPDF_PageObjectList* pObjectList = 581 pFormObj->form()->GetPageObjectList(); 582 if (pObjectList->empty()) 583 return; 584 585 CFX_Matrix curFormMatrix = pFormObj->form_matrix(); 586 curFormMatrix.Concat(formMatrix); 587 588 for (auto it = pObjectList->begin(); it != pObjectList->end(); ++it) { 589 if (CPDF_PageObject* pPageObj = it->get()) { 590 if (pPageObj->IsText()) 591 ProcessTextObject(pPageObj->AsText(), curFormMatrix, pObjectList, it); 592 else if (pPageObj->IsForm()) 593 ProcessFormObject(pPageObj->AsForm(), curFormMatrix); 594 } 595 } 596 } 597 598 int CPDF_TextPage::GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const { 599 if (charCode == CPDF_Font::kInvalidCharCode) 600 return 0; 601 602 if (int w = pFont->GetCharWidthF(charCode)) 603 return w; 604 605 ByteString str; 606 pFont->AppendChar(&str, charCode); 607 if (int w = pFont->GetStringWidth(str.c_str(), 1)) 608 return w; 609 610 return pFont->GetCharBBox(charCode).Width(); 611 } 612 613 void CPDF_TextPage::AddCharInfoByLRDirection(wchar_t wChar, 614 PAGECHAR_INFO info) { 615 if (IsControlChar(info)) { 616 info.m_Index = -1; 617 m_CharList.push_back(info); 618 return; 619 } 620 621 info.m_Index = m_TextBuf.GetLength(); 622 if (wChar >= 0xFB00 && wChar <= 0xFB06) { 623 wchar_t* pDst = nullptr; 624 size_t nCount = Unicode_GetNormalization(wChar, pDst); 625 if (nCount >= 1) { 626 pDst = FX_Alloc(wchar_t, nCount); 627 Unicode_GetNormalization(wChar, pDst); 628 for (size_t nIndex = 0; nIndex < nCount; nIndex++) { 629 PAGECHAR_INFO info2 = info; 630 info2.m_Unicode = pDst[nIndex]; 631 info2.m_Flag = FPDFTEXT_CHAR_PIECE; 632 m_TextBuf.AppendChar(info2.m_Unicode); 633 m_CharList.push_back(info2); 634 } 635 FX_Free(pDst); 636 return; 637 } 638 } 639 m_TextBuf.AppendChar(wChar); 640 m_CharList.push_back(info); 641 } 642 643 void CPDF_TextPage::AddCharInfoByRLDirection(wchar_t wChar, 644 PAGECHAR_INFO info) { 645 if (IsControlChar(info)) { 646 info.m_Index = -1; 647 m_CharList.push_back(info); 648 return; 649 } 650 651 info.m_Index = m_TextBuf.GetLength(); 652 wChar = FX_GetMirrorChar(wChar); 653 wchar_t* pDst = nullptr; 654 size_t nCount = Unicode_GetNormalization(wChar, pDst); 655 if (nCount >= 1) { 656 pDst = FX_Alloc(wchar_t, nCount); 657 Unicode_GetNormalization(wChar, pDst); 658 for (size_t nIndex = 0; nIndex < nCount; nIndex++) { 659 PAGECHAR_INFO info2 = info; 660 info2.m_Unicode = pDst[nIndex]; 661 info2.m_Flag = FPDFTEXT_CHAR_PIECE; 662 m_TextBuf.AppendChar(info2.m_Unicode); 663 m_CharList.push_back(info2); 664 } 665 FX_Free(pDst); 666 return; 667 } 668 info.m_Unicode = wChar; 669 m_TextBuf.AppendChar(info.m_Unicode); 670 m_CharList.push_back(info); 671 } 672 673 void CPDF_TextPage::CloseTempLine() { 674 if (m_TempCharList.empty()) 675 return; 676 677 WideString str = m_TempTextBuf.MakeString(); 678 bool bPrevSpace = false; 679 for (size_t i = 0; i < str.GetLength(); i++) { 680 if (str[i] != ' ') { 681 bPrevSpace = false; 682 continue; 683 } 684 if (bPrevSpace) { 685 m_TempTextBuf.Delete(i, 1); 686 m_TempCharList.erase(m_TempCharList.begin() + i); 687 str.Delete(i); 688 i--; 689 } 690 bPrevSpace = true; 691 } 692 CFX_BidiString bidi(str); 693 if (m_parserflag == FPDFText_Direction::Right) 694 bidi.SetOverallDirectionRight(); 695 CFX_BidiChar::Direction eCurrentDirection = bidi.OverallDirection(); 696 for (const auto& segment : bidi) { 697 if (segment.direction == CFX_BidiChar::RIGHT || 698 (segment.direction == CFX_BidiChar::NEUTRAL && 699 eCurrentDirection == CFX_BidiChar::RIGHT)) { 700 eCurrentDirection = CFX_BidiChar::RIGHT; 701 for (int m = segment.start + segment.count; m > segment.start; --m) 702 AddCharInfoByRLDirection(bidi.CharAt(m - 1), m_TempCharList[m - 1]); 703 } else { 704 eCurrentDirection = CFX_BidiChar::LEFT; 705 for (int m = segment.start; m < segment.start + segment.count; m++) 706 AddCharInfoByLRDirection(bidi.CharAt(m), m_TempCharList[m]); 707 } 708 } 709 m_TempCharList.clear(); 710 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength()); 711 } 712 713 void CPDF_TextPage::ProcessTextObject( 714 CPDF_TextObject* pTextObj, 715 const CFX_Matrix& formMatrix, 716 const CPDF_PageObjectList* pObjList, 717 CPDF_PageObjectList::const_iterator ObjPos) { 718 if (fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) 719 return; 720 721 size_t count = m_LineObj.size(); 722 PDFTEXT_Obj Obj; 723 Obj.m_pTextObj = pTextObj; 724 Obj.m_formMatrix = formMatrix; 725 if (count == 0) { 726 m_LineObj.push_back(Obj); 727 return; 728 } 729 if (IsSameAsPreTextObject(pTextObj, pObjList, ObjPos)) 730 return; 731 732 PDFTEXT_Obj prev_Obj = m_LineObj[count - 1]; 733 size_t nItem = prev_Obj.m_pTextObj->CountItems(); 734 if (nItem == 0) 735 return; 736 737 CPDF_TextObjectItem item; 738 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item); 739 float prev_width = 740 GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) * 741 prev_Obj.m_pTextObj->GetFontSize() / 1000; 742 743 CFX_Matrix prev_matrix = prev_Obj.m_pTextObj->GetTextMatrix(); 744 prev_width = fabs(prev_width); 745 prev_matrix.Concat(prev_Obj.m_formMatrix); 746 prev_width = prev_matrix.TransformDistance(prev_width); 747 pTextObj->GetItemInfo(0, &item); 748 float this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) * 749 pTextObj->GetFontSize() / 1000; 750 this_width = fabs(this_width); 751 752 CFX_Matrix this_matrix = pTextObj->GetTextMatrix(); 753 this_width = fabs(this_width); 754 this_matrix.Concat(formMatrix); 755 this_width = this_matrix.TransformDistance(this_width); 756 757 float threshold = prev_width > this_width ? prev_width / 4 : this_width / 4; 758 CFX_PointF prev_pos = m_DisplayMatrix.Transform( 759 prev_Obj.m_formMatrix.Transform(prev_Obj.m_pTextObj->GetPos())); 760 CFX_PointF this_pos = 761 m_DisplayMatrix.Transform(formMatrix.Transform(pTextObj->GetPos())); 762 if (fabs(this_pos.y - prev_pos.y) > threshold * 2) { 763 for (size_t i = 0; i < count; i++) 764 ProcessTextObject(m_LineObj[i]); 765 m_LineObj.clear(); 766 m_LineObj.push_back(Obj); 767 return; 768 } 769 770 for (size_t i = count; i > 0; --i) { 771 PDFTEXT_Obj prev_text_obj = m_LineObj[i - 1]; 772 CFX_PointF new_prev_pos = 773 m_DisplayMatrix.Transform(prev_text_obj.m_formMatrix.Transform( 774 prev_text_obj.m_pTextObj->GetPos())); 775 if (this_pos.x >= new_prev_pos.x) { 776 m_LineObj.insert(m_LineObj.begin() + i, Obj); 777 return; 778 } 779 } 780 m_LineObj.insert(m_LineObj.begin(), Obj); 781 } 782 783 FPDFText_MarkedContent CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) { 784 CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get(); 785 if (!pTextObj->m_ContentMark.HasRef()) 786 return FPDFText_MarkedContent::Pass; 787 788 size_t nContentMark = pTextObj->m_ContentMark.CountItems(); 789 if (nContentMark == 0) 790 return FPDFText_MarkedContent::Pass; 791 792 WideString actText; 793 bool bExist = false; 794 CPDF_Dictionary* pDict = nullptr; 795 for (size_t i = 0; i < nContentMark; ++i) { 796 const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(i); 797 pDict = item.GetParam(); 798 if (!pDict) 799 continue; 800 CPDF_String* temp = ToString(pDict->GetObjectFor("ActualText")); 801 if (temp) { 802 bExist = true; 803 actText = temp->GetUnicodeText(); 804 } 805 } 806 if (!bExist) 807 return FPDFText_MarkedContent::Pass; 808 809 if (m_pPreTextObj) { 810 const CPDF_ContentMark& mark = m_pPreTextObj->m_ContentMark; 811 if (mark.HasRef() && mark.CountItems() == nContentMark && 812 mark.GetItem(nContentMark - 1).GetParam() == pDict) { 813 return FPDFText_MarkedContent::Done; 814 } 815 } 816 817 if (actText.IsEmpty()) 818 return FPDFText_MarkedContent::Pass; 819 820 CPDF_Font* pFont = pTextObj->GetFont(); 821 bExist = false; 822 for (size_t i = 0; i < actText.GetLength(); i++) { 823 if (pFont->CharCodeFromUnicode(actText[i]) != CPDF_Font::kInvalidCharCode) { 824 bExist = true; 825 break; 826 } 827 } 828 if (!bExist) 829 return FPDFText_MarkedContent::Pass; 830 831 bExist = false; 832 for (size_t i = 0; i < actText.GetLength(); i++) { 833 wchar_t wChar = actText[i]; 834 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) { 835 bExist = true; 836 break; 837 } 838 } 839 if (!bExist) 840 return FPDFText_MarkedContent::Done; 841 842 return FPDFText_MarkedContent::Delay; 843 } 844 845 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) { 846 CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get(); 847 if (!pTextObj->m_ContentMark.HasRef()) 848 return; 849 850 int nContentMark = pTextObj->m_ContentMark.CountItems(); 851 if (nContentMark < 1) 852 return; 853 854 WideString actText; 855 for (int n = 0; n < nContentMark; n++) { 856 const CPDF_ContentMarkItem& item = pTextObj->m_ContentMark.GetItem(n); 857 CPDF_Dictionary* pDict = item.GetParam(); 858 if (pDict) 859 actText = pDict->GetUnicodeTextFor("ActualText"); 860 } 861 if (actText.IsEmpty()) 862 return; 863 864 CPDF_Font* pFont = pTextObj->GetFont(); 865 CFX_Matrix matrix = pTextObj->GetTextMatrix(); 866 matrix.Concat(Obj.m_formMatrix); 867 868 for (size_t k = 0; k < actText.GetLength(); k++) { 869 wchar_t wChar = actText[k]; 870 if (wChar <= 0x80 && !isprint(wChar)) 871 wChar = 0x20; 872 if (wChar >= 0xFFFD) 873 continue; 874 875 PAGECHAR_INFO charinfo; 876 charinfo.m_Origin = pTextObj->GetPos(); 877 charinfo.m_Index = m_TextBuf.GetLength(); 878 charinfo.m_Unicode = wChar; 879 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar); 880 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE; 881 charinfo.m_pTextObj = pTextObj; 882 charinfo.m_CharBox = pTextObj->GetRect(); 883 charinfo.m_Matrix = matrix; 884 m_TempTextBuf.AppendChar(wChar); 885 m_TempCharList.push_back(charinfo); 886 } 887 } 888 889 void CPDF_TextPage::FindPreviousTextObject() { 890 if (m_TempCharList.empty() && m_CharList.empty()) 891 return; 892 893 PAGECHAR_INFO preChar = 894 m_TempCharList.empty() ? m_CharList.back() : m_TempCharList.back(); 895 896 if (preChar.m_pTextObj) 897 m_pPreTextObj = preChar.m_pTextObj; 898 } 899 900 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend, 901 int32_t iBufStartAppend) { 902 int32_t i = iCharListStartAppend; 903 int32_t j = pdfium::CollectionSize<int32_t>(m_TempCharList) - 1; 904 for (; i < j; i++, j--) { 905 std::swap(m_TempCharList[i], m_TempCharList[j]); 906 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index); 907 } 908 wchar_t* pTempBuffer = m_TempTextBuf.GetBuffer(); 909 i = iBufStartAppend; 910 j = m_TempTextBuf.GetLength() - 1; 911 for (; i < j; i++, j--) 912 std::swap(pTempBuffer[i], pTempBuffer[j]); 913 } 914 915 bool CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj, 916 const CPDF_Font* pFont, 917 size_t nItems) const { 918 WideString str; 919 for (size_t i = 0; i < nItems; ++i) { 920 CPDF_TextObjectItem item; 921 pTextObj->GetItemInfo(i, &item); 922 if (item.m_CharCode == static_cast<uint32_t>(-1)) 923 continue; 924 WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode); 925 wchar_t wChar = !wstrItem.IsEmpty() ? wstrItem[0] : 0; 926 if (wChar == 0) 927 wChar = item.m_CharCode; 928 if (wChar) 929 str += wChar; 930 } 931 return CFX_BidiString(str).OverallDirection() == CFX_BidiChar::RIGHT; 932 } 933 934 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) { 935 CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get(); 936 if (fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) 937 return; 938 CFX_Matrix formMatrix = Obj.m_formMatrix; 939 CPDF_Font* pFont = pTextObj->GetFont(); 940 CFX_Matrix matrix = pTextObj->GetTextMatrix(); 941 matrix.Concat(formMatrix); 942 943 FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj); 944 if (ePreMKC == FPDFText_MarkedContent::Done) { 945 m_pPreTextObj = pTextObj; 946 m_perMatrix = formMatrix; 947 return; 948 } 949 GenerateCharacter result = GenerateCharacter::None; 950 if (m_pPreTextObj) { 951 result = ProcessInsertObject(pTextObj, formMatrix); 952 if (result == GenerateCharacter::LineBreak) 953 m_CurlineRect = Obj.m_pTextObj->GetRect(); 954 else 955 m_CurlineRect.Union(Obj.m_pTextObj->GetRect()); 956 957 switch (result) { 958 case GenerateCharacter::None: 959 break; 960 case GenerateCharacter::Space: { 961 PAGECHAR_INFO generateChar; 962 if (GenerateCharInfo(TEXT_SPACE_CHAR, generateChar)) { 963 if (!formMatrix.IsIdentity()) 964 generateChar.m_Matrix = formMatrix; 965 m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR); 966 m_TempCharList.push_back(generateChar); 967 } 968 break; 969 } 970 case GenerateCharacter::LineBreak: 971 CloseTempLine(); 972 if (m_TextBuf.GetSize()) { 973 AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix); 974 AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix); 975 } 976 break; 977 case GenerateCharacter::Hyphen: 978 if (pTextObj->CountChars() == 1) { 979 CPDF_TextObjectItem item; 980 pTextObj->GetCharInfo(0, &item); 981 WideString wstrItem = 982 pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 983 if (wstrItem.IsEmpty()) 984 wstrItem += (wchar_t)item.m_CharCode; 985 wchar_t curChar = wstrItem[0]; 986 if (curChar == 0x2D || curChar == 0xAD) 987 return; 988 } 989 while (m_TempTextBuf.GetSize() > 0 && 990 m_TempTextBuf.AsStringView()[m_TempTextBuf.GetLength() - 1] == 991 0x20) { 992 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 993 m_TempCharList.pop_back(); 994 } 995 PAGECHAR_INFO* charinfo = &m_TempCharList.back(); 996 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 997 charinfo->m_Unicode = 0x2; 998 charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN; 999 m_TempTextBuf.AppendChar(0xfffe); 1000 break; 1001 } 1002 } else { 1003 m_CurlineRect = Obj.m_pTextObj->GetRect(); 1004 } 1005 1006 if (ePreMKC == FPDFText_MarkedContent::Delay) { 1007 ProcessMarkedContent(Obj); 1008 m_pPreTextObj = pTextObj; 1009 m_perMatrix = formMatrix; 1010 return; 1011 } 1012 m_pPreTextObj = pTextObj; 1013 m_perMatrix = formMatrix; 1014 size_t nItems = pTextObj->CountItems(); 1015 float baseSpace = CalculateBaseSpace(pTextObj, matrix); 1016 1017 const bool bR2L = IsRightToLeft(pTextObj, pFont, nItems); 1018 const bool bIsBidiAndMirrorInverse = 1019 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0; 1020 int32_t iBufStartAppend = m_TempTextBuf.GetLength(); 1021 int32_t iCharListStartAppend = 1022 pdfium::CollectionSize<int32_t>(m_TempCharList); 1023 1024 float spacing = 0; 1025 for (size_t i = 0; i < nItems; ++i) { 1026 CPDF_TextObjectItem item; 1027 PAGECHAR_INFO charinfo; 1028 pTextObj->GetItemInfo(i, &item); 1029 if (item.m_CharCode == static_cast<uint32_t>(-1)) { 1030 WideString str = m_TempTextBuf.MakeString(); 1031 if (str.IsEmpty()) 1032 str = m_TextBuf.AsStringView(); 1033 if (str.IsEmpty() || str[str.GetLength() - 1] == TEXT_SPACE_CHAR) 1034 continue; 1035 1036 float fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1037 spacing = -fontsize_h * item.m_Origin.x / 1000; 1038 continue; 1039 } 1040 float charSpace = pTextObj->m_TextState.GetCharSpace(); 1041 if (charSpace > 0.001) 1042 spacing += matrix.TransformDistance(charSpace); 1043 else if (charSpace < -0.001) 1044 spacing -= matrix.TransformDistance(fabs(charSpace)); 1045 spacing -= baseSpace; 1046 if (spacing && i > 0) { 1047 int last_width = 0; 1048 float fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1049 uint32_t space_charcode = pFont->CharCodeFromUnicode(' '); 1050 float threshold = 0; 1051 if (space_charcode != CPDF_Font::kInvalidCharCode) 1052 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; 1053 if (threshold > fontsize_h / 3) 1054 threshold = 0; 1055 else 1056 threshold /= 2; 1057 if (threshold == 0) { 1058 threshold = fontsize_h; 1059 int this_width = abs(GetCharWidth(item.m_CharCode, pFont)); 1060 threshold = 1061 this_width > last_width ? (float)this_width : (float)last_width; 1062 threshold = NormalizeThreshold(threshold); 1063 threshold = fontsize_h * threshold / 1000; 1064 } 1065 if (threshold && (spacing && spacing >= threshold)) { 1066 charinfo.m_Unicode = TEXT_SPACE_CHAR; 1067 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED; 1068 charinfo.m_pTextObj = pTextObj; 1069 charinfo.m_Index = m_TextBuf.GetLength(); 1070 m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR); 1071 charinfo.m_CharCode = CPDF_Font::kInvalidCharCode; 1072 charinfo.m_Matrix = formMatrix; 1073 charinfo.m_Origin = matrix.Transform(item.m_Origin); 1074 charinfo.m_CharBox = 1075 CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y, 1076 charinfo.m_Origin.x, charinfo.m_Origin.y); 1077 m_TempCharList.push_back(charinfo); 1078 } 1079 if (item.m_CharCode == CPDF_Font::kInvalidCharCode) 1080 continue; 1081 } 1082 spacing = 0; 1083 WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode); 1084 bool bNoUnicode = false; 1085 if (wstrItem.IsEmpty() && item.m_CharCode) { 1086 wstrItem += static_cast<wchar_t>(item.m_CharCode); 1087 bNoUnicode = true; 1088 } 1089 charinfo.m_Index = -1; 1090 charinfo.m_CharCode = item.m_CharCode; 1091 if (bNoUnicode) 1092 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE; 1093 else 1094 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL; 1095 1096 charinfo.m_pTextObj = pTextObj; 1097 charinfo.m_Origin = matrix.Transform(item.m_Origin); 1098 1099 FX_RECT rect = 1100 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode); 1101 charinfo.m_CharBox.top = 1102 rect.top * pTextObj->GetFontSize() / 1000 + item.m_Origin.y; 1103 charinfo.m_CharBox.left = 1104 rect.left * pTextObj->GetFontSize() / 1000 + item.m_Origin.x; 1105 charinfo.m_CharBox.right = 1106 rect.right * pTextObj->GetFontSize() / 1000 + item.m_Origin.x; 1107 charinfo.m_CharBox.bottom = 1108 rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_Origin.y; 1109 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) { 1110 charinfo.m_CharBox.top = 1111 charinfo.m_CharBox.bottom + pTextObj->GetFontSize(); 1112 } 1113 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) { 1114 charinfo.m_CharBox.right = 1115 charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode); 1116 } 1117 charinfo.m_CharBox = matrix.TransformRect(charinfo.m_CharBox); 1118 charinfo.m_Matrix = matrix; 1119 if (wstrItem.IsEmpty()) { 1120 charinfo.m_Unicode = 0; 1121 m_TempCharList.push_back(charinfo); 1122 m_TempTextBuf.AppendChar(0xfffe); 1123 continue; 1124 } 1125 int nTotal = wstrItem.GetLength(); 1126 bool bDel = false; 1127 const int count = std::min(pdfium::CollectionSize<int>(m_TempCharList), 7); 1128 float threshold = charinfo.m_Matrix.TransformXDistance( 1129 (float)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize()); 1130 for (int n = pdfium::CollectionSize<int>(m_TempCharList); 1131 n > pdfium::CollectionSize<int>(m_TempCharList) - count; n--) { 1132 const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1]; 1133 CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin; 1134 if (charinfo1.m_CharCode == charinfo.m_CharCode && 1135 charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() && 1136 fabs(diff.x) < threshold && fabs(diff.y) < threshold) { 1137 bDel = true; 1138 break; 1139 } 1140 } 1141 if (!bDel) { 1142 for (int nIndex = 0; nIndex < nTotal; nIndex++) { 1143 charinfo.m_Unicode = wstrItem[nIndex]; 1144 if (charinfo.m_Unicode) { 1145 charinfo.m_Index = m_TextBuf.GetLength(); 1146 m_TempTextBuf.AppendChar(charinfo.m_Unicode); 1147 } else { 1148 m_TempTextBuf.AppendChar(0xfffe); 1149 } 1150 m_TempCharList.push_back(charinfo); 1151 } 1152 } else if (i == 0) { 1153 WideString str = m_TempTextBuf.MakeString(); 1154 if (!str.IsEmpty() && str[str.GetLength() - 1] == TEXT_SPACE_CHAR) { 1155 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1156 m_TempCharList.pop_back(); 1157 } 1158 } 1159 } 1160 if (bIsBidiAndMirrorInverse) 1161 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend); 1162 } 1163 1164 CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode( 1165 const CPDF_TextObject* pTextObj) const { 1166 size_t nChars = pTextObj->CountChars(); 1167 if (nChars <= 1) 1168 return m_TextlineDir; 1169 1170 CPDF_TextObjectItem first, last; 1171 pTextObj->GetCharInfo(0, &first); 1172 pTextObj->GetCharInfo(nChars - 1, &last); 1173 1174 CFX_Matrix textMatrix = pTextObj->GetTextMatrix(); 1175 first.m_Origin = textMatrix.Transform(first.m_Origin); 1176 last.m_Origin = textMatrix.Transform(last.m_Origin); 1177 1178 float dX = fabs(last.m_Origin.x - first.m_Origin.x); 1179 float dY = fabs(last.m_Origin.y - first.m_Origin.y); 1180 if (dX <= 0.0001f && dY <= 0.0001f) 1181 return TextOrientation::Unknown; 1182 1183 CFX_VectorF v(dX, dY); 1184 v.Normalize(); 1185 if (v.y <= 0.0872f) 1186 return v.x <= 0.0872f ? m_TextlineDir : TextOrientation::Horizontal; 1187 1188 if (v.x <= 0.0872f) 1189 return TextOrientation::Vertical; 1190 1191 return m_TextlineDir; 1192 } 1193 1194 bool CPDF_TextPage::IsHyphen(wchar_t curChar) const { 1195 WideStringView curText = m_TempTextBuf.AsStringView(); 1196 if (curText.IsEmpty()) 1197 curText = m_TextBuf.AsStringView(); 1198 1199 if (curText.IsEmpty()) 1200 return false; 1201 1202 auto iter = curText.rbegin(); 1203 for (; (iter + 1) != curText.rend() && *iter == 0x20; iter++) { 1204 // Do nothing 1205 } 1206 1207 if (!IsHyphenCode(*iter)) 1208 return false; 1209 1210 if ((iter + 1) != curText.rend()) { 1211 iter++; 1212 if (FXSYS_iswalpha(*iter) && FXSYS_iswalpha(*iter)) 1213 return true; 1214 } 1215 1216 const PAGECHAR_INFO* preInfo; 1217 if (!m_TempCharList.empty()) 1218 preInfo = &m_TempCharList.back(); 1219 else if (!m_CharList.empty()) 1220 preInfo = &m_CharList.back(); 1221 else 1222 return false; 1223 1224 return FPDFTEXT_CHAR_PIECE == preInfo->m_Flag && 1225 IsHyphenCode(preInfo->m_Unicode); 1226 } 1227 1228 CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject( 1229 const CPDF_TextObject* pObj, 1230 const CFX_Matrix& formMatrix) { 1231 FindPreviousTextObject(); 1232 TextOrientation WritingMode = GetTextObjectWritingMode(pObj); 1233 if (WritingMode == TextOrientation::Unknown) 1234 WritingMode = GetTextObjectWritingMode(m_pPreTextObj.Get()); 1235 1236 size_t nItem = m_pPreTextObj->CountItems(); 1237 if (nItem == 0) 1238 return GenerateCharacter::None; 1239 1240 CPDF_TextObjectItem PrevItem; 1241 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem); 1242 1243 CPDF_TextObjectItem item; 1244 pObj->GetItemInfo(0, &item); 1245 1246 CFX_FloatRect this_rect = pObj->GetRect(); 1247 CFX_FloatRect prev_rect = m_pPreTextObj->GetRect(); 1248 1249 WideString wstrItem = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 1250 if (wstrItem.IsEmpty()) 1251 wstrItem += static_cast<wchar_t>(item.m_CharCode); 1252 wchar_t curChar = wstrItem[0]; 1253 if (WritingMode == TextOrientation::Horizontal) { 1254 if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) { 1255 float top = this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top; 1256 float bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom 1257 : prev_rect.bottom; 1258 if (bottom >= top) { 1259 return IsHyphen(curChar) ? GenerateCharacter::Hyphen 1260 : GenerateCharacter::LineBreak; 1261 } 1262 } 1263 } else if (WritingMode == TextOrientation::Vertical) { 1264 if (this_rect.Width() > pObj->GetFontSize() * 0.1f && 1265 prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) { 1266 float left = this_rect.left > m_CurlineRect.left ? this_rect.left 1267 : m_CurlineRect.left; 1268 float right = this_rect.right < m_CurlineRect.right ? this_rect.right 1269 : m_CurlineRect.right; 1270 if (right <= left) { 1271 return IsHyphen(curChar) ? GenerateCharacter::Hyphen 1272 : GenerateCharacter::LineBreak; 1273 } 1274 } 1275 } 1276 1277 float last_pos = PrevItem.m_Origin.x; 1278 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont()); 1279 float last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000; 1280 last_width = fabs(last_width); 1281 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont()); 1282 float this_width = nThisWidth * pObj->GetFontSize() / 1000; 1283 this_width = fabs(this_width); 1284 float threshold = last_width > this_width ? last_width / 4 : this_width / 4; 1285 1286 CFX_Matrix prev_matrix = m_pPreTextObj->GetTextMatrix(); 1287 prev_matrix.Concat(m_perMatrix); 1288 1289 CFX_Matrix prev_reverse = prev_matrix.GetInverse(); 1290 1291 CFX_PointF pos = prev_reverse.Transform(formMatrix.Transform(pObj->GetPos())); 1292 if (last_width < this_width) 1293 threshold = prev_reverse.TransformDistance(threshold); 1294 1295 bool bNewline = false; 1296 if (WritingMode == TextOrientation::Horizontal) { 1297 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom, 1298 m_pPreTextObj->m_Right, pObj->m_Top); 1299 CFX_FloatRect rect2 = m_pPreTextObj->GetRect(); 1300 CFX_FloatRect rect3 = rect1; 1301 rect1.Intersect(rect2); 1302 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) || 1303 ((pos.y > threshold * 2 || pos.y < threshold * -3) && 1304 (fabs(pos.y) < 1 ? fabs(pos.x) < fabs(pos.y) : true))) { 1305 bNewline = true; 1306 if (nItem > 1) { 1307 CPDF_TextObjectItem tempItem; 1308 m_pPreTextObj->GetItemInfo(0, &tempItem); 1309 CFX_Matrix m = m_pPreTextObj->GetTextMatrix(); 1310 if (PrevItem.m_Origin.x > tempItem.m_Origin.x && 1311 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 && 1312 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 && 1313 m.c < 0.1) { 1314 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000, 1315 m_pPreTextObj->m_Top); 1316 if (re.Contains(pObj->GetPos())) { 1317 bNewline = false; 1318 } else { 1319 CFX_FloatRect rect(0, pObj->m_Bottom, 1000, pObj->m_Top); 1320 if (rect.Contains(m_pPreTextObj->GetPos())) 1321 bNewline = false; 1322 } 1323 } 1324 } 1325 } 1326 } 1327 if (bNewline) { 1328 return IsHyphen(curChar) ? GenerateCharacter::Hyphen 1329 : GenerateCharacter::LineBreak; 1330 } 1331 1332 if (pObj->CountChars() == 1 && (0x2D == curChar || 0xAD == curChar) && 1333 IsHyphen(curChar)) { 1334 return GenerateCharacter::Hyphen; 1335 } 1336 WideString PrevStr = 1337 m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode); 1338 wchar_t preChar = PrevStr.Last(); 1339 CFX_Matrix matrix = pObj->GetTextMatrix(); 1340 matrix.Concat(formMatrix); 1341 1342 threshold = (float)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth); 1343 threshold = threshold > 400 1344 ? (threshold < 700 1345 ? threshold / 4 1346 : (threshold > 800 ? threshold / 6 : threshold / 5)) 1347 : (threshold / 2); 1348 if (nLastWidth >= nThisWidth) { 1349 threshold *= fabs(m_pPreTextObj->GetFontSize()); 1350 } else { 1351 threshold *= fabs(pObj->GetFontSize()); 1352 threshold = matrix.TransformDistance(threshold); 1353 threshold = prev_reverse.TransformDistance(threshold); 1354 } 1355 threshold /= 1000; 1356 if ((threshold < 1.4881 && threshold > 1.4879) || 1357 (threshold < 1.39001 && threshold > 1.38999)) { 1358 threshold *= 1.5; 1359 } 1360 if (fabs(last_pos + last_width - pos.x) > threshold && curChar != L' ' && 1361 preChar != L' ') { 1362 if (curChar != L' ' && preChar != L' ') { 1363 if ((pos.x - last_pos - last_width) > threshold || 1364 (last_pos - pos.x - last_width) > threshold) { 1365 return GenerateCharacter::Space; 1366 } 1367 if (pos.x < 0 && (last_pos - pos.x - last_width) > threshold) 1368 return GenerateCharacter::Space; 1369 if ((pos.x - last_pos - last_width) > this_width || 1370 (pos.x - last_pos - this_width) > last_width) { 1371 return GenerateCharacter::Space; 1372 } 1373 } 1374 } 1375 return GenerateCharacter::None; 1376 } 1377 1378 bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1, 1379 CPDF_TextObject* pTextObj2) { 1380 if (!pTextObj1 || !pTextObj2) 1381 return false; 1382 1383 CFX_FloatRect rcPreObj = pTextObj2->GetRect(); 1384 CFX_FloatRect rcCurObj = pTextObj1->GetRect(); 1385 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) { 1386 float dbXdif = fabs(rcPreObj.left - rcCurObj.left); 1387 size_t nCount = m_CharList.size(); 1388 if (nCount >= 2) { 1389 PAGECHAR_INFO perCharTemp = m_CharList[nCount - 2]; 1390 float dbSpace = perCharTemp.m_CharBox.Width(); 1391 if (dbXdif > dbSpace) 1392 return false; 1393 } 1394 } 1395 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) { 1396 rcPreObj.Intersect(rcCurObj); 1397 if (rcPreObj.IsEmpty()) 1398 return false; 1399 if (fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) { 1400 return false; 1401 } 1402 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) 1403 return false; 1404 } 1405 1406 size_t nPreCount = pTextObj2->CountItems(); 1407 if (nPreCount != pTextObj1->CountItems()) 1408 return false; 1409 1410 // If both objects have no items, consider them same. 1411 if (nPreCount == 0) 1412 return true; 1413 1414 CPDF_TextObjectItem itemPer; 1415 CPDF_TextObjectItem itemCur; 1416 for (size_t i = 0; i < nPreCount; ++i) { 1417 pTextObj2->GetItemInfo(i, &itemPer); 1418 pTextObj1->GetItemInfo(i, &itemCur); 1419 if (itemCur.m_CharCode != itemPer.m_CharCode) 1420 return false; 1421 } 1422 1423 CFX_PointF diff = pTextObj1->GetPos() - pTextObj2->GetPos(); 1424 float font_size = pTextObj2->GetFontSize(); 1425 float char_size = GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()); 1426 float max_pre_size = 1427 std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), font_size); 1428 if (fabs(diff.x) > char_size * font_size / 1000 * 0.9 || 1429 fabs(diff.y) > max_pre_size / 8) { 1430 return false; 1431 } 1432 return true; 1433 } 1434 1435 bool CPDF_TextPage::IsSameAsPreTextObject( 1436 CPDF_TextObject* pTextObj, 1437 const CPDF_PageObjectList* pObjList, 1438 CPDF_PageObjectList::const_iterator iter) { 1439 int i = 0; 1440 while (i < 5 && iter != pObjList->begin()) { 1441 --iter; 1442 CPDF_PageObject* pOtherObj = iter->get(); 1443 if (pOtherObj == pTextObj || !pOtherObj->IsText()) 1444 continue; 1445 if (IsSameTextObject(pOtherObj->AsText(), pTextObj)) 1446 return true; 1447 ++i; 1448 } 1449 return false; 1450 } 1451 1452 bool CPDF_TextPage::GenerateCharInfo(wchar_t unicode, PAGECHAR_INFO& info) { 1453 const PAGECHAR_INFO* preChar; 1454 if (!m_TempCharList.empty()) 1455 preChar = &m_TempCharList.back(); 1456 else if (!m_CharList.empty()) 1457 preChar = &m_CharList.back(); 1458 else 1459 return false; 1460 1461 info.m_Index = m_TextBuf.GetLength(); 1462 info.m_Unicode = unicode; 1463 info.m_pTextObj = nullptr; 1464 info.m_CharCode = CPDF_Font::kInvalidCharCode; 1465 info.m_Flag = FPDFTEXT_CHAR_GENERATED; 1466 1467 int preWidth = 0; 1468 if (preChar->m_pTextObj && preChar->m_CharCode != -1) { 1469 preWidth = 1470 GetCharWidth(preChar->m_CharCode, preChar->m_pTextObj->GetFont()); 1471 } 1472 1473 float fFontSize = preChar->m_pTextObj ? preChar->m_pTextObj->GetFontSize() 1474 : preChar->m_CharBox.Height(); 1475 if (!fFontSize) 1476 fFontSize = kDefaultFontSize; 1477 1478 info.m_Origin = CFX_PointF( 1479 preChar->m_Origin.x + preWidth * (fFontSize) / 1000, preChar->m_Origin.y); 1480 info.m_CharBox = CFX_FloatRect(info.m_Origin.x, info.m_Origin.y, 1481 info.m_Origin.x, info.m_Origin.y); 1482 return true; 1483 } 1484 1485 bool CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, 1486 const CFX_FloatRect& rect2) { 1487 CFX_FloatRect rect = rect1; 1488 rect.Intersect(rect2); 1489 return !rect.IsEmpty(); 1490 } 1491