1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "../../include/fpdfapi/fpdf_resource.h" 8 #include "../../include/fpdfapi/fpdf_pageobj.h" 9 #include "../../include/fpdftext/fpdf_text.h" 10 #include "../../include/fpdfapi/fpdf_page.h" 11 #include "../../include/fpdfapi/fpdf_module.h" 12 #include <ctype.h> 13 #include "text_int.h" 14 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) 15 { 16 if(curChar < 255 ) { 17 return FALSE; 18 } 19 if ( (curChar >= 0x0600 && curChar <= 0x06FF) 20 || (curChar >= 0xFE70 && curChar <= 0xFEFF) 21 || (curChar >= 0xFB50 && curChar <= 0xFDFF) 22 || (curChar >= 0x0400 && curChar <= 0x04FF) 23 || (curChar >= 0x0500 && curChar <= 0x052F) 24 || (curChar >= 0xA640 && curChar <= 0xA69F) 25 || (curChar >= 0x2DE0 && curChar <= 0x2DFF) 26 || curChar == 8467 27 || (curChar >= 0x2000 && curChar <= 0x206F)) { 28 return FALSE; 29 } 30 return TRUE; 31 } 32 CPDFText_ParseOptions::CPDFText_ParseOptions() 33 : m_bGetCharCodeOnly(FALSE), m_bNormalizeObjs(TRUE), m_bOutputHyphen(FALSE) 34 { 35 } 36 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions) 37 { 38 CPDF_TextPage* pTextPageEx = FX_NEW CPDF_TextPage(pPage, ParserOptions); 39 return pTextPageEx; 40 } 41 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, int flags) 42 { 43 CPDF_TextPage* pTextPage = FX_NEW CPDF_TextPage(pPage, flags); 44 return pTextPage; 45 } 46 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_PageObjects* pObjs, int flags) 47 { 48 CPDF_TextPage* pTextPage = FX_NEW CPDF_TextPage(pObjs, flags); 49 return pTextPage; 50 } 51 IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(const IPDF_TextPage* pTextPage) 52 { 53 if (!pTextPage) { 54 return NULL; 55 } 56 return FX_NEW CPDF_TextPageFind(pTextPage); 57 } 58 IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() 59 { 60 return FX_NEW CPDF_LinkExtract(); 61 } 62 #define TEXT_BLANK_CHAR L' ' 63 #define TEXT_LINEFEED_CHAR L'\n' 64 #define TEXT_RETURN_CHAR L'\r' 65 #define TEXT_EMPTY L"" 66 #define TEXT_BLANK L" " 67 #define TEXT_RETURN_LINEFEED L"\r\n" 68 #define TEXT_LINEFEED L"\n" 69 #define TEXT_CHARRATIO_GAPDELTA 0.070 70 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags) 71 : m_pPreTextObj(NULL), 72 m_IsParsered(FALSE), 73 m_charList(512), 74 m_TempCharList(50), 75 m_TextlineDir(-1), 76 m_CurlineRect(0, 0, 0, 0) 77 { 78 m_pPage = pPage; 79 m_parserflag = flags; 80 m_TextBuf.EstimateSize(0, 10240); 81 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int) pPage->GetPageWidth(), (int)pPage->GetPageHeight(), 0); 82 } 83 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions) 84 : m_pPreTextObj(NULL) 85 , m_IsParsered(FALSE) 86 , m_charList(512) 87 , m_TempCharList(50) 88 , m_TextlineDir(-1) 89 , m_CurlineRect(0, 0, 0, 0) 90 , m_ParseOptions(ParserOptions) 91 { 92 m_pPage = pPage; 93 m_parserflag = 0; 94 m_TextBuf.EstimateSize(0, 10240); 95 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int) pPage->GetPageWidth(), (int)pPage->GetPageHeight(), 0); 96 } 97 CPDF_TextPage::CPDF_TextPage(const CPDF_PageObjects* pPage, int flags) 98 : m_pPreTextObj(NULL), 99 m_IsParsered(FALSE), 100 m_charList(512), 101 m_TempCharList(50), 102 m_TextlineDir(-1), 103 m_CurlineRect(0, 0, 0, 0) 104 { 105 m_pPage = pPage; 106 m_parserflag = flags; 107 m_TextBuf.EstimateSize(0, 10240); 108 CFX_FloatRect pageRect = pPage->CalcBoundingBox(); 109 m_DisplayMatrix = CFX_AffineMatrix(1, 0, 0, -1, pageRect.right, pageRect.top); 110 } 111 void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) 112 { 113 m_ParseOptions.m_bNormalizeObjs = bNormalize; 114 } 115 FX_BOOL CPDF_TextPage::IsControlChar(PAGECHAR_INFO* pCharInfo) 116 { 117 if(!pCharInfo) { 118 return FALSE; 119 } 120 switch(pCharInfo->m_Unicode) { 121 case 0x2: 122 case 0x3: 123 case 0x93: 124 case 0x94: 125 case 0x96: 126 case 0x97: 127 case 0x98: 128 case 0xfffe: 129 if(pCharInfo->m_Flag == FPDFTEXT_CHAR_HYPHEN) { 130 return FALSE; 131 } else { 132 return TRUE; 133 } 134 default: 135 return FALSE; 136 } 137 } 138 FX_BOOL CPDF_TextPage::ParseTextPage() 139 { 140 if (!m_pPage) { 141 m_IsParsered = FALSE; 142 return FALSE; 143 } 144 m_IsParsered = FALSE; 145 m_TextBuf.Clear(); 146 m_charList.RemoveAll(); 147 m_pPreTextObj = NULL; 148 ProcessObject(); 149 m_IsParsered = TRUE; 150 if(!m_ParseOptions.m_bGetCharCodeOnly) { 151 m_CharIndex.RemoveAll(); 152 int nCount = m_charList.GetSize(); 153 if(nCount) { 154 m_CharIndex.Add(0); 155 } 156 for(int i = 0; i < nCount; i++) { 157 int indexSize = m_CharIndex.GetSize(); 158 FX_BOOL bNormal = FALSE; 159 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i); 160 if(charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) { 161 bNormal = TRUE; 162 } 163 #ifdef FOXIT_CHROME_BUILD 164 else if(charinfo.m_Unicode == 0 || IsControlChar(&charinfo)) 165 #else 166 else if(charinfo.m_Unicode == 0) 167 #endif 168 bNormal = FALSE; 169 else { 170 bNormal = TRUE; 171 } 172 if(bNormal) { 173 if(indexSize % 2) { 174 m_CharIndex.Add(1); 175 } else { 176 if(indexSize <= 0) { 177 continue; 178 } 179 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1); 180 } 181 } else { 182 if(indexSize % 2) { 183 if(indexSize <= 0) { 184 continue; 185 } 186 m_CharIndex.SetAt(indexSize - 1, i + 1); 187 } else { 188 m_CharIndex.Add(i + 1); 189 } 190 } 191 } 192 int indexSize = m_CharIndex.GetSize(); 193 if(indexSize % 2) { 194 m_CharIndex.RemoveAt(indexSize - 1); 195 } 196 } 197 return TRUE; 198 } 199 int CPDF_TextPage::CountChars() const 200 { 201 if(m_ParseOptions.m_bGetCharCodeOnly) { 202 return m_TextBuf.GetSize(); 203 } 204 return m_charList.GetSize(); 205 } 206 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const 207 { 208 int indexSize = m_CharIndex.GetSize(); 209 int count = 0; 210 for(int i = 0; i < indexSize; i += 2) { 211 count += m_CharIndex.GetAt(i + 1); 212 if(count > TextIndex) { 213 return TextIndex - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i); 214 } 215 } 216 return -1; 217 } 218 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const 219 { 220 int indexSize = m_CharIndex.GetSize(); 221 int count = 0; 222 for(int i = 0; i < indexSize; i += 2) { 223 count += m_CharIndex.GetAt(i + 1); 224 if(m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) { 225 if(CharIndex - m_CharIndex.GetAt(i) < 0) { 226 return -1; 227 } 228 return CharIndex - m_CharIndex.GetAt(i) + count - m_CharIndex.GetAt(i + 1); 229 } 230 } 231 return -1; 232 } 233 void CPDF_TextPage::GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const 234 { 235 if(m_ParseOptions.m_bGetCharCodeOnly) { 236 return; 237 } 238 if(start < 0 || nCount == 0) { 239 return; 240 } 241 if (!m_IsParsered) { 242 return; 243 } 244 PAGECHAR_INFO info_curchar; 245 CPDF_TextObject* pCurObj = NULL; 246 CFX_FloatRect rect; 247 int curPos = start; 248 FX_BOOL flagNewRect = TRUE; 249 if (nCount + start > m_charList.GetSize() || nCount == -1) { 250 nCount = m_charList.GetSize() - start; 251 } 252 while (nCount--) { 253 info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++); 254 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) { 255 continue; 256 } 257 if(info_curchar.m_CharBox.Width() < 0.01 || info_curchar.m_CharBox.Height() < 0.01) { 258 continue; 259 } 260 if(!pCurObj) { 261 pCurObj = info_curchar.m_pTextObj; 262 } 263 if (pCurObj != info_curchar.m_pTextObj) { 264 rectArray.Add(rect); 265 pCurObj = info_curchar.m_pTextObj; 266 flagNewRect = TRUE; 267 } 268 if (flagNewRect) { 269 FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY; 270 CFX_AffineMatrix matrix, matrix_reverse; 271 info_curchar.m_pTextObj->GetTextMatrix(&matrix); 272 matrix.Concat(info_curchar.m_Matrix); 273 matrix_reverse.SetReverse(matrix); 274 matrix_reverse.Transform(orgX, orgY); 275 rect.left = info_curchar.m_CharBox.left; 276 rect.right = info_curchar.m_CharBox.right; 277 if (pCurObj->GetFont()->GetTypeDescent()) { 278 rect.bottom = orgY + pCurObj->GetFont()->GetTypeDescent() * pCurObj->GetFontSize() / 1000; 279 FX_FLOAT xPosTemp = orgX; 280 matrix.Transform(xPosTemp, rect.bottom); 281 } else { 282 rect.bottom = info_curchar.m_CharBox.bottom; 283 } 284 if (pCurObj->GetFont()->GetTypeAscent()) { 285 rect.top = orgY + pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000; 286 FX_FLOAT xPosTemp = orgX + GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) * pCurObj->GetFontSize() / 1000; 287 matrix.Transform(xPosTemp, rect.top); 288 } else { 289 rect.top = info_curchar.m_CharBox.top; 290 } 291 flagNewRect = FALSE; 292 rect = info_curchar.m_CharBox; 293 rect.Normalize(); 294 } else { 295 info_curchar.m_CharBox.Normalize(); 296 if (rect.left > info_curchar.m_CharBox.left) { 297 rect.left = info_curchar.m_CharBox.left; 298 } 299 if (rect.right < info_curchar.m_CharBox.right) { 300 rect.right = info_curchar.m_CharBox.right; 301 } 302 if ( rect.top < info_curchar.m_CharBox.top) { 303 rect.top = info_curchar.m_CharBox.top; 304 } 305 if (rect.bottom > info_curchar.m_CharBox.bottom) { 306 rect.bottom = info_curchar.m_CharBox.bottom; 307 } 308 } 309 } 310 rectArray.Add(rect); 311 return; 312 } 313 int CPDF_TextPage::GetIndexAtPos(CPDF_Point point , FX_FLOAT xTorelance, FX_FLOAT yTorelance) const 314 { 315 if(m_ParseOptions.m_bGetCharCodeOnly) { 316 return -3; 317 } 318 if (!m_IsParsered) { 319 return -3; 320 } 321 FX_FLOAT distance = 0; 322 int pos = 0; 323 int NearPos = -1; 324 double xdif = 5000, ydif = 5000; 325 while(pos < m_charList.GetSize()) { 326 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos)); 327 CFX_FloatRect charrect = charinfo.m_CharBox; 328 if (charrect.Contains(point.x, point.y)) { 329 break; 330 } 331 if (xTorelance > 0 || yTorelance > 0) { 332 CFX_FloatRect charRectExt; 333 charrect.Normalize(); 334 charRectExt.left = charrect.left - xTorelance / 2; 335 charRectExt.right = charrect.right + xTorelance / 2; 336 charRectExt.top = charrect.top + yTorelance / 2; 337 charRectExt.bottom = charrect.bottom - yTorelance / 2; 338 if (charRectExt.Contains(point.x, point.y)) { 339 double curXdif, curYdif; 340 curXdif = FXSYS_fabs(point.x - charrect.left) < FXSYS_fabs(point.x - charrect.right) ? FXSYS_fabs(point.x - charrect.left) : FXSYS_fabs(point.x - charrect.right); 341 curYdif = FXSYS_fabs(point.y - charrect.bottom) < FXSYS_fabs(point.y - charrect.top ) ? FXSYS_fabs(point.y - charrect.bottom) : FXSYS_fabs(point.y - charrect.top); 342 if (curYdif + curXdif < xdif + ydif) { 343 ydif = curYdif; 344 xdif = curXdif; 345 NearPos = pos; 346 } 347 } 348 } 349 ++pos; 350 } 351 if (pos >= m_charList.GetSize()) { 352 pos = NearPos; 353 } 354 return pos; 355 } 356 CFX_WideString CPDF_TextPage::GetTextByRect(CFX_FloatRect rect) const 357 { 358 CFX_WideString strText; 359 if(m_ParseOptions.m_bGetCharCodeOnly || !m_IsParsered) { 360 return strText; 361 } 362 int nCount = m_charList.GetSize(); 363 int pos = 0; 364 FX_FLOAT posy = 0; 365 FX_BOOL IsContainPreChar = FALSE; 366 FX_BOOL ISAddLineFeed = FALSE; 367 while (pos < nCount) { 368 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++); 369 if (IsRectIntersect(rect, charinfo.m_CharBox)) { 370 if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar && ISAddLineFeed) { 371 posy = charinfo.m_OriginY; 372 if (strText.GetLength() > 0) { 373 strText += L"\r\n"; 374 } 375 } 376 IsContainPreChar = TRUE; 377 ISAddLineFeed = FALSE; 378 if (charinfo.m_Unicode) { 379 strText += charinfo.m_Unicode; 380 } 381 } else if (charinfo.m_Unicode == 32) { 382 if (IsContainPreChar && charinfo.m_Unicode) { 383 strText += charinfo.m_Unicode; 384 IsContainPreChar = FALSE; 385 ISAddLineFeed = FALSE; 386 } 387 } else { 388 IsContainPreChar = FALSE; 389 ISAddLineFeed = TRUE; 390 } 391 } 392 return strText; 393 } 394 void CPDF_TextPage::GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const 395 { 396 if(m_ParseOptions.m_bGetCharCodeOnly) { 397 return; 398 } 399 if (!m_IsParsered) { 400 return; 401 } 402 CFX_FloatRect curRect; 403 FX_BOOL flagNewRect = TRUE; 404 CPDF_TextObject* pCurObj = NULL; 405 int nCount = m_charList.GetSize(); 406 int pos = 0; 407 while (pos < nCount) { 408 PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++); 409 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) { 410 continue; 411 } 412 if(pos == 494) { 413 int a = 0; 414 } 415 if (IsRectIntersect(rect, info_curchar.m_CharBox)) { 416 if(!pCurObj) { 417 pCurObj = info_curchar.m_pTextObj; 418 } 419 if (pCurObj != info_curchar.m_pTextObj) { 420 resRectArray.Add(curRect); 421 pCurObj = info_curchar.m_pTextObj; 422 flagNewRect = TRUE; 423 } 424 if (flagNewRect) { 425 curRect = info_curchar.m_CharBox; 426 flagNewRect = FALSE; 427 curRect.Normalize(); 428 } else { 429 info_curchar.m_CharBox.Normalize(); 430 if (curRect.left > info_curchar.m_CharBox.left) { 431 curRect.left = info_curchar.m_CharBox.left; 432 } 433 if (curRect.right < info_curchar.m_CharBox.right) { 434 curRect.right = info_curchar.m_CharBox.right; 435 } 436 if ( curRect.top < info_curchar.m_CharBox.top) { 437 curRect.top = info_curchar.m_CharBox.top; 438 } 439 if (curRect.bottom > info_curchar.m_CharBox.bottom) { 440 curRect.bottom = info_curchar.m_CharBox.bottom; 441 } 442 } 443 } 444 } 445 resRectArray.Add(curRect); 446 return; 447 } 448 int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const 449 { 450 if(m_ParseOptions.m_bGetCharCodeOnly) { 451 return -3; 452 } 453 CPDF_Point point(x, y); 454 return GetIndexAtPos(point, xTorelance, yTorelance); 455 } 456 int CPDF_TextPage::GetOrderByDirection(int order, int direction) const 457 { 458 if(m_ParseOptions.m_bGetCharCodeOnly) { 459 return -3; 460 } 461 if (!m_IsParsered) { 462 return -3; 463 } 464 if (direction == FPDFTEXT_RIGHT || direction == FPDFTEXT_LEFT) { 465 order += direction; 466 while(order >= 0 && order < m_charList.GetSize()) { 467 PAGECHAR_INFO cinfo = *(PAGECHAR_INFO*)m_charList.GetAt(order); 468 if (cinfo.m_Flag != FPDFTEXT_CHAR_GENERATED) { 469 break; 470 } else { 471 if (cinfo.m_Unicode == TEXT_LINEFEED_CHAR || cinfo.m_Unicode == TEXT_RETURN_CHAR) { 472 order += direction; 473 } else { 474 break; 475 } 476 } 477 } 478 if (order >= m_charList.GetSize()) { 479 order = -2; 480 } 481 return order; 482 } 483 PAGECHAR_INFO charinfo; 484 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(order); 485 CPDF_Point curPos(charinfo.m_OriginX, charinfo.m_OriginY); 486 FX_FLOAT difPosY = 0.0, minXdif = 1000; 487 int minIndex = -2; 488 int index = order; 489 FX_FLOAT height = charinfo.m_CharBox.Height(); 490 if (direction == FPDFTEXT_UP) { 491 minIndex = -1; 492 while (1) { 493 if (--index < 0) { 494 return -1; 495 } 496 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 497 if (FXSYS_fabs(charinfo.m_OriginY - curPos.y) > FX_MAX(height, charinfo.m_CharBox.Height()) / 2) { 498 difPosY = charinfo.m_OriginY; 499 minIndex = index; 500 break; 501 } 502 } 503 FX_FLOAT PreXdif = charinfo.m_OriginX - curPos.x; 504 minXdif = PreXdif; 505 if (PreXdif == 0) { 506 return index; 507 } 508 FX_FLOAT curXdif = 0; 509 while (--index >= 0) { 510 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 511 if (difPosY != charinfo.m_OriginY) { 512 break; 513 } 514 curXdif = charinfo.m_OriginX - curPos.x; 515 if (curXdif == 0) { 516 return index; 517 } 518 int signflag = 0; 519 if (curXdif > 0) { 520 signflag = 1; 521 } else { 522 signflag = -1; 523 } 524 if (signflag * PreXdif < 0) { 525 if (FXSYS_fabs(PreXdif) < FXSYS_fabs(curXdif)) { 526 return index + 1; 527 } else { 528 return index; 529 } 530 } 531 if (FXSYS_fabs(curXdif) < FXSYS_fabs(minXdif)) { 532 minIndex = index; 533 minXdif = curXdif; 534 } 535 PreXdif = curXdif; 536 if (difPosY != charinfo.m_OriginY) { 537 break; 538 } 539 } 540 return minIndex; 541 } else if(FPDFTEXT_DOWN) { 542 minIndex = -2; 543 while (1) { 544 if (++index > m_charList.GetSize() - 1) { 545 return minIndex; 546 } 547 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 548 if (FXSYS_fabs(charinfo.m_OriginY - curPos.y) > FX_MAX(height, charinfo.m_CharBox.Height()) / 2) { 549 difPosY = charinfo.m_OriginY; 550 minIndex = index; 551 break; 552 } 553 } 554 FX_FLOAT PreXdif = charinfo.m_OriginX - curPos.x; 555 minXdif = PreXdif; 556 if (PreXdif == 0) { 557 return index; 558 } 559 FX_FLOAT curXdif = 0; 560 while (++index < m_charList.GetSize()) { 561 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 562 if (difPosY != charinfo.m_OriginY) { 563 break; 564 } 565 curXdif = charinfo.m_OriginX - curPos.x; 566 if (curXdif == 0) { 567 return index; 568 } 569 int signflag = 0; 570 if (curXdif > 0) { 571 signflag = 1; 572 } else { 573 signflag = -1; 574 } 575 if (signflag * PreXdif < 0) { 576 if (FXSYS_fabs(PreXdif) < FXSYS_fabs(curXdif)) { 577 return index - 1; 578 } else { 579 return index; 580 } 581 } 582 if (FXSYS_fabs(curXdif) < FXSYS_fabs(minXdif)) { 583 minXdif = curXdif; 584 minIndex = index; 585 } 586 PreXdif = curXdif; 587 } 588 return minIndex; 589 } 590 } 591 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO & info) const 592 { 593 if(m_ParseOptions.m_bGetCharCodeOnly) { 594 return; 595 } 596 if (!m_IsParsered) { 597 return; 598 } 599 if (index < 0 || index >= m_charList.GetSize()) { 600 return; 601 } 602 PAGECHAR_INFO charinfo; 603 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 604 info.m_Charcode = charinfo.m_CharCode; 605 info.m_OriginX = charinfo.m_OriginX; 606 info.m_OriginY = charinfo.m_OriginY; 607 info.m_Unicode = charinfo.m_Unicode; 608 info.m_Flag = charinfo.m_Flag; 609 info.m_CharBox = charinfo.m_CharBox; 610 info.m_pTextObj = charinfo.m_pTextObj; 611 if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont()) { 612 info.m_FontSize = charinfo.m_pTextObj->GetFontSize(); 613 } 614 info.m_Matrix.Copy(charinfo.m_Matrix); 615 return; 616 } 617 void CPDF_TextPage::CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const 618 { 619 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start); 620 PAGECHAR_INFO charinfo2 = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1); 621 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag && FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) { 622 return; 623 } 624 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) { 625 PAGECHAR_INFO charinfo1 = charinfo; 626 int startIndex = start; 627 while(FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag && charinfo1.m_Index == charinfo.m_Index) { 628 startIndex--; 629 if (startIndex < 0) { 630 break; 631 } 632 charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex); 633 } 634 startIndex++; 635 start = startIndex; 636 } 637 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) { 638 PAGECHAR_INFO charinfo3 = charinfo2; 639 int endIndex = start + nCount - 1; 640 while(FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag && charinfo3.m_Index == charinfo2.m_Index) { 641 endIndex++; 642 if (endIndex >= m_charList.GetSize()) { 643 break; 644 } 645 charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex); 646 } 647 endIndex--; 648 nCount = endIndex - start + 1; 649 } 650 } 651 CFX_WideString CPDF_TextPage::GetPageText(int start , int nCount) const 652 { 653 if (!m_IsParsered || nCount == 0) { 654 return L""; 655 } 656 if (start < 0) { 657 start = 0; 658 } 659 if (nCount == -1) { 660 nCount = m_charList.GetSize() - start; 661 return m_TextBuf.GetWideString().Mid(start, m_TextBuf.GetWideString().GetLength()); 662 } 663 if(nCount <= 0 || m_charList.GetSize() <= 0) { 664 return L""; 665 } 666 if(nCount + start > m_charList.GetSize() - 1) { 667 nCount = m_charList.GetSize() - start; 668 } 669 if (nCount <= 0) { 670 return L""; 671 } 672 CheckMarkedContentObject(start, nCount); 673 int startindex = 0; 674 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start); 675 int startOffset = 0; 676 while(charinfo.m_Index == -1) { 677 startOffset++; 678 if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) { 679 return L""; 680 } 681 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset); 682 } 683 startindex = charinfo.m_Index; 684 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1); 685 int nCountOffset = 0; 686 while (charinfo.m_Index == -1) { 687 nCountOffset++; 688 if (nCountOffset >= nCount) { 689 return L""; 690 } 691 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1); 692 } 693 nCount = start + nCount - nCountOffset - startindex; 694 if(nCount <= 0) { 695 return L""; 696 } 697 return m_TextBuf.GetWideString().Mid(startindex, nCount); 698 } 699 int CPDF_TextPage::CountRects(int start, int nCount) 700 { 701 if(m_ParseOptions.m_bGetCharCodeOnly) { 702 return -1; 703 } 704 if (!m_IsParsered) { 705 return -1; 706 } 707 if (start < 0) { 708 return -1; 709 } 710 if (nCount == -1 || nCount + start > m_charList.GetSize() ) { 711 nCount = m_charList.GetSize() - start; 712 } 713 m_SelRects.RemoveAll(); 714 GetRectArray(start, nCount, m_SelRects); 715 return m_SelRects.GetSize(); 716 } 717 void CPDF_TextPage::GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const 718 { 719 if(m_ParseOptions.m_bGetCharCodeOnly) { 720 return ; 721 } 722 if (!m_IsParsered || rectIndex < 0 || rectIndex >= m_SelRects.GetSize()) { 723 return; 724 } 725 left = m_SelRects.GetAt(rectIndex).left; 726 top = m_SelRects.GetAt(rectIndex).top; 727 right = m_SelRects.GetAt(rectIndex).right; 728 bottom = m_SelRects.GetAt(rectIndex).bottom; 729 } 730 FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate) 731 { 732 if(m_ParseOptions.m_bGetCharCodeOnly) { 733 return FALSE; 734 } 735 if(end == start) { 736 return FALSE; 737 } 738 FX_FLOAT dx, dy; 739 FPDF_CHAR_INFO info1, info2; 740 GetCharInfo(start, info1); 741 GetCharInfo(end, info2); 742 while(info2.m_CharBox.Width() == 0 || info2.m_CharBox.Height() == 0) { 743 end--; 744 if(end <= start) { 745 return FALSE; 746 } 747 GetCharInfo(end, info2); 748 } 749 dx = (info2.m_OriginX - info1.m_OriginX); 750 dy = (info2.m_OriginY - info1.m_OriginY); 751 if(dx == 0) { 752 if(dy > 0) { 753 Rotate = 90; 754 } else if (dy < 0) { 755 Rotate = 270; 756 } else { 757 Rotate = 0; 758 } 759 } else { 760 float a = FXSYS_atan2(dy, dx); 761 Rotate = (int)(a * 180 / FX_PI + 0.5); 762 } 763 if(Rotate < 0) { 764 Rotate = -Rotate; 765 } else if(Rotate > 0) { 766 Rotate = 360 - Rotate; 767 } 768 return TRUE; 769 } 770 FX_BOOL CPDF_TextPage::GetBaselineRotate(CFX_FloatRect rect , int& Rotate) 771 { 772 if(m_ParseOptions.m_bGetCharCodeOnly) { 773 return FALSE; 774 } 775 int start, end, count, n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom, TRUE); 776 if(n < 1) { 777 return FALSE; 778 } 779 if(n > 1) { 780 GetBoundedSegment(n - 1, start, count); 781 end = start + count - 1; 782 GetBoundedSegment(0, start, count); 783 } else { 784 GetBoundedSegment(0, start, count); 785 end = start + count - 1; 786 } 787 return GetBaselineRotate(start, end, Rotate); 788 } 789 FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) 790 { 791 if(m_ParseOptions.m_bGetCharCodeOnly) { 792 return FALSE; 793 } 794 if (!m_IsParsered || rectIndex < 0 || rectIndex > m_SelRects.GetSize()) { 795 return FALSE; 796 } 797 CFX_FloatRect rect = m_SelRects.GetAt(rectIndex); 798 return GetBaselineRotate(rect , Rotate); 799 } 800 int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains ) 801 { 802 if(m_ParseOptions.m_bGetCharCodeOnly) { 803 return -1; 804 } 805 m_Segment.RemoveAll(); 806 if (!m_IsParsered) { 807 return -1; 808 } 809 CFX_FloatRect rect(left, bottom, right, top); 810 rect.Normalize(); 811 int nCount = m_charList.GetSize(); 812 int pos = 0; 813 FPDF_SEGMENT segment; 814 segment.m_Start = 0; 815 segment.m_nCount = 0; 816 FX_BOOL segmentStatus = 0; 817 FX_BOOL IsContainPreChar = FALSE; 818 while (pos < nCount) { 819 if(pos == 493) { 820 int a = 0; 821 } 822 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos); 823 if(bContains && rect.Contains(charinfo.m_CharBox)) { 824 if (segmentStatus == 0 || segmentStatus == 2) { 825 segment.m_Start = pos; 826 segment.m_nCount = 1; 827 segmentStatus = 1; 828 } else if (segmentStatus == 1) { 829 segment.m_nCount++; 830 } 831 IsContainPreChar = TRUE; 832 } else if (!bContains && (IsRectIntersect(rect, charinfo.m_CharBox) || rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) { 833 if (segmentStatus == 0 || segmentStatus == 2) { 834 segment.m_Start = pos; 835 segment.m_nCount = 1; 836 segmentStatus = 1; 837 } else if (segmentStatus == 1) { 838 segment.m_nCount++; 839 } 840 IsContainPreChar = TRUE; 841 } else if (charinfo.m_Unicode == 32) { 842 if (IsContainPreChar == TRUE) { 843 if (segmentStatus == 0 || segmentStatus == 2) { 844 segment.m_Start = pos; 845 segment.m_nCount = 1; 846 segmentStatus = 1; 847 } else if (segmentStatus == 1) { 848 segment.m_nCount++; 849 } 850 IsContainPreChar = FALSE; 851 } else { 852 if (segmentStatus == 1) { 853 segmentStatus = 2; 854 m_Segment.Add(segment); 855 segment.m_Start = 0; 856 segment.m_nCount = 0; 857 } 858 } 859 } else { 860 if (segmentStatus == 1) { 861 segmentStatus = 2; 862 m_Segment.Add(segment); 863 segment.m_Start = 0; 864 segment.m_nCount = 0; 865 } 866 IsContainPreChar = FALSE; 867 } 868 pos++; 869 } 870 if (segmentStatus == 1) { 871 segmentStatus = 2; 872 m_Segment.Add(segment); 873 segment.m_Start = 0; 874 segment.m_nCount = 0; 875 } 876 return m_Segment.GetSize(); 877 } 878 void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const 879 { 880 if(m_ParseOptions.m_bGetCharCodeOnly) { 881 return ; 882 } 883 if (index < 0 || index >= m_Segment.GetSize()) { 884 return; 885 } 886 start = m_Segment.GetAt(index).m_Start; 887 count = m_Segment.GetAt(index).m_nCount; 888 } 889 int CPDF_TextPage::GetWordBreak(int index, int direction) const 890 { 891 if(m_ParseOptions.m_bGetCharCodeOnly) { 892 return -1; 893 } 894 if (!m_IsParsered) { 895 return -1; 896 } 897 if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT) { 898 return -1; 899 } 900 if (index < 0 || index >= m_charList.GetSize()) { 901 return -1; 902 } 903 PAGECHAR_INFO charinfo; 904 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 905 if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) { 906 return index; 907 } 908 if (!IsLetter(charinfo.m_Unicode)) { 909 return index; 910 } 911 int breakPos = index; 912 if (direction == FPDFTEXT_LEFT) { 913 while (--breakPos > 0) { 914 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos); 915 if (!IsLetter(charinfo.m_Unicode)) { 916 return breakPos; 917 } 918 } 919 return breakPos; 920 } else if (direction == FPDFTEXT_RIGHT) { 921 while (++breakPos < m_charList.GetSize()) { 922 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos); 923 if (!IsLetter(charinfo.m_Unicode)) { 924 return breakPos; 925 } 926 } 927 return breakPos; 928 } 929 return breakPos; 930 } 931 FX_INT32 CPDF_TextPage::FindTextlineFlowDirection() 932 { 933 if (!m_pPage) { 934 return -1; 935 } 936 const FX_INT32 nPageWidth = (FX_INT32)((CPDF_Page*)m_pPage)->GetPageWidth(); 937 const FX_INT32 nPageHeight = (FX_INT32)((CPDF_Page*)m_pPage)->GetPageHeight(); 938 CFX_ByteArray nHorizontalMask; 939 if (!nHorizontalMask.SetSize(nPageWidth)) { 940 return -1; 941 } 942 FX_BYTE* pDataH = nHorizontalMask.GetData(); 943 CFX_ByteArray nVerticalMask; 944 if (!nVerticalMask.SetSize(nPageHeight)) { 945 return -1; 946 } 947 FX_BYTE* pDataV = nVerticalMask.GetData(); 948 FX_INT32 index = 0; 949 FX_FLOAT fLineHeight = 0.0f; 950 CPDF_PageObject* pPageObj = NULL; 951 FX_POSITION pos = NULL; 952 pos = m_pPage->GetFirstObjectPosition(); 953 if(!pos) { 954 return -1; 955 } 956 while(pos) { 957 pPageObj = m_pPage->GetNextObject(pos); 958 if(NULL == pPageObj) { 959 continue; 960 } 961 if(PDFPAGE_TEXT != pPageObj->m_Type) { 962 continue; 963 } 964 FX_INT32 minH = (FX_INT32)pPageObj->m_Left < 0 ? 0 : (FX_INT32)pPageObj->m_Left; 965 FX_INT32 maxH = (FX_INT32)pPageObj->m_Right > nPageWidth ? nPageWidth : (FX_INT32)pPageObj->m_Right; 966 FX_INT32 minV = (FX_INT32)pPageObj->m_Bottom < 0 ? 0 : (FX_INT32)pPageObj->m_Bottom; 967 FX_INT32 maxV = (FX_INT32)pPageObj->m_Top > nPageHeight ? nPageHeight : (FX_INT32)pPageObj->m_Top; 968 if (minH >= maxH || minV >= maxV){ 969 continue; 970 } 971 972 FXSYS_memset8(pDataH + minH, 1, maxH - minH); 973 FXSYS_memset8(pDataV + minV, 1, maxV - minV); 974 975 if (fLineHeight <= 0.0f) { 976 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom; 977 } 978 979 pPageObj = NULL; 980 } 981 FX_INT32 nStartH = 0; 982 FX_INT32 nEndH = 0; 983 FX_FLOAT nSumH = 0.0f; 984 for (index = 0; index < nPageWidth; index++) 985 if(1 == nHorizontalMask[index]) { 986 break; 987 } 988 nStartH = index; 989 for (index = nPageWidth; index > 0; index--) 990 if(1 == nHorizontalMask[index - 1]) { 991 break; 992 } 993 nEndH = index; 994 for (index = nStartH; index < nEndH; index++) { 995 nSumH += nHorizontalMask[index]; 996 } 997 nSumH /= nEndH - nStartH; 998 FX_INT32 nStartV = 0; 999 FX_INT32 nEndV = 0; 1000 FX_FLOAT nSumV = 0.0f; 1001 for (index = 0; index < nPageHeight; index++) 1002 if(1 == nVerticalMask[index]) { 1003 break; 1004 } 1005 nStartV = index; 1006 for (index = nPageHeight; index > 0; index--) 1007 if(1 == nVerticalMask[index - 1]) { 1008 break; 1009 } 1010 nEndV = index; 1011 for (index = nStartV; index < nEndV; index++) { 1012 nSumV += nVerticalMask[index]; 1013 } 1014 nSumV /= nEndV - nStartV; 1015 if ((nEndV - nStartV) < (FX_INT32)(2 * fLineHeight)) { 1016 return 0; 1017 } 1018 if ((nEndH - nStartH) < (FX_INT32)(2 * fLineHeight)) { 1019 return 1; 1020 } 1021 if (nSumH > 0.8f) { 1022 return 0; 1023 } 1024 if (nSumH - nSumV > 0.0f) { 1025 return 0; 1026 } 1027 if (nSumV - nSumH > 0.0f) { 1028 return 1; 1029 } 1030 return -1; 1031 } 1032 void CPDF_TextPage::ProcessObject() 1033 { 1034 CPDF_PageObject* pPageObj = NULL; 1035 if (!m_pPage) { 1036 return; 1037 } 1038 FX_POSITION pos; 1039 pos = m_pPage->GetFirstObjectPosition(); 1040 if (!pos) { 1041 return; 1042 } 1043 m_TextlineDir = FindTextlineFlowDirection(); 1044 int nCount = 0; 1045 while (pos) { 1046 pPageObj = m_pPage->GetNextObject(pos); 1047 if(pPageObj) { 1048 if(pPageObj->m_Type == PDFPAGE_TEXT) { 1049 if (nCount == 3) { 1050 nCount = nCount; 1051 } 1052 CFX_AffineMatrix matrix; 1053 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos); 1054 nCount++; 1055 } else if (pPageObj->m_Type == PDFPAGE_FORM) { 1056 CFX_AffineMatrix formMatrix(1, 0, 0, 1, 0, 0); 1057 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix); 1058 } 1059 } 1060 pPageObj = NULL; 1061 } 1062 int count = m_LineObj.GetSize(); 1063 for(int i = 0; i < count; i++) { 1064 ProcessTextObject(m_LineObj.GetAt(i)); 1065 } 1066 m_LineObj.RemoveAll(); 1067 CloseTempLine(); 1068 } 1069 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, CFX_AffineMatrix formMatrix) 1070 { 1071 CPDF_PageObject* pPageObj = NULL; 1072 FX_POSITION pos; 1073 if (!pFormObj) { 1074 return; 1075 } 1076 pos = pFormObj->m_pForm->GetFirstObjectPosition(); 1077 if (!pos) { 1078 return; 1079 } 1080 CFX_AffineMatrix curFormMatrix; 1081 curFormMatrix.Copy(pFormObj->m_FormMatrix); 1082 curFormMatrix.Concat(formMatrix); 1083 while (pos) { 1084 pPageObj = pFormObj->m_pForm->GetNextObject(pos); 1085 if(pPageObj) { 1086 if(pPageObj->m_Type == PDFPAGE_TEXT) { 1087 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos); 1088 } else if (pPageObj->m_Type == PDFPAGE_FORM) { 1089 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix); 1090 } 1091 } 1092 pPageObj = NULL; 1093 } 1094 } 1095 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const 1096 { 1097 if(charCode == -1) { 1098 return 0; 1099 } 1100 int w = pFont->GetCharWidthF(charCode); 1101 if(w == 0) { 1102 CFX_ByteString str; 1103 pFont->AppendChar(str, charCode); 1104 w = pFont->GetStringWidth(str, 1); 1105 if(w == 0) { 1106 FX_RECT BBox; 1107 pFont->GetCharBBox(charCode, BBox); 1108 w = BBox.right - BBox.left; 1109 } 1110 } 1111 return w; 1112 } 1113 void CPDF_TextPage::OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str) 1114 { 1115 FX_INT32 start, count; 1116 FX_INT32 ret = pBidi->GetBidiInfo(start, count); 1117 if(ret == 2) { 1118 for(int i = start + count - 1; i >= start; i--) { 1119 m_TextBuf.AppendChar(str.GetAt(i)); 1120 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i)); 1121 } 1122 } else { 1123 int end = start + count ; 1124 for(int i = start; i < end; i++) { 1125 m_TextBuf.AppendChar(str.GetAt(i)); 1126 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i)); 1127 } 1128 } 1129 } 1130 void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i) 1131 { 1132 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i); 1133 FX_WCHAR wChar = str.GetAt(i); 1134 #ifdef FOXIT_CHROME_BUILD 1135 if(!IsControlChar(&Info)) { 1136 #else 1137 if(wChar != 0xfffe) { 1138 #endif 1139 Info.m_Index = m_TextBuf.GetLength(); 1140 if (wChar >= 0xFB00 && wChar <= 0xFB06) { 1141 FX_LPWSTR pDst = NULL; 1142 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 1143 if (nCount >= 1) { 1144 pDst = FX_Alloc(FX_WCHAR, nCount); 1145 if (!pDst) { 1146 return; 1147 } 1148 FX_Unicode_GetNormalization(wChar, pDst); 1149 for (int nIndex = 0; nIndex < nCount; nIndex++) { 1150 PAGECHAR_INFO Info2 = Info; 1151 Info2.m_Unicode = pDst[nIndex]; 1152 Info2.m_Flag = FPDFTEXT_CHAR_PIECE; 1153 m_TextBuf.AppendChar(Info2.m_Unicode); 1154 if( !m_ParseOptions.m_bGetCharCodeOnly) { 1155 m_charList.Add(Info2); 1156 } 1157 } 1158 FX_Free(pDst); 1159 return; 1160 } 1161 } 1162 m_TextBuf.AppendChar(wChar); 1163 } else { 1164 Info.m_Index = -1; 1165 } 1166 if( !m_ParseOptions.m_bGetCharCodeOnly) { 1167 m_charList.Add(Info); 1168 } 1169 } 1170 void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i) 1171 { 1172 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i); 1173 #ifdef FOXIT_CHROME_BUILD 1174 if(!IsControlChar(&Info)) { 1175 #else 1176 if(str.GetAt(i) != 0xfffe) { 1177 #endif 1178 Info.m_Index = m_TextBuf.GetLength(); 1179 FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE); 1180 FX_LPWSTR pDst = NULL; 1181 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 1182 if (nCount >= 1) { 1183 pDst = FX_Alloc(FX_WCHAR, nCount); 1184 if (!pDst) { 1185 return; 1186 } 1187 FX_Unicode_GetNormalization(wChar, pDst); 1188 for (int nIndex = 0; nIndex < nCount; nIndex++) { 1189 PAGECHAR_INFO Info2 = Info; 1190 Info2.m_Unicode = pDst[nIndex]; 1191 Info2.m_Flag = FPDFTEXT_CHAR_PIECE; 1192 m_TextBuf.AppendChar(Info2.m_Unicode); 1193 if( !m_ParseOptions.m_bGetCharCodeOnly) { 1194 m_charList.Add(Info2); 1195 } 1196 } 1197 FX_Free(pDst); 1198 return; 1199 } else { 1200 Info.m_Unicode = wChar; 1201 } 1202 m_TextBuf.AppendChar(Info.m_Unicode); 1203 } else { 1204 Info.m_Index = -1; 1205 } 1206 if( !m_ParseOptions.m_bGetCharCodeOnly) { 1207 m_charList.Add(Info); 1208 } 1209 } 1210 void CPDF_TextPage::CloseTempLine() 1211 { 1212 int count1 = m_TempCharList.GetSize(); 1213 if (count1 <= 0) { 1214 return; 1215 } 1216 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); 1217 CFX_WideString str = m_TempTextBuf.GetWideString(); 1218 CFX_WordArray order; 1219 FX_BOOL bR2L = FALSE; 1220 FX_INT32 start = 0, count = 0, i = 0; 1221 int nR2L = 0, nL2R = 0; 1222 FX_BOOL bPrevSpace = FALSE; 1223 for (i = 0; i < str.GetLength(); i++) { 1224 if(str.GetAt(i) == 32) { 1225 if(bPrevSpace) { 1226 m_TempTextBuf.Delete(i, 1); 1227 m_TempCharList.Delete(i); 1228 str.Delete(i); 1229 count1 --; 1230 i--; 1231 continue; 1232 } 1233 bPrevSpace = TRUE; 1234 } else { 1235 bPrevSpace = FALSE; 1236 } 1237 if(BidiChar && BidiChar->AppendChar(str.GetAt(i))) { 1238 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 1239 order.Add(start); 1240 order.Add(count); 1241 order.Add(ret); 1242 if(!bR2L) { 1243 if(ret == 2) { 1244 nR2L++; 1245 } else if (ret == 1) { 1246 nL2R++; 1247 } 1248 } 1249 } 1250 } 1251 if(BidiChar && BidiChar->EndChar()) { 1252 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 1253 order.Add(start); 1254 order.Add(count); 1255 order.Add(ret); 1256 if(!bR2L) { 1257 if(ret == 2) { 1258 nR2L++; 1259 } else if(ret == 1) { 1260 nL2R++; 1261 } 1262 } 1263 } 1264 if(nR2L > 0 && nR2L >= nL2R) { 1265 bR2L = TRUE; 1266 } 1267 if(this->m_parserflag == FPDFTEXT_RLTB || bR2L) { 1268 int count = order.GetSize(); 1269 for(int j = count - 1; j > 0; j -= 3) { 1270 int ret = order.GetAt(j); 1271 int start = order.GetAt(j - 2); 1272 int count1 = order.GetAt(j - 1); 1273 if(ret == 2 || ret == 0) { 1274 for(int i = start + count1 - 1; i >= start; i--) { 1275 AddCharInfoByRLDirection(str, i); 1276 } 1277 } else { 1278 i = j; 1279 FX_BOOL bSymbol = FALSE; 1280 while(i > 0 && order.GetAt(i) != 2) { 1281 bSymbol = !order.GetAt(i); 1282 i -= 3; 1283 } 1284 int end = start + count1 ; 1285 int n = 0; 1286 if(bSymbol) { 1287 n = i + 6; 1288 } else { 1289 n = i + 3; 1290 } 1291 if(n >= j) { 1292 for(int m = start; m < end; m++) { 1293 AddCharInfoByLRDirection(str, m); 1294 } 1295 } else { 1296 i = j; 1297 j = n; 1298 for(; n <= i; n += 3) { 1299 int ret = order.GetAt(n); 1300 int start = order.GetAt(n - 2); 1301 int count1 = order.GetAt(n - 1); 1302 int end = start + count1 ; 1303 for(int m = start; m < end; m++) { 1304 AddCharInfoByLRDirection(str, m); 1305 } 1306 } 1307 } 1308 } 1309 } 1310 } else { 1311 int count = order.GetSize(); 1312 FX_BOOL bL2R = FALSE; 1313 for(int j = 0; j < count; j += 3) { 1314 int ret = order.GetAt(j + 2); 1315 int start = order.GetAt(j); 1316 int count1 = order.GetAt(j + 1); 1317 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) { 1318 int i = j + 3; 1319 while(bR2L && i < count) { 1320 if(order.GetAt(i + 2) == 1) { 1321 break; 1322 } else { 1323 i += 3; 1324 } 1325 } 1326 if(i == 3) { 1327 j = -3; 1328 bL2R = TRUE; 1329 continue; 1330 } 1331 int end = m_TempCharList.GetSize() - 1; 1332 if(i < count) { 1333 end = order.GetAt(i) - 1; 1334 } 1335 j = i - 3; 1336 for(int n = end; n >= start; n--) { 1337 AddCharInfoByRLDirection(str, n); 1338 } 1339 } else { 1340 int end = start + count1 ; 1341 for(int i = start; i < end; i++) { 1342 AddCharInfoByLRDirection(str, i); 1343 } 1344 } 1345 } 1346 } 1347 int ntext = m_TextBuf.GetSize(); 1348 ntext = m_charList.GetSize(); 1349 order.RemoveAll(); 1350 m_TempCharList.RemoveAll(); 1351 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength()); 1352 BidiChar->Release(); 1353 } 1354 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj, CFX_AffineMatrix formMatrix, FX_POSITION ObjPos) 1355 { 1356 CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right, pTextObj->m_Top); 1357 if(FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f ) { 1358 return; 1359 } 1360 int count = m_LineObj.GetSize(); 1361 PDFTEXT_Obj Obj; 1362 Obj.m_pTextObj = pTextObj; 1363 Obj.m_formMatrix = formMatrix; 1364 if(count == 0) { 1365 m_LineObj.Add(Obj); 1366 return; 1367 } 1368 if (IsSameAsPreTextObject(pTextObj, ObjPos)) { 1369 return; 1370 } 1371 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1); 1372 CPDF_TextObjectItem item; 1373 int nItem = prev_Obj.m_pTextObj->CountItems(); 1374 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item); 1375 FX_FLOAT prev_width = GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) * prev_Obj.m_pTextObj->GetFontSize() / 1000; 1376 CFX_AffineMatrix prev_matrix; 1377 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix); 1378 prev_width = FXSYS_fabs(prev_width); 1379 prev_matrix.Concat(prev_Obj.m_formMatrix); 1380 prev_width = prev_matrix.TransformDistance(prev_width); 1381 pTextObj->GetItemInfo(0, &item); 1382 FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) * pTextObj->GetFontSize() / 1000; 1383 this_width = FXSYS_fabs(this_width); 1384 CFX_AffineMatrix this_matrix; 1385 pTextObj->GetTextMatrix(&this_matrix); 1386 this_width = FXSYS_fabs(this_width); 1387 this_matrix.Concat(formMatrix); 1388 this_width = this_matrix.TransformDistance(this_width); 1389 FX_FLOAT threshold = prev_width > this_width ? prev_width / 4 : this_width / 4; 1390 FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(), prev_y = prev_Obj.m_pTextObj->GetPosY(); 1391 prev_Obj.m_formMatrix.Transform(prev_x, prev_y); 1392 m_DisplayMatrix.Transform(prev_x, prev_y); 1393 FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY(); 1394 formMatrix.Transform(this_x, this_y); 1395 m_DisplayMatrix.Transform(this_x, this_y); 1396 if (FXSYS_fabs(this_y - prev_y) > threshold * 2) { 1397 for(int i = 0; i < count; i++) { 1398 ProcessTextObject(m_LineObj.GetAt(i)); 1399 } 1400 m_LineObj.RemoveAll(); 1401 m_LineObj.Add(Obj); 1402 return; 1403 } 1404 int i = 0; 1405 if(m_ParseOptions.m_bNormalizeObjs) { 1406 for(i = count - 1; i >= 0; i--) { 1407 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i); 1408 CFX_AffineMatrix prev_matrix; 1409 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix); 1410 FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(), Prev_y = prev_Obj.m_pTextObj->GetPosY(); 1411 prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y); 1412 m_DisplayMatrix.Transform(Prev_x, Prev_y); 1413 if(this_x >= Prev_x) { 1414 if(i == count - 1) { 1415 m_LineObj.Add(Obj); 1416 } else { 1417 m_LineObj.InsertAt(i + 1, Obj); 1418 } 1419 break; 1420 } 1421 } 1422 if(i < 0) { 1423 m_LineObj.InsertAt(0, Obj); 1424 } 1425 } else { 1426 m_LineObj.Add(Obj); 1427 } 1428 } 1429 FX_INT32 CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) 1430 { 1431 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1432 CPDF_ContentMarkData* pMarkData = (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject(); 1433 if(!pMarkData) { 1434 return FPDFTEXT_MC_PASS; 1435 } 1436 int nContentMark = pMarkData->CountItems(); 1437 if (nContentMark < 1) { 1438 return FPDFTEXT_MC_PASS; 1439 } 1440 CFX_WideString actText; 1441 FX_BOOL bExist = FALSE; 1442 CPDF_Dictionary* pDict = NULL; 1443 int n = 0; 1444 for (n = 0; n < nContentMark; n++) { 1445 CPDF_ContentMarkItem& item = pMarkData->GetItem(n); 1446 CFX_ByteString tagStr = (CFX_ByteString)item.GetName(); 1447 pDict = (CPDF_Dictionary*)item.GetParam(); 1448 CPDF_String* temp = (CPDF_String*)pDict->GetElement(FX_BSTRC("ActualText")); 1449 if (temp) { 1450 bExist = TRUE; 1451 actText = temp->GetUnicodeText(); 1452 } 1453 } 1454 if (!bExist) { 1455 return FPDFTEXT_MC_PASS; 1456 } 1457 if (m_pPreTextObj) { 1458 if (CPDF_ContentMarkData* pPreMarkData = (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) { 1459 if (pPreMarkData->CountItems() == n) { 1460 CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1); 1461 if (pDict == item.GetParam()) { 1462 return FPDFTEXT_MC_DONE; 1463 } 1464 } 1465 } 1466 } 1467 CPDF_Font* pFont = pTextObj->GetFont(); 1468 FX_STRSIZE nItems = actText.GetLength(); 1469 if (nItems < 1) { 1470 return FPDFTEXT_MC_PASS; 1471 } 1472 bExist = FALSE; 1473 for (FX_STRSIZE i = 0; i < nItems; i++) { 1474 FX_WCHAR wChar = actText.GetAt(i); 1475 if (-1 == pFont->CharCodeFromUnicode(wChar)) { 1476 continue; 1477 } else { 1478 bExist = TRUE; 1479 break; 1480 } 1481 } 1482 if (!bExist) { 1483 return FPDFTEXT_MC_PASS; 1484 } 1485 bExist = FALSE; 1486 for (FX_STRSIZE j = 0; j < nItems; j++) { 1487 FX_WCHAR wChar = actText.GetAt(j); 1488 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) { 1489 bExist = TRUE; 1490 break; 1491 } 1492 } 1493 if (!bExist) { 1494 return FPDFTEXT_MC_DONE; 1495 } 1496 return FPDFTEXT_MC_DELAY; 1497 } 1498 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) 1499 { 1500 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1501 CPDF_ContentMarkData* pMarkData = (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject(); 1502 if(!pMarkData) { 1503 return; 1504 } 1505 int nContentMark = pMarkData->CountItems(); 1506 if (nContentMark < 1) { 1507 return; 1508 } 1509 CFX_WideString actText; 1510 CPDF_Dictionary* pDict = NULL; 1511 int n = 0; 1512 for (n = 0; n < nContentMark; n++) { 1513 CPDF_ContentMarkItem& item = pMarkData->GetItem(n); 1514 CFX_ByteString tagStr = (CFX_ByteString)item.GetName(); 1515 pDict = (CPDF_Dictionary*)item.GetParam(); 1516 CPDF_String* temp = (CPDF_String*)pDict->GetElement(FX_BSTRC("ActualText")); 1517 if (temp) { 1518 actText = temp->GetUnicodeText(); 1519 } 1520 } 1521 FX_STRSIZE nItems = actText.GetLength(); 1522 if (nItems < 1) { 1523 return; 1524 } 1525 CPDF_Font* pFont = pTextObj->GetFont(); 1526 CFX_AffineMatrix formMatrix = Obj.m_formMatrix; 1527 CFX_AffineMatrix matrix; 1528 pTextObj->GetTextMatrix(&matrix); 1529 matrix.Concat(formMatrix); 1530 FX_FLOAT fPosX = pTextObj->GetPosX(); 1531 FX_FLOAT fPosY = pTextObj->GetPosY(); 1532 int nCharInfoIndex = m_TextBuf.GetLength(); 1533 CFX_FloatRect charBox; 1534 charBox.top = pTextObj->m_Top; 1535 charBox.left = pTextObj->m_Left; 1536 charBox.right = pTextObj->m_Right; 1537 charBox.bottom = pTextObj->m_Bottom; 1538 for (FX_STRSIZE k = 0; k < nItems; k++) { 1539 FX_WCHAR wChar = actText.GetAt(k); 1540 if (wChar <= 0x80 && !isprint(wChar)) { 1541 wChar = 0x20; 1542 } 1543 if (wChar >= 0xFFFD) { 1544 continue; 1545 } 1546 PAGECHAR_INFO charinfo; 1547 charinfo.m_OriginX = fPosX; 1548 charinfo.m_OriginY = fPosY; 1549 charinfo.m_Index = nCharInfoIndex; 1550 charinfo.m_Unicode = wChar; 1551 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar); 1552 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE; 1553 charinfo.m_pTextObj = pTextObj; 1554 charinfo.m_CharBox.top = charBox.top; 1555 charinfo.m_CharBox.left = charBox.left; 1556 charinfo.m_CharBox.right = charBox.right; 1557 charinfo.m_CharBox.bottom = charBox.bottom; 1558 charinfo.m_Matrix.Copy(matrix); 1559 m_TempTextBuf.AppendChar(wChar); 1560 m_TempCharList.Add(charinfo); 1561 } 1562 } 1563 void CPDF_TextPage::FindPreviousTextObject(void) 1564 { 1565 if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) { 1566 return; 1567 } 1568 PAGECHAR_INFO preChar; 1569 if (m_TempCharList.GetSize() >= 1) { 1570 preChar = *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1); 1571 } else { 1572 preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1); 1573 } 1574 if (preChar.m_pTextObj) { 1575 m_pPreTextObj = preChar.m_pTextObj; 1576 } 1577 } 1578 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) 1579 { 1580 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1581 if(FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f ) { 1582 return; 1583 } 1584 CFX_AffineMatrix formMatrix = Obj.m_formMatrix; 1585 CPDF_Font* pFont = pTextObj->GetFont(); 1586 CFX_AffineMatrix matrix; 1587 pTextObj->GetTextMatrix(&matrix); 1588 matrix.Concat(formMatrix); 1589 FX_INT32 bPreMKC = PreMarkedContent(Obj); 1590 if (FPDFTEXT_MC_DONE == bPreMKC) { 1591 m_pPreTextObj = pTextObj; 1592 m_perMatrix.Copy(formMatrix); 1593 return; 1594 } 1595 int result = 0; 1596 if (m_pPreTextObj) { 1597 result = ProcessInsertObject(pTextObj, formMatrix); 1598 if (2 == result) { 1599 m_CurlineRect = CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top); 1600 } else { 1601 m_CurlineRect.Union(CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top)); 1602 } 1603 PAGECHAR_INFO generateChar; 1604 if (result == 1) { 1605 if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) { 1606 if (!formMatrix.IsIdentity()) { 1607 generateChar.m_Matrix.Copy(formMatrix); 1608 } 1609 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR); 1610 m_TempCharList.Add(generateChar); 1611 } 1612 } else if(result == 2) { 1613 CloseTempLine(); 1614 if(m_TextBuf.GetSize()) { 1615 if(m_ParseOptions.m_bGetCharCodeOnly) { 1616 m_TextBuf.AppendChar(TEXT_RETURN_CHAR); 1617 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR); 1618 } else { 1619 if(GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) { 1620 m_TextBuf.AppendChar(TEXT_RETURN_CHAR); 1621 if (!formMatrix.IsIdentity()) { 1622 generateChar.m_Matrix.Copy(formMatrix); 1623 } 1624 m_charList.Add(generateChar); 1625 } 1626 if(GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) { 1627 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR); 1628 if (!formMatrix.IsIdentity()) { 1629 generateChar.m_Matrix.Copy(formMatrix); 1630 } 1631 m_charList.Add(generateChar); 1632 } 1633 } 1634 } 1635 } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) { 1636 FX_INT32 nChars = pTextObj->CountChars(); 1637 if (nChars == 1) { 1638 CPDF_TextObjectItem item; 1639 pTextObj->GetCharInfo(0, &item); 1640 CFX_WideString wstrItem = pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 1641 if(wstrItem.IsEmpty()) { 1642 wstrItem += (FX_WCHAR)item.m_CharCode; 1643 } 1644 FX_WCHAR curChar = wstrItem.GetAt(0); 1645 if (0x2D == curChar || 0xAD == curChar) { 1646 return; 1647 } 1648 } 1649 while (m_TempTextBuf.GetSize() > 0 && m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() - 1) == 0x20) { 1650 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1651 m_TempCharList.Delete(m_TempCharList.GetSize() - 1); 1652 } 1653 PAGECHAR_INFO* cha = (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1); 1654 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1655 #ifdef FOXIT_CHROME_BUILD 1656 cha->m_Unicode = 0x2; 1657 cha->m_Flag = FPDFTEXT_CHAR_HYPHEN; 1658 m_TempTextBuf.AppendChar(0xfffe); 1659 #else 1660 cha->m_Unicode = 0; 1661 m_TempTextBuf.AppendChar(0xfffe); 1662 #endif 1663 } 1664 } else { 1665 m_CurlineRect = CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top); 1666 } 1667 if (FPDFTEXT_MC_DELAY == bPreMKC) { 1668 ProcessMarkedContent(Obj); 1669 m_pPreTextObj = pTextObj; 1670 m_perMatrix.Copy(formMatrix); 1671 return; 1672 } 1673 m_pPreTextObj = pTextObj; 1674 m_perMatrix.Copy(formMatrix); 1675 int nItems = pTextObj->CountItems(); 1676 FX_FLOAT spacing = 0; 1677 FX_FLOAT baseSpace = 0.0; 1678 FX_BOOL bAllChar = TRUE; 1679 if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { 1680 spacing = matrix.TransformDistance(pTextObj->m_TextState.GetObject()->m_CharSpace); 1681 baseSpace = spacing; 1682 for (int i = 0; i < nItems; i++) { 1683 CPDF_TextObjectItem item; 1684 pTextObj->GetItemInfo(i, &item); 1685 if (item.m_CharCode == (FX_DWORD) - 1) { 1686 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1687 FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000; 1688 if(kerning + spacing < baseSpace) { 1689 baseSpace = kerning + spacing; 1690 } 1691 bAllChar = FALSE; 1692 } 1693 } 1694 spacing = 0; 1695 if(baseSpace < 0.0 || (nItems == 3 && !bAllChar)) { 1696 baseSpace = 0.0; 1697 } 1698 } 1699 for (int i = 0; i < nItems; i++) { 1700 CPDF_TextObjectItem item; 1701 PAGECHAR_INFO charinfo; 1702 charinfo.m_OriginX = 0; 1703 charinfo.m_OriginY = 0; 1704 pTextObj->GetItemInfo(i, &item); 1705 if (item.m_CharCode == (FX_DWORD) - 1) { 1706 CFX_WideString str = m_TempTextBuf.GetWideString(); 1707 if(str.IsEmpty()) { 1708 str = m_TextBuf.GetWideString(); 1709 } 1710 if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) { 1711 continue; 1712 } 1713 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1714 spacing = -fontsize_h * item.m_OriginX / 1000; 1715 continue; 1716 } 1717 FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace; 1718 if (charSpace > 0.001) { 1719 spacing += matrix.TransformDistance(charSpace); 1720 } else if(charSpace < -0.001) { 1721 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace)); 1722 } 1723 spacing -= baseSpace; 1724 if (spacing && i > 0) { 1725 int last_width = 0; 1726 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1727 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 1728 FX_FLOAT threshold = 0; 1729 if (space_charcode != -1) { 1730 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ; 1731 } 1732 if (threshold > fontsize_h / 3) { 1733 threshold = 0; 1734 } else { 1735 threshold /= 2; 1736 } 1737 if (threshold == 0) { 1738 threshold = fontsize_h; 1739 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); 1740 threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width; 1741 int nDivide = 6; 1742 if (threshold < 300) { 1743 nDivide = 2; 1744 } else if (threshold < 500) { 1745 nDivide = 4; 1746 } else if (threshold < 700) { 1747 nDivide = 5; 1748 } 1749 threshold = threshold / nDivide; 1750 threshold = fontsize_h * threshold / 1000; 1751 } 1752 if (threshold && (spacing && spacing >= threshold) ) { 1753 charinfo.m_Unicode = TEXT_BLANK_CHAR; 1754 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED; 1755 charinfo.m_pTextObj = pTextObj; 1756 charinfo.m_Index = m_TextBuf.GetLength(); 1757 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR); 1758 charinfo.m_CharCode = -1; 1759 charinfo.m_Matrix.Copy(formMatrix); 1760 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY); 1761 charinfo.m_CharBox = CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY); 1762 m_TempCharList.Add(charinfo); 1763 } 1764 if (item.m_CharCode == (FX_DWORD) - 1) { 1765 continue; 1766 } 1767 } 1768 spacing = 0; 1769 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode); 1770 FX_BOOL bNoUnicode = FALSE; 1771 FX_WCHAR wChar = wstrItem.GetAt(0); 1772 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) { 1773 if(wstrItem.IsEmpty()) { 1774 wstrItem += (FX_WCHAR)item.m_CharCode; 1775 } else { 1776 wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode); 1777 } 1778 bNoUnicode = TRUE; 1779 } 1780 charinfo.m_Index = -1; 1781 charinfo.m_CharCode = item.m_CharCode; 1782 if(bNoUnicode) { 1783 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE; 1784 } else { 1785 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL; 1786 } 1787 charinfo.m_pTextObj = pTextObj; 1788 charinfo.m_OriginX = 0, charinfo.m_OriginY = 0; 1789 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY); 1790 FX_RECT rect(0, 0, 0, 0); 1791 rect.Intersect(0, 0, 0, 0); 1792 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect); 1793 charinfo.m_CharBox.top = rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY; 1794 charinfo.m_CharBox.left = rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX; 1795 charinfo.m_CharBox.right = rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX; 1796 charinfo.m_CharBox.bottom = rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY; 1797 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) { 1798 charinfo.m_CharBox.top = charinfo.m_CharBox.bottom + pTextObj->GetFontSize(); 1799 } 1800 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) { 1801 charinfo.m_CharBox.right = charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode); 1802 } 1803 matrix.TransformRect(charinfo.m_CharBox); 1804 charinfo.m_Matrix.Copy(matrix); 1805 if (wstrItem.IsEmpty()) { 1806 charinfo.m_Unicode = 0; 1807 m_TempCharList.Add(charinfo); 1808 m_TempTextBuf.AppendChar(0xfffe); 1809 continue; 1810 } else { 1811 int nTotal = wstrItem.GetLength(); 1812 int n = 0; 1813 FX_BOOL bDel = FALSE; 1814 while (n < m_TempCharList.GetSize() && n < 7) { 1815 n++; 1816 PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - n); 1817 if(charinfo1->m_CharCode == charinfo.m_CharCode && 1818 charinfo1->m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() && 1819 FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize() && 1820 FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize() ) { 1821 bDel = TRUE; 1822 break; 1823 } 1824 } 1825 if(!bDel) { 1826 for (int nIndex = 0; nIndex < nTotal; nIndex++) { 1827 charinfo.m_Unicode = wstrItem.GetAt(nIndex); 1828 if (charinfo.m_Unicode) { 1829 charinfo.m_Index = m_TextBuf.GetLength(); 1830 m_TempTextBuf.AppendChar(charinfo.m_Unicode); 1831 } else { 1832 m_TempTextBuf.AppendChar(0xfffe); 1833 } 1834 m_TempCharList.Add(charinfo); 1835 } 1836 } else if(i == 0) { 1837 CFX_WideString str = m_TempTextBuf.GetWideString(); 1838 if (!str.IsEmpty() && str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) { 1839 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1840 m_TempCharList.Delete(m_TempCharList.GetSize() - 1); 1841 } 1842 } 1843 } 1844 } 1845 } 1846 FX_INT32 CPDF_TextPage::GetTextObjectWritingMode(const CPDF_TextObject* pTextObj) 1847 { 1848 FX_INT32 nChars = pTextObj->CountChars(); 1849 if (nChars == 1) { 1850 return m_TextlineDir; 1851 } 1852 CPDF_TextObjectItem first, last; 1853 pTextObj->GetCharInfo(0, &first); 1854 pTextObj->GetCharInfo(nChars - 1, &last); 1855 CFX_Matrix textMatrix; 1856 pTextObj->GetTextMatrix(&textMatrix); 1857 textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY); 1858 textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY); 1859 FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX); 1860 FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY); 1861 if (dX <= 0.0001f && dY <= 0.0001f) { 1862 return -1; 1863 } 1864 CFX_VectorF v; 1865 v.Set(dX, dY); 1866 v.Normalize(); 1867 if (v.y <= 0.0872f) { 1868 if (v.x <= 0.0872f) { 1869 return m_TextlineDir; 1870 } 1871 return 0; 1872 } else if (v.x <= 0.0872f) { 1873 return 1; 1874 } 1875 return m_TextlineDir; 1876 } 1877 FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) 1878 { 1879 CFX_WideString strCurText = m_TempTextBuf.GetWideString(); 1880 if(strCurText.GetLength() == 0) { 1881 strCurText = m_TextBuf.GetWideString(); 1882 } 1883 FX_STRSIZE nCount = strCurText.GetLength(); 1884 int nIndex = nCount - 1; 1885 FX_WCHAR wcTmp = strCurText.GetAt(nIndex); 1886 while(wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) { 1887 wcTmp = strCurText.GetAt(--nIndex); 1888 } 1889 if (0x2D == wcTmp || 0xAD == wcTmp) { 1890 if (--nIndex > 0) { 1891 FX_WCHAR preChar = strCurText.GetAt((nIndex)); 1892 if (((preChar >= L'A' && preChar <= L'Z') || (preChar >= L'a' && preChar <= L'z')) 1893 && ((curChar >= L'A' && curChar <= L'Z') || (curChar >= L'a' && curChar <= L'z'))) { 1894 return TRUE; 1895 } 1896 } 1897 int size = m_TempCharList.GetSize(); 1898 PAGECHAR_INFO preChar; 1899 if (size) { 1900 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1]; 1901 } else { 1902 size = m_charList.GetSize(); 1903 if(size == 0) { 1904 return FALSE; 1905 } 1906 preChar = (PAGECHAR_INFO)m_charList[size - 1]; 1907 } 1908 if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag) 1909 if (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode) { 1910 return TRUE; 1911 } 1912 } 1913 return FALSE; 1914 } 1915 int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj, CFX_AffineMatrix formMatrix) 1916 { 1917 FindPreviousTextObject(); 1918 FX_BOOL bNewline = FALSE; 1919 int WritingMode = GetTextObjectWritingMode(pObj); 1920 if(WritingMode == -1) { 1921 WritingMode = GetTextObjectWritingMode(m_pPreTextObj); 1922 } 1923 CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right, pObj->m_Top); 1924 CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, m_pPreTextObj->m_Right, m_pPreTextObj->m_Top); 1925 CPDF_TextObjectItem PrevItem, item; 1926 int nItem = m_pPreTextObj->CountItems(); 1927 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem); 1928 pObj->GetItemInfo(0, &item); 1929 CFX_WideString wstrItem = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 1930 if(wstrItem.IsEmpty()) { 1931 wstrItem += (FX_WCHAR)item.m_CharCode; 1932 } 1933 FX_WCHAR curChar = wstrItem.GetAt(0); 1934 if(WritingMode == 0) { 1935 if(this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) { 1936 FX_FLOAT top = this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top; 1937 FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom : prev_rect.bottom; 1938 if(bottom >= top) { 1939 if(IsHyphen(curChar)) { 1940 return 3; 1941 } 1942 return 2; 1943 } 1944 } 1945 } else if (WritingMode == 1) { 1946 if(this_rect.Width() > pObj->GetFontSize() * 0.1f && prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) { 1947 FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left : m_CurlineRect.left; 1948 FX_FLOAT right = this_rect.right < m_CurlineRect.right ? this_rect.right : m_CurlineRect.right; 1949 if(right <= left) { 1950 if(IsHyphen(curChar)) { 1951 return 3; 1952 } 1953 return 2; 1954 } 1955 } 1956 } 1957 FX_FLOAT last_pos = PrevItem.m_OriginX; 1958 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont()); 1959 FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000; 1960 last_width = FXSYS_fabs(last_width); 1961 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont()); 1962 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000; 1963 this_width = FXSYS_fabs(this_width); 1964 FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4; 1965 CFX_AffineMatrix prev_matrix, prev_reverse; 1966 m_pPreTextObj->GetTextMatrix(&prev_matrix); 1967 prev_matrix.Concat(m_perMatrix); 1968 prev_reverse.SetReverse(prev_matrix); 1969 FX_FLOAT x = pObj->GetPosX(); 1970 FX_FLOAT y = pObj->GetPosY(); 1971 formMatrix.Transform(x, y); 1972 prev_reverse.Transform(x, y); 1973 if(last_width < this_width) { 1974 threshold = prev_reverse.TransformDistance(threshold); 1975 } 1976 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom, m_pPreTextObj->m_Right, pObj->m_Top); 1977 CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, m_pPreTextObj->m_Right, m_pPreTextObj->m_Top); 1978 CFX_FloatRect rect3 = rect1; 1979 rect1.Intersect(rect2); 1980 if (WritingMode == 0) { 1981 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) 1982 || ((y > threshold * 2 || y < threshold * -3) && (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) { 1983 bNewline = TRUE; 1984 if(nItem > 1 ) { 1985 CPDF_TextObjectItem tempItem; 1986 m_pPreTextObj->GetItemInfo(0, &tempItem); 1987 CFX_AffineMatrix m; 1988 m_pPreTextObj->GetTextMatrix(&m); 1989 if(PrevItem.m_OriginX > tempItem.m_OriginX && 1990 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 && 1991 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 1992 && m.b < 0.1 && m.c < 0.1 ) { 1993 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000, m_pPreTextObj->m_Top); 1994 if(re.Contains(pObj->GetPosX(), pObj->GetPosY())) { 1995 bNewline = FALSE; 1996 } else { 1997 CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top); 1998 if(re.Contains(m_pPreTextObj->GetPosX(), m_pPreTextObj->GetPosY())) { 1999 bNewline = FALSE; 2000 } 2001 } 2002 } 2003 } 2004 } 2005 } 2006 if(bNewline) { 2007 if(IsHyphen(curChar)) { 2008 return 3; 2009 } 2010 return 2; 2011 } 2012 FX_INT32 nChars = pObj->CountChars(); 2013 if (nChars == 1 && ( 0x2D == curChar || 0xAD == curChar)) 2014 if (IsHyphen(curChar)) { 2015 return 3; 2016 } 2017 CFX_WideString PrevStr = m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode); 2018 FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1); 2019 CFX_AffineMatrix matrix; 2020 pObj->GetTextMatrix(&matrix); 2021 matrix.Concat(formMatrix); 2022 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth); 2023 threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 : (threshold > 800 ? threshold / 6 : threshold / 5)) : (threshold / 2); 2024 if(nLastWidth >= nThisWidth) { 2025 threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize()); 2026 } else { 2027 threshold *= FXSYS_fabs(pObj->GetFontSize()); 2028 threshold = matrix.TransformDistance(threshold); 2029 threshold = prev_reverse.TransformDistance(threshold); 2030 } 2031 threshold /= 1000; 2032 if((threshold < 1.4881 && threshold > 1.4879) 2033 || (threshold < 1.39001 && threshold > 1.38999)) { 2034 threshold *= 1.5; 2035 } 2036 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ') 2037 if (curChar != L' ' && preChar != L' ') { 2038 if((x - last_pos - last_width) > threshold || (last_pos - x - last_width) > threshold) { 2039 return 1; 2040 } 2041 if(x < 0 && (last_pos - x - last_width) > threshold) { 2042 return 1; 2043 } 2044 if((x - last_pos - last_width) > this_width || (x - last_pos - this_width) > last_width ) { 2045 return 1; 2046 } 2047 } 2048 return 0; 2049 } 2050 FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2) 2051 { 2052 if (!pTextObj1 || !pTextObj2) { 2053 return FALSE; 2054 } 2055 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_Right, pTextObj2->m_Top); 2056 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_Right, pTextObj1->m_Top); 2057 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() && !m_ParseOptions.m_bGetCharCodeOnly) { 2058 FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left); 2059 int nCount = m_charList.GetSize(); 2060 if (nCount >= 2) { 2061 PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2]; 2062 FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width(); 2063 if (dbXdif > dbSpace) { 2064 return FALSE; 2065 } 2066 } 2067 } 2068 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) { 2069 rcPreObj.Intersect(rcCurObj); 2070 if (rcPreObj.IsEmpty()) { 2071 return FALSE; 2072 } 2073 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) { 2074 return FALSE; 2075 } 2076 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) { 2077 return FALSE; 2078 } 2079 } 2080 int nPreCount = pTextObj2->CountItems(); 2081 int nCurCount = pTextObj1->CountItems(); 2082 if (nPreCount != nCurCount) { 2083 return FALSE; 2084 } 2085 CPDF_TextObjectItem itemPer, itemCur; 2086 for (int i = 0; i < nPreCount; i++) { 2087 pTextObj2->GetItemInfo(i, &itemPer); 2088 pTextObj1->GetItemInfo(i, &itemCur); 2089 if (itemCur.m_CharCode != itemPer.m_CharCode) { 2090 return FALSE; 2091 } 2092 } 2093 if(FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) > GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont())*pTextObj2->GetFontSize() / 1000 * 0.9 || 2094 FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) > 2095 FX_MAX(FX_MAX(rcPreObj.Height() , rcPreObj.Width()), pTextObj2->GetFontSize()) / 8) { 2096 return FALSE; 2097 } 2098 return TRUE; 2099 } 2100 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos) 2101 { 2102 if (!pTextObj) { 2103 return FALSE; 2104 } 2105 int i = 0; 2106 if (!ObjPos) { 2107 ObjPos = m_pPage->GetLastObjectPosition(); 2108 } 2109 CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos); 2110 while (i < 5 && ObjPos) { 2111 pObj = m_pPage->GetPrevObject(ObjPos); 2112 if(pObj == pTextObj) { 2113 continue; 2114 } 2115 if(pObj->m_Type != PDFPAGE_TEXT) { 2116 continue; 2117 } 2118 if(IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) { 2119 return TRUE; 2120 } 2121 i++; 2122 } 2123 return FALSE; 2124 } 2125 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) 2126 { 2127 int size = m_TempCharList.GetSize(); 2128 PAGECHAR_INFO preChar; 2129 if (size) { 2130 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1]; 2131 } else { 2132 size = m_charList.GetSize(); 2133 if(size == 0) { 2134 return FALSE; 2135 } 2136 preChar = (PAGECHAR_INFO)m_charList[size - 1]; 2137 } 2138 info.m_Index = m_TextBuf.GetLength(); 2139 info.m_Unicode = unicode; 2140 info.m_pTextObj = NULL; 2141 info.m_CharCode = -1; 2142 info.m_Flag = FPDFTEXT_CHAR_GENERATED; 2143 int preWidth = 0; 2144 if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD) - 1) { 2145 preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont()); 2146 } 2147 FX_FLOAT fs = 0; 2148 if(preChar.m_pTextObj) { 2149 fs = preChar.m_pTextObj->GetFontSize(); 2150 } else { 2151 fs = preChar.m_CharBox.Height(); 2152 } 2153 if(!fs) { 2154 fs = 1; 2155 } 2156 info.m_OriginX = preChar.m_OriginX + preWidth * (fs) / 1000; 2157 info.m_OriginY = preChar.m_OriginY; 2158 info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX, info.m_OriginY); 2159 return TRUE; 2160 } 2161 FX_BOOL CPDF_TextPage::IsRectIntersect(CFX_FloatRect rect1, CFX_FloatRect rect2) 2162 { 2163 rect1.Intersect(rect2); 2164 if(rect1.IsEmpty()) { 2165 return FALSE; 2166 } 2167 return TRUE; 2168 } 2169 FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) 2170 { 2171 if (unicode < L'A') { 2172 return FALSE; 2173 } 2174 if (unicode > L'Z' && unicode < L'a') { 2175 return FALSE; 2176 } 2177 if (unicode > L'z') { 2178 return FALSE; 2179 } 2180 return TRUE; 2181 } 2182 CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage) 2183 : m_IsFind(FALSE), 2184 m_pTextPage(NULL) 2185 { 2186 if (!pTextPage) { 2187 return; 2188 } 2189 CPDF_ModuleMgr* pPDFModule = CPDF_ModuleMgr::Get(); 2190 m_pTextPage = pTextPage; 2191 m_strText = m_pTextPage->GetPageText(); 2192 int nCount = pTextPage->CountChars(); 2193 if(nCount) { 2194 m_CharIndex.Add(0); 2195 } 2196 for(int i = 0; i < nCount; i++) { 2197 FPDF_CHAR_INFO info; 2198 pTextPage->GetCharInfo(i, info); 2199 int indexSize = m_CharIndex.GetSize(); 2200 if(info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) { 2201 if(indexSize % 2) { 2202 m_CharIndex.Add(1); 2203 } else { 2204 if(indexSize <= 0) { 2205 continue; 2206 } 2207 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1); 2208 } 2209 } else { 2210 if(indexSize % 2) { 2211 if(indexSize <= 0) { 2212 continue; 2213 } 2214 m_CharIndex.SetAt(indexSize - 1, i + 1); 2215 } else { 2216 m_CharIndex.Add(i + 1); 2217 } 2218 } 2219 } 2220 int indexSize = m_CharIndex.GetSize(); 2221 if(indexSize % 2) { 2222 m_CharIndex.RemoveAt(indexSize - 1); 2223 } 2224 m_resStart = 0; 2225 m_resEnd = -1; 2226 } 2227 int CPDF_TextPageFind::GetCharIndex(int index) const 2228 { 2229 return m_pTextPage->CharIndexFromTextIndex(index); 2230 int indexSize = m_CharIndex.GetSize(); 2231 int count = 0; 2232 for(int i = 0; i < indexSize; i += 2) { 2233 count += m_CharIndex.GetAt(i + 1); 2234 if(count > index) { 2235 return index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i); 2236 } 2237 } 2238 return -1; 2239 } 2240 FX_BOOL CPDF_TextPageFind::FindFirst(CFX_WideString findwhat, int flags, int startPos) 2241 { 2242 if (!m_pTextPage) { 2243 return FALSE; 2244 } 2245 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) { 2246 m_strText = m_pTextPage->GetPageText(); 2247 } 2248 m_findWhat = findwhat; 2249 m_flags = flags; 2250 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; 2251 if (m_strText.IsEmpty()) { 2252 m_IsFind = FALSE; 2253 return TRUE; 2254 } 2255 FX_STRSIZE len = findwhat.GetLength(); 2256 if (!m_bMatchCase) { 2257 findwhat.MakeLower(); 2258 m_strText.MakeLower(); 2259 } 2260 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD; 2261 m_findNextStart = startPos; 2262 if (startPos == -1) { 2263 m_findPreStart = m_strText.GetLength() - 1; 2264 } else { 2265 m_findPreStart = startPos; 2266 } 2267 m_csFindWhatArray.RemoveAll(); 2268 int i = 0; 2269 while(i < len) { 2270 if(findwhat.GetAt(i) != ' ') { 2271 break; 2272 } 2273 i++; 2274 } 2275 if(i < len) { 2276 ExtractFindWhat(findwhat); 2277 } else { 2278 m_csFindWhatArray.Add(findwhat); 2279 } 2280 if(m_csFindWhatArray.GetSize() <= 0) { 2281 return FALSE; 2282 } 2283 m_IsFind = TRUE; 2284 m_resStart = 0; 2285 m_resEnd = -1; 2286 return TRUE; 2287 } 2288 FX_BOOL CPDF_TextPageFind::FindNext() 2289 { 2290 if (!m_pTextPage) { 2291 return FALSE; 2292 } 2293 m_resArray.RemoveAll(); 2294 if(m_findNextStart == -1) { 2295 return FALSE; 2296 } 2297 if(m_strText.IsEmpty()) { 2298 m_IsFind = FALSE; 2299 return m_IsFind; 2300 } 2301 int strLen = m_strText.GetLength(); 2302 if (m_findNextStart > strLen - 1) { 2303 m_IsFind = FALSE; 2304 return m_IsFind; 2305 } 2306 int nCount = m_csFindWhatArray.GetSize(); 2307 int nResultPos = 0; 2308 int nStartPos = 0; 2309 nStartPos = m_findNextStart; 2310 FX_BOOL bSpaceStart = FALSE; 2311 for(int iWord = 0; iWord < nCount; iWord++) { 2312 CFX_WideString csWord = m_csFindWhatArray[iWord]; 2313 if(csWord.IsEmpty()) { 2314 if(iWord == nCount - 1) { 2315 FX_WCHAR strInsert = m_strText.GetAt(nStartPos); 2316 if(strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR || strInsert == TEXT_RETURN_CHAR || strInsert == 160) { 2317 nResultPos = nStartPos + 1; 2318 break; 2319 } 2320 iWord = -1; 2321 } else if(iWord == 0) { 2322 bSpaceStart = TRUE; 2323 } 2324 continue; 2325 } 2326 int endIndex; 2327 nResultPos = m_strText.Find(csWord, nStartPos); 2328 if (nResultPos == -1) { 2329 m_IsFind = FALSE; 2330 return m_IsFind; 2331 } 2332 endIndex = nResultPos + csWord.GetLength() - 1; 2333 if(iWord == 0) { 2334 m_resStart = nResultPos; 2335 } 2336 FX_BOOL bMatch = TRUE; 2337 if(iWord != 0 && !bSpaceStart) { 2338 int PreResEndPos = nStartPos; 2339 int curChar = csWord.GetAt(0); 2340 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; 2341 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); 2342 if(nStartPos == nResultPos && !(_IsIgnoreSpaceCharacter(lastChar) || _IsIgnoreSpaceCharacter(curChar))) { 2343 bMatch = FALSE; 2344 } 2345 for(int d = PreResEndPos; d < nResultPos; d++) { 2346 FX_WCHAR strInsert = m_strText.GetAt(d); 2347 if(strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 2348 bMatch = FALSE; 2349 break; 2350 } 2351 } 2352 } else if(bSpaceStart) { 2353 if(nResultPos > 0) { 2354 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); 2355 if(strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 2356 bMatch = FALSE; 2357 m_resStart = nResultPos; 2358 } else { 2359 m_resStart = nResultPos - 1; 2360 } 2361 } 2362 } 2363 if(m_bMatchWholeWord && bMatch) { 2364 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); 2365 } 2366 nStartPos = endIndex + 1; 2367 if(!bMatch) { 2368 iWord = -1; 2369 if(bSpaceStart) { 2370 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); 2371 } else { 2372 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); 2373 } 2374 } 2375 } 2376 m_resEnd = nResultPos + m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1; 2377 m_IsFind = TRUE; 2378 int resStart = GetCharIndex(m_resStart); 2379 int resEnd = GetCharIndex(m_resEnd); 2380 m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray); 2381 if(m_flags & FPDFTEXT_CONSECUTIVE) { 2382 m_findNextStart = m_resStart + 1; 2383 m_findPreStart = m_resEnd - 1; 2384 } else { 2385 m_findNextStart = m_resEnd + 1; 2386 m_findPreStart = m_resStart - 1; 2387 } 2388 return m_IsFind; 2389 } 2390 FX_BOOL CPDF_TextPageFind::FindPrev() 2391 { 2392 if (!m_pTextPage) { 2393 return FALSE; 2394 } 2395 m_resArray.RemoveAll(); 2396 if(m_strText.IsEmpty() || m_findPreStart < 0) { 2397 m_IsFind = FALSE; 2398 return m_IsFind; 2399 } 2400 CPDF_TextPageFind findEngine(m_pTextPage); 2401 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags); 2402 if(!ret) { 2403 m_IsFind = FALSE; 2404 return m_IsFind; 2405 } 2406 int order = -1, MatchedCount = 0; 2407 while(ret) { 2408 ret = findEngine.FindNext(); 2409 if(ret) { 2410 int order1 = findEngine.GetCurOrder() ; 2411 int MatchedCount1 = findEngine.GetMatchedCount(); 2412 if(((order1 + MatchedCount1) - 1) > m_findPreStart) { 2413 break; 2414 } 2415 order = order1; 2416 MatchedCount = MatchedCount1; 2417 } 2418 } 2419 if(order == -1) { 2420 m_IsFind = FALSE; 2421 return m_IsFind; 2422 } 2423 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); 2424 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); 2425 m_IsFind = TRUE; 2426 m_pTextPage->GetRectArray(order, MatchedCount, m_resArray); 2427 if(m_flags & FPDFTEXT_CONSECUTIVE) { 2428 m_findNextStart = m_resStart + 1; 2429 m_findPreStart = m_resEnd - 1; 2430 } else { 2431 m_findNextStart = m_resEnd + 1; 2432 m_findPreStart = m_resStart - 1; 2433 } 2434 return m_IsFind; 2435 } 2436 void CPDF_TextPageFind::ExtractFindWhat(CFX_WideString findwhat) 2437 { 2438 if(findwhat.IsEmpty()) { 2439 return ; 2440 } 2441 int index = 0; 2442 while(1) { 2443 CFX_WideString csWord = TEXT_EMPTY; 2444 int ret = ExtractSubString(csWord, findwhat, index, TEXT_BLANK_CHAR); 2445 if(csWord.IsEmpty()) { 2446 if(ret) { 2447 m_csFindWhatArray.Add(CFX_WideString(L"")); 2448 index++; 2449 continue; 2450 } else { 2451 break; 2452 } 2453 } 2454 int pos = 0; 2455 FX_BOOL bLastIgnore = FALSE; 2456 while(pos < csWord.GetLength()) { 2457 CFX_WideString curStr = csWord.Mid(pos, 1); 2458 FX_WCHAR curChar = csWord.GetAt(pos); 2459 if (_IsIgnoreSpaceCharacter(curChar)) { 2460 if (pos > 0 && curChar == 0x2019) { 2461 pos++; 2462 continue; 2463 } 2464 if (pos > 0 ) { 2465 CFX_WideString preStr = csWord.Mid(0, pos); 2466 m_csFindWhatArray.Add(preStr); 2467 } 2468 m_csFindWhatArray.Add(curStr); 2469 if (pos == csWord.GetLength() - 1) { 2470 csWord.Empty(); 2471 break; 2472 } 2473 csWord = csWord.Right(csWord.GetLength() - pos - 1); 2474 pos = 0; 2475 bLastIgnore = TRUE; 2476 continue; 2477 } else { 2478 bLastIgnore = FALSE; 2479 } 2480 pos++; 2481 } 2482 if (!csWord.IsEmpty()) { 2483 m_csFindWhatArray.Add(csWord); 2484 } 2485 index++; 2486 } 2487 return; 2488 } 2489 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(CFX_WideString csPageText, int startPos, int endPos) 2490 { 2491 int char_left = 0; 2492 int char_right = 0; 2493 int char_count = endPos - startPos + 1; 2494 if(char_count < 1) { 2495 return FALSE; 2496 } 2497 if (char_count == 1 && csPageText.GetAt(startPos) > 255) { 2498 return TRUE; 2499 } 2500 if(startPos - 1 >= 0 ) { 2501 char_left = csPageText.GetAt(startPos - 1); 2502 } 2503 if(startPos + char_count < csPageText.GetLength()) { 2504 char_right = csPageText.GetAt(startPos + char_count); 2505 } 2506 if(char_left == 0x61) { 2507 int a = 0; 2508 } 2509 if ((char_left > 'A' && char_left < 'a') || (char_left > 'a' && char_left < 'z') || (char_left > 0xfb00 && char_left < 0xfb06) || (char_left >= '0' && char_left <= '9') || 2510 (char_right > 'A' && char_right < 'a') || (char_right > 'a' && char_right < 'z') || (char_right > 0xfb00 && char_right < 0xfb06) || (char_right >= '0' && char_right <= '9')) { 2511 return FALSE; 2512 } 2513 if(!(('A' > char_left || char_left > 'Z') && ('a' > char_left || char_left > 'z') 2514 && ('A' > char_right || char_right > 'Z') && ('a' > char_right || char_right > 'z'))) { 2515 return FALSE; 2516 } 2517 if (char_count > 0) { 2518 if (csPageText.GetAt(startPos) >= L'0' && csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && char_left <= L'9') { 2519 return FALSE; 2520 } 2521 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && char_right >= L'0' && char_right <= L'9') { 2522 return FALSE; 2523 } 2524 } 2525 return TRUE; 2526 } 2527 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString, 2528 int iSubString, FX_WCHAR chSep) 2529 { 2530 if (lpszFullString == NULL) { 2531 return FALSE; 2532 } 2533 while (iSubString--) { 2534 lpszFullString = FXSYS_wcschr(lpszFullString, chSep); 2535 if (lpszFullString == NULL) { 2536 rString.Empty(); 2537 return FALSE; 2538 } 2539 lpszFullString++; 2540 while(*lpszFullString == chSep) { 2541 lpszFullString++; 2542 } 2543 } 2544 FX_LPCWSTR lpchEnd = FXSYS_wcschr(lpszFullString, chSep); 2545 int nLen = (lpchEnd == NULL) ? 2546 (int)FXSYS_wcslen(lpszFullString) : (int)(lpchEnd - lpszFullString); 2547 ASSERT(nLen >= 0); 2548 FXSYS_memcpy32(rString.GetBuffer(nLen), lpszFullString, nLen * sizeof(FX_WCHAR)); 2549 rString.ReleaseBuffer(); 2550 return TRUE; 2551 } 2552 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString str) 2553 { 2554 CFX_WideString str2; 2555 str2.Empty(); 2556 int nlen = str.GetLength(); 2557 for(int i = nlen - 1; i >= 0; i--) { 2558 str2 += str.GetAt(i); 2559 } 2560 return str2; 2561 } 2562 void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const 2563 { 2564 rects.Copy(m_resArray); 2565 } 2566 int CPDF_TextPageFind::GetCurOrder() const 2567 { 2568 return GetCharIndex(m_resStart); 2569 } 2570 int CPDF_TextPageFind::GetMatchedCount()const 2571 { 2572 int resStart = GetCharIndex(m_resStart); 2573 int resEnd = GetCharIndex(m_resEnd); 2574 return resEnd - resStart + 1; 2575 } 2576 CPDF_LinkExtract::CPDF_LinkExtract() 2577 : m_pTextPage(NULL), 2578 m_IsParserd(FALSE) 2579 { 2580 } 2581 CPDF_LinkExtract::~CPDF_LinkExtract() 2582 { 2583 DeleteLinkList(); 2584 } 2585 FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) 2586 { 2587 if (!pTextPage || !pTextPage->IsParsered()) { 2588 return FALSE; 2589 } 2590 m_pTextPage = (const CPDF_TextPage*)pTextPage; 2591 m_strPageText = m_pTextPage->GetPageText(0, -1); 2592 DeleteLinkList(); 2593 if (m_strPageText.IsEmpty()) { 2594 return FALSE; 2595 } 2596 parserLink(); 2597 m_IsParserd = TRUE; 2598 return TRUE; 2599 } 2600 void CPDF_LinkExtract::DeleteLinkList() 2601 { 2602 while (m_LinkList.GetSize()) { 2603 CPDF_LinkExt* linkinfo = NULL; 2604 linkinfo = m_LinkList.GetAt(0); 2605 m_LinkList.RemoveAt(0); 2606 delete linkinfo; 2607 } 2608 m_LinkList.RemoveAll(); 2609 } 2610 int CPDF_LinkExtract::CountLinks() const 2611 { 2612 if (!m_IsParserd) { 2613 return -1; 2614 } 2615 return m_LinkList.GetSize(); 2616 } 2617 void CPDF_LinkExtract::parserLink() 2618 { 2619 int start = 0, pos = 0; 2620 int TotalChar = m_pTextPage->CountChars(); 2621 while (pos < TotalChar) { 2622 FPDF_CHAR_INFO pageChar; 2623 m_pTextPage->GetCharInfo(pos, pageChar); 2624 if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { 2625 int nCount = pos - start; 2626 if(pos == TotalChar - 1) { 2627 nCount++; 2628 } 2629 CFX_WideString strBeCheck; 2630 strBeCheck = m_pTextPage->GetPageText(start, nCount); 2631 if (strBeCheck.GetLength() > 5) { 2632 while(strBeCheck.GetLength() > 0) { 2633 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); 2634 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { 2635 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); 2636 nCount--; 2637 } else { 2638 break; 2639 } 2640 } 2641 if (nCount > 5 && (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { 2642 if (!AppendToLinkList(start, nCount, strBeCheck)) { 2643 break; 2644 } 2645 } 2646 } 2647 start = ++pos; 2648 } else { 2649 pos++; 2650 } 2651 } 2652 } 2653 FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) 2654 { 2655 CFX_WideString str = strBeCheck; 2656 str.MakeLower(); 2657 if (str.Find(L"http://www.") != -1) { 2658 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); 2659 return TRUE; 2660 } else if (str.Find(L"http://") != -1) { 2661 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); 2662 return TRUE; 2663 } else if (str.Find(L"https://www.") != -1) { 2664 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); 2665 return TRUE; 2666 } else if (str.Find(L"https://") != -1) { 2667 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); 2668 return TRUE; 2669 } else if (str.Find(L"www.") != -1) { 2670 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 2671 strBeCheck = L"http://" + strBeCheck; 2672 return TRUE; 2673 } else { 2674 return FALSE; 2675 } 2676 } 2677 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) 2678 { 2679 str.MakeLower(); 2680 int aPos = str.Find(L'@'); 2681 if (aPos < 1) { 2682 return FALSE; 2683 } 2684 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') { 2685 return FALSE; 2686 } 2687 int i; 2688 for (i = aPos - 1; i >= 0; i--) { 2689 FX_WCHAR ch = str.GetAt(i); 2690 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || (ch >= L'0' && ch <= L'9')) { 2691 continue; 2692 } else { 2693 if (i == aPos - 1) { 2694 return FALSE; 2695 } 2696 str = str.Right(str.GetLength() - i - 1); 2697 break; 2698 } 2699 } 2700 aPos = str.Find(L'@'); 2701 if (aPos < 1) { 2702 return FALSE; 2703 } 2704 CFX_WideString strtemp = L""; 2705 for (i = 0; i < aPos; i++) { 2706 FX_WCHAR wch = str.GetAt(i); 2707 if (wch >= L'a' && wch <= L'z') { 2708 break; 2709 } else { 2710 strtemp = str.Right(str.GetLength() - i + 1); 2711 } 2712 } 2713 if (strtemp != L"") { 2714 str = strtemp; 2715 } 2716 aPos = str.Find(L'@'); 2717 if (aPos < 1) { 2718 return FALSE; 2719 } 2720 str.TrimRight(L'.'); 2721 strtemp = str; 2722 int ePos = str.Find(L'.'); 2723 if (ePos == -1) { 2724 return FALSE; 2725 } 2726 while (ePos != -1) { 2727 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1); 2728 ePos = strtemp.Find('.'); 2729 } 2730 ePos = strtemp.GetLength(); 2731 for (i = 0; i < ePos; i++) { 2732 FX_WCHAR wch = str.GetAt(i); 2733 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { 2734 continue; 2735 } else { 2736 str = str.Left(str.GetLength() - ePos + i + 1); 2737 ePos = ePos - i - 1; 2738 break; 2739 } 2740 } 2741 int nLen = str.GetLength(); 2742 for (i = aPos + 1; i < nLen - ePos; i++) { 2743 FX_WCHAR wch = str.GetAt(i); 2744 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { 2745 continue; 2746 } else { 2747 return FALSE; 2748 } 2749 } 2750 if (str.Find(L"mailto:") == -1) { 2751 str = L"mailto:" + str; 2752 } 2753 return TRUE; 2754 } 2755 FX_BOOL CPDF_LinkExtract::AppendToLinkList(int start, int count, CFX_WideString strUrl) 2756 { 2757 CPDF_LinkExt* linkInfo = NULL; 2758 linkInfo = FX_NEW CPDF_LinkExt; 2759 if (!linkInfo) { 2760 return FALSE; 2761 } 2762 linkInfo->m_strUrl = strUrl; 2763 linkInfo->m_Start = start; 2764 linkInfo->m_Count = count; 2765 m_LinkList.Add(linkInfo); 2766 return TRUE; 2767 } 2768 CFX_WideString CPDF_LinkExtract::GetURL(int index) const 2769 { 2770 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) { 2771 return L""; 2772 } 2773 CPDF_LinkExt* link = NULL; 2774 link = m_LinkList.GetAt(index); 2775 if (!link) { 2776 return L""; 2777 } 2778 return link->m_strUrl; 2779 } 2780 void CPDF_LinkExtract::GetBoundedSegment(int index, int& start, int& count) const 2781 { 2782 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) { 2783 return ; 2784 } 2785 CPDF_LinkExt* link = NULL; 2786 link = m_LinkList.GetAt(index); 2787 if (!link) { 2788 return ; 2789 } 2790 start = link->m_Start; 2791 count = link->m_Count; 2792 } 2793 void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const 2794 { 2795 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) { 2796 return; 2797 } 2798 CPDF_LinkExt* link = NULL; 2799 link = m_LinkList.GetAt(index); 2800 if (!link) { 2801 return ; 2802 } 2803 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); 2804 } 2805