1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "../../include/fpdfapi/fpdf_resource.h" 8 #include "../../include/fpdfapi/fpdf_pageobj.h" 9 #include "../../include/fpdftext/fpdf_text.h" 10 #include "../../include/fpdfapi/fpdf_page.h" 11 #include "../../include/fpdfapi/fpdf_module.h" 12 #include <ctype.h> 13 #include <algorithm> 14 #include "text_int.h" 15 16 namespace { 17 18 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) 19 { 20 if(curChar < 255 ) { 21 return FALSE; 22 } 23 if ( (curChar >= 0x0600 && curChar <= 0x06FF) 24 || (curChar >= 0xFE70 && curChar <= 0xFEFF) 25 || (curChar >= 0xFB50 && curChar <= 0xFDFF) 26 || (curChar >= 0x0400 && curChar <= 0x04FF) 27 || (curChar >= 0x0500 && curChar <= 0x052F) 28 || (curChar >= 0xA640 && curChar <= 0xA69F) 29 || (curChar >= 0x2DE0 && curChar <= 0x2DFF) 30 || curChar == 8467 31 || (curChar >= 0x2000 && curChar <= 0x206F)) { 32 return FALSE; 33 } 34 return TRUE; 35 } 36 37 FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) 38 { 39 if (threshold < 300) { 40 return threshold / 2.0f; 41 } else if (threshold < 500) { 42 return threshold / 4.0f; 43 } else if (threshold < 700) { 44 return threshold / 5.0f; 45 } 46 return threshold / 6.0f; 47 } 48 49 FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, 50 const CFX_AffineMatrix& matrix) 51 { 52 FX_FLOAT baseSpace = 0.0; 53 const int nItems = pTextObj->CountItems(); 54 if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { 55 FX_BOOL bAllChar = TRUE; 56 FX_FLOAT spacing = matrix.TransformDistance( 57 pTextObj->m_TextState.GetObject()->m_CharSpace); 58 baseSpace = spacing; 59 for (int i = 0; i < nItems; i++) { 60 CPDF_TextObjectItem item; 61 pTextObj->GetItemInfo(i, &item); 62 if (item.m_CharCode == (FX_DWORD) - 1) { 63 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 64 FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000; 65 baseSpace = std::min(baseSpace, kerning + spacing); 66 bAllChar = FALSE; 67 } 68 } 69 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) { 70 baseSpace = 0.0; 71 } 72 } 73 return baseSpace; 74 } 75 76 } // namespace 77 78 CPDFText_ParseOptions::CPDFText_ParseOptions() 79 : m_bGetCharCodeOnly(FALSE), m_bNormalizeObjs(TRUE), m_bOutputHyphen(FALSE) 80 { 81 } 82 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions) 83 { 84 return new CPDF_TextPage(pPage, ParserOptions); 85 } 86 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, int flags) 87 { 88 return new CPDF_TextPage(pPage, flags); 89 } 90 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_PageObjects* pObjs, int flags) 91 { 92 return new CPDF_TextPage(pObjs, flags); 93 } 94 IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(const IPDF_TextPage* pTextPage) 95 { 96 if (!pTextPage) { 97 return NULL; 98 } 99 return new CPDF_TextPageFind(pTextPage); 100 } 101 IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() 102 { 103 return new CPDF_LinkExtract(); 104 } 105 #define TEXT_BLANK_CHAR L' ' 106 #define TEXT_LINEFEED_CHAR L'\n' 107 #define TEXT_RETURN_CHAR L'\r' 108 #define TEXT_EMPTY L"" 109 #define TEXT_BLANK L" " 110 #define TEXT_RETURN_LINEFEED L"\r\n" 111 #define TEXT_LINEFEED L"\n" 112 #define TEXT_CHARRATIO_GAPDELTA 0.070 113 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags) 114 : m_charList(512), 115 m_TempCharList(50), 116 m_pPreTextObj(NULL), 117 m_IsParsered(FALSE), 118 m_TextlineDir(-1), 119 m_CurlineRect(0, 0, 0, 0) 120 { 121 m_pPage = pPage; 122 m_parserflag = flags; 123 m_TextBuf.EstimateSize(0, 10240); 124 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int) pPage->GetPageWidth(), (int)pPage->GetPageHeight(), 0); 125 } 126 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions) 127 : m_ParseOptions(ParserOptions) 128 , m_charList(512) 129 , m_TempCharList(50) 130 , m_pPreTextObj(NULL) 131 , m_IsParsered(FALSE) 132 , m_TextlineDir(-1) 133 , m_CurlineRect(0, 0, 0, 0) 134 { 135 m_pPage = pPage; 136 m_parserflag = 0; 137 m_TextBuf.EstimateSize(0, 10240); 138 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int) pPage->GetPageWidth(), (int)pPage->GetPageHeight(), 0); 139 } 140 CPDF_TextPage::CPDF_TextPage(const CPDF_PageObjects* pPage, int flags) 141 : m_charList(512), 142 m_TempCharList(50), 143 m_pPreTextObj(NULL), 144 m_IsParsered(FALSE), 145 m_TextlineDir(-1), 146 m_CurlineRect(0, 0, 0, 0) 147 { 148 m_pPage = pPage; 149 m_parserflag = flags; 150 m_TextBuf.EstimateSize(0, 10240); 151 CFX_FloatRect pageRect = pPage->CalcBoundingBox(); 152 m_DisplayMatrix = CFX_AffineMatrix(1, 0, 0, -1, pageRect.right, pageRect.top); 153 } 154 void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) 155 { 156 m_ParseOptions.m_bNormalizeObjs = bNormalize; 157 } 158 FX_BOOL CPDF_TextPage::IsControlChar(PAGECHAR_INFO* pCharInfo) 159 { 160 if(!pCharInfo) { 161 return FALSE; 162 } 163 switch(pCharInfo->m_Unicode) { 164 case 0x2: 165 case 0x3: 166 case 0x93: 167 case 0x94: 168 case 0x96: 169 case 0x97: 170 case 0x98: 171 case 0xfffe: 172 if(pCharInfo->m_Flag == FPDFTEXT_CHAR_HYPHEN) { 173 return FALSE; 174 } else { 175 return TRUE; 176 } 177 default: 178 return FALSE; 179 } 180 } 181 FX_BOOL CPDF_TextPage::ParseTextPage() 182 { 183 if (!m_pPage) { 184 m_IsParsered = FALSE; 185 return FALSE; 186 } 187 m_IsParsered = FALSE; 188 m_TextBuf.Clear(); 189 m_charList.RemoveAll(); 190 m_pPreTextObj = NULL; 191 ProcessObject(); 192 m_IsParsered = TRUE; 193 if(!m_ParseOptions.m_bGetCharCodeOnly) { 194 m_CharIndex.RemoveAll(); 195 int nCount = m_charList.GetSize(); 196 if(nCount) { 197 m_CharIndex.Add(0); 198 } 199 for(int i = 0; i < nCount; i++) { 200 int indexSize = m_CharIndex.GetSize(); 201 FX_BOOL bNormal = FALSE; 202 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i); 203 if(charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) { 204 bNormal = TRUE; 205 } 206 else if(charinfo.m_Unicode == 0 || IsControlChar(&charinfo)) 207 bNormal = FALSE; 208 else { 209 bNormal = TRUE; 210 } 211 if(bNormal) { 212 if(indexSize % 2) { 213 m_CharIndex.Add(1); 214 } else { 215 if(indexSize <= 0) { 216 continue; 217 } 218 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1); 219 } 220 } else { 221 if(indexSize % 2) { 222 if(indexSize <= 0) { 223 continue; 224 } 225 m_CharIndex.SetAt(indexSize - 1, i + 1); 226 } else { 227 m_CharIndex.Add(i + 1); 228 } 229 } 230 } 231 int indexSize = m_CharIndex.GetSize(); 232 if(indexSize % 2) { 233 m_CharIndex.RemoveAt(indexSize - 1); 234 } 235 } 236 return TRUE; 237 } 238 int CPDF_TextPage::CountChars() const 239 { 240 if(m_ParseOptions.m_bGetCharCodeOnly) { 241 return m_TextBuf.GetSize(); 242 } 243 return m_charList.GetSize(); 244 } 245 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const 246 { 247 int indexSize = m_CharIndex.GetSize(); 248 int count = 0; 249 for(int i = 0; i < indexSize; i += 2) { 250 count += m_CharIndex.GetAt(i + 1); 251 if(count > TextIndex) { 252 return TextIndex - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i); 253 } 254 } 255 return -1; 256 } 257 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const 258 { 259 int indexSize = m_CharIndex.GetSize(); 260 int count = 0; 261 for(int i = 0; i < indexSize; i += 2) { 262 count += m_CharIndex.GetAt(i + 1); 263 if(m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) { 264 if(CharIndex - m_CharIndex.GetAt(i) < 0) { 265 return -1; 266 } 267 return CharIndex - m_CharIndex.GetAt(i) + count - m_CharIndex.GetAt(i + 1); 268 } 269 } 270 return -1; 271 } 272 void CPDF_TextPage::GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const 273 { 274 if(m_ParseOptions.m_bGetCharCodeOnly) { 275 return; 276 } 277 if(start < 0 || nCount == 0) { 278 return; 279 } 280 if (!m_IsParsered) { 281 return; 282 } 283 PAGECHAR_INFO info_curchar; 284 CPDF_TextObject* pCurObj = NULL; 285 CFX_FloatRect rect; 286 int curPos = start; 287 FX_BOOL flagNewRect = TRUE; 288 if (nCount + start > m_charList.GetSize() || nCount == -1) { 289 nCount = m_charList.GetSize() - start; 290 } 291 while (nCount--) { 292 info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++); 293 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) { 294 continue; 295 } 296 if(info_curchar.m_CharBox.Width() < 0.01 || info_curchar.m_CharBox.Height() < 0.01) { 297 continue; 298 } 299 if(!pCurObj) { 300 pCurObj = info_curchar.m_pTextObj; 301 } 302 if (pCurObj != info_curchar.m_pTextObj) { 303 rectArray.Add(rect); 304 pCurObj = info_curchar.m_pTextObj; 305 flagNewRect = TRUE; 306 } 307 if (flagNewRect) { 308 FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY; 309 CFX_AffineMatrix matrix, matrix_reverse; 310 info_curchar.m_pTextObj->GetTextMatrix(&matrix); 311 matrix.Concat(info_curchar.m_Matrix); 312 matrix_reverse.SetReverse(matrix); 313 matrix_reverse.Transform(orgX, orgY); 314 rect.left = info_curchar.m_CharBox.left; 315 rect.right = info_curchar.m_CharBox.right; 316 if (pCurObj->GetFont()->GetTypeDescent()) { 317 rect.bottom = orgY + pCurObj->GetFont()->GetTypeDescent() * pCurObj->GetFontSize() / 1000; 318 FX_FLOAT xPosTemp = orgX; 319 matrix.Transform(xPosTemp, rect.bottom); 320 } else { 321 rect.bottom = info_curchar.m_CharBox.bottom; 322 } 323 if (pCurObj->GetFont()->GetTypeAscent()) { 324 rect.top = orgY + pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000; 325 FX_FLOAT xPosTemp = orgX + GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) * pCurObj->GetFontSize() / 1000; 326 matrix.Transform(xPosTemp, rect.top); 327 } else { 328 rect.top = info_curchar.m_CharBox.top; 329 } 330 flagNewRect = FALSE; 331 rect = info_curchar.m_CharBox; 332 rect.Normalize(); 333 } else { 334 info_curchar.m_CharBox.Normalize(); 335 if (rect.left > info_curchar.m_CharBox.left) { 336 rect.left = info_curchar.m_CharBox.left; 337 } 338 if (rect.right < info_curchar.m_CharBox.right) { 339 rect.right = info_curchar.m_CharBox.right; 340 } 341 if ( rect.top < info_curchar.m_CharBox.top) { 342 rect.top = info_curchar.m_CharBox.top; 343 } 344 if (rect.bottom > info_curchar.m_CharBox.bottom) { 345 rect.bottom = info_curchar.m_CharBox.bottom; 346 } 347 } 348 } 349 rectArray.Add(rect); 350 return; 351 } 352 int CPDF_TextPage::GetIndexAtPos(CPDF_Point point , FX_FLOAT xTorelance, FX_FLOAT yTorelance) const 353 { 354 if(m_ParseOptions.m_bGetCharCodeOnly) { 355 return -3; 356 } 357 if (!m_IsParsered) { 358 return -3; 359 } 360 int pos = 0; 361 int NearPos = -1; 362 double xdif = 5000, ydif = 5000; 363 while(pos < m_charList.GetSize()) { 364 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos)); 365 CFX_FloatRect charrect = charinfo.m_CharBox; 366 if (charrect.Contains(point.x, point.y)) { 367 break; 368 } 369 if (xTorelance > 0 || yTorelance > 0) { 370 CFX_FloatRect charRectExt; 371 charrect.Normalize(); 372 charRectExt.left = charrect.left - xTorelance / 2; 373 charRectExt.right = charrect.right + xTorelance / 2; 374 charRectExt.top = charrect.top + yTorelance / 2; 375 charRectExt.bottom = charrect.bottom - yTorelance / 2; 376 if (charRectExt.Contains(point.x, point.y)) { 377 double curXdif, curYdif; 378 curXdif = FXSYS_fabs(point.x - charrect.left) < FXSYS_fabs(point.x - charrect.right) ? FXSYS_fabs(point.x - charrect.left) : FXSYS_fabs(point.x - charrect.right); 379 curYdif = FXSYS_fabs(point.y - charrect.bottom) < FXSYS_fabs(point.y - charrect.top ) ? FXSYS_fabs(point.y - charrect.bottom) : FXSYS_fabs(point.y - charrect.top); 380 if (curYdif + curXdif < xdif + ydif) { 381 ydif = curYdif; 382 xdif = curXdif; 383 NearPos = pos; 384 } 385 } 386 } 387 ++pos; 388 } 389 if (pos >= m_charList.GetSize()) { 390 pos = NearPos; 391 } 392 return pos; 393 } 394 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const 395 { 396 CFX_WideString strText; 397 if(m_ParseOptions.m_bGetCharCodeOnly || !m_IsParsered) { 398 return strText; 399 } 400 int nCount = m_charList.GetSize(); 401 int pos = 0; 402 FX_FLOAT posy = 0; 403 FX_BOOL IsContainPreChar = FALSE; 404 FX_BOOL ISAddLineFeed = FALSE; 405 while (pos < nCount) { 406 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++); 407 if (IsRectIntersect(rect, charinfo.m_CharBox)) { 408 if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar && ISAddLineFeed) { 409 posy = charinfo.m_OriginY; 410 if (strText.GetLength() > 0) { 411 strText += L"\r\n"; 412 } 413 } 414 IsContainPreChar = TRUE; 415 ISAddLineFeed = FALSE; 416 if (charinfo.m_Unicode) { 417 strText += charinfo.m_Unicode; 418 } 419 } else if (charinfo.m_Unicode == 32) { 420 if (IsContainPreChar && charinfo.m_Unicode) { 421 strText += charinfo.m_Unicode; 422 IsContainPreChar = FALSE; 423 ISAddLineFeed = FALSE; 424 } 425 } else { 426 IsContainPreChar = FALSE; 427 ISAddLineFeed = TRUE; 428 } 429 } 430 return strText; 431 } 432 void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect, CFX_RectArray& resRectArray) const 433 { 434 if(m_ParseOptions.m_bGetCharCodeOnly) { 435 return; 436 } 437 if (!m_IsParsered) { 438 return; 439 } 440 CFX_FloatRect curRect; 441 FX_BOOL flagNewRect = TRUE; 442 CPDF_TextObject* pCurObj = NULL; 443 int nCount = m_charList.GetSize(); 444 int pos = 0; 445 while (pos < nCount) { 446 PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++); 447 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) { 448 continue; 449 } 450 if (IsRectIntersect(rect, info_curchar.m_CharBox)) { 451 if(!pCurObj) { 452 pCurObj = info_curchar.m_pTextObj; 453 } 454 if (pCurObj != info_curchar.m_pTextObj) { 455 resRectArray.Add(curRect); 456 pCurObj = info_curchar.m_pTextObj; 457 flagNewRect = TRUE; 458 } 459 if (flagNewRect) { 460 curRect = info_curchar.m_CharBox; 461 flagNewRect = FALSE; 462 curRect.Normalize(); 463 } else { 464 info_curchar.m_CharBox.Normalize(); 465 if (curRect.left > info_curchar.m_CharBox.left) { 466 curRect.left = info_curchar.m_CharBox.left; 467 } 468 if (curRect.right < info_curchar.m_CharBox.right) { 469 curRect.right = info_curchar.m_CharBox.right; 470 } 471 if ( curRect.top < info_curchar.m_CharBox.top) { 472 curRect.top = info_curchar.m_CharBox.top; 473 } 474 if (curRect.bottom > info_curchar.m_CharBox.bottom) { 475 curRect.bottom = info_curchar.m_CharBox.bottom; 476 } 477 } 478 } 479 } 480 resRectArray.Add(curRect); 481 return; 482 } 483 int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const 484 { 485 if(m_ParseOptions.m_bGetCharCodeOnly) { 486 return -3; 487 } 488 CPDF_Point point(x, y); 489 return GetIndexAtPos(point, xTorelance, yTorelance); 490 } 491 int CPDF_TextPage::GetOrderByDirection(int order, int direction) const 492 { 493 if(m_ParseOptions.m_bGetCharCodeOnly) { 494 return -3; 495 } 496 if (!m_IsParsered) { 497 return -3; 498 } 499 if (direction == FPDFTEXT_RIGHT || direction == FPDFTEXT_LEFT) { 500 order += direction; 501 while(order >= 0 && order < m_charList.GetSize()) { 502 PAGECHAR_INFO cinfo = *(PAGECHAR_INFO*)m_charList.GetAt(order); 503 if (cinfo.m_Flag != FPDFTEXT_CHAR_GENERATED) { 504 break; 505 } else { 506 if (cinfo.m_Unicode == TEXT_LINEFEED_CHAR || cinfo.m_Unicode == TEXT_RETURN_CHAR) { 507 order += direction; 508 } else { 509 break; 510 } 511 } 512 } 513 if (order >= m_charList.GetSize()) { 514 order = -2; 515 } 516 return order; 517 } 518 PAGECHAR_INFO charinfo; 519 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(order); 520 CPDF_Point curPos(charinfo.m_OriginX, charinfo.m_OriginY); 521 FX_FLOAT difPosY = 0.0, minXdif = 1000; 522 int minIndex = -2; 523 int index = order; 524 FX_FLOAT height = charinfo.m_CharBox.Height(); 525 if (direction == FPDFTEXT_UP) { 526 minIndex = -1; 527 while (1) { 528 if (--index < 0) { 529 return -1; 530 } 531 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 532 if (FXSYS_fabs(charinfo.m_OriginY - curPos.y) > FX_MAX(height, charinfo.m_CharBox.Height()) / 2) { 533 difPosY = charinfo.m_OriginY; 534 minIndex = index; 535 break; 536 } 537 } 538 FX_FLOAT PreXdif = charinfo.m_OriginX - curPos.x; 539 minXdif = PreXdif; 540 if (PreXdif == 0) { 541 return index; 542 } 543 FX_FLOAT curXdif = 0; 544 while (--index >= 0) { 545 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 546 if (difPosY != charinfo.m_OriginY) { 547 break; 548 } 549 curXdif = charinfo.m_OriginX - curPos.x; 550 if (curXdif == 0) { 551 return index; 552 } 553 int signflag = 0; 554 if (curXdif > 0) { 555 signflag = 1; 556 } else { 557 signflag = -1; 558 } 559 if (signflag * PreXdif < 0) { 560 if (FXSYS_fabs(PreXdif) < FXSYS_fabs(curXdif)) { 561 return index + 1; 562 } else { 563 return index; 564 } 565 } 566 if (FXSYS_fabs(curXdif) < FXSYS_fabs(minXdif)) { 567 minIndex = index; 568 minXdif = curXdif; 569 } 570 PreXdif = curXdif; 571 if (difPosY != charinfo.m_OriginY) { 572 break; 573 } 574 } 575 return minIndex; 576 } else if(FPDFTEXT_DOWN) { 577 minIndex = -2; 578 while (1) { 579 if (++index > m_charList.GetSize() - 1) { 580 return minIndex; 581 } 582 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 583 if (FXSYS_fabs(charinfo.m_OriginY - curPos.y) > FX_MAX(height, charinfo.m_CharBox.Height()) / 2) { 584 difPosY = charinfo.m_OriginY; 585 minIndex = index; 586 break; 587 } 588 } 589 FX_FLOAT PreXdif = charinfo.m_OriginX - curPos.x; 590 minXdif = PreXdif; 591 if (PreXdif == 0) { 592 return index; 593 } 594 FX_FLOAT curXdif = 0; 595 while (++index < m_charList.GetSize()) { 596 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 597 if (difPosY != charinfo.m_OriginY) { 598 break; 599 } 600 curXdif = charinfo.m_OriginX - curPos.x; 601 if (curXdif == 0) { 602 return index; 603 } 604 int signflag = 0; 605 if (curXdif > 0) { 606 signflag = 1; 607 } else { 608 signflag = -1; 609 } 610 if (signflag * PreXdif < 0) { 611 if (FXSYS_fabs(PreXdif) < FXSYS_fabs(curXdif)) { 612 return index - 1; 613 } else { 614 return index; 615 } 616 } 617 if (FXSYS_fabs(curXdif) < FXSYS_fabs(minXdif)) { 618 minXdif = curXdif; 619 minIndex = index; 620 } 621 PreXdif = curXdif; 622 } 623 return minIndex; 624 } 625 } 626 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO & info) const 627 { 628 if(m_ParseOptions.m_bGetCharCodeOnly) { 629 return; 630 } 631 if (!m_IsParsered) { 632 return; 633 } 634 if (index < 0 || index >= m_charList.GetSize()) { 635 return; 636 } 637 PAGECHAR_INFO charinfo; 638 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 639 info.m_Charcode = charinfo.m_CharCode; 640 info.m_OriginX = charinfo.m_OriginX; 641 info.m_OriginY = charinfo.m_OriginY; 642 info.m_Unicode = charinfo.m_Unicode; 643 info.m_Flag = charinfo.m_Flag; 644 info.m_CharBox = charinfo.m_CharBox; 645 info.m_pTextObj = charinfo.m_pTextObj; 646 if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont()) { 647 info.m_FontSize = charinfo.m_pTextObj->GetFontSize(); 648 } 649 info.m_Matrix.Copy(charinfo.m_Matrix); 650 return; 651 } 652 void CPDF_TextPage::CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const 653 { 654 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start); 655 PAGECHAR_INFO charinfo2 = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1); 656 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag && FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) { 657 return; 658 } 659 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) { 660 PAGECHAR_INFO charinfo1 = charinfo; 661 int startIndex = start; 662 while(FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag && charinfo1.m_Index == charinfo.m_Index) { 663 startIndex--; 664 if (startIndex < 0) { 665 break; 666 } 667 charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex); 668 } 669 startIndex++; 670 start = startIndex; 671 } 672 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) { 673 PAGECHAR_INFO charinfo3 = charinfo2; 674 int endIndex = start + nCount - 1; 675 while(FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag && charinfo3.m_Index == charinfo2.m_Index) { 676 endIndex++; 677 if (endIndex >= m_charList.GetSize()) { 678 break; 679 } 680 charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex); 681 } 682 endIndex--; 683 nCount = endIndex - start + 1; 684 } 685 } 686 CFX_WideString CPDF_TextPage::GetPageText(int start , int nCount) const 687 { 688 if (!m_IsParsered || nCount == 0) { 689 return L""; 690 } 691 if (start < 0) { 692 start = 0; 693 } 694 if (nCount == -1) { 695 nCount = m_charList.GetSize() - start; 696 return m_TextBuf.GetWideString().Mid(start, m_TextBuf.GetWideString().GetLength()); 697 } 698 if(nCount <= 0 || m_charList.GetSize() <= 0) { 699 return L""; 700 } 701 if(nCount + start > m_charList.GetSize() - 1) { 702 nCount = m_charList.GetSize() - start; 703 } 704 if (nCount <= 0) { 705 return L""; 706 } 707 CheckMarkedContentObject(start, nCount); 708 int startindex = 0; 709 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start); 710 int startOffset = 0; 711 while(charinfo.m_Index == -1) { 712 startOffset++; 713 if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) { 714 return L""; 715 } 716 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset); 717 } 718 startindex = charinfo.m_Index; 719 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1); 720 int nCountOffset = 0; 721 while (charinfo.m_Index == -1) { 722 nCountOffset++; 723 if (nCountOffset >= nCount) { 724 return L""; 725 } 726 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1); 727 } 728 nCount = start + nCount - nCountOffset - startindex; 729 if(nCount <= 0) { 730 return L""; 731 } 732 return m_TextBuf.GetWideString().Mid(startindex, nCount); 733 } 734 int CPDF_TextPage::CountRects(int start, int nCount) 735 { 736 if(m_ParseOptions.m_bGetCharCodeOnly) { 737 return -1; 738 } 739 if (!m_IsParsered) { 740 return -1; 741 } 742 if (start < 0) { 743 return -1; 744 } 745 if (nCount == -1 || nCount + start > m_charList.GetSize() ) { 746 nCount = m_charList.GetSize() - start; 747 } 748 m_SelRects.RemoveAll(); 749 GetRectArray(start, nCount, m_SelRects); 750 return m_SelRects.GetSize(); 751 } 752 void CPDF_TextPage::GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const 753 { 754 if(m_ParseOptions.m_bGetCharCodeOnly) { 755 return ; 756 } 757 if (!m_IsParsered || rectIndex < 0 || rectIndex >= m_SelRects.GetSize()) { 758 return; 759 } 760 left = m_SelRects.GetAt(rectIndex).left; 761 top = m_SelRects.GetAt(rectIndex).top; 762 right = m_SelRects.GetAt(rectIndex).right; 763 bottom = m_SelRects.GetAt(rectIndex).bottom; 764 } 765 FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate) 766 { 767 if(m_ParseOptions.m_bGetCharCodeOnly) { 768 return FALSE; 769 } 770 if(end == start) { 771 return FALSE; 772 } 773 FX_FLOAT dx, dy; 774 FPDF_CHAR_INFO info1, info2; 775 GetCharInfo(start, info1); 776 GetCharInfo(end, info2); 777 while(info2.m_CharBox.Width() == 0 || info2.m_CharBox.Height() == 0) { 778 end--; 779 if(end <= start) { 780 return FALSE; 781 } 782 GetCharInfo(end, info2); 783 } 784 dx = (info2.m_OriginX - info1.m_OriginX); 785 dy = (info2.m_OriginY - info1.m_OriginY); 786 if(dx == 0) { 787 if(dy > 0) { 788 Rotate = 90; 789 } else if (dy < 0) { 790 Rotate = 270; 791 } else { 792 Rotate = 0; 793 } 794 } else { 795 float a = FXSYS_atan2(dy, dx); 796 Rotate = (int)(a * 180 / FX_PI + 0.5); 797 } 798 if(Rotate < 0) { 799 Rotate = -Rotate; 800 } else if(Rotate > 0) { 801 Rotate = 360 - Rotate; 802 } 803 return TRUE; 804 } 805 FX_BOOL CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect , int& Rotate) 806 { 807 if(m_ParseOptions.m_bGetCharCodeOnly) { 808 return FALSE; 809 } 810 int start, end, count, n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom, TRUE); 811 if(n < 1) { 812 return FALSE; 813 } 814 if(n > 1) { 815 GetBoundedSegment(n - 1, start, count); 816 end = start + count - 1; 817 GetBoundedSegment(0, start, count); 818 } else { 819 GetBoundedSegment(0, start, count); 820 end = start + count - 1; 821 } 822 return GetBaselineRotate(start, end, Rotate); 823 } 824 FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) 825 { 826 if(m_ParseOptions.m_bGetCharCodeOnly) { 827 return FALSE; 828 } 829 if (!m_IsParsered || rectIndex < 0 || rectIndex > m_SelRects.GetSize()) { 830 return FALSE; 831 } 832 CFX_FloatRect rect = m_SelRects.GetAt(rectIndex); 833 return GetBaselineRotate(rect , Rotate); 834 } 835 int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains ) 836 { 837 if(m_ParseOptions.m_bGetCharCodeOnly) { 838 return -1; 839 } 840 m_Segment.RemoveAll(); 841 if (!m_IsParsered) { 842 return -1; 843 } 844 CFX_FloatRect rect(left, bottom, right, top); 845 rect.Normalize(); 846 int nCount = m_charList.GetSize(); 847 int pos = 0; 848 FPDF_SEGMENT segment; 849 segment.m_Start = 0; 850 segment.m_nCount = 0; 851 FX_BOOL segmentStatus = 0; 852 FX_BOOL IsContainPreChar = FALSE; 853 while (pos < nCount) { 854 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos); 855 if(bContains && rect.Contains(charinfo.m_CharBox)) { 856 if (segmentStatus == 0 || segmentStatus == 2) { 857 segment.m_Start = pos; 858 segment.m_nCount = 1; 859 segmentStatus = 1; 860 } else if (segmentStatus == 1) { 861 segment.m_nCount++; 862 } 863 IsContainPreChar = TRUE; 864 } else if (!bContains && (IsRectIntersect(rect, charinfo.m_CharBox) || rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) { 865 if (segmentStatus == 0 || segmentStatus == 2) { 866 segment.m_Start = pos; 867 segment.m_nCount = 1; 868 segmentStatus = 1; 869 } else if (segmentStatus == 1) { 870 segment.m_nCount++; 871 } 872 IsContainPreChar = TRUE; 873 } else if (charinfo.m_Unicode == 32) { 874 if (IsContainPreChar == TRUE) { 875 if (segmentStatus == 0 || segmentStatus == 2) { 876 segment.m_Start = pos; 877 segment.m_nCount = 1; 878 segmentStatus = 1; 879 } else if (segmentStatus == 1) { 880 segment.m_nCount++; 881 } 882 IsContainPreChar = FALSE; 883 } else { 884 if (segmentStatus == 1) { 885 segmentStatus = 2; 886 m_Segment.Add(segment); 887 segment.m_Start = 0; 888 segment.m_nCount = 0; 889 } 890 } 891 } else { 892 if (segmentStatus == 1) { 893 segmentStatus = 2; 894 m_Segment.Add(segment); 895 segment.m_Start = 0; 896 segment.m_nCount = 0; 897 } 898 IsContainPreChar = FALSE; 899 } 900 pos++; 901 } 902 if (segmentStatus == 1) { 903 segmentStatus = 2; 904 m_Segment.Add(segment); 905 segment.m_Start = 0; 906 segment.m_nCount = 0; 907 } 908 return m_Segment.GetSize(); 909 } 910 void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const 911 { 912 if(m_ParseOptions.m_bGetCharCodeOnly) { 913 return ; 914 } 915 if (index < 0 || index >= m_Segment.GetSize()) { 916 return; 917 } 918 start = m_Segment.GetAt(index).m_Start; 919 count = m_Segment.GetAt(index).m_nCount; 920 } 921 int CPDF_TextPage::GetWordBreak(int index, int direction) const 922 { 923 if(m_ParseOptions.m_bGetCharCodeOnly) { 924 return -1; 925 } 926 if (!m_IsParsered) { 927 return -1; 928 } 929 if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT) { 930 return -1; 931 } 932 if (index < 0 || index >= m_charList.GetSize()) { 933 return -1; 934 } 935 PAGECHAR_INFO charinfo; 936 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 937 if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) { 938 return index; 939 } 940 if (!IsLetter(charinfo.m_Unicode)) { 941 return index; 942 } 943 int breakPos = index; 944 if (direction == FPDFTEXT_LEFT) { 945 while (--breakPos > 0) { 946 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos); 947 if (!IsLetter(charinfo.m_Unicode)) { 948 return breakPos; 949 } 950 } 951 return breakPos; 952 } else if (direction == FPDFTEXT_RIGHT) { 953 while (++breakPos < m_charList.GetSize()) { 954 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos); 955 if (!IsLetter(charinfo.m_Unicode)) { 956 return breakPos; 957 } 958 } 959 return breakPos; 960 } 961 return breakPos; 962 } 963 FX_INT32 CPDF_TextPage::FindTextlineFlowDirection() 964 { 965 if (!m_pPage) { 966 return -1; 967 } 968 const FX_INT32 nPageWidth = (FX_INT32)((CPDF_Page*)m_pPage)->GetPageWidth(); 969 const FX_INT32 nPageHeight = (FX_INT32)((CPDF_Page*)m_pPage)->GetPageHeight(); 970 CFX_ByteArray nHorizontalMask; 971 if (!nHorizontalMask.SetSize(nPageWidth)) { 972 return -1; 973 } 974 FX_BYTE* pDataH = nHorizontalMask.GetData(); 975 CFX_ByteArray nVerticalMask; 976 if (!nVerticalMask.SetSize(nPageHeight)) { 977 return -1; 978 } 979 FX_BYTE* pDataV = nVerticalMask.GetData(); 980 FX_INT32 index = 0; 981 FX_FLOAT fLineHeight = 0.0f; 982 CPDF_PageObject* pPageObj = NULL; 983 FX_POSITION pos = NULL; 984 pos = m_pPage->GetFirstObjectPosition(); 985 if(!pos) { 986 return -1; 987 } 988 while(pos) { 989 pPageObj = m_pPage->GetNextObject(pos); 990 if(NULL == pPageObj) { 991 continue; 992 } 993 if(PDFPAGE_TEXT != pPageObj->m_Type) { 994 continue; 995 } 996 FX_INT32 minH = (FX_INT32)pPageObj->m_Left < 0 ? 0 : (FX_INT32)pPageObj->m_Left; 997 FX_INT32 maxH = (FX_INT32)pPageObj->m_Right > nPageWidth ? nPageWidth : (FX_INT32)pPageObj->m_Right; 998 FX_INT32 minV = (FX_INT32)pPageObj->m_Bottom < 0 ? 0 : (FX_INT32)pPageObj->m_Bottom; 999 FX_INT32 maxV = (FX_INT32)pPageObj->m_Top > nPageHeight ? nPageHeight : (FX_INT32)pPageObj->m_Top; 1000 if (minH >= maxH || minV >= maxV) { 1001 continue; 1002 } 1003 FXSYS_memset8(pDataH + minH, 1, maxH - minH); 1004 FXSYS_memset8(pDataV + minV, 1, maxV - minV); 1005 if (fLineHeight <= 0.0f) { 1006 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom; 1007 } 1008 pPageObj = NULL; 1009 } 1010 FX_INT32 nStartH = 0; 1011 FX_INT32 nEndH = 0; 1012 FX_FLOAT nSumH = 0.0f; 1013 for (index = 0; index < nPageWidth; index++) 1014 if(1 == nHorizontalMask[index]) { 1015 break; 1016 } 1017 nStartH = index; 1018 for (index = nPageWidth; index > 0; index--) 1019 if(1 == nHorizontalMask[index - 1]) { 1020 break; 1021 } 1022 nEndH = index; 1023 for (index = nStartH; index < nEndH; index++) { 1024 nSumH += nHorizontalMask[index]; 1025 } 1026 nSumH /= nEndH - nStartH; 1027 FX_INT32 nStartV = 0; 1028 FX_INT32 nEndV = 0; 1029 FX_FLOAT nSumV = 0.0f; 1030 for (index = 0; index < nPageHeight; index++) 1031 if(1 == nVerticalMask[index]) { 1032 break; 1033 } 1034 nStartV = index; 1035 for (index = nPageHeight; index > 0; index--) 1036 if(1 == nVerticalMask[index - 1]) { 1037 break; 1038 } 1039 nEndV = index; 1040 for (index = nStartV; index < nEndV; index++) { 1041 nSumV += nVerticalMask[index]; 1042 } 1043 nSumV /= nEndV - nStartV; 1044 if ((nEndV - nStartV) < (FX_INT32)(2 * fLineHeight)) { 1045 return 0; 1046 } 1047 if ((nEndH - nStartH) < (FX_INT32)(2 * fLineHeight)) { 1048 return 1; 1049 } 1050 if (nSumH > 0.8f) { 1051 return 0; 1052 } 1053 if (nSumH - nSumV > 0.0f) { 1054 return 0; 1055 } 1056 if (nSumV - nSumH > 0.0f) { 1057 return 1; 1058 } 1059 return -1; 1060 } 1061 void CPDF_TextPage::ProcessObject() 1062 { 1063 CPDF_PageObject* pPageObj = NULL; 1064 if (!m_pPage) { 1065 return; 1066 } 1067 FX_POSITION pos; 1068 pos = m_pPage->GetFirstObjectPosition(); 1069 if (!pos) { 1070 return; 1071 } 1072 m_TextlineDir = FindTextlineFlowDirection(); 1073 int nCount = 0; 1074 while (pos) { 1075 pPageObj = m_pPage->GetNextObject(pos); 1076 if(pPageObj) { 1077 if(pPageObj->m_Type == PDFPAGE_TEXT) { 1078 CFX_AffineMatrix matrix; 1079 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos); 1080 nCount++; 1081 } else if (pPageObj->m_Type == PDFPAGE_FORM) { 1082 CFX_AffineMatrix formMatrix(1, 0, 0, 1, 0, 0); 1083 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix); 1084 } 1085 } 1086 pPageObj = NULL; 1087 } 1088 int count = m_LineObj.GetSize(); 1089 for(int i = 0; i < count; i++) { 1090 ProcessTextObject(m_LineObj.GetAt(i)); 1091 } 1092 m_LineObj.RemoveAll(); 1093 CloseTempLine(); 1094 } 1095 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, const CFX_AffineMatrix& formMatrix) 1096 { 1097 CPDF_PageObject* pPageObj = NULL; 1098 FX_POSITION pos; 1099 if (!pFormObj) { 1100 return; 1101 } 1102 pos = pFormObj->m_pForm->GetFirstObjectPosition(); 1103 if (!pos) { 1104 return; 1105 } 1106 CFX_AffineMatrix curFormMatrix; 1107 curFormMatrix.Copy(pFormObj->m_FormMatrix); 1108 curFormMatrix.Concat(formMatrix); 1109 while (pos) { 1110 pPageObj = pFormObj->m_pForm->GetNextObject(pos); 1111 if(pPageObj) { 1112 if(pPageObj->m_Type == PDFPAGE_TEXT) { 1113 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos); 1114 } else if (pPageObj->m_Type == PDFPAGE_FORM) { 1115 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix); 1116 } 1117 } 1118 pPageObj = NULL; 1119 } 1120 } 1121 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const 1122 { 1123 if(charCode == -1) { 1124 return 0; 1125 } 1126 int w = pFont->GetCharWidthF(charCode); 1127 if(w == 0) { 1128 CFX_ByteString str; 1129 pFont->AppendChar(str, charCode); 1130 w = pFont->GetStringWidth(str, 1); 1131 if(w == 0) { 1132 FX_RECT BBox; 1133 pFont->GetCharBBox(charCode, BBox); 1134 w = BBox.right - BBox.left; 1135 } 1136 } 1137 return w; 1138 } 1139 void CPDF_TextPage::OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str) 1140 { 1141 FX_INT32 start, count; 1142 FX_INT32 ret = pBidi->GetBidiInfo(start, count); 1143 if(ret == 2) { 1144 for(int i = start + count - 1; i >= start; i--) { 1145 m_TextBuf.AppendChar(str.GetAt(i)); 1146 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i)); 1147 } 1148 } else { 1149 int end = start + count ; 1150 for(int i = start; i < end; i++) { 1151 m_TextBuf.AppendChar(str.GetAt(i)); 1152 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i)); 1153 } 1154 } 1155 } 1156 void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i) 1157 { 1158 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i); 1159 FX_WCHAR wChar = str.GetAt(i); 1160 if(!IsControlChar(&Info)) { 1161 Info.m_Index = m_TextBuf.GetLength(); 1162 if (wChar >= 0xFB00 && wChar <= 0xFB06) { 1163 FX_LPWSTR pDst = NULL; 1164 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 1165 if (nCount >= 1) { 1166 pDst = FX_Alloc(FX_WCHAR, nCount); 1167 FX_Unicode_GetNormalization(wChar, pDst); 1168 for (int nIndex = 0; nIndex < nCount; nIndex++) { 1169 PAGECHAR_INFO Info2 = Info; 1170 Info2.m_Unicode = pDst[nIndex]; 1171 Info2.m_Flag = FPDFTEXT_CHAR_PIECE; 1172 m_TextBuf.AppendChar(Info2.m_Unicode); 1173 if( !m_ParseOptions.m_bGetCharCodeOnly) { 1174 m_charList.Add(Info2); 1175 } 1176 } 1177 FX_Free(pDst); 1178 return; 1179 } 1180 } 1181 m_TextBuf.AppendChar(wChar); 1182 } else { 1183 Info.m_Index = -1; 1184 } 1185 if( !m_ParseOptions.m_bGetCharCodeOnly) { 1186 m_charList.Add(Info); 1187 } 1188 } 1189 void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i) 1190 { 1191 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i); 1192 if(!IsControlChar(&Info)) { 1193 Info.m_Index = m_TextBuf.GetLength(); 1194 FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE); 1195 FX_LPWSTR pDst = NULL; 1196 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 1197 if (nCount >= 1) { 1198 pDst = FX_Alloc(FX_WCHAR, nCount); 1199 FX_Unicode_GetNormalization(wChar, pDst); 1200 for (int nIndex = 0; nIndex < nCount; nIndex++) { 1201 PAGECHAR_INFO Info2 = Info; 1202 Info2.m_Unicode = pDst[nIndex]; 1203 Info2.m_Flag = FPDFTEXT_CHAR_PIECE; 1204 m_TextBuf.AppendChar(Info2.m_Unicode); 1205 if( !m_ParseOptions.m_bGetCharCodeOnly) { 1206 m_charList.Add(Info2); 1207 } 1208 } 1209 FX_Free(pDst); 1210 return; 1211 } else { 1212 Info.m_Unicode = wChar; 1213 } 1214 m_TextBuf.AppendChar(Info.m_Unicode); 1215 } else { 1216 Info.m_Index = -1; 1217 } 1218 if( !m_ParseOptions.m_bGetCharCodeOnly) { 1219 m_charList.Add(Info); 1220 } 1221 } 1222 void CPDF_TextPage::CloseTempLine() 1223 { 1224 int count1 = m_TempCharList.GetSize(); 1225 if (count1 <= 0) { 1226 return; 1227 } 1228 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); 1229 CFX_WideString str = m_TempTextBuf.GetWideString(); 1230 CFX_WordArray order; 1231 FX_BOOL bR2L = FALSE; 1232 FX_INT32 start = 0, count = 0; 1233 int nR2L = 0, nL2R = 0; 1234 FX_BOOL bPrevSpace = FALSE; 1235 for (int i = 0; i < str.GetLength(); i++) { 1236 if(str.GetAt(i) == 32) { 1237 if(bPrevSpace) { 1238 m_TempTextBuf.Delete(i, 1); 1239 m_TempCharList.Delete(i); 1240 str.Delete(i); 1241 count1--; 1242 i--; 1243 continue; 1244 } 1245 bPrevSpace = TRUE; 1246 } else { 1247 bPrevSpace = FALSE; 1248 } 1249 if(BidiChar && BidiChar->AppendChar(str.GetAt(i))) { 1250 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 1251 order.Add(start); 1252 order.Add(count); 1253 order.Add(ret); 1254 if(!bR2L) { 1255 if(ret == 2) { 1256 nR2L++; 1257 } else if (ret == 1) { 1258 nL2R++; 1259 } 1260 } 1261 } 1262 } 1263 if(BidiChar && BidiChar->EndChar()) { 1264 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 1265 order.Add(start); 1266 order.Add(count); 1267 order.Add(ret); 1268 if(!bR2L) { 1269 if(ret == 2) { 1270 nR2L++; 1271 } else if(ret == 1) { 1272 nL2R++; 1273 } 1274 } 1275 } 1276 if(nR2L > 0 && nR2L >= nL2R) { 1277 bR2L = TRUE; 1278 } 1279 if(this->m_parserflag == FPDFTEXT_RLTB || bR2L) { 1280 int count = order.GetSize(); 1281 for(int i = count - 1; i > 0; i -= 3) { 1282 int ret = order.GetAt(i); 1283 int start = order.GetAt(i - 2); 1284 int count1 = order.GetAt(i - 1); 1285 if(ret == 2 || ret == 0) { 1286 for(int j = start + count1 - 1; j >= start; j--) { 1287 AddCharInfoByRLDirection(str, j); 1288 } 1289 } else { 1290 int j = i; 1291 FX_BOOL bSymbol = FALSE; 1292 while(j > 0 && order.GetAt(j) != 2) { 1293 bSymbol = !order.GetAt(j); 1294 j -= 3; 1295 } 1296 int end = start + count1 ; 1297 int n = 0; 1298 if(bSymbol) { 1299 n = j + 6; 1300 } else { 1301 n = j + 3; 1302 } 1303 if(n >= i) { 1304 for(int m = start; m < end; m++) { 1305 AddCharInfoByLRDirection(str, m); 1306 } 1307 } else { 1308 j = i; 1309 i = n; 1310 for(; n <= j; n += 3) { 1311 int start = order.GetAt(n - 2); 1312 int count1 = order.GetAt(n - 1); 1313 int end = start + count1 ; 1314 for(int m = start; m < end; m++) { 1315 AddCharInfoByLRDirection(str, m); 1316 } 1317 } 1318 } 1319 } 1320 } 1321 } else { 1322 int count = order.GetSize(); 1323 FX_BOOL bL2R = FALSE; 1324 for(int i = 0; i < count; i += 3) { 1325 int ret = order.GetAt(i + 2); 1326 int start = order.GetAt(i); 1327 int count1 = order.GetAt(i + 1); 1328 if(ret == 2 || (i == 0 && ret == 0 && !bL2R)) { 1329 int j = i + 3; 1330 while(bR2L && j < count) { 1331 if(order.GetAt(j + 2) == 1) { 1332 break; 1333 } else { 1334 j += 3; 1335 } 1336 } 1337 if(j == 3) { 1338 i = -3; 1339 bL2R = TRUE; 1340 continue; 1341 } 1342 int end = m_TempCharList.GetSize() - 1; 1343 if(j < count) { 1344 end = order.GetAt(j) - 1; 1345 } 1346 i = j - 3; 1347 for(int n = end; n >= start; n--) { 1348 AddCharInfoByRLDirection(str, n); 1349 } 1350 } else { 1351 int end = start + count1 ; 1352 for(int n = start; n < end; n++) { 1353 AddCharInfoByLRDirection(str, n); 1354 } 1355 } 1356 } 1357 } 1358 order.RemoveAll(); 1359 m_TempCharList.RemoveAll(); 1360 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength()); 1361 BidiChar->Release(); 1362 } 1363 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj, const CFX_AffineMatrix& formMatrix, FX_POSITION ObjPos) 1364 { 1365 CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right, pTextObj->m_Top); 1366 if(FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f ) { 1367 return; 1368 } 1369 int count = m_LineObj.GetSize(); 1370 PDFTEXT_Obj Obj; 1371 Obj.m_pTextObj = pTextObj; 1372 Obj.m_formMatrix = formMatrix; 1373 if(count == 0) { 1374 m_LineObj.Add(Obj); 1375 return; 1376 } 1377 if (IsSameAsPreTextObject(pTextObj, ObjPos)) { 1378 return; 1379 } 1380 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1); 1381 CPDF_TextObjectItem item; 1382 int nItem = prev_Obj.m_pTextObj->CountItems(); 1383 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item); 1384 FX_FLOAT prev_width = GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) * prev_Obj.m_pTextObj->GetFontSize() / 1000; 1385 CFX_AffineMatrix prev_matrix; 1386 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix); 1387 prev_width = FXSYS_fabs(prev_width); 1388 prev_matrix.Concat(prev_Obj.m_formMatrix); 1389 prev_width = prev_matrix.TransformDistance(prev_width); 1390 pTextObj->GetItemInfo(0, &item); 1391 FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) * pTextObj->GetFontSize() / 1000; 1392 this_width = FXSYS_fabs(this_width); 1393 CFX_AffineMatrix this_matrix; 1394 pTextObj->GetTextMatrix(&this_matrix); 1395 this_width = FXSYS_fabs(this_width); 1396 this_matrix.Concat(formMatrix); 1397 this_width = this_matrix.TransformDistance(this_width); 1398 FX_FLOAT threshold = prev_width > this_width ? prev_width / 4 : this_width / 4; 1399 FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(), prev_y = prev_Obj.m_pTextObj->GetPosY(); 1400 prev_Obj.m_formMatrix.Transform(prev_x, prev_y); 1401 m_DisplayMatrix.Transform(prev_x, prev_y); 1402 FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY(); 1403 formMatrix.Transform(this_x, this_y); 1404 m_DisplayMatrix.Transform(this_x, this_y); 1405 if (FXSYS_fabs(this_y - prev_y) > threshold * 2) { 1406 for(int i = 0; i < count; i++) { 1407 ProcessTextObject(m_LineObj.GetAt(i)); 1408 } 1409 m_LineObj.RemoveAll(); 1410 m_LineObj.Add(Obj); 1411 return; 1412 } 1413 int i = 0; 1414 if(m_ParseOptions.m_bNormalizeObjs) { 1415 for(i = count - 1; i >= 0; i--) { 1416 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i); 1417 CFX_AffineMatrix prev_matrix; 1418 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix); 1419 FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(), Prev_y = prev_Obj.m_pTextObj->GetPosY(); 1420 prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y); 1421 m_DisplayMatrix.Transform(Prev_x, Prev_y); 1422 if(this_x >= Prev_x) { 1423 if(i == count - 1) { 1424 m_LineObj.Add(Obj); 1425 } else { 1426 m_LineObj.InsertAt(i + 1, Obj); 1427 } 1428 break; 1429 } 1430 } 1431 if(i < 0) { 1432 m_LineObj.InsertAt(0, Obj); 1433 } 1434 } else { 1435 m_LineObj.Add(Obj); 1436 } 1437 } 1438 FX_INT32 CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) 1439 { 1440 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1441 CPDF_ContentMarkData* pMarkData = (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject(); 1442 if(!pMarkData) { 1443 return FPDFTEXT_MC_PASS; 1444 } 1445 int nContentMark = pMarkData->CountItems(); 1446 if (nContentMark < 1) { 1447 return FPDFTEXT_MC_PASS; 1448 } 1449 CFX_WideString actText; 1450 FX_BOOL bExist = FALSE; 1451 CPDF_Dictionary* pDict = NULL; 1452 int n = 0; 1453 for (n = 0; n < nContentMark; n++) { 1454 CPDF_ContentMarkItem& item = pMarkData->GetItem(n); 1455 CFX_ByteString tagStr = (CFX_ByteString)item.GetName(); 1456 pDict = (CPDF_Dictionary*)item.GetParam(); 1457 CPDF_String* temp = (CPDF_String*)(pDict ? pDict->GetElement(FX_BSTRC("ActualText")) : NULL); 1458 if (temp) { 1459 bExist = TRUE; 1460 actText = temp->GetUnicodeText(); 1461 } 1462 } 1463 if (!bExist) { 1464 return FPDFTEXT_MC_PASS; 1465 } 1466 if (m_pPreTextObj) { 1467 if (CPDF_ContentMarkData* pPreMarkData = (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) { 1468 if (pPreMarkData->CountItems() == n) { 1469 CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1); 1470 if (pDict == item.GetParam()) { 1471 return FPDFTEXT_MC_DONE; 1472 } 1473 } 1474 } 1475 } 1476 CPDF_Font* pFont = pTextObj->GetFont(); 1477 FX_STRSIZE nItems = actText.GetLength(); 1478 if (nItems < 1) { 1479 return FPDFTEXT_MC_PASS; 1480 } 1481 bExist = FALSE; 1482 for (FX_STRSIZE i = 0; i < nItems; i++) { 1483 FX_WCHAR wChar = actText.GetAt(i); 1484 if (-1 == pFont->CharCodeFromUnicode(wChar)) { 1485 continue; 1486 } else { 1487 bExist = TRUE; 1488 break; 1489 } 1490 } 1491 if (!bExist) { 1492 return FPDFTEXT_MC_PASS; 1493 } 1494 bExist = FALSE; 1495 for (FX_STRSIZE i = 0; i < nItems; i++) { 1496 FX_WCHAR wChar = actText.GetAt(i); 1497 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) { 1498 bExist = TRUE; 1499 break; 1500 } 1501 } 1502 if (!bExist) { 1503 return FPDFTEXT_MC_DONE; 1504 } 1505 return FPDFTEXT_MC_DELAY; 1506 } 1507 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) 1508 { 1509 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1510 CPDF_ContentMarkData* pMarkData = (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject(); 1511 if(!pMarkData) { 1512 return; 1513 } 1514 int nContentMark = pMarkData->CountItems(); 1515 if (nContentMark < 1) { 1516 return; 1517 } 1518 CFX_WideString actText; 1519 CPDF_Dictionary* pDict = NULL; 1520 int n = 0; 1521 for (n = 0; n < nContentMark; n++) { 1522 CPDF_ContentMarkItem& item = pMarkData->GetItem(n); 1523 CFX_ByteString tagStr = (CFX_ByteString)item.GetName(); 1524 pDict = (CPDF_Dictionary*)item.GetParam(); 1525 CPDF_String* temp = (CPDF_String*)(pDict ? pDict->GetElement(FX_BSTRC("ActualText")) : NULL); 1526 if (temp) { 1527 actText = temp->GetUnicodeText(); 1528 } 1529 } 1530 FX_STRSIZE nItems = actText.GetLength(); 1531 if (nItems < 1) { 1532 return; 1533 } 1534 CPDF_Font* pFont = pTextObj->GetFont(); 1535 CFX_AffineMatrix formMatrix = Obj.m_formMatrix; 1536 CFX_AffineMatrix matrix; 1537 pTextObj->GetTextMatrix(&matrix); 1538 matrix.Concat(formMatrix); 1539 FX_FLOAT fPosX = pTextObj->GetPosX(); 1540 FX_FLOAT fPosY = pTextObj->GetPosY(); 1541 int nCharInfoIndex = m_TextBuf.GetLength(); 1542 CFX_FloatRect charBox; 1543 charBox.top = pTextObj->m_Top; 1544 charBox.left = pTextObj->m_Left; 1545 charBox.right = pTextObj->m_Right; 1546 charBox.bottom = pTextObj->m_Bottom; 1547 for (FX_STRSIZE k = 0; k < nItems; k++) { 1548 FX_WCHAR wChar = actText.GetAt(k); 1549 if (wChar <= 0x80 && !isprint(wChar)) { 1550 wChar = 0x20; 1551 } 1552 if (wChar >= 0xFFFD) { 1553 continue; 1554 } 1555 PAGECHAR_INFO charinfo; 1556 charinfo.m_OriginX = fPosX; 1557 charinfo.m_OriginY = fPosY; 1558 charinfo.m_Index = nCharInfoIndex; 1559 charinfo.m_Unicode = wChar; 1560 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar); 1561 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE; 1562 charinfo.m_pTextObj = pTextObj; 1563 charinfo.m_CharBox.top = charBox.top; 1564 charinfo.m_CharBox.left = charBox.left; 1565 charinfo.m_CharBox.right = charBox.right; 1566 charinfo.m_CharBox.bottom = charBox.bottom; 1567 charinfo.m_Matrix.Copy(matrix); 1568 m_TempTextBuf.AppendChar(wChar); 1569 m_TempCharList.Add(charinfo); 1570 } 1571 } 1572 void CPDF_TextPage::FindPreviousTextObject(void) 1573 { 1574 if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) { 1575 return; 1576 } 1577 PAGECHAR_INFO preChar; 1578 if (m_TempCharList.GetSize() >= 1) { 1579 preChar = *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1); 1580 } else { 1581 preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1); 1582 } 1583 if (preChar.m_pTextObj) { 1584 m_pPreTextObj = preChar.m_pTextObj; 1585 } 1586 } 1587 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) 1588 { 1589 CPDF_TextObject* pTextObj = Obj.m_pTextObj; 1590 if(FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f ) { 1591 return; 1592 } 1593 CFX_AffineMatrix formMatrix = Obj.m_formMatrix; 1594 CPDF_Font* pFont = pTextObj->GetFont(); 1595 CFX_AffineMatrix matrix; 1596 pTextObj->GetTextMatrix(&matrix); 1597 matrix.Concat(formMatrix); 1598 FX_INT32 bPreMKC = PreMarkedContent(Obj); 1599 if (FPDFTEXT_MC_DONE == bPreMKC) { 1600 m_pPreTextObj = pTextObj; 1601 m_perMatrix.Copy(formMatrix); 1602 return; 1603 } 1604 int result = 0; 1605 if (m_pPreTextObj) { 1606 result = ProcessInsertObject(pTextObj, formMatrix); 1607 if (2 == result) { 1608 m_CurlineRect = CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top); 1609 } else { 1610 m_CurlineRect.Union(CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top)); 1611 } 1612 PAGECHAR_INFO generateChar; 1613 if (result == 1) { 1614 if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) { 1615 if (!formMatrix.IsIdentity()) { 1616 generateChar.m_Matrix.Copy(formMatrix); 1617 } 1618 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR); 1619 m_TempCharList.Add(generateChar); 1620 } 1621 } else if(result == 2) { 1622 CloseTempLine(); 1623 if(m_TextBuf.GetSize()) { 1624 if(m_ParseOptions.m_bGetCharCodeOnly) { 1625 m_TextBuf.AppendChar(TEXT_RETURN_CHAR); 1626 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR); 1627 } else { 1628 if(GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) { 1629 m_TextBuf.AppendChar(TEXT_RETURN_CHAR); 1630 if (!formMatrix.IsIdentity()) { 1631 generateChar.m_Matrix.Copy(formMatrix); 1632 } 1633 m_charList.Add(generateChar); 1634 } 1635 if(GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) { 1636 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR); 1637 if (!formMatrix.IsIdentity()) { 1638 generateChar.m_Matrix.Copy(formMatrix); 1639 } 1640 m_charList.Add(generateChar); 1641 } 1642 } 1643 } 1644 } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) { 1645 FX_INT32 nChars = pTextObj->CountChars(); 1646 if (nChars == 1) { 1647 CPDF_TextObjectItem item; 1648 pTextObj->GetCharInfo(0, &item); 1649 CFX_WideString wstrItem = pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 1650 if(wstrItem.IsEmpty()) { 1651 wstrItem += (FX_WCHAR)item.m_CharCode; 1652 } 1653 FX_WCHAR curChar = wstrItem.GetAt(0); 1654 if (0x2D == curChar || 0xAD == curChar) { 1655 return; 1656 } 1657 } 1658 while (m_TempTextBuf.GetSize() > 0 && m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() - 1) == 0x20) { 1659 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1660 m_TempCharList.Delete(m_TempCharList.GetSize() - 1); 1661 } 1662 PAGECHAR_INFO* cha = (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1); 1663 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1664 cha->m_Unicode = 0x2; 1665 cha->m_Flag = FPDFTEXT_CHAR_HYPHEN; 1666 m_TempTextBuf.AppendChar(0xfffe); 1667 } 1668 } else { 1669 m_CurlineRect = CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top); 1670 } 1671 if (FPDFTEXT_MC_DELAY == bPreMKC) { 1672 ProcessMarkedContent(Obj); 1673 m_pPreTextObj = pTextObj; 1674 m_perMatrix.Copy(formMatrix); 1675 return; 1676 } 1677 m_pPreTextObj = pTextObj; 1678 m_perMatrix.Copy(formMatrix); 1679 int nItems = pTextObj->CountItems(); 1680 FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix); 1681 1682 const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems); 1683 const FX_BOOL bIsBidiAndMirrorInverse = 1684 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0; 1685 FX_INT32 iBufStartAppend = m_TempTextBuf.GetLength(); 1686 FX_INT32 iCharListStartAppend = m_TempCharList.GetSize(); 1687 1688 FX_FLOAT spacing = 0; 1689 for (int i = 0; i < nItems; i++) { 1690 CPDF_TextObjectItem item; 1691 PAGECHAR_INFO charinfo; 1692 charinfo.m_OriginX = 0; 1693 charinfo.m_OriginY = 0; 1694 pTextObj->GetItemInfo(i, &item); 1695 if (item.m_CharCode == (FX_DWORD) - 1) { 1696 CFX_WideString str = m_TempTextBuf.GetWideString(); 1697 if(str.IsEmpty()) { 1698 str = m_TextBuf.GetWideString(); 1699 } 1700 if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) { 1701 continue; 1702 } 1703 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1704 spacing = -fontsize_h * item.m_OriginX / 1000; 1705 continue; 1706 } 1707 FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace; 1708 if (charSpace > 0.001) { 1709 spacing += matrix.TransformDistance(charSpace); 1710 } else if(charSpace < -0.001) { 1711 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace)); 1712 } 1713 spacing -= baseSpace; 1714 if (spacing && i > 0) { 1715 int last_width = 0; 1716 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1717 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 1718 FX_FLOAT threshold = 0; 1719 if (space_charcode != -1) { 1720 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ; 1721 } 1722 if (threshold > fontsize_h / 3) { 1723 threshold = 0; 1724 } else { 1725 threshold /= 2; 1726 } 1727 if (threshold == 0) { 1728 threshold = fontsize_h; 1729 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); 1730 threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width; 1731 threshold = _NormalizeThreshold(threshold); 1732 threshold = fontsize_h * threshold / 1000; 1733 } 1734 if (threshold && (spacing && spacing >= threshold) ) { 1735 charinfo.m_Unicode = TEXT_BLANK_CHAR; 1736 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED; 1737 charinfo.m_pTextObj = pTextObj; 1738 charinfo.m_Index = m_TextBuf.GetLength(); 1739 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR); 1740 charinfo.m_CharCode = -1; 1741 charinfo.m_Matrix.Copy(formMatrix); 1742 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY); 1743 charinfo.m_CharBox = CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY); 1744 m_TempCharList.Add(charinfo); 1745 } 1746 if (item.m_CharCode == (FX_DWORD) - 1) { 1747 continue; 1748 } 1749 } 1750 spacing = 0; 1751 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode); 1752 FX_BOOL bNoUnicode = FALSE; 1753 FX_WCHAR wChar = wstrItem.GetAt(0); 1754 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) { 1755 if(wstrItem.IsEmpty()) { 1756 wstrItem += (FX_WCHAR)item.m_CharCode; 1757 } else { 1758 wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode); 1759 } 1760 bNoUnicode = TRUE; 1761 } 1762 charinfo.m_Index = -1; 1763 charinfo.m_CharCode = item.m_CharCode; 1764 if(bNoUnicode) { 1765 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE; 1766 } else { 1767 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL; 1768 } 1769 charinfo.m_pTextObj = pTextObj; 1770 charinfo.m_OriginX = 0, charinfo.m_OriginY = 0; 1771 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY); 1772 FX_RECT rect(0, 0, 0, 0); 1773 rect.Intersect(0, 0, 0, 0); 1774 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect); 1775 charinfo.m_CharBox.top = rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY; 1776 charinfo.m_CharBox.left = rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX; 1777 charinfo.m_CharBox.right = rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX; 1778 charinfo.m_CharBox.bottom = rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY; 1779 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) { 1780 charinfo.m_CharBox.top = charinfo.m_CharBox.bottom + pTextObj->GetFontSize(); 1781 } 1782 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) { 1783 charinfo.m_CharBox.right = charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode); 1784 } 1785 matrix.TransformRect(charinfo.m_CharBox); 1786 charinfo.m_Matrix.Copy(matrix); 1787 if (wstrItem.IsEmpty()) { 1788 charinfo.m_Unicode = 0; 1789 m_TempCharList.Add(charinfo); 1790 m_TempTextBuf.AppendChar(0xfffe); 1791 continue; 1792 } else { 1793 int nTotal = wstrItem.GetLength(); 1794 FX_BOOL bDel = FALSE; 1795 const int count = std::min(m_TempCharList.GetSize(), 7); 1796 FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance((FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize()); 1797 for (int n = m_TempCharList.GetSize(); 1798 n > m_TempCharList.GetSize() - count; 1799 n--) { 1800 PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(n - 1); 1801 if(charinfo1->m_CharCode == charinfo.m_CharCode && 1802 charinfo1->m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() && 1803 FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < threshold && 1804 FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < threshold) { 1805 bDel = TRUE; 1806 break; 1807 } 1808 } 1809 if(!bDel) { 1810 for (int nIndex = 0; nIndex < nTotal; nIndex++) { 1811 charinfo.m_Unicode = wstrItem.GetAt(nIndex); 1812 if (charinfo.m_Unicode) { 1813 charinfo.m_Index = m_TextBuf.GetLength(); 1814 m_TempTextBuf.AppendChar(charinfo.m_Unicode); 1815 } else { 1816 m_TempTextBuf.AppendChar(0xfffe); 1817 } 1818 m_TempCharList.Add(charinfo); 1819 } 1820 } else if(i == 0) { 1821 CFX_WideString str = m_TempTextBuf.GetWideString(); 1822 if (!str.IsEmpty() && str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) { 1823 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); 1824 m_TempCharList.Delete(m_TempCharList.GetSize() - 1); 1825 } 1826 } 1827 } 1828 } 1829 if (bIsBidiAndMirrorInverse) { 1830 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend); 1831 } 1832 } 1833 void CPDF_TextPage::SwapTempTextBuf(FX_INT32 iCharListStartAppend, 1834 FX_INT32 iBufStartAppend) 1835 { 1836 FX_INT32 i, j; 1837 i = iCharListStartAppend; 1838 j = m_TempCharList.GetSize() - 1; 1839 for (; i < j; i++, j--) { 1840 std::swap(m_TempCharList[i], m_TempCharList[j]); 1841 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index); 1842 } 1843 FX_WCHAR * pTempBuffer = m_TempTextBuf.GetBuffer(); 1844 i = iBufStartAppend; 1845 j = m_TempTextBuf.GetLength() - 1; 1846 for (; i < j; i++, j--) { 1847 std::swap(pTempBuffer[i], pTempBuffer[j]); 1848 } 1849 } 1850 FX_BOOL CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj, 1851 const CPDF_Font* pFont, 1852 int nItems) const 1853 { 1854 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); 1855 FX_INT32 nR2L = 0; 1856 FX_INT32 nL2R = 0; 1857 FX_INT32 start = 0, count = 0; 1858 CPDF_TextObjectItem item; 1859 for (FX_INT32 i = 0; i < nItems; i++) { 1860 pTextObj->GetItemInfo(i, &item); 1861 if (item.m_CharCode == (FX_DWORD)-1) { 1862 continue; 1863 } 1864 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode); 1865 FX_WCHAR wChar = wstrItem.GetAt(0); 1866 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) { 1867 wChar = (FX_WCHAR)item.m_CharCode; 1868 } 1869 if (!wChar) { 1870 continue; 1871 } 1872 if (BidiChar && BidiChar->AppendChar(wChar)) { 1873 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 1874 if (ret == 2) { 1875 nR2L++; 1876 } 1877 else if (ret == 1) { 1878 nL2R++; 1879 } 1880 } 1881 } 1882 if (BidiChar && BidiChar->EndChar()) { 1883 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 1884 if (ret == 2) { 1885 nR2L++; 1886 } 1887 else if (ret == 1) { 1888 nL2R++; 1889 } 1890 } 1891 if (BidiChar) 1892 BidiChar->Release(); 1893 return (nR2L > 0 && nR2L >= nL2R); 1894 } 1895 FX_INT32 CPDF_TextPage::GetTextObjectWritingMode(const CPDF_TextObject* pTextObj) 1896 { 1897 FX_INT32 nChars = pTextObj->CountChars(); 1898 if (nChars == 1) { 1899 return m_TextlineDir; 1900 } 1901 CPDF_TextObjectItem first, last; 1902 pTextObj->GetCharInfo(0, &first); 1903 pTextObj->GetCharInfo(nChars - 1, &last); 1904 CFX_Matrix textMatrix; 1905 pTextObj->GetTextMatrix(&textMatrix); 1906 textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY); 1907 textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY); 1908 FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX); 1909 FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY); 1910 if (dX <= 0.0001f && dY <= 0.0001f) { 1911 return -1; 1912 } 1913 CFX_VectorF v; 1914 v.Set(dX, dY); 1915 v.Normalize(); 1916 if (v.y <= 0.0872f) { 1917 if (v.x <= 0.0872f) { 1918 return m_TextlineDir; 1919 } 1920 return 0; 1921 } else if (v.x <= 0.0872f) { 1922 return 1; 1923 } 1924 return m_TextlineDir; 1925 } 1926 FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) 1927 { 1928 CFX_WideString strCurText = m_TempTextBuf.GetWideString(); 1929 if(strCurText.GetLength() == 0) { 1930 strCurText = m_TextBuf.GetWideString(); 1931 } 1932 FX_STRSIZE nCount = strCurText.GetLength(); 1933 int nIndex = nCount - 1; 1934 FX_WCHAR wcTmp = strCurText.GetAt(nIndex); 1935 while(wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) { 1936 wcTmp = strCurText.GetAt(--nIndex); 1937 } 1938 if (0x2D == wcTmp || 0xAD == wcTmp) { 1939 if (--nIndex > 0) { 1940 FX_WCHAR preChar = strCurText.GetAt((nIndex)); 1941 if (((preChar >= L'A' && preChar <= L'Z') || (preChar >= L'a' && preChar <= L'z')) 1942 && ((curChar >= L'A' && curChar <= L'Z') || (curChar >= L'a' && curChar <= L'z'))) { 1943 return TRUE; 1944 } 1945 } 1946 int size = m_TempCharList.GetSize(); 1947 PAGECHAR_INFO preChar; 1948 if (size) { 1949 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1]; 1950 } else { 1951 size = m_charList.GetSize(); 1952 if(size == 0) { 1953 return FALSE; 1954 } 1955 preChar = (PAGECHAR_INFO)m_charList[size - 1]; 1956 } 1957 if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag) 1958 if (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode) { 1959 return TRUE; 1960 } 1961 } 1962 return FALSE; 1963 } 1964 int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj, const CFX_AffineMatrix& formMatrix) 1965 { 1966 FindPreviousTextObject(); 1967 FX_BOOL bNewline = FALSE; 1968 int WritingMode = GetTextObjectWritingMode(pObj); 1969 if(WritingMode == -1) { 1970 WritingMode = GetTextObjectWritingMode(m_pPreTextObj); 1971 } 1972 CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right, pObj->m_Top); 1973 CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, m_pPreTextObj->m_Right, m_pPreTextObj->m_Top); 1974 CPDF_TextObjectItem PrevItem, item; 1975 int nItem = m_pPreTextObj->CountItems(); 1976 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem); 1977 pObj->GetItemInfo(0, &item); 1978 CFX_WideString wstrItem = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 1979 if(wstrItem.IsEmpty()) { 1980 wstrItem += (FX_WCHAR)item.m_CharCode; 1981 } 1982 FX_WCHAR curChar = wstrItem.GetAt(0); 1983 if(WritingMode == 0) { 1984 if(this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) { 1985 FX_FLOAT top = this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top; 1986 FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom : prev_rect.bottom; 1987 if(bottom >= top) { 1988 if(IsHyphen(curChar)) { 1989 return 3; 1990 } 1991 return 2; 1992 } 1993 } 1994 } else if (WritingMode == 1) { 1995 if(this_rect.Width() > pObj->GetFontSize() * 0.1f && prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) { 1996 FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left : m_CurlineRect.left; 1997 FX_FLOAT right = this_rect.right < m_CurlineRect.right ? this_rect.right : m_CurlineRect.right; 1998 if(right <= left) { 1999 if(IsHyphen(curChar)) { 2000 return 3; 2001 } 2002 return 2; 2003 } 2004 } 2005 } 2006 FX_FLOAT last_pos = PrevItem.m_OriginX; 2007 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont()); 2008 FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000; 2009 last_width = FXSYS_fabs(last_width); 2010 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont()); 2011 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000; 2012 this_width = FXSYS_fabs(this_width); 2013 FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4; 2014 CFX_AffineMatrix prev_matrix, prev_reverse; 2015 m_pPreTextObj->GetTextMatrix(&prev_matrix); 2016 prev_matrix.Concat(m_perMatrix); 2017 prev_reverse.SetReverse(prev_matrix); 2018 FX_FLOAT x = pObj->GetPosX(); 2019 FX_FLOAT y = pObj->GetPosY(); 2020 formMatrix.Transform(x, y); 2021 prev_reverse.Transform(x, y); 2022 if(last_width < this_width) { 2023 threshold = prev_reverse.TransformDistance(threshold); 2024 } 2025 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom, m_pPreTextObj->m_Right, pObj->m_Top); 2026 CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, m_pPreTextObj->m_Right, m_pPreTextObj->m_Top); 2027 CFX_FloatRect rect3 = rect1; 2028 rect1.Intersect(rect2); 2029 if (WritingMode == 0) { 2030 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) 2031 || ((y > threshold * 2 || y < threshold * -3) && (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) { 2032 bNewline = TRUE; 2033 if(nItem > 1 ) { 2034 CPDF_TextObjectItem tempItem; 2035 m_pPreTextObj->GetItemInfo(0, &tempItem); 2036 CFX_AffineMatrix m; 2037 m_pPreTextObj->GetTextMatrix(&m); 2038 if(PrevItem.m_OriginX > tempItem.m_OriginX && 2039 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 && 2040 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 2041 && m.b < 0.1 && m.c < 0.1 ) { 2042 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000, m_pPreTextObj->m_Top); 2043 if(re.Contains(pObj->GetPosX(), pObj->GetPosY())) { 2044 bNewline = FALSE; 2045 } else { 2046 CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top); 2047 if(re.Contains(m_pPreTextObj->GetPosX(), m_pPreTextObj->GetPosY())) { 2048 bNewline = FALSE; 2049 } 2050 } 2051 } 2052 } 2053 } 2054 } 2055 if(bNewline) { 2056 if(IsHyphen(curChar)) { 2057 return 3; 2058 } 2059 return 2; 2060 } 2061 FX_INT32 nChars = pObj->CountChars(); 2062 if (nChars == 1 && ( 0x2D == curChar || 0xAD == curChar)) 2063 if (IsHyphen(curChar)) { 2064 return 3; 2065 } 2066 CFX_WideString PrevStr = m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode); 2067 FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1); 2068 CFX_AffineMatrix matrix; 2069 pObj->GetTextMatrix(&matrix); 2070 matrix.Concat(formMatrix); 2071 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth); 2072 threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 : (threshold > 800 ? threshold / 6 : threshold / 5)) : (threshold / 2); 2073 if(nLastWidth >= nThisWidth) { 2074 threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize()); 2075 } else { 2076 threshold *= FXSYS_fabs(pObj->GetFontSize()); 2077 threshold = matrix.TransformDistance(threshold); 2078 threshold = prev_reverse.TransformDistance(threshold); 2079 } 2080 threshold /= 1000; 2081 if((threshold < 1.4881 && threshold > 1.4879) 2082 || (threshold < 1.39001 && threshold > 1.38999)) { 2083 threshold *= 1.5; 2084 } 2085 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ') 2086 if (curChar != L' ' && preChar != L' ') { 2087 if((x - last_pos - last_width) > threshold || (last_pos - x - last_width) > threshold) { 2088 return 1; 2089 } 2090 if(x < 0 && (last_pos - x - last_width) > threshold) { 2091 return 1; 2092 } 2093 if((x - last_pos - last_width) > this_width || (x - last_pos - this_width) > last_width ) { 2094 return 1; 2095 } 2096 } 2097 return 0; 2098 } 2099 FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2) 2100 { 2101 if (!pTextObj1 || !pTextObj2) { 2102 return FALSE; 2103 } 2104 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_Right, pTextObj2->m_Top); 2105 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_Right, pTextObj1->m_Top); 2106 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() && !m_ParseOptions.m_bGetCharCodeOnly) { 2107 FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left); 2108 int nCount = m_charList.GetSize(); 2109 if (nCount >= 2) { 2110 PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2]; 2111 FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width(); 2112 if (dbXdif > dbSpace) { 2113 return FALSE; 2114 } 2115 } 2116 } 2117 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) { 2118 rcPreObj.Intersect(rcCurObj); 2119 if (rcPreObj.IsEmpty()) { 2120 return FALSE; 2121 } 2122 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) { 2123 return FALSE; 2124 } 2125 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) { 2126 return FALSE; 2127 } 2128 } 2129 int nPreCount = pTextObj2->CountItems(); 2130 int nCurCount = pTextObj1->CountItems(); 2131 if (nPreCount != nCurCount) { 2132 return FALSE; 2133 } 2134 CPDF_TextObjectItem itemPer, itemCur; 2135 for (int i = 0; i < nPreCount; i++) { 2136 pTextObj2->GetItemInfo(i, &itemPer); 2137 pTextObj1->GetItemInfo(i, &itemCur); 2138 if (itemCur.m_CharCode != itemPer.m_CharCode) { 2139 return FALSE; 2140 } 2141 } 2142 if(FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) > GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont())*pTextObj2->GetFontSize() / 1000 * 0.9 || 2143 FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) > 2144 FX_MAX(FX_MAX(rcPreObj.Height() , rcPreObj.Width()), pTextObj2->GetFontSize()) / 8) { 2145 return FALSE; 2146 } 2147 return TRUE; 2148 } 2149 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos) 2150 { 2151 if (!pTextObj) { 2152 return FALSE; 2153 } 2154 int i = 0; 2155 if (!ObjPos) { 2156 ObjPos = m_pPage->GetLastObjectPosition(); 2157 } 2158 CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos); 2159 while (i < 5 && ObjPos) { 2160 pObj = m_pPage->GetPrevObject(ObjPos); 2161 if(pObj == pTextObj) { 2162 continue; 2163 } 2164 if(pObj->m_Type != PDFPAGE_TEXT) { 2165 continue; 2166 } 2167 if(IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) { 2168 return TRUE; 2169 } 2170 i++; 2171 } 2172 return FALSE; 2173 } 2174 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) 2175 { 2176 int size = m_TempCharList.GetSize(); 2177 PAGECHAR_INFO preChar; 2178 if (size) { 2179 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1]; 2180 } else { 2181 size = m_charList.GetSize(); 2182 if(size == 0) { 2183 return FALSE; 2184 } 2185 preChar = (PAGECHAR_INFO)m_charList[size - 1]; 2186 } 2187 info.m_Index = m_TextBuf.GetLength(); 2188 info.m_Unicode = unicode; 2189 info.m_pTextObj = NULL; 2190 info.m_CharCode = -1; 2191 info.m_Flag = FPDFTEXT_CHAR_GENERATED; 2192 int preWidth = 0; 2193 if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD) - 1) { 2194 preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont()); 2195 } 2196 FX_FLOAT fs = 0; 2197 if(preChar.m_pTextObj) { 2198 fs = preChar.m_pTextObj->GetFontSize(); 2199 } else { 2200 fs = preChar.m_CharBox.Height(); 2201 } 2202 if(!fs) { 2203 fs = 1; 2204 } 2205 info.m_OriginX = preChar.m_OriginX + preWidth * (fs) / 1000; 2206 info.m_OriginY = preChar.m_OriginY; 2207 info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX, info.m_OriginY); 2208 return TRUE; 2209 } 2210 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2) 2211 { 2212 CFX_FloatRect rect = rect1; 2213 rect.Intersect(rect2); 2214 return !rect.IsEmpty(); 2215 } 2216 FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) 2217 { 2218 if (unicode < L'A') { 2219 return FALSE; 2220 } 2221 if (unicode > L'Z' && unicode < L'a') { 2222 return FALSE; 2223 } 2224 if (unicode > L'z') { 2225 return FALSE; 2226 } 2227 return TRUE; 2228 } 2229 CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage) 2230 : m_pTextPage(pTextPage), 2231 m_flags(0), 2232 m_findNextStart(-1), 2233 m_findPreStart(-1), 2234 m_bMatchCase(FALSE), 2235 m_bMatchWholeWord(FALSE), 2236 m_resStart(0), 2237 m_resEnd(-1), 2238 m_IsFind(FALSE) 2239 { 2240 m_strText = m_pTextPage->GetPageText(); 2241 int nCount = pTextPage->CountChars(); 2242 if(nCount) { 2243 m_CharIndex.Add(0); 2244 } 2245 for(int i = 0; i < nCount; i++) { 2246 FPDF_CHAR_INFO info; 2247 pTextPage->GetCharInfo(i, info); 2248 int indexSize = m_CharIndex.GetSize(); 2249 if(info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) { 2250 if(indexSize % 2) { 2251 m_CharIndex.Add(1); 2252 } else { 2253 if(indexSize <= 0) { 2254 continue; 2255 } 2256 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1); 2257 } 2258 } else { 2259 if(indexSize % 2) { 2260 if(indexSize <= 0) { 2261 continue; 2262 } 2263 m_CharIndex.SetAt(indexSize - 1, i + 1); 2264 } else { 2265 m_CharIndex.Add(i + 1); 2266 } 2267 } 2268 } 2269 int indexSize = m_CharIndex.GetSize(); 2270 if(indexSize % 2) { 2271 m_CharIndex.RemoveAt(indexSize - 1); 2272 } 2273 } 2274 int CPDF_TextPageFind::GetCharIndex(int index) const 2275 { 2276 return m_pTextPage->CharIndexFromTextIndex(index); 2277 int indexSize = m_CharIndex.GetSize(); 2278 int count = 0; 2279 for(int i = 0; i < indexSize; i += 2) { 2280 count += m_CharIndex.GetAt(i + 1); 2281 if(count > index) { 2282 return index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i); 2283 } 2284 } 2285 return -1; 2286 } 2287 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, int flags, int startPos) 2288 { 2289 if (!m_pTextPage) { 2290 return FALSE; 2291 } 2292 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) { 2293 m_strText = m_pTextPage->GetPageText(); 2294 } 2295 CFX_WideString findwhatStr = findwhat; 2296 m_findWhat = findwhatStr; 2297 m_flags = flags; 2298 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; 2299 if (m_strText.IsEmpty()) { 2300 m_IsFind = FALSE; 2301 return TRUE; 2302 } 2303 FX_STRSIZE len = findwhatStr.GetLength(); 2304 if (!m_bMatchCase) { 2305 findwhatStr.MakeLower(); 2306 m_strText.MakeLower(); 2307 } 2308 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD; 2309 m_findNextStart = startPos; 2310 if (startPos == -1) { 2311 m_findPreStart = m_strText.GetLength() - 1; 2312 } else { 2313 m_findPreStart = startPos; 2314 } 2315 m_csFindWhatArray.RemoveAll(); 2316 int i = 0; 2317 while(i < len) { 2318 if(findwhatStr.GetAt(i) != ' ') { 2319 break; 2320 } 2321 i++; 2322 } 2323 if(i < len) { 2324 ExtractFindWhat(findwhatStr); 2325 } else { 2326 m_csFindWhatArray.Add(findwhatStr); 2327 } 2328 if(m_csFindWhatArray.GetSize() <= 0) { 2329 return FALSE; 2330 } 2331 m_IsFind = TRUE; 2332 m_resStart = 0; 2333 m_resEnd = -1; 2334 return TRUE; 2335 } 2336 FX_BOOL CPDF_TextPageFind::FindNext() 2337 { 2338 if (!m_pTextPage) { 2339 return FALSE; 2340 } 2341 m_resArray.RemoveAll(); 2342 if(m_findNextStart == -1) { 2343 return FALSE; 2344 } 2345 if(m_strText.IsEmpty()) { 2346 m_IsFind = FALSE; 2347 return m_IsFind; 2348 } 2349 int strLen = m_strText.GetLength(); 2350 if (m_findNextStart > strLen - 1) { 2351 m_IsFind = FALSE; 2352 return m_IsFind; 2353 } 2354 int nCount = m_csFindWhatArray.GetSize(); 2355 int nResultPos = 0; 2356 int nStartPos = 0; 2357 nStartPos = m_findNextStart; 2358 FX_BOOL bSpaceStart = FALSE; 2359 for(int iWord = 0; iWord < nCount; iWord++) { 2360 CFX_WideString csWord = m_csFindWhatArray[iWord]; 2361 if(csWord.IsEmpty()) { 2362 if(iWord == nCount - 1) { 2363 FX_WCHAR strInsert = m_strText.GetAt(nStartPos); 2364 if(strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR || strInsert == TEXT_RETURN_CHAR || strInsert == 160) { 2365 nResultPos = nStartPos + 1; 2366 break; 2367 } 2368 iWord = -1; 2369 } else if(iWord == 0) { 2370 bSpaceStart = TRUE; 2371 } 2372 continue; 2373 } 2374 int endIndex; 2375 nResultPos = m_strText.Find(csWord.c_str(), nStartPos); 2376 if (nResultPos == -1) { 2377 m_IsFind = FALSE; 2378 return m_IsFind; 2379 } 2380 endIndex = nResultPos + csWord.GetLength() - 1; 2381 if(iWord == 0) { 2382 m_resStart = nResultPos; 2383 } 2384 FX_BOOL bMatch = TRUE; 2385 if(iWord != 0 && !bSpaceStart) { 2386 int PreResEndPos = nStartPos; 2387 int curChar = csWord.GetAt(0); 2388 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; 2389 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); 2390 if(nStartPos == nResultPos && !(_IsIgnoreSpaceCharacter(lastChar) || _IsIgnoreSpaceCharacter(curChar))) { 2391 bMatch = FALSE; 2392 } 2393 for(int d = PreResEndPos; d < nResultPos; d++) { 2394 FX_WCHAR strInsert = m_strText.GetAt(d); 2395 if(strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 2396 bMatch = FALSE; 2397 break; 2398 } 2399 } 2400 } else if(bSpaceStart) { 2401 if(nResultPos > 0) { 2402 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); 2403 if(strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) { 2404 bMatch = FALSE; 2405 m_resStart = nResultPos; 2406 } else { 2407 m_resStart = nResultPos - 1; 2408 } 2409 } 2410 } 2411 if(m_bMatchWholeWord && bMatch) { 2412 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); 2413 } 2414 nStartPos = endIndex + 1; 2415 if(!bMatch) { 2416 iWord = -1; 2417 if(bSpaceStart) { 2418 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); 2419 } else { 2420 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); 2421 } 2422 } 2423 } 2424 m_resEnd = nResultPos + m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1; 2425 m_IsFind = TRUE; 2426 int resStart = GetCharIndex(m_resStart); 2427 int resEnd = GetCharIndex(m_resEnd); 2428 m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray); 2429 if(m_flags & FPDFTEXT_CONSECUTIVE) { 2430 m_findNextStart = m_resStart + 1; 2431 m_findPreStart = m_resEnd - 1; 2432 } else { 2433 m_findNextStart = m_resEnd + 1; 2434 m_findPreStart = m_resStart - 1; 2435 } 2436 return m_IsFind; 2437 } 2438 FX_BOOL CPDF_TextPageFind::FindPrev() 2439 { 2440 if (!m_pTextPage) { 2441 return FALSE; 2442 } 2443 m_resArray.RemoveAll(); 2444 if(m_strText.IsEmpty() || m_findPreStart < 0) { 2445 m_IsFind = FALSE; 2446 return m_IsFind; 2447 } 2448 CPDF_TextPageFind findEngine(m_pTextPage); 2449 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags); 2450 if(!ret) { 2451 m_IsFind = FALSE; 2452 return m_IsFind; 2453 } 2454 int order = -1, MatchedCount = 0; 2455 while(ret) { 2456 ret = findEngine.FindNext(); 2457 if(ret) { 2458 int order1 = findEngine.GetCurOrder() ; 2459 int MatchedCount1 = findEngine.GetMatchedCount(); 2460 if(((order1 + MatchedCount1) - 1) > m_findPreStart) { 2461 break; 2462 } 2463 order = order1; 2464 MatchedCount = MatchedCount1; 2465 } 2466 } 2467 if(order == -1) { 2468 m_IsFind = FALSE; 2469 return m_IsFind; 2470 } 2471 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); 2472 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); 2473 m_IsFind = TRUE; 2474 m_pTextPage->GetRectArray(order, MatchedCount, m_resArray); 2475 if(m_flags & FPDFTEXT_CONSECUTIVE) { 2476 m_findNextStart = m_resStart + 1; 2477 m_findPreStart = m_resEnd - 1; 2478 } else { 2479 m_findNextStart = m_resEnd + 1; 2480 m_findPreStart = m_resStart - 1; 2481 } 2482 return m_IsFind; 2483 } 2484 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) 2485 { 2486 if(findwhat.IsEmpty()) { 2487 return ; 2488 } 2489 int index = 0; 2490 while(1) { 2491 CFX_WideString csWord = TEXT_EMPTY; 2492 int ret = ExtractSubString(csWord, findwhat.c_str(), index, TEXT_BLANK_CHAR); 2493 if(csWord.IsEmpty()) { 2494 if(ret) { 2495 m_csFindWhatArray.Add(CFX_WideString(L"")); 2496 index++; 2497 continue; 2498 } else { 2499 break; 2500 } 2501 } 2502 int pos = 0; 2503 while(pos < csWord.GetLength()) { 2504 CFX_WideString curStr = csWord.Mid(pos, 1); 2505 FX_WCHAR curChar = csWord.GetAt(pos); 2506 if (_IsIgnoreSpaceCharacter(curChar)) { 2507 if (pos > 0 && curChar == 0x2019) { 2508 pos++; 2509 continue; 2510 } 2511 if (pos > 0 ) { 2512 CFX_WideString preStr = csWord.Mid(0, pos); 2513 m_csFindWhatArray.Add(preStr); 2514 } 2515 m_csFindWhatArray.Add(curStr); 2516 if (pos == csWord.GetLength() - 1) { 2517 csWord.Empty(); 2518 break; 2519 } 2520 csWord = csWord.Right(csWord.GetLength() - pos - 1); 2521 pos = 0; 2522 continue; 2523 } 2524 pos++; 2525 } 2526 if (!csWord.IsEmpty()) { 2527 m_csFindWhatArray.Add(csWord); 2528 } 2529 index++; 2530 } 2531 } 2532 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, int startPos, int endPos) 2533 { 2534 int char_left = 0; 2535 int char_right = 0; 2536 int char_count = endPos - startPos + 1; 2537 if(char_count < 1) { 2538 return FALSE; 2539 } 2540 if (char_count == 1 && csPageText.GetAt(startPos) > 255) { 2541 return TRUE; 2542 } 2543 if(startPos - 1 >= 0 ) { 2544 char_left = csPageText.GetAt(startPos - 1); 2545 } 2546 if(startPos + char_count < csPageText.GetLength()) { 2547 char_right = csPageText.GetAt(startPos + char_count); 2548 } 2549 if ((char_left > 'A' && char_left < 'a') || (char_left > 'a' && char_left < 'z') || (char_left > 0xfb00 && char_left < 0xfb06) || (char_left >= '0' && char_left <= '9') || 2550 (char_right > 'A' && char_right < 'a') || (char_right > 'a' && char_right < 'z') || (char_right > 0xfb00 && char_right < 0xfb06) || (char_right >= '0' && char_right <= '9')) { 2551 return FALSE; 2552 } 2553 if(!(('A' > char_left || char_left > 'Z') && ('a' > char_left || char_left > 'z') 2554 && ('A' > char_right || char_right > 'Z') && ('a' > char_right || char_right > 'z'))) { 2555 return FALSE; 2556 } 2557 if (char_count > 0) { 2558 if (csPageText.GetAt(startPos) >= L'0' && csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && char_left <= L'9') { 2559 return FALSE; 2560 } 2561 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && char_right >= L'0' && char_right <= L'9') { 2562 return FALSE; 2563 } 2564 } 2565 return TRUE; 2566 } 2567 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString, 2568 int iSubString, FX_WCHAR chSep) 2569 { 2570 if (lpszFullString == NULL) { 2571 return FALSE; 2572 } 2573 while (iSubString--) { 2574 lpszFullString = FXSYS_wcschr(lpszFullString, chSep); 2575 if (lpszFullString == NULL) { 2576 rString.Empty(); 2577 return FALSE; 2578 } 2579 lpszFullString++; 2580 while(*lpszFullString == chSep) { 2581 lpszFullString++; 2582 } 2583 } 2584 FX_LPCWSTR lpchEnd = FXSYS_wcschr(lpszFullString, chSep); 2585 int nLen = (lpchEnd == NULL) ? 2586 (int)FXSYS_wcslen(lpszFullString) : (int)(lpchEnd - lpszFullString); 2587 ASSERT(nLen >= 0); 2588 FXSYS_memcpy32(rString.GetBuffer(nLen), lpszFullString, nLen * sizeof(FX_WCHAR)); 2589 rString.ReleaseBuffer(); 2590 return TRUE; 2591 } 2592 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) 2593 { 2594 CFX_WideString str2; 2595 str2.Empty(); 2596 int nlen = str.GetLength(); 2597 for(int i = nlen - 1; i >= 0; i--) { 2598 str2 += str.GetAt(i); 2599 } 2600 return str2; 2601 } 2602 void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const 2603 { 2604 rects.Copy(m_resArray); 2605 } 2606 int CPDF_TextPageFind::GetCurOrder() const 2607 { 2608 return GetCharIndex(m_resStart); 2609 } 2610 int CPDF_TextPageFind::GetMatchedCount()const 2611 { 2612 int resStart = GetCharIndex(m_resStart); 2613 int resEnd = GetCharIndex(m_resEnd); 2614 return resEnd - resStart + 1; 2615 } 2616 CPDF_LinkExtract::CPDF_LinkExtract() 2617 : m_pTextPage(NULL), 2618 m_IsParserd(FALSE) 2619 { 2620 } 2621 CPDF_LinkExtract::~CPDF_LinkExtract() 2622 { 2623 DeleteLinkList(); 2624 } 2625 FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) 2626 { 2627 if (!pTextPage || !pTextPage->IsParsered()) { 2628 return FALSE; 2629 } 2630 m_pTextPage = (const CPDF_TextPage*)pTextPage; 2631 m_strPageText = m_pTextPage->GetPageText(0, -1); 2632 DeleteLinkList(); 2633 if (m_strPageText.IsEmpty()) { 2634 return FALSE; 2635 } 2636 parserLink(); 2637 m_IsParserd = TRUE; 2638 return TRUE; 2639 } 2640 void CPDF_LinkExtract::DeleteLinkList() 2641 { 2642 while (m_LinkList.GetSize()) { 2643 CPDF_LinkExt* linkinfo = NULL; 2644 linkinfo = m_LinkList.GetAt(0); 2645 m_LinkList.RemoveAt(0); 2646 delete linkinfo; 2647 } 2648 m_LinkList.RemoveAll(); 2649 } 2650 int CPDF_LinkExtract::CountLinks() const 2651 { 2652 if (!m_IsParserd) { 2653 return -1; 2654 } 2655 return m_LinkList.GetSize(); 2656 } 2657 void CPDF_LinkExtract::parserLink() 2658 { 2659 int start = 0, pos = 0; 2660 int TotalChar = m_pTextPage->CountChars(); 2661 while (pos < TotalChar) { 2662 FPDF_CHAR_INFO pageChar; 2663 m_pTextPage->GetCharInfo(pos, pageChar); 2664 if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { 2665 int nCount = pos - start; 2666 if(pos == TotalChar - 1) { 2667 nCount++; 2668 } 2669 CFX_WideString strBeCheck; 2670 strBeCheck = m_pTextPage->GetPageText(start, nCount); 2671 if (strBeCheck.GetLength() > 5) { 2672 while(strBeCheck.GetLength() > 0) { 2673 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); 2674 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { 2675 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); 2676 nCount--; 2677 } else { 2678 break; 2679 } 2680 } 2681 if (nCount > 5 && (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { 2682 if (!AppendToLinkList(start, nCount, strBeCheck)) { 2683 break; 2684 } 2685 } 2686 } 2687 start = ++pos; 2688 } else { 2689 pos++; 2690 } 2691 } 2692 } 2693 FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) 2694 { 2695 CFX_WideString str = strBeCheck; 2696 str.MakeLower(); 2697 if (str.Find(L"http://www.") != -1) { 2698 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); 2699 return TRUE; 2700 } else if (str.Find(L"http://") != -1) { 2701 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); 2702 return TRUE; 2703 } else if (str.Find(L"https://www.") != -1) { 2704 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); 2705 return TRUE; 2706 } else if (str.Find(L"https://") != -1) { 2707 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); 2708 return TRUE; 2709 } else if (str.Find(L"www.") != -1) { 2710 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 2711 strBeCheck = L"http://" + strBeCheck; 2712 return TRUE; 2713 } else { 2714 return FALSE; 2715 } 2716 } 2717 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) 2718 { 2719 str.MakeLower(); 2720 int aPos = str.Find(L'@'); 2721 if (aPos < 1) { 2722 return FALSE; 2723 } 2724 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') { 2725 return FALSE; 2726 } 2727 int i; 2728 for (i = aPos - 1; i >= 0; i--) { 2729 FX_WCHAR ch = str.GetAt(i); 2730 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || (ch >= L'0' && ch <= L'9')) { 2731 continue; 2732 } else { 2733 if (i == aPos - 1) { 2734 return FALSE; 2735 } 2736 str = str.Right(str.GetLength() - i - 1); 2737 break; 2738 } 2739 } 2740 aPos = str.Find(L'@'); 2741 if (aPos < 1) { 2742 return FALSE; 2743 } 2744 CFX_WideString strtemp = L""; 2745 for (i = 0; i < aPos; i++) { 2746 FX_WCHAR wch = str.GetAt(i); 2747 if (wch >= L'a' && wch <= L'z') { 2748 break; 2749 } else { 2750 strtemp = str.Right(str.GetLength() - i + 1); 2751 } 2752 } 2753 if (strtemp != L"") { 2754 str = strtemp; 2755 } 2756 aPos = str.Find(L'@'); 2757 if (aPos < 1) { 2758 return FALSE; 2759 } 2760 str.TrimRight(L'.'); 2761 strtemp = str; 2762 int ePos = str.Find(L'.'); 2763 if (ePos == -1) { 2764 return FALSE; 2765 } 2766 while (ePos != -1) { 2767 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1); 2768 ePos = strtemp.Find('.'); 2769 } 2770 ePos = strtemp.GetLength(); 2771 for (i = 0; i < ePos; i++) { 2772 FX_WCHAR wch = str.GetAt(i); 2773 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { 2774 continue; 2775 } else { 2776 str = str.Left(str.GetLength() - ePos + i + 1); 2777 ePos = ePos - i - 1; 2778 break; 2779 } 2780 } 2781 int nLen = str.GetLength(); 2782 for (i = aPos + 1; i < nLen - ePos; i++) { 2783 FX_WCHAR wch = str.GetAt(i); 2784 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { 2785 continue; 2786 } else { 2787 return FALSE; 2788 } 2789 } 2790 if (str.Find(L"mailto:") == -1) { 2791 str = L"mailto:" + str; 2792 } 2793 return TRUE; 2794 } 2795 FX_BOOL CPDF_LinkExtract::AppendToLinkList(int start, int count, const CFX_WideString& strUrl) 2796 { 2797 CPDF_LinkExt* linkInfo = new CPDF_LinkExt; 2798 linkInfo->m_strUrl = strUrl; 2799 linkInfo->m_Start = start; 2800 linkInfo->m_Count = count; 2801 m_LinkList.Add(linkInfo); 2802 return TRUE; 2803 } 2804 CFX_WideString CPDF_LinkExtract::GetURL(int index) const 2805 { 2806 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) { 2807 return L""; 2808 } 2809 CPDF_LinkExt* link = NULL; 2810 link = m_LinkList.GetAt(index); 2811 if (!link) { 2812 return L""; 2813 } 2814 return link->m_strUrl; 2815 } 2816 void CPDF_LinkExtract::GetBoundedSegment(int index, int& start, int& count) const 2817 { 2818 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) { 2819 return ; 2820 } 2821 CPDF_LinkExt* link = NULL; 2822 link = m_LinkList.GetAt(index); 2823 if (!link) { 2824 return ; 2825 } 2826 start = link->m_Start; 2827 count = link->m_Count; 2828 } 2829 void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const 2830 { 2831 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) { 2832 return; 2833 } 2834 CPDF_LinkExt* link = NULL; 2835 link = m_LinkList.GetAt(index); 2836 if (!link) { 2837 return ; 2838 } 2839 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); 2840 } 2841