1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include <cctype> 8 #include <cwctype> 9 #include <memory> 10 11 #include "core/include/fpdfapi/fpdf_page.h" 12 #include "core/include/fpdfapi/fpdf_pageobj.h" 13 #include "core/include/fpdfapi/fpdf_resource.h" 14 #include "core/include/fpdftext/fpdf_text.h" 15 #include "core/include/fxcrt/fx_bidi.h" 16 #include "core/include/fxcrt/fx_ucd.h" 17 #include "text_int.h" 18 #include "txtproc.h" 19 20 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, 21 int destcp, 22 const FX_CHAR* defchar) { 23 if (destcp == 0) { 24 if (unicode < 0x80) { 25 return CFX_ByteString((char)unicode); 26 } 27 const FX_CHAR* altstr = FCS_GetAltStr(unicode); 28 return CFX_ByteString(altstr ? altstr : defchar); 29 } 30 char buf[10]; 31 int iDef = 0; 32 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, 33 NULL, &iDef); 34 if (ret && !iDef) { 35 return CFX_ByteString(buf, ret); 36 } 37 const FX_CHAR* altstr = FCS_GetAltStr(unicode); 38 return CFX_ByteString(altstr ? altstr : defchar); 39 } 40 CTextPage::CTextPage() {} 41 CTextPage::~CTextPage() { 42 int i; 43 for (i = 0; i < m_BaseLines.GetSize(); i++) { 44 delete m_BaseLines.GetAt(i); 45 } 46 for (i = 0; i < m_TextColumns.GetSize(); i++) { 47 delete m_TextColumns.GetAt(i); 48 } 49 } 50 void CTextPage::ProcessObject(CPDF_PageObject* pObject) { 51 if (pObject->m_Type != PDFPAGE_TEXT) { 52 return; 53 } 54 CPDF_TextObject* pText = (CPDF_TextObject*)pObject; 55 CPDF_Font* pFont = pText->m_TextState.GetFont(); 56 int count = pText->CountItems(); 57 FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2); 58 pText->CalcCharPos(pPosArray); 59 60 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); 61 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); 62 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 63 FX_FLOAT spacew = 0; 64 if (space_charcode != -1) { 65 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; 66 } 67 if (spacew == 0) { 68 spacew = fontsize_h / 4; 69 } 70 if (pText->m_TextState.GetBaselineAngle() != 0) { 71 int cc = 0; 72 CFX_Matrix matrix; 73 pText->GetTextMatrix(&matrix); 74 for (int i = 0; i < pText->m_nChars; i++) { 75 FX_DWORD charcode = pText->m_nChars == 1 76 ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes 77 : pText->m_pCharCodes[i]; 78 if (charcode == (FX_DWORD)-1) { 79 continue; 80 } 81 FX_RECT char_box; 82 pFont->GetCharBBox(charcode, char_box); 83 FX_FLOAT char_left = 84 pPosArray ? pPosArray[cc * 2] 85 : char_box.left * pText->m_TextState.GetFontSize() / 1000; 86 FX_FLOAT char_right = 87 pPosArray ? pPosArray[cc * 2 + 1] 88 : char_box.right * pText->m_TextState.GetFontSize() / 1000; 89 FX_FLOAT char_top = 90 char_box.top * pText->m_TextState.GetFontSize() / 1000; 91 FX_FLOAT char_bottom = 92 char_box.bottom * pText->m_TextState.GetFontSize() / 1000; 93 cc++; 94 FX_FLOAT char_origx, char_origy; 95 matrix.Transform(char_left, 0, char_origx, char_origy); 96 matrix.TransformRect(char_left, char_right, char_top, char_bottom); 97 CFX_ByteString str; 98 pFont->AppendChar(str, charcode); 99 InsertTextBox(NULL, char_origy, char_left, char_right, char_top, 100 char_bottom, spacew, fontsize_v, str, pFont); 101 } 102 FX_Free(pPosArray); 103 return; 104 } 105 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); 106 for (int ii = 0; ii < count * 2; ii++) { 107 pPosArray[ii] *= ratio_h; 108 } 109 FX_FLOAT baseline = pText->m_PosY; 110 CTextBaseLine* pBaseLine = NULL; 111 FX_FLOAT topy = pText->m_Top; 112 FX_FLOAT bottomy = pText->m_Bottom; 113 FX_FLOAT leftx = pText->m_Left; 114 int cc = 0; 115 CFX_ByteString segment; 116 int space_count = 0; 117 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; 118 for (int i = 0; i < pText->m_nChars; i++) { 119 FX_DWORD charcode = pText->m_nChars == 1 120 ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes 121 : pText->m_pCharCodes[i]; 122 if (charcode == (FX_DWORD)-1) { 123 continue; 124 } 125 FX_FLOAT char_left = pPosArray[cc * 2]; 126 FX_FLOAT char_right = pPosArray[cc * 2 + 1]; 127 cc++; 128 if (char_left < last_left || (char_left - last_right) > spacew / 2) { 129 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, 130 leftx + segment_right, topy, bottomy, spacew, 131 fontsize_v, segment, pFont); 132 segment_left = char_left; 133 segment = ""; 134 } 135 if (space_count > 1) { 136 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, 137 leftx + segment_right, topy, bottomy, spacew, 138 fontsize_v, segment, pFont); 139 segment = ""; 140 } else if (space_count == 1) { 141 pFont->AppendChar(segment, ' '); 142 } 143 if (segment.GetLength() == 0) { 144 segment_left = char_left; 145 } 146 segment_right = char_right; 147 pFont->AppendChar(segment, charcode); 148 space_count = 0; 149 last_left = char_left; 150 last_right = char_right; 151 } 152 if (segment.GetLength()) 153 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, 154 leftx + segment_right, topy, bottomy, spacew, 155 fontsize_v, segment, pFont); 156 FX_Free(pPosArray); 157 } 158 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, 159 FX_FLOAT basey, 160 FX_FLOAT leftx, 161 FX_FLOAT rightx, 162 FX_FLOAT topy, 163 FX_FLOAT bottomy, 164 FX_FLOAT spacew, 165 FX_FLOAT fontsize_v, 166 CFX_ByteString& str, 167 CPDF_Font* pFont) { 168 if (str.GetLength() == 0) { 169 return NULL; 170 } 171 if (!pBaseLine) { 172 int i; 173 for (i = 0; i < m_BaseLines.GetSize(); i++) { 174 CTextBaseLine* pExistLine = m_BaseLines.GetAt(i); 175 if (pExistLine->m_BaseLine == basey) { 176 pBaseLine = pExistLine; 177 break; 178 } 179 if (pExistLine->m_BaseLine < basey) { 180 break; 181 } 182 } 183 if (!pBaseLine) { 184 pBaseLine = new CTextBaseLine; 185 pBaseLine->m_BaseLine = basey; 186 m_BaseLines.InsertAt(i, pBaseLine); 187 } 188 } 189 CFX_WideString text; 190 const FX_CHAR* pStr = str; 191 int len = str.GetLength(), offset = 0; 192 while (offset < len) { 193 FX_DWORD ch = pFont->GetNextChar(pStr, len, offset); 194 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); 195 if (unicode_str.IsEmpty()) { 196 text += (FX_WCHAR)ch; 197 } else { 198 text += unicode_str; 199 } 200 } 201 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, 202 text); 203 return pBaseLine; 204 } 205 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) { 206 FX_FLOAT lastheight = -1; 207 FX_FLOAT lastbaseline = -1; 208 FX_FLOAT MinLeftX = 1000000; 209 FX_FLOAT MaxRightX = 0; 210 int i; 211 for (i = 0; i < m_BaseLines.GetSize(); i++) { 212 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); 213 FX_FLOAT leftx, rightx; 214 if (pBaseLine->GetWidth(leftx, rightx)) { 215 if (leftx < MinLeftX) { 216 MinLeftX = leftx; 217 } 218 if (rightx > MaxRightX) { 219 MaxRightX = rightx; 220 } 221 } 222 } 223 for (i = 0; i < m_BaseLines.GetSize(); i++) { 224 m_BaseLines.GetAt(i)->MergeBoxes(); 225 } 226 for (i = 1; i < m_BaseLines.GetSize(); i++) { 227 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); 228 CTextBaseLine* pPrevLine = m_BaseLines.GetAt(i - 1); 229 if (pBaseLine->CanMerge(pPrevLine)) { 230 pPrevLine->Merge(pBaseLine); 231 delete pBaseLine; 232 m_BaseLines.RemoveAt(i); 233 i--; 234 } 235 } 236 if (m_bAutoWidth) { 237 int* widths = FX_Alloc(int, m_BaseLines.GetSize()); 238 for (i = 0; i < m_BaseLines.GetSize(); i++) { 239 widths[i] = 0; 240 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); 241 int TotalChars = 0; 242 FX_FLOAT TotalWidth = 0; 243 int minchars; 244 pBaseLine->CountChars(TotalChars, TotalWidth, minchars); 245 if (TotalChars) { 246 FX_FLOAT charwidth = TotalWidth / TotalChars; 247 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); 248 } 249 if (widths[i] > 1000) { 250 widths[i] = 1000; 251 } 252 if (widths[i] < minchars) { 253 widths[i] = minchars; 254 } 255 } 256 int AvgWidth = 0, widthcount = 0; 257 for (i = 0; i < m_BaseLines.GetSize(); i++) 258 if (widths[i]) { 259 AvgWidth += widths[i]; 260 widthcount++; 261 } 262 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); 263 int MaxWidth = 0; 264 for (i = 0; i < m_BaseLines.GetSize(); i++) 265 if (MaxWidth < widths[i]) { 266 MaxWidth = widths[i]; 267 } 268 if (MaxWidth > AvgWidth * 6 / 5) { 269 MaxWidth = AvgWidth * 6 / 5; 270 } 271 FX_Free(widths); 272 if (iMinWidth < MaxWidth) { 273 iMinWidth = MaxWidth; 274 } 275 } 276 for (i = 0; i < m_BaseLines.GetSize(); i++) { 277 m_BaseLines.GetAt(i)->MergeBoxes(); 278 } 279 if (m_bKeepColumn) { 280 FindColumns(); 281 } 282 for (i = 0; i < m_BaseLines.GetSize(); i++) { 283 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); 284 if (lastheight >= 0) { 285 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; 286 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) { 287 lines.Add(L""); 288 } 289 } 290 lastheight = pBaseLine->m_MaxFontSizeV; 291 lastbaseline = pBaseLine->m_BaseLine; 292 CFX_WideString str; 293 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); 294 lines.Add(str); 295 } 296 } 297 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) { 298 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); 299 FX_WCHAR* pDst = NULL; 300 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 301 if (nCount < 1) { 302 sDest += wChar; 303 return; 304 } 305 pDst = new FX_WCHAR[nCount]; 306 FX_Unicode_GetNormalization(wChar, pDst); 307 for (int nIndex = 0; nIndex < nCount; nIndex++) { 308 sDest += pDst[nIndex]; 309 } 310 delete[] pDst; 311 } 312 void NormalizeString(CFX_WideString& str) { 313 if (str.GetLength() <= 0) { 314 return; 315 } 316 CFX_WideString sBuffer; 317 std::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar); 318 CFX_WordArray order; 319 FX_BOOL bR2L = FALSE; 320 int32_t start = 0, count = 0, i = 0; 321 int nR2L = 0, nL2R = 0; 322 for (i = 0; i < str.GetLength(); i++) { 323 if (pBidiChar->AppendChar(str.GetAt(i))) { 324 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); 325 order.Add(start); 326 order.Add(count); 327 order.Add(ret); 328 if (!bR2L) { 329 if (ret == CFX_BidiChar::RIGHT) { 330 nR2L++; 331 } else if (ret == CFX_BidiChar::LEFT) { 332 nL2R++; 333 } 334 } 335 } 336 } 337 if (pBidiChar->EndChar()) { 338 CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); 339 order.Add(start); 340 order.Add(count); 341 order.Add(ret); 342 if (!bR2L) { 343 if (ret == CFX_BidiChar::RIGHT) { 344 nR2L++; 345 } else if (ret == CFX_BidiChar::LEFT) { 346 nL2R++; 347 } 348 } 349 } 350 if (nR2L > 0 && nR2L >= nL2R) { 351 bR2L = TRUE; 352 } 353 if (bR2L) { 354 int count = order.GetSize(); 355 for (int j = count - 1; j > 0; j -= 3) { 356 int ret = order.GetAt(j); 357 int start = order.GetAt(j - 2); 358 int count1 = order.GetAt(j - 1); 359 if (ret == 2 || ret == 0) { 360 for (int i = start + count1 - 1; i >= start; i--) { 361 NormalizeCompositeChar(str[i], sBuffer); 362 } 363 } else { 364 i = j; 365 FX_BOOL bSymbol = FALSE; 366 while (i > 0 && order.GetAt(i) != 2) { 367 bSymbol = !order.GetAt(i); 368 i -= 3; 369 } 370 int end = start + count1; 371 int n = 0; 372 if (bSymbol) { 373 n = i + 6; 374 } else { 375 n = i + 3; 376 } 377 if (n >= j) { 378 for (int m = start; m < end; m++) { 379 sBuffer += str[m]; 380 } 381 } else { 382 i = j; 383 j = n; 384 for (; n <= i; n += 3) { 385 int start = order.GetAt(n - 2); 386 int count1 = order.GetAt(n - 1); 387 int end = start + count1; 388 for (int m = start; m < end; m++) { 389 sBuffer += str[m]; 390 } 391 } 392 } 393 } 394 } 395 } else { 396 int count = order.GetSize(); 397 FX_BOOL bL2R = FALSE; 398 for (int j = 0; j < count; j += 3) { 399 int ret = order.GetAt(j + 2); 400 int start = order.GetAt(j); 401 int count1 = order.GetAt(j + 1); 402 if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) { 403 int i = j + 3; 404 while (bR2L && i < count) { 405 if (order.GetAt(i + 2) == 1) { 406 break; 407 } else { 408 i += 3; 409 } 410 } 411 if (i == 3) { 412 j = -3; 413 bL2R = TRUE; 414 continue; 415 } 416 int end = str.GetLength() - 1; 417 if (i < count) { 418 end = order.GetAt(i) - 1; 419 } 420 j = i - 3; 421 for (int n = end; n >= start; n--) { 422 NormalizeCompositeChar(str[i], sBuffer); 423 } 424 } else { 425 int end = start + count1; 426 for (int i = start; i < end; i++) { 427 sBuffer += str[i]; 428 } 429 } 430 } 431 } 432 str.Empty(); 433 str += sBuffer; 434 } 435 static FX_BOOL IsNumber(CFX_WideString& str) { 436 for (int i = 0; i < str.GetLength(); i++) { 437 FX_WCHAR ch = str[i]; 438 // TODO(dsinclair): --.+ +.-- should probably not be a number. 439 if (!std::iswdigit(ch) && ch != '-' && ch != '+' && ch != '.' && ch != ' ') 440 return FALSE; 441 } 442 return TRUE; 443 } 444 void CTextPage::FindColumns() { 445 int i; 446 for (i = 0; i < m_BaseLines.GetSize(); i++) { 447 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); 448 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { 449 CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j); 450 CTextColumn* pColumn = FindColumn(pTextBox->m_Right); 451 if (pColumn) { 452 pColumn->m_AvgPos = 453 (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) / 454 (pColumn->m_Count + 1); 455 pColumn->m_Count++; 456 } else { 457 pColumn = new CTextColumn; 458 pColumn->m_Count = 1; 459 pColumn->m_AvgPos = pTextBox->m_Right; 460 pColumn->m_TextPos = -1; 461 m_TextColumns.Add(pColumn); 462 } 463 } 464 } 465 int mincount = m_BaseLines.GetSize() / 4; 466 for (i = 0; i < m_TextColumns.GetSize(); i++) { 467 CTextColumn* pTextColumn = m_TextColumns.GetAt(i); 468 if (pTextColumn->m_Count >= mincount) { 469 continue; 470 } 471 delete pTextColumn; 472 m_TextColumns.RemoveAt(i); 473 i--; 474 } 475 for (i = 0; i < m_BaseLines.GetSize(); i++) { 476 CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); 477 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { 478 CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j); 479 if (IsNumber(pTextBox->m_Text)) { 480 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right); 481 } 482 } 483 } 484 } 485 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) { 486 for (int i = 0; i < m_TextColumns.GetSize(); i++) { 487 CTextColumn* pColumn = m_TextColumns.GetAt(i); 488 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) { 489 return pColumn; 490 } 491 } 492 return NULL; 493 } 494 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {} 495 CTextBaseLine::CTextBaseLine() { 496 m_Top = -100000; 497 m_Bottom = 100000; 498 m_MaxFontSizeV = 0; 499 } 500 CTextBaseLine::~CTextBaseLine() { 501 for (int i = 0; i < m_TextList.GetSize(); i++) { 502 delete m_TextList.GetAt(i); 503 } 504 } 505 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, 506 FX_FLOAT rightx, 507 FX_FLOAT topy, 508 FX_FLOAT bottomy, 509 FX_FLOAT spacew, 510 FX_FLOAT fontsize_v, 511 const CFX_WideString& text) { 512 if (m_Top < topy) { 513 m_Top = topy; 514 } 515 if (m_Bottom > bottomy) { 516 m_Bottom = bottomy; 517 } 518 if (m_MaxFontSizeV < fontsize_v) { 519 m_MaxFontSizeV = fontsize_v; 520 } 521 int i; 522 for (i = 0; i < m_TextList.GetSize(); i++) { 523 CTextBox* pText = m_TextList.GetAt(i); 524 if (pText->m_Left > leftx) { 525 break; 526 } 527 } 528 CTextBox* pText = new CTextBox; 529 pText->m_Text = text; 530 pText->m_Left = leftx; 531 pText->m_Right = rightx; 532 pText->m_Top = topy; 533 pText->m_Bottom = bottomy; 534 pText->m_SpaceWidth = spacew; 535 pText->m_FontSizeV = fontsize_v; 536 pText->m_pColumn = NULL; 537 m_TextList.InsertAt(i, pText); 538 } 539 FX_BOOL GetIntersection(FX_FLOAT low1, 540 FX_FLOAT high1, 541 FX_FLOAT low2, 542 FX_FLOAT high2, 543 FX_FLOAT& interlow, 544 FX_FLOAT& interhigh); 545 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) { 546 FX_FLOAT inter_top, inter_bottom; 547 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top, 548 inter_bottom, inter_top)) { 549 return FALSE; 550 } 551 FX_FLOAT inter_h = inter_top - inter_bottom; 552 if (inter_h < (m_Top - m_Bottom) / 2 && 553 inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) { 554 return FALSE; 555 } 556 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine); 557 for (int i = 0; i < m_TextList.GetSize(); i++) { 558 CTextBox* pText = m_TextList.GetAt(i); 559 for (int j = 0; j < pOther->m_TextList.GetSize(); j++) { 560 CTextBox* pOtherText = pOther->m_TextList.GetAt(j); 561 FX_FLOAT inter_left, inter_right; 562 if (!GetIntersection(pText->m_Left, pText->m_Right, pOtherText->m_Left, 563 pOtherText->m_Right, inter_left, inter_right)) { 564 continue; 565 } 566 FX_FLOAT inter_w = inter_right - inter_left; 567 if (inter_w < pText->m_SpaceWidth / 2 && 568 inter_w < pOtherText->m_SpaceWidth / 2) { 569 continue; 570 } 571 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 || 572 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) { 573 return FALSE; 574 } 575 } 576 } 577 return TRUE; 578 } 579 void CTextBaseLine::Merge(CTextBaseLine* pOther) { 580 for (int i = 0; i < pOther->m_TextList.GetSize(); i++) { 581 CTextBox* pText = pOther->m_TextList.GetAt(i); 582 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom, 583 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text); 584 } 585 } 586 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) { 587 int i; 588 for (i = 0; i < m_TextList.GetSize(); i++) { 589 CTextBox* pText = m_TextList.GetAt(i); 590 if (pText->m_Text != L" ") { 591 break; 592 } 593 } 594 if (i == m_TextList.GetSize()) { 595 return FALSE; 596 } 597 CTextBox* pText = m_TextList.GetAt(i); 598 leftx = pText->m_Left; 599 for (i = m_TextList.GetSize() - 1; i >= 0; i--) { 600 CTextBox* pText = m_TextList.GetAt(i); 601 if (pText->m_Text != L" ") { 602 break; 603 } 604 } 605 pText = m_TextList.GetAt(i); 606 rightx = pText->m_Right; 607 return TRUE; 608 } 609 void CTextBaseLine::MergeBoxes() { 610 int i = 0; 611 while (1) { 612 if (i >= m_TextList.GetSize() - 1) { 613 break; 614 } 615 CTextBox* pThisText = m_TextList.GetAt(i); 616 CTextBox* pNextText = m_TextList.GetAt(i + 1); 617 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; 618 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) 619 ? pNextText->m_SpaceWidth 620 : pThisText->m_SpaceWidth; 621 if (spacew > 0.0 && dx < spacew * 2) { 622 pThisText->m_Right = pNextText->m_Right; 623 if (dx > spacew * 1.5) { 624 pThisText->m_Text += L" "; 625 } else if (dx > spacew / 3) { 626 pThisText->m_Text += L' '; 627 } 628 pThisText->m_Text += pNextText->m_Text; 629 pThisText->m_SpaceWidth = 630 pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth; 631 m_TextList.RemoveAt(i + 1); 632 delete pNextText; 633 } else { 634 i++; 635 } 636 } 637 } 638 void CTextBaseLine::WriteOutput(CFX_WideString& str, 639 FX_FLOAT leftx, 640 FX_FLOAT pagewidth, 641 int iTextWidth) { 642 int lastpos = -1; 643 for (int i = 0; i < m_TextList.GetSize(); i++) { 644 CTextBox* pText = m_TextList.GetAt(i); 645 int xpos; 646 if (pText->m_pColumn) { 647 xpos = 648 (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 649 0.5); 650 xpos -= pText->m_Text.GetLength(); 651 } else { 652 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5); 653 } 654 if (xpos <= lastpos) { 655 xpos = lastpos + 1; 656 } 657 for (int j = lastpos + 1; j < xpos; j++) { 658 str += ' '; 659 } 660 CFX_WideString sSrc(pText->m_Text); 661 NormalizeString(sSrc); 662 str += sSrc; 663 str += ' '; 664 lastpos = xpos + pText->m_Text.GetLength(); 665 } 666 } 667 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) { 668 minchars = 0; 669 for (int i = 0; i < m_TextList.GetSize(); i++) { 670 CTextBox* pText = m_TextList.GetAt(i); 671 if (pText->m_Right - pText->m_Left < 0.002) { 672 continue; 673 } 674 count += pText->m_Text.GetLength(); 675 width += pText->m_Right - pText->m_Left; 676 minchars += pText->m_Text.GetLength() + 1; 677 } 678 } 679 #define PI 3.1415926535897932384626433832795 680 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) { 681 int total_count = 0, rotated_count[3] = {0, 0, 0}; 682 FX_POSITION pos = page.GetFirstObjectPosition(); 683 while (pos) { 684 CPDF_PageObject* pObj = page.GetNextObject(pos); 685 if (pObj->m_Type != PDFPAGE_TEXT) { 686 continue; 687 } 688 total_count++; 689 CPDF_TextObject* pText = (CPDF_TextObject*)pObj; 690 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); 691 if (angle == 0.0) { 692 continue; 693 } 694 int degree = (int)(angle * 180 / PI + 0.5); 695 if (degree % 90) { 696 continue; 697 } 698 if (degree < 0) { 699 degree += 360; 700 } 701 int index = degree / 90 % 3 - 1; 702 if (index < 0) { 703 continue; 704 } 705 rotated_count[index]++; 706 } 707 if (total_count == 0) { 708 return; 709 } 710 CFX_Matrix matrix; 711 if (rotated_count[0] > total_count * 2 / 3) { 712 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); 713 } else if (rotated_count[1] > total_count * 2 / 3) { 714 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); 715 } else if (rotated_count[2] > total_count * 2 / 3) { 716 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); 717 } else { 718 return; 719 } 720 page.Transform(matrix); 721 page_bbox.Transform(&matrix); 722 } 723 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, 724 CPDF_Document* pDoc, 725 CPDF_Dictionary* pPage, 726 int iMinWidth, 727 FX_DWORD flags) { 728 lines.RemoveAll(); 729 if (!pPage) { 730 return; 731 } 732 CPDF_Page page; 733 page.Load(pDoc, pPage); 734 CPDF_ParseOptions options; 735 options.m_bTextOnly = TRUE; 736 options.m_bSeparateForm = FALSE; 737 page.ParseContent(&options); 738 CFX_FloatRect page_bbox = page.GetPageBBox(); 739 if (flags & PDF2TXT_AUTO_ROTATE) { 740 CheckRotate(page, page_bbox); 741 } 742 CTextPage texts; 743 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; 744 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; 745 texts.m_bBreakSpace = TRUE; 746 FX_POSITION pos = page.GetFirstObjectPosition(); 747 while (pos) { 748 CPDF_PageObject* pObject = page.GetNextObject(pos); 749 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { 750 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, 751 pObject->m_Top); 752 if (!page_bbox.Contains(rect)) { 753 continue; 754 } 755 } 756 texts.ProcessObject(pObject); 757 } 758 texts.WriteOutput(lines, iMinWidth); 759 } 760 void PDF_GetPageText(CFX_ByteStringArray& lines, 761 CPDF_Document* pDoc, 762 CPDF_Dictionary* pPage, 763 int iMinWidth, 764 FX_DWORD flags) { 765 lines.RemoveAll(); 766 CFX_WideStringArray wlines; 767 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); 768 for (int i = 0; i < wlines.GetSize(); i++) { 769 CFX_WideString wstr = wlines[i]; 770 CFX_ByteString str; 771 for (int c = 0; c < wstr.GetLength(); c++) { 772 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); 773 } 774 lines.Add(str); 775 } 776 } 777 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, 778 CPDF_Document* pDoc, 779 CPDF_Dictionary* pPage, 780 FX_DWORD flags) { 781 buffer.EstimateSize(0, 10240); 782 CPDF_Page page; 783 page.Load(pDoc, pPage); 784 CPDF_ParseOptions options; 785 options.m_bTextOnly = TRUE; 786 options.m_bSeparateForm = FALSE; 787 page.ParseContent(&options); 788 GetTextStream_Unicode(buffer, &page, TRUE, NULL); 789 } 790