1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "../../include/fpdfapi/fpdf_page.h" 8 #include "../../include/fpdfapi/fpdf_pageobj.h" 9 #include "../../include/fpdftext/fpdf_text.h" 10 #include "txtproc.h" 11 #include "text_int.h" 12 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR); 13 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar) 14 { 15 if (destcp == 0) { 16 if (unicode < 0x80) { 17 return CFX_ByteString((char)unicode); 18 } 19 FX_LPCSTR altstr = FCS_GetAltStr(unicode); 20 if (altstr) { 21 return CFX_ByteString(altstr, -1); 22 } 23 return CFX_ByteString(defchar, -1); 24 } 25 FX_BOOL bDef = FALSE; 26 char buf[10]; 27 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef); 28 if (ret && !bDef) { 29 return CFX_ByteString(buf, ret); 30 } 31 FX_LPCSTR altstr = FCS_GetAltStr(unicode); 32 if (altstr) { 33 return CFX_ByteString(altstr, -1); 34 } 35 return CFX_ByteString(defchar, -1); 36 } 37 CTextPage::CTextPage() 38 { 39 } 40 CTextPage::~CTextPage() 41 { 42 int i; 43 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 44 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 45 delete pBaseLine; 46 } 47 for (i = 0; i < m_TextColumns.GetSize(); i ++) { 48 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); 49 delete pTextColumn; 50 } 51 } 52 void CTextPage::ProcessObject(CPDF_PageObject* pObject) 53 { 54 if (pObject->m_Type != PDFPAGE_TEXT) { 55 return; 56 } 57 CPDF_TextObject* pText = (CPDF_TextObject*)pObject; 58 CPDF_Font* pFont = pText->m_TextState.GetFont(); 59 int count = pText->CountItems(); 60 FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2); 61 pText->CalcCharPos(pPosArray); 62 63 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); 64 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); 65 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 66 FX_FLOAT spacew = 0; 67 if (space_charcode != -1) { 68 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; 69 } 70 if (spacew == 0) { 71 spacew = fontsize_h / 4; 72 } 73 if (pText->m_TextState.GetBaselineAngle() != 0) { 74 int cc = 0; 75 CFX_AffineMatrix matrix; 76 pText->GetTextMatrix(&matrix); 77 for (int i = 0; i < pText->m_nChars; i ++) { 78 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i]; 79 if (charcode == (FX_DWORD) - 1) { 80 continue; 81 } 82 FX_RECT char_box; 83 pFont->GetCharBBox(charcode, char_box); 84 FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000; 85 FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000; 86 FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000; 87 FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000; 88 cc ++; 89 FX_FLOAT char_origx, char_origy; 90 matrix.Transform(char_left, 0, char_origx, char_origy); 91 matrix.TransformRect(char_left, char_right, char_top, char_bottom); 92 CFX_ByteString str; 93 pFont->AppendChar(str, charcode); 94 InsertTextBox(NULL, char_origy, char_left, char_right, char_top, 95 char_bottom, spacew, fontsize_v, str, pFont); 96 } 97 if (pPosArray) { 98 FX_Free(pPosArray); 99 } 100 return; 101 } 102 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); 103 for (int ii = 0; ii < count * 2; ii ++) { 104 pPosArray[ii] *= ratio_h; 105 } 106 FX_FLOAT baseline = pText->m_PosY; 107 CTextBaseLine* pBaseLine = NULL; 108 FX_FLOAT topy = pText->m_Top; 109 FX_FLOAT bottomy = pText->m_Bottom; 110 FX_FLOAT leftx = pText->m_Left; 111 int cc = 0; 112 CFX_ByteString segment; 113 int space_count = 0; 114 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; 115 for (int i = 0; i < pText->m_nChars; i ++) { 116 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i]; 117 if (charcode == (FX_DWORD) - 1) { 118 continue; 119 } 120 FX_FLOAT char_left = pPosArray[cc * 2]; 121 FX_FLOAT char_right = pPosArray[cc * 2 + 1]; 122 cc ++; 123 if (char_left < last_left || (char_left - last_right) > spacew / 2) { 124 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 125 topy, bottomy, spacew, fontsize_v, segment, pFont); 126 segment_left = char_left; 127 segment = ""; 128 } 129 if (space_count > 1) { 130 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 131 topy, bottomy, spacew, fontsize_v, segment, pFont); 132 segment = ""; 133 } else if (space_count == 1) { 134 pFont->AppendChar(segment, ' '); 135 } 136 if (segment.GetLength() == 0) { 137 segment_left = char_left; 138 } 139 segment_right = char_right; 140 pFont->AppendChar(segment, charcode); 141 space_count = 0; 142 last_left = char_left; 143 last_right = char_right; 144 } 145 if (segment.GetLength()) 146 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 147 topy, bottomy, spacew, fontsize_v, segment, pFont); 148 FX_Free(pPosArray); 149 } 150 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx, 151 FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v, 152 CFX_ByteString& str, CPDF_Font* pFont) 153 { 154 if (str.GetLength() == 0) { 155 return NULL; 156 } 157 if (pBaseLine == NULL) { 158 int i; 159 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 160 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 161 if (pExistLine->m_BaseLine == basey) { 162 pBaseLine = pExistLine; 163 break; 164 } 165 if (pExistLine->m_BaseLine < basey) { 166 break; 167 } 168 } 169 if (pBaseLine == NULL) { 170 pBaseLine = new CTextBaseLine; 171 pBaseLine->m_BaseLine = basey; 172 m_BaseLines.InsertAt(i, pBaseLine); 173 } 174 } 175 CFX_WideString text; 176 FX_LPCSTR pStr = str; 177 int len = str.GetLength(), offset = 0; 178 while (offset < len) { 179 FX_DWORD ch = pFont->GetNextChar(pStr, len, offset); 180 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); 181 if (unicode_str.IsEmpty()) { 182 text += (FX_WCHAR)ch; 183 } 184 else { 185 text += unicode_str; 186 } 187 } 188 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text); 189 return pBaseLine; 190 } 191 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) 192 { 193 FX_FLOAT lastheight = -1; 194 FX_FLOAT lastbaseline = -1; 195 FX_FLOAT MinLeftX = 1000000; 196 FX_FLOAT MaxRightX = 0; 197 int i; 198 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 199 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 200 FX_FLOAT leftx, rightx; 201 if (pBaseLine->GetWidth(leftx, rightx)) { 202 if (leftx < MinLeftX) { 203 MinLeftX = leftx; 204 } 205 if (rightx > MaxRightX) { 206 MaxRightX = rightx; 207 } 208 } 209 } 210 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 211 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 212 pBaseLine->MergeBoxes(); 213 } 214 for (i = 1; i < m_BaseLines.GetSize(); i ++) { 215 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 216 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1); 217 if (pBaseLine->CanMerge(pPrevLine)) { 218 pPrevLine->Merge(pBaseLine); 219 delete pBaseLine; 220 m_BaseLines.RemoveAt(i); 221 i --; 222 } 223 } 224 if (m_bAutoWidth) { 225 int* widths = FX_Alloc(int, m_BaseLines.GetSize()); 226 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 227 widths[i] = 0; 228 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 229 int TotalChars = 0; 230 FX_FLOAT TotalWidth = 0; 231 int minchars; 232 pBaseLine->CountChars(TotalChars, TotalWidth, minchars); 233 if (TotalChars) { 234 FX_FLOAT charwidth = TotalWidth / TotalChars; 235 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); 236 } 237 if (widths[i] > 1000) { 238 widths[i] = 1000; 239 } 240 if (widths[i] < minchars) { 241 widths[i] = minchars; 242 } 243 } 244 int AvgWidth = 0, widthcount = 0; 245 for (i = 0; i < m_BaseLines.GetSize(); i ++) 246 if (widths[i]) { 247 AvgWidth += widths[i]; 248 widthcount ++; 249 } 250 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); 251 int MaxWidth = 0; 252 for (i = 0; i < m_BaseLines.GetSize(); i ++) 253 if (MaxWidth < widths[i]) { 254 MaxWidth = widths[i]; 255 } 256 if (MaxWidth > AvgWidth * 6 / 5) { 257 MaxWidth = AvgWidth * 6 / 5; 258 } 259 FX_Free(widths); 260 if (iMinWidth < MaxWidth) { 261 iMinWidth = MaxWidth; 262 } 263 } 264 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 265 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 266 pBaseLine->MergeBoxes(); 267 } 268 if (m_bKeepColumn) { 269 FindColumns(); 270 } 271 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 272 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 273 if (lastheight >= 0) { 274 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; 275 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) { 276 lines.Add(L""); 277 } 278 } 279 lastheight = pBaseLine->m_MaxFontSizeV; 280 lastbaseline = pBaseLine->m_BaseLine; 281 CFX_WideString str; 282 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); 283 lines.Add(str); 284 } 285 } 286 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) 287 { 288 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); 289 FX_LPWSTR pDst = NULL; 290 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 291 if (nCount < 1 ) { 292 sDest += wChar; 293 return; 294 } 295 pDst = new FX_WCHAR[nCount]; 296 FX_Unicode_GetNormalization(wChar, pDst); 297 for (int nIndex = 0; nIndex < nCount; nIndex++) { 298 sDest += pDst[nIndex]; 299 } 300 delete[] pDst; 301 } 302 void NormalizeString(CFX_WideString& str) 303 { 304 if (str.GetLength() <= 0) { 305 return; 306 } 307 CFX_WideString sBuffer; 308 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); 309 if (NULL == BidiChar) { 310 return; 311 } 312 CFX_WordArray order; 313 FX_BOOL bR2L = FALSE; 314 FX_INT32 start = 0, count = 0, i = 0; 315 int nR2L = 0, nL2R = 0; 316 for (i = 0; i < str.GetLength(); i++) { 317 if(BidiChar->AppendChar(str.GetAt(i))) { 318 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 319 order.Add(start); 320 order.Add(count); 321 order.Add(ret); 322 if(!bR2L) { 323 if(ret == 2) { 324 nR2L++; 325 } else if (ret == 1) { 326 nL2R++; 327 } 328 } 329 } 330 } 331 if(BidiChar->EndChar()) { 332 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 333 order.Add(start); 334 order.Add(count); 335 order.Add(ret); 336 if(!bR2L) { 337 if(ret == 2) { 338 nR2L++; 339 } else if(ret == 1) { 340 nL2R++; 341 } 342 } 343 } 344 if(nR2L > 0 && nR2L >= nL2R) { 345 bR2L = TRUE; 346 } 347 if(bR2L) { 348 int count = order.GetSize(); 349 for(int j = count - 1; j > 0; j -= 3) { 350 int ret = order.GetAt(j); 351 int start = order.GetAt(j - 2); 352 int count1 = order.GetAt(j - 1); 353 if(ret == 2 || ret == 0) { 354 for(int i = start + count1 - 1; i >= start; i--) { 355 NormalizeCompositeChar(str[i], sBuffer); 356 } 357 } else { 358 i = j; 359 FX_BOOL bSymbol = FALSE; 360 while(i > 0 && order.GetAt(i) != 2) { 361 bSymbol = !order.GetAt(i); 362 i -= 3; 363 } 364 int end = start + count1 ; 365 int n = 0; 366 if(bSymbol) { 367 n = i + 6; 368 } else { 369 n = i + 3; 370 } 371 if(n >= j) { 372 for(int m = start; m < end; m++) { 373 sBuffer += str[m]; 374 } 375 } else { 376 i = j; 377 j = n; 378 for(; n <= i; n += 3) { 379 int start = order.GetAt(n - 2); 380 int count1 = order.GetAt(n - 1); 381 int end = start + count1 ; 382 for(int m = start; m < end; m++) { 383 sBuffer += str[m]; 384 } 385 } 386 } 387 } 388 } 389 } else { 390 int count = order.GetSize(); 391 FX_BOOL bL2R = FALSE; 392 for(int j = 0; j < count; j += 3) { 393 int ret = order.GetAt(j + 2); 394 int start = order.GetAt(j); 395 int count1 = order.GetAt(j + 1); 396 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) { 397 int i = j + 3; 398 while(bR2L && i < count) { 399 if(order.GetAt(i + 2) == 1) { 400 break; 401 } else { 402 i += 3; 403 } 404 } 405 if(i == 3) { 406 j = -3; 407 bL2R = TRUE; 408 continue; 409 } 410 int end = str.GetLength() - 1; 411 if(i < count) { 412 end = order.GetAt(i) - 1; 413 } 414 j = i - 3; 415 for(int n = end; n >= start; n--) { 416 NormalizeCompositeChar(str[i], sBuffer); 417 } 418 } else { 419 int end = start + count1 ; 420 for(int i = start; i < end; i++) { 421 sBuffer += str[i]; 422 } 423 } 424 } 425 } 426 str.Empty(); 427 str += sBuffer; 428 BidiChar->Release(); 429 } 430 static FX_BOOL IsNumber(CFX_WideString& str) 431 { 432 for (int i = 0; i < str.GetLength(); i ++) { 433 FX_WCHAR ch = str[i]; 434 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') { 435 return FALSE; 436 } 437 } 438 return TRUE; 439 } 440 void CTextPage::FindColumns() 441 { 442 int i; 443 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 444 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 445 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) { 446 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j); 447 CTextColumn* pColumn = FindColumn(pTextBox->m_Right); 448 if (pColumn == NULL) { 449 pColumn = new CTextColumn; 450 pColumn->m_Count = 1; 451 pColumn->m_AvgPos = pTextBox->m_Right; 452 pColumn->m_TextPos = -1; 453 m_TextColumns.Add(pColumn); 454 } else { 455 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) / 456 (pColumn->m_Count + 1); 457 pColumn->m_Count ++; 458 } 459 } 460 } 461 int mincount = m_BaseLines.GetSize() / 4; 462 for (i = 0; i < m_TextColumns.GetSize(); i ++) { 463 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); 464 if (pTextColumn->m_Count >= mincount) { 465 continue; 466 } 467 delete pTextColumn; 468 m_TextColumns.RemoveAt(i); 469 i --; 470 } 471 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 472 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 473 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) { 474 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j); 475 if (IsNumber(pTextBox->m_Text)) { 476 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right); 477 } 478 } 479 } 480 } 481 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) 482 { 483 for (int i = 0; i < m_TextColumns.GetSize(); i ++) { 484 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i); 485 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) { 486 return pColumn; 487 } 488 } 489 return NULL; 490 } 491 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) 492 { 493 } 494 CTextBaseLine::CTextBaseLine() 495 { 496 m_Top = -100000; 497 m_Bottom = 100000; 498 m_MaxFontSizeV = 0; 499 } 500 CTextBaseLine::~CTextBaseLine() 501 { 502 for (int i = 0; i < m_TextList.GetSize(); i ++) { 503 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 504 delete pText; 505 } 506 } 507 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, 508 FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text) 509 { 510 if (m_Top < topy) { 511 m_Top = topy; 512 } 513 if (m_Bottom > bottomy) { 514 m_Bottom = bottomy; 515 } 516 if (m_MaxFontSizeV < fontsize_v) { 517 m_MaxFontSizeV = fontsize_v; 518 } 519 int i; 520 for (i = 0; i < m_TextList.GetSize(); i ++) { 521 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 522 if (pText->m_Left > leftx) { 523 break; 524 } 525 } 526 CTextBox* pText = new CTextBox; 527 pText->m_Text = text; 528 pText->m_Left = leftx; 529 pText->m_Right = rightx; 530 pText->m_Top = topy; 531 pText->m_Bottom = bottomy; 532 pText->m_SpaceWidth = spacew; 533 pText->m_FontSizeV = fontsize_v; 534 pText->m_pColumn = NULL; 535 m_TextList.InsertAt(i, pText); 536 } 537 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2, 538 FX_FLOAT& interlow, FX_FLOAT& interhigh); 539 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) 540 { 541 FX_FLOAT inter_top, inter_bottom; 542 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top, 543 inter_bottom, inter_top)) { 544 return FALSE; 545 } 546 FX_FLOAT inter_h = inter_top - inter_bottom; 547 if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) { 548 return FALSE; 549 } 550 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine); 551 for (int i = 0; i < m_TextList.GetSize(); i ++) { 552 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 553 for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) { 554 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j); 555 FX_FLOAT inter_left, inter_right; 556 if (!GetIntersection(pText->m_Left, pText->m_Right, 557 pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) { 558 continue; 559 } 560 FX_FLOAT inter_w = inter_right - inter_left; 561 if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) { 562 continue; 563 } 564 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 || 565 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) { 566 return FALSE; 567 } 568 } 569 } 570 return TRUE; 571 } 572 void CTextBaseLine::Merge(CTextBaseLine* pOther) 573 { 574 for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) { 575 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i); 576 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom, 577 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text); 578 } 579 } 580 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) 581 { 582 int i; 583 for (i = 0; i < m_TextList.GetSize(); i ++) { 584 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 585 if (pText->m_Text != L" ") { 586 break; 587 } 588 } 589 if (i == m_TextList.GetSize()) { 590 return FALSE; 591 } 592 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 593 leftx = pText->m_Left; 594 for (i = m_TextList.GetSize() - 1; i >= 0; i --) { 595 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 596 if (pText->m_Text != L" ") { 597 break; 598 } 599 } 600 pText = (CTextBox*)m_TextList.GetAt(i); 601 rightx = pText->m_Right; 602 return TRUE; 603 } 604 void CTextBaseLine::MergeBoxes() 605 { 606 int i = 0; 607 while (1) { 608 if (i >= m_TextList.GetSize() - 1) { 609 break; 610 } 611 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i); 612 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1); 613 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; 614 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ? 615 pNextText->m_SpaceWidth : pThisText->m_SpaceWidth; 616 if (spacew > 0.0 && dx < spacew * 2) { 617 pThisText->m_Right = pNextText->m_Right; 618 if (dx > spacew * 1.5) { 619 pThisText->m_Text += L" "; 620 } else if (dx > spacew / 3) { 621 pThisText->m_Text += L' '; 622 } 623 pThisText->m_Text += pNextText->m_Text; 624 pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ? 625 spacew : pNextText->m_SpaceWidth; 626 m_TextList.RemoveAt(i + 1); 627 delete pNextText; 628 } else { 629 i ++; 630 } 631 } 632 } 633 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth, 634 int iTextWidth) 635 { 636 int lastpos = -1; 637 for (int i = 0; i < m_TextList.GetSize(); i ++) { 638 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 639 int xpos; 640 if (pText->m_pColumn) { 641 xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5); 642 xpos -= pText->m_Text.GetLength(); 643 } else { 644 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5); 645 } 646 if (xpos <= lastpos) { 647 xpos = lastpos + 1; 648 } 649 for (int j = lastpos + 1; j < xpos; j ++) { 650 str += ' '; 651 } 652 CFX_WideString sSrc(pText->m_Text); 653 NormalizeString(sSrc); 654 str += sSrc; 655 str += ' '; 656 lastpos = xpos + pText->m_Text.GetLength(); 657 } 658 } 659 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) 660 { 661 minchars = 0; 662 for (int i = 0; i < m_TextList.GetSize(); i ++) { 663 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 664 if (pText->m_Right - pText->m_Left < 0.002) { 665 continue; 666 } 667 count += pText->m_Text.GetLength(); 668 width += pText->m_Right - pText->m_Left; 669 minchars += pText->m_Text.GetLength() + 1; 670 } 671 } 672 #define PI 3.1415926535897932384626433832795 673 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) 674 { 675 int total_count = 0, rotated_count[3] = {0, 0, 0}; 676 FX_POSITION pos = page.GetFirstObjectPosition(); 677 while (pos) { 678 CPDF_PageObject* pObj = page.GetNextObject(pos); 679 if (pObj->m_Type != PDFPAGE_TEXT) { 680 continue; 681 } 682 total_count ++; 683 CPDF_TextObject* pText = (CPDF_TextObject*)pObj; 684 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); 685 if (angle == 0.0) { 686 continue; 687 } 688 int degree = (int)(angle * 180 / PI + 0.5); 689 if (degree % 90) { 690 continue; 691 } 692 if (degree < 0) { 693 degree += 360; 694 } 695 int index = degree / 90 % 3 - 1; 696 if (index < 0) { 697 continue; 698 } 699 rotated_count[index] ++; 700 } 701 if (total_count == 0) { 702 return; 703 } 704 CFX_AffineMatrix matrix; 705 if (rotated_count[0] > total_count * 2 / 3) { 706 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); 707 } else if (rotated_count[1] > total_count * 2 / 3) { 708 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); 709 } else if (rotated_count[2] > total_count * 2 / 3) { 710 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); 711 } else { 712 return; 713 } 714 page.Transform(matrix); 715 page_bbox.Transform(&matrix); 716 } 717 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 718 int iMinWidth, FX_DWORD flags) 719 { 720 lines.RemoveAll(); 721 if (pPage == NULL) { 722 return; 723 } 724 CPDF_Page page; 725 page.Load(pDoc, pPage); 726 CPDF_ParseOptions options; 727 options.m_bTextOnly = TRUE; 728 options.m_bSeparateForm = FALSE; 729 page.ParseContent(&options); 730 CFX_FloatRect page_bbox = page.GetPageBBox(); 731 if (flags & PDF2TXT_AUTO_ROTATE) { 732 CheckRotate(page, page_bbox); 733 } 734 CTextPage texts; 735 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; 736 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; 737 texts.m_bBreakSpace = TRUE; 738 FX_POSITION pos = page.GetFirstObjectPosition(); 739 while (pos) { 740 CPDF_PageObject* pObject = page.GetNextObject(pos); 741 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { 742 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top); 743 if (!page_bbox.Contains(rect)) { 744 continue; 745 } 746 } 747 texts.ProcessObject(pObject); 748 } 749 texts.WriteOutput(lines, iMinWidth); 750 } 751 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 752 int iMinWidth, FX_DWORD flags) 753 { 754 lines.RemoveAll(); 755 CFX_WideStringArray wlines; 756 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); 757 for (int i = 0; i < wlines.GetSize(); i ++) { 758 CFX_WideString wstr = wlines[i]; 759 CFX_ByteString str; 760 for (int c = 0; c < wstr.GetLength(); c ++) { 761 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); 762 } 763 lines.Add(str); 764 } 765 } 766 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF, 767 CFX_PtrArray* pObjArray); 768 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags) 769 { 770 buffer.EstimateSize(0, 10240); 771 CPDF_Page page; 772 page.Load(pDoc, pPage); 773 CPDF_ParseOptions options; 774 options.m_bTextOnly = TRUE; 775 options.m_bSeparateForm = FALSE; 776 page.ParseContent(&options); 777 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL); 778 } 779