1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "../../include/fpdfapi/fpdf_page.h" 8 #include "../../include/fpdfapi/fpdf_pageobj.h" 9 #include "../../include/fpdftext/fpdf_text.h" 10 #include "txtproc.h" 11 #include "text_int.h" 12 #if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_) 13 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR); 14 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar) 15 { 16 if (destcp == 0) { 17 if (unicode < 0x80) { 18 return CFX_ByteString((char)unicode); 19 } 20 FX_LPCSTR altstr = FCS_GetAltStr(unicode); 21 if (altstr) { 22 return CFX_ByteString(altstr, -1); 23 } 24 return CFX_ByteString(defchar, -1); 25 } 26 FX_BOOL bDef = FALSE; 27 char buf[10]; 28 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef); 29 if (ret && !bDef) { 30 return CFX_ByteString(buf, ret); 31 } 32 FX_LPCSTR altstr = FCS_GetAltStr(unicode); 33 if (altstr) { 34 return CFX_ByteString(altstr, -1); 35 } 36 return CFX_ByteString(defchar, -1); 37 } 38 CTextPage::CTextPage() 39 { 40 } 41 CTextPage::~CTextPage() 42 { 43 int i; 44 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 45 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 46 delete pBaseLine; 47 } 48 for (i = 0; i < m_TextColumns.GetSize(); i ++) { 49 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); 50 delete pTextColumn; 51 } 52 } 53 void CTextPage::ProcessObject(CPDF_PageObject* pObject) 54 { 55 if (pObject->m_Type != PDFPAGE_TEXT) { 56 return; 57 } 58 CPDF_TextObject* pText = (CPDF_TextObject*)pObject; 59 CPDF_Font* pFont = pText->m_TextState.GetFont(); 60 int count = pText->CountItems(); 61 FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2); 62 if (pPosArray) { 63 pText->CalcCharPos(pPosArray); 64 } 65 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); 66 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); 67 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 68 FX_FLOAT spacew = 0; 69 if (space_charcode != -1) { 70 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; 71 } 72 if (spacew == 0) { 73 spacew = fontsize_h / 4; 74 } 75 if (pText->m_TextState.GetBaselineAngle() != 0) { 76 int cc = 0; 77 CFX_AffineMatrix matrix; 78 pText->GetTextMatrix(&matrix); 79 for (int i = 0; i < pText->m_nChars; i ++) { 80 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i]; 81 if (charcode == (FX_DWORD) - 1) { 82 continue; 83 } 84 FX_RECT char_box; 85 pFont->GetCharBBox(charcode, char_box); 86 FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000; 87 FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000; 88 FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000; 89 FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000; 90 cc ++; 91 FX_FLOAT char_origx, char_origy; 92 matrix.Transform(char_left, 0, char_origx, char_origy); 93 matrix.TransformRect(char_left, char_right, char_top, char_bottom); 94 CFX_ByteString str; 95 pFont->AppendChar(str, charcode); 96 InsertTextBox(NULL, char_origy, char_left, char_right, char_top, 97 char_bottom, spacew, fontsize_v, str, pFont); 98 } 99 if (pPosArray) { 100 FX_Free(pPosArray); 101 } 102 return; 103 } 104 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); 105 for (int ii = 0; ii < count * 2; ii ++) { 106 pPosArray[ii] *= ratio_h; 107 } 108 FX_FLOAT baseline = pText->m_PosY; 109 CTextBaseLine* pBaseLine = NULL; 110 FX_FLOAT topy = pText->m_Top; 111 FX_FLOAT bottomy = pText->m_Bottom; 112 FX_FLOAT leftx = pText->m_Left; 113 int cc = 0; 114 CFX_ByteString segment; 115 int space_count = 0; 116 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; 117 for (int i = 0; i < pText->m_nChars; i ++) { 118 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i]; 119 if (charcode == (FX_DWORD) - 1) { 120 continue; 121 } 122 FX_FLOAT char_left = pPosArray[cc * 2]; 123 FX_FLOAT char_right = pPosArray[cc * 2 + 1]; 124 cc ++; 125 if (char_left < last_left || (char_left - last_right) > spacew / 2) { 126 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 127 topy, bottomy, spacew, fontsize_v, segment, pFont); 128 segment_left = char_left; 129 segment = ""; 130 } 131 CFX_WideString wCh = pText->GetFont()->UnicodeFromCharCode(charcode); 132 FX_DWORD ch = wCh.GetLength() > 0 ? wCh.GetAt(0) : charcode; 133 if (space_count > 1) { 134 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 135 topy, bottomy, spacew, fontsize_v, segment, pFont); 136 segment = ""; 137 } else if (space_count == 1) { 138 pFont->AppendChar(segment, ' '); 139 } 140 if (segment.GetLength() == 0) { 141 segment_left = char_left; 142 } 143 segment_right = char_right; 144 pFont->AppendChar(segment, charcode); 145 space_count = 0; 146 last_left = char_left; 147 last_right = char_right; 148 } 149 if (segment.GetLength()) 150 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 151 topy, bottomy, spacew, fontsize_v, segment, pFont); 152 FX_Free(pPosArray); 153 } 154 static void ConvertPDFString(CFX_ByteString& result, CFX_ByteString& src, CPDF_Font* pFont); 155 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx, 156 FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v, 157 CFX_ByteString& str, CPDF_Font* pFont) 158 { 159 if (str.GetLength() == 0) { 160 return NULL; 161 } 162 if (pBaseLine == NULL) { 163 int i; 164 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 165 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 166 if (pExistLine->m_BaseLine == basey) { 167 pBaseLine = pExistLine; 168 break; 169 } 170 if (pExistLine->m_BaseLine < basey) { 171 break; 172 } 173 } 174 if (pBaseLine == NULL) { 175 pBaseLine = FX_NEW CTextBaseLine; 176 if (NULL == pBaseLine) { 177 return NULL; 178 } 179 pBaseLine->m_BaseLine = basey; 180 m_BaseLines.InsertAt(i, pBaseLine); 181 } 182 } 183 CFX_WideString text; 184 FX_LPCSTR pStr = str; 185 int len = str.GetLength(), offset = 0; 186 while (offset < len) { 187 FX_DWORD ch = pFont->GetNextChar(pStr, offset); 188 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); 189 text += unicode_str; 190 } 191 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, text); 192 return pBaseLine; 193 } 194 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) 195 { 196 FX_FLOAT lastheight = -1; 197 FX_FLOAT lastbaseline = -1; 198 FX_FLOAT MinLeftX = 1000000; 199 FX_FLOAT MaxRightX = 0; 200 int i; 201 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 202 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 203 FX_FLOAT leftx, rightx; 204 if (pBaseLine->GetWidth(leftx, rightx)) { 205 if (leftx < MinLeftX) { 206 MinLeftX = leftx; 207 } 208 if (rightx > MaxRightX) { 209 MaxRightX = rightx; 210 } 211 } 212 } 213 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 214 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 215 pBaseLine->MergeBoxes(); 216 } 217 for (i = 1; i < m_BaseLines.GetSize(); i ++) { 218 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 219 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1); 220 if (pBaseLine->CanMerge(pPrevLine)) { 221 pPrevLine->Merge(pBaseLine); 222 delete pBaseLine; 223 m_BaseLines.RemoveAt(i); 224 i --; 225 } 226 } 227 if (m_bAutoWidth) { 228 int* widths = FX_Alloc(int, m_BaseLines.GetSize()); 229 if (widths) { 230 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 231 widths[i] = 0; 232 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 233 int TotalChars = 0; 234 FX_FLOAT TotalWidth = 0; 235 int minchars; 236 pBaseLine->CountChars(TotalChars, TotalWidth, minchars); 237 if (TotalChars) { 238 FX_FLOAT charwidth = TotalWidth / TotalChars; 239 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); 240 } 241 if (widths[i] > 1000) { 242 widths[i] = 1000; 243 } 244 if (widths[i] < minchars) { 245 widths[i] = minchars; 246 } 247 } 248 int AvgWidth = 0, widthcount = 0; 249 for (i = 0; i < m_BaseLines.GetSize(); i ++) 250 if (widths[i]) { 251 AvgWidth += widths[i]; 252 widthcount ++; 253 } 254 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); 255 int MaxWidth = 0; 256 for (i = 0; i < m_BaseLines.GetSize(); i ++) 257 if (MaxWidth < widths[i]) { 258 MaxWidth = widths[i]; 259 } 260 if (MaxWidth > AvgWidth * 6 / 5) { 261 MaxWidth = AvgWidth * 6 / 5; 262 } 263 FX_Free(widths); 264 if (iMinWidth < MaxWidth) { 265 iMinWidth = MaxWidth; 266 } 267 } 268 } 269 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 270 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 271 pBaseLine->MergeBoxes(); 272 } 273 if (m_bKeepColumn) { 274 FindColumns(); 275 } 276 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 277 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 278 if (lastheight >= 0) { 279 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; 280 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) { 281 lines.Add(L""); 282 } 283 } 284 lastheight = pBaseLine->m_MaxFontSizeV; 285 lastbaseline = pBaseLine->m_BaseLine; 286 CFX_WideString str; 287 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); 288 lines.Add(str); 289 } 290 } 291 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) 292 { 293 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); 294 FX_LPWSTR pDst = NULL; 295 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 296 if (nCount < 1 ) { 297 sDest += wChar; 298 return; 299 } 300 pDst = new FX_WCHAR[nCount]; 301 FX_Unicode_GetNormalization(wChar, pDst); 302 for (int nIndex = 0; nIndex < nCount; nIndex++) { 303 sDest += pDst[nIndex]; 304 } 305 delete[] pDst; 306 } 307 void NormalizeString(CFX_WideString& str) 308 { 309 if (str.GetLength() <= 0) { 310 return; 311 } 312 CFX_WideString sBuffer; 313 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); 314 if (NULL == BidiChar) { 315 return; 316 } 317 CFX_WordArray order; 318 FX_BOOL bR2L = FALSE; 319 FX_INT32 start = 0, count = 0, i = 0; 320 int nR2L = 0, nL2R = 0; 321 for (i = 0; i < str.GetLength(); i++) { 322 if(BidiChar->AppendChar(str.GetAt(i))) { 323 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 324 order.Add(start); 325 order.Add(count); 326 order.Add(ret); 327 if(!bR2L) { 328 if(ret == 2) { 329 nR2L++; 330 } else if (ret == 1) { 331 nL2R++; 332 } 333 } 334 } 335 } 336 if(BidiChar->EndChar()) { 337 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 338 order.Add(start); 339 order.Add(count); 340 order.Add(ret); 341 if(!bR2L) { 342 if(ret == 2) { 343 nR2L++; 344 } else if(ret == 1) { 345 nL2R++; 346 } 347 } 348 } 349 if(nR2L > 0 && nR2L >= nL2R) { 350 bR2L = TRUE; 351 } 352 if(bR2L) { 353 int count = order.GetSize(); 354 for(int j = count - 1; j > 0; j -= 3) { 355 int ret = order.GetAt(j); 356 int start = order.GetAt(j - 2); 357 int count1 = order.GetAt(j - 1); 358 if(ret == 2 || ret == 0) { 359 for(int i = start + count1 - 1; i >= start; i--) { 360 NormalizeCompositeChar(str[i], sBuffer); 361 } 362 } else { 363 i = j; 364 FX_BOOL bSymbol = FALSE; 365 while(i > 0 && order.GetAt(i) != 2) { 366 bSymbol = !order.GetAt(i); 367 i -= 3; 368 } 369 int end = start + count1 ; 370 int n = 0; 371 if(bSymbol) { 372 n = i + 6; 373 } else { 374 n = i + 3; 375 } 376 if(n >= j) { 377 for(int m = start; m < end; m++) { 378 sBuffer += str[m]; 379 } 380 } else { 381 i = j; 382 j = n; 383 for(; n <= i; n += 3) { 384 int ret = order.GetAt(n); 385 int start = order.GetAt(n - 2); 386 int count1 = order.GetAt(n - 1); 387 int end = start + count1 ; 388 for(int m = start; m < end; m++) { 389 sBuffer += str[m]; 390 } 391 } 392 } 393 } 394 } 395 } else { 396 int count = order.GetSize(); 397 FX_BOOL bL2R = FALSE; 398 for(int j = 0; j < count; j += 3) { 399 int ret = order.GetAt(j + 2); 400 int start = order.GetAt(j); 401 int count1 = order.GetAt(j + 1); 402 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) { 403 int i = j + 3; 404 while(bR2L && i < count) { 405 if(order.GetAt(i + 2) == 1) { 406 break; 407 } else { 408 i += 3; 409 } 410 } 411 if(i == 3) { 412 j = -3; 413 bL2R = TRUE; 414 continue; 415 } 416 int end = str.GetLength() - 1; 417 if(i < count) { 418 end = order.GetAt(i) - 1; 419 } 420 j = i - 3; 421 for(int n = end; n >= start; n--) { 422 NormalizeCompositeChar(str[i], sBuffer); 423 } 424 } else { 425 int end = start + count1 ; 426 for(int i = start; i < end; i++) { 427 sBuffer += str[i]; 428 } 429 } 430 } 431 } 432 str.Empty(); 433 str += sBuffer; 434 BidiChar->Release(); 435 } 436 static FX_BOOL IsNumber(CFX_WideString& str) 437 { 438 for (int i = 0; i < str.GetLength(); i ++) { 439 FX_WCHAR ch = str[i]; 440 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') { 441 return FALSE; 442 } 443 } 444 return TRUE; 445 } 446 void CTextPage::FindColumns() 447 { 448 int i; 449 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 450 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 451 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) { 452 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j); 453 CTextColumn* pColumn = FindColumn(pTextBox->m_Right); 454 if (pColumn == NULL) { 455 pColumn = FX_NEW CTextColumn; 456 if (pColumn) { 457 pColumn->m_Count = 1; 458 pColumn->m_AvgPos = pTextBox->m_Right; 459 pColumn->m_TextPos = -1; 460 m_TextColumns.Add(pColumn); 461 } 462 } else { 463 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) / 464 (pColumn->m_Count + 1); 465 pColumn->m_Count ++; 466 } 467 } 468 } 469 int mincount = m_BaseLines.GetSize() / 4; 470 for (i = 0; i < m_TextColumns.GetSize(); i ++) { 471 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); 472 if (pTextColumn->m_Count >= mincount) { 473 continue; 474 } 475 delete pTextColumn; 476 m_TextColumns.RemoveAt(i); 477 i --; 478 } 479 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 480 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 481 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) { 482 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j); 483 if (IsNumber(pTextBox->m_Text)) { 484 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right); 485 } 486 } 487 } 488 } 489 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) 490 { 491 for (int i = 0; i < m_TextColumns.GetSize(); i ++) { 492 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i); 493 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) { 494 return pColumn; 495 } 496 } 497 return NULL; 498 } 499 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) 500 { 501 } 502 CTextBaseLine::CTextBaseLine() 503 { 504 m_Top = -100000; 505 m_Bottom = 100000; 506 m_MaxFontSizeV = 0; 507 } 508 CTextBaseLine::~CTextBaseLine() 509 { 510 for (int i = 0; i < m_TextList.GetSize(); i ++) { 511 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 512 delete pText; 513 } 514 } 515 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, 516 FX_FLOAT spacew, FX_FLOAT fontsize_v, const CFX_WideString& text) 517 { 518 if (m_Top < topy) { 519 m_Top = topy; 520 } 521 if (m_Bottom > bottomy) { 522 m_Bottom = bottomy; 523 } 524 if (m_MaxFontSizeV < fontsize_v) { 525 m_MaxFontSizeV = fontsize_v; 526 } 527 int i; 528 for (i = 0; i < m_TextList.GetSize(); i ++) { 529 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 530 if (pText->m_Left > leftx) { 531 break; 532 } 533 } 534 CTextBox* pText = FX_NEW CTextBox; 535 if (NULL == pText) { 536 return; 537 } 538 pText->m_Text = text; 539 pText->m_Left = leftx; 540 pText->m_Right = rightx; 541 pText->m_Top = topy; 542 pText->m_Bottom = bottomy; 543 pText->m_SpaceWidth = spacew; 544 pText->m_FontSizeV = fontsize_v; 545 pText->m_pColumn = NULL; 546 m_TextList.InsertAt(i, pText); 547 } 548 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT high2, 549 FX_FLOAT& interlow, FX_FLOAT& interhigh); 550 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) 551 { 552 FX_FLOAT inter_top, inter_bottom; 553 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top, 554 inter_bottom, inter_top)) { 555 return FALSE; 556 } 557 FX_FLOAT inter_h = inter_top - inter_bottom; 558 if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) { 559 return FALSE; 560 } 561 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine); 562 for (int i = 0; i < m_TextList.GetSize(); i ++) { 563 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 564 FX_FLOAT width = pText->m_Right - pText->m_Left; 565 for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) { 566 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j); 567 FX_FLOAT inter_left, inter_right; 568 if (!GetIntersection(pText->m_Left, pText->m_Right, 569 pOtherText->m_Left, pOtherText->m_Right, inter_left, inter_right)) { 570 continue; 571 } 572 FX_FLOAT inter_w = inter_right - inter_left; 573 if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_SpaceWidth / 2) { 574 continue; 575 } 576 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 || 577 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) { 578 return FALSE; 579 } 580 } 581 } 582 return TRUE; 583 } 584 void CTextBaseLine::Merge(CTextBaseLine* pOther) 585 { 586 for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) { 587 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i); 588 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom, 589 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text); 590 } 591 } 592 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) 593 { 594 int i; 595 for (i = 0; i < m_TextList.GetSize(); i ++) { 596 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 597 if (pText->m_Text != L" ") { 598 break; 599 } 600 } 601 if (i == m_TextList.GetSize()) { 602 return FALSE; 603 } 604 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 605 leftx = pText->m_Left; 606 for (i = m_TextList.GetSize() - 1; i >= 0; i --) { 607 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 608 if (pText->m_Text != L" ") { 609 break; 610 } 611 } 612 pText = (CTextBox*)m_TextList.GetAt(i); 613 rightx = pText->m_Right; 614 return TRUE; 615 } 616 void CTextBaseLine::MergeBoxes() 617 { 618 int i = 0; 619 while (1) { 620 if (i >= m_TextList.GetSize() - 1) { 621 break; 622 } 623 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i); 624 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1); 625 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; 626 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ? 627 pNextText->m_SpaceWidth : pThisText->m_SpaceWidth; 628 if (spacew > 0.0 && dx < spacew * 2) { 629 pThisText->m_Right = pNextText->m_Right; 630 if (dx > spacew * 1.5) { 631 pThisText->m_Text += L" "; 632 } else if (dx > spacew / 3) { 633 pThisText->m_Text += L' '; 634 } 635 pThisText->m_Text += pNextText->m_Text; 636 pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ? 637 spacew : pNextText->m_SpaceWidth; 638 m_TextList.RemoveAt(i + 1); 639 delete pNextText; 640 } else { 641 i ++; 642 } 643 } 644 } 645 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pagewidth, 646 int iTextWidth) 647 { 648 int lastpos = -1; 649 for (int i = 0; i < m_TextList.GetSize(); i ++) { 650 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 651 int xpos; 652 if (pText->m_pColumn) { 653 xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + 0.5); 654 xpos -= pText->m_Text.GetLength(); 655 } else { 656 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5); 657 } 658 if (xpos <= lastpos) { 659 xpos = lastpos + 1; 660 } 661 for (int j = lastpos + 1; j < xpos; j ++) { 662 str += ' '; 663 } 664 CFX_WideString sSrc(pText->m_Text); 665 NormalizeString(sSrc); 666 str += sSrc; 667 str += ' '; 668 lastpos = xpos + pText->m_Text.GetLength(); 669 } 670 } 671 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) 672 { 673 minchars = 0; 674 for (int i = 0; i < m_TextList.GetSize(); i ++) { 675 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 676 if (pText->m_Right - pText->m_Left < 0.002) { 677 continue; 678 } 679 count += pText->m_Text.GetLength(); 680 width += pText->m_Right - pText->m_Left; 681 minchars += pText->m_Text.GetLength() + 1; 682 } 683 } 684 #define PI 3.1415926535897932384626433832795 685 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) 686 { 687 int total_count = 0, rotated_count[3] = {0, 0, 0}; 688 FX_POSITION pos = page.GetFirstObjectPosition(); 689 while (pos) { 690 CPDF_PageObject* pObj = page.GetNextObject(pos); 691 if (pObj->m_Type != PDFPAGE_TEXT) { 692 continue; 693 } 694 total_count ++; 695 CPDF_TextObject* pText = (CPDF_TextObject*)pObj; 696 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); 697 if (angle == 0.0) { 698 continue; 699 } 700 int degree = (int)(angle * 180 / PI + 0.5); 701 if (degree % 90) { 702 continue; 703 } 704 if (degree < 0) { 705 degree += 360; 706 } 707 int index = degree / 90 % 3 - 1; 708 if (index < 0) { 709 continue; 710 } 711 rotated_count[index] ++; 712 } 713 if (total_count == 0) { 714 return; 715 } 716 CFX_AffineMatrix matrix; 717 if (rotated_count[0] > total_count * 2 / 3) { 718 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); 719 } else if (rotated_count[1] > total_count * 2 / 3) { 720 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); 721 } else if (rotated_count[2] > total_count * 2 / 3) { 722 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); 723 } else { 724 return; 725 } 726 page.Transform(matrix); 727 page_bbox.Transform(&matrix); 728 } 729 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 730 int iMinWidth, FX_DWORD flags) 731 { 732 lines.RemoveAll(); 733 if (pPage == NULL) { 734 return; 735 } 736 CPDF_Page page; 737 page.Load(pDoc, pPage); 738 CPDF_ParseOptions options; 739 options.m_bTextOnly = TRUE; 740 options.m_bSeparateForm = FALSE; 741 page.ParseContent(&options); 742 CFX_FloatRect page_bbox = page.GetPageBBox(); 743 if (flags & PDF2TXT_AUTO_ROTATE) { 744 CheckRotate(page, page_bbox); 745 } 746 CTextPage texts; 747 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; 748 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; 749 texts.m_bBreakSpace = TRUE; 750 FX_POSITION pos = page.GetFirstObjectPosition(); 751 while (pos) { 752 CPDF_PageObject* pObject = page.GetNextObject(pos); 753 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { 754 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top); 755 if (!page_bbox.Contains(rect)) { 756 continue; 757 } 758 } 759 texts.ProcessObject(pObject); 760 } 761 texts.WriteOutput(lines, iMinWidth); 762 } 763 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, 764 int iMinWidth, FX_DWORD flags) 765 { 766 lines.RemoveAll(); 767 CFX_WideStringArray wlines; 768 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); 769 for (int i = 0; i < wlines.GetSize(); i ++) { 770 CFX_WideString wstr = wlines[i]; 771 CFX_ByteString str; 772 for (int c = 0; c < wstr.GetLength(); c ++) { 773 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); 774 } 775 lines.Add(str); 776 } 777 } 778 #endif 779 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF, 780 CFX_PtrArray* pObjArray); 781 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage, FX_DWORD flags) 782 { 783 buffer.EstimateSize(0, 10240); 784 CPDF_Page page; 785 page.Load(pDoc, pPage); 786 CPDF_ParseOptions options; 787 options.m_bTextOnly = TRUE; 788 options.m_bSeparateForm = FALSE; 789 page.ParseContent(&options); 790 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL); 791 } 792