1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "../../include/fpdfapi/fpdf_pageobj.h" 8 #include "../../include/fpdftext/fpdf_text.h" 9 #include "../../include/fpdfapi/fpdf_page.h" 10 class CPDF_TextStream : public CFX_Object 11 { 12 public: 13 CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray); 14 ~CPDF_TextStream() {} 15 FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine); 16 CFX_WideTextBuf& m_Buffer; 17 FX_BOOL m_bUseLF; 18 CFX_PtrArray* m_pObjArray; 19 const CPDF_TextObject* m_pLastObj; 20 }; 21 CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray) : m_Buffer(buffer) 22 { 23 m_pLastObj = NULL; 24 m_bUseLF = bUseLF; 25 m_pObjArray = pObjArray; 26 } 27 FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1, const CPDF_TextObject* pTextObj2) 28 { 29 if (!pTextObj1 || !pTextObj2) { 30 return FALSE; 31 } 32 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_Right, pTextObj2->m_Top); 33 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_Right, pTextObj1->m_Top); 34 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) { 35 return TRUE; 36 } 37 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) { 38 rcPreObj.Intersect(rcCurObj); 39 if (rcPreObj.IsEmpty()) { 40 return FALSE; 41 } 42 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) { 43 return FALSE; 44 } 45 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) { 46 return FALSE; 47 } 48 } 49 int nPreCount = pTextObj2->CountItems(); 50 int nCurCount = pTextObj1->CountItems(); 51 if (nPreCount != nCurCount) { 52 return FALSE; 53 } 54 for (int i = 0; i < nPreCount; i++) { 55 CPDF_TextObjectItem itemPer, itemCur; 56 pTextObj2->GetItemInfo(i, &itemPer); 57 pTextObj1->GetItemInfo(i, &itemCur); 58 if (itemCur.m_CharCode != itemPer.m_CharCode) { 59 return FALSE; 60 } 61 } 62 return TRUE; 63 } 64 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) 65 { 66 if(charCode == -1) { 67 return 0; 68 } 69 int w = pFont->GetCharWidthF(charCode); 70 if(w == 0) { 71 CFX_ByteString str; 72 pFont->AppendChar(str, charCode); 73 w = pFont->GetStringWidth(str, 1); 74 if(w == 0) { 75 FX_RECT BBox; 76 pFont->GetCharBBox(charCode, BBox); 77 w = BBox.right - BBox.left; 78 } 79 } 80 return w; 81 } 82 int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj, const CPDF_TextObject* pObj) 83 { 84 if(FPDFText_IsSameTextObject(pPrevObj, pObj)) { 85 return -1; 86 } 87 CPDF_TextObjectItem item; 88 int nItem = pPrevObj->CountItems(); 89 pPrevObj->GetItemInfo(nItem - 1, &item); 90 FX_WCHAR preChar = 0, curChar = 0; 91 CFX_WideString wstr = pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 92 if(wstr.GetLength()) { 93 preChar = wstr.GetAt(0); 94 } 95 FX_FLOAT last_pos = item.m_OriginX; 96 int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont()); 97 FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000; 98 last_width = FXSYS_fabs(last_width); 99 pObj->GetItemInfo(0, &item); 100 wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 101 if(wstr.GetLength()) { 102 curChar = wstr.GetAt(0); 103 } 104 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont()); 105 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000; 106 this_width = FXSYS_fabs(this_width); 107 FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4; 108 CFX_AffineMatrix prev_matrix, prev_reverse; 109 pPrevObj->GetTextMatrix(&prev_matrix); 110 prev_reverse.SetReverse(prev_matrix); 111 FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY(); 112 prev_reverse.Transform(x, y); 113 if (FXSYS_fabs(y) > threshold * 2) { 114 return 2; 115 } 116 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth); 117 threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 : threshold / 5) : (threshold / 2); 118 threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize()) : FXSYS_fabs(pObj->GetFontSize()); 119 threshold /= 1000; 120 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ') 121 if(curChar != L' ' && preChar != L' ') { 122 if((x - last_pos - last_width) > threshold || (last_pos - x - last_width) > threshold) { 123 return 1; 124 } 125 if(x < 0 && (last_pos - x - last_width) > threshold) { 126 return 1; 127 } 128 if((x - last_pos - last_width) > this_width || (x - last_pos - this_width) > last_width ) { 129 return 1; 130 } 131 } 132 if(last_pos + last_width > x + this_width && curChar == L' ') { 133 return 3; 134 } 135 return 0; 136 } 137 FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine) 138 { 139 if(pObj->m_Bottom > 380 && pObj->m_Left < 45 && pObj->m_Top < 402) { 140 int i = 0; 141 } 142 CPDF_Font* pFont = pObj->GetFont(); 143 CFX_AffineMatrix matrix; 144 pObj->GetTextMatrix(&matrix); 145 FX_FLOAT fs = pObj->GetFontSize(); 146 int item_index = 0; 147 if (m_pLastObj) { 148 int result = FPDFText_ProcessInterObj(m_pLastObj, pObj); 149 if (result == 2) { 150 int len = m_Buffer.GetLength(); 151 if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') { 152 m_Buffer.Delete(len - 1, 1); 153 if (m_pObjArray) { 154 m_pObjArray->RemoveAt((len - 1) * 2, 2); 155 } 156 } else { 157 if (bFirstLine) { 158 return TRUE; 159 } 160 if (m_bUseLF) { 161 m_Buffer.AppendChar(L'\r'); 162 m_Buffer.AppendChar(L'\n'); 163 if (m_pObjArray) { 164 for (int i = 0; i < 4; i ++) { 165 m_pObjArray->Add(NULL); 166 } 167 } 168 } else { 169 m_Buffer.AppendChar(' '); 170 if (m_pObjArray) { 171 m_pObjArray->Add(NULL); 172 m_pObjArray->Add(NULL); 173 } 174 } 175 } 176 } else if (result == 1) { 177 m_Buffer.AppendChar(L' '); 178 if (m_pObjArray) { 179 m_pObjArray->Add(NULL); 180 m_pObjArray->Add(NULL); 181 } 182 } else if (result == -1) { 183 m_pLastObj = pObj; 184 return FALSE; 185 } else if (result == 3) { 186 item_index = 1; 187 } 188 } 189 m_pLastObj = pObj; 190 int nItems = pObj->CountItems(); 191 FX_FLOAT Ignorekerning = 0; 192 for(int i = 1; i < nItems - 1; i += 2) { 193 CPDF_TextObjectItem item; 194 pObj->GetItemInfo(i, &item); 195 if (item.m_CharCode == (FX_DWORD) - 1) { 196 if(i == 1) { 197 Ignorekerning = item.m_OriginX; 198 } else if(Ignorekerning > item.m_OriginX) { 199 Ignorekerning = item.m_OriginX; 200 } 201 } else { 202 Ignorekerning = 0; 203 break; 204 } 205 } 206 FX_FLOAT spacing = 0; 207 for (; item_index < nItems; item_index ++) { 208 CPDF_TextObjectItem item; 209 pObj->GetItemInfo(item_index, &item); 210 if (item.m_CharCode == (FX_DWORD) - 1) { 211 CFX_WideString wstr = m_Buffer.GetWideString(); 212 if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') { 213 continue; 214 } 215 FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH(); 216 spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000; 217 continue; 218 } 219 FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace; 220 if(nItems > 3 && !spacing) { 221 charSpace = 0; 222 } 223 if((spacing || charSpace) && item_index > 0) { 224 int last_width = 0; 225 FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH(); 226 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 227 FX_FLOAT threshold = 0; 228 if (space_charcode != -1) { 229 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ; 230 } 231 if(threshold > fontsize_h / 3) { 232 threshold = 0; 233 } else { 234 threshold /= 2; 235 } 236 if (threshold == 0) { 237 threshold = fontsize_h; 238 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); 239 threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width; 240 int nDivide = 6; 241 if (threshold < 300) { 242 nDivide = 2; 243 } else if (threshold < 500) { 244 nDivide = 4; 245 } else if (threshold < 700) { 246 nDivide = 5; 247 } 248 threshold = threshold / nDivide; 249 threshold = fontsize_h * threshold / 1000; 250 } 251 if(charSpace > 0.001) { 252 spacing += matrix.TransformDistance(charSpace); 253 } else if(charSpace < -0.001) { 254 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace)); 255 } 256 if (threshold && (spacing && spacing >= threshold) ) { 257 m_Buffer.AppendChar(L' '); 258 if (m_pObjArray) { 259 m_pObjArray->Add(NULL); 260 m_pObjArray->Add(NULL); 261 } 262 } 263 if (item.m_CharCode == (FX_DWORD) - 1) { 264 continue; 265 } 266 spacing = 0; 267 } 268 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode); 269 if (unicode_str.IsEmpty()) { 270 m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode); 271 if (m_pObjArray) { 272 m_pObjArray->Add((void*)pObj); 273 m_pObjArray->Add((void*)(FX_INTPTR)item_index); 274 } 275 } else { 276 m_Buffer << unicode_str; 277 if (m_pObjArray) { 278 for (int i = 0; i < unicode_str.GetLength(); i ++) { 279 m_pObjArray->Add((void*)pObj); 280 m_pObjArray->Add((void*)(FX_INTPTR)item_index); 281 } 282 } 283 } 284 } 285 return FALSE; 286 } 287 void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF, 288 CFX_PtrArray* pObjArray) 289 { 290 CPDF_TextStream textstream(buffer, bUseLF, pObjArray); 291 FX_POSITION pos = pPage->GetFirstObjectPosition(); 292 while (pos) { 293 CPDF_PageObject* pObject = pPage->GetNextObject(pos); 294 if (pObject == NULL) { 295 continue; 296 } 297 if (pObject->m_Type != PDFPAGE_TEXT) { 298 continue; 299 } 300 textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE); 301 } 302 } 303 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage) 304 { 305 CFX_WideTextBuf buffer; 306 buffer.EstimateSize(0, 1024); 307 CPDF_Page page; 308 page.Load(pDoc, pPage); 309 CPDF_ParseOptions options; 310 options.m_bTextOnly = TRUE; 311 options.m_bSeparateForm = FALSE; 312 page.ParseContent(&options); 313 CPDF_TextStream textstream(buffer, FALSE, NULL); 314 FX_POSITION pos = page.GetFirstObjectPosition(); 315 while (pos) { 316 CPDF_PageObject* pObject = page.GetNextObject(pos); 317 if (pObject->m_Type != PDFPAGE_TEXT) { 318 continue; 319 } 320 if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) { 321 break; 322 } 323 } 324 return buffer.GetWideString(); 325 } 326