1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "core/include/fpdfapi/fpdf_page.h" 8 #include "core/include/fpdfapi/fpdf_pageobj.h" 9 #include "text_int.h" 10 11 class CPDF_TextStream { 12 public: 13 CPDF_TextStream(CFX_WideTextBuf& buffer, 14 FX_BOOL bUseLF, 15 CFX_PtrArray* pObjArray); 16 ~CPDF_TextStream() {} 17 FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine); 18 CFX_WideTextBuf& m_Buffer; 19 FX_BOOL m_bUseLF; 20 CFX_PtrArray* m_pObjArray; 21 const CPDF_TextObject* m_pLastObj; 22 }; 23 CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer, 24 FX_BOOL bUseLF, 25 CFX_PtrArray* pObjArray) 26 : m_Buffer(buffer) { 27 m_pLastObj = NULL; 28 m_bUseLF = bUseLF; 29 m_pObjArray = pObjArray; 30 } 31 FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1, 32 const CPDF_TextObject* pTextObj2) { 33 if (!pTextObj1 || !pTextObj2) { 34 return FALSE; 35 } 36 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, 37 pTextObj2->m_Right, pTextObj2->m_Top); 38 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, 39 pTextObj1->m_Right, pTextObj1->m_Top); 40 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) { 41 return TRUE; 42 } 43 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) { 44 rcPreObj.Intersect(rcCurObj); 45 if (rcPreObj.IsEmpty()) { 46 return FALSE; 47 } 48 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > 49 rcCurObj.Width() / 2) { 50 return FALSE; 51 } 52 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) { 53 return FALSE; 54 } 55 } 56 int nPreCount = pTextObj2->CountItems(); 57 int nCurCount = pTextObj1->CountItems(); 58 if (nPreCount != nCurCount) { 59 return FALSE; 60 } 61 for (int i = 0; i < nPreCount; i++) { 62 CPDF_TextObjectItem itemPer, itemCur; 63 pTextObj2->GetItemInfo(i, &itemPer); 64 pTextObj1->GetItemInfo(i, &itemCur); 65 if (itemCur.m_CharCode != itemPer.m_CharCode) { 66 return FALSE; 67 } 68 } 69 return TRUE; 70 } 71 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) { 72 if (charCode == -1) { 73 return 0; 74 } 75 int w = pFont->GetCharWidthF(charCode); 76 if (w == 0) { 77 CFX_ByteString str; 78 pFont->AppendChar(str, charCode); 79 w = pFont->GetStringWidth(str, 1); 80 if (w == 0) { 81 FX_RECT BBox; 82 pFont->GetCharBBox(charCode, BBox); 83 w = BBox.right - BBox.left; 84 } 85 } 86 return w; 87 } 88 int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj, 89 const CPDF_TextObject* pObj) { 90 if (FPDFText_IsSameTextObject(pPrevObj, pObj)) { 91 return -1; 92 } 93 CPDF_TextObjectItem item; 94 int nItem = pPrevObj->CountItems(); 95 pPrevObj->GetItemInfo(nItem - 1, &item); 96 FX_WCHAR preChar = 0, curChar = 0; 97 CFX_WideString wstr = 98 pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 99 if (wstr.GetLength()) { 100 preChar = wstr.GetAt(0); 101 } 102 FX_FLOAT last_pos = item.m_OriginX; 103 int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont()); 104 FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000; 105 last_width = FXSYS_fabs(last_width); 106 pObj->GetItemInfo(0, &item); 107 wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); 108 if (wstr.GetLength()) { 109 curChar = wstr.GetAt(0); 110 } 111 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont()); 112 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000; 113 this_width = FXSYS_fabs(this_width); 114 FX_FLOAT threshold = 115 last_width > this_width ? last_width / 4 : this_width / 4; 116 CFX_Matrix prev_matrix, prev_reverse; 117 pPrevObj->GetTextMatrix(&prev_matrix); 118 prev_reverse.SetReverse(prev_matrix); 119 FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY(); 120 prev_reverse.Transform(x, y); 121 if (FXSYS_fabs(y) > threshold * 2) { 122 return 2; 123 } 124 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth); 125 threshold = threshold > 400 126 ? (threshold < 700 ? threshold / 4 : threshold / 5) 127 : (threshold / 2); 128 threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize()) 129 : FXSYS_fabs(pObj->GetFontSize()); 130 threshold /= 1000; 131 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && 132 preChar != L' ') 133 if (curChar != L' ' && preChar != L' ') { 134 if ((x - last_pos - last_width) > threshold || 135 (last_pos - x - last_width) > threshold) { 136 return 1; 137 } 138 if (x < 0 && (last_pos - x - last_width) > threshold) { 139 return 1; 140 } 141 if ((x - last_pos - last_width) > this_width || 142 (x - last_pos - this_width) > last_width) { 143 return 1; 144 } 145 } 146 if (last_pos + last_width > x + this_width && curChar == L' ') { 147 return 3; 148 } 149 return 0; 150 } 151 FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, 152 FX_BOOL bFirstLine) { 153 CPDF_Font* pFont = pObj->GetFont(); 154 CFX_Matrix matrix; 155 pObj->GetTextMatrix(&matrix); 156 int item_index = 0; 157 if (m_pLastObj) { 158 int result = FPDFText_ProcessInterObj(m_pLastObj, pObj); 159 if (result == 2) { 160 int len = m_Buffer.GetLength(); 161 if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') { 162 m_Buffer.Delete(len - 1, 1); 163 if (m_pObjArray) { 164 m_pObjArray->RemoveAt((len - 1) * 2, 2); 165 } 166 } else { 167 if (bFirstLine) { 168 return TRUE; 169 } 170 if (m_bUseLF) { 171 m_Buffer.AppendChar(L'\r'); 172 m_Buffer.AppendChar(L'\n'); 173 if (m_pObjArray) { 174 for (int i = 0; i < 4; i++) { 175 m_pObjArray->Add(NULL); 176 } 177 } 178 } else { 179 m_Buffer.AppendChar(' '); 180 if (m_pObjArray) { 181 m_pObjArray->Add(NULL); 182 m_pObjArray->Add(NULL); 183 } 184 } 185 } 186 } else if (result == 1) { 187 m_Buffer.AppendChar(L' '); 188 if (m_pObjArray) { 189 m_pObjArray->Add(NULL); 190 m_pObjArray->Add(NULL); 191 } 192 } else if (result == -1) { 193 m_pLastObj = pObj; 194 return FALSE; 195 } else if (result == 3) { 196 item_index = 1; 197 } 198 } 199 m_pLastObj = pObj; 200 int nItems = pObj->CountItems(); 201 FX_FLOAT Ignorekerning = 0; 202 for (int i = 1; i < nItems - 1; i += 2) { 203 CPDF_TextObjectItem item; 204 pObj->GetItemInfo(i, &item); 205 if (item.m_CharCode == (FX_DWORD)-1) { 206 if (i == 1) { 207 Ignorekerning = item.m_OriginX; 208 } else if (Ignorekerning > item.m_OriginX) { 209 Ignorekerning = item.m_OriginX; 210 } 211 } else { 212 Ignorekerning = 0; 213 break; 214 } 215 } 216 FX_FLOAT spacing = 0; 217 for (; item_index < nItems; item_index++) { 218 CPDF_TextObjectItem item; 219 pObj->GetItemInfo(item_index, &item); 220 if (item.m_CharCode == (FX_DWORD)-1) { 221 CFX_WideString wstr = m_Buffer.GetWideString(); 222 if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') { 223 continue; 224 } 225 FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH(); 226 spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000; 227 continue; 228 } 229 FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace; 230 if (nItems > 3 && !spacing) { 231 charSpace = 0; 232 } 233 if ((spacing || charSpace) && item_index > 0) { 234 int last_width = 0; 235 FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH(); 236 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 237 FX_FLOAT threshold = 0; 238 if (space_charcode != -1) { 239 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; 240 } 241 if (threshold > fontsize_h / 3) { 242 threshold = 0; 243 } else { 244 threshold /= 2; 245 } 246 if (threshold == 0) { 247 threshold = fontsize_h; 248 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); 249 threshold = this_width > last_width ? (FX_FLOAT)this_width 250 : (FX_FLOAT)last_width; 251 int nDivide = 6; 252 if (threshold < 300) { 253 nDivide = 2; 254 } else if (threshold < 500) { 255 nDivide = 4; 256 } else if (threshold < 700) { 257 nDivide = 5; 258 } 259 threshold = threshold / nDivide; 260 threshold = fontsize_h * threshold / 1000; 261 } 262 if (charSpace > 0.001) { 263 spacing += matrix.TransformDistance(charSpace); 264 } else if (charSpace < -0.001) { 265 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace)); 266 } 267 if (threshold && (spacing && spacing >= threshold)) { 268 m_Buffer.AppendChar(L' '); 269 if (m_pObjArray) { 270 m_pObjArray->Add(NULL); 271 m_pObjArray->Add(NULL); 272 } 273 } 274 if (item.m_CharCode == (FX_DWORD)-1) { 275 continue; 276 } 277 spacing = 0; 278 } 279 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode); 280 if (unicode_str.IsEmpty()) { 281 m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode); 282 if (m_pObjArray) { 283 m_pObjArray->Add((void*)pObj); 284 m_pObjArray->Add((void*)(intptr_t)item_index); 285 } 286 } else { 287 m_Buffer << unicode_str; 288 if (m_pObjArray) { 289 for (int i = 0; i < unicode_str.GetLength(); i++) { 290 m_pObjArray->Add((void*)pObj); 291 m_pObjArray->Add((void*)(intptr_t)item_index); 292 } 293 } 294 } 295 } 296 return FALSE; 297 } 298 void GetTextStream_Unicode(CFX_WideTextBuf& buffer, 299 CPDF_PageObjects* pPage, 300 FX_BOOL bUseLF, 301 CFX_PtrArray* pObjArray) { 302 CPDF_TextStream textstream(buffer, bUseLF, pObjArray); 303 FX_POSITION pos = pPage->GetFirstObjectPosition(); 304 while (pos) { 305 CPDF_PageObject* pObject = pPage->GetNextObject(pos); 306 if (pObject && pObject->m_Type == PDFPAGE_TEXT) 307 textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE); 308 } 309 } 310 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, 311 CPDF_Dictionary* pPage) { 312 CFX_WideTextBuf buffer; 313 buffer.EstimateSize(0, 1024); 314 CPDF_Page page; 315 page.Load(pDoc, pPage); 316 CPDF_ParseOptions options; 317 options.m_bTextOnly = TRUE; 318 options.m_bSeparateForm = FALSE; 319 page.ParseContent(&options); 320 CPDF_TextStream textstream(buffer, FALSE, NULL); 321 FX_POSITION pos = page.GetFirstObjectPosition(); 322 while (pos) { 323 CPDF_PageObject* pObject = page.GetNextObject(pos); 324 if (pObject->m_Type != PDFPAGE_TEXT) { 325 continue; 326 } 327 if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) { 328 break; 329 } 330 } 331 return buffer.GetWideString(); 332 } 333