Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "../../include/fpdfapi/fpdf_pageobj.h"
      8 #include "../../include/fpdftext/fpdf_text.h"
      9 #include "../../include/fpdfapi/fpdf_page.h"
     10 class CPDF_TextStream : public CFX_Object
     11 {
     12 public:
     13     CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray);
     14     ~CPDF_TextStream() {}
     15     FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine);
     16     CFX_WideTextBuf&	m_Buffer;
     17     FX_BOOL				m_bUseLF;
     18     CFX_PtrArray*		m_pObjArray;
     19     const CPDF_TextObject*	m_pLastObj;
     20 };
     21 CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray) : m_Buffer(buffer)
     22 {
     23     m_pLastObj = NULL;
     24     m_bUseLF = bUseLF;
     25     m_pObjArray = pObjArray;
     26 }
     27 FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1, const CPDF_TextObject* pTextObj2)
     28 {
     29     if (!pTextObj1 || !pTextObj2) {
     30         return FALSE;
     31     }
     32     CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_Right, pTextObj2->m_Top);
     33     CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_Right, pTextObj1->m_Top);
     34     if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
     35         return TRUE;
     36     }
     37     if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
     38         rcPreObj.Intersect(rcCurObj);
     39         if (rcPreObj.IsEmpty()) {
     40             return FALSE;
     41         }
     42         if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) {
     43             return FALSE;
     44         }
     45         if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
     46             return FALSE;
     47         }
     48     }
     49     int nPreCount = pTextObj2->CountItems();
     50     int nCurCount = pTextObj1->CountItems();
     51     if (nPreCount != nCurCount) {
     52         return FALSE;
     53     }
     54     for (int i = 0; i < nPreCount; i++) {
     55         CPDF_TextObjectItem itemPer, itemCur;
     56         pTextObj2->GetItemInfo(i, &itemPer);
     57         pTextObj1->GetItemInfo(i, &itemCur);
     58         if (itemCur.m_CharCode != itemPer.m_CharCode) {
     59             return FALSE;
     60         }
     61     }
     62     return TRUE;
     63 }
     64 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont)
     65 {
     66     if(charCode == -1) {
     67         return 0;
     68     }
     69     int w = pFont->GetCharWidthF(charCode);
     70     if(w == 0) {
     71         CFX_ByteString str;
     72         pFont->AppendChar(str, charCode);
     73         w = pFont->GetStringWidth(str, 1);
     74         if(w == 0) {
     75             FX_RECT BBox;
     76             pFont->GetCharBBox(charCode, BBox);
     77             w = BBox.right - BBox.left;
     78         }
     79     }
     80     return w;
     81 }
     82 int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj, const CPDF_TextObject* pObj)
     83 {
     84     if(FPDFText_IsSameTextObject(pPrevObj, pObj)) {
     85         return -1;
     86     }
     87     CPDF_TextObjectItem item;
     88     int nItem = pPrevObj->CountItems();
     89     pPrevObj->GetItemInfo(nItem - 1, &item);
     90     FX_WCHAR preChar = 0, curChar = 0;
     91     CFX_WideString wstr = pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
     92     if(wstr.GetLength()) {
     93         preChar = wstr.GetAt(0);
     94     }
     95     FX_FLOAT last_pos = item.m_OriginX;
     96     int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont());
     97     FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000;
     98     last_width = FXSYS_fabs(last_width);
     99     pObj->GetItemInfo(0, &item);
    100     wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
    101     if(wstr.GetLength()) {
    102         curChar = wstr.GetAt(0);
    103     }
    104     int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
    105     FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
    106     this_width = FXSYS_fabs(this_width);
    107     FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4;
    108     CFX_AffineMatrix prev_matrix, prev_reverse;
    109     pPrevObj->GetTextMatrix(&prev_matrix);
    110     prev_reverse.SetReverse(prev_matrix);
    111     FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY();
    112     prev_reverse.Transform(x, y);
    113     if (FXSYS_fabs(y) > threshold * 2) {
    114         return 2;
    115     }
    116     threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
    117     threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 :  threshold / 5) : (threshold / 2);
    118     threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize()) : FXSYS_fabs(pObj->GetFontSize());
    119     threshold /= 1000;
    120     if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ')
    121         if(curChar != L' ' && preChar != L' ') {
    122             if((x - last_pos - last_width) > threshold || (last_pos - x - last_width) > threshold) {
    123                 return 1;
    124             }
    125             if(x < 0 && (last_pos - x - last_width) > threshold) {
    126                 return 1;
    127             }
    128             if((x - last_pos - last_width) > this_width || (x - last_pos - this_width) > last_width ) {
    129                 return 1;
    130             }
    131         }
    132     if(last_pos + last_width > x + this_width && curChar == L' ') {
    133         return 3;
    134     }
    135     return 0;
    136 }
    137 FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine)
    138 {
    139     if(pObj->m_Bottom > 380 && pObj->m_Left < 45 && pObj->m_Top < 402) {
    140         int i = 0;
    141     }
    142     CPDF_Font* pFont = pObj->GetFont();
    143     CFX_AffineMatrix matrix;
    144     pObj->GetTextMatrix(&matrix);
    145     FX_FLOAT fs = pObj->GetFontSize();
    146     int item_index = 0;
    147     if (m_pLastObj) {
    148         int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
    149         if (result == 2) {
    150             int len = m_Buffer.GetLength();
    151             if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
    152                 m_Buffer.Delete(len - 1, 1);
    153                 if (m_pObjArray) {
    154                     m_pObjArray->RemoveAt((len - 1) * 2, 2);
    155                 }
    156             } else {
    157                 if (bFirstLine) {
    158                     return TRUE;
    159                 }
    160                 if (m_bUseLF) {
    161                     m_Buffer.AppendChar(L'\r');
    162                     m_Buffer.AppendChar(L'\n');
    163                     if (m_pObjArray) {
    164                         for (int i = 0; i < 4; i ++) {
    165                             m_pObjArray->Add(NULL);
    166                         }
    167                     }
    168                 } else {
    169                     m_Buffer.AppendChar(' ');
    170                     if (m_pObjArray) {
    171                         m_pObjArray->Add(NULL);
    172                         m_pObjArray->Add(NULL);
    173                     }
    174                 }
    175             }
    176         } else if (result == 1) {
    177             m_Buffer.AppendChar(L' ');
    178             if (m_pObjArray) {
    179                 m_pObjArray->Add(NULL);
    180                 m_pObjArray->Add(NULL);
    181             }
    182         } else if (result == -1) {
    183             m_pLastObj = pObj;
    184             return FALSE;
    185         } else if (result == 3) {
    186             item_index = 1;
    187         }
    188     }
    189     m_pLastObj = pObj;
    190     int nItems = pObj->CountItems();
    191     FX_FLOAT Ignorekerning = 0;
    192     for(int i = 1; i < nItems - 1; i += 2) {
    193         CPDF_TextObjectItem item;
    194         pObj->GetItemInfo(i, &item);
    195         if (item.m_CharCode == (FX_DWORD) - 1) {
    196             if(i == 1) {
    197                 Ignorekerning = item.m_OriginX;
    198             } else if(Ignorekerning > item.m_OriginX) {
    199                 Ignorekerning = item.m_OriginX;
    200             }
    201         } else {
    202             Ignorekerning = 0;
    203             break;
    204         }
    205     }
    206     FX_FLOAT spacing = 0;
    207     for (; item_index < nItems; item_index ++) {
    208         CPDF_TextObjectItem item;
    209         pObj->GetItemInfo(item_index, &item);
    210         if (item.m_CharCode == (FX_DWORD) - 1) {
    211             CFX_WideString wstr = m_Buffer.GetWideString();
    212             if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
    213                 continue;
    214             }
    215             FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
    216             spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
    217             continue;
    218         }
    219         FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
    220         if(nItems > 3 && !spacing) {
    221             charSpace = 0;
    222         }
    223         if((spacing || charSpace) && item_index > 0) {
    224             int last_width = 0;
    225             FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
    226             FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
    227             FX_FLOAT threshold = 0;
    228             if (space_charcode != -1) {
    229                 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ;
    230             }
    231             if(threshold > fontsize_h / 3) {
    232                 threshold = 0;
    233             } else {
    234                 threshold /= 2;
    235             }
    236             if (threshold == 0) {
    237                 threshold = fontsize_h;
    238                 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
    239                 threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width;
    240                 int nDivide = 6;
    241                 if (threshold < 300) {
    242                     nDivide = 2;
    243                 } else if (threshold < 500) {
    244                     nDivide = 4;
    245                 } else if (threshold < 700) {
    246                     nDivide = 5;
    247                 }
    248                 threshold = threshold / nDivide;
    249                 threshold = fontsize_h * threshold / 1000;
    250             }
    251             if(charSpace > 0.001) {
    252                 spacing += matrix.TransformDistance(charSpace);
    253             } else if(charSpace < -0.001) {
    254                 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
    255             }
    256             if (threshold && (spacing && spacing >= threshold) ) {
    257                 m_Buffer.AppendChar(L' ');
    258                 if (m_pObjArray) {
    259                     m_pObjArray->Add(NULL);
    260                     m_pObjArray->Add(NULL);
    261                 }
    262             }
    263             if (item.m_CharCode == (FX_DWORD) - 1) {
    264                 continue;
    265             }
    266             spacing = 0;
    267         }
    268         CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
    269         if (unicode_str.IsEmpty()) {
    270             m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
    271             if (m_pObjArray) {
    272                 m_pObjArray->Add((void*)pObj);
    273                 m_pObjArray->Add((void*)(FX_INTPTR)item_index);
    274             }
    275         } else {
    276             m_Buffer << unicode_str;
    277             if (m_pObjArray) {
    278                 for (int i = 0; i < unicode_str.GetLength(); i ++) {
    279                     m_pObjArray->Add((void*)pObj);
    280                     m_pObjArray->Add((void*)(FX_INTPTR)item_index);
    281                 }
    282             }
    283         }
    284     }
    285     return FALSE;
    286 }
    287 void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF,
    288                                 CFX_PtrArray* pObjArray)
    289 {
    290     CPDF_TextStream textstream(buffer, bUseLF, pObjArray);
    291     FX_POSITION pos = pPage->GetFirstObjectPosition();
    292     while (pos) {
    293         CPDF_PageObject* pObject = pPage->GetNextObject(pos);
    294         if (pObject == NULL) {
    295             continue;
    296         }
    297         if (pObject->m_Type != PDFPAGE_TEXT) {
    298             continue;
    299         }
    300         textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE);
    301     }
    302 }
    303 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage)
    304 {
    305     CFX_WideTextBuf buffer;
    306     buffer.EstimateSize(0, 1024);
    307     CPDF_Page page;
    308     page.Load(pDoc, pPage);
    309     CPDF_ParseOptions options;
    310     options.m_bTextOnly = TRUE;
    311     options.m_bSeparateForm = FALSE;
    312     page.ParseContent(&options);
    313     CPDF_TextStream textstream(buffer, FALSE, NULL);
    314     FX_POSITION pos = page.GetFirstObjectPosition();
    315     while (pos) {
    316         CPDF_PageObject* pObject = page.GetNextObject(pos);
    317         if (pObject->m_Type != PDFPAGE_TEXT) {
    318             continue;
    319         }
    320         if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
    321             break;
    322         }
    323     }
    324     return buffer.GetWideString();
    325 }
    326