Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #include "core/include/fpdfapi/fpdf_page.h"
      8 #include "core/include/fpdfapi/fpdf_pageobj.h"
      9 #include "text_int.h"
     10 
// Accumulates the text of successive page text objects into a caller-supplied
// wide-character buffer, inserting separators between objects as decided by
// FPDFText_ProcessInterObj().
class CPDF_TextStream {
 public:
  // Binds the stream to |buffer| and stores |bUseLF| and |pObjArray| for use
  // by ProcessObject(). |pObjArray| may be NULL.
  CPDF_TextStream(CFX_WideTextBuf& buffer,
                  FX_BOOL bUseLF,
                  CFX_PtrArray* pObjArray);
  ~CPDF_TextStream() {}
  // Appends the text of |pObj| to the buffer. Returns TRUE only when
  // |bFirstLine| is set and |pObj| is found to start a new line (and no
  // trailing hyphen was removed instead); returns FALSE in every other case.
  FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine);
  // Destination buffer; held by reference.
  CFX_WideTextBuf& m_Buffer;
  // When TRUE a detected line break is emitted as "\r\n" (or removes a
  // trailing '-'); when FALSE it is emitted as a single space.
  FX_BOOL m_bUseLF;
  // Optional parallel array. ProcessObject() adds two entries per appended
  // character: (text object, item index), or (NULL, NULL) for separators.
  CFX_PtrArray* m_pObjArray;
  // The text object passed to the previous ProcessObject() call, or NULL
  // before the first call.
  const CPDF_TextObject* m_pLastObj;
};
     23 CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer,
     24                                  FX_BOOL bUseLF,
     25                                  CFX_PtrArray* pObjArray)
     26     : m_Buffer(buffer) {
     27   m_pLastObj = NULL;
     28   m_bUseLF = bUseLF;
     29   m_pObjArray = pObjArray;
     30 }
     31 FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1,
     32                                   const CPDF_TextObject* pTextObj2) {
     33   if (!pTextObj1 || !pTextObj2) {
     34     return FALSE;
     35   }
     36   CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
     37                          pTextObj2->m_Right, pTextObj2->m_Top);
     38   CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
     39                          pTextObj1->m_Right, pTextObj1->m_Top);
     40   if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
     41     return TRUE;
     42   }
     43   if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
     44     rcPreObj.Intersect(rcCurObj);
     45     if (rcPreObj.IsEmpty()) {
     46       return FALSE;
     47     }
     48     if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
     49         rcCurObj.Width() / 2) {
     50       return FALSE;
     51     }
     52     if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
     53       return FALSE;
     54     }
     55   }
     56   int nPreCount = pTextObj2->CountItems();
     57   int nCurCount = pTextObj1->CountItems();
     58   if (nPreCount != nCurCount) {
     59     return FALSE;
     60   }
     61   for (int i = 0; i < nPreCount; i++) {
     62     CPDF_TextObjectItem itemPer, itemCur;
     63     pTextObj2->GetItemInfo(i, &itemPer);
     64     pTextObj1->GetItemInfo(i, &itemCur);
     65     if (itemCur.m_CharCode != itemPer.m_CharCode) {
     66       return FALSE;
     67     }
     68   }
     69   return TRUE;
     70 }
     71 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) {
     72   if (charCode == -1) {
     73     return 0;
     74   }
     75   int w = pFont->GetCharWidthF(charCode);
     76   if (w == 0) {
     77     CFX_ByteString str;
     78     pFont->AppendChar(str, charCode);
     79     w = pFont->GetStringWidth(str, 1);
     80     if (w == 0) {
     81       FX_RECT BBox;
     82       pFont->GetCharBBox(charCode, BBox);
     83       w = BBox.right - BBox.left;
     84     }
     85   }
     86   return w;
     87 }
     88 int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj,
     89                              const CPDF_TextObject* pObj) {
     90   if (FPDFText_IsSameTextObject(pPrevObj, pObj)) {
     91     return -1;
     92   }
     93   CPDF_TextObjectItem item;
     94   int nItem = pPrevObj->CountItems();
     95   pPrevObj->GetItemInfo(nItem - 1, &item);
     96   FX_WCHAR preChar = 0, curChar = 0;
     97   CFX_WideString wstr =
     98       pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
     99   if (wstr.GetLength()) {
    100     preChar = wstr.GetAt(0);
    101   }
    102   FX_FLOAT last_pos = item.m_OriginX;
    103   int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont());
    104   FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000;
    105   last_width = FXSYS_fabs(last_width);
    106   pObj->GetItemInfo(0, &item);
    107   wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
    108   if (wstr.GetLength()) {
    109     curChar = wstr.GetAt(0);
    110   }
    111   int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
    112   FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
    113   this_width = FXSYS_fabs(this_width);
    114   FX_FLOAT threshold =
    115       last_width > this_width ? last_width / 4 : this_width / 4;
    116   CFX_Matrix prev_matrix, prev_reverse;
    117   pPrevObj->GetTextMatrix(&prev_matrix);
    118   prev_reverse.SetReverse(prev_matrix);
    119   FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY();
    120   prev_reverse.Transform(x, y);
    121   if (FXSYS_fabs(y) > threshold * 2) {
    122     return 2;
    123   }
    124   threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
    125   threshold = threshold > 400
    126                   ? (threshold < 700 ? threshold / 4 : threshold / 5)
    127                   : (threshold / 2);
    128   threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize())
    129                                        : FXSYS_fabs(pObj->GetFontSize());
    130   threshold /= 1000;
    131   if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
    132       preChar != L' ')
    133     if (curChar != L' ' && preChar != L' ') {
    134       if ((x - last_pos - last_width) > threshold ||
    135           (last_pos - x - last_width) > threshold) {
    136         return 1;
    137       }
    138       if (x < 0 && (last_pos - x - last_width) > threshold) {
    139         return 1;
    140       }
    141       if ((x - last_pos - last_width) > this_width ||
    142           (x - last_pos - this_width) > last_width) {
    143         return 1;
    144       }
    145     }
    146   if (last_pos + last_width > x + this_width && curChar == L' ') {
    147     return 3;
    148   }
    149   return 0;
    150 }
    151 FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj,
    152                                        FX_BOOL bFirstLine) {
    153   CPDF_Font* pFont = pObj->GetFont();
    154   CFX_Matrix matrix;
    155   pObj->GetTextMatrix(&matrix);
    156   int item_index = 0;
    157   if (m_pLastObj) {
    158     int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
    159     if (result == 2) {
    160       int len = m_Buffer.GetLength();
    161       if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
    162         m_Buffer.Delete(len - 1, 1);
    163         if (m_pObjArray) {
    164           m_pObjArray->RemoveAt((len - 1) * 2, 2);
    165         }
    166       } else {
    167         if (bFirstLine) {
    168           return TRUE;
    169         }
    170         if (m_bUseLF) {
    171           m_Buffer.AppendChar(L'\r');
    172           m_Buffer.AppendChar(L'\n');
    173           if (m_pObjArray) {
    174             for (int i = 0; i < 4; i++) {
    175               m_pObjArray->Add(NULL);
    176             }
    177           }
    178         } else {
    179           m_Buffer.AppendChar(' ');
    180           if (m_pObjArray) {
    181             m_pObjArray->Add(NULL);
    182             m_pObjArray->Add(NULL);
    183           }
    184         }
    185       }
    186     } else if (result == 1) {
    187       m_Buffer.AppendChar(L' ');
    188       if (m_pObjArray) {
    189         m_pObjArray->Add(NULL);
    190         m_pObjArray->Add(NULL);
    191       }
    192     } else if (result == -1) {
    193       m_pLastObj = pObj;
    194       return FALSE;
    195     } else if (result == 3) {
    196       item_index = 1;
    197     }
    198   }
    199   m_pLastObj = pObj;
    200   int nItems = pObj->CountItems();
    201   FX_FLOAT Ignorekerning = 0;
    202   for (int i = 1; i < nItems - 1; i += 2) {
    203     CPDF_TextObjectItem item;
    204     pObj->GetItemInfo(i, &item);
    205     if (item.m_CharCode == (FX_DWORD)-1) {
    206       if (i == 1) {
    207         Ignorekerning = item.m_OriginX;
    208       } else if (Ignorekerning > item.m_OriginX) {
    209         Ignorekerning = item.m_OriginX;
    210       }
    211     } else {
    212       Ignorekerning = 0;
    213       break;
    214     }
    215   }
    216   FX_FLOAT spacing = 0;
    217   for (; item_index < nItems; item_index++) {
    218     CPDF_TextObjectItem item;
    219     pObj->GetItemInfo(item_index, &item);
    220     if (item.m_CharCode == (FX_DWORD)-1) {
    221       CFX_WideString wstr = m_Buffer.GetWideString();
    222       if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
    223         continue;
    224       }
    225       FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
    226       spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
    227       continue;
    228     }
    229     FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
    230     if (nItems > 3 && !spacing) {
    231       charSpace = 0;
    232     }
    233     if ((spacing || charSpace) && item_index > 0) {
    234       int last_width = 0;
    235       FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
    236       FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
    237       FX_FLOAT threshold = 0;
    238       if (space_charcode != -1) {
    239         threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
    240       }
    241       if (threshold > fontsize_h / 3) {
    242         threshold = 0;
    243       } else {
    244         threshold /= 2;
    245       }
    246       if (threshold == 0) {
    247         threshold = fontsize_h;
    248         int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
    249         threshold = this_width > last_width ? (FX_FLOAT)this_width
    250                                             : (FX_FLOAT)last_width;
    251         int nDivide = 6;
    252         if (threshold < 300) {
    253           nDivide = 2;
    254         } else if (threshold < 500) {
    255           nDivide = 4;
    256         } else if (threshold < 700) {
    257           nDivide = 5;
    258         }
    259         threshold = threshold / nDivide;
    260         threshold = fontsize_h * threshold / 1000;
    261       }
    262       if (charSpace > 0.001) {
    263         spacing += matrix.TransformDistance(charSpace);
    264       } else if (charSpace < -0.001) {
    265         spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
    266       }
    267       if (threshold && (spacing && spacing >= threshold)) {
    268         m_Buffer.AppendChar(L' ');
    269         if (m_pObjArray) {
    270           m_pObjArray->Add(NULL);
    271           m_pObjArray->Add(NULL);
    272         }
    273       }
    274       if (item.m_CharCode == (FX_DWORD)-1) {
    275         continue;
    276       }
    277       spacing = 0;
    278     }
    279     CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
    280     if (unicode_str.IsEmpty()) {
    281       m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
    282       if (m_pObjArray) {
    283         m_pObjArray->Add((void*)pObj);
    284         m_pObjArray->Add((void*)(intptr_t)item_index);
    285       }
    286     } else {
    287       m_Buffer << unicode_str;
    288       if (m_pObjArray) {
    289         for (int i = 0; i < unicode_str.GetLength(); i++) {
    290           m_pObjArray->Add((void*)pObj);
    291           m_pObjArray->Add((void*)(intptr_t)item_index);
    292         }
    293       }
    294     }
    295   }
    296   return FALSE;
    297 }
    298 void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
    299                            CPDF_PageObjects* pPage,
    300                            FX_BOOL bUseLF,
    301                            CFX_PtrArray* pObjArray) {
    302   CPDF_TextStream textstream(buffer, bUseLF, pObjArray);
    303   FX_POSITION pos = pPage->GetFirstObjectPosition();
    304   while (pos) {
    305     CPDF_PageObject* pObject = pPage->GetNextObject(pos);
    306     if (pObject && pObject->m_Type == PDFPAGE_TEXT)
    307       textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE);
    308   }
    309 }
    310 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc,
    311                                             CPDF_Dictionary* pPage) {
    312   CFX_WideTextBuf buffer;
    313   buffer.EstimateSize(0, 1024);
    314   CPDF_Page page;
    315   page.Load(pDoc, pPage);
    316   CPDF_ParseOptions options;
    317   options.m_bTextOnly = TRUE;
    318   options.m_bSeparateForm = FALSE;
    319   page.ParseContent(&options);
    320   CPDF_TextStream textstream(buffer, FALSE, NULL);
    321   FX_POSITION pos = page.GetFirstObjectPosition();
    322   while (pos) {
    323     CPDF_PageObject* pObject = page.GetNextObject(pos);
    324     if (pObject->m_Type != PDFPAGE_TEXT) {
    325       continue;
    326     }
    327     if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
    328       break;
    329     }
    330   }
    331   return buffer.GetWideString();
    332 }
    333