Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
      8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
      9 
     10 #include <deque>
     11 #include <vector>
     12 
     13 #include "core/fpdfapi/page/cpdf_pageobjectlist.h"
     14 #include "core/fxcrt/cfx_widetextbuf.h"
     15 #include "core/fxcrt/fx_coordinates.h"
     16 #include "core/fxcrt/fx_string.h"
     17 #include "core/fxcrt/unowned_ptr.h"
     18 
     19 class CPDF_Font;
     20 class CPDF_FormObject;
     21 class CPDF_Page;
     22 class CPDF_TextObject;
     23 
        // Option flags. Each value is a distinct single bit, so they can be OR-ed
        // together. NOTE(review): the names suggest text-search options; the code
        // that consumes them is not in this header - confirm against the callers.
     24 #define FPDFTEXT_MATCHCASE 0x00000001
     25 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002
     26 #define FPDFTEXT_CONSECUTIVE 0x00000004
     27 
        // Character-kind codes, numbered 0 through 4. Presumably the values stored
        // in FPDF_CHAR_INFO::m_Flag and PAGECHAR_INFO::m_Flag - TODO confirm in the
        // implementation file.
     28 #define FPDFTEXT_CHAR_NORMAL 0
     29 #define FPDFTEXT_CHAR_GENERATED 1
     30 #define FPDFTEXT_CHAR_UNUNICODE 2
     31 #define FPDFTEXT_CHAR_HYPHEN 3
     32 #define FPDFTEXT_CHAR_PIECE 4
     33 
        // Wide-character (wchar_t) and wide-string (const wchar_t[]) literals for
        // the space, line feed, carriage return and hyphen characters, plus the
        // empty string.
     34 #define TEXT_SPACE_CHAR L' '
     35 #define TEXT_LINEFEED_CHAR L'\n'
     36 #define TEXT_RETURN_CHAR L'\r'
     37 #define TEXT_HYPHEN_CHAR L'-'
     38 #define TEXT_EMPTY L""
     39 #define TEXT_HYPHEN L"-"
        // A double literal. NOTE(review): the name suggests a ratio threshold used
        // when judging gaps between characters; its use is not visible here.
     40 #define TEXT_CHARRATIO_GAPDELTA 0.070
     41 
        // Three-state result; this is the return type of
        // CPDF_TextPage::PreMarkedContent(). The enumerators have the values
        // Pass = 0, Done = 1 and Delay = 2 (the last two are implicit).
        // NOTE(review): what each state means to the caller is decided in the
        // implementation file, which is not shown here.
     42 enum class FPDFText_MarkedContent { Pass = 0, Done, Delay };
     43 
        // Direction selector taken by the CPDF_TextPage constructor. The two
        // enumerators are -1 and 1; there is no zero value.
     44 enum class FPDFText_Direction { Left = -1, Right = 1 };
     45 
        // Description of one character. CPDF_TextPage::GetCharInfo() takes a
        // pointer to one of these; presumably it fills the object in as an
        // out-parameter - confirm in the implementation file.
        // The constructor and destructor are declared here and defined elsewhere.
     46 class FPDF_CHAR_INFO {
     47  public:
     48   FPDF_CHAR_INFO();
     49   ~FPDF_CHAR_INFO();
     50 
     51   wchar_t m_Unicode;
          // NOTE(review): this is wchar_t, whereas PAGECHAR_INFO::m_CharCode is int
          // and CPDF_TextPage::GetCharWidth() takes a uint32_t char code. Where
          // wchar_t is 16 bits wide a larger char code would not fit - verify.
     52   wchar_t m_Charcode;
          // Presumably one of the FPDFTEXT_CHAR_* codes - TODO confirm.
     53   int32_t m_Flag;
     54   float m_FontSize;
     55   CFX_PointF m_Origin;
     56   CFX_FloatRect m_CharBox;
          // Held through UnownedPtr, i.e. this struct does not own the text object.
     57   UnownedPtr<CPDF_TextObject> m_pTextObj;
     58   CFX_Matrix m_Matrix;
     59 };
     60 
        // A plain (start, count) pair of ints.
        // NOTE(review): nothing else in this header refers to this type; the
        // field names suggest it describes a run of characters - confirm against
        // its users.
     61 struct FPDF_SEGMENT {
     62   int m_Start;
     63   int m_nCount;
     64 };
     65 
        // Per-character record used internally by CPDF_TextPage: it is the element
        // type of CPDF_TextPage::m_CharList and m_TempCharList, and is passed to
        // IsControlChar(), GenerateCharInfo() and the AddCharInfoBy*Direction()
        // helpers.
        // The default constructor, copy constructor and destructor are declared
        // here and defined elsewhere. Because a copy constructor is user-declared,
        // the compiler does not generate an implicit move constructor.
     66 class PAGECHAR_INFO {
     67  public:
     68   PAGECHAR_INFO();
     69   PAGECHAR_INFO(const PAGECHAR_INFO&);
     70   ~PAGECHAR_INFO();
     71 
     72   int m_Index;
     73   int m_CharCode;
     74   wchar_t m_Unicode;
          // Presumably one of the FPDFTEXT_CHAR_* codes - TODO confirm.
     75   int32_t m_Flag;
     76   CFX_PointF m_Origin;
     77   CFX_FloatRect m_CharBox;
          // Held through UnownedPtr, i.e. this record does not own the text object.
     78   UnownedPtr<CPDF_TextObject> m_pTextObj;
     79   CFX_Matrix m_Matrix;
     80 };
     81 
        // Pairs a text object pointer with a matrix. CPDF_TextPage stores these in
        // m_LineObj and passes them by value to ProcessTextObject(PDFTEXT_Obj),
        // PreMarkedContent() and ProcessMarkedContent().
        // The constructors and destructor are declared here and defined elsewhere.
     82 struct PDFTEXT_Obj {
     83   PDFTEXT_Obj();
     84   PDFTEXT_Obj(const PDFTEXT_Obj& that);
     85   ~PDFTEXT_Obj();
     86 
          // Held through UnownedPtr, i.e. this struct does not own the text object.
     87   UnownedPtr<CPDF_TextObject> m_pTextObj;
          // NOTE(review): presumably the matrix of the enclosing form object, given
          // the |formMatrix| parameters on CPDF_TextPage's helpers - confirm.
     88   CFX_Matrix m_formMatrix;
     89 };
     90 
        // Text-extraction state for a single CPDF_Page. Apart from the two inline
        // accessors, every member function is only declared here and defined
        // elsewhere, so the notes below are limited to what the signatures and
        // member types show.
     91 class CPDF_TextPage {
     92  public:
          // |pPage| is presumably stored in |m_pPage|, which is a non-owning
          // pointer. |flags| is a direction selector (an enum), not a bit mask.
     93   CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags);
     94   ~CPDF_TextPage();
     95 
     96   // IPDF_TextPage:
     97   void ParseTextPage();
          // Reports the |m_bIsParsed| flag.
     98   bool IsParsed() const { return m_bIsParsed; }
          // Conversions between the two index spaces used by this class. All
          // indices and counts in the public API are plain int.
     99   int CharIndexFromTextIndex(int TextIndex) const;
    100   int TextIndexFromCharIndex(int CharIndex) const;
    101   int CountChars() const;
          // |info| is presumably an out-parameter filled in for character |index|.
    102   void GetCharInfo(int index, FPDF_CHAR_INFO* info) const;
    103   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
    104   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
    105   WideString GetTextByRect(const CFX_FloatRect& rect) const;
    106 
    107   // Returns a string with the text from |m_TextBuf| that is covered by the
    108   // input range. |start| and |count| are in terms of |m_CharIndex|, so the
    109   // range will be converted into the appropriate indices.
    110   WideString GetPageText(int start, int count) const;
          // Convenience wrapper: the whole range [0, CountChars()).
    111   WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
    112 
          // CountRects() is non-const while GetRect() is const.
          // NOTE(review): presumably CountRects() fills |m_SelRects| and GetRect()
          // reads from it - confirm in the implementation file.
    113   int CountRects(int start, int nCount);
    114   bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
    115 
    116   static bool IsRectIntersect(const CFX_FloatRect& rect1,
    117                               const CFX_FloatRect& rect2);
    118 
    119  private:
          // Result type of GetTextObjectWritingMode() and
          // FindTextlineFlowOrientation(); also the type of |m_TextlineDir|.
    120   enum class TextOrientation {
    121     Unknown,
    122     Horizontal,
    123     Vertical,
    124   };
    125 
          // Result type of ProcessInsertObject().
    126   enum class GenerateCharacter {
    127     None,
    128     Space,
    129     LineBreak,
    130     Hyphen,
    131   };
    132 
    133   bool IsHyphen(wchar_t curChar) const;
    134   bool IsControlChar(const PAGECHAR_INFO& charInfo);
    135   void ProcessObject();
    136   void ProcessFormObject(CPDF_FormObject* pFormObj,
    137                          const CFX_Matrix& formMatrix);
    138   void ProcessTextObject(PDFTEXT_Obj pObj);
    139   void ProcessTextObject(CPDF_TextObject* pTextObj,
    140                          const CFX_Matrix& formMatrix,
    141                          const CPDF_PageObjectList* pObjList,
    142                          CPDF_PageObjectList::const_iterator ObjPos);
    143   GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
    144                                         const CFX_Matrix& formMatrix);
    145   bool GenerateCharInfo(wchar_t unicode, PAGECHAR_INFO& info);
    146   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
    147                              const CPDF_PageObjectList* pObjList,
    148                              CPDF_PageObjectList::const_iterator ObjPos);
    149   bool IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);
    150   int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const;
    151   void CloseTempLine();
    152   FPDFText_MarkedContent PreMarkedContent(PDFTEXT_Obj pObj);
    153   void ProcessMarkedContent(PDFTEXT_Obj pObj);
    154   void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
    155   void FindPreviousTextObject();
    156   void AddCharInfoByLRDirection(wchar_t wChar, PAGECHAR_INFO info);
    157   void AddCharInfoByRLDirection(wchar_t wChar, PAGECHAR_INFO info);
    158   TextOrientation GetTextObjectWritingMode(
    159       const CPDF_TextObject* pTextObj) const;
    160   TextOrientation FindTextlineFlowOrientation() const;
    161   void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix);
    162 
    163   void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
    164   bool IsRightToLeft(const CPDF_TextObject* pTextObj,
    165                      const CPDF_Font* pFont,
    166                      size_t nItems) const;
    167 
          // Non-owning; the pointer itself is const and never re-seated.
    168   UnownedPtr<const CPDF_Page> const m_pPage;
          // Index table used to convert between text indices and character indices.
          // Elements are int, not uint16_t: every index and count exposed by this
          // class is an int (CharIndexFromTextIndex, TextIndexFromCharIndex,
          // CountChars, PAGECHAR_INFO::m_Index), and a 16-bit element would wrap
          // modulo 65536 as soon as a page holds more than 65535 characters.
    169   std::vector<int> m_CharIndex;
    170   std::deque<PAGECHAR_INFO> m_CharList;
    171   std::deque<PAGECHAR_INFO> m_TempCharList;
    172   CFX_WideTextBuf m_TextBuf;
    173   CFX_WideTextBuf m_TempTextBuf;
          // Fixed at construction (const member); presumably the constructor's
          // |flags| argument.
    174   const FPDFText_Direction m_parserflag;
    175   UnownedPtr<CPDF_TextObject> m_pPreTextObj;
    176   CFX_Matrix m_perMatrix;
    177   bool m_bIsParsed;
    178   CFX_Matrix m_DisplayMatrix;
    179   std::vector<CFX_FloatRect> m_SelRects;
    180   std::vector<PDFTEXT_Obj> m_LineObj;
    181   TextOrientation m_TextlineDir;
    182   CFX_FloatRect m_CurlineRect;
    183 };
    184 
    185 #endif  // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
    186