Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
      8 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
      9 
     10 #include <deque>
     11 #include <vector>
     12 
     13 #include "core/fpdfapi/page/cpdf_pageobjectlist.h"
     14 #include "core/fxcrt/fx_basic.h"
     15 #include "core/fxcrt/fx_coordinates.h"
     16 #include "core/fxcrt/fx_string.h"
     17 
     18 class CPDF_Font;
     19 class CPDF_FormObject;
     20 class CPDF_Page;
     21 class CPDF_TextObject;
     22 
     // Option flags; judging by the names they select text-search behaviour
     // (case-sensitive, whole-word, consecutive) -- the consumer is not visible
     // in this header. Each value is a distinct single bit (0x1, 0x2, 0x4), so
     // the flags can be OR-ed together into one word.
     23 #define FPDFTEXT_MATCHCASE 0x00000001
     24 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002
     25 #define FPDFTEXT_CONSECUTIVE 0x00000004
     26 
     // Character classification codes: one negative "error" value and five
     // distinct non-negative kinds (0..4). Presumably these are the values held
     // in the m_Flag members of FPDF_CHAR_INFO and PAGECHAR_INFO -- confirm
     // against the implementation file.
     // NOTE(review): FPDFTEXT_CHAR_ERROR expands to a bare `-1` with no
     // surrounding parentheses; keep that in mind when using it in expressions.
     27 #define FPDFTEXT_CHAR_ERROR -1
     28 #define FPDFTEXT_CHAR_NORMAL 0
     29 #define FPDFTEXT_CHAR_GENERATED 1
     30 #define FPDFTEXT_CHAR_UNUNICODE 2
     31 #define FPDFTEXT_CHAR_HYPHEN 3
     32 #define FPDFTEXT_CHAR_PIECE 4
     33 
     // Wide-character literals (the *_CHAR macros) and wide-string literals (the
     // remaining TEXT_* macros) for space, line feed, carriage return and the
     // empty string.
     34 #define TEXT_SPACE_CHAR L' '
     35 #define TEXT_LINEFEED_CHAR L'\n'
     36 #define TEXT_RETURN_CHAR L'\r'
     37 #define TEXT_EMPTY L""
     38 #define TEXT_SPACE L" "
     39 #define TEXT_RETURN_LINEFEED L"\r\n"
     40 #define TEXT_LINEFEED L"\n"
        // Numeric constant (a double literal). The name suggests a tolerance on
        // a character-width/gap ratio; its use site is not in this header --
        // TODO confirm what quantity it is compared against.
     41 #define TEXT_CHARRATIO_GAPDELTA 0.070
     42 
     // Three-valued result type returned by CPDF_TextPage::PreMarkedContent().
     // Pass is explicitly 0, which makes Done == 1 and Delay == 2. What each
     // outcome triggers is decided in the implementation, not in this header.
     43 enum class FPDFText_MarkedContent { Pass = 0, Done, Delay };
     44 
     // Direction selector taken by the CPDF_TextPage constructor; the class also
     // has a const member of this type (m_parserflag). Left is -1 and Right is
     // 1; zero is not a named enumerator.
     45 enum class FPDFText_Direction { Left = -1, Right = 1 };
     46 
     // Per-character record handed to callers: CPDF_TextPage::GetCharInfo()
     // takes a pointer to one of these as its output argument. All members are
     // public; the constructor and destructor are defined out of line.
     47 class FPDF_CHAR_INFO {
     48  public:
     49   FPDF_CHAR_INFO();
     50   ~FPDF_CHAR_INFO();
     51 
          // Unicode value for the character.
     52   FX_WCHAR m_Unicode;
          // Character code. Note the type is FX_WCHAR here, whereas the matching
          // PAGECHAR_INFO::m_CharCode is an int.
     53   FX_WCHAR m_Charcode;
          // Presumably one of the FPDFTEXT_CHAR_* codes -- confirm in the .cpp.
     54   int32_t m_Flag;
          // Font size for the character; units are not stated in this header.
     55   FX_FLOAT m_FontSize;
          // Origin point and bounding rectangle of the character. The coordinate
          // space is not stated here -- TODO confirm (page vs. text space).
     56   CFX_PointF m_Origin;
     57   CFX_FloatRect m_CharBox;
          // Text object associated with the character. Raw pointer; ownership is
          // not expressed in this header (presumably non-owning -- verify).
     58   CPDF_TextObject* m_pTextObj;
          // Matrix associated with the character.
     59   CFX_Matrix m_Matrix;
     60 };
     61 
     // Plain start/count pair describing a run of items. Nothing else in this
     // header refers to it, so what the indices count (characters, rects, ...)
     // must be checked at the use site.
     62 struct FPDF_SEGMENT {
          // Index of the first item in the run.
     63   int m_Start;
          // Number of items in the run.
     64   int m_nCount;
     65 };
     66 
     // Internal per-character record: CPDF_TextPage keeps these in its
     // m_CharList and m_TempCharList deques and passes them between its private
     // helpers. Default constructor, copy constructor and destructor are all
     // user-declared and defined out of line.
     67 class PAGECHAR_INFO {
     68  public:
     69   PAGECHAR_INFO();
     70   PAGECHAR_INFO(const PAGECHAR_INFO&);
     71   ~PAGECHAR_INFO();
     72 
          // An index associated with the character; which sequence it indexes is
          // not visible from this header -- TODO confirm in the implementation.
     73   int m_Index;
          // Character code, stored as int (FPDF_CHAR_INFO uses FX_WCHAR instead).
     74   int m_CharCode;
          // Unicode value for the character.
     75   FX_WCHAR m_Unicode;
          // Presumably one of the FPDFTEXT_CHAR_* codes -- confirm in the .cpp.
     76   int32_t m_Flag;
          // Origin point and bounding rectangle of the character; coordinate
          // space is not stated here.
     77   CFX_PointF m_Origin;
     78   CFX_FloatRect m_CharBox;
          // Text object associated with the character. Raw pointer; ownership is
          // not expressed in this header (presumably non-owning -- verify).
     79   CPDF_TextObject* m_pTextObj;
          // Matrix associated with the character.
     80   CFX_Matrix m_Matrix;
     81 };
     82 
     // Pairs a text object pointer with a matrix. CPDF_TextPage stores these in
     // m_LineObj and takes them by value in ProcessTextObject(),
     // PreMarkedContent() and ProcessMarkedContent().
     83 struct PDFTEXT_Obj {
          // The text object. Raw pointer; ownership is not expressed here
          // (presumably non-owning -- verify).
     84   CPDF_TextObject* m_pTextObj;
          // Matrix stored alongside the object; by its name, the matrix of the
          // enclosing form -- confirm against ProcessFormObject() in the .cpp.
     85   CFX_Matrix m_formMatrix;
     86 };
     87 
     // Text view of a single CPDF_Page: holds per-character records
     // (PAGECHAR_INFO) and wide-text buffers, and exposes queries over them.
     //
     // Only declarations are visible in this header. Statements below about what
     // a function does beyond its signature are inferred from names and are
     // marked as such; confirm them against the implementation file.
     88 class CPDF_TextPage {
     89  public:
          // Takes the page to read and a direction selector. The class has a
          // const page pointer member (m_pPage) and a const direction member
          // (m_parserflag), so both must be set at construction.
     90   CPDF_TextPage(const CPDF_Page* pPage, FPDFText_Direction flags);
     91   ~CPDF_TextPage();
     92 
     93   // Parsing entry point and query API (this class declares no base class):
          // Non-const; presumably performs the extraction and sets m_bIsParsed
          // -- confirm in the .cpp.
     94   void ParseTextPage();
          // Returns the m_bIsParsed flag.
     95   bool IsParsed() const { return m_bIsParsed; }
          // Convert between two index spaces, "text index" and "char index".
          // Presumably an offset into the extracted text vs. a position in the
          // character list; failure return values are not visible here.
     96   int CharIndexFromTextIndex(int TextIndex) const;
     97   int TextIndexFromCharIndex(int CharIndex) const;
          // Number of characters on the page.
     98   int CountChars() const;
          // Writes information about the character at |index| into |*info|.
          // Behaviour for an out-of-range index is not visible here.
     99   void GetCharInfo(int index, FPDF_CHAR_INFO* info) const;
          // Returns, by value, rectangles for |nCount| characters starting at
          // |start|.
    100   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
          // Looks up a character index from a point, with a tolerance size.
          // The "not found" return value is not visible here -- check callers.
    101   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
          // Returns text selected by a rectangle.
    102   CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const;
          // Returns page text for a range. Defaults are start = 0, nCount = -1;
          // presumably -1 means "through the end" -- confirm in the .cpp.
    103   CFX_WideString GetPageText(int start = 0, int nCount = -1) const;
          // NOTE(review): unlike the other queries this one is non-const.
          // Presumably it fills m_SelRects, which GetRect() then reads by
          // |rectIndex| -- confirm, and if so call CountRects() first.
    104   int CountRects(int start, int nCount);
          // Outputs one rectangle through the four reference parameters.
    105   void GetRect(int rectIndex,
    106                FX_FLOAT& left,
    107                FX_FLOAT& top,
    108                FX_FLOAT& right,
    109                FX_FLOAT& bottom) const;
    110 
          // Static helper taking two rectangles; by its name, tests whether they
          // intersect. Edge-touching behaviour is not visible here.
    111   static bool IsRectIntersect(const CFX_FloatRect& rect1,
    112                               const CFX_FloatRect& rect2);
    113 
    114  private:
          // Orientation classification; used as the type of m_TextlineDir and
          // as the return type of the two orientation helpers below.
    115   enum class TextOrientation {
    116     Unknown,
    117     Horizontal,
    118     Vertical,
    119   };
    120 
          // Result type of ProcessInsertObject(). By the names, the kind of
          // synthetic character (if any) to generate.
    121   enum class GenerateCharacter {
    122     None,
    123     Space,
    124     LineBreak,
    125     Hyphen,
    126   };
    127 
          // Character predicates. NOTE(review): both are declared non-const.
    128   bool IsHyphen(FX_WCHAR curChar);
    129   bool IsControlChar(const PAGECHAR_INFO& charInfo);
          // Object-processing helpers. The PDFTEXT_Obj overload of
          // ProcessTextObject() takes its argument by value; the other overload
          // takes the text object, a matrix, the containing object list and an
          // iterator into that list.
    130   void ProcessObject();
    131   void ProcessFormObject(CPDF_FormObject* pFormObj,
    132                          const CFX_Matrix& formMatrix);
    133   void ProcessTextObject(PDFTEXT_Obj pObj);
    134   void ProcessTextObject(CPDF_TextObject* pTextObj,
    135                          const CFX_Matrix& formMatrix,
    136                          const CPDF_PageObjectList* pObjList,
    137                          CPDF_PageObjectList::const_iterator ObjPos);
          // Returns a GenerateCharacter value for |pObj|; presumably decides
          // which separator, if any, precedes the object -- confirm.
    138   GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
    139                                         const CFX_Matrix& formMatrix);
          // Takes |info| by non-const reference, so it can modify it; the
          // meaning of the bool result is not visible here.
    140   bool GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
          // Text-object comparison helpers (by name, duplicate detection).
    141   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
    142                              const CPDF_PageObjectList* pObjList,
    143                              CPDF_PageObjectList::const_iterator ObjPos);
    144   bool IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);
          // Width lookup for a character code in a font; units not stated here.
    145   int GetCharWidth(uint32_t charCode, CPDF_Font* pFont) const;
          // Presumably flushes the m_Temp* buffers into the main ones -- confirm.
    146   void CloseTempLine();
          // Marked-content handling; PreMarkedContent() yields Pass/Done/Delay.
    147   FPDFText_MarkedContent PreMarkedContent(PDFTEXT_Obj pObj);
    148   void ProcessMarkedContent(PDFTEXT_Obj pObj);
          // Const, but may adjust both |start| and |nCount| through references.
    149   void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
    150   void FindPreviousTextObject();
          // Both take |info| by value, i.e. they work on a copy.
    151   void AddCharInfoByLRDirection(FX_WCHAR wChar, PAGECHAR_INFO info);
    152   void AddCharInfoByRLDirection(FX_WCHAR wChar, PAGECHAR_INFO info);
          // Orientation helpers; both are const and return TextOrientation.
    153   TextOrientation GetTextObjectWritingMode(
    154       const CPDF_TextObject* pTextObj) const;
    155   TextOrientation FindTextlineFlowOrientation() const;
    156   void AppendGeneratedCharacter(FX_WCHAR unicode, const CFX_Matrix& formMatrix);
    157 
    158   void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
    159   bool IsRightToLeft(const CPDF_TextObject* pTextObj,
    160                      const CPDF_Font* pFont,
    161                      int nItems) const;
    162 
          // Page being read: const pointer to const page, fixed at construction.
    163   const CPDF_Page* const m_pPage;
          // NOTE(review): elements are uint16_t. If these hold character
          // indices or counts, values above 65535 cannot be represented --
          // confirm how the .cpp fills and reads this vector.
    164   std::vector<uint16_t> m_CharIndex;
          // Character records, plus a second "temp" list; presumably the temp
          // list holds the line currently being assembled -- confirm.
    165   std::deque<PAGECHAR_INFO> m_CharList;
    166   std::deque<PAGECHAR_INFO> m_TempCharList;
          // Wide-text buffers paired with the two lists above (by name).
    167   CFX_WideTextBuf m_TextBuf;
    168   CFX_WideTextBuf m_TempTextBuf;
          // Direction selector; const, so fixed at construction.
    169   const FPDFText_Direction m_parserflag;
          // Raw pointer to a previously seen text object (by name); ownership
          // is not expressed here.
    170   CPDF_TextObject* m_pPreTextObj;
          // NOTE(review): "per" may be a misspelling of "pre" (a matrix kept
          // alongside m_pPreTextObj) -- confirm before renaming.
    171   CFX_Matrix m_perMatrix;
          // Flag returned by IsParsed().
    172   bool m_bIsParsed;
    173   CFX_Matrix m_DisplayMatrix;
          // Rectangle storage; presumably what CountRects()/GetRect() use.
    174   std::vector<CFX_FloatRect> m_SelRects;
          // Text objects paired with their matrices (see PDFTEXT_Obj).
    175   std::vector<PDFTEXT_Obj> m_LineObj;
          // Orientation and bounding rectangle tracked for a text line (by
          // name, the current one).
    176   TextOrientation m_TextlineDir;
    177   CFX_FloatRect m_CurlineRect;
    178 };
    179 
    180 #endif  // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
    181