Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_
      8 #define CORE_SRC_FPDFTEXT_TEXT_INT_H_
      9 
     10 #include "core/include/fpdftext/fpdf_text.h"
     11 #include "core/include/fxcrt/fx_basic.h"
     12 
     13 class CFX_BidiChar;  // Used only by pointer (CPDF_TextPage::OnPiece).
     14 class CPDF_DocProgressiveSearch;  // Not referenced elsewhere in this header.
     15 class CPDF_FormObject;  // Used only by pointer (ProcessFormObject).
     16 class CPDF_LinkExtract;   // Fully defined further down in this header.
     17 class CPDF_TextPageFind;  // Fully defined further down in this header.
     18 
     19 #define FPDFTEXT_CHAR_ERROR -1
        // FPDFTEXT_CHAR_* (the macro above and the five below): integer codes.
        // NOTE(review): presumably the character-kind values stored in
        // PAGECHAR_INFO::m_Flag; the code that assigns them is not in this
        // header -- confirm against the implementation file.
     20 #define FPDFTEXT_CHAR_NORMAL 0
     21 #define FPDFTEXT_CHAR_GENERATED 1
     22 #define FPDFTEXT_CHAR_UNUNICODE 2
     23 #define FPDFTEXT_CHAR_HYPHEN 3
     24 #define FPDFTEXT_CHAR_PIECE 4
        // FPDFTEXT_MC_*: integer status codes.
        // NOTE(review): presumably the results of
        // CPDF_TextPage::PreMarkedContent() (marked-content handling) -- confirm
        // against the implementation file.
     25 #define FPDFTEXT_MC_PASS 0
     26 #define FPDFTEXT_MC_DONE 1
     27 #define FPDFTEXT_MC_DELAY 2
     28 
     29 typedef struct _PAGECHAR_INFO {
          // One record per character. CPDF_TextPage stores these in m_charList and
          // m_TempCharList and exposes the former through GetCharList().
          // The scalar fields have no default initializers, so whoever creates a
          // record must fill them in before reading them.
     30   int m_CharCode;  // NOTE(review): presumably the font character code.
     31   FX_WCHAR m_Unicode;
     32   FX_FLOAT m_OriginX;
     33   FX_FLOAT m_OriginY;
     34   int32_t m_Flag;  // NOTE(review): presumably a FPDFTEXT_CHAR_* value.
     35   CFX_FloatRect m_CharBox;
     36   CPDF_TextObject* m_pTextObj;  // Raw pointer; ownership not visible here.
     37   CFX_Matrix m_Matrix;
     38   int m_Index;
     39 } PAGECHAR_INFO;
        // Container type of CPDF_TextPage::m_charList and m_TempCharList.
     40 typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray;
     41 typedef struct {
          // A (start, count) pair. Stored in SEGMENT_Array, which is the type of
          // CPDF_TextPage::m_Segment.
          // NOTE(review): presumably a run of character indices, mirroring the
          // (start, count) out-parameters of GetBoundedSegment() -- confirm.
     42   int m_Start;
     43   int m_nCount;
     44 } FPDF_SEGMENT;
     45 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array;
     46 typedef struct {
          // Pairs a text object pointer with a matrix. Passed by value to
          // CPDF_TextPage::ProcessTextObject(), PreMarkedContent() and
          // ProcessMarkedContent(), and stored in LINEOBJ (CPDF_TextPage::m_LineObj).
     47   CPDF_TextObject* m_pTextObj;  // Raw pointer; ownership not visible here.
     48   CFX_Matrix m_formMatrix;
     49 } PDFTEXT_Obj;
     50 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ;
     51 
     52 class CPDF_TextPage : public IPDF_TextPage {
          // Implementation of the IPDF_TextPage interface. Every member marked
          // `override` below overrides a virtual declared on IPDF_TextPage; the
          // remaining members are specific to this class.
     53  public:
          // NOTE(review): presumably pPage is kept in m_pPage and flags in
          // m_parserflag. Both members are const, so they can only be set in the
          // constructor's initializer list -- confirm in the implementation file.
     54   CPDF_TextPage(const CPDF_Page* pPage, int flags);
     55   ~CPDF_TextPage() override {}
     56 
     57   // IPDF_TextPage
     58   FX_BOOL ParseTextPage() override;
     59   void NormalizeObjects(FX_BOOL bNormalize) override;
          // Returns the m_bIsParsed flag.
     60   bool IsParsed() const override { return m_bIsParsed; }
     61   int CharIndexFromTextIndex(int TextIndex) const override;
     62   int TextIndexFromCharIndex(int CharIndex) const override;
     63   int CountChars() const override;
          // Result is written through the `info` out-parameter.
     64   void GetCharInfo(int index, FPDF_CHAR_INFO* info) const override;
          // Result is written into the `rectArray` out-parameter.
     65   void GetRectArray(int start,
     66                     int nCount,
     67                     CFX_RectArray& rectArray) const override;
          // Two overloads: the position is given as a CPDF_Point or as x/y floats.
     68   int GetIndexAtPos(CPDF_Point point,
     69                     FX_FLOAT xTolerance,
     70                     FX_FLOAT yTolerance) const override;
     71   int GetIndexAtPos(FX_FLOAT x,
     72                     FX_FLOAT y,
     73                     FX_FLOAT xTolerance,
     74                     FX_FLOAT yTolerance) const override;
     75   CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const override;
     76   void GetRectsArrayByRect(const CFX_FloatRect& rect,
     77                            CFX_RectArray& resRectArray) const override;
          // NOTE: default arguments on a virtual function are taken from the static
          // type used at the call site, not from the override that runs, so these
          // values should stay identical to the ones declared on IPDF_TextPage.
     78   CFX_WideString GetPageText(int start = 0, int nCount = -1) const override;
          // NOTE(review): CountRects() is non-const while GetRect() is const;
          // presumably CountRects() fills m_SelRects and GetRect() reads it --
          // confirm in the implementation file.
     79   int CountRects(int start, int nCount) override;
     80   void GetRect(int rectIndex,
     81                FX_FLOAT& left,
     82                FX_FLOAT& top,
     83                FX_FLOAT& right,
     84                FX_FLOAT& bottom) const override;
          // Public overloads keyed by rect index or by rectangle; a third, private
          // overload below takes a (start, end) pair. `Rotate` is an out-parameter.
     85   FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) override;
     86   FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) override;
          // Same default-argument caveat as GetPageText() applies to bContains.
     87   int CountBoundedSegments(FX_FLOAT left,
     88                            FX_FLOAT top,
     89                            FX_FLOAT right,
     90                            FX_FLOAT bottom,
     91                            FX_BOOL bContains = FALSE) override;
     92   void GetBoundedSegment(int index, int& start, int& count) const override;
     93   int GetWordBreak(int index, int direction) const override;
     94 
          // Returns the address of the internal m_charList (no copy is made), so
          // the pointer is only usable while this object is alive.
     95   const PAGECHAR_InfoArray* GetCharList() const { return &m_charList; }
          // Static helpers: callable without a CPDF_TextPage instance.
     96   static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1,
     97                                  const CFX_FloatRect& rect2);
     98   static FX_BOOL IsLetter(FX_WCHAR unicode);
     99 
    100  private:
    101   FX_BOOL IsHyphen(FX_WCHAR curChar);
    102   bool IsControlChar(const PAGECHAR_INFO& charInfo);
          // Private (start, end) overload of the public GetBaselineRotate() pair.
    103   FX_BOOL GetBaselineRotate(int start, int end, int& Rotate);
    104   void ProcessObject();
    105   void ProcessFormObject(CPDF_FormObject* pFormObj,
    106                          const CFX_Matrix& formMatrix);
          // Two overloads: one takes a PDFTEXT_Obj by value, the other takes the
          // text object, matrix and an FX_POSITION separately.
    107   void ProcessTextObject(PDFTEXT_Obj pObj);
    108   void ProcessTextObject(CPDF_TextObject* pTextObj,
    109                          const CFX_Matrix& formMatrix,
    110                          FX_POSITION ObjPos);
    111   int ProcessInsertObject(const CPDF_TextObject* pObj,
    112                           const CFX_Matrix& formMatrix);
          // `info` is a non-const reference, i.e. an in/out parameter.
    113   FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info);
    114   FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos);
    115   FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1,
    116                            CPDF_TextObject* pTextObj2);
    117   int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const;
    118   void CloseTempLine();
    119   void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str);
          // NOTE(review): presumably returns one of the FPDFTEXT_MC_* codes.
    120   int32_t PreMarkedContent(PDFTEXT_Obj pObj);
    121   void ProcessMarkedContent(PDFTEXT_Obj pObj);
          // Const member that takes both values by non-const reference (in/out).
    122   void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const;
    123   void FindPreviousTextObject(void);
    124   void AddCharInfoByLRDirection(CFX_WideString& str, int i);
    125   void AddCharInfoByRLDirection(CFX_WideString& str, int i);
    126   int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj);
    127   int32_t FindTextlineFlowDirection();
    128 
    129   void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
    130   FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj,
    131                         const CPDF_Font* pFont,
    132                         int nItems) const;
    133 
          // Data members. Declaration order is also construction order, so do not
          // reorder without checking the constructor's initializer list.
    134   CPDFText_ParseOptions m_ParseOptions;
    135   CFX_WordArray m_CharIndex;
          // Const pointer to const: cannot be reseated, and together with
          // m_parserflag makes the implicit copy-assignment operator unavailable.
    136   const CPDF_PageObjects* const m_pPage;
          // m_charList/m_TextBuf and m_TempCharList/m_TempTextBuf are two parallel
          // (char-info array, text buffer) pairs.
          // NOTE(review): presumably the Temp pair is a per-line staging area
          // flushed by CloseTempLine() -- confirm in the implementation file.
    137   PAGECHAR_InfoArray m_charList;
    138   CFX_WideTextBuf m_TextBuf;
    139   PAGECHAR_InfoArray m_TempCharList;
    140   CFX_WideTextBuf m_TempTextBuf;
    141   const int m_parserflag;
    142   CPDF_TextObject* m_pPreTextObj;  // Raw pointer; ownership not visible here.
          // NOTE(review): the name looks like a typo for "pre" (it sits next to
          // m_pPreTextObj) -- confirm before renaming.
    143   CFX_Matrix m_perMatrix;
    144   bool m_bIsParsed;  // Value returned by IsParsed().
    145   CFX_Matrix m_DisplayMatrix;
    146   SEGMENT_Array m_Segment;
    147   CFX_RectArray m_SelRects;
    148   LINEOBJ m_LineObj;
    149   int32_t m_TextlineDir;
    150   CFX_FloatRect m_CurlineRect;
    151 };
    152 
    153 class CPDF_TextPageFind : public IPDF_TextPageFind {
          // Implementation of the IPDF_TextPageFind interface; members marked
          // `override` below override virtuals declared on that interface.
    154  public:
          // `explicit` blocks implicit conversion from an IPDF_TextPage pointer.
          // NOTE(review): presumably the pointer is kept in m_pTextPage without
          // taking ownership, so the text page must outlive this object -- confirm.
    155   explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage);
    156   ~CPDF_TextPageFind() override {}
    157 
    158   // IPDF_TextPageFind
          // NOTE: the default for startPos on this virtual is taken from the static
          // type used at the call site; keep it identical to IPDF_TextPageFind's.
    159   FX_BOOL FindFirst(const CFX_WideString& findwhat,
    160                     int flags,
    161                     int startPos = 0) override;
    162   FX_BOOL FindNext() override;
    163   FX_BOOL FindPrev() override;
          // Result is written into the `rects` out-parameter.
    164   void GetRectArray(CFX_RectArray& rects) const override;
    165   int GetCurOrder() const override;
    166   int GetMatchedCount() const override;
    167 
          // Helpers below are protected rather than private, so derived classes can
          // call them.
    168  protected:
    169   void ExtractFindWhat(const CFX_WideString& findwhat);
    170   FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText,
    171                            int startPos,
    172                            int endPos);
          // `rString` is an out-parameter.
          // NOTE(review): presumably extracts the iSubString-th chSep-separated
          // field of lpszFullString -- confirm in the implementation file.
    173   FX_BOOL ExtractSubString(CFX_WideString& rString,
    174                            const FX_WCHAR* lpszFullString,
    175                            int iSubString,
    176                            FX_WCHAR chSep);
    177   CFX_WideString MakeReverse(const CFX_WideString& str);
          // `WordLength` is taken by non-const reference (in/out).
    178   int ReverseFind(const CFX_WideString& csPageText,
    179                   const CFX_WideString& csWord,
    180                   int nStartPos,
    181                   int& WordLength);
    182   int GetCharIndex(int index) const;
    183 
    184  private:
          // This class has its own m_CharIndex, separate from the member of the
          // same name and type in CPDF_TextPage.
    185   CFX_WordArray m_CharIndex;
    186   const IPDF_TextPage* m_pTextPage;  // Raw pointer; ownership not visible.
    187   CFX_WideString m_strText;
    188   CFX_WideString m_findWhat;
    189   int m_flags;
    190   CFX_WideStringArray m_csFindWhatArray;
    191   int m_findNextStart;
    192   int m_findPreStart;
    193   FX_BOOL m_bMatchCase;
    194   FX_BOOL m_bMatchWholeWord;
    195   int m_resStart;
    196   int m_resEnd;
    197   CFX_RectArray m_resArray;
    198   FX_BOOL m_IsFind;
    199 };
    200 
    201 class CPDF_LinkExt {
          // Record for one link: a (start, count) pair plus the URL text. The three
          // fields mirror the parameters of CPDF_LinkExtract::AppendToLinkList().
          // Instances are held by pointer in LINK_InfoArray.
    202  public:
          // Zero-initialize the integer fields. The previous empty constructor left
          // them indeterminate, so reading them before assignment was undefined
          // behaviour. m_strUrl is default-constructed as before.
    203   CPDF_LinkExt() : m_Start(0), m_Count(0) {}
    204   int m_Start;
    205   int m_Count;
    206   CFX_WideString m_strUrl;
          // Virtual so that deleting through a CPDF_LinkExt* is safe for subclasses.
    207   virtual ~CPDF_LinkExt() {}
    208 };
    209 
        // Array of raw CPDF_LinkExt pointers; type of CPDF_LinkExtract::m_LinkList.
    210 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray;
    211 
    212 class CPDF_LinkExtract : public IPDF_LinkExtract {
          // Implementation of the IPDF_LinkExtract interface; members marked
          // `override` below override virtuals declared on that interface.
    213  public:
    214   CPDF_LinkExtract();
          // Defined out of line (unlike the other destructors in this header).
    215   ~CPDF_LinkExtract() override;
    216 
    217   // IPDF_LinkExtract
          // NOTE(review): the parameter is an IPDF_TextPage pointer while the
          // m_pTextPage member is a CPDF_TextPage pointer, so the implementation
          // presumably downcasts -- confirm callers always pass a CPDF_TextPage.
    218   FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) override;
    219   int CountLinks() const override;
    220   CFX_WideString GetURL(int index) const override;
          // `start` and `count` are out-parameters.
    221   void GetBoundedSegment(int index, int& start, int& count) const override;
          // Result is written into the `rects` out-parameter.
    222   void GetRects(int index, CFX_RectArray& rects) const override;
    223 
          // Returns the m_bIsParsed flag.
    224   FX_BOOL IsExtract() const { return m_bIsParsed; }
    225 
    226  protected:
    227   void ParseLink();
    228   void DeleteLinkList();
          // Both checks take the string by non-const reference, so they are able to
          // modify the caller's string.
    229   FX_BOOL CheckWebLink(CFX_WideString& strBeCheck);
    230   bool CheckMailLink(CFX_WideString& str);
    231   void AppendToLinkList(int start, int count, const CFX_WideString& strUrl);
    232 
    233  private:
          // Holds raw CPDF_LinkExt pointers.
          // NOTE(review): presumably owned by this object and released by
          // DeleteLinkList(); if so, copying a CPDF_LinkExtract would double-free
          // -- confirm and consider disabling copy.
    234   LINK_InfoArray m_LinkList;
    235   const CPDF_TextPage* m_pTextPage;  // Raw pointer; ownership not visible.
    236   CFX_WideString m_strPageText;
    237   bool m_bIsParsed;  // Value returned by IsExtract().
    238 };
    239 
    240 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst);
        // The declaration above and the three below are free functions that are
        // only declared in this header; their definitions live elsewhere.
        // NOTE(review): FX_Unicode_GetNormalization() presumably writes the
        // normalized form of `wch` to `pDst` and returns its length; the required
        // size of the pDst buffer is not visible here -- confirm before calling.
        // NormalizeString() takes its string by non-const reference (in/out).
    241 void NormalizeString(CFX_WideString& str);
        // `sDest` is an output string taken by non-const reference.
    242 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest);
        // `buffer` is an output buffer taken by non-const reference.
        // NOTE(review): whether pObjArray may be null is not visible here.
    243 void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
    244                            CPDF_PageObjects* pPage,
    245                            FX_BOOL bUseLF,
    246                            CFX_PtrArray* pObjArray);
    247 
    248 #endif  // CORE_SRC_FPDFTEXT_TEXT_INT_H_
    249