1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef _PDF_TEXT_INT_H_ 8 #define _PDF_TEXT_INT_H_ 9 class CPDF_TextParseOptions : public CFX_Object 10 { 11 public: 12 CPDF_TextParseOptions(); 13 FX_BOOL m_bCheckObjectOrder; 14 FX_BOOL m_bCheckDirection; 15 int m_nCheckSameObject; 16 }; 17 class CPDF_TextPage; 18 class CPDF_LinkExtract; 19 class CPDF_TextPageFind; 20 class CPDF_DocProgressiveSearch; 21 #define FPDFTEXT_CHAR_ERROR -1 22 #define FPDFTEXT_CHAR_NORMAL 0 23 #define FPDFTEXT_CHAR_GENERATED 1 24 #define FPDFTEXT_CHAR_UNUNICODE 2 25 #define FPDFTEXT_CHAR_HYPHEN 3 26 #define FPDFTEXT_CHAR_PIECE 4 27 #define FPDFTEXT_MC_PASS 0 28 #define FPDFTEXT_MC_DONE 1 29 #define FPDFTEXT_MC_DELAY 2 30 typedef struct _PAGECHAR_INFO: public CFX_Object { 31 int m_CharCode; 32 FX_WCHAR m_Unicode; 33 FX_FLOAT m_OriginX; 34 FX_FLOAT m_OriginY; 35 FX_INT32 m_Flag; 36 CFX_FloatRect m_CharBox; 37 CPDF_TextObject* m_pTextObj; 38 CFX_AffineMatrix m_Matrix; 39 int m_Index; 40 } PAGECHAR_INFO; 41 typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray; 42 typedef struct { 43 int m_Start; 44 int m_nCount; 45 } FPDF_SEGMENT; 46 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array; 47 typedef struct { 48 CPDF_TextObject* m_pTextObj; 49 CFX_AffineMatrix m_formMatrix; 50 } PDFTEXT_Obj; 51 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ; 52 class CPDF_TextPage: public IPDF_TextPage 53 { 54 public: 55 CPDF_TextPage(const CPDF_Page* pPage, int flags = 0); 56 CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0); 57 CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); 58 virtual FX_BOOL ParseTextPage(); 59 virtual void NormalizeObjects(FX_BOOL bNormalize); 60 virtual FX_BOOL IsParsered() const 61 { 62 return m_IsParsered; 63 } 64 virtual ~CPDF_TextPage() {}; 65 public: 66 virtual int CharIndexFromTextIndex(int TextIndex)const ; 67 virtual int TextIndexFromCharIndex(int CharIndex)const; 68 virtual int CountChars() const; 69 virtual void GetCharInfo(int index, FPDF_CHAR_INFO & info) const; 70 virtual void GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const; 71 virtual int GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const; 72 virtual int GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, 73 FX_FLOAT yTorelance) const; 74 virtual CFX_WideString GetTextByRect(CFX_FloatRect rect) const; 75 virtual void GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const; 76 virtual int GetOrderByDirection(int order, int direction) const; 77 virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const; 78 79 virtual int CountRects(int start, int nCount); 80 virtual void GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top 81 , FX_FLOAT& right, FX_FLOAT &bottom) const; 82 virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate); 83 virtual FX_BOOL GetBaselineRotate(CFX_FloatRect rect, int& Rotate); 84 virtual int CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, 85 FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE); 86 virtual void GetBoundedSegment(int index, int& start, int& count) const; 87 virtual int GetWordBreak(int index, int direction) const; 88 public: 89 const PAGECHAR_InfoArray* GetCharList() const 90 { 91 return &m_charList; 92 } 93 static FX_BOOL IsRectIntersect(CFX_FloatRect rect1, CFX_FloatRect rect2); 94 static FX_BOOL IsLetter(FX_WCHAR unicode); 95 private: 96 FX_BOOL IsHyphen(FX_WCHAR curChar); 97 FX_BOOL IsControlChar(PAGECHAR_INFO* pCharInfo); 98 FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); 99 void ProcessObject(); 100 void ProcessFormObject(CPDF_FormObject* pFormObj, CFX_AffineMatrix formMatrix); 101 void ProcessTextObject(PDFTEXT_Obj pObj); 102 void ProcessTextObject(CPDF_TextObject* pTextObj, CFX_AffineMatrix formMatrix, FX_POSITION ObjPos); 103 int ProcessInsertObject(const CPDF_TextObject* pObj, CFX_AffineMatrix formMatrix); 104 FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); 105 FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); 106 FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2); 107 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const; 108 void CloseTempLine(); 109 void OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str); 110 FX_INT32 PreMarkedContent(PDFTEXT_Obj pObj); 111 void ProcessMarkedContent(PDFTEXT_Obj pObj); 112 void CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const; 113 void FindPreviousTextObject(void); 114 void AddCharInfoByLRDirection(CFX_WideString& str, int i); 115 void AddCharInfoByRLDirection(CFX_WideString& str, int i); 116 FX_INT32 GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); 117 FX_INT32 FindTextlineFlowDirection(); 118 protected: 119 CPDFText_ParseOptions m_ParseOptions; 120 CFX_WordArray m_CharIndex; 121 const CPDF_PageObjects* m_pPage; 122 PAGECHAR_InfoArray m_charList; 123 CFX_WideTextBuf m_TextBuf; 124 PAGECHAR_InfoArray m_TempCharList; 125 CFX_WideTextBuf m_TempTextBuf; 126 int m_parserflag; 127 CPDF_TextObject* m_pPreTextObj; 128 CFX_AffineMatrix m_perMatrix; 129 FX_BOOL m_IsParsered; 130 CFX_AffineMatrix m_DisplayMatrix; 131 132 SEGMENT_Array m_Segment; 133 CFX_RectArray m_SelRects; 134 LINEOBJ m_LineObj; 135 FX_BOOL m_TextlineDir; 136 CFX_FloatRect m_CurlineRect; 137 }; 138 class CPDF_TextPageFind: public IPDF_TextPageFind 139 { 140 public: 141 CPDF_TextPageFind(const IPDF_TextPage* pTextPage); 142 virtual ~CPDF_TextPageFind() {}; 143 public: 144 virtual FX_BOOL FindFirst(CFX_WideString findwhat, int flags, int startPos = 0); 145 virtual FX_BOOL FindNext(); 146 virtual FX_BOOL FindPrev(); 147 148 virtual void GetRectArray(CFX_RectArray& rects) const; 149 virtual int GetCurOrder() const; 150 virtual int GetMatchedCount()const; 151 protected: 152 void ExtractFindWhat(CFX_WideString findwhat); 153 FX_BOOL IsMatchWholeWord(CFX_WideString csPageText, int startPos, int endPos); 154 FX_BOOL ExtractSubString(CFX_WideString& rString, FX_LPCWSTR lpszFullString, 155 int iSubString, FX_WCHAR chSep); 156 CFX_WideString MakeReverse(const CFX_WideString str); 157 int ReverseFind(CFX_WideString csPageText, CFX_WideString csWord, int nStartPos, int& WordLength); 158 int GetCharIndex(int index) const; 159 private: 160 CFX_WordArray m_CharIndex; 161 const IPDF_TextPage* m_pTextPage; 162 CFX_WideString m_strText; 163 CFX_WideString m_findWhat; 164 int m_flags; 165 CFX_WideStringArray m_csFindWhatArray; 166 int m_findNextStart; 167 int m_findPreStart; 168 FX_BOOL m_bMatchCase; 169 FX_BOOL m_bMatchWholeWord; 170 int m_resStart; 171 int m_resEnd; 172 CFX_RectArray m_resArray; 173 FX_BOOL m_IsFind; 174 }; 175 class CPDF_LinkExt: public CFX_Object 176 { 177 public: 178 CPDF_LinkExt() {}; 179 int m_Start; 180 int m_Count; 181 CFX_WideString m_strUrl; 182 virtual ~CPDF_LinkExt() {}; 183 }; 184 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray; 185 class CPDF_LinkExtract: public IPDF_LinkExtract 186 { 187 public: 188 CPDF_LinkExtract(); 189 virtual ~CPDF_LinkExtract(); 190 virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage); 191 virtual FX_BOOL IsExtract() const 192 { 193 return m_IsParserd; 194 } 195 public: 196 virtual int CountLinks() const; 197 virtual CFX_WideString GetURL(int index) const; 198 virtual void GetBoundedSegment(int index, int& start, int& count) const; 199 virtual void GetRects(int index, CFX_RectArray& rects)const; 200 protected: 201 void parserLink(); 202 void DeleteLinkList(); 203 FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); 204 FX_BOOL CheckMailLink(CFX_WideString& str); 205 FX_BOOL AppendToLinkList(int start, int count, CFX_WideString strUrl); 206 private: 207 LINK_InfoArray m_LinkList; 208 const CPDF_TextPage* m_pTextPage; 209 CFX_WideString m_strPageText; 210 FX_BOOL m_IsParserd; 211 }; 212 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst); 213 void NormalizeString(CFX_WideString& str); 214 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); 215 #endif 216