1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ 8 #define CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ 9 10 #include "core/include/fpdfapi/fpdf_parser.h" 11 12 class CPDF_Page; 13 class CPDF_PageObjects; 14 class CPDF_TextObject; 15 class IPDF_LinkExtract; 16 class IPDF_ReflowedPage; 17 class IPDF_TextPage; 18 class IPDF_TextPageFind; 19 20 #define PDF2TXT_AUTO_ROTATE 1 21 #define PDF2TXT_AUTO_WIDTH 2 22 #define PDF2TXT_KEEP_COLUMN 4 23 #define PDF2TXT_USE_OCR 8 24 #define PDF2TXT_INCLUDE_INVISIBLE 16 25 void PDF_GetPageText(CFX_ByteStringArray& lines, 26 CPDF_Document* pDoc, 27 CPDF_Dictionary* pPage, 28 int iMinWidth, 29 FX_DWORD flags); 30 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, 31 CPDF_Document* pDoc, 32 CPDF_Dictionary* pPage, 33 int iMinWidth, 34 FX_DWORD flags); 35 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, 36 CPDF_Document* pDoc, 37 CPDF_Dictionary* pPage, 38 FX_DWORD flags); 39 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, 40 CPDF_Dictionary* pPage); 41 #define CHAR_ERROR -1 42 #define CHAR_NORMAL 0 43 #define CHAR_GENERATED 1 44 #define CHAR_UNUNICODE 2 45 46 struct FPDF_CHAR_INFO { 47 FX_WCHAR m_Unicode; 48 FX_WCHAR m_Charcode; 49 int32_t m_Flag; 50 FX_FLOAT m_FontSize; 51 FX_FLOAT m_OriginX; 52 FX_FLOAT m_OriginY; 53 CFX_FloatRect m_CharBox; 54 CPDF_TextObject* m_pTextObj; 55 CFX_Matrix m_Matrix; 56 }; 57 58 typedef CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray; 59 #define FPDFTEXT_LRTB 0 60 #define FPDFTEXT_RLTB 1 61 #define FPDFTEXT_TBRL 2 62 #define FPDFTEXT_LEFT -1 63 #define FPDFTEXT_RIGHT 1 64 #define FPDFTEXT_UP -2 65 #define FPDFTEXT_DOWN 2 66 #define FPDFTEXT_WRITINGMODE_UNKNOW 0 67 #define FPDFTEXT_WRITINGMODE_LRTB 1 68 #define FPDFTEXT_WRITINGMODE_RLTB 2 69 #define FPDFTEXT_WRITINGMODE_TBRL 3 70 class CPDFText_ParseOptions { 71 public: 72 CPDFText_ParseOptions(); 73 FX_BOOL m_bGetCharCodeOnly; 74 FX_BOOL m_bNormalizeObjs; 75 FX_BOOL m_bOutputHyphen; 76 }; 77 78 class IPDF_TextPage { 79 public: 80 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0); 81 static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage); 82 83 virtual ~IPDF_TextPage() {} 84 85 virtual void NormalizeObjects(FX_BOOL bNormalize) = 0; 86 87 virtual FX_BOOL ParseTextPage() = 0; 88 89 virtual bool IsParsed() const = 0; 90 91 virtual int CharIndexFromTextIndex(int TextIndex) const = 0; 92 93 virtual int TextIndexFromCharIndex(int CharIndex) const = 0; 94 95 virtual int CountChars() const = 0; 96 97 virtual void GetCharInfo(int index, FPDF_CHAR_INFO* info) const = 0; 98 99 virtual void GetRectArray(int start, 100 int nCount, 101 CFX_RectArray& rectArray) const = 0; 102 103 virtual int GetIndexAtPos(CPDF_Point point, 104 FX_FLOAT xTolerance, 105 FX_FLOAT yTolerance) const = 0; 106 107 virtual int GetIndexAtPos(FX_FLOAT x, 108 FX_FLOAT y, 109 FX_FLOAT xTolerance, 110 FX_FLOAT yTolerance) const = 0; 111 112 virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const = 0; 113 114 virtual void GetRectsArrayByRect(const CFX_FloatRect& rect, 115 CFX_RectArray& resRectArray) const = 0; 116 117 virtual int CountRects(int start, int nCount) = 0; 118 119 virtual void GetRect(int rectIndex, 120 FX_FLOAT& left, 121 FX_FLOAT& top, 122 FX_FLOAT& right, 123 FX_FLOAT& bottom) const = 0; 124 125 virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0; 126 127 virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) = 0; 128 129 virtual int CountBoundedSegments(FX_FLOAT left, 130 FX_FLOAT top, 131 FX_FLOAT right, 132 FX_FLOAT bottom, 133 FX_BOOL bContains = FALSE) = 0; 134 135 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; 136 137 virtual int GetWordBreak(int index, int direction) const = 0; 138 139 virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const = 0; 140 }; 141 142 #define FPDFTEXT_MATCHCASE 0x00000001 143 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 144 #define FPDFTEXT_CONSECUTIVE 0x00000004 145 class IPDF_TextPageFind { 146 public: 147 virtual ~IPDF_TextPageFind() {} 148 149 static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage); 150 151 public: 152 virtual FX_BOOL FindFirst(const CFX_WideString& findwhat, 153 int flags, 154 int startPos = 0) = 0; 155 156 virtual FX_BOOL FindNext() = 0; 157 158 virtual FX_BOOL FindPrev() = 0; 159 160 virtual void GetRectArray(CFX_RectArray& rects) const = 0; 161 162 virtual int GetCurOrder() const = 0; 163 164 virtual int GetMatchedCount() const = 0; 165 }; 166 class IPDF_LinkExtract { 167 public: 168 virtual ~IPDF_LinkExtract() {} 169 170 static IPDF_LinkExtract* CreateLinkExtract(); 171 172 virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0; 173 174 public: 175 virtual int CountLinks() const = 0; 176 177 virtual CFX_WideString GetURL(int index) const = 0; 178 179 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; 180 181 virtual void GetRects(int index, CFX_RectArray& rects) const = 0; 182 }; 183 184 #endif // CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ 185