// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

      7 #ifndef _FPDF_TEXT_H_
      8 #define _FPDF_TEXT_H_
      9 #ifndef _FPDF_PARSER_
     10 #include "../fpdfapi/fpdf_parser.h"
     11 #endif
     12 #ifndef _FPDF_PAGEOBJ_H_
     13 #include "../fpdfapi/fpdf_pageobj.h"
     14 #endif
     15 #ifndef _FPDF_PAGE_
     16 #include "../fpdfapi/fpdf_page.h"
     17 #endif
     18 class CPDF_PageObjects;
     19 #define PDF2TXT_AUTO_ROTATE		1
     20 #define PDF2TXT_AUTO_WIDTH		2
     21 #define PDF2TXT_KEEP_COLUMN		4
     22 #define PDF2TXT_USE_OCR			8
     23 #define PDF2TXT_INCLUDE_INVISIBLE	16
     24 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
     25                      int iMinWidth, FX_DWORD flags);
     26 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
     27                              int iMinWidth, FX_DWORD flags);
     28 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
     29                                FX_DWORD flags);
     30 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage);
     31 class IPDF_TextPage;
     32 class IPDF_LinkExtract;
     33 class IPDF_TextPageFind;
     34 #define CHAR_ERROR			-1
     35 #define CHAR_NORMAL			0
     36 #define CHAR_GENERATED		1
     37 #define CHAR_UNUNICODE		2
     38 typedef struct {
     39     FX_WCHAR			m_Unicode;
     40     FX_WCHAR			m_Charcode;
     41     FX_INT32			m_Flag;
     42     FX_FLOAT			m_FontSize;
     43     FX_FLOAT			m_OriginX;
     44     FX_FLOAT			m_OriginY;
     45     CFX_FloatRect		m_CharBox;
     46     CPDF_TextObject*	m_pTextObj;
     47     CFX_AffineMatrix	m_Matrix;
     48 } FPDF_CHAR_INFO;
     49 typedef	CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray;
     50 #define FPDFTEXT_LRTB	0
     51 #define FPDFTEXT_RLTB	1
     52 #define FPDFTEXT_TBRL	2
     53 #define FPDFTEXT_LEFT			-1
     54 #define FPDFTEXT_RIGHT			1
     55 #define FPDFTEXT_UP				-2
     56 #define FPDFTEXT_DOWN			2
     57 class IPDF_ReflowedPage;
     58 #define FPDFTEXT_WRITINGMODE_UNKNOW	0
     59 #define FPDFTEXT_WRITINGMODE_LRTB	1
     60 #define FPDFTEXT_WRITINGMODE_RLTB	2
     61 #define FPDFTEXT_WRITINGMODE_TBRL	3
     62 class CPDFText_ParseOptions : public CFX_Object
     63 {
     64 public:
     65 
     66     CPDFText_ParseOptions();
     67     FX_BOOL			m_bGetCharCodeOnly;
     68     FX_BOOL			m_bNormalizeObjs;
     69     FX_BOOL			m_bOutputHyphen;
     70 };
     71 class IPDF_TextPage : public CFX_Object
     72 {
     73 public:
     74 
     75     virtual ~IPDF_TextPage() {}
     76     static IPDF_TextPage*	CreateTextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions);
     77     static IPDF_TextPage*	CreateTextPage(const CPDF_Page* pPage, int flags = 0);
     78     static IPDF_TextPage*	CreateTextPage(const CPDF_PageObjects* pObjs, int flags = 0);
     79     static IPDF_TextPage*	CreateReflowTextPage(IPDF_ReflowedPage* pRefPage);
     80 
     81     virtual void			NormalizeObjects(FX_BOOL bNormalize) = 0;
     82 
     83     virtual FX_BOOL			ParseTextPage() = 0;
     84 
     85 
     86     virtual FX_BOOL			IsParsered() const = 0;
     87 public:
     88 
     89     virtual int CharIndexFromTextIndex(int TextIndex) const = 0;
     90 
     91     virtual int TextIndexFromCharIndex(int CharIndex) const = 0;
     92 
     93 
     94     virtual int				CountChars() const = 0;
     95 
     96     virtual	void			GetCharInfo(int index, FPDF_CHAR_INFO & info) const = 0;
     97 
     98     virtual void			GetRectArray(int start, int nCount, CFX_RectArray& rectArray) const = 0;
     99 
    100 
    101 
    102     virtual int				GetIndexAtPos(CPDF_Point point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0;
    103 
    104     virtual int				GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0;
    105 
    106     virtual	int				GetOrderByDirection(int index, int direction) const = 0;
    107 
    108     virtual CFX_WideString	GetTextByRect(CFX_FloatRect rect) const = 0;
    109 
    110     virtual void			GetRectsArrayByRect(CFX_FloatRect rect, CFX_RectArray& resRectArray) const = 0;
    111 
    112 
    113     virtual int				CountRects(int start, int nCount) = 0;
    114 
    115     virtual	void			GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const = 0;
    116 
    117     virtual FX_BOOL			GetBaselineRotate(int rectIndex, int& Rotate) = 0;
    118 
    119     virtual FX_BOOL			GetBaselineRotate(CFX_FloatRect rect, int& Rotate) = 0;
    120 
    121     virtual	int				CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE) = 0;
    122 
    123     virtual	void			GetBoundedSegment(int index, int& start, int& count) const = 0;
    124 
    125 
    126     virtual int				GetWordBreak(int index, int direction) const = 0;
    127 
    128     virtual CFX_WideString	GetPageText(int start = 0, int nCount = -1 ) const = 0;
    129 };
    130 #define FPDFTEXT_MATCHCASE      0x00000001
    131 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002
    132 #define FPDFTEXT_CONSECUTIVE	0x00000004
    133 class IPDF_TextPageFind : public CFX_Object
    134 {
    135 public:
    136 
    137     virtual	~IPDF_TextPageFind() {}
    138 
    139     static	IPDF_TextPageFind*	CreatePageFind(const IPDF_TextPage* pTextPage);
    140 public:
    141 
    142     virtual	FX_BOOL				FindFirst(CFX_WideString findwhat, int flags, int startPos = 0) = 0;
    143 
    144     virtual	FX_BOOL				FindNext() = 0;
    145 
    146     virtual	FX_BOOL				FindPrev() = 0;
    147 
    148     virtual void				GetRectArray(CFX_RectArray& rects) const = 0;
    149 
    150     virtual int					GetCurOrder() const = 0;
    151 
    152     virtual int					GetMatchedCount() const = 0;
    153 };
    154 class IPDF_LinkExtract : public CFX_Object
    155 {
    156 public:
    157 
    158     virtual	~IPDF_LinkExtract() {}
    159 
    160     static	IPDF_LinkExtract*	CreateLinkExtract();
    161 
    162     virtual FX_BOOL				ExtractLinks(const IPDF_TextPage* pTextPage) = 0;
    163 public:
    164 
    165     virtual int					CountLinks() const = 0;
    166 
    167     virtual CFX_WideString		GetURL(int index) const = 0;
    168 
    169     virtual	void				GetBoundedSegment(int index, int& start, int& count) const = 0;
    170 
    171     virtual void				GetRects(int index, CFX_RectArray& rects) const = 0;
    172 };
    173 #endif
    174