Home | History | Annotate | Download | only in fpdftext
      1 // Copyright 2014 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_
      8 #define CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_
      9 
     10 #include "core/include/fpdfapi/fpdf_parser.h"
     11 
// Forward declarations. Where this header uses these types it does so only
// through pointers, so their complete definitions are not required here.
class CPDF_Page;
class CPDF_PageObjects;
class CPDF_TextObject;
class IPDF_LinkExtract;
class IPDF_ReflowedPage;
class IPDF_TextPage;
class IPDF_TextPageFind;
     19 
     20 #define PDF2TXT_AUTO_ROTATE 1
     21 #define PDF2TXT_AUTO_WIDTH 2
     22 #define PDF2TXT_KEEP_COLUMN 4
     23 #define PDF2TXT_USE_OCR 8
     24 #define PDF2TXT_INCLUDE_INVISIBLE 16
     25 void PDF_GetPageText(CFX_ByteStringArray& lines,
     26                      CPDF_Document* pDoc,
     27                      CPDF_Dictionary* pPage,
     28                      int iMinWidth,
     29                      FX_DWORD flags);
     30 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines,
     31                              CPDF_Document* pDoc,
     32                              CPDF_Dictionary* pPage,
     33                              int iMinWidth,
     34                              FX_DWORD flags);
     35 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
     36                                CPDF_Document* pDoc,
     37                                CPDF_Dictionary* pPage,
     38                                FX_DWORD flags);
     39 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc,
     40                                             CPDF_Dictionary* pPage);
     41 #define CHAR_ERROR -1
     42 #define CHAR_NORMAL 0
     43 #define CHAR_GENERATED 1
     44 #define CHAR_UNUNICODE 2
     45 
// Per-character record. IPDF_TextPage::GetCharInfo() takes a pointer to one of
// these and presumably fills it in for the requested character index.
//
// The struct declares no constructor and its members have no default
// initializers, so callers should not read a field before it has been set.
// Field meanings below are read off the names and types only; confirm against
// the code that populates the struct.
struct FPDF_CHAR_INFO {
  // Unicode value of the character (per the field name).
  FX_WCHAR m_Unicode;
  // Character code; presumably the code used by the font -- TODO confirm.
  FX_WCHAR m_Charcode;
  // Presumably one of the CHAR_* codes defined in this header -- TODO confirm.
  int32_t m_Flag;
  // Font size associated with the character; units are not stated here.
  FX_FLOAT m_FontSize;
  // Origin of the character; the coordinate space is not stated here.
  FX_FLOAT m_OriginX;
  FX_FLOAT m_OriginY;
  // Bounding box of the character.
  CFX_FloatRect m_CharBox;
  // Text object associated with the character. Stored as a raw pointer;
  // ownership and whether it may be null are not visible from this header.
  CPDF_TextObject* m_pTextObj;
  // Matrix associated with the character -- NOTE(review): confirm which
  // transform this represents.
  CFX_Matrix m_Matrix;
};
     57 
// Array of float rectangles. The text interfaces in this header take it by
// non-const reference wherever a call can produce several rectangles.
typedef CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray;
     59 #define FPDFTEXT_LRTB 0
     60 #define FPDFTEXT_RLTB 1
     61 #define FPDFTEXT_TBRL 2
     62 #define FPDFTEXT_LEFT -1
     63 #define FPDFTEXT_RIGHT 1
     64 #define FPDFTEXT_UP -2
     65 #define FPDFTEXT_DOWN 2
     66 #define FPDFTEXT_WRITINGMODE_UNKNOW 0
     67 #define FPDFTEXT_WRITINGMODE_LRTB 1
     68 #define FPDFTEXT_WRITINGMODE_RLTB 2
     69 #define FPDFTEXT_WRITINGMODE_TBRL 3
// Bundle of boolean switches for text parsing. The constructor is defined
// outside this header, so the initial value of each switch cannot be stated
// here.
// NOTE(review): nothing else in this header takes a CPDFText_ParseOptions;
// look in the implementation files for its consumers and for the exact effect
// of each switch.
class CPDFText_ParseOptions {
 public:
  CPDFText_ParseOptions();
  // Per the name: only character codes are wanted -- TODO confirm.
  FX_BOOL m_bGetCharCodeOnly;
  // Per the name: normalize page objects -- presumably related to
  // IPDF_TextPage::NormalizeObjects(); confirm.
  FX_BOOL m_bNormalizeObjs;
  // Per the name: controls whether hyphens are output -- TODO confirm.
  FX_BOOL m_bOutputHyphen;
};
     77 
     78 class IPDF_TextPage {
     79  public:
     80   static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0);
     81   static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage);
     82 
     83   virtual ~IPDF_TextPage() {}
     84 
     85   virtual void NormalizeObjects(FX_BOOL bNormalize) = 0;
     86 
     87   virtual FX_BOOL ParseTextPage() = 0;
     88 
     89   virtual bool IsParsed() const = 0;
     90 
     91   virtual int CharIndexFromTextIndex(int TextIndex) const = 0;
     92 
     93   virtual int TextIndexFromCharIndex(int CharIndex) const = 0;
     94 
     95   virtual int CountChars() const = 0;
     96 
     97   virtual void GetCharInfo(int index, FPDF_CHAR_INFO* info) const = 0;
     98 
     99   virtual void GetRectArray(int start,
    100                             int nCount,
    101                             CFX_RectArray& rectArray) const = 0;
    102 
    103   virtual int GetIndexAtPos(CPDF_Point point,
    104                             FX_FLOAT xTolerance,
    105                             FX_FLOAT yTolerance) const = 0;
    106 
    107   virtual int GetIndexAtPos(FX_FLOAT x,
    108                             FX_FLOAT y,
    109                             FX_FLOAT xTolerance,
    110                             FX_FLOAT yTolerance) const = 0;
    111 
    112   virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const = 0;
    113 
    114   virtual void GetRectsArrayByRect(const CFX_FloatRect& rect,
    115                                    CFX_RectArray& resRectArray) const = 0;
    116 
    117   virtual int CountRects(int start, int nCount) = 0;
    118 
    119   virtual void GetRect(int rectIndex,
    120                        FX_FLOAT& left,
    121                        FX_FLOAT& top,
    122                        FX_FLOAT& right,
    123                        FX_FLOAT& bottom) const = 0;
    124 
    125   virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0;
    126 
    127   virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) = 0;
    128 
    129   virtual int CountBoundedSegments(FX_FLOAT left,
    130                                    FX_FLOAT top,
    131                                    FX_FLOAT right,
    132                                    FX_FLOAT bottom,
    133                                    FX_BOOL bContains = FALSE) = 0;
    134 
    135   virtual void GetBoundedSegment(int index, int& start, int& count) const = 0;
    136 
    137   virtual int GetWordBreak(int index, int direction) const = 0;
    138 
    139   virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const = 0;
    140 };
    141 
    142 #define FPDFTEXT_MATCHCASE 0x00000001
    143 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002
    144 #define FPDFTEXT_CONSECUTIVE 0x00000004
    145 class IPDF_TextPageFind {
    146  public:
    147   virtual ~IPDF_TextPageFind() {}
    148 
    149   static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage);
    150 
    151  public:
    152   virtual FX_BOOL FindFirst(const CFX_WideString& findwhat,
    153                             int flags,
    154                             int startPos = 0) = 0;
    155 
    156   virtual FX_BOOL FindNext() = 0;
    157 
    158   virtual FX_BOOL FindPrev() = 0;
    159 
    160   virtual void GetRectArray(CFX_RectArray& rects) const = 0;
    161 
    162   virtual int GetCurOrder() const = 0;
    163 
    164   virtual int GetMatchedCount() const = 0;
    165 };
    166 class IPDF_LinkExtract {
    167  public:
    168   virtual ~IPDF_LinkExtract() {}
    169 
    170   static IPDF_LinkExtract* CreateLinkExtract();
    171 
    172   virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0;
    173 
    174  public:
    175   virtual int CountLinks() const = 0;
    176 
    177   virtual CFX_WideString GetURL(int index) const = 0;
    178 
    179   virtual void GetBoundedSegment(int index, int& start, int& count) const = 0;
    180 
    181   virtual void GetRects(int index, CFX_RectArray& rects) const = 0;
    182 };
    183 
    184 #endif  // CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_
    185