Home | History | Annotate | Download | only in parser
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef CORE_FPDFAPI_PARSER_CPDF_DATA_AVAIL_H_
      8 #define CORE_FPDFAPI_PARSER_CPDF_DATA_AVAIL_H_
      9 
     10 #include <memory>
     11 #include <set>
     12 #include <vector>
     13 
     14 #include "core/fpdfapi/parser/cpdf_parser.h"
     15 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
     16 #include "core/fxcrt/fx_basic.h"
     17 
     18 class CPDF_Dictionary;
     19 class CPDF_HintTables;
     20 class CPDF_IndirectObjectHolder;
     21 class CPDF_LinearizedHeader;
     22 class CPDF_Parser;
     23 
     24 enum PDF_DATAAVAIL_STATUS {
          // State tag type of CPDF_DataAvail::m_docStatus.
          //
          // Only the first enumerator has an explicit value; the rest are numbered
          // consecutively in declaration order (HEADER = 0 ... TRAILER_APPEND = 20).
          // Inserting or reordering entries therefore renumbers every entry that
          // follows.
          //
          // NOTE(review): most names share a stem with a CPDF_DataAvail method
          // (e.g. HEADER / CheckHeader, TRAILER / CheckTrailer, LOADALLFILE /
          // LoadAllFile) -- presumably each state is serviced by that method;
          // confirm against the .cpp.
     25   PDF_DATAAVAIL_HEADER = 0,
     26   PDF_DATAAVAIL_FIRSTPAGE,
     27   PDF_DATAAVAIL_HINTTABLE,
     28   PDF_DATAAVAIL_END,
     29   PDF_DATAAVAIL_CROSSREF,
     30   PDF_DATAAVAIL_CROSSREF_ITEM,
     31   PDF_DATAAVAIL_CROSSREF_STREAM,
     32   PDF_DATAAVAIL_TRAILER,
     33   PDF_DATAAVAIL_LOADALLCROSSREF,
     34   PDF_DATAAVAIL_ROOT,
     35   PDF_DATAAVAIL_INFO,
     36   PDF_DATAAVAIL_ACROFORM,
     37   PDF_DATAAVAIL_ACROFORM_SUBOBJECT,
     38   PDF_DATAAVAIL_PAGETREE,
     39   PDF_DATAAVAIL_PAGE,
     40   PDF_DATAAVAIL_PAGE_LATERLOAD,
     41   PDF_DATAAVAIL_RESOURCES,
     42   PDF_DATAAVAIL_DONE,
     43   PDF_DATAAVAIL_ERROR,
     44   PDF_DATAAVAIL_LOADALLFILE,
     45   PDF_DATAAVAIL_TRAILER_APPEND
     46 };
     47 
     48 enum PDF_PAGENODE_TYPE {
          // Type tag stored in CPDF_DataAvail::PageNode::m_type.
          // Values are implicit after the first: UNKNOWN = 0, PAGE = 1, PAGES = 2,
          // ARRAY = 3.
          //
          // NOTE(review): presumably PAGE is a single page object, PAGES an
          // intermediate page-tree node and ARRAY an array of kids;
          // CPDF_DataAvail declares CheckUnknownPageNode() and CheckArrayPageNode()
          // which likely handle the UNKNOWN and ARRAY cases -- confirm in the .cpp.
     49   PDF_PAGENODE_UNKNOWN = 0,
     50   PDF_PAGENODE_PAGE,
     51   PDF_PAGENODE_PAGES,
     52   PDF_PAGENODE_ARRAY,
     53 };
     54 
     55 class CPDF_DataAvail final {
          // Answers "is it available yet?" queries for a PDF document as a whole
          // (IsDocAvail), for a single page (IsPageAvail) and for the form
          // (IsFormAvail). Byte-range availability is probed through the FileAvail
          // interface; the query methods also receive a DownloadHints sink that
          // accepts (offset, size) segments -- presumably the ranges that still need
          // to be fetched; confirm in the .cpp.
          //
          // This header declares no default member initializers, so every scalar
          // and raw-pointer member has to be set by the constructor, which is
          // defined elsewhere.
     56  public:
     57   // Must match PDF_DATA_* definitions in public/fpdf_dataavail.h, but cannot
     58   // #include that header. fpdfsdk/fpdf_dataavail.cpp has static_asserts
     59   // to make sure the two sets of values match.
     60   enum DocAvailStatus {
     61     DataError = -1,        // PDF_DATA_ERROR
     62     DataNotAvailable = 0,  // PDF_DATA_NOTAVAIL
     63     DataAvailable = 1,     // PDF_DATA_AVAIL
     64   };
     65 
     66   // Must match PDF_*LINEAR* definitions in public/fpdf_dataavail.h, but cannot
     67   // #include that header. fpdfsdk/fpdf_dataavail.cpp has static_asserts
     68   // to make sure the two sets of values match.
     69   enum DocLinearizationStatus {
     70     LinearizationUnknown = -1,  // PDF_LINEARIZATION_UNKNOWN
     71     NotLinearized = 0,          // PDF_NOT_LINEARIZED
     72     Linearized = 1,             // PDF_LINEARIZED
     73   };
     74 
     75   // Must match PDF_FORM_* definitions in public/fpdf_dataavail.h, but cannot
     76   // #include that header. fpdfsdk/fpdf_dataavail.cpp has static_asserts
     77   // to make sure the two sets of values match.
     78   enum DocFormStatus {
     79     FormError = -1,        // PDF_FORM_ERROR
     80     FormNotAvailable = 0,  // PDF_FORM_NOTAVAIL
     81     FormAvailable = 1,     // PDF_FORM_AVAIL
     82     FormNotExist = 2,      // PDF_FORM_NOTEXIST
     83   };
     84 
          // Abstract interface: reports whether the byte range (offset, size) of the
          // file is available. Implemented outside this header.
     85   class FileAvail {
     86    public:
     87     virtual ~FileAvail();
     88     virtual bool IsDataAvail(FX_FILESIZE offset, uint32_t size) = 0;
     89   };
     90 
          // Abstract interface: receives byte ranges as (offset, size) segments.
          // Implemented outside this header.
     91   class DownloadHints {
     92    public:
     93     virtual ~DownloadHints();
     94     virtual void AddSegment(FX_FILESIZE offset, uint32_t size) = 0;
     95   };
     96 
          // |pFileAvail| is held as a raw pointer (see m_pFileAvail); nothing in this
          // header transfers ownership, so presumably the caller must keep it alive
          // for the lifetime of this object -- confirm. |pFileRead| is a retained
          // stream reference. |bSupportHintTable| presumably seeds
          // m_bSupportHintTable.
     97   CPDF_DataAvail(FileAvail* pFileAvail,
     98                  const CFX_RetainPtr<IFX_SeekableReadStream>& pFileRead,
     99                  bool bSupportHintTable);
    100   ~CPDF_DataAvail();
    101 
          // Availability queries. Each takes the DownloadHints sink described above;
          // whether a null |pHints| is accepted is not visible here.
    102   bool IsDataAvail(FX_FILESIZE offset, uint32_t size, DownloadHints* pHints);
    103   DocAvailStatus IsDocAvail(DownloadHints* pHints);
          // NOTE(review): CPDF_Document is not forward-declared in this header; it
          // presumably arrives through one of the includes above.
    104   void SetDocument(CPDF_Document* pDoc);
    105   DocAvailStatus IsPageAvail(uint32_t dwPage, DownloadHints* pHints);
    106   DocFormStatus IsFormAvail(DownloadHints* pHints);
          // Tri-state answer (unknown / no / yes), versus the plain bool below.
    107   DocLinearizationStatus IsLinearizedPDF();
    108   bool IsLinearized();
          // Results are written through the two pointer parameters.
    109   void GetLinearizedMainXRefInfo(FX_FILESIZE* pPos, uint32_t* pSize);
          // Returns a copy of the m_pFileRead smart pointer.
    110   CFX_RetainPtr<IFX_SeekableReadStream> GetFileRead() const {
    111     return m_pFileRead;
    112   }
    113   int GetPageCount() const;
    114   CPDF_Dictionary* GetPage(int index);
    115 
          // NOTE(review): the class is final, so no class can derive from it and
          // `protected` grants nothing beyond `private` here.
    116  protected:
          // Node of a tree: a type tag, a page number, and owned child nodes
          // (children are destroyed with their parent via unique_ptr).
    117   class PageNode {
    118    public:
    119     PageNode();
    120     ~PageNode();
    121 
    122     PDF_PAGENODE_TYPE m_type;
    123     uint32_t m_dwPageNo;
    124     std::vector<std::unique_ptr<PageNode>> m_ChildNodes;
    125   };
    126 
          // Recursion limits. s_CurrentDataAvailRecursionDepth is a static data
          // member, i.e. a single counter shared by every CPDF_DataAvail instance.
    127   static const int kMaxDataAvailRecursionDepth = 64;
    128   static int s_CurrentDataAvailRecursionDepth;
    129   static const int kMaxPageRecursionDepth = 1024;
    130 
          // |offset| is a non-const reference, so the callee can write to it
          // (presumably an output -- confirm).
    131   uint32_t GetObjectSize(uint32_t objnum, FX_FILESIZE& offset);
          // Both vectors are passed by non-const reference; |ret_array| is
          // presumably filled by the callee -- confirm.
    132   bool AreObjectsAvailable(std::vector<CPDF_Object*>& obj_array,
    133                            bool bParsePage,
    134                            DownloadHints* pHints,
    135                            std::vector<CPDF_Object*>& ret_array);
          // Per-step checks. Many share a name stem with a PDF_DATAAVAIL_STATUS
          // enumerator; the mapping itself lives in the .cpp.
    136   bool CheckDocStatus(DownloadHints* pHints);
    137   bool CheckHeader(DownloadHints* pHints);
    138   bool CheckFirstPage(DownloadHints* pHints);
    139   bool CheckHintTables(DownloadHints* pHints);
    140   bool CheckEnd(DownloadHints* pHints);
    141   bool CheckCrossRef(DownloadHints* pHints);
    142   bool CheckCrossRefItem(DownloadHints* pHints);
    143   bool CheckTrailer(DownloadHints* pHints);
    144   bool CheckRoot(DownloadHints* pHints);
    145   bool CheckInfo(DownloadHints* pHints);
    146   bool CheckPages(DownloadHints* pHints);
          // Overloaded: see also CheckPage(uint32_t dwPage, DownloadHints*) below.
    147   bool CheckPage(DownloadHints* pHints);
    148   bool CheckResources(DownloadHints* pHints);
    149   bool CheckAnnots(DownloadHints* pHints);
    150   bool CheckAcroForm(DownloadHints* pHints);
    151   bool CheckAcroFormSubObject(DownloadHints* pHints);
    152   bool CheckTrailerAppend(DownloadHints* pHints);
    153   bool CheckPageStatus(DownloadHints* pHints);
    154   bool CheckAllCrossRefStream(DownloadHints* pHints);
    155 
          // Returns int32_t rather than bool; the meaning of the values is defined
          // in the .cpp. |xref_offset| is writable by the callee.
    156   int32_t CheckCrossRefStream(DownloadHints* pHints, FX_FILESIZE& xref_offset);
    157   bool IsLinearizedFile(uint8_t* pData, uint32_t dwLen);
    158   void SetStartOffset(FX_FILESIZE dwOffset);
          // The result is delivered through the reference parameter; the bool
          // presumably signals success.
    159   bool GetNextToken(CFX_ByteString& token);
    160   bool GetNextChar(uint8_t& ch);
          // The returned unique_ptr makes the caller the owner of the parsed object.
    161   std::unique_ptr<CPDF_Object> ParseIndirectObjectAt(
    162       FX_FILESIZE pos,
    163       uint32_t objnum,
    164       CPDF_IndirectObjectHolder* pObjList = nullptr);
    165   std::unique_ptr<CPDF_Object> GetObject(uint32_t objnum,
    166                                          DownloadHints* pHints,
    167                                          bool* pExistInFile);
    168   bool GetPageKids(CPDF_Parser* pParser, CPDF_Object* pPages);
    169   bool PreparePageItem();
    170   bool LoadPages(DownloadHints* pHints);
    171   bool LoadAllXref(DownloadHints* pHints);
    172   bool LoadAllFile(DownloadHints* pHints);
    173   DocAvailStatus CheckLinearizedData(DownloadHints* pHints);
    174   bool CheckPageAnnots(uint32_t dwPage, DownloadHints* pHints);
    175 
    176   DocAvailStatus CheckLinearizedFirstPage(uint32_t dwPage,
    177                                           DownloadHints* pHints);
    178   bool HaveResourceAncestor(CPDF_Dictionary* pDict);
          // Per-page overload of CheckPage(DownloadHints*) above.
    179   bool CheckPage(uint32_t dwPage, DownloadHints* pHints);
    180   bool LoadDocPages(DownloadHints* pHints);
    181   bool LoadDocPage(uint32_t dwPage, DownloadHints* pHints);
          // |iCount| is a writable reference; |level| is presumably the current
          // recursion depth, bounded by kMaxPageRecursionDepth -- confirm.
    182   bool CheckPageNode(const PageNode& pageNode,
    183                      int32_t iPage,
    184                      int32_t& iCount,
    185                      DownloadHints* pHints,
    186                      int level);
    187   bool CheckUnknownPageNode(uint32_t dwPageNo,
    188                             PageNode* pPageNode,
    189                             DownloadHints* pHints);
    190   bool CheckArrayPageNode(uint32_t dwPageNo,
    191                           PageNode* pPageNode,
    192                           DownloadHints* pHints);
    193   bool CheckPageCount(DownloadHints* pHints);
    194   bool IsFirstCheck(uint32_t dwPage);
    195   void ResetFirstCheck(uint32_t dwPage);
    196   bool ValidatePage(uint32_t dwPage);
    197   bool ValidateForm();
    198 
          // Data members. Declaration order is also construction order, so do not
          // reorder without checking the constructor's initializer list.
          //
          // Ownership visible from the types: std::unique_ptr members (m_pRoot,
          // m_pLinearized, m_pHintTables, elements of m_PagesArray) are owned by
          // this object. NOTE(review): the raw CPDF_Object* / CPDF_Dictionary* /
          // CPDF_Document* / CPDF_Parser* members and the two
          // std::vector<CPDF_Object*> members carry no ownership information in
          // this header -- check the .cpp before changing their lifetime handling.

          // Const pointer: cannot be reseated after construction.
    199   FileAvail* const m_pFileAvail;
    200   CFX_RetainPtr<IFX_SeekableReadStream> m_pFileRead;
          // Held by value, which is why the full parser headers are #included above
          // instead of relying on forward declarations.
    201   CPDF_Parser m_parser;
    202   CPDF_SyntaxParser m_syntaxParser;
    203   std::unique_ptr<CPDF_Object> m_pRoot;
    204   uint32_t m_dwRootObjNum;
    205   uint32_t m_dwInfoObjNum;
    206   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
    207   CPDF_Object* m_pTrailer;
    208   bool m_bDocAvail;
    209   FX_FILESIZE m_dwHeaderOffset;
    210   FX_FILESIZE m_dwLastXRefOffset;
    211   FX_FILESIZE m_dwXRefOffset;
    212   FX_FILESIZE m_dwTrailerOffset;
    213   FX_FILESIZE m_dwCurrentOffset;
          // Current state; see enum PDF_DATAAVAIL_STATUS.
    214   PDF_DATAAVAIL_STATUS m_docStatus;
    215   FX_FILESIZE m_dwFileLen;
    216   CPDF_Document* m_pDocument;
    217   std::set<uint32_t> m_ObjectSet;
    218   std::vector<CPDF_Object*> m_objs_array;
          // Fixed 512-byte buffer plus position/offset/size fields; presumably the
          // read window used by GetNextChar()/GetNextToken() -- confirm.
    219   FX_FILESIZE m_Pos;
    220   FX_FILESIZE m_bufferOffset;
    221   uint32_t m_bufferSize;
    222   CFX_ByteString m_WordBuf;
    223   uint8_t m_bufferData[512];
    224   std::vector<uint32_t> m_XRefStreamList;
    225   std::vector<uint32_t> m_PageObjList;
    226   uint32_t m_PagesObjNum;
          // NOTE(review): "Lineared" is presumably a misspelling of "Linearized";
          // renaming requires touching the .cpp as well.
    227   bool m_bLinearedDataOK;
    228   bool m_bMainXRefLoadTried;
    229   bool m_bMainXRefLoadedOK;
    230   bool m_bPagesTreeLoad;
    231   bool m_bPagesLoad;
    232   CPDF_Parser* m_pCurrentParser;
          // NOTE(review): "Steam" is presumably a misspelling of "Stream".
    233   FX_FILESIZE m_dwCurrentXRefSteam;
    234   bool m_bAnnotsLoad;
    235   bool m_bHaveAcroForm;
    236   uint32_t m_dwAcroFormObjNum;
    237   bool m_bAcroFormLoad;
    238   CPDF_Object* m_pAcroForm;
    239   std::vector<CPDF_Object*> m_arrayAcroforms;
    240   CPDF_Dictionary* m_pPageDict;
    241   CPDF_Object* m_pPageResource;
    242   bool m_bNeedDownLoadResource;
    243   bool m_bPageLoadedOK;
    244   bool m_bLinearizedFormParamLoad;
    245   std::vector<std::unique_ptr<CPDF_Object>> m_PagesArray;
    246   uint32_t m_dwEncryptObjNum;
    247   FX_FILESIZE m_dwPrevXRefOffset;
    248   bool m_bTotalLoadPageTree;
    249   bool m_bCurPageDictLoadOK;
          // Root of the PageNode tree declared above.
    250   PageNode m_PageNode;
          // Sets keyed by uint32_t; presumably page numbers, given the uint32_t
          // dwPage parameters of IsFirstCheck()/ResetFirstCheck() -- confirm.
    251   std::set<uint32_t> m_pageMapCheckState;
    252   std::set<uint32_t> m_pagesLoadState;
    253   std::unique_ptr<CPDF_HintTables> m_pHintTables;
    254   bool m_bSupportHintTable;
    255 };
    256 
    257 #endif  // CORE_FPDFAPI_PARSER_CPDF_DATA_AVAIL_H_
    258