Home | History | Annotate | Download | only in parser
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
      8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
      9 
     10 #include <map>
     11 #include <memory>
     12 #include <set>
     13 #include <vector>
     14 
     15 #include "core/fxcrt/fx_basic.h"
     16 
     17 class CPDF_Array;
     18 class CPDF_CryptoHandler;
     19 class CPDF_Dictionary;
     20 class CPDF_Document;
     21 class CPDF_IndirectObjectHolder;
     22 class CPDF_LinearizedHeader;
     23 class CPDF_Object;
     24 class CPDF_SecurityHandler;
     25 class CPDF_StreamAcc;
     26 class CPDF_SyntaxParser;
     27 class IFX_SeekableReadStream;
     28 
     29 class CPDF_Parser {
     30  public:
     31   enum Error {
     32     SUCCESS = 0,
     33     FILE_ERROR,
     34     FORMAT_ERROR,
     35     PASSWORD_ERROR,
     36     HANDLER_ERROR
     37   };
     38 
     39   // A limit on the maximum object number in the xref table. Theoretical limits
     40   // are higher, but this may be large enough in practice.
     41   static const uint32_t kMaxObjectNumber = 1048576;
     42 
     43   CPDF_Parser();
     44   ~CPDF_Parser();
     45 
     46   Error StartParse(const CFX_RetainPtr<IFX_SeekableReadStream>& pFile,
     47                    CPDF_Document* pDocument);
     48   Error StartLinearizedParse(const CFX_RetainPtr<IFX_SeekableReadStream>& pFile,
     49                              CPDF_Document* pDocument);
     50 
     51   void SetPassword(const FX_CHAR* password) { m_Password = password; }
     52   CFX_ByteString GetPassword() { return m_Password; }
     53   CPDF_Dictionary* GetTrailer() const { return m_pTrailer.get(); }
     54   FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
     55 
     56   uint32_t GetPermissions() const;
     57   uint32_t GetRootObjNum();
     58   uint32_t GetInfoObjNum();
     59   CPDF_Array* GetIDArray();
     60 
     61   CPDF_Dictionary* GetEncryptDict() const { return m_pEncryptDict; }
     62 
     63   std::unique_ptr<CPDF_Object> ParseIndirectObject(
     64       CPDF_IndirectObjectHolder* pObjList,
     65       uint32_t objnum);
     66 
     67   uint32_t GetLastObjNum() const;
     68   bool IsValidObjectNumber(uint32_t objnum) const;
     69   FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
     70   uint8_t GetObjectType(uint32_t objnum) const;
     71   uint16_t GetObjectGenNum(uint32_t objnum) const;
     72   bool IsVersionUpdated() const { return m_bVersionUpdated; }
     73   bool IsObjectFreeOrNull(uint32_t objnum) const;
     74   CPDF_CryptoHandler* GetCryptoHandler();
     75   CFX_RetainPtr<IFX_SeekableReadStream> GetFileAccess() const;
     76 
     77   FX_FILESIZE GetObjectOffset(uint32_t objnum) const;
     78   FX_FILESIZE GetObjectSize(uint32_t objnum) const;
     79 
     80   void GetIndirectBinary(uint32_t objnum, uint8_t*& pBuffer, uint32_t& size);
     81   int GetFileVersion() const { return m_FileVersion; }
     82   bool IsXRefStream() const { return m_bXRefStream; }
     83 
     84   std::unique_ptr<CPDF_Object> ParseIndirectObjectAt(
     85       CPDF_IndirectObjectHolder* pObjList,
     86       FX_FILESIZE pos,
     87       uint32_t objnum);
     88 
     89   std::unique_ptr<CPDF_Object> ParseIndirectObjectAtByStrict(
     90       CPDF_IndirectObjectHolder* pObjList,
     91       FX_FILESIZE pos,
     92       uint32_t objnum,
     93       FX_FILESIZE* pResultPos);
     94 
     95   uint32_t GetFirstPageNo() const;
     96 
     97  protected:
     98   struct ObjectInfo {
     99     ObjectInfo() : pos(0), type(0), gennum(0) {}
    100 
    101     FX_FILESIZE pos;
    102     uint8_t type;
    103     uint16_t gennum;
    104   };
    105 
    106   std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
    107   std::map<uint32_t, ObjectInfo> m_ObjectInfo;
    108 
    109   bool LoadCrossRefV4(FX_FILESIZE pos, FX_FILESIZE streampos, bool bSkip);
    110   bool RebuildCrossRef();
    111 
    112  private:
    113   friend class CPDF_DataAvail;
    114 
    115   enum class ParserState {
    116     kDefault,
    117     kComment,
    118     kWhitespace,
    119     kString,
    120     kHexString,
    121     kEscapedString,
    122     kXref,
    123     kObjNum,
    124     kPostObjNum,
    125     kGenNum,
    126     kPostGenNum,
    127     kTrailer,
    128     kBeginObj,
    129     kEndObj
    130   };
    131 
    132   CPDF_Object* ParseDirect(CPDF_Object* pObj);
    133   bool LoadAllCrossRefV4(FX_FILESIZE pos);
    134   bool LoadAllCrossRefV5(FX_FILESIZE pos);
    135   bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef);
    136   std::unique_ptr<CPDF_Dictionary> LoadTrailerV4();
    137   Error SetEncryptHandler();
    138   void ReleaseEncryptHandler();
    139   bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount);
    140   bool LoadLinearizedCrossRefV4(FX_FILESIZE pos, uint32_t dwObjCount);
    141   bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos);
    142   Error LoadLinearizedMainXRefTable();
    143   CPDF_StreamAcc* GetObjectStream(uint32_t number);
    144   bool IsLinearizedFile(
    145       const CFX_RetainPtr<IFX_SeekableReadStream>& pFileAccess,
    146       uint32_t offset);
    147   void SetEncryptDictionary(CPDF_Dictionary* pDict);
    148   void ShrinkObjectMap(uint32_t size);
    149   // A simple check whether the cross reference table matches with
    150   // the objects.
    151   bool VerifyCrossRefV4();
    152 
    153   CPDF_Document* m_pDocument;  // not owned
    154   bool m_bHasParsed;
    155   bool m_bXRefStream;
    156   bool m_bVersionUpdated;
    157   int m_FileVersion;
    158   CPDF_Dictionary* m_pEncryptDict;
    159   FX_FILESIZE m_LastXRefOffset;
    160   std::unique_ptr<CPDF_SecurityHandler> m_pSecurityHandler;
    161   CFX_ByteString m_Password;
    162   std::set<FX_FILESIZE> m_SortedOffset;
    163   std::unique_ptr<CPDF_Dictionary> m_pTrailer;
    164   std::vector<std::unique_ptr<CPDF_Dictionary>> m_Trailers;
    165   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
    166   uint32_t m_dwXrefStartObjNum;
    167 
    168   // A map of object numbers to indirect streams. Map owns the streams.
    169   std::map<uint32_t, std::unique_ptr<CPDF_StreamAcc>> m_ObjectStreamMap;
    170 
    171   // Mapping of object numbers to offsets. The offsets are relative to the first
    172   // object in the stream.
    173   using StreamObjectCache = std::map<uint32_t, uint32_t>;
    174 
    175   // Mapping of streams to their object caches. This is valid as long as the
    176   // streams in |m_ObjectStreamMap| are valid.
    177   std::map<CPDF_StreamAcc*, StreamObjectCache> m_ObjCache;
    178 
    179   // All indirect object numbers that are being parsed.
    180   std::set<uint32_t> m_ParsingObjNums;
    181 };
    182 
    183 #endif  // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
    184