Home | History | Annotate | Download | only in parser
      1 // Copyright 2016 PDFium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
      6 
      7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
      8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
      9 
     10 #include <limits>
     11 #include <map>
     12 #include <memory>
     13 #include <set>
     14 #include <vector>
     15 
     16 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
     17 #include "core/fxcrt/fx_string.h"
     18 #include "core/fxcrt/fx_system.h"
     19 #include "core/fxcrt/retain_ptr.h"
     20 #include "core/fxcrt/unowned_ptr.h"
     21 
     22 class CPDF_Array;
     23 class CPDF_CryptoHandler;
     24 class CPDF_Dictionary;
     25 class CPDF_Document;
     26 class CPDF_IndirectObjectHolder;
     27 class CPDF_LinearizedHeader;
     28 class CPDF_Object;
     29 class CPDF_SecurityHandler;
     30 class CPDF_StreamAcc;
     31 class CPDF_SyntaxParser;
     32 class IFX_SeekableReadStream;
     33 
     34 class CPDF_Parser {
          // Declares the PDF file parser: entry points that start parsing a stream
          // for a CPDF_Document, accessors for trailer, cross-reference and
          // encryption state, and helpers that parse individual indirect objects.
          // Only the small inline accessors are defined in this header; every other
          // member function is defined elsewhere.
     35  public:
          // Status codes returned by the parsing entry points such as StartParse().
     36   enum Error {
     37     SUCCESS = 0,
     38     FILE_ERROR,
     39     FORMAT_ERROR,
     40     PASSWORD_ERROR,
     41     HANDLER_ERROR
     42   };
     43 
     44   // A limit on the maximum object number in the xref table. Theoretical limits
     45   // are higher, but this may be large enough in practice.
     46   static const uint32_t kMaxObjectNumber = 1048576;
     47 
          // Sentinel position: the largest value representable by size_t.
     48   static const size_t kInvalidPos = std::numeric_limits<size_t>::max();
     49 
     50   CPDF_Parser();
     51   ~CPDF_Parser();
     52 
          // Start parsing |pFile| on behalf of |pDocument| and report the outcome as
          // an Error.
          // NOTE(review): the difference between the two entry points is not visible
          // in this header; presumably the second one targets linearized files.
     53   Error StartParse(const RetainPtr<IFX_SeekableReadStream>& pFile,
     54                    CPDF_Document* pDocument);
     55   Error StartLinearizedParse(const RetainPtr<IFX_SeekableReadStream>& pFile,
     56                              CPDF_Document* pDocument);
     57 
          // Stores |password| in m_Password.
     58   void SetPassword(const char* password) { m_Password = password; }
          // Returns a copy of the stored password. Declared const because it only
          // reads m_Password, so it is usable through a const CPDF_Parser.
     59   ByteString GetPassword() const { return m_Password; }
     60 
     61   CPDF_Dictionary* GetTrailer() const;
     62 
     63   // Returns a new trailer which combines the last read trailer with the /Root
     64   // and /Info from previous ones.
     65   std::unique_ptr<CPDF_Dictionary> GetCombinedTrailer() const;
     66 
          // Returns the value currently stored in m_LastXRefOffset.
     67   FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
     68 
     69   uint32_t GetPermissions() const;
     70   uint32_t GetRootObjNum();
     71   uint32_t GetInfoObjNum();
     72   const CPDF_Array* GetIDArray() const;
     73 
          // Returns the raw pointer held by m_pEncryptDict; the parser does not own
          // the dictionary through this member (it is an UnownedPtr).
     74   CPDF_Dictionary* GetEncryptDict() const { return m_pEncryptDict.Get(); }
     75 
          // Parses the indirect object numbered |objnum|; the caller receives
          // ownership of the result.
          // NOTE(review): the failure result (presumably nullptr) is not visible in
          // this header.
     76   std::unique_ptr<CPDF_Object> ParseIndirectObject(
     77       CPDF_IndirectObjectHolder* pObjList,
     78       uint32_t objnum);
     79 
          // Queries about object numbers.
          // NOTE(review): presumably answered from m_ObjectInfo; confirm in the
          // implementation file.
     80   uint32_t GetLastObjNum() const;
     81   bool IsValidObjectNumber(uint32_t objnum) const;
     82   FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
     83   uint16_t GetObjectGenNum(uint32_t objnum) const;
     84   bool IsObjectFreeOrNull(uint32_t objnum) const;
          // Returns a borrowed pointer; the handler stays owned by
          // m_pSecurityHandler.
     85   CPDF_SecurityHandler* GetSecurityHandler() const {
     86     return m_pSecurityHandler.get();
     87   }
     88   RetainPtr<IFX_SeekableReadStream> GetFileAccess() const;
     89   bool IsObjectFree(uint32_t objnum) const;
     90 
     91   FX_FILESIZE GetObjectOffset(uint32_t objnum) const;
     92 
          // Return the current values of m_FileVersion and m_bXRefStream.
     93   int GetFileVersion() const { return m_FileVersion; }
     94   bool IsXRefStream() const { return m_bXRefStream; }
     95 
          // Parse the indirect object |objnum| located at file position |pos|.
          // The "ByStrict" variant additionally takes |pResultPos| as an output
          // position.
          // NOTE(review): both presumably forward to
          // ParseIndirectObjectAtInternal() with different parse types; confirm in
          // the implementation file.
     96   std::unique_ptr<CPDF_Object> ParseIndirectObjectAt(
     97       CPDF_IndirectObjectHolder* pObjList,
     98       FX_FILESIZE pos,
     99       uint32_t objnum);
    100 
    101   std::unique_ptr<CPDF_Object> ParseIndirectObjectAtByStrict(
    102       CPDF_IndirectObjectHolder* pObjList,
    103       FX_FILESIZE pos,
    104       uint32_t objnum,
    105       FX_FILESIZE* pResultPos);
    106 
    107   uint32_t GetFirstPageNo() const;
    108 
    109  protected:
          // Classification of an object entry, stored in a single byte.
    110   enum class ObjectType : uint8_t {
    111     kFree = 0x00,
    112     kNotCompressed = 0x01,
    113     kCompressed = 0x02,
    114     kNull = 0xFF,
    115   };
    116 
          // Per-object record kept in m_ObjectInfo. A default-constructed record is
          // a free entry with position 0 and generation number 0.
    117   struct ObjectInfo {
    118     ObjectInfo() : pos(0), type(ObjectType::kFree), gennum(0) {}
    119     // if type is ObjectType::kCompressed the archive_obj_num should be used.
    120     // if type is ObjectType::kNotCompressed the pos should be used.
    121     // In all other cases neither field is used.
    122     union {
    123       FX_FILESIZE pos;
    124       FX_FILESIZE archive_obj_num;
    125     };
    126     ObjectType type;
    127     uint16_t gennum;
    128   };
    129 
          // Syntax parser owned by this object.
    130   std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
          // Maps an object number to its ObjectInfo record.
    131   std::map<uint32_t, ObjectInfo> m_ObjectInfo;
    132 
    133   bool LoadCrossRefV4(FX_FILESIZE pos, bool bSkip);
    134   bool RebuildCrossRef();
    135 
    136  private:
          // CPDF_DataAvail is granted access to the private members below.
    137   friend class CPDF_DataAvail;
    138 
          // Defined outside this header; instances are owned via m_TrailerData.
    139   class TrailerData;
    140 
          // NOTE(review): the state names suggest a hand-written scanner over raw
          // file bytes (presumably used by RebuildCrossRef()); the consumer is not
          // visible in this header.
    141   enum class ParserState {
    142     kDefault,
    143     kComment,
    144     kWhitespace,
    145     kString,
    146     kHexString,
    147     kEscapedString,
    148     kXref,
    149     kObjNum,
    150     kPostObjNum,
    151     kGenNum,
    152     kPostGenNum,
    153     kTrailer,
    154     kBeginObj,
    155     kEndObj
    156   };
    157 
          // Pairs an object number with its ObjectInfo. Element type of the vectors
          // passed to the cross-reference parsing helpers declared below.
    158   struct CrossRefObjData {
    159     uint32_t obj_num = 0;
    160     ObjectInfo info;
    161   };
    162 
    163   Error StartParseInternal(CPDF_Document* pDocument);
    164   FX_FILESIZE ParseStartXRef();
    165   bool LoadAllCrossRefV4(FX_FILESIZE pos);
    166   bool LoadAllCrossRefV5(FX_FILESIZE pos);
    167   bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef);
    168   std::unique_ptr<CPDF_Dictionary> LoadTrailerV4();
    169   Error SetEncryptHandler();
    170   void ReleaseEncryptHandler();
    171   bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos);
    172   bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos);
    173   Error LoadLinearizedMainXRefTable();
    174   RetainPtr<CPDF_StreamAcc> GetObjectStream(uint32_t number);
    175   std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader();
    176   void SetEncryptDictionary(CPDF_Dictionary* pDict);
    177   void ShrinkObjectMap(uint32_t size);
    178   // A simple check whether the cross reference table matches with
    179   // the objects.
    180   bool VerifyCrossRefV4();
    181 
    182   // If out_objects is null, the parser position will be moved to end subsection
    183   // without additional validation.
    184   bool ParseAndAppendCrossRefSubsectionData(
    185       uint32_t start_objnum,
    186       uint32_t count,
    187       std::vector<CrossRefObjData>* out_objects);
    188   bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects);
    189   void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects);
    190 
          // Shared worker taking an explicit |parse_type| and an output position
          // |pResultPos|.
          // NOTE(review): presumably the common implementation behind the public
          // ParseIndirectObjectAt*() methods; confirm in the implementation file.
    191   std::unique_ptr<CPDF_Object> ParseIndirectObjectAtInternal(
    192       CPDF_IndirectObjectHolder* pObjList,
    193       FX_FILESIZE pos,
    194       uint32_t objnum,
    195       CPDF_SyntaxParser::ParseType parse_type,
    196       FX_FILESIZE* pResultPos);
    197 
    198   bool InitSyntaxParser(const RetainPtr<IFX_SeekableReadStream>& file_access);
    199   bool ParseFileVersion();
    200 
          // Non-owning reference to a document (UnownedPtr).
          // NOTE(review): presumably the document passed to the parse entry points.
    201   UnownedPtr<CPDF_Document> m_pDocument;
    202   ObjectType GetObjectType(uint32_t objnum) const;
    203   ObjectType GetObjectTypeFromCrossRefStreamType(
    204       int cross_ref_stream_type) const;
    205 
          // The members below carry no in-class initializers; they are expected to be
          // set by the constructor, which is defined outside this header.
    206   bool m_bHasParsed;
    207   bool m_bXRefStream;
    208   int m_FileVersion;
    209   // m_TrailerData must be destroyed after m_pSecurityHandler due to the
    210   // ownership of the ID array data.
          // Members are destroyed in reverse declaration order, so m_TrailerData
          // must stay declared before m_pSecurityHandler.
    211   std::unique_ptr<TrailerData> m_TrailerData;
    212   UnownedPtr<CPDF_Dictionary> m_pEncryptDict;
    213   FX_FILESIZE m_LastXRefOffset;
    214   std::unique_ptr<CPDF_SecurityHandler> m_pSecurityHandler;
    215   ByteString m_Password;
    216   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
    217 
    218   // A map of object numbers to indirect streams.
    219   std::map<uint32_t, RetainPtr<CPDF_StreamAcc>> m_ObjectStreamMap;
    220 
    221   // Mapping of object numbers to offsets. The offsets are relative to the first
    222   // object in the stream.
    223   using StreamObjectCache = std::map<uint32_t, uint32_t>;
    224 
    225   // Mapping of streams to their object caches. This is valid as long as the
    226   // streams in |m_ObjectStreamMap| are valid.
    227   std::map<RetainPtr<CPDF_StreamAcc>, StreamObjectCache> m_ObjCache;
    228 
    229   // All indirect object numbers that are being parsed.
    230   std::set<uint32_t> m_ParsingObjNums;
    231 
    232   uint32_t m_MetadataObjnum = 0;
    233 };
    234 
    235 #endif  // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
    236