Home | History | Annotate | Download | only in src
      1 // Copyright 2008 Google Inc.
      2 // Author: Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef OPEN_VCDIFF_HEADERPARSER_H_
     17 #define OPEN_VCDIFF_HEADERPARSER_H_
     18 
     19 #include <config.h>
     20 #include <stddef.h>  // NULL
     21 #include <stdint.h>  // int32_t, uint32_t
     22 #include "checksum.h"  // VCDChecksum
     23 #include "vcdiff_defs.h"  // VCDiffResult
     24 
     25 namespace open_vcdiff {
     26 
     27 // This class contains a contiguous memory buffer with start and end pointers,
     28 // as well as a position pointer which shows how much of the buffer has been
     29 // parsed and how much remains.
     30 //
     31 // Because no virtual destructor is defined for ParseableChunk, a pointer to
     32 // a child class of ParseableChunk must be destroyed using its specific type,
     33 // rather than as a ParseableChunk*.
     34 class ParseableChunk {
     35  public:
     36   ParseableChunk(const char* data_start, size_t data_size) {
     37     SetDataBuffer(data_start, data_size);
     38   }
     39 
     40   const char* End() const { return end_; }
     41 
     42   // The number of bytes remaining to be parsed.  This is not necessarily the
     43   // same as the initial size of the buffer; it changes with each call to
     44   // Advance().
     45   size_t UnparsedSize() const {
     46     return end_ - position_;
     47   }
     48 
     49   // The number of bytes that have already been parsed.
     50   size_t ParsedSize() const {
     51     return position_ - start_;
     52   }
     53 
     54   bool Empty() const { return 0 == UnparsedSize(); }
     55 
     56   // The start of the data remaining to be parsed.
     57   const char* UnparsedData() const { return position_; }
     58 
     59   // Returns a pointer to the start of the data remaining to be parsed.
     60   const char** UnparsedDataAddr() { return &position_; }
     61 
     62   // Moves the parsing position forward by number_of_bytes.
     63   void Advance(size_t number_of_bytes);
     64 
     65   // Jumps the parsing position to a new location.
     66   void SetPosition(const char* position);
     67 
     68   // Jumps the parsing position to the end of the data chunk.
     69   void Finish() {
     70     position_ = end_;
     71   }
     72 
     73   // Jumps the parsing position so that there are now number_of_bytes
     74   // bytes left to parse.  This number should be smaller than the size of data
     75   // to be parsed before the function was called.
     76   void FinishExcept(size_t number_of_bytes);
     77 
     78   void SetDataBuffer(const char* data_start, size_t data_size) {
     79     start_ = data_start;
     80     end_ = data_start + data_size;
     81     position_ = start_;
     82   }
     83 
     84  private:
     85   const char* start_;
     86   const char* end_;
     87 
     88   // The current parsing position within the data chunk.
     89   // Must always respect start_ <= position_ <= end_.
     90   const char* position_;
     91 
     92   // Making these private avoids implicit copy constructor & assignment operator
     93   ParseableChunk(const ParseableChunk&);
     94   void operator=(const ParseableChunk&);
     95 };
     96 
     97 // Represents one of the three sections in the delta window, as described in
     98 // RFC section 4.3:
     99 //     * Data section for ADDs and RUNs
    100 //     * Instructions and sizes section
    101 //     * Addresses section for COPYs
    102 // When using the interleaved format, data and addresses are pulled from the
    103 // instructions and sizes section rather than being stored in separate sections.
    104 // For that reason, this class allows one DeltaWindowSection to be based on
    105 // another, such that the same position pointer is shared by both sections;
    106 // i.e., UnparsedDataAddr() returns the same value for both objects.
    107 // To achieve this end, one extra level of indirection (a pointer to a
    108 // ParseableChunk object) is added.
    109 class DeltaWindowSection {
    110  public:
    111   DeltaWindowSection() : parseable_chunk_(NULL), owned_(true) { }
    112 
    113   ~DeltaWindowSection() {
    114     FreeChunk();
    115   }
    116 
    117   void Init(const char* data_start, size_t data_size) {
    118     if (owned_ && parseable_chunk_) {
    119       // Reuse the already-allocated ParseableChunk object.
    120       parseable_chunk_->SetDataBuffer(data_start, data_size);
    121     } else {
    122       parseable_chunk_ = new ParseableChunk(data_start, data_size);
    123       owned_ = true;
    124     }
    125   }
    126 
    127   void Init(DeltaWindowSection* original) {
    128     FreeChunk();
    129     parseable_chunk_ = original->parseable_chunk_;
    130     owned_ = false;
    131   }
    132 
    133   void Invalidate() { FreeChunk(); }
    134 
    135   bool IsOwned() const { return owned_; }
    136 
    137   // The following functions just pass their arguments to the underlying
    138   // ParseableChunk object.
    139 
    140   const char* End() const {
    141     return parseable_chunk_->End();
    142   }
    143 
    144   size_t UnparsedSize() const {
    145     return parseable_chunk_->UnparsedSize();
    146   }
    147 
    148   size_t ParsedSize() const {
    149     return parseable_chunk_->ParsedSize();
    150   }
    151 
    152   bool Empty() const {
    153     return parseable_chunk_->Empty();
    154   }
    155 
    156   const char* UnparsedData() const {
    157     return parseable_chunk_->UnparsedData();
    158   }
    159 
    160   const char** UnparsedDataAddr() {
    161     return parseable_chunk_->UnparsedDataAddr();
    162   }
    163 
    164   void Advance(size_t number_of_bytes) {
    165     return parseable_chunk_->Advance(number_of_bytes);
    166   }
    167  private:
    168   void FreeChunk() {
    169     if (owned_) {
    170       delete parseable_chunk_;
    171     }
    172     parseable_chunk_ = NULL;
    173   }
    174 
    175   // Will be NULL until Init() has been called.  If owned_ is true, this will
    176   // point to a ParseableChunk object that has been allocated with "new" and
    177   // must be deleted by this DeltaWindowSection object.  If owned_ is false,
    178   // this points at the parseable_chunk_ owned by a different DeltaWindowSection
    179   // object.  In this case, it is important to free the DeltaWindowSection which
    180   // does not own the ParseableChunk before (or simultaneously to) freeing the
    181   // DeltaWindowSection that owns it, or else deleted memory may be accessed.
    182   ParseableChunk* parseable_chunk_;
    183   bool owned_;
    184 
    185   // Making these private avoids implicit copy constructor & assignment operator
    186   DeltaWindowSection(const DeltaWindowSection&);
    187   void operator=(const DeltaWindowSection&);
    188 };
    189 
    190 // Used to parse the bytes and Varints that make up the delta file header
    191 // or delta window header.
    192 class VCDiffHeaderParser {
    193  public:
    194   // header_start should be the start of the header to be parsed;
    195   // data_end is the position just after the last byte of available data
    196   // (which may extend far past the end of the header.)
    197   VCDiffHeaderParser(const char* header_start, const char* data_end);
    198 
    199   // One of these functions should be called for each element of the header.
    200   // variable_description is a description of the value that we are attempting
    201   // to parse, and will only be used to create descriptive error messages.
    202   // If the function returns true, then the element was parsed successfully
    203   // and its value has been placed in *value.  If the function returns false,
    204   // then *value is unchanged, and GetResult() can be called to return the
    205   // reason that the element could not be parsed, which will be either
    206   // RESULT_ERROR (an error occurred), or RESULT_END_OF_DATA (the limit data_end
    207   // was reached before the end of the element to be parsed.)  Once one of these
    208   // functions has returned false, further calls to any of the Parse...
    209   // functions will also return false without performing any additional actions.
    210   // Typical usage is as follows:
    211   //     int32_t segment_length = 0;
    212   //     if (!header_parser.ParseInt32("segment length", &segment_length)) {
    213   //       return header_parser.GetResult();
    214   //     }
    215   //
    216   // The following example takes advantage of the fact that calling a Parse...
    217   // function after an error or end-of-data condition is legal and does nothing.
    218   // It can thus parse more than one element in a row and check the status
    219   // afterwards.  If the first call to ParseInt32() fails, the second will have
    220   // no effect:
    221   //
    222   //     int32_t segment_length = 0, segment_position = 0;
    223   //     header_parser.ParseInt32("segment length", &segment_length));
    224   //     header_parser.ParseInt32("segment position", &segment_position));
    225   //     if (RESULT_SUCCESS != header_parser.GetResult()) {
    226   //       return header_parser.GetResult();
    227   //     }
    228   //
    229   bool ParseByte(unsigned char* value);
    230   bool ParseInt32(const char* variable_description, int32_t* value);
    231   bool ParseUInt32(const char* variable_description, uint32_t* value);
    232   bool ParseChecksum(const char* variable_description, VCDChecksum* value);
    233   bool ParseSize(const char* variable_description, size_t* value);
    234 
    235   // Parses the first three elements of the delta window header:
    236   //
    237   //     Win_Indicator                            - byte
    238   //     [Source segment size]                    - integer (VarintBE format)
    239   //     [Source segment position]                - integer (VarintBE format)
    240   //
    241   // Returns true if the values were parsed successfully and the values were
    242   // found to be acceptable.  Returns false otherwise, in which case
    243   // GetResult() can be called to return the reason that the two values
    244   // could not be validated.  This will be either RESULT_ERROR (an error
    245   // occurred and was logged), or RESULT_END_OF_DATA (the limit data_end was
    246   // reached before the end of the values to be parsed.)  If return value is
    247   // true, then *win_indicator, *source_segment_length, and
    248   // *source_segment_position are populated with the parsed values.  Otherwise,
    249   // the values of these output arguments are undefined.
    250   //
    251   // dictionary_size: The size of the dictionary (source) file.  Used to
    252   //     validate the limits of source_segment_length and
    253   //     source_segment_position if the source segment is taken from the
    254   //     dictionary (i.e., if the parsed *win_indicator equals VCD_SOURCE.)
    255   // decoded_target_size: The size of the target data that has been decoded
    256   //     so far, including all target windows.  Used to validate the limits of
    257   //     source_segment_length and source_segment_position if the source segment
    258   //     is taken from the target (i.e., if the parsed *win_indicator equals
    259   //     VCD_TARGET.)
    260   // allow_vcd_target: If this argument is false, and the parsed *win_indicator
    261   //     is VCD_TARGET, then an error is produced; if true, VCD_TARGET is
    262   //     allowed.
    263   // win_indicator (output): Points to a single unsigned char (not an array)
    264   //     that will receive the parsed value of Win_Indicator.
    265   // source_segment_length (output): The parsed length of the source segment.
    266   // source_segment_position (output): The parsed zero-based index in the
    267   //     source/target file from which the source segment is to be taken.
    268   bool ParseWinIndicatorAndSourceSegment(size_t dictionary_size,
    269                                          size_t decoded_target_size,
    270                                          bool allow_vcd_target,
    271                                          unsigned char* win_indicator,
    272                                          size_t* source_segment_length,
    273                                          size_t* source_segment_position);
    274 
    275   // Parses the following two elements of the delta window header:
    276   //
    277   //     Length of the delta encoding             - integer (VarintBE format)
    278   //     Size of the target window                - integer (VarintBE format)
    279   //
    280   // Return conditions and values are the same as for
    281   // ParseWinIndicatorAndSourceSegment(), above.
    282   //
    283   bool ParseWindowLengths(size_t* target_window_length);
    284 
    285   // May only be called after ParseWindowLengths() has returned RESULT_SUCCESS.
    286   // Returns a pointer to the end of the delta window (which might not point to
    287   // a valid memory location if there is insufficient input data.)
    288   //
    289   const char* EndOfDeltaWindow() const;
    290 
    291   // Parses the following element of the delta window header:
    292   //
    293   //     Delta_Indicator                          - byte
    294   //
    295   // Because none of the bits in Delta_Indicator are used by this implementation
    296   // of VCDIFF, this function does not have an output argument to return the
    297   // value of that field.  It may return RESULT_SUCCESS, RESULT_ERROR, or
    298   // RESULT_END_OF_DATA as with the other Parse...() functions.
    299   //
    300   bool ParseDeltaIndicator();
    301 
    302   // Parses the following 3 elements of the delta window header:
    303   //
    304   //     Length of data for ADDs and RUNs - integer (VarintBE format)
    305   //     Length of instructions and sizes - integer (VarintBE format)
    306   //     Length of addresses for COPYs    - integer (VarintBE format)
    307   //
    308   // If has_checksum is true, it also looks for the following element:
    309   //
    310   //     Adler32 checksum            - unsigned 32-bit integer (VarintBE format)
    311   //
    312   // Return conditions and values are the same as for
    313   // ParseWinIndicatorAndSourceSegment(), above.
    314   //
    315   bool ParseSectionLengths(bool has_checksum,
    316                            size_t* add_and_run_data_length,
    317                            size_t* instructions_and_sizes_length,
    318                            size_t* addresses_length,
    319                            VCDChecksum* checksum);
    320 
    321   // If one of the Parse... functions returned false, this function
    322   // can be used to find the result code (RESULT_ERROR or RESULT_END_OF_DATA)
    323   // describing the reason for the most recent parse failure.  If none of the
    324   // Parse... functions has returned false, returns RESULT_SUCCESS.
    325   VCDiffResult GetResult() const {
    326     return return_code_;
    327   }
    328 
    329   // The following functions just pass their arguments to the underlying
    330   // ParseableChunk object.
    331 
    332   const char* End() const {
    333     return parseable_chunk_.End();
    334   }
    335 
    336   size_t UnparsedSize() const {
    337     return parseable_chunk_.UnparsedSize();
    338   }
    339 
    340   size_t ParsedSize() const {
    341     return parseable_chunk_.ParsedSize();
    342   }
    343 
    344   const char* UnparsedData() const {
    345     return parseable_chunk_.UnparsedData();
    346   }
    347 
    348  private:
    349   // Parses two variable-length integers representing the source segment length
    350   // and source segment position (== offset.)  Checks whether the source segment
    351   // length and position would cause it to exceed the size of the source file or
    352   // target file.  Returns true if the values were parsed successfully and the
    353   // values were found to be acceptable.  Returns false otherwise, in which case
    354   // GetResult() can be called to return the reason that the two values could
    355   // not be validated, which will be either RESULT_ERROR (an error occurred and
    356   // was logged), or RESULT_END_OF_DATA (the limit data_end was reached before
    357   // the end of the integers to be parsed.)
    358   // from_size: The requested size of the source segment.
    359   // from_boundary_name: A NULL-terminated string naming the end of the
    360   //     source or target file, used in error messages.
    361   // from_name: A NULL-terminated string naming the source or target file,
    362   //     also used in error messages.
    363   // source_segment_length (output): The parsed length of the source segment.
    364   // source_segment_position (output): The parsed zero-based index in the
    365   //     source/target file from which the source segment is to be taken.
    366   //
    367   bool ParseSourceSegmentLengthAndPosition(size_t from_size,
    368                                            const char* from_boundary_name,
    369                                            const char* from_name,
    370                                            size_t* source_segment_length,
    371                                            size_t* source_segment_position);
    372 
    373   ParseableChunk parseable_chunk_;
    374 
    375   // Contains the result code of the last Parse...() operation that failed
    376   // (RESULT_ERROR or RESULT_END_OF_DATA).  If no Parse...() method has been
    377   // called, or if all calls to Parse...() were successful, then this contains
    378   // RESULT_SUCCESS.
    379   VCDiffResult return_code_;
    380 
    381   // Will be zero until ParseWindowLengths() has been called.  After
    382   // ParseWindowLengths() has been called successfully, this contains the
    383   // parsed length of the delta encoding.
    384   size_t delta_encoding_length_;
    385 
    386   // Will be NULL until ParseWindowLengths() has been called.  After
    387   // ParseWindowLengths() has been called successfully, this points to the
    388   // beginning of the section of the current window titled "The delta encoding"
    389   // in the RFC, i.e., to the position just after the length of the delta
    390   // encoding.
    391   const char* delta_encoding_start_;
    392 
    393   // Making these private avoids implicit copy constructor & assignment operator
    394   VCDiffHeaderParser(const VCDiffHeaderParser&);
    395   void operator=(const VCDiffHeaderParser&);
    396 };
    397 
    398 }  // namespace open_vcdiff
    399 
    400 #endif  // OPEN_VCDIFF_HEADERPARSER_H_
    401