Home | History | Annotate | Download | only in src
      1 // Copyright 2008 Google Inc.
      2 // Author: Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef OPEN_VCDIFF_ENCODETABLE_H_
     17 #define OPEN_VCDIFF_ENCODETABLE_H_
     18 
     19 #include <config.h>
     20 #include <stddef.h>  // size_t
     21 #include <stdint.h>  // int32_t
     22 #include <string>
     23 #include <vector>
     24 #include "addrcache.h"
     25 #include "checksum.h"
     26 #include "codetable.h"
     27 #include "codetablewriter_interface.h"
     28 
     29 namespace open_vcdiff {
     30 
     31 class OutputStringInterface;
     32 class VCDiffInstructionMap;
     33 
     34 // The method calls after construction *must* conform
     35 // to the following pattern:
     36 //    {{Add|Copy|Run}* [AddChecksum] Output}*
     37 //
     38 // When Output has been called in this sequence, a complete target window
     39 // (as defined in RFC 3284 section 4.3) will have been appended to
     40 // out (unless no calls to Add, Run, or Copy were made, in which
     41 // case Output will do nothing.)  The output will not be available for use
     42 // until after each call to Output().
     43 //
     44 // NOT threadsafe.
     45 //
     46 class VCDiffCodeTableWriter : public CodeTableWriterInterface {
     47  public:
     48   // This constructor uses the default code table.
     49   // If interleaved is true, the encoder writes each delta file window
     50   // by interleaving instructions and sizes with their corresponding
     51   // addresses and data, rather than placing these elements into three
     52   // separate sections.  This facilitates providing partially
     53   // decoded results when only a portion of a delta file window
     54   // is received (e.g. when HTTP over TCP is used as the
     55   // transmission protocol.)  The interleaved format is
     56   // not consistent with the VCDIFF draft standard.
     57   //
     58   explicit VCDiffCodeTableWriter(bool interleaved);
     59 
     60   // Uses a non-standard code table and non-standard cache sizes.  The caller
     61   // must guarantee that code_table_data remains allocated for the lifetime of
     62   // the VCDiffCodeTableWriter object.  Note that this is different from how
     63   // VCDiffCodeTableReader::UseCodeTable works.  It is assumed that a given
     64   // encoder will use either the default code table or a statically-defined
     65   // non-standard code table, whereas the decoder must have the ability to read
     66   // an arbitrary non-standard code table from a delta file and discard it once
     67   // the file has been decoded.
     68   //
     69   VCDiffCodeTableWriter(bool interleaved,
     70                         int near_cache_size,
     71                         int same_cache_size,
     72                         const VCDiffCodeTableData& code_table_data,
     73                         unsigned char max_mode);
     74 
     75   virtual ~VCDiffCodeTableWriter();
     76 
     77   // Initializes the constructed object for use.
     78   // This method must be called after a VCDiffCodeTableWriter is constructed
     79   // and before any of its other methods can be called.  It will return
     80   // false if there was an error initializing the object, or true if it
     81   // was successful.  After the object has been initialized and used,
     82   // Init() can be called again to restore the initial state of the object.
     83   //
     84   bool Init(size_t dictionary_size);
     85 
     86   virtual size_t target_length() const { return target_length_; }
     87 
     88   // Encode an ADD opcode with the "size" bytes starting at data
     89   virtual void Add(const char* data, size_t size);
     90 
     91   // Encode a COPY opcode with args "offset" (into dictionary) and "size" bytes.
     92   virtual void Copy(int32_t offset, size_t size);
     93 
     94   // Encode a RUN opcode for "size" copies of the value "byte".
     95   virtual void Run(size_t size, unsigned char byte);
     96 
     97   void AddChecksum(VCDChecksum checksum) {
     98     add_checksum_ = true;
     99     checksum_ = checksum;
    100   }
    101 
    102   // Finishes encoding and appends the encoded delta window to the output
    103   // string.  The output string is not null-terminated and may contain embedded
    104   // '\0' characters.
    105   virtual void Output(OutputStringInterface* out);
    106 
    107   const std::vector<int>& match_counts() const { return match_counts_; }
    108 
    109  private:
    110   typedef std::string string;
    111 
    112   // This is an estimate of the longest match size the encoder expects to find.
    113   // It is used to determine the initial size of the vector match_counts_.
    114   // If it is too large, then some space will be wasted on vector elements
    115   // that are not used.  If it is too small, then some time will be wasted
    116   // expanding match_counts_ to accommodate larger match sizes.
    117   static const size_t kMaxMatchSize = 2000;
    118 
    119   // The maximum value for the mode of a COPY instruction.
    120   const unsigned char max_mode_;
    121 
    122   // If interleaved is true, sets data_for_add_and_run_ and
    123   // addresses_for_copy_ to point at instructions_and_sizes_,
    124   // so that instructions, sizes, addresses and data will be
    125   // combined into a single interleaved stream.
    126   // If interleaved is false, sets data_for_add_and_run_ and
    127   // addresses_for_copy_ to point at their corresponding
    128   // separate_... strings, so that the three sections will
    129   // be generated separately from one another.
    130   //
    131   void InitSectionPointers(bool interleaved);
    132 
    133   // Determines the best opcode to encode an instruction, and appends
    134   // or substitutes that opcode and its size into the
    135   // instructions_and_sizes_ string.
    136   //
    137   void EncodeInstruction(VCDiffInstructionType inst,
    138                          size_t size,
    139                          unsigned char mode);
    140 
    141   void EncodeInstruction(VCDiffInstructionType inst, size_t size) {
    142     return EncodeInstruction(inst, size, 0);
    143   }
    144 
    145   // Calculates the number of bytes needed to store the given size value as a
    146   // variable-length integer (VarintBE).
    147   static size_t CalculateLengthOfSizeAsVarint(size_t size);
    148 
    149   // Appends the size value to the string as a variable-length integer.
    150   static void AppendSizeToString(size_t size, string* out);
    151 
    152   // Appends the size value to the output string as a variable-length integer.
    153   static void AppendSizeToOutputString(size_t size, OutputStringInterface* out);
    154 
    155   // Calculates the "Length of the delta encoding" field for the delta window
    156   // header, based on the sizes of the sections and of the other header
    157   // elements.
    158   size_t CalculateLengthOfTheDeltaEncoding() const;
    159 
    160   // None of the following 'string' objects are null-terminated.
    161 
    162   // A series of instruction opcodes, each of which may be followed
    163   // by one or two Varint values representing the size parameters
    164   // of the first and second instruction in the opcode.
    165   string instructions_and_sizes_;
    166 
    167   // A series of data arguments (byte values) used for ADD and RUN
    168   // instructions.  Depending on whether interleaved output is used
    169   // for streaming or not, the pointer may point to
    170   // separate_data_for_add_and_run_ or to instructions_and_sizes_.
    171   string *data_for_add_and_run_;
    172   string separate_data_for_add_and_run_;
    173 
    174   // A series of Varint addresses used for COPY instructions.
    175   // For the SAME mode, a byte value is stored instead of a Varint.
    176   // Depending on whether interleaved output is used
    177   // for streaming or not, the pointer may point to
    178   // separate_addresses_for_copy_ or to instructions_and_sizes_.
    179   string *addresses_for_copy_;
    180   string separate_addresses_for_copy_;
    181 
    182   VCDiffAddressCache address_cache_;
    183 
    184   size_t dictionary_size_;
    185 
    186   // The number of bytes of target data that has been encoded so far.
    187   // Each time Add(), Copy(), or Run() is called, this will be incremented.
    188   // The target length is used to compute HERE mode addresses
    189   // for COPY instructions, and is also written into the header
    190   // of the delta window when Output() is called.
    191   //
    192   size_t target_length_;
    193 
    194   const VCDiffCodeTableData* code_table_data_;
    195 
    196   // The instruction map facilitates finding an opcode quickly given an
    197   // instruction inst, size, and mode.  This is an alternate representation
    198   // of the same information that is found in code_table_data_.
    199   //
    200   const VCDiffInstructionMap* instruction_map_;
    201 
    202   // The zero-based index within instructions_and_sizes_ of the byte
    203   // that contains the last single-instruction opcode generated by
    204   // EncodeInstruction().  (See that function for exhaustive details.)
    205   // It is necessary to use an index rather than a pointer for this value
    206   // because instructions_and_sizes_ may be resized, which would invalidate
    207   // any pointers into its data buffer.  The value -1 is reserved to mean that
    208   // either no opcodes have been generated yet, or else the last opcode
    209   // generated was a double-instruction opcode.
    210   //
    211   int last_opcode_index_;
    212 
    213   // If true, an Adler32 checksum of the target window data will be written as
    214   // a variable-length integer, just after the size of the addresses section.
    215   //
    216   bool add_checksum_;
    217 
    218   // The checksum to be written to the current target window,
    219   // if add_checksum_ is true.
    220   // This will not be calculated based on the individual calls to Add(), Run(),
    221   // and Copy(), which would be unnecessarily expensive.  Instead, the code
    222   // that uses the VCDiffCodeTableWriter object is expected to calculate
    223   // the checksum all at once and to call AddChecksum() with that value.
    224   // Must be called sometime before calling Output(), though it can be called
    225   // either before or after the calls to Add(), Run(), and Copy().
    226   //
    227   VCDChecksum checksum_;
    228 
    229   // The value of match_counts_[n] is equal to the number of matches
    230   // of length n (that is, COPY instructions of size n) found so far.
    231   std::vector<int> match_counts_;
    232 
    233   // Making these private avoids implicit copy constructor & assignment operator
    234   VCDiffCodeTableWriter(const VCDiffCodeTableWriter&);  // NOLINT
    235   void operator=(const VCDiffCodeTableWriter&);
    236 };
    237 
    238 };  // namespace open_vcdiff
    239 
    240 #endif  // OPEN_VCDIFF_ENCODETABLE_H_
    241