Home | History | Annotate | Download | only in google
      1 // Copyright 2007 Google Inc.
      2 // Author: Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef OPEN_VCDIFF_VCENCODER_H_
     17 #define OPEN_VCDIFF_VCENCODER_H_
     18 
     19 #include <stddef.h>  // size_t
     20 #include <vector>
     21 #include "google/output_string.h"
     22 
     23 namespace open_vcdiff {
     24 
     25 class VCDiffEngine;
     26 class VCDiffStreamingEncoderImpl;
     27 
     28 // These flags are passed to the constructor of VCDiffStreamingEncoder
     29 // to determine whether certain open-vcdiff format extensions
     30 // (which are not part of the RFC 3284 draft standard for VCDIFF)
     31 // are employed.
     32 //
     33 // Because these extensions are not part of the VCDIFF standard, if
     34 // any of these flags except VCD_STANDARD_FORMAT is specified, then the caller
     35 // must be certain that the receiver of the data will be using open-vcdiff
     36 // to decode the delta file, or at least that the receiver can interpret
     37 // these extensions.  The encoder will use an 'S' as the fourth character
     38 // in the delta file to indicate that non-standard extensions are being used.
     39 //
     40 enum VCDiffFormatExtensionFlagValues {
     41   // No extensions (no flag bits set): the encoded format will conform
     42   // to the RFC 3284 draft standard for VCDIFF.
     43   VCD_STANDARD_FORMAT = 0x00,
     44   // If this flag is specified, then the encoder writes each delta file
     45   // window by interleaving instructions and sizes with their corresponding
     46   // addresses and data, rather than placing these elements
     47   // into three separate sections.  This facilitates providing partially
     48   // decoded results when only a portion of a delta file window is received
     49   // (e.g. when HTTP over TCP is used as the transmission protocol).
     50   VCD_FORMAT_INTERLEAVED = 0x01,
     51   // If this flag is specified, then an Adler32 checksum
     52   // of the target window data is included in the delta window.
     53   VCD_FORMAT_CHECKSUM = 0x02
     54 };
     55 
        // Integer type in which VCDiffFormatExtensionFlagValues are passed, for
        // example to the VCDiffStreamingEncoder constructor and to
        // VCDiffEncoder::SetFormatFlags().  The two extension values occupy
        // distinct bits (0x01 and 0x02).
        // NOTE(review): presumably several extensions are requested by combining
        // their values with bitwise OR -- confirm against the implementation.
     56 typedef int VCDiffFormatExtensionFlags;
     57 
     58 // A HashedDictionary must be constructed from the dictionary data
     59 // in order to use VCDiffStreamingEncoder.  If the same dictionary will
     60 // be used to perform several encoding operations, then the caller should
     61 // create the HashedDictionary once and cache it for reuse.  This object
     62 // is thread-safe: the same const HashedDictionary can be used
     63 // by several threads simultaneously, each with its own VCDiffStreamingEncoder.
     64 //
     65 // dictionary_contents is copied into the HashedDictionary, so the
     66 // caller may free that string, if desired, after the constructor returns.
     67 //
     68 class HashedDictionary {
     69  public:
          // Constructs a HashedDictionary from the data at dictionary_contents,
          // whose length is given by dictionary_size (presumably in bytes).
          // The data is copied, so the caller keeps ownership of the original
          // buffer.  The object must not be used until Init() has been called.
     70   HashedDictionary(const char* dictionary_contents,
     71                    size_t dictionary_size);
     72   ~HashedDictionary();
     73 
     74   // Init() must be called before using the HashedDictionary as an argument
     75   // to the VCDiffStreamingEncoder, or for any other purpose except
     76   // destruction.  It returns true if initialization succeeded, or false
     77   // if an error occurred, in which case the caller should destroy the object
     78   // without using it.
     79   bool Init();
     80 
          // Returns the VCDiffEngine pointer held by this object.  Like every
          // other use of the object, this is only valid after Init() has
          // returned true.
     81   const VCDiffEngine* engine() const { return engine_; }
     82 
     83  private:
          // The engine exposed through engine().  VCDiffEngine is only
          // forward-declared in this header.
          // NOTE(review): presumably created by Init() from the copied dictionary
          // data and released by the destructor -- confirm in the implementation.
     84   const VCDiffEngine* engine_;
     85 
     86   // Make the copy constructor and assignment operator private
     87   // so that they don't inadvertently get used.
     88   HashedDictionary(const HashedDictionary&);  // NOLINT
     89   void operator=(const HashedDictionary&);
     90 };
     91 
     92 // The standard streaming interface to the VCDIFF (RFC 3284) encoder.
     93 // "Streaming" in this context means that, even though the entire set of
     94 // input data to be encoded may not be available at once, the encoder
     95 // can produce partial output based on what is available.  Of course,
     96 // the caller should try to maximize the sizes of the data chunks passed
     97 // to the encoder.
     98 class VCDiffStreamingEncoder {
     99  public:
    100   // The HashedDictionary object passed to the constructor must remain valid,
    101   // without being deleted, for the lifetime of the VCDiffStreamingEncoder
    102   // object.
    103   //
    104   // format_extensions allows certain open-vcdiff extensions to the VCDIFF
    105   // format to be included in the encoded output.  These extensions are not
    106   // part of the RFC 3284 draft standard, so specifying any extension flags
    107   // will make the output compatible only with open-vcdiff, or with other
    108   // VCDIFF implementations that accept these extensions.  See above for an
    109   // explanation of each possible flag value.
    110   //
    111   // *** look_for_target_matches:
    112   // The VCDIFF format allows COPY instruction addresses to reference data from
    113   // the source (dictionary), or from previously encoded target data.
    114   //
    115   // If look_for_target_matches is false, then the encoder will only
    116   // produce COPY instructions that reference source data from the dictionary,
    117   // never from previously encoded target data.  This will speed up the encoding
    118   // process, but the encoded data will not be as compact.
    119   //
    120   // If this value is true, then the encoder will produce COPY instructions
    121   // that reference either source data or target data.  A COPY instruction from
    122   // the previously encoded target data may even extend into the range of the
    123   // data being produced by that same COPY instruction; for example, if the
    124   // previously encoded target data is "LA", then a single COPY instruction of
    125   // length 10 can produce the additional target data "LALALALALA".
    126   //
    127   // There is a third type of COPY instruction that starts within
    128   // the source data and extends from the end of the source data
    129   // into the beginning of the target data.  This VCDIFF encoder will never
    130   // produce a COPY instruction of this third type (regardless of the value of
    131   // look_for_target_matches) because the cost of checking for matches
    132   // across the source-target boundary would not justify its benefits.
    133   //
    134   VCDiffStreamingEncoder(const HashedDictionary* dictionary,
    135                          VCDiffFormatExtensionFlags format_extensions,
    136                          bool look_for_target_matches);
    137   ~VCDiffStreamingEncoder();
    138 
    139   // The client should use these routines as follows:
    140   //    HashedDictionary hd(dictionary, dictionary_size);
    141   //    if (!hd.Init()) {
    142   //      HandleError();
    143   //      return;
    144   //    }
    145   //    string output_string;
    146   //    VCDiffStreamingEncoder v(&hd, VCD_STANDARD_FORMAT, false);
    147   //    if (!v.StartEncoding(&output_string)) {
    148   //      HandleError();
    149   //      return;  // No need to call FinishEncoding()
    150   //    }
    151   //    Process(output_string.data(), output_string.size());
    152   //    output_string.clear();
    153   //    while (get data_buf) {
    154   //      if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
    155   //        HandleError();
    156   //        return;  // No need to call FinishEncoding()
    157   //      }
    158   //      // The encoding is appended to output_string at each call,
    159   //      // so clear output_string once its contents have been processed.
    160   //      Process(output_string.data(), output_string.size());
    161   //      output_string.clear();
    162   //    }
    163   //    if (!v.FinishEncoding(&output_string)) {
    164   //      HandleError();
    165   //      return;
    166   //    }
    167   //    Process(output_string.data(), output_string.size());
    168   //    output_string.clear();
    169   //
    170   // I.e., the allowed pattern of calls is
    171   //    StartEncoding EncodeChunk* FinishEncoding
    172   //
    173   // The size of the encoded output depends on the sizes of the chunks
    174   // passed in (i.e. the chunking boundary affects compression).
    175   // However the decoded output is independent of chunk boundaries.
    176 
    177   // Sets up the data structures for encoding.
    178   // Writes a VCDIFF delta file header (as defined in RFC section 4.1)
    179   // to *output_string.
    180   //
    181   // Note: we *append*, so the old contents of *output_string stick around.
    182   // This convention differs from the non-streaming Encode/Decode
    183   // interfaces in VCDiffEncoder.
    184   //
    185   // If an error occurs, this function returns false; otherwise it returns true.
    186   // If this function returns false, the caller does not need to call
    187   // FinishEncoding or to do any cleanup except destroying the
    188   // VCDiffStreamingEncoder object.
    189   template<class OutputType>
    190   bool StartEncoding(OutputType* output) {
    191     OutputString<OutputType> output_string(output);
    192     return StartEncodingToInterface(&output_string);
    193   }
    194 
          // Non-template form of StartEncoding().  The template above wraps its
          // output argument in an OutputString<OutputType> (the "output_string"
          // that the comments refer to) and forwards to this function, so the
          // return value and error contract are the ones described there.
    195   bool StartEncodingToInterface(OutputStringInterface* output_string);
    196 
    197   // Appends compressed encoding for "data" (one complete VCDIFF delta window)
    198   // to *output_string.
    199   // If an error occurs (for example, if StartEncoding was not called
    200   // earlier or StartEncoding returned false), this function returns false;
    201   // otherwise it returns true.  The caller does not need to call FinishEncoding
    202   // or do any cleanup except destroying the VCDiffStreamingEncoder
    203   // if this function returns false.
    204   template<class OutputType>
    205   bool EncodeChunk(const char* data, size_t len, OutputType* output) {
    206     OutputString<OutputType> output_string(output);
    207     return EncodeChunkToInterface(data, len, &output_string);
    208   }
    209 
          // Non-template form of EncodeChunk().  The template above wraps its
          // output argument in an OutputString<OutputType> and forwards data, len
          // and the wrapper to this function.
    210   bool EncodeChunkToInterface(const char* data, size_t len,
    211                               OutputStringInterface* output_string);
    212 
    213   // Finishes encoding and appends any leftover encoded data to *output_string.
    214   // If an error occurs (for example, if StartEncoding was not called
    215   // earlier or StartEncoding returned false), this function returns false;
    216   // otherwise it returns true.  The caller does not need to
    217   // do any cleanup except destroying the VCDiffStreamingEncoder
    218   // if this function returns false.
    219   template<class OutputType>
    220   bool FinishEncoding(OutputType* output) {
    221     OutputString<OutputType> output_string(output);
    222     return FinishEncodingToInterface(&output_string);
    223   }
    224 
          // Non-template form of FinishEncoding().  The template above wraps its
          // output argument in an OutputString<OutputType> and forwards to this
          // function.
    225   bool FinishEncodingToInterface(OutputStringInterface* output_string);
    226 
    227   // Replaces the contents of match_counts with a vector of integers,
    228   // one for each possible match length.  The value of match_counts[n]
    229   // is equal to the number of matches of length n found so far
    230   // for this VCDiffStreamingEncoder object.
    231   void GetMatchCounts(std::vector<int>* match_counts) const;
    232 
    233  private:
          // Pointer to the implementation object.  The pointer itself is const, so
          // it cannot be reseated after construction.  VCDiffStreamingEncoderImpl
          // is only forward-declared in this header.
          // NOTE(review): presumably allocated by the constructor and deleted by
          // the destructor -- confirm in the implementation file.
    234   VCDiffStreamingEncoderImpl* const impl_;
    235 
    236   // Make the copy constructor and assignment operator private
    237   // so that they don't inadvertently get used.
    238   VCDiffStreamingEncoder(const VCDiffStreamingEncoder&);  // NOLINT
    239   void operator=(const VCDiffStreamingEncoder&);
    240 };
    241 
    242 // A simpler (non-streaming) interface to the VCDIFF encoder that can be used
    243 // if the entire target data string is available.
    244 //
    245 class VCDiffEncoder {
    246  public:
          // Builds the internal HashedDictionary from dictionary_contents and
          // dictionary_size.  No streaming encoder exists yet (encoder_ is NULL),
          // the format defaults to VCD_STANDARD_FORMAT, and target matching
          // defaults to enabled.
    247   VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
    248       : dictionary_(dictionary_contents, dictionary_size),
    249         encoder_(NULL),
    250         flags_(VCD_STANDARD_FORMAT),
    251         look_for_target_matches_(true) { }
    252 
          // Deletes the streaming encoder, if one was created.  Deleting a NULL
          // encoder_ is a no-op.
    253   ~VCDiffEncoder() {
    254     delete encoder_;
    255   }
    256 
    257   // By default, VCDiffEncoder uses standard VCDIFF format.  This function
    258   // can be used before calling Encode(), to specify that interleaved format
    259   // and/or checksum format should be used.
    260   void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }
    261 
    262   // By default, VCDiffEncoder looks for matches in the dictionary and also in
    263   // the previously encoded target data.  This function can be used before
    264   // calling Encode(), to specify whether or not target matching should be
    265   // enabled.
    266   void SetTargetMatching(bool look_for_target_matches) {
    267     look_for_target_matches_ = look_for_target_matches;
    268   }
    269 
    270   // Replaces the old contents of *output with the encoded form of the
    271   // target_len bytes starting at target_data.  The output argument is
          // wrapped in an OutputString<OutputType> and passed to
          // EncodeToInterface(), whose result is returned.
          // NOTE(review): presumably true means success and false means an error,
          // as in VCDiffStreamingEncoder -- confirm in the implementation.
    272   template<class OutputType>
    273   bool Encode(const char* target_data,
    274               size_t target_len,
    275               OutputType* output) {
    276     OutputString<OutputType> output_string(output);
    277     return EncodeToInterface(target_data, target_len, &output_string);
    278   }
    279 
    280  private:
          // Non-template worker behind Encode(); it is declared here and defined
          // outside this header.
    281   bool EncodeToInterface(const char* target_data,
    282                          size_t target_len,
    283                          OutputStringInterface* output_string);
    284 
          // Dictionary built from the constructor arguments.
    285   HashedDictionary dictionary_;
          // Owned streaming encoder: NULL after construction, deleted by the
          // destructor.
          // NOTE(review): presumably created on first use by EncodeToInterface()
          // from dictionary_, flags_ and look_for_target_matches_ -- confirm.
    286   VCDiffStreamingEncoder* encoder_;
          // Format extension flags set by SetFormatFlags().
    287   VCDiffFormatExtensionFlags flags_;
          // Target-matching option set by SetTargetMatching().
    288   bool look_for_target_matches_;
    289 
    290   // Make the copy constructor and assignment operator private
    291   // so that they don't inadvertently get used.
    292   VCDiffEncoder(const VCDiffEncoder&);  // NOLINT
    293   void operator=(const VCDiffEncoder&);
    294 };
    295 
    296 }  // namespace open_vcdiff
    297 
    298 #endif  // OPEN_VCDIFF_VCENCODER_H_
    299