Home | History | Annotate | Download | only in google
      1 // Copyright 2007 Google Inc.
      2 // Author: Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef OPEN_VCDIFF_VCENCODER_H_
     17 #define OPEN_VCDIFF_VCENCODER_H_
     18 
     19 #include <stddef.h>  // size_t
     20 #include "google/format_extension_flags.h"
     21 #include "google/output_string.h"
     22 
     23 namespace open_vcdiff {
     24 
     25 class VCDiffEngine;
     26 class VCDiffStreamingEncoderImpl;
     27 
     28 // A HashedDictionary must be constructed from the dictionary data
     29 // in order to use VCDiffStreamingEncoder.  If the same dictionary will
     30 // be used to perform several encoding operations, then the caller should
     31 // create the HashedDictionary once and cache it for reuse.  This object
     32 // is thread-safe: the same const HashedDictionary can be used
     33 // by several threads simultaneously, each with its own VCDiffStreamingEncoder.
     34 //
     35 // dictionary_contents is copied into the HashedDictionary, so the
     36 // caller may free that string, if desired, after the constructor returns.
     37 //
     38 class HashedDictionary {
     39  public:
     40   HashedDictionary(const char* dictionary_contents,
     41                    size_t dictionary_size);
     42   ~HashedDictionary();
     43 
     44   // Init() must be called before using the HashedDictionary as an argument
     45   // to the VCDiffStreamingEncoder, or for any other purpose except
     46   // destruction.  It returns true if initialization succeeded, or false
     47   // if an error occurred, in which case the caller should destroy the object
     48   // without using it.
     49   bool Init();
     50 
     51   const VCDiffEngine* engine() const { return engine_; }
     52 
     53  private:
     54   const VCDiffEngine* engine_;
     55 
     56   // Make the copy constructor and assignment operator private
     57   // so that they don't inadvertently get used.
     58   HashedDictionary(const HashedDictionary&);  // NOLINT
     59   void operator=(const HashedDictionary&);
     60 };
     61 
     62 // The standard streaming interface to the VCDIFF (RFC 3284) encoder.
     63 // "Streaming" in this context means that, even though the entire set of
     64 // input data to be encoded may not be available at once, the encoder
     65 // can produce partial output based on what is available.  Of course,
     66 // the caller should try to maximize the sizes of the data chunks passed
     67 // to the encoder.
     68 class VCDiffStreamingEncoder {
     69  public:
     70   // The HashedDictionary object passed to the constructor must remain valid,
     71   // without being deleted, for the lifetime of the VCDiffStreamingEncoder
     72   // object.
     73   //
     74   // format_extensions allows certain open-vcdiff extensions to the VCDIFF
     75   // format to be included in the encoded output.  These extensions are not
     76   // part of the RFC 3284 draft standard, so specifying any extension flags
     77   // will make the output compatible only with open-vcdiff, or with other
     78   // VCDIFF implementations that accept these extensions.  See above for an
     79   // explanation of each possible flag value.
     80   //
     81   // *** look_for_target_matches:
     82   // The VCDIFF format allows COPY instruction addresses to reference data from
     83   // the source (dictionary), or from previously encoded target data.
     84   //
     85   // If look_for_target_matches is false, then the encoder will only
     86   // produce COPY instructions that reference source data from the dictionary,
     87   // never from previously encoded target data.  This will speed up the encoding
     88   // process, but the encoded data will not be as compact.
     89   //
     90   // If this value is true, then the encoder will produce COPY instructions
     91   // that reference either source data or target data.  A COPY instruction from
     92   // the previously encoded target data may even extend into the range of the
     93   // data being produced by that same COPY instruction; for example, if the
     94   // previously encoded target data is "LA", then a single COPY instruction of
     95   // length 10 can produce the additional target data "LALALALALA".
     96   //
     97   // There is a third type of COPY instruction that starts within
     98   // the source data and extends from the end of the source data
     99   // into the beginning of the target data.  This VCDIFF encoder will never
    100   // produce a COPY instruction of this third type (regardless of the value of
    101   // look_for_target_matches) because the cost of checking for matches
    102   // across the source-target boundary would not justify its benefits.
    103   //
    104   VCDiffStreamingEncoder(const HashedDictionary* dictionary,
    105                          VCDiffFormatExtensionFlags format_extensions,
    106                          bool look_for_target_matches);
    107   ~VCDiffStreamingEncoder();
    108 
    109   // The client should use these routines as follows:
    110   //    HashedDictionary hd(dictionary, dictionary_size);
    111   //    if (!hd.Init()) {
    112   //      HandleError();
    113   //      return;
    114   //    }
    115   //    string output_string;
    116   //    VCDiffStreamingEncoder v(hd, false, false);
    117   //    if (!v.StartEncoding(&output_string)) {
    118   //      HandleError();
    119   //      return;  // No need to call FinishEncoding()
    120   //    }
    121   //    Process(output_string.data(), output_string.size());
    122   //    output_string.clear();
    123   //    while (get data_buf) {
    124   //      if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
    125   //        HandleError();
    126   //        return;  // No need to call FinishEncoding()
    127   //      }
    128   //      // The encoding is appended to output_string at each call,
    129   //      // so clear output_string once its contents have been processed.
    130   //      Process(output_string.data(), output_string.size());
    131   //      output_string.clear();
    132   //    }
    133   //    if (!v.FinishEncoding(&output_string)) {
    134   //      HandleError();
    135   //      return;
    136   //    }
    137   //    Process(output_string.data(), output_string.size());
    138   //    output_string.clear();
    139   //
    140   // I.e., the allowed pattern of calls is
    141   //    StartEncoding EncodeChunk* FinishEncoding
    142   //
    143   // The size of the encoded output depends on the sizes of the chunks
    144   // passed in (i.e. the chunking boundary affects compression).
    145   // However the decoded output is independent of chunk boundaries.
    146 
    147   // Sets up the data structures for encoding.
    148   // Writes a VCDIFF delta file header (as defined in RFC section 4.1)
    149   // to *output_string.
    150   //
    151   // Note: we *append*, so the old contents of *output_string stick around.
    152   // This convention differs from the non-streaming Encode/Decode
    153   // interfaces in VCDiffEncoder.
    154   //
    155   // If an error occurs, this function returns false; otherwise it returns true.
    156   // If this function returns false, the caller does not need to call
    157   // FinishEncoding or to do any cleanup except destroying the
    158   // VCDiffStreamingEncoder object.
    159   template<class OutputType>
    160   bool StartEncoding(OutputType* output) {
    161     OutputString<OutputType> output_string(output);
    162     return StartEncodingToInterface(&output_string);
    163   }
    164 
    165   bool StartEncodingToInterface(OutputStringInterface* output_string);
    166 
    167   // Appends compressed encoding for "data" (one complete VCDIFF delta window)
    168   // to *output_string.
    169   // If an error occurs (for example, if StartEncoding was not called
    170   // earlier or StartEncoding returned false), this function returns false;
    171   // otherwise it returns true.  The caller does not need to call FinishEncoding
    172   // or do any cleanup except destroying the VCDiffStreamingEncoder
    173   // if this function returns false.
    174   template<class OutputType>
    175   bool EncodeChunk(const char* data, size_t len, OutputType* output) {
    176     OutputString<OutputType> output_string(output);
    177     return EncodeChunkToInterface(data, len, &output_string);
    178   }
    179 
    180   bool EncodeChunkToInterface(const char* data, size_t len,
    181                               OutputStringInterface* output_string);
    182 
    183   // Finishes encoding and appends any leftover encoded data to *output_string.
    184   // If an error occurs (for example, if StartEncoding was not called
    185   // earlier or StartEncoding returned false), this function returns false;
    186   // otherwise it returns true.  The caller does not need to
    187   // do any cleanup except destroying the VCDiffStreamingEncoder
    188   // if this function returns false.
    189   template<class OutputType>
    190   bool FinishEncoding(OutputType* output) {
    191     OutputString<OutputType> output_string(output);
    192     return FinishEncodingToInterface(&output_string);
    193   }
    194 
    195   bool FinishEncodingToInterface(OutputStringInterface* output_string);
    196 
    197  private:
    198   VCDiffStreamingEncoderImpl* const impl_;
    199 
    200   // Make the copy constructor and assignment operator private
    201   // so that they don't inadvertently get used.
    202   VCDiffStreamingEncoder(const VCDiffStreamingEncoder&);  // NOLINT
    203   void operator=(const VCDiffStreamingEncoder&);
    204 };
    205 
    206 // A simpler (non-streaming) interface to the VCDIFF encoder that can be used
    207 // if the entire target data string is available.
    208 //
    209 class VCDiffEncoder {
    210  public:
    211   VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
    212       : dictionary_(dictionary_contents, dictionary_size),
    213         encoder_(NULL),
    214         flags_(VCD_STANDARD_FORMAT),
    215         look_for_target_matches_(true) { }
    216 
    217   ~VCDiffEncoder() {
    218     delete encoder_;
    219   }
    220 
    221   // By default, VCDiffEncoder uses standard VCDIFF format.  This function
    222   // can be used before calling Encode(), to specify that interleaved format
    223   // and/or checksum format should be used.
    224   void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }
    225 
    226   // By default, VCDiffEncoder looks for matches in the dictionary and also in
    227   // the previously encoded target data.  This function can be used before
    228   // calling Encode(), to specify whether or not target matching should be
    229   // enabled.
    230   void SetTargetMatching(bool look_for_target_matches) {
    231     look_for_target_matches_ = look_for_target_matches;
    232   }
    233 
    234   // Replaces old contents of output_string with the encoded form of
    235   // target_data.
    236   template<class OutputType>
    237   bool Encode(const char* target_data,
    238               size_t target_len,
    239               OutputType* output) {
    240     OutputString<OutputType> output_string(output);
    241     return EncodeToInterface(target_data, target_len, &output_string);
    242   }
    243 
    244  private:
    245   bool EncodeToInterface(const char* target_data,
    246                          size_t target_len,
    247                          OutputStringInterface* output_string);
    248 
    249   HashedDictionary dictionary_;
    250   VCDiffStreamingEncoder* encoder_;
    251   VCDiffFormatExtensionFlags flags_;
    252   bool look_for_target_matches_;
    253 
    254   // Make the copy constructor and assignment operator private
    255   // so that they don't inadvertently get used.
    256   VCDiffEncoder(const VCDiffEncoder&);  // NOLINT
    257   void operator=(const VCDiffEncoder&);
    258 };
    259 
    260 }  // namespace open_vcdiff
    261 
    262 #endif  // OPEN_VCDIFF_VCENCODER_H_
    263