1 // Copyright 2007 Google Inc. 2 // Author: Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef OPEN_VCDIFF_VCENCODER_H_ 17 #define OPEN_VCDIFF_VCENCODER_H_ 18 19 #include <stddef.h> // size_t 20 #include "google/format_extension_flags.h" 21 #include "google/output_string.h" 22 23 namespace open_vcdiff { 24 25 class VCDiffEngine; 26 class VCDiffStreamingEncoderImpl; 27 28 // A HashedDictionary must be constructed from the dictionary data 29 // in order to use VCDiffStreamingEncoder. If the same dictionary will 30 // be used to perform several encoding operations, then the caller should 31 // create the HashedDictionary once and cache it for reuse. This object 32 // is thread-safe: the same const HashedDictionary can be used 33 // by several threads simultaneously, each with its own VCDiffStreamingEncoder. 34 // 35 // dictionary_contents is copied into the HashedDictionary, so the 36 // caller may free that string, if desired, after the constructor returns. 37 // 38 class HashedDictionary { 39 public: 40 HashedDictionary(const char* dictionary_contents, 41 size_t dictionary_size); 42 ~HashedDictionary(); 43 44 // Init() must be called before using the HashedDictionary as an argument 45 // to the VCDiffStreamingEncoder, or for any other purpose except 46 // destruction. It returns true if initialization succeeded, or false 47 // if an error occurred, in which case the caller should destroy the object 48 // without using it. 49 bool Init(); 50 51 const VCDiffEngine* engine() const { return engine_; } 52 53 private: 54 const VCDiffEngine* engine_; 55 56 // Make the copy constructor and assignment operator private 57 // so that they don't inadvertently get used. 58 HashedDictionary(const HashedDictionary&); // NOLINT 59 void operator=(const HashedDictionary&); 60 }; 61 62 // The standard streaming interface to the VCDIFF (RFC 3284) encoder. 63 // "Streaming" in this context means that, even though the entire set of 64 // input data to be encoded may not be available at once, the encoder 65 // can produce partial output based on what is available. Of course, 66 // the caller should try to maximize the sizes of the data chunks passed 67 // to the encoder. 68 class VCDiffStreamingEncoder { 69 public: 70 // The HashedDictionary object passed to the constructor must remain valid, 71 // without being deleted, for the lifetime of the VCDiffStreamingEncoder 72 // object. 73 // 74 // format_extensions allows certain open-vcdiff extensions to the VCDIFF 75 // format to be included in the encoded output. These extensions are not 76 // part of the RFC 3284 draft standard, so specifying any extension flags 77 // will make the output compatible only with open-vcdiff, or with other 78 // VCDIFF implementations that accept these extensions. See above for an 79 // explanation of each possible flag value. 80 // 81 // *** look_for_target_matches: 82 // The VCDIFF format allows COPY instruction addresses to reference data from 83 // the source (dictionary), or from previously encoded target data. 84 // 85 // If look_for_target_matches is false, then the encoder will only 86 // produce COPY instructions that reference source data from the dictionary, 87 // never from previously encoded target data. This will speed up the encoding 88 // process, but the encoded data will not be as compact. 89 // 90 // If this value is true, then the encoder will produce COPY instructions 91 // that reference either source data or target data. A COPY instruction from 92 // the previously encoded target data may even extend into the range of the 93 // data being produced by that same COPY instruction; for example, if the 94 // previously encoded target data is "LA", then a single COPY instruction of 95 // length 10 can produce the additional target data "LALALALALA". 96 // 97 // There is a third type of COPY instruction that starts within 98 // the source data and extends from the end of the source data 99 // into the beginning of the target data. This VCDIFF encoder will never 100 // produce a COPY instruction of this third type (regardless of the value of 101 // look_for_target_matches) because the cost of checking for matches 102 // across the source-target boundary would not justify its benefits. 103 // 104 VCDiffStreamingEncoder(const HashedDictionary* dictionary, 105 VCDiffFormatExtensionFlags format_extensions, 106 bool look_for_target_matches); 107 ~VCDiffStreamingEncoder(); 108 109 // The client should use these routines as follows: 110 // HashedDictionary hd(dictionary, dictionary_size); 111 // if (!hd.Init()) { 112 // HandleError(); 113 // return; 114 // } 115 // string output_string; 116 // VCDiffStreamingEncoder v(hd, false, false); 117 // if (!v.StartEncoding(&output_string)) { 118 // HandleError(); 119 // return; // No need to call FinishEncoding() 120 // } 121 // Process(output_string.data(), output_string.size()); 122 // output_string.clear(); 123 // while (get data_buf) { 124 // if (!v.EncodeChunk(data_buf, data_len, &output_string)) { 125 // HandleError(); 126 // return; // No need to call FinishEncoding() 127 // } 128 // // The encoding is appended to output_string at each call, 129 // // so clear output_string once its contents have been processed. 130 // Process(output_string.data(), output_string.size()); 131 // output_string.clear(); 132 // } 133 // if (!v.FinishEncoding(&output_string)) { 134 // HandleError(); 135 // return; 136 // } 137 // Process(output_string.data(), output_string.size()); 138 // output_string.clear(); 139 // 140 // I.e., the allowed pattern of calls is 141 // StartEncoding EncodeChunk* FinishEncoding 142 // 143 // The size of the encoded output depends on the sizes of the chunks 144 // passed in (i.e. the chunking boundary affects compression). 145 // However the decoded output is independent of chunk boundaries. 146 147 // Sets up the data structures for encoding. 148 // Writes a VCDIFF delta file header (as defined in RFC section 4.1) 149 // to *output_string. 150 // 151 // Note: we *append*, so the old contents of *output_string stick around. 152 // This convention differs from the non-streaming Encode/Decode 153 // interfaces in VCDiffEncoder. 154 // 155 // If an error occurs, this function returns false; otherwise it returns true. 156 // If this function returns false, the caller does not need to call 157 // FinishEncoding or to do any cleanup except destroying the 158 // VCDiffStreamingEncoder object. 159 template<class OutputType> 160 bool StartEncoding(OutputType* output) { 161 OutputString<OutputType> output_string(output); 162 return StartEncodingToInterface(&output_string); 163 } 164 165 bool StartEncodingToInterface(OutputStringInterface* output_string); 166 167 // Appends compressed encoding for "data" (one complete VCDIFF delta window) 168 // to *output_string. 169 // If an error occurs (for example, if StartEncoding was not called 170 // earlier or StartEncoding returned false), this function returns false; 171 // otherwise it returns true. The caller does not need to call FinishEncoding 172 // or do any cleanup except destroying the VCDiffStreamingEncoder 173 // if this function returns false. 174 template<class OutputType> 175 bool EncodeChunk(const char* data, size_t len, OutputType* output) { 176 OutputString<OutputType> output_string(output); 177 return EncodeChunkToInterface(data, len, &output_string); 178 } 179 180 bool EncodeChunkToInterface(const char* data, size_t len, 181 OutputStringInterface* output_string); 182 183 // Finishes encoding and appends any leftover encoded data to *output_string. 184 // If an error occurs (for example, if StartEncoding was not called 185 // earlier or StartEncoding returned false), this function returns false; 186 // otherwise it returns true. The caller does not need to 187 // do any cleanup except destroying the VCDiffStreamingEncoder 188 // if this function returns false. 189 template<class OutputType> 190 bool FinishEncoding(OutputType* output) { 191 OutputString<OutputType> output_string(output); 192 return FinishEncodingToInterface(&output_string); 193 } 194 195 bool FinishEncodingToInterface(OutputStringInterface* output_string); 196 197 private: 198 VCDiffStreamingEncoderImpl* const impl_; 199 200 // Make the copy constructor and assignment operator private 201 // so that they don't inadvertently get used. 202 VCDiffStreamingEncoder(const VCDiffStreamingEncoder&); // NOLINT 203 void operator=(const VCDiffStreamingEncoder&); 204 }; 205 206 // A simpler (non-streaming) interface to the VCDIFF encoder that can be used 207 // if the entire target data string is available. 208 // 209 class VCDiffEncoder { 210 public: 211 VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size) 212 : dictionary_(dictionary_contents, dictionary_size), 213 encoder_(NULL), 214 flags_(VCD_STANDARD_FORMAT), 215 look_for_target_matches_(true) { } 216 217 ~VCDiffEncoder() { 218 delete encoder_; 219 } 220 221 // By default, VCDiffEncoder uses standard VCDIFF format. This function 222 // can be used before calling Encode(), to specify that interleaved format 223 // and/or checksum format should be used. 224 void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; } 225 226 // By default, VCDiffEncoder looks for matches in the dictionary and also in 227 // the previously encoded target data. This function can be used before 228 // calling Encode(), to specify whether or not target matching should be 229 // enabled. 230 void SetTargetMatching(bool look_for_target_matches) { 231 look_for_target_matches_ = look_for_target_matches; 232 } 233 234 // Replaces old contents of output_string with the encoded form of 235 // target_data. 236 template<class OutputType> 237 bool Encode(const char* target_data, 238 size_t target_len, 239 OutputType* output) { 240 OutputString<OutputType> output_string(output); 241 return EncodeToInterface(target_data, target_len, &output_string); 242 } 243 244 private: 245 bool EncodeToInterface(const char* target_data, 246 size_t target_len, 247 OutputStringInterface* output_string); 248 249 HashedDictionary dictionary_; 250 VCDiffStreamingEncoder* encoder_; 251 VCDiffFormatExtensionFlags flags_; 252 bool look_for_target_matches_; 253 254 // Make the copy constructor and assignment operator private 255 // so that they don't inadvertently get used. 256 VCDiffEncoder(const VCDiffEncoder&); // NOLINT 257 void operator=(const VCDiffEncoder&); 258 }; 259 260 } // namespace open_vcdiff 261 262 #endif // OPEN_VCDIFF_VCENCODER_H_ 263