1 // Copyright 2008 Google Inc. 2 // Author: Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef OPEN_VCDIFF_ENCODETABLE_H_ 17 #define OPEN_VCDIFF_ENCODETABLE_H_ 18 19 #include <config.h> 20 #include <stddef.h> // size_t 21 #include <stdint.h> // int32_t 22 #include <string> 23 #include <vector> 24 #include "addrcache.h" 25 #include "checksum.h" 26 #include "codetable.h" 27 #include "codetablewriter_interface.h" 28 29 namespace open_vcdiff { 30 31 class OutputStringInterface; 32 class VCDiffInstructionMap; 33 34 // The method calls after construction *must* conform 35 // to the following pattern: 36 // {{Add|Copy|Run}* [AddChecksum] Output}* 37 // 38 // When Output has been called in this sequence, a complete target window 39 // (as defined in RFC 3284 section 4.3) will have been appended to 40 // out (unless no calls to Add, Run, or Copy were made, in which 41 // case Output will do nothing.) The output will not be available for use 42 // until after each call to Output(). 43 // 44 // NOT threadsafe. 45 // 46 class VCDiffCodeTableWriter : public CodeTableWriterInterface { 47 public: 48 // This constructor uses the default code table. 49 // If interleaved is true, the encoder writes each delta file window 50 // by interleaving instructions and sizes with their corresponding 51 // addresses and data, rather than placing these elements into three 52 // separate sections. This facilitates providing partially 53 // decoded results when only a portion of a delta file window 54 // is received (e.g. when HTTP over TCP is used as the 55 // transmission protocol.) The interleaved format is 56 // not consistent with the VCDIFF draft standard. 57 // 58 explicit VCDiffCodeTableWriter(bool interleaved); 59 60 // Uses a non-standard code table and non-standard cache sizes. The caller 61 // must guarantee that code_table_data remains allocated for the lifetime of 62 // the VCDiffCodeTableWriter object. Note that this is different from how 63 // VCDiffCodeTableReader::UseCodeTable works. It is assumed that a given 64 // encoder will use either the default code table or a statically-defined 65 // non-standard code table, whereas the decoder must have the ability to read 66 // an arbitrary non-standard code table from a delta file and discard it once 67 // the file has been decoded. 68 // 69 VCDiffCodeTableWriter(bool interleaved, 70 int near_cache_size, 71 int same_cache_size, 72 const VCDiffCodeTableData& code_table_data, 73 unsigned char max_mode); 74 75 virtual ~VCDiffCodeTableWriter(); 76 77 // Initializes the constructed object for use. 78 // This method must be called after a VCDiffCodeTableWriter is constructed 79 // and before any of its other methods can be called. It will return 80 // false if there was an error initializing the object, or true if it 81 // was successful. After the object has been initialized and used, 82 // Init() can be called again to restore the initial state of the object. 83 // 84 bool Init(size_t dictionary_size); 85 86 virtual size_t target_length() const { return target_length_; } 87 88 // Encode an ADD opcode with the "size" bytes starting at data 89 virtual void Add(const char* data, size_t size); 90 91 // Encode a COPY opcode with args "offset" (into dictionary) and "size" bytes. 92 virtual void Copy(int32_t offset, size_t size); 93 94 // Encode a RUN opcode for "size" copies of the value "byte". 95 virtual void Run(size_t size, unsigned char byte); 96 97 void AddChecksum(VCDChecksum checksum) { 98 add_checksum_ = true; 99 checksum_ = checksum; 100 } 101 102 // Finishes encoding and appends the encoded delta window to the output 103 // string. The output string is not null-terminated and may contain embedded 104 // '\0' characters. 105 virtual void Output(OutputStringInterface* out); 106 107 const std::vector<int>& match_counts() const { return match_counts_; } 108 109 private: 110 typedef std::string string; 111 112 // This is an estimate of the longest match size the encoder expects to find. 113 // It is used to determine the initial size of the vector match_counts_. 114 // If it is too large, then some space will be wasted on vector elements 115 // that are not used. If it is too small, then some time will be wasted 116 // expanding match_counts_ to accommodate larger match sizes. 117 static const size_t kMaxMatchSize = 2000; 118 119 // The maximum value for the mode of a COPY instruction. 120 const unsigned char max_mode_; 121 122 // If interleaved is true, sets data_for_add_and_run_ and 123 // addresses_for_copy_ to point at instructions_and_sizes_, 124 // so that instructions, sizes, addresses and data will be 125 // combined into a single interleaved stream. 126 // If interleaved is false, sets data_for_add_and_run_ and 127 // addresses_for_copy_ to point at their corresponding 128 // separate_... strings, so that the three sections will 129 // be generated separately from one another. 130 // 131 void InitSectionPointers(bool interleaved); 132 133 // Determines the best opcode to encode an instruction, and appends 134 // or substitutes that opcode and its size into the 135 // instructions_and_sizes_ string. 136 // 137 void EncodeInstruction(VCDiffInstructionType inst, 138 size_t size, 139 unsigned char mode); 140 141 void EncodeInstruction(VCDiffInstructionType inst, size_t size) { 142 return EncodeInstruction(inst, size, 0); 143 } 144 145 // Calculates the number of bytes needed to store the given size value as a 146 // variable-length integer (VarintBE). 147 static size_t CalculateLengthOfSizeAsVarint(size_t size); 148 149 // Appends the size value to the string as a variable-length integer. 150 static void AppendSizeToString(size_t size, string* out); 151 152 // Appends the size value to the output string as a variable-length integer. 153 static void AppendSizeToOutputString(size_t size, OutputStringInterface* out); 154 155 // Calculates the "Length of the delta encoding" field for the delta window 156 // header, based on the sizes of the sections and of the other header 157 // elements. 158 size_t CalculateLengthOfTheDeltaEncoding() const; 159 160 // None of the following 'string' objects are null-terminated. 161 162 // A series of instruction opcodes, each of which may be followed 163 // by one or two Varint values representing the size parameters 164 // of the first and second instruction in the opcode. 165 string instructions_and_sizes_; 166 167 // A series of data arguments (byte values) used for ADD and RUN 168 // instructions. Depending on whether interleaved output is used 169 // for streaming or not, the pointer may point to 170 // separate_data_for_add_and_run_ or to instructions_and_sizes_. 171 string *data_for_add_and_run_; 172 string separate_data_for_add_and_run_; 173 174 // A series of Varint addresses used for COPY instructions. 175 // For the SAME mode, a byte value is stored instead of a Varint. 176 // Depending on whether interleaved output is used 177 // for streaming or not, the pointer may point to 178 // separate_addresses_for_copy_ or to instructions_and_sizes_. 179 string *addresses_for_copy_; 180 string separate_addresses_for_copy_; 181 182 VCDiffAddressCache address_cache_; 183 184 size_t dictionary_size_; 185 186 // The number of bytes of target data that has been encoded so far. 187 // Each time Add(), Copy(), or Run() is called, this will be incremented. 188 // The target length is used to compute HERE mode addresses 189 // for COPY instructions, and is also written into the header 190 // of the delta window when Output() is called. 191 // 192 size_t target_length_; 193 194 const VCDiffCodeTableData* code_table_data_; 195 196 // The instruction map facilitates finding an opcode quickly given an 197 // instruction inst, size, and mode. This is an alternate representation 198 // of the same information that is found in code_table_data_. 199 // 200 const VCDiffInstructionMap* instruction_map_; 201 202 // The zero-based index within instructions_and_sizes_ of the byte 203 // that contains the last single-instruction opcode generated by 204 // EncodeInstruction(). (See that function for exhaustive details.) 205 // It is necessary to use an index rather than a pointer for this value 206 // because instructions_and_sizes_ may be resized, which would invalidate 207 // any pointers into its data buffer. The value -1 is reserved to mean that 208 // either no opcodes have been generated yet, or else the last opcode 209 // generated was a double-instruction opcode. 210 // 211 int last_opcode_index_; 212 213 // If true, an Adler32 checksum of the target window data will be written as 214 // a variable-length integer, just after the size of the addresses section. 215 // 216 bool add_checksum_; 217 218 // The checksum to be written to the current target window, 219 // if add_checksum_ is true. 220 // This will not be calculated based on the individual calls to Add(), Run(), 221 // and Copy(), which would be unnecessarily expensive. Instead, the code 222 // that uses the VCDiffCodeTableWriter object is expected to calculate 223 // the checksum all at once and to call AddChecksum() with that value. 224 // Must be called sometime before calling Output(), though it can be called 225 // either before or after the calls to Add(), Run(), and Copy(). 226 // 227 VCDChecksum checksum_; 228 229 // The value of match_counts_[n] is equal to the number of matches 230 // of length n (that is, COPY instructions of size n) found so far. 231 std::vector<int> match_counts_; 232 233 // Making these private avoids implicit copy constructor & assignment operator 234 VCDiffCodeTableWriter(const VCDiffCodeTableWriter&); // NOLINT 235 void operator=(const VCDiffCodeTableWriter&); 236 }; 237 238 }; // namespace open_vcdiff 239 240 #endif // OPEN_VCDIFF_ENCODETABLE_H_ 241