1 // Copyright 2008 Google Inc. 2 // Author: Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include <config.h> 17 #include "encodetable.h" 18 #include <string> 19 #include "addrcache.h" 20 #include "codetable.h" 21 #include "instruction_map.h" 22 #include "logging.h" 23 #include "google/output_string.h" 24 #include "varint_bigendian.h" 25 #include "vcdiff_defs.h" 26 27 namespace open_vcdiff { 28 29 // VCDiffCodeTableWriter members and methods 30 31 // If interleaved is true, the encoder writes each delta file window 32 // by interleaving instructions and sizes with their corresponding 33 // addresses and data, rather than placing these elements into three 34 // separate sections. This facilitates providing partially 35 // decoded results when only a portion of a delta file window 36 // is received (e.g. when HTTP over TCP is used as the 37 // transmission protocol.) The interleaved format is 38 // not consistent with the VCDIFF draft standard. 39 // 40 VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved) 41 : max_mode_(VCDiffAddressCache::DefaultLastMode()), 42 dictionary_size_(0), 43 target_length_(0), 44 code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData), 45 instruction_map_(NULL), 46 last_opcode_index_(-1), 47 add_checksum_(false), 48 checksum_(0), 49 match_counts_(kMaxMatchSize, 0) { 50 InitSectionPointers(interleaved); 51 } 52 53 VCDiffCodeTableWriter::VCDiffCodeTableWriter( 54 bool interleaved, 55 int near_cache_size, 56 int same_cache_size, 57 const VCDiffCodeTableData& code_table_data, 58 unsigned char max_mode) 59 : max_mode_(max_mode), 60 address_cache_(near_cache_size, same_cache_size), 61 dictionary_size_(0), 62 target_length_(0), 63 code_table_data_(&code_table_data), 64 instruction_map_(NULL), 65 last_opcode_index_(-1), 66 add_checksum_(false), 67 checksum_(0) { 68 InitSectionPointers(interleaved); 69 } 70 71 VCDiffCodeTableWriter::~VCDiffCodeTableWriter() { 72 if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) { 73 delete instruction_map_; 74 } 75 } 76 77 void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) { 78 if (interleaved) { 79 data_for_add_and_run_ = &instructions_and_sizes_; 80 addresses_for_copy_ = &instructions_and_sizes_; 81 } else { 82 data_for_add_and_run_ = &separate_data_for_add_and_run_; 83 addresses_for_copy_ = &separate_addresses_for_copy_; 84 } 85 } 86 87 bool VCDiffCodeTableWriter::Init(size_t dictionary_size) { 88 dictionary_size_ = dictionary_size; 89 if (!instruction_map_) { 90 if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) { 91 instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap(); 92 } else { 93 instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_); 94 } 95 if (!instruction_map_) { 96 return false; 97 } 98 } 99 if (!address_cache_.Init()) { 100 return false; 101 } 102 target_length_ = 0; 103 last_opcode_index_ = -1; 104 return true; 105 } 106 107 // The VCDiff format allows each opcode to represent either 108 // one or two delta instructions. This function will first 109 // examine the opcode generated by the last call to EncodeInstruction. 110 // If that opcode was a single-instruction opcode, this function checks 111 // whether there is a compound (double-instruction) opcode that can 112 // combine that single instruction with the instruction that is now 113 // being added, and so save a byte of space. In that case, the 114 // single-instruction opcode at position last_opcode_index_ will be 115 // overwritten with the new double-instruction opcode. 116 // 117 // In the majority of cases, no compound opcode will be possible, 118 // and a new single-instruction opcode will be appended to 119 // instructions_and_sizes_, followed by a representation of its size 120 // if the opcode does not implicitly give the instruction size. 121 // 122 // As an example, say instructions_and_sizes_ contains 10 bytes, the last 123 // of which contains the opcode 0x02 (ADD size 1). Because that was the 124 // most recently added opcode, last_opcode_index_ has the value 10. 125 // EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0. 126 // The function will replace the old opcode 0x02 with the double-instruction 127 // opcode 0xA3 (ADD size 1 + COPY size 4 mode 0). 128 // 129 // All of the double-instruction opcodes in the standard code table 130 // have implicit sizes, meaning that the size of the instruction will not 131 // need to be written to the instructions_and_sizes_ string separately 132 // from the opcode. If a custom code table were used that did not have 133 // this property, then instructions_and_sizes_ might contain a 134 // double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0) 135 // followed by the size of the COPY, then by the size of the ADD. 136 // If using the SDCH interleaved format, the address of the COPY instruction 137 // would follow its size, so the ordering would be 138 // [Compound Opcode][Size of COPY][Address of COPY][Size of ADD] 139 // 140 void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst, 141 size_t size, 142 unsigned char mode) { 143 if (!instruction_map_) { 144 LOG(DFATAL) << "EncodeInstruction() called without calling Init()" 145 << LOG_ENDL; 146 return; 147 } 148 if (last_opcode_index_ >= 0) { 149 const unsigned char last_opcode = 150 instructions_and_sizes_[last_opcode_index_]; 151 // The encoding engine should not generate two ADD instructions in a row. 152 // This won't cause a failure, but it's inefficient encoding and probably 153 // represents a bug in the higher-level logic of the encoder. 154 if ((inst == VCD_ADD) && 155 (code_table_data_->inst1[last_opcode] == VCD_ADD)) { 156 LOG(WARNING) << "EncodeInstruction() called for two ADD instructions" 157 " in a row" << LOG_ENDL; 158 } 159 OpcodeOrNone compound_opcode = kNoOpcode; 160 if (size <= UCHAR_MAX) { 161 compound_opcode = 162 instruction_map_->LookupSecondOpcode(last_opcode, 163 inst, 164 static_cast<unsigned char>(size), 165 mode); 166 if (compound_opcode != kNoOpcode) { 167 instructions_and_sizes_[last_opcode_index_] = 168 static_cast<unsigned char>(compound_opcode); 169 last_opcode_index_ = -1; 170 return; 171 } 172 } 173 // Try finding a compound opcode with size 0. 174 compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode, 175 inst, 176 0, 177 mode); 178 if (compound_opcode != kNoOpcode) { 179 instructions_and_sizes_[last_opcode_index_] = 180 static_cast<unsigned char>(compound_opcode); 181 last_opcode_index_ = -1; 182 AppendSizeToString(size, &instructions_and_sizes_); 183 return; 184 } 185 } 186 OpcodeOrNone opcode = kNoOpcode; 187 if (size <= UCHAR_MAX) { 188 opcode = 189 instruction_map_->LookupFirstOpcode(inst, 190 static_cast<unsigned char>(size), 191 mode); 192 if (opcode != kNoOpcode) { 193 instructions_and_sizes_.push_back(static_cast<char>(opcode)); 194 last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1); 195 return; 196 } 197 } 198 // There should always be an opcode with size 0. 199 opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode); 200 if (opcode == kNoOpcode) { 201 LOG(DFATAL) << "No matching opcode found for inst " << inst 202 << ", mode " << mode << ", size 0" << LOG_ENDL; 203 return; 204 } 205 instructions_and_sizes_.push_back(static_cast<char>(opcode)); 206 last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1); 207 AppendSizeToString(size, &instructions_and_sizes_); 208 } 209 210 void VCDiffCodeTableWriter::Add(const char* data, size_t size) { 211 EncodeInstruction(VCD_ADD, size); 212 data_for_add_and_run_->append(data, size); 213 target_length_ += size; 214 } 215 216 void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) { 217 if (!instruction_map_) { 218 LOG(DFATAL) << "VCDiffCodeTableWriter::Copy() called without calling Init()" 219 << LOG_ENDL; 220 return; 221 } 222 // If a single interleaved stream of encoded values is used 223 // instead of separate sections for instructions, addresses, and data, 224 // then the string instructions_and_sizes_ may be the same as 225 // addresses_for_copy_. The address should therefore be encoded 226 // *after* the instruction and its size. 227 int32_t encoded_addr = 0; 228 const unsigned char mode = address_cache_.EncodeAddress( 229 offset, 230 static_cast<VCDAddress>(dictionary_size_ + target_length_), 231 &encoded_addr); 232 EncodeInstruction(VCD_COPY, size, mode); 233 if (address_cache_.WriteAddressAsVarintForMode(mode)) { 234 VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_); 235 } else { 236 addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr)); 237 } 238 target_length_ += size; 239 if (size >= match_counts_.size()) { 240 match_counts_.resize(size * 2, 0); // Be generous to avoid resizing again 241 } 242 ++match_counts_[size]; 243 } 244 245 void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) { 246 EncodeInstruction(VCD_RUN, size); 247 data_for_add_and_run_->push_back(byte); 248 target_length_ += size; 249 } 250 251 size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) { 252 return VarintBE<int32_t>::Length(static_cast<int32_t>(size)); 253 } 254 255 void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) { 256 VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out); 257 } 258 259 void VCDiffCodeTableWriter::AppendSizeToOutputString( 260 size_t size, 261 OutputStringInterface* out) { 262 VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out); 263 } 264 265 // This calculation must match the items added between "Start of Delta Encoding" 266 // and "End of Delta Encoding" in Output(), below. 267 size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const { 268 size_t length_of_the_delta_encoding = 269 CalculateLengthOfSizeAsVarint(target_length_) + 270 1 + // Delta_Indicator 271 CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) + 272 CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) + 273 CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) + 274 separate_data_for_add_and_run_.size() + 275 instructions_and_sizes_.size() + 276 separate_addresses_for_copy_.size(); 277 if (add_checksum_) { 278 length_of_the_delta_encoding += 279 VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_)); 280 } 281 return length_of_the_delta_encoding; 282 } 283 284 void VCDiffCodeTableWriter::Output(OutputStringInterface* out) { 285 if (instructions_and_sizes_.empty()) { 286 LOG(WARNING) << "Empty input; no delta window produced" << LOG_ENDL; 287 } else { 288 const size_t length_of_the_delta_encoding = 289 CalculateLengthOfTheDeltaEncoding(); 290 const size_t delta_window_size = 291 length_of_the_delta_encoding + 292 1 + // Win_Indicator 293 CalculateLengthOfSizeAsVarint(dictionary_size_) + 294 CalculateLengthOfSizeAsVarint(0) + 295 CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding); 296 // append() will be called many times on the output string; make sure 297 // the output string is resized only once at most. 298 out->ReserveAdditionalBytes(delta_window_size); 299 300 // Add first element: Win_Indicator 301 if (add_checksum_) { 302 out->push_back(VCD_SOURCE | VCD_CHECKSUM); 303 } else { 304 out->push_back(VCD_SOURCE); 305 } 306 // Source segment size: dictionary size 307 AppendSizeToOutputString(dictionary_size_, out); 308 // Source segment position: 0 (start of dictionary) 309 AppendSizeToOutputString(0, out); 310 311 // [Here is where a secondary compressor would be used 312 // if the encoder and decoder supported that feature.] 313 314 AppendSizeToOutputString(length_of_the_delta_encoding, out); 315 // Start of Delta Encoding 316 const size_t size_before_delta_encoding = out->size(); 317 AppendSizeToOutputString(target_length_, out); 318 out->push_back(0x00); // Delta_Indicator: no compression 319 AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out); 320 AppendSizeToOutputString(instructions_and_sizes_.size(), out); 321 AppendSizeToOutputString(separate_addresses_for_copy_.size(), out); 322 if (add_checksum_) { 323 // The checksum is a 32-bit *unsigned* integer. VarintBE requires a 324 // signed type, so use a 64-bit signed integer to store the checksum. 325 VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_), 326 out); 327 } 328 out->append(separate_data_for_add_and_run_.data(), 329 separate_data_for_add_and_run_.size()); 330 out->append(instructions_and_sizes_.data(), 331 instructions_and_sizes_.size()); 332 out->append(separate_addresses_for_copy_.data(), 333 separate_addresses_for_copy_.size()); 334 // End of Delta Encoding 335 const size_t size_after_delta_encoding = out->size(); 336 if (length_of_the_delta_encoding != 337 (size_after_delta_encoding - size_before_delta_encoding)) { 338 LOG(DFATAL) << "Internal error: calculated length of the delta encoding (" 339 << length_of_the_delta_encoding 340 << ") does not match actual length (" 341 << (size_after_delta_encoding - size_before_delta_encoding) 342 << LOG_ENDL; 343 } 344 separate_data_for_add_and_run_.clear(); 345 instructions_and_sizes_.clear(); 346 separate_addresses_for_copy_.clear(); 347 if (target_length_ == 0) { 348 LOG(WARNING) << "Empty target window" << LOG_ENDL; 349 } 350 } 351 352 // Reset state for next window; assume we are using same code table 353 // and dictionary. The caller will have to invoke Init() if a different 354 // dictionary is used. 355 // 356 // Notably, Init() calls address_cache_.Init(). This resets the address 357 // cache between delta windows, as required by RFC section 5.1. 358 if (!Init(dictionary_size_)) { 359 LOG(DFATAL) << "Internal error: calling Init() to reset " 360 "VCDiffCodeTableWriter state failed" << LOG_ENDL; 361 } 362 } 363 364 }; // namespace open_vcdiff 365