Home | History | Annotate | Download | only in src
      1 // Copyright 2008 Google Inc.
      2 // Author: Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include <config.h>
     17 #include <limits.h>  // UCHAR_MAX
     18 #include <string>
     19 #include "addrcache.h"
     20 #include "codetable.h"
     21 #include "encodetable.h"
     22 #include "instruction_map.h"
     23 #include "logging.h"
     24 #include "google/output_string.h"
     25 #include "varint_bigendian.h"
     26 #include "vcdiff_defs.h"
     27 
     28 namespace open_vcdiff {
     29 
     30 static const DeltaFileHeader kHeaderStandardFormat = {
     31     0xD6,  // Header1: "V" | 0x80
     32     0xC3,  // Header2: "C" | 0x80
     33     0xC4,  // Header3: "D" | 0x80
     34     0x00,  // Header4: Draft standard format
     35     0x00 };  // Hdr_Indicator:
     36              // No compression, no custom code table
     37 
     38 static const DeltaFileHeader kHeaderExtendedFormat = {
     39     0xD6,  // Header1: "V" | 0x80
     40     0xC3,  // Header2: "C" | 0x80
     41     0xC4,  // Header3: "D" | 0x80
     42     'S',   // Header4: VCDIFF/SDCH, extensions used
     43     0x00 };  // Hdr_Indicator:
     44              // No compression, no custom code table
     45 
     46 // VCDiffCodeTableWriter members and methods
     47 
     48 // If interleaved is true, the encoder writes each delta file window
     49 // by interleaving instructions and sizes with their corresponding
     50 // addresses and data, rather than placing these elements into three
     51 // separate sections.  This facilitates providing partially
     52 // decoded results when only a portion of a delta file window
     53 // is received (e.g. when HTTP over TCP is used as the
     54 // transmission protocol.)  The interleaved format is
     55 // not consistent with the VCDIFF draft standard.
     56 //
     57 VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved)
     58     : max_mode_(VCDiffAddressCache::DefaultLastMode()),
     59       dictionary_size_(0),
     60       target_length_(0),
     61       code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData),
     62       instruction_map_(NULL),
     63       last_opcode_index_(-1),
     64       add_checksum_(false),
     65       checksum_(0) {
     66   InitSectionPointers(interleaved);
     67 }
     68 
     69 VCDiffCodeTableWriter::VCDiffCodeTableWriter(
     70     bool interleaved,
     71     int near_cache_size,
     72     int same_cache_size,
     73     const VCDiffCodeTableData& code_table_data,
     74     unsigned char max_mode)
     75     : max_mode_(max_mode),
     76       address_cache_(near_cache_size, same_cache_size),
     77       dictionary_size_(0),
     78       target_length_(0),
     79       code_table_data_(&code_table_data),
     80       instruction_map_(NULL),
     81       last_opcode_index_(-1),
     82       add_checksum_(false),
     83       checksum_(0)  {
     84   InitSectionPointers(interleaved);
     85 }
     86 
     87 VCDiffCodeTableWriter::~VCDiffCodeTableWriter() {
     88   if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) {
     89     delete instruction_map_;
     90   }
     91 }
     92 
     93 void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) {
     94   if (interleaved) {
     95     data_for_add_and_run_ = &instructions_and_sizes_;
     96     addresses_for_copy_ = &instructions_and_sizes_;
     97   } else {
     98     data_for_add_and_run_ = &separate_data_for_add_and_run_;
     99     addresses_for_copy_ = &separate_addresses_for_copy_;
    100   }
    101 }
    102 
    103 bool VCDiffCodeTableWriter::Init(size_t dictionary_size) {
    104   dictionary_size_ = dictionary_size;
    105   if (!instruction_map_) {
    106     if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) {
    107       instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap();
    108     } else {
    109       instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_);
    110     }
    111     if (!instruction_map_) {
    112       return false;
    113     }
    114   }
    115   if (!address_cache_.Init()) {
    116     return false;
    117   }
    118   target_length_ = 0;
    119   last_opcode_index_ = -1;
    120   return true;
    121 }
    122 
    123 void VCDiffCodeTableWriter::WriteHeader(
    124     OutputStringInterface* out,
    125     VCDiffFormatExtensionFlags format_extensions) {
    126   if (format_extensions == VCD_STANDARD_FORMAT) {
    127     out->append(reinterpret_cast<const char*>(&kHeaderStandardFormat),
    128                 sizeof(kHeaderStandardFormat));
    129   } else {
    130     out->append(reinterpret_cast<const char*>(&kHeaderExtendedFormat),
    131                 sizeof(kHeaderExtendedFormat));
    132   }
    133   // If custom cache table sizes or a custom code table were used
    134   // for encoding, here is where they would be appended to *output.
    135   // This implementation of the encoder does not use those features,
    136   // although the decoder can understand and interpret them.
    137 }
    138 
    139 // The VCDiff format allows each opcode to represent either
    140 // one or two delta instructions.  This function will first
    141 // examine the opcode generated by the last call to EncodeInstruction.
    142 // If that opcode was a single-instruction opcode, this function checks
    143 // whether there is a compound (double-instruction) opcode that can
    144 // combine that single instruction with the instruction that is now
    145 // being added, and so save a byte of space.  In that case, the
    146 // single-instruction opcode at position last_opcode_index_ will be
    147 // overwritten with the new double-instruction opcode.
    148 //
    149 // In the majority of cases, no compound opcode will be possible,
    150 // and a new single-instruction opcode will be appended to
    151 // instructions_and_sizes_, followed by a representation of its size
    152 // if the opcode does not implicitly give the instruction size.
    153 //
    154 // As an example, say instructions_and_sizes_ contains 10 bytes, the last
    155 // of which contains the opcode 0x02 (ADD size 1).  Because that was the
    156 // most recently added opcode, last_opcode_index_ has the value 10.
    157 // EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0.
    158 // The function will replace the old opcode 0x02 with the double-instruction
    159 // opcode 0xA3 (ADD size 1 + COPY size 4 mode 0).
    160 //
    161 // All of the double-instruction opcodes in the standard code table
    162 // have implicit sizes, meaning that the size of the instruction will not
    163 // need to be written to the instructions_and_sizes_ string separately
    164 // from the opcode.  If a custom code table were used that did not have
    165 // this property, then instructions_and_sizes_ might contain a
    166 // double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0)
    167 // followed by the size of the COPY, then by the size of the ADD.
    168 // If using the SDCH interleaved format, the address of the COPY instruction
    169 // would follow its size, so the ordering would be
    170 // [Compound Opcode][Size of COPY][Address of COPY][Size of ADD]
    171 //
    172 void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst,
    173                                               size_t size,
    174                                               unsigned char mode) {
    175   if (!instruction_map_) {
    176     VCD_DFATAL << "EncodeInstruction() called without calling Init()"
    177                << VCD_ENDL;
    178     return;
    179   }
    180   if (last_opcode_index_ >= 0) {
    181     const unsigned char last_opcode =
    182         instructions_and_sizes_[last_opcode_index_];
    183     // The encoding engine should not generate two ADD instructions in a row.
    184     // This won't cause a failure, but it's inefficient encoding and probably
    185     // represents a bug in the higher-level logic of the encoder.
    186     if ((inst == VCD_ADD) &&
    187         (code_table_data_->inst1[last_opcode] == VCD_ADD)) {
    188       VCD_WARNING << "EncodeInstruction() called for two ADD instructions"
    189                      " in a row" << VCD_ENDL;
    190     }
    191     OpcodeOrNone compound_opcode = kNoOpcode;
    192     if (size <= UCHAR_MAX) {
    193       compound_opcode =
    194           instruction_map_->LookupSecondOpcode(last_opcode,
    195                                                inst,
    196                                                static_cast<unsigned char>(size),
    197                                                mode);
    198       if (compound_opcode != kNoOpcode) {
    199         instructions_and_sizes_[last_opcode_index_] =
    200             static_cast<unsigned char>(compound_opcode);
    201         last_opcode_index_ = -1;
    202         return;
    203       }
    204     }
    205     // Try finding a compound opcode with size 0.
    206     compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode,
    207                                                            inst,
    208                                                            0,
    209                                                            mode);
    210     if (compound_opcode != kNoOpcode) {
    211       instructions_and_sizes_[last_opcode_index_] =
    212           static_cast<unsigned char>(compound_opcode);
    213       last_opcode_index_ = -1;
    214       AppendSizeToString(size, &instructions_and_sizes_);
    215       return;
    216     }
    217   }
    218   OpcodeOrNone opcode = kNoOpcode;
    219   if (size <= UCHAR_MAX) {
    220     opcode =
    221         instruction_map_->LookupFirstOpcode(inst,
    222                                             static_cast<unsigned char>(size),
    223                                             mode);
    224     if (opcode != kNoOpcode) {
    225       instructions_and_sizes_.push_back(static_cast<char>(opcode));
    226       last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
    227       return;
    228     }
    229   }
    230   // There should always be an opcode with size 0.
    231   opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode);
    232   if (opcode == kNoOpcode) {
    233     VCD_DFATAL << "No matching opcode found for inst " << inst
    234                << ", mode " << mode << ", size 0" << VCD_ENDL;
    235     return;
    236   }
    237   instructions_and_sizes_.push_back(static_cast<char>(opcode));
    238   last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
    239   AppendSizeToString(size, &instructions_and_sizes_);
    240 }
    241 
    242 void VCDiffCodeTableWriter::Add(const char* data, size_t size) {
    243   EncodeInstruction(VCD_ADD, size);
    244   data_for_add_and_run_->append(data, size);
    245   target_length_ += size;
    246 }
    247 
    248 void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) {
    249   if (!instruction_map_) {
    250     VCD_DFATAL << "VCDiffCodeTableWriter::Copy() called without calling Init()"
    251                << VCD_ENDL;
    252     return;
    253   }
    254   // If a single interleaved stream of encoded values is used
    255   // instead of separate sections for instructions, addresses, and data,
    256   // then the string instructions_and_sizes_ may be the same as
    257   // addresses_for_copy_.  The address should therefore be encoded
    258   // *after* the instruction and its size.
    259   int32_t encoded_addr = 0;
    260   const unsigned char mode = address_cache_.EncodeAddress(
    261       offset,
    262       static_cast<VCDAddress>(dictionary_size_ + target_length_),
    263       &encoded_addr);
    264   EncodeInstruction(VCD_COPY, size, mode);
    265   if (address_cache_.WriteAddressAsVarintForMode(mode)) {
    266     VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_);
    267   } else {
    268     addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr));
    269   }
    270   target_length_ += size;
    271 }
    272 
    273 void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) {
    274   EncodeInstruction(VCD_RUN, size);
    275   data_for_add_and_run_->push_back(byte);
    276   target_length_ += size;
    277 }
    278 
    279 size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) {
    280   return VarintBE<int32_t>::Length(static_cast<int32_t>(size));
    281 }
    282 
    283 void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) {
    284   VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out);
    285 }
    286 
    287 void VCDiffCodeTableWriter::AppendSizeToOutputString(
    288     size_t size,
    289     OutputStringInterface* out) {
    290   VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out);
    291 }
    292 
    293 // This calculation must match the items added between "Start of Delta Encoding"
    294 // and "End of Delta Encoding" in Output(), below.
    295 size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const {
    296   size_t length_of_the_delta_encoding =
    297     CalculateLengthOfSizeAsVarint(target_length_) +
    298     1 +  // Delta_Indicator
    299     CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) +
    300     CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) +
    301     CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) +
    302     separate_data_for_add_and_run_.size() +
    303     instructions_and_sizes_.size() +
    304     separate_addresses_for_copy_.size();
    305   if (add_checksum_) {
    306     length_of_the_delta_encoding +=
    307         VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_));
    308   }
    309   return length_of_the_delta_encoding;
    310 }
    311 
    312 void VCDiffCodeTableWriter::Output(OutputStringInterface* out) {
    313   if (instructions_and_sizes_.empty()) {
    314     VCD_WARNING << "Empty input; no delta window produced" << VCD_ENDL;
    315   } else {
    316     const size_t length_of_the_delta_encoding =
    317         CalculateLengthOfTheDeltaEncoding();
    318     const size_t delta_window_size =
    319         length_of_the_delta_encoding +
    320         1 +  // Win_Indicator
    321         CalculateLengthOfSizeAsVarint(dictionary_size_) +
    322         CalculateLengthOfSizeAsVarint(0) +
    323         CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding);
    324     // append() will be called many times on the output string; make sure
    325     // the output string is resized only once at most.
    326     out->ReserveAdditionalBytes(delta_window_size);
    327 
    328     // Add first element: Win_Indicator
    329     if (add_checksum_) {
    330       out->push_back(VCD_SOURCE | VCD_CHECKSUM);
    331     } else {
    332       out->push_back(VCD_SOURCE);
    333     }
    334     // Source segment size: dictionary size
    335     AppendSizeToOutputString(dictionary_size_, out);
    336     // Source segment position: 0 (start of dictionary)
    337     AppendSizeToOutputString(0, out);
    338 
    339     // [Here is where a secondary compressor would be used
    340     //  if the encoder and decoder supported that feature.]
    341 
    342     AppendSizeToOutputString(length_of_the_delta_encoding, out);
    343     // Start of Delta Encoding
    344     const size_t size_before_delta_encoding = out->size();
    345     AppendSizeToOutputString(target_length_, out);
    346     out->push_back(0x00);  // Delta_Indicator: no compression
    347     AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out);
    348     AppendSizeToOutputString(instructions_and_sizes_.size(), out);
    349     AppendSizeToOutputString(separate_addresses_for_copy_.size(), out);
    350     if (add_checksum_) {
    351       // The checksum is a 32-bit *unsigned* integer.  VarintBE requires a
    352       // signed type, so use a 64-bit signed integer to store the checksum.
    353       VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_),
    354                                               out);
    355     }
    356     out->append(separate_data_for_add_and_run_.data(),
    357                 separate_data_for_add_and_run_.size());
    358     out->append(instructions_and_sizes_.data(),
    359                 instructions_and_sizes_.size());
    360     out->append(separate_addresses_for_copy_.data(),
    361                 separate_addresses_for_copy_.size());
    362     // End of Delta Encoding
    363     const size_t size_after_delta_encoding = out->size();
    364     if (length_of_the_delta_encoding !=
    365         (size_after_delta_encoding - size_before_delta_encoding)) {
    366       VCD_DFATAL << "Internal error: calculated length of the delta encoding ("
    367                  << length_of_the_delta_encoding
    368                  << ") does not match actual length ("
    369                  << (size_after_delta_encoding - size_before_delta_encoding)
    370                  << VCD_ENDL;
    371     }
    372     separate_data_for_add_and_run_.clear();
    373     instructions_and_sizes_.clear();
    374     separate_addresses_for_copy_.clear();
    375     if (target_length_ == 0) {
    376       VCD_WARNING << "Empty target window" << VCD_ENDL;
    377     }
    378   }
    379 
    380   // Reset state for next window; assume we are using same code table
    381   // and dictionary.  The caller will have to invoke Init() if a different
    382   // dictionary is used.
    383   //
    384   // Notably, Init() calls address_cache_.Init().  This resets the address
    385   // cache between delta windows, as required by RFC section 5.1.
    386   if (!Init(dictionary_size_)) {
    387     VCD_DFATAL << "Internal error: calling Init() to reset "
    388                   "VCDiffCodeTableWriter state failed" << VCD_ENDL;
    389   }
    390 }
    391 
    392 };  // namespace open_vcdiff
    393