Home | History | Annotate | Download | only in src
      1 // Copyright 2008 Google Inc.
      2 // Author: Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include <config.h>
     17 #include "encodetable.h"
     18 #include <string>
     19 #include "addrcache.h"
     20 #include "codetable.h"
     21 #include "instruction_map.h"
     22 #include "logging.h"
     23 #include "google/output_string.h"
     24 #include "varint_bigendian.h"
     25 #include "vcdiff_defs.h"
     26 
     27 namespace open_vcdiff {
     28 
     29 // VCDiffCodeTableWriter members and methods
     30 
     31 // If interleaved is true, the encoder writes each delta file window
     32 // by interleaving instructions and sizes with their corresponding
     33 // addresses and data, rather than placing these elements into three
     34 // separate sections.  This facilitates providing partially
     35 // decoded results when only a portion of a delta file window
     36 // is received (e.g. when HTTP over TCP is used as the
     37 // transmission protocol.)  The interleaved format is
     38 // not consistent with the VCDIFF draft standard.
     39 //
     40 VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved)
     41     : max_mode_(VCDiffAddressCache::DefaultLastMode()),
     42       dictionary_size_(0),
     43       target_length_(0),
     44       code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData),
     45       instruction_map_(NULL),
     46       last_opcode_index_(-1),
     47       add_checksum_(false),
     48       checksum_(0),
     49       match_counts_(kMaxMatchSize, 0) {
     50   InitSectionPointers(interleaved);
     51 }
     52 
     53 VCDiffCodeTableWriter::VCDiffCodeTableWriter(
     54     bool interleaved,
     55     int near_cache_size,
     56     int same_cache_size,
     57     const VCDiffCodeTableData& code_table_data,
     58     unsigned char max_mode)
     59     : max_mode_(max_mode),
     60       address_cache_(near_cache_size, same_cache_size),
     61       dictionary_size_(0),
     62       target_length_(0),
     63       code_table_data_(&code_table_data),
     64       instruction_map_(NULL),
     65       last_opcode_index_(-1),
     66       add_checksum_(false),
     67       checksum_(0)  {
     68   InitSectionPointers(interleaved);
     69 }
     70 
     71 VCDiffCodeTableWriter::~VCDiffCodeTableWriter() {
     72   if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) {
     73     delete instruction_map_;
     74   }
     75 }
     76 
     77 void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) {
     78   if (interleaved) {
     79     data_for_add_and_run_ = &instructions_and_sizes_;
     80     addresses_for_copy_ = &instructions_and_sizes_;
     81   } else {
     82     data_for_add_and_run_ = &separate_data_for_add_and_run_;
     83     addresses_for_copy_ = &separate_addresses_for_copy_;
     84   }
     85 }
     86 
     87 bool VCDiffCodeTableWriter::Init(size_t dictionary_size) {
     88   dictionary_size_ = dictionary_size;
     89   if (!instruction_map_) {
     90     if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) {
     91       instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap();
     92     } else {
     93       instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_);
     94     }
     95     if (!instruction_map_) {
     96       return false;
     97     }
     98   }
     99   if (!address_cache_.Init()) {
    100     return false;
    101   }
    102   target_length_ = 0;
    103   last_opcode_index_ = -1;
    104   return true;
    105 }
    106 
    107 // The VCDiff format allows each opcode to represent either
    108 // one or two delta instructions.  This function will first
    109 // examine the opcode generated by the last call to EncodeInstruction.
    110 // If that opcode was a single-instruction opcode, this function checks
    111 // whether there is a compound (double-instruction) opcode that can
    112 // combine that single instruction with the instruction that is now
    113 // being added, and so save a byte of space.  In that case, the
    114 // single-instruction opcode at position last_opcode_index_ will be
    115 // overwritten with the new double-instruction opcode.
    116 //
    117 // In the majority of cases, no compound opcode will be possible,
    118 // and a new single-instruction opcode will be appended to
    119 // instructions_and_sizes_, followed by a representation of its size
    120 // if the opcode does not implicitly give the instruction size.
    121 //
    122 // As an example, say instructions_and_sizes_ contains 10 bytes, the last
    123 // of which contains the opcode 0x02 (ADD size 1).  Because that was the
    124 // most recently added opcode, last_opcode_index_ has the value 10.
    125 // EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0.
    126 // The function will replace the old opcode 0x02 with the double-instruction
    127 // opcode 0xA3 (ADD size 1 + COPY size 4 mode 0).
    128 //
    129 // All of the double-instruction opcodes in the standard code table
    130 // have implicit sizes, meaning that the size of the instruction will not
    131 // need to be written to the instructions_and_sizes_ string separately
    132 // from the opcode.  If a custom code table were used that did not have
    133 // this property, then instructions_and_sizes_ might contain a
    134 // double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0)
    135 // followed by the size of the COPY, then by the size of the ADD.
    136 // If using the SDCH interleaved format, the address of the COPY instruction
    137 // would follow its size, so the ordering would be
    138 // [Compound Opcode][Size of COPY][Address of COPY][Size of ADD]
    139 //
    140 void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst,
    141                                               size_t size,
    142                                               unsigned char mode) {
    143   if (!instruction_map_) {
    144     LOG(DFATAL) << "EncodeInstruction() called without calling Init()"
    145                 << LOG_ENDL;
    146     return;
    147   }
    148   if (last_opcode_index_ >= 0) {
    149     const unsigned char last_opcode =
    150         instructions_and_sizes_[last_opcode_index_];
    151     // The encoding engine should not generate two ADD instructions in a row.
    152     // This won't cause a failure, but it's inefficient encoding and probably
    153     // represents a bug in the higher-level logic of the encoder.
    154     if ((inst == VCD_ADD) &&
    155         (code_table_data_->inst1[last_opcode] == VCD_ADD)) {
    156       LOG(WARNING) << "EncodeInstruction() called for two ADD instructions"
    157                       " in a row" << LOG_ENDL;
    158     }
    159     OpcodeOrNone compound_opcode = kNoOpcode;
    160     if (size <= UCHAR_MAX) {
    161       compound_opcode =
    162           instruction_map_->LookupSecondOpcode(last_opcode,
    163                                                inst,
    164                                                static_cast<unsigned char>(size),
    165                                                mode);
    166       if (compound_opcode != kNoOpcode) {
    167         instructions_and_sizes_[last_opcode_index_] =
    168             static_cast<unsigned char>(compound_opcode);
    169         last_opcode_index_ = -1;
    170         return;
    171       }
    172     }
    173     // Try finding a compound opcode with size 0.
    174     compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode,
    175                                                            inst,
    176                                                            0,
    177                                                            mode);
    178     if (compound_opcode != kNoOpcode) {
    179       instructions_and_sizes_[last_opcode_index_] =
    180           static_cast<unsigned char>(compound_opcode);
    181       last_opcode_index_ = -1;
    182       AppendSizeToString(size, &instructions_and_sizes_);
    183       return;
    184     }
    185   }
    186   OpcodeOrNone opcode = kNoOpcode;
    187   if (size <= UCHAR_MAX) {
    188     opcode =
    189         instruction_map_->LookupFirstOpcode(inst,
    190                                             static_cast<unsigned char>(size),
    191                                             mode);
    192     if (opcode != kNoOpcode) {
    193       instructions_and_sizes_.push_back(static_cast<char>(opcode));
    194       last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
    195       return;
    196     }
    197   }
    198   // There should always be an opcode with size 0.
    199   opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode);
    200   if (opcode == kNoOpcode) {
    201     LOG(DFATAL) << "No matching opcode found for inst " << inst
    202                 << ", mode " << mode << ", size 0" << LOG_ENDL;
    203     return;
    204   }
    205   instructions_and_sizes_.push_back(static_cast<char>(opcode));
    206   last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
    207   AppendSizeToString(size, &instructions_and_sizes_);
    208 }
    209 
    210 void VCDiffCodeTableWriter::Add(const char* data, size_t size) {
    211   EncodeInstruction(VCD_ADD, size);
    212   data_for_add_and_run_->append(data, size);
    213   target_length_ += size;
    214 }
    215 
    216 void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) {
    217   if (!instruction_map_) {
    218     LOG(DFATAL) << "VCDiffCodeTableWriter::Copy() called without calling Init()"
    219                 << LOG_ENDL;
    220     return;
    221   }
    222   // If a single interleaved stream of encoded values is used
    223   // instead of separate sections for instructions, addresses, and data,
    224   // then the string instructions_and_sizes_ may be the same as
    225   // addresses_for_copy_.  The address should therefore be encoded
    226   // *after* the instruction and its size.
    227   int32_t encoded_addr = 0;
    228   const unsigned char mode = address_cache_.EncodeAddress(
    229       offset,
    230       static_cast<VCDAddress>(dictionary_size_ + target_length_),
    231       &encoded_addr);
    232   EncodeInstruction(VCD_COPY, size, mode);
    233   if (address_cache_.WriteAddressAsVarintForMode(mode)) {
    234     VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_);
    235   } else {
    236     addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr));
    237   }
    238   target_length_ += size;
    239   if (size >= match_counts_.size()) {
    240     match_counts_.resize(size * 2, 0);  // Be generous to avoid resizing again
    241   }
    242   ++match_counts_[size];
    243 }
    244 
    245 void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) {
    246   EncodeInstruction(VCD_RUN, size);
    247   data_for_add_and_run_->push_back(byte);
    248   target_length_ += size;
    249 }
    250 
    251 size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) {
    252   return VarintBE<int32_t>::Length(static_cast<int32_t>(size));
    253 }
    254 
    255 void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) {
    256   VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out);
    257 }
    258 
    259 void VCDiffCodeTableWriter::AppendSizeToOutputString(
    260     size_t size,
    261     OutputStringInterface* out) {
    262   VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out);
    263 }
    264 
    265 // This calculation must match the items added between "Start of Delta Encoding"
    266 // and "End of Delta Encoding" in Output(), below.
    267 size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const {
    268   size_t length_of_the_delta_encoding =
    269     CalculateLengthOfSizeAsVarint(target_length_) +
    270     1 +  // Delta_Indicator
    271     CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) +
    272     CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) +
    273     CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) +
    274     separate_data_for_add_and_run_.size() +
    275     instructions_and_sizes_.size() +
    276     separate_addresses_for_copy_.size();
    277   if (add_checksum_) {
    278     length_of_the_delta_encoding +=
    279         VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_));
    280   }
    281   return length_of_the_delta_encoding;
    282 }
    283 
    284 void VCDiffCodeTableWriter::Output(OutputStringInterface* out) {
    285   if (instructions_and_sizes_.empty()) {
    286     LOG(WARNING) << "Empty input; no delta window produced" << LOG_ENDL;
    287   } else {
    288     const size_t length_of_the_delta_encoding =
    289         CalculateLengthOfTheDeltaEncoding();
    290     const size_t delta_window_size =
    291         length_of_the_delta_encoding +
    292         1 +  // Win_Indicator
    293         CalculateLengthOfSizeAsVarint(dictionary_size_) +
    294         CalculateLengthOfSizeAsVarint(0) +
    295         CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding);
    296     // append() will be called many times on the output string; make sure
    297     // the output string is resized only once at most.
    298     out->ReserveAdditionalBytes(delta_window_size);
    299 
    300     // Add first element: Win_Indicator
    301     if (add_checksum_) {
    302       out->push_back(VCD_SOURCE | VCD_CHECKSUM);
    303     } else {
    304       out->push_back(VCD_SOURCE);
    305     }
    306     // Source segment size: dictionary size
    307     AppendSizeToOutputString(dictionary_size_, out);
    308     // Source segment position: 0 (start of dictionary)
    309     AppendSizeToOutputString(0, out);
    310 
    311     // [Here is where a secondary compressor would be used
    312     //  if the encoder and decoder supported that feature.]
    313 
    314     AppendSizeToOutputString(length_of_the_delta_encoding, out);
    315     // Start of Delta Encoding
    316     const size_t size_before_delta_encoding = out->size();
    317     AppendSizeToOutputString(target_length_, out);
    318     out->push_back(0x00);  // Delta_Indicator: no compression
    319     AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out);
    320     AppendSizeToOutputString(instructions_and_sizes_.size(), out);
    321     AppendSizeToOutputString(separate_addresses_for_copy_.size(), out);
    322     if (add_checksum_) {
    323       // The checksum is a 32-bit *unsigned* integer.  VarintBE requires a
    324       // signed type, so use a 64-bit signed integer to store the checksum.
    325       VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_),
    326                                               out);
    327     }
    328     out->append(separate_data_for_add_and_run_.data(),
    329                 separate_data_for_add_and_run_.size());
    330     out->append(instructions_and_sizes_.data(),
    331                 instructions_and_sizes_.size());
    332     out->append(separate_addresses_for_copy_.data(),
    333                 separate_addresses_for_copy_.size());
    334     // End of Delta Encoding
    335     const size_t size_after_delta_encoding = out->size();
    336     if (length_of_the_delta_encoding !=
    337         (size_after_delta_encoding - size_before_delta_encoding)) {
    338       LOG(DFATAL) << "Internal error: calculated length of the delta encoding ("
    339                   << length_of_the_delta_encoding
    340                   << ") does not match actual length ("
    341                   << (size_after_delta_encoding - size_before_delta_encoding)
    342                   << LOG_ENDL;
    343     }
    344     separate_data_for_add_and_run_.clear();
    345     instructions_and_sizes_.clear();
    346     separate_addresses_for_copy_.clear();
    347     if (target_length_ == 0) {
    348       LOG(WARNING) << "Empty target window" << LOG_ENDL;
    349     }
    350   }
    351 
    352   // Reset state for next window; assume we are using same code table
    353   // and dictionary.  The caller will have to invoke Init() if a different
    354   // dictionary is used.
    355   //
    356   // Notably, Init() calls address_cache_.Init().  This resets the address
    357   // cache between delta windows, as required by RFC section 5.1.
    358   if (!Init(dictionary_size_)) {
    359     LOG(DFATAL) << "Internal error: calling Init() to reset "
    360                    "VCDiffCodeTableWriter state failed" << LOG_ENDL;
    361   }
    362 }
    363 
    364 };  // namespace open_vcdiff
    365