Home | History | Annotate | Download | only in src
      1 // Copyright 2006, 2008 Google Inc.
      2 // Authors: Chandra Chereddi, Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include <config.h>
     17 #include "vcdiffengine.h"
     18 #include <stdint.h>  // uint32_t
     19 #include <string.h>  // memcpy
     20 #include "blockhash.h"
     21 #include "codetablewriter_interface.h"
     22 #include "logging.h"
     23 #include "rolling_hash.h"
     24 
     25 namespace open_vcdiff {
     26 
     27 VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
     28     // If dictionary_size == 0, then dictionary could be NULL.  Guard against
     29     // using a NULL value.
     30     : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
     31       dictionary_size_(dictionary_size),
     32       hashed_dictionary_(NULL) {
     33   if (dictionary_size > 0) {
     34     memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
     35   }
     36 }
     37 
     38 VCDiffEngine::~VCDiffEngine() {
     39   delete hashed_dictionary_;
     40   if (dictionary_size_ > 0) {
     41     delete[] dictionary_;
     42   }
     43 }
     44 
     45 bool VCDiffEngine::Init() {
     46   if (hashed_dictionary_) {
     47     VCD_DFATAL << "Init() called twice for same VCDiffEngine object"
     48                << VCD_ENDL;
     49     return false;
     50   }
     51   hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
     52                                                        dictionary_size());
     53   if (!hashed_dictionary_) {
     54     VCD_DFATAL << "Creation of dictionary hash failed" << VCD_ENDL;
     55     return false;
     56   }
     57   RollingHash<BlockHash::kBlockSize>::Init();
     58   return true;
     59 }
     60 
     61 // This helper function tries to find an appropriate match within
     62 // hashed_dictionary_ for the block starting at the current target position.
     63 // If target_hash is not NULL, this function will also look for a match
     64 // within the previously encoded target data.
     65 //
     66 // If a match is found, this function will generate an ADD instruction
     67 // for all unencoded data that precedes the match,
     68 // and a COPY instruction for the match itself; then it returns
     69 // the number of bytes processed by both instructions,
     70 // which is guaranteed to be > 0.
     71 // If no appropriate match is found, the function returns 0.
     72 //
     73 // The first four parameters are input parameters which are passed
     74 // directly to BlockHash::FindBestMatch; please see that function
     75 // for a description of their allowable values.
     76 template<bool look_for_target_matches>
     77 inline size_t VCDiffEngine::EncodeCopyForBestMatch(
     78     uint32_t hash_value,
     79     const char* target_candidate_start,
     80     const char* unencoded_target_start,
     81     size_t unencoded_target_size,
     82     const BlockHash* target_hash,
     83     CodeTableWriterInterface* coder) const {
     84   // When FindBestMatch() comes up with a match for a candidate block,
     85   // it will populate best_match with the size, source offset,
     86   // and target offset of the match.
     87   BlockHash::Match best_match;
     88 
     89   // First look for a match in the dictionary.
     90   hashed_dictionary_->FindBestMatch(hash_value,
     91                                     target_candidate_start,
     92                                     unencoded_target_start,
     93                                     unencoded_target_size,
     94                                     &best_match);
     95   // If target matching is enabled, then see if there is a better match
     96   // within the target data that has been encoded so far.
     97   if (look_for_target_matches) {
     98     target_hash->FindBestMatch(hash_value,
     99                                target_candidate_start,
    100                                unencoded_target_start,
    101                                unencoded_target_size,
    102                                &best_match);
    103   }
    104   if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
    105     return 0;
    106   }
    107   if (best_match.target_offset() > 0) {
    108     // Create an ADD instruction to encode all target bytes
    109     // from the end of the last COPY match, if any, up to
    110     // the beginning of this COPY match.
    111     coder->Add(unencoded_target_start, best_match.target_offset());
    112   }
    113   coder->Copy(best_match.source_offset(), best_match.size());
    114   return best_match.target_offset()  // ADD size
    115        + best_match.size();          // + COPY size
    116 }
    117 
    118 // Once the encoder loop has finished checking for matches in the target data,
    119 // this function creates an ADD instruction to encode all target bytes
    120 // from the end of the last COPY match, if any, through the end of
    121 // the target data.  In the worst case, if no matches were found at all,
    122 // this function will create one big ADD instruction
    123 // for the entire buffer of target data.
    124 inline void VCDiffEngine::AddUnmatchedRemainder(
    125     const char* unencoded_target_start,
    126     size_t unencoded_target_size,
    127     CodeTableWriterInterface* coder) const {
    128   if (unencoded_target_size > 0) {
    129     coder->Add(unencoded_target_start, unencoded_target_size);
    130   }
    131 }
    132 
    133 // This helper function tells the coder to finish the encoding and write
    134 // the results into the output string "diff".
    135 inline void VCDiffEngine::FinishEncoding(
    136     size_t target_size,
    137     OutputStringInterface* diff,
    138     CodeTableWriterInterface* coder) const {
    139   if (target_size != static_cast<size_t>(coder->target_length())) {
    140     VCD_DFATAL << "Internal error in VCDiffEngine::Encode: "
    141                   "original target size (" << target_size
    142                << ") does not match number of bytes processed ("
    143                << coder->target_length() << ")" << VCD_ENDL;
    144   }
    145   coder->Output(diff);
    146 }
    147 
    148 template<bool look_for_target_matches>
    149 void VCDiffEngine::EncodeInternal(const char* target_data,
    150                                   size_t target_size,
    151                                   OutputStringInterface* diff,
    152                                   CodeTableWriterInterface* coder) const {
    153   if (!hashed_dictionary_) {
    154     VCD_DFATAL << "Internal error: VCDiffEngine::Encode() "
    155                   "called before VCDiffEngine::Init()" << VCD_ENDL;
    156     return;
    157   }
    158   if (target_size == 0) {
    159     return;  // Do nothing for empty target
    160   }
    161   // Special case for really small input
    162   if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
    163     AddUnmatchedRemainder(target_data, target_size, coder);
    164     FinishEncoding(target_size, diff, coder);
    165     return;
    166   }
    167   RollingHash<BlockHash::kBlockSize> hasher;
    168   BlockHash* target_hash = NULL;
    169   if (look_for_target_matches) {
    170     // Check matches against previously encoded target data
    171     // in this same target window, as well as against the dictionary
    172     target_hash = BlockHash::CreateTargetHash(target_data,
    173                                               target_size,
    174                                               dictionary_size());
    175     if (!target_hash) {
    176       VCD_DFATAL << "Instantiation of target hash failed" << VCD_ENDL;
    177       return;
    178     }
    179   }
    180   const char* const target_end = target_data + target_size;
    181   const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
    182   // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
    183   // dictionary)
    184   const char* next_encode = target_data;
    185   // candidate_pos points to the start of the kBlockSize-byte block that may
    186   // begin a match with the dictionary or previously encoded target data.
    187   const char* candidate_pos = target_data;
    188   uint32_t hash_value = hasher.Hash(candidate_pos);
    189   while (1) {
    190     const size_t bytes_encoded =
    191         EncodeCopyForBestMatch<look_for_target_matches>(
    192             hash_value,
    193             candidate_pos,
    194             next_encode,
    195             (target_end - next_encode),
    196             target_hash,
    197             coder);
    198     if (bytes_encoded > 0) {
    199       next_encode += bytes_encoded;  // Advance past COPYed data
    200       candidate_pos = next_encode;
    201       if (candidate_pos > start_of_last_block) {
    202         break;  // Reached end of target data
    203       }
    204       // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
    205       // can't be used to calculate the hash value at its new position.
    206       hash_value = hasher.Hash(candidate_pos);
    207       if (look_for_target_matches) {
    208         // Update the target hash for the ADDed and COPYed data
    209         target_hash->AddAllBlocksThroughIndex(
    210             static_cast<int>(next_encode - target_data));
    211       }
    212     } else {
    213       // No match, or match is too small to be worth a COPY instruction.
    214       // Move to the next position in the target data.
    215       if ((candidate_pos + 1) > start_of_last_block) {
    216         break;  // Reached end of target data
    217       }
    218       if (look_for_target_matches) {
    219         target_hash->AddOneIndexHash(
    220             static_cast<int>(candidate_pos - target_data),
    221             hash_value);
    222       }
    223       hash_value = hasher.UpdateHash(hash_value,
    224                                      candidate_pos[0],
    225                                      candidate_pos[BlockHash::kBlockSize]);
    226       ++candidate_pos;
    227     }
    228   }
    229   AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
    230   FinishEncoding(target_size, diff, coder);
    231   delete target_hash;
    232 }
    233 
    234 void VCDiffEngine::Encode(const char* target_data,
    235                           size_t target_size,
    236                           bool look_for_target_matches,
    237                           OutputStringInterface* diff,
    238                           CodeTableWriterInterface* coder) const {
    239   if (look_for_target_matches) {
    240     EncodeInternal<true>(target_data, target_size, diff, coder);
    241   } else {
    242     EncodeInternal<false>(target_data, target_size, diff, coder);
    243   }
    244 }
    245 
    246 }  // namespace open_vcdiff
    247