Home | History | Annotate | Download | only in src
      1 // Copyright 2006, 2008 Google Inc.
      2 // Authors: Chandra Chereddi, Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include <config.h>
     17 #include "vcdiffengine.h"
     18 #include <stdint.h>  // uint32_t
     19 #include <string.h>  // memcpy
     20 #include "blockhash.h"
     21 #include "codetablewriter_interface.h"
     22 #include "logging.h"
     23 #include "rolling_hash.h"
     24 
     25 namespace open_vcdiff {
     26 
     27 VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
     28     // If dictionary_size == 0, then dictionary could be NULL.  Guard against
     29     // using a NULL value.
     30     : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
     31       dictionary_size_(dictionary_size),
     32       hashed_dictionary_(NULL) {
     33   if (dictionary_size > 0) {
     34     memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
     35   }
     36 }
     37 
     38 VCDiffEngine::~VCDiffEngine() {
     39   delete hashed_dictionary_;
     40   if (dictionary_size_ > 0) {
     41     delete[] dictionary_;
     42   }
     43 }
     44 
     45 bool VCDiffEngine::Init() {
     46   if (hashed_dictionary_) {
     47     LOG(DFATAL) << "Init() called twice for same VCDiffEngine object"
     48                 << LOG_ENDL;
     49     return false;
     50   }
     51   hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
     52                                                        dictionary_size());
     53   if (!hashed_dictionary_) {
     54     LOG(DFATAL) << "Creation of dictionary hash failed" << LOG_ENDL;
     55     return false;
     56   }
     57   if (!RollingHash<BlockHash::kBlockSize>::Init()) {
     58     LOG(DFATAL) << "RollingHash initialization failed" << LOG_ENDL;
     59     return false;
     60   }
     61   return true;
     62 }
     63 
     64 // This helper function tries to find an appropriate match within
     65 // hashed_dictionary_ for the block starting at the current target position.
     66 // If target_hash is not NULL, this function will also look for a match
     67 // within the previously encoded target data.
     68 //
     69 // If a match is found, this function will generate an ADD instruction
     70 // for all unencoded data that precedes the match,
     71 // and a COPY instruction for the match itself; then it returns
     72 // the number of bytes processed by both instructions,
     73 // which is guaranteed to be > 0.
     74 // If no appropriate match is found, the function returns 0.
     75 //
     76 // The first four parameters are input parameters which are passed
     77 // directly to BlockHash::FindBestMatch; please see that function
     78 // for a description of their allowable values.
     79 template<bool look_for_target_matches>
     80 inline size_t VCDiffEngine::EncodeCopyForBestMatch(
     81     uint32_t hash_value,
     82     const char* target_candidate_start,
     83     const char* unencoded_target_start,
     84     size_t unencoded_target_size,
     85     const BlockHash* target_hash,
     86     CodeTableWriterInterface* coder) const {
     87   // When FindBestMatch() comes up with a match for a candidate block,
     88   // it will populate best_match with the size, source offset,
     89   // and target offset of the match.
     90   BlockHash::Match best_match;
     91 
     92   // First look for a match in the dictionary.
     93   hashed_dictionary_->FindBestMatch(hash_value,
     94                                     target_candidate_start,
     95                                     unencoded_target_start,
     96                                     unencoded_target_size,
     97                                     &best_match);
     98   // If target matching is enabled, then see if there is a better match
     99   // within the target data that has been encoded so far.
    100   if (look_for_target_matches) {
    101     target_hash->FindBestMatch(hash_value,
    102                                target_candidate_start,
    103                                unencoded_target_start,
    104                                unencoded_target_size,
    105                                &best_match);
    106   }
    107   if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
    108     return 0;
    109   }
    110   if (best_match.target_offset() > 0) {
    111     // Create an ADD instruction to encode all target bytes
    112     // from the end of the last COPY match, if any, up to
    113     // the beginning of this COPY match.
    114     coder->Add(unencoded_target_start, best_match.target_offset());
    115   }
    116   coder->Copy(best_match.source_offset(), best_match.size());
    117   return best_match.target_offset()  // ADD size
    118        + best_match.size();          // + COPY size
    119 }
    120 
    121 // Once the encoder loop has finished checking for matches in the target data,
    122 // this function creates an ADD instruction to encode all target bytes
    123 // from the end of the last COPY match, if any, through the end of
    124 // the target data.  In the worst case, if no matches were found at all,
    125 // this function will create one big ADD instruction
    126 // for the entire buffer of target data.
    127 inline void VCDiffEngine::AddUnmatchedRemainder(
    128     const char* unencoded_target_start,
    129     size_t unencoded_target_size,
    130     CodeTableWriterInterface* coder) const {
    131   if (unencoded_target_size > 0) {
    132     coder->Add(unencoded_target_start, unencoded_target_size);
    133   }
    134 }
    135 
    136 // This helper function tells the coder to finish the encoding and write
    137 // the results into the output string "diff".
    138 inline void VCDiffEngine::FinishEncoding(
    139     size_t target_size,
    140     OutputStringInterface* diff,
    141     CodeTableWriterInterface* coder) const {
    142   if (target_size != static_cast<size_t>(coder->target_length())) {
    143     LOG(DFATAL) << "Internal error in VCDiffEngine::Encode: "
    144                    "original target size (" << target_size
    145                 << ") does not match number of bytes processed ("
    146                 << coder->target_length() << ")" << LOG_ENDL;
    147   }
    148   coder->Output(diff);
    149 }
    150 
    151 template<bool look_for_target_matches>
    152 void VCDiffEngine::EncodeInternal(const char* target_data,
    153                                   size_t target_size,
    154                                   OutputStringInterface* diff,
    155                                   CodeTableWriterInterface* coder) const {
    156   if (!hashed_dictionary_) {
    157     LOG(DFATAL) << "Internal error: VCDiffEngine::Encode() "
    158                    "called before VCDiffEngine::Init()" << LOG_ENDL;
    159     return;
    160   }
    161   if (target_size == 0) {
    162     return;  // Do nothing for empty target
    163   }
    164   // Special case for really small input
    165   if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
    166     AddUnmatchedRemainder(target_data, target_size, coder);
    167     FinishEncoding(target_size, diff, coder);
    168     return;
    169   }
    170   RollingHash<BlockHash::kBlockSize> hasher;
    171   BlockHash* target_hash = NULL;
    172   if (look_for_target_matches) {
    173     // Check matches against previously encoded target data
    174     // in this same target window, as well as against the dictionary
    175     target_hash = BlockHash::CreateTargetHash(target_data,
    176                                               target_size,
    177                                               dictionary_size());
    178     if (!target_hash) {
    179       LOG(DFATAL) << "Instantiation of target hash failed" << LOG_ENDL;
    180       return;
    181     }
    182   }
    183   const char* const target_end = target_data + target_size;
    184   const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
    185   // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
    186   // dictionary)
    187   const char* next_encode = target_data;
    188   // candidate_pos points to the start of the kBlockSize-byte block that may
    189   // begin a match with the dictionary or previously encoded target data.
    190   const char* candidate_pos = target_data;
    191   uint32_t hash_value = hasher.Hash(candidate_pos);
    192   while (1) {
    193     const size_t bytes_encoded =
    194         EncodeCopyForBestMatch<look_for_target_matches>(
    195             hash_value,
    196             candidate_pos,
    197             next_encode,
    198             (target_end - next_encode),
    199             target_hash,
    200             coder);
    201     if (bytes_encoded > 0) {
    202       next_encode += bytes_encoded;  // Advance past COPYed data
    203       candidate_pos = next_encode;
    204       if (candidate_pos > start_of_last_block) {
    205         break;  // Reached end of target data
    206       }
    207       // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
    208       // can't be used to calculate the hash value at its new position.
    209       hash_value = hasher.Hash(candidate_pos);
    210       if (look_for_target_matches) {
    211         // Update the target hash for the ADDed and COPYed data
    212         target_hash->AddAllBlocksThroughIndex(
    213             static_cast<int>(next_encode - target_data));
    214       }
    215     } else {
    216       // No match, or match is too small to be worth a COPY instruction.
    217       // Move to the next position in the target data.
    218       if ((candidate_pos + 1) > start_of_last_block) {
    219         break;  // Reached end of target data
    220       }
    221       if (look_for_target_matches) {
    222         target_hash->AddOneIndexHash(
    223             static_cast<int>(candidate_pos - target_data),
    224             hash_value);
    225       }
    226       hash_value = hasher.UpdateHash(hash_value,
    227                                      candidate_pos[0],
    228                                      candidate_pos[BlockHash::kBlockSize]);
    229       ++candidate_pos;
    230     }
    231   }
    232   AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
    233   FinishEncoding(target_size, diff, coder);
    234   delete target_hash;
    235 }
    236 
    237 void VCDiffEngine::Encode(const char* target_data,
    238                           size_t target_size,
    239                           bool look_for_target_matches,
    240                           OutputStringInterface* diff,
    241                           CodeTableWriterInterface* coder) const {
    242   if (look_for_target_matches) {
    243     EncodeInternal<true>(target_data, target_size, diff, coder);
    244   } else {
    245     EncodeInternal<false>(target_data, target_size, diff, coder);
    246   }
    247 }
    248 
    249 }  // namespace open_vcdiff
    250