1 // Copyright 2006, 2008 Google Inc. 2 // Authors: Chandra Chereddi, Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include <config.h> 17 #include "vcdiffengine.h" 18 #include <stdint.h> // uint32_t 19 #include <string.h> // memcpy 20 #include "blockhash.h" 21 #include "codetablewriter_interface.h" 22 #include "logging.h" 23 #include "rolling_hash.h" 24 25 namespace open_vcdiff { 26 27 VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size) 28 // If dictionary_size == 0, then dictionary could be NULL. Guard against 29 // using a NULL value. 30 : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""), 31 dictionary_size_(dictionary_size), 32 hashed_dictionary_(NULL) { 33 if (dictionary_size > 0) { 34 memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size); 35 } 36 } 37 38 VCDiffEngine::~VCDiffEngine() { 39 delete hashed_dictionary_; 40 if (dictionary_size_ > 0) { 41 delete[] dictionary_; 42 } 43 } 44 45 bool VCDiffEngine::Init() { 46 if (hashed_dictionary_) { 47 VCD_DFATAL << "Init() called twice for same VCDiffEngine object" 48 << VCD_ENDL; 49 return false; 50 } 51 hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_, 52 dictionary_size()); 53 if (!hashed_dictionary_) { 54 VCD_DFATAL << "Creation of dictionary hash failed" << VCD_ENDL; 55 return false; 56 } 57 RollingHash<BlockHash::kBlockSize>::Init(); 58 return true; 59 } 60 61 // This helper function tries to find an appropriate match within 62 // hashed_dictionary_ for the block starting at the current target position. 63 // If target_hash is not NULL, this function will also look for a match 64 // within the previously encoded target data. 65 // 66 // If a match is found, this function will generate an ADD instruction 67 // for all unencoded data that precedes the match, 68 // and a COPY instruction for the match itself; then it returns 69 // the number of bytes processed by both instructions, 70 // which is guaranteed to be > 0. 71 // If no appropriate match is found, the function returns 0. 72 // 73 // The first four parameters are input parameters which are passed 74 // directly to BlockHash::FindBestMatch; please see that function 75 // for a description of their allowable values. 76 template<bool look_for_target_matches> 77 inline size_t VCDiffEngine::EncodeCopyForBestMatch( 78 uint32_t hash_value, 79 const char* target_candidate_start, 80 const char* unencoded_target_start, 81 size_t unencoded_target_size, 82 const BlockHash* target_hash, 83 CodeTableWriterInterface* coder) const { 84 // When FindBestMatch() comes up with a match for a candidate block, 85 // it will populate best_match with the size, source offset, 86 // and target offset of the match. 87 BlockHash::Match best_match; 88 89 // First look for a match in the dictionary. 90 hashed_dictionary_->FindBestMatch(hash_value, 91 target_candidate_start, 92 unencoded_target_start, 93 unencoded_target_size, 94 &best_match); 95 // If target matching is enabled, then see if there is a better match 96 // within the target data that has been encoded so far. 97 if (look_for_target_matches) { 98 target_hash->FindBestMatch(hash_value, 99 target_candidate_start, 100 unencoded_target_start, 101 unencoded_target_size, 102 &best_match); 103 } 104 if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) { 105 return 0; 106 } 107 if (best_match.target_offset() > 0) { 108 // Create an ADD instruction to encode all target bytes 109 // from the end of the last COPY match, if any, up to 110 // the beginning of this COPY match. 111 coder->Add(unencoded_target_start, best_match.target_offset()); 112 } 113 coder->Copy(best_match.source_offset(), best_match.size()); 114 return best_match.target_offset() // ADD size 115 + best_match.size(); // + COPY size 116 } 117 118 // Once the encoder loop has finished checking for matches in the target data, 119 // this function creates an ADD instruction to encode all target bytes 120 // from the end of the last COPY match, if any, through the end of 121 // the target data. In the worst case, if no matches were found at all, 122 // this function will create one big ADD instruction 123 // for the entire buffer of target data. 124 inline void VCDiffEngine::AddUnmatchedRemainder( 125 const char* unencoded_target_start, 126 size_t unencoded_target_size, 127 CodeTableWriterInterface* coder) const { 128 if (unencoded_target_size > 0) { 129 coder->Add(unencoded_target_start, unencoded_target_size); 130 } 131 } 132 133 // This helper function tells the coder to finish the encoding and write 134 // the results into the output string "diff". 135 inline void VCDiffEngine::FinishEncoding( 136 size_t target_size, 137 OutputStringInterface* diff, 138 CodeTableWriterInterface* coder) const { 139 if (target_size != static_cast<size_t>(coder->target_length())) { 140 VCD_DFATAL << "Internal error in VCDiffEngine::Encode: " 141 "original target size (" << target_size 142 << ") does not match number of bytes processed (" 143 << coder->target_length() << ")" << VCD_ENDL; 144 } 145 coder->Output(diff); 146 } 147 148 template<bool look_for_target_matches> 149 void VCDiffEngine::EncodeInternal(const char* target_data, 150 size_t target_size, 151 OutputStringInterface* diff, 152 CodeTableWriterInterface* coder) const { 153 if (!hashed_dictionary_) { 154 VCD_DFATAL << "Internal error: VCDiffEngine::Encode() " 155 "called before VCDiffEngine::Init()" << VCD_ENDL; 156 return; 157 } 158 if (target_size == 0) { 159 return; // Do nothing for empty target 160 } 161 // Special case for really small input 162 if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) { 163 AddUnmatchedRemainder(target_data, target_size, coder); 164 FinishEncoding(target_size, diff, coder); 165 return; 166 } 167 RollingHash<BlockHash::kBlockSize> hasher; 168 BlockHash* target_hash = NULL; 169 if (look_for_target_matches) { 170 // Check matches against previously encoded target data 171 // in this same target window, as well as against the dictionary 172 target_hash = BlockHash::CreateTargetHash(target_data, 173 target_size, 174 dictionary_size()); 175 if (!target_hash) { 176 VCD_DFATAL << "Instantiation of target hash failed" << VCD_ENDL; 177 return; 178 } 179 } 180 const char* const target_end = target_data + target_size; 181 const char* const start_of_last_block = target_end - BlockHash::kBlockSize; 182 // Offset of next bytes in string to ADD if NOT copied (i.e., not found in 183 // dictionary) 184 const char* next_encode = target_data; 185 // candidate_pos points to the start of the kBlockSize-byte block that may 186 // begin a match with the dictionary or previously encoded target data. 187 const char* candidate_pos = target_data; 188 uint32_t hash_value = hasher.Hash(candidate_pos); 189 while (1) { 190 const size_t bytes_encoded = 191 EncodeCopyForBestMatch<look_for_target_matches>( 192 hash_value, 193 candidate_pos, 194 next_encode, 195 (target_end - next_encode), 196 target_hash, 197 coder); 198 if (bytes_encoded > 0) { 199 next_encode += bytes_encoded; // Advance past COPYed data 200 candidate_pos = next_encode; 201 if (candidate_pos > start_of_last_block) { 202 break; // Reached end of target data 203 } 204 // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash 205 // can't be used to calculate the hash value at its new position. 206 hash_value = hasher.Hash(candidate_pos); 207 if (look_for_target_matches) { 208 // Update the target hash for the ADDed and COPYed data 209 target_hash->AddAllBlocksThroughIndex( 210 static_cast<int>(next_encode - target_data)); 211 } 212 } else { 213 // No match, or match is too small to be worth a COPY instruction. 214 // Move to the next position in the target data. 215 if ((candidate_pos + 1) > start_of_last_block) { 216 break; // Reached end of target data 217 } 218 if (look_for_target_matches) { 219 target_hash->AddOneIndexHash( 220 static_cast<int>(candidate_pos - target_data), 221 hash_value); 222 } 223 hash_value = hasher.UpdateHash(hash_value, 224 candidate_pos[0], 225 candidate_pos[BlockHash::kBlockSize]); 226 ++candidate_pos; 227 } 228 } 229 AddUnmatchedRemainder(next_encode, target_end - next_encode, coder); 230 FinishEncoding(target_size, diff, coder); 231 delete target_hash; 232 } 233 234 void VCDiffEngine::Encode(const char* target_data, 235 size_t target_size, 236 bool look_for_target_matches, 237 OutputStringInterface* diff, 238 CodeTableWriterInterface* coder) const { 239 if (look_for_target_matches) { 240 EncodeInternal<true>(target_data, target_size, diff, coder); 241 } else { 242 EncodeInternal<false>(target_data, target_size, diff, coder); 243 } 244 } 245 246 } // namespace open_vcdiff 247