1 // Copyright 2006, 2008 Google Inc. 2 // Authors: Chandra Chereddi, Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include <config.h> 17 #include "vcdiffengine.h" 18 #include <stdint.h> // uint32_t 19 #include <string.h> // memcpy 20 #include "blockhash.h" 21 #include "codetablewriter_interface.h" 22 #include "logging.h" 23 #include "rolling_hash.h" 24 25 namespace open_vcdiff { 26 27 VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size) 28 // If dictionary_size == 0, then dictionary could be NULL. Guard against 29 // using a NULL value. 30 : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""), 31 dictionary_size_(dictionary_size), 32 hashed_dictionary_(NULL) { 33 if (dictionary_size > 0) { 34 memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size); 35 } 36 } 37 38 VCDiffEngine::~VCDiffEngine() { 39 delete hashed_dictionary_; 40 if (dictionary_size_ > 0) { 41 delete[] dictionary_; 42 } 43 } 44 45 bool VCDiffEngine::Init() { 46 if (hashed_dictionary_) { 47 LOG(DFATAL) << "Init() called twice for same VCDiffEngine object" 48 << LOG_ENDL; 49 return false; 50 } 51 hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_, 52 dictionary_size()); 53 if (!hashed_dictionary_) { 54 LOG(DFATAL) << "Creation of dictionary hash failed" << LOG_ENDL; 55 return false; 56 } 57 if (!RollingHash<BlockHash::kBlockSize>::Init()) { 58 LOG(DFATAL) << "RollingHash initialization failed" << LOG_ENDL; 59 return false; 60 } 61 return true; 62 } 63 64 // This helper function tries to find an appropriate match within 65 // hashed_dictionary_ for the block starting at the current target position. 66 // If target_hash is not NULL, this function will also look for a match 67 // within the previously encoded target data. 68 // 69 // If a match is found, this function will generate an ADD instruction 70 // for all unencoded data that precedes the match, 71 // and a COPY instruction for the match itself; then it returns 72 // the number of bytes processed by both instructions, 73 // which is guaranteed to be > 0. 74 // If no appropriate match is found, the function returns 0. 75 // 76 // The first four parameters are input parameters which are passed 77 // directly to BlockHash::FindBestMatch; please see that function 78 // for a description of their allowable values. 79 template<bool look_for_target_matches> 80 inline size_t VCDiffEngine::EncodeCopyForBestMatch( 81 uint32_t hash_value, 82 const char* target_candidate_start, 83 const char* unencoded_target_start, 84 size_t unencoded_target_size, 85 const BlockHash* target_hash, 86 CodeTableWriterInterface* coder) const { 87 // When FindBestMatch() comes up with a match for a candidate block, 88 // it will populate best_match with the size, source offset, 89 // and target offset of the match. 90 BlockHash::Match best_match; 91 92 // First look for a match in the dictionary. 93 hashed_dictionary_->FindBestMatch(hash_value, 94 target_candidate_start, 95 unencoded_target_start, 96 unencoded_target_size, 97 &best_match); 98 // If target matching is enabled, then see if there is a better match 99 // within the target data that has been encoded so far. 100 if (look_for_target_matches) { 101 target_hash->FindBestMatch(hash_value, 102 target_candidate_start, 103 unencoded_target_start, 104 unencoded_target_size, 105 &best_match); 106 } 107 if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) { 108 return 0; 109 } 110 if (best_match.target_offset() > 0) { 111 // Create an ADD instruction to encode all target bytes 112 // from the end of the last COPY match, if any, up to 113 // the beginning of this COPY match. 114 coder->Add(unencoded_target_start, best_match.target_offset()); 115 } 116 coder->Copy(best_match.source_offset(), best_match.size()); 117 return best_match.target_offset() // ADD size 118 + best_match.size(); // + COPY size 119 } 120 121 // Once the encoder loop has finished checking for matches in the target data, 122 // this function creates an ADD instruction to encode all target bytes 123 // from the end of the last COPY match, if any, through the end of 124 // the target data. In the worst case, if no matches were found at all, 125 // this function will create one big ADD instruction 126 // for the entire buffer of target data. 127 inline void VCDiffEngine::AddUnmatchedRemainder( 128 const char* unencoded_target_start, 129 size_t unencoded_target_size, 130 CodeTableWriterInterface* coder) const { 131 if (unencoded_target_size > 0) { 132 coder->Add(unencoded_target_start, unencoded_target_size); 133 } 134 } 135 136 // This helper function tells the coder to finish the encoding and write 137 // the results into the output string "diff". 138 inline void VCDiffEngine::FinishEncoding( 139 size_t target_size, 140 OutputStringInterface* diff, 141 CodeTableWriterInterface* coder) const { 142 if (target_size != static_cast<size_t>(coder->target_length())) { 143 LOG(DFATAL) << "Internal error in VCDiffEngine::Encode: " 144 "original target size (" << target_size 145 << ") does not match number of bytes processed (" 146 << coder->target_length() << ")" << LOG_ENDL; 147 } 148 coder->Output(diff); 149 } 150 151 template<bool look_for_target_matches> 152 void VCDiffEngine::EncodeInternal(const char* target_data, 153 size_t target_size, 154 OutputStringInterface* diff, 155 CodeTableWriterInterface* coder) const { 156 if (!hashed_dictionary_) { 157 LOG(DFATAL) << "Internal error: VCDiffEngine::Encode() " 158 "called before VCDiffEngine::Init()" << LOG_ENDL; 159 return; 160 } 161 if (target_size == 0) { 162 return; // Do nothing for empty target 163 } 164 // Special case for really small input 165 if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) { 166 AddUnmatchedRemainder(target_data, target_size, coder); 167 FinishEncoding(target_size, diff, coder); 168 return; 169 } 170 RollingHash<BlockHash::kBlockSize> hasher; 171 BlockHash* target_hash = NULL; 172 if (look_for_target_matches) { 173 // Check matches against previously encoded target data 174 // in this same target window, as well as against the dictionary 175 target_hash = BlockHash::CreateTargetHash(target_data, 176 target_size, 177 dictionary_size()); 178 if (!target_hash) { 179 LOG(DFATAL) << "Instantiation of target hash failed" << LOG_ENDL; 180 return; 181 } 182 } 183 const char* const target_end = target_data + target_size; 184 const char* const start_of_last_block = target_end - BlockHash::kBlockSize; 185 // Offset of next bytes in string to ADD if NOT copied (i.e., not found in 186 // dictionary) 187 const char* next_encode = target_data; 188 // candidate_pos points to the start of the kBlockSize-byte block that may 189 // begin a match with the dictionary or previously encoded target data. 190 const char* candidate_pos = target_data; 191 uint32_t hash_value = hasher.Hash(candidate_pos); 192 while (1) { 193 const size_t bytes_encoded = 194 EncodeCopyForBestMatch<look_for_target_matches>( 195 hash_value, 196 candidate_pos, 197 next_encode, 198 (target_end - next_encode), 199 target_hash, 200 coder); 201 if (bytes_encoded > 0) { 202 next_encode += bytes_encoded; // Advance past COPYed data 203 candidate_pos = next_encode; 204 if (candidate_pos > start_of_last_block) { 205 break; // Reached end of target data 206 } 207 // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash 208 // can't be used to calculate the hash value at its new position. 209 hash_value = hasher.Hash(candidate_pos); 210 if (look_for_target_matches) { 211 // Update the target hash for the ADDed and COPYed data 212 target_hash->AddAllBlocksThroughIndex( 213 static_cast<int>(next_encode - target_data)); 214 } 215 } else { 216 // No match, or match is too small to be worth a COPY instruction. 217 // Move to the next position in the target data. 218 if ((candidate_pos + 1) > start_of_last_block) { 219 break; // Reached end of target data 220 } 221 if (look_for_target_matches) { 222 target_hash->AddOneIndexHash( 223 static_cast<int>(candidate_pos - target_data), 224 hash_value); 225 } 226 hash_value = hasher.UpdateHash(hash_value, 227 candidate_pos[0], 228 candidate_pos[BlockHash::kBlockSize]); 229 ++candidate_pos; 230 } 231 } 232 AddUnmatchedRemainder(next_encode, target_end - next_encode, coder); 233 FinishEncoding(target_size, diff, coder); 234 delete target_hash; 235 } 236 237 void VCDiffEngine::Encode(const char* target_data, 238 size_t target_size, 239 bool look_for_target_matches, 240 OutputStringInterface* diff, 241 CodeTableWriterInterface* coder) const { 242 if (look_for_target_matches) { 243 EncodeInternal<true>(target_data, target_size, diff, coder); 244 } else { 245 EncodeInternal<false>(target_data, target_size, diff, coder); 246 } 247 } 248 249 } // namespace open_vcdiff 250