1 // Copyright 2006, 2008 Google Inc. 2 // Authors: Chandra Chereddi, Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include <config.h> 17 #include "blockhash.h" 18 #include <stdint.h> // uint32_t 19 #include <string.h> // memcpy, memcmp 20 #include <algorithm> // std::min 21 #include "compile_assert.h" 22 #include "logging.h" 23 #include "rolling_hash.h" 24 25 namespace open_vcdiff { 26 27 typedef unsigned long uword_t; // a machine word NOLINT 28 29 BlockHash::BlockHash(const char* source_data, 30 size_t source_size, 31 int starting_offset) 32 : source_data_(source_data), 33 source_size_(source_size), 34 hash_table_mask_(0), 35 starting_offset_(starting_offset), 36 last_block_added_(-1) { 37 } 38 39 BlockHash::~BlockHash() { } 40 41 // kBlockSize must be at least 2 to be meaningful. Since it's a compile-time 42 // constant, check its value at compile time rather than wasting CPU cycles 43 // on runtime checks. 44 VCD_COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2); 45 46 // kBlockSize is required to be a power of 2 because multiplication 47 // (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize) 48 // are commonly-used operations. 
If kBlockSize is a compile-time 49 // constant and a power of 2, the compiler can convert these three operations 50 // into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much 51 // more efficient than executing full integer multiply, divide, or remainder 52 // instructions. 53 VCD_COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0, 54 kBlockSize_must_be_a_power_of_2); 55 56 bool BlockHash::Init(bool populate_hash_table) { 57 if (!hash_table_.empty() || 58 !next_block_table_.empty() || 59 !last_block_table_.empty()) { 60 VCD_DFATAL << "Init() called twice for same BlockHash object" << VCD_ENDL; 61 return false; 62 } 63 const size_t table_size = CalcTableSize(source_size_); 64 if (table_size == 0) { 65 VCD_DFATAL << "Error finding table size for source size " << source_size_ 66 << VCD_ENDL; 67 return false; 68 } 69 // Since table_size is a power of 2, (table_size - 1) is a bit mask 70 // containing all the bits below table_size. 71 hash_table_mask_ = static_cast<uint32_t>(table_size - 1); 72 hash_table_.resize(table_size, -1); 73 next_block_table_.resize(GetNumberOfBlocks(), -1); 74 last_block_table_.resize(GetNumberOfBlocks(), -1); 75 if (populate_hash_table) { 76 AddAllBlocks(); 77 } 78 return true; 79 } 80 81 const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data, 82 size_t dictionary_size) { 83 BlockHash* new_dictionary_hash = new BlockHash(dictionary_data, 84 dictionary_size, 85 0); 86 if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) { 87 delete new_dictionary_hash; 88 return NULL; 89 } else { 90 return new_dictionary_hash; 91 } 92 } 93 94 BlockHash* BlockHash::CreateTargetHash(const char* target_data, 95 size_t target_size, 96 size_t dictionary_size) { 97 BlockHash* new_target_hash = new BlockHash(target_data, 98 target_size, 99 static_cast<int>(dictionary_size)); 100 if (!new_target_hash->Init(/* populate_hash_table = */ false)) { 101 delete new_target_hash; 102 return NULL; 103 } 
else { 104 return new_target_hash; 105 } 106 } 107 108 // Returns zero if an error occurs. 109 size_t BlockHash::CalcTableSize(const size_t dictionary_size) { 110 // Overallocate the hash table by making it the same size (in bytes) 111 // as the source data. This is a trade-off between space and time: 112 // the empty entries in the hash table will reduce the 113 // probability of a hash collision to (sizeof(int) / kblockSize), 114 // and so save time comparing false matches. 115 const size_t min_size = (dictionary_size / sizeof(int)) + 1; // NOLINT 116 size_t table_size = 1; 117 // Find the smallest power of 2 that is >= min_size, and assign 118 // that value to table_size. 119 while (table_size < min_size) { 120 table_size <<= 1; 121 // Guard against an infinite loop 122 if (table_size <= 0) { 123 VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = " 124 << dictionary_size 125 << "): resulting table_size " << table_size 126 << " is zero or negative" << VCD_ENDL; 127 return 0; 128 } 129 } 130 // Check size sanity 131 if ((table_size & (table_size - 1)) != 0) { 132 VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = " 133 << dictionary_size 134 << "): resulting table_size " << table_size 135 << " is not a power of 2" << VCD_ENDL; 136 return 0; 137 } 138 // The loop above tries to find the smallest power of 2 that is >= min_size. 139 // That value must lie somewhere between min_size and (min_size * 2), 140 // except for the case (dictionary_size == 0, table_size == 1). 141 if ((dictionary_size > 0) && (table_size > (min_size * 2))) { 142 VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = " 143 << dictionary_size 144 << "): resulting table_size " << table_size 145 << " is too large" << VCD_ENDL; 146 return 0; 147 } 148 return table_size; 149 } 150 151 // If the hash value is already available from the rolling hash, 152 // call this function to save time. 
// Appends the next block (in source order) to the hash chain for hash_value.
// Blocks are strictly sequential: the block added here always starts at
// offset (last_block_added_ + 1) * kBlockSize.
void BlockHash::AddBlock(uint32_t hash_value) {
  if (hash_table_.empty()) {
    VCD_DFATAL << "BlockHash::AddBlock() called before BlockHash::Init()"
               << VCD_ENDL;
    return;
  }
  // The initial value of last_block_added_ is -1.
  int block_number = last_block_added_ + 1;
  const int total_blocks =
      static_cast<int>(source_size_ / kBlockSize);  // round down
  if (block_number >= total_blocks) {
    VCD_DFATAL << "BlockHash::AddBlock() called"
                  " with block number " << block_number
               << " that is past last block " << (total_blocks - 1)
               << VCD_ENDL;
    return;
  }
  // A block that has already been chained would have a non-(-1) successor.
  if (next_block_table_[block_number] != -1) {
    VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
                  "block number = " << block_number
               << ", next block should be -1 but is "
               << next_block_table_[block_number] << VCD_ENDL;
    return;
  }
  const uint32_t hash_table_index = GetHashTableIndex(hash_value);
  const int first_matching_block = hash_table_[hash_table_index];
  if (first_matching_block < 0) {
    // This is the first entry with this hash value
    hash_table_[hash_table_index] = block_number;
    last_block_table_[block_number] = block_number;
  } else {
    // Add this entry at the end of the chain of matching blocks.
    // last_block_table_ is maintained only for the first block of each
    // chain (the one stored in hash_table_), which makes appending O(1):
    // no need to walk the chain to find its tail.
    const int last_matching_block = last_block_table_[first_matching_block];
    if (next_block_table_[last_matching_block] != -1) {
      VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
                    "first matching block = " << first_matching_block
                 << ", last matching block = " << last_matching_block
                 << ", next block should be -1 but is "
                 << next_block_table_[last_matching_block] << VCD_ENDL;
      return;
    }
    next_block_table_[last_matching_block] = block_number;
    last_block_table_[first_matching_block] = block_number;
  }
  last_block_added_ = block_number;
}

// Hashes and adds every whole block of the source data.
void BlockHash::AddAllBlocks() {
  AddAllBlocksThroughIndex(static_cast<int>(source_size_));
}

// Adds all not-yet-added blocks whose contents lie entirely before
// end_index.  Used to populate the target hash incrementally as the
// encoder advances through the target data.
void
BlockHash::AddAllBlocksThroughIndex(int end_index) {
  if (end_index > static_cast<int>(source_size_)) {
    VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
                  " with index " << end_index
               << " higher than end index " << source_size_ << VCD_ENDL;
    return;
  }
  const int last_index_added = last_block_added_ * kBlockSize;
  if (end_index <= last_index_added) {
    VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
                  " with index " << end_index
               << " <= last index added ( " << last_index_added
               << ")" << VCD_ENDL;
    return;
  }
  int end_limit = end_index;
  // Don't allow reading any indices at or past source_size_.
  // The Hash function extends (kBlockSize - 1) bytes past the index,
  // so leave a margin of that size.
  // NOTE(review): if source_size() < kBlockSize, this size_t subtraction
  // wraps before the cast to int, giving an implementation-defined value.
  // It appears callers never reach the loop in that case (end_ptr ends up
  // at or before block_ptr) — TODO confirm for all call sites.
  int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
  if (end_limit > last_legal_hash_index) {
    end_limit = last_legal_hash_index + 1;
  }
  const char* block_ptr = source_data() + NextIndexToAdd();
  const char* const end_ptr = source_data() + end_limit;
  while (block_ptr < end_ptr) {
    AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
    block_ptr += kBlockSize;
  }
}

// The word-at-a-time comparison below requires that a block contain an
// exact whole number of machine words.
VCD_COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
                   kBlockSize_must_be_a_multiple_of_machine_word_size);

// A recursive template to compare a fixed number
// of (possibly unaligned) machine words starting
// at addresses block1 and block2.  Returns true or false
// depending on whether an exact match was found.
// The recursion unrolls completely at compile time: each instantiation
// compares one word and delegates the remaining (number_of_words - 1)
// words to the next instantiation, terminating at the <1> specialization.
template<int number_of_words>
inline bool CompareWholeWordValues(const char* block1,
                                   const char* block2) {
  return CompareWholeWordValues<1>(block1, block2) &&
         CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
                                                     block2 + sizeof(uword_t));
}

// The base of the recursive template: compare one pair of machine words.
251 template<> 252 inline bool CompareWholeWordValues<1>(const char* word1, 253 const char* word2) { 254 uword_t aligned_word1, aligned_word2; 255 memcpy(&aligned_word1, word1, sizeof(aligned_word1)); 256 memcpy(&aligned_word2, word2, sizeof(aligned_word2)); 257 return aligned_word1 == aligned_word2; 258 } 259 260 // A block must be composed of an integral number of machine words 261 // (uword_t values.) This function takes advantage of that fact 262 // by comparing the blocks as series of (possibly unaligned) word values. 263 // A word-sized comparison can be performed as a single 264 // machine instruction. Comparing words instead of bytes means that, 265 // on a 64-bit platform, this function will use 8 times fewer test-and-branch 266 // instructions than a byte-by-byte comparison. Even with the extra 267 // cost of the calls to memcpy, this method is still at least twice as fast 268 // as memcmp (measured using gcc on a 64-bit platform, with a block size 269 // of 32.) For blocks with identical contents (a common case), this method 270 // is over six times faster than memcmp. 271 inline bool BlockCompareWordsInline(const char* block1, const char* block2) { 272 static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t); 273 return CompareWholeWordValues<kWordsPerBlock>(block1, block2); 274 } 275 276 bool BlockHash::BlockCompareWords(const char* block1, const char* block2) { 277 return BlockCompareWordsInline(block1, block2); 278 } 279 280 inline bool BlockContentsMatchInline(const char* block1, const char* block2) { 281 // Optimize for mismatch in first byte. Since this function is called only 282 // when the hash values of the two blocks match, it is very likely that either 283 // the blocks are identical, or else the first byte does not match. 
284 if (*block1 != *block2) { 285 return false; 286 } 287 #ifdef VCDIFF_USE_BLOCK_COMPARE_WORDS 288 return BlockCompareWordsInline(block1, block2); 289 #else // !VCDIFF_USE_BLOCK_COMPARE_WORDS 290 return memcmp(block1, block2, BlockHash::kBlockSize) == 0; 291 #endif // VCDIFF_USE_BLOCK_COMPARE_WORDS 292 } 293 294 bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) { 295 return BlockContentsMatchInline(block1, block2); 296 } 297 298 inline int BlockHash::SkipNonMatchingBlocks(int block_number, 299 const char* block_ptr) const { 300 int probes = 0; 301 while ((block_number >= 0) && 302 !BlockContentsMatchInline(block_ptr, 303 &source_data_[block_number * kBlockSize])) { 304 if (++probes > kMaxProbes) { 305 return -1; // Avoid too much chaining 306 } 307 block_number = next_block_table_[block_number]; 308 } 309 return block_number; 310 } 311 312 // Init() must have been called and returned true before using 313 // FirstMatchingBlock or NextMatchingBlock. No check is performed 314 // for this condition; the code will crash if this condition is violated. 315 inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value, 316 const char* block_ptr) const { 317 return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)], 318 block_ptr); 319 } 320 321 int BlockHash::FirstMatchingBlock(uint32_t hash_value, 322 const char* block_ptr) const { 323 return FirstMatchingBlockInline(hash_value, block_ptr); 324 } 325 326 int BlockHash::NextMatchingBlock(int block_number, 327 const char* block_ptr) const { 328 if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) { 329 VCD_DFATAL << "NextMatchingBlock called for invalid block number " 330 << block_number << VCD_ENDL; 331 return -1; 332 } 333 return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr); 334 } 335 336 // Keep a count of the number of matches found. This will throttle the 337 // number of iterations in FindBestMatch. 
For example, if the entire 338 // dictionary is made up of spaces (' ') and the search string is also 339 // made up of spaces, there will be one match for each block in the 340 // dictionary. 341 inline bool BlockHash::TooManyMatches(int* match_counter) { 342 ++(*match_counter); 343 return (*match_counter) > kMaxMatchesToCheck; 344 } 345 346 // Returns the number of bytes to the left of source_match_start 347 // that match the corresponding bytes to the left of target_match_start. 348 // Will not examine more than max_bytes bytes, which is to say that 349 // the return value will be in the range [0, max_bytes] inclusive. 350 int BlockHash::MatchingBytesToLeft(const char* source_match_start, 351 const char* target_match_start, 352 int max_bytes) { 353 const char* source_ptr = source_match_start; 354 const char* target_ptr = target_match_start; 355 int bytes_found = 0; 356 while (bytes_found < max_bytes) { 357 --source_ptr; 358 --target_ptr; 359 if (*source_ptr != *target_ptr) { 360 break; 361 } 362 ++bytes_found; 363 } 364 return bytes_found; 365 } 366 367 // Returns the number of bytes starting at source_match_end 368 // that match the corresponding bytes starting at target_match_end. 369 // Will not examine more than max_bytes bytes, which is to say that 370 // the return value will be in the range [0, max_bytes] inclusive. 371 int BlockHash::MatchingBytesToRight(const char* source_match_end, 372 const char* target_match_end, 373 int max_bytes) { 374 const char* source_ptr = source_match_end; 375 const char* target_ptr = target_match_end; 376 int bytes_found = 0; 377 while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) { 378 ++bytes_found; 379 ++source_ptr; 380 ++target_ptr; 381 } 382 return bytes_found; 383 } 384 385 // No NULL checks are performed on the pointer arguments. The caller 386 // must guarantee that none of the arguments is NULL, or a crash will occur. 
//
// The vast majority of calls to FindBestMatch enter the loop *zero* times,
// which is to say that most candidate blocks find no matches in the dictionary.
// The important sections for optimization are therefore the code outside the
// loop and the code within the loop conditions.  Keep this to a minimum.
//
// For each dictionary block whose hash and contents match the candidate at
// target_candidate_start, the match is greedily extended byte-by-byte to the
// left and right, and *best_match is updated if the extended match is longer
// than the best one seen so far.
void BlockHash::FindBestMatch(uint32_t hash_value,
                              const char* target_candidate_start,
                              const char* target_start,
                              size_t target_size,
                              Match* best_match) const {
  int match_counter = 0;
  // TooManyMatches() caps the number of chain entries examined, bounding
  // worst-case cost on repetitive data.
  for (int block_number = FirstMatchingBlockInline(hash_value,
                                                   target_candidate_start);
       (block_number >= 0) && !TooManyMatches(&match_counter);
       block_number = NextMatchingBlock(block_number, target_candidate_start)) {
    // Byte offsets of the matched block within the source data.
    int source_match_offset = block_number * kBlockSize;
    const int source_match_end = source_match_offset + kBlockSize;

    // Byte offsets of the candidate within the target data.
    int target_match_offset =
        static_cast<int>(target_candidate_start - target_start);
    const int target_match_end = target_match_offset + kBlockSize;

    size_t match_size = kBlockSize;
    {
      // Extend match start towards beginning of unencoded data.
      // Limited by whichever of the two regions has fewer bytes available
      // before the match.
      const int limit_bytes_to_left = std::min(source_match_offset,
                                               target_match_offset);
      const int matching_bytes_to_left =
          MatchingBytesToLeft(source_data_ + source_match_offset,
                              target_start + target_match_offset,
                              limit_bytes_to_left);
      source_match_offset -= matching_bytes_to_left;
      target_match_offset -= matching_bytes_to_left;
      match_size += matching_bytes_to_left;
    }
    {
      // Extend match end towards end of unencoded data.
      const size_t source_bytes_to_right = source_size_ - source_match_end;
      const size_t target_bytes_to_right = target_size - target_match_end;
      const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
                                                   target_bytes_to_right);
      match_size +=
          MatchingBytesToRight(source_data_ + source_match_end,
                               target_start + target_match_end,
                               static_cast<int>(limit_bytes_to_right));
    }
    // Update in/out parameter if the best match found was better
    // than any match already stored in *best_match.
    // starting_offset_ translates the source-local offset into the combined
    // dictionary+target address space (see CreateTargetHash).
    best_match->ReplaceIfBetterMatch(match_size,
                                     source_match_offset + starting_offset_,
                                     target_match_offset);
  }
}

}  // namespace open_vcdiff