Home | History | Annotate | Download | only in src
      1 // Copyright 2006, 2008 Google Inc.
      2 // Authors: Chandra Chereddi, Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include <config.h>
     17 #include "blockhash.h"
     18 #include <stdint.h>  // uint32_t
     19 #include <string.h>  // memcpy, memcmp
     20 #include <algorithm>  // std::min
     21 #include "compile_assert.h"
     22 #include "logging.h"
     23 #include "rolling_hash.h"
     24 
     25 namespace open_vcdiff {
     26 
     27 typedef unsigned long uword_t;  // a machine word                         NOLINT
     28 
     29 BlockHash::BlockHash(const char* source_data,
     30                      size_t source_size,
     31                      int starting_offset)
     32     : source_data_(source_data),
     33       source_size_(source_size),
     34       hash_table_mask_(0),
     35       starting_offset_(starting_offset),
     36       last_block_added_(-1) {
     37 }
     38 
     39 BlockHash::~BlockHash() { }
     40 
     41 // kBlockSize must be at least 2 to be meaningful.  Since it's a compile-time
     42 // constant, check its value at compile time rather than wasting CPU cycles
     43 // on runtime checks.
     44 VCD_COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);
     45 
     46 // kBlockSize is required to be a power of 2 because multiplication
     47 // (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
     48 // are commonly-used operations.  If kBlockSize is a compile-time
     49 // constant and a power of 2, the compiler can convert these three operations
     50 // into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
     51 // more efficient than executing full integer multiply, divide, or remainder
     52 // instructions.
     53 VCD_COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
     54                    kBlockSize_must_be_a_power_of_2);
     55 
     56 bool BlockHash::Init(bool populate_hash_table) {
     57   if (!hash_table_.empty() ||
     58       !next_block_table_.empty() ||
     59       !last_block_table_.empty()) {
     60     VCD_DFATAL << "Init() called twice for same BlockHash object" << VCD_ENDL;
     61     return false;
     62   }
     63   const size_t table_size = CalcTableSize(source_size_);
     64   if (table_size == 0) {
     65     VCD_DFATAL << "Error finding table size for source size " << source_size_
     66                << VCD_ENDL;
     67     return false;
     68   }
     69   // Since table_size is a power of 2, (table_size - 1) is a bit mask
     70   // containing all the bits below table_size.
     71   hash_table_mask_ = static_cast<uint32_t>(table_size - 1);
     72   hash_table_.resize(table_size, -1);
     73   next_block_table_.resize(GetNumberOfBlocks(), -1);
     74   last_block_table_.resize(GetNumberOfBlocks(), -1);
     75   if (populate_hash_table) {
     76     AddAllBlocks();
     77   }
     78   return true;
     79 }
     80 
     81 const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
     82                                                  size_t dictionary_size) {
     83   BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
     84                                                  dictionary_size,
     85                                                  0);
     86   if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {
     87     delete new_dictionary_hash;
     88     return NULL;
     89   } else {
     90     return new_dictionary_hash;
     91   }
     92 }
     93 
     94 BlockHash* BlockHash::CreateTargetHash(const char* target_data,
     95                                        size_t target_size,
     96                                        size_t dictionary_size) {
     97   BlockHash* new_target_hash = new BlockHash(target_data,
     98                                              target_size,
     99                                              static_cast<int>(dictionary_size));
    100   if (!new_target_hash->Init(/* populate_hash_table = */ false)) {
    101     delete new_target_hash;
    102     return NULL;
    103   } else {
    104     return new_target_hash;
    105   }
    106 }
    107 
    108 // Returns zero if an error occurs.
    109 size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
    110   // Overallocate the hash table by making it the same size (in bytes)
    111   // as the source data.  This is a trade-off between space and time:
    112   // the empty entries in the hash table will reduce the
    113   // probability of a hash collision to (sizeof(int) / kblockSize),
    114   // and so save time comparing false matches.
    115   const size_t min_size = (dictionary_size / sizeof(int)) + 1;  // NOLINT
    116   size_t table_size = 1;
    117   // Find the smallest power of 2 that is >= min_size, and assign
    118   // that value to table_size.
    119   while (table_size < min_size) {
    120     table_size <<= 1;
    121     // Guard against an infinite loop
    122     if (table_size <= 0) {
    123       VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
    124                  << dictionary_size
    125                  << "): resulting table_size " << table_size
    126                  << " is zero or negative" << VCD_ENDL;
    127       return 0;
    128     }
    129   }
    130   // Check size sanity
    131   if ((table_size & (table_size - 1)) != 0) {
    132     VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
    133                << dictionary_size
    134                << "): resulting table_size " << table_size
    135                << " is not a power of 2" << VCD_ENDL;
    136     return 0;
    137   }
    138   // The loop above tries to find the smallest power of 2 that is >= min_size.
    139   // That value must lie somewhere between min_size and (min_size * 2),
    140   // except for the case (dictionary_size == 0, table_size == 1).
    141   if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
    142     VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
    143                << dictionary_size
    144                << "): resulting table_size " << table_size
    145                << " is too large" << VCD_ENDL;
    146     return 0;
    147   }
    148   return table_size;
    149 }
    150 
    151 // If the hash value is already available from the rolling hash,
    152 // call this function to save time.
    153 void BlockHash::AddBlock(uint32_t hash_value) {
    154   if (hash_table_.empty()) {
    155     VCD_DFATAL << "BlockHash::AddBlock() called before BlockHash::Init()"
    156                << VCD_ENDL;
    157     return;
    158   }
    159   // The initial value of last_block_added_ is -1.
    160   int block_number = last_block_added_ + 1;
    161   const int total_blocks =
    162       static_cast<int>(source_size_ / kBlockSize);  // round down
    163   if (block_number >= total_blocks) {
    164     VCD_DFATAL << "BlockHash::AddBlock() called"
    165                   " with block number " << block_number
    166                << " that is past last block " << (total_blocks - 1)
    167                << VCD_ENDL;
    168     return;
    169   }
    170   if (next_block_table_[block_number] != -1) {
    171     VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
    172                   "block number = " << block_number
    173                << ", next block should be -1 but is "
    174                << next_block_table_[block_number] << VCD_ENDL;
    175     return;
    176   }
    177   const uint32_t hash_table_index = GetHashTableIndex(hash_value);
    178   const int first_matching_block = hash_table_[hash_table_index];
    179   if (first_matching_block < 0) {
    180     // This is the first entry with this hash value
    181     hash_table_[hash_table_index] = block_number;
    182     last_block_table_[block_number] = block_number;
    183   } else {
    184     // Add this entry at the end of the chain of matching blocks
    185     const int last_matching_block = last_block_table_[first_matching_block];
    186     if (next_block_table_[last_matching_block] != -1) {
    187       VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
    188                     "first matching block = " << first_matching_block
    189                  << ", last matching block = " << last_matching_block
    190                  << ", next block should be -1 but is "
    191                  << next_block_table_[last_matching_block] << VCD_ENDL;
    192       return;
    193     }
    194     next_block_table_[last_matching_block] = block_number;
    195     last_block_table_[first_matching_block] = block_number;
    196   }
    197   last_block_added_ = block_number;
    198 }
    199 
    200 void BlockHash::AddAllBlocks() {
    201   AddAllBlocksThroughIndex(static_cast<int>(source_size_));
    202 }
    203 
    204 void BlockHash::AddAllBlocksThroughIndex(int end_index) {
    205   if (end_index > static_cast<int>(source_size_)) {
    206     VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
    207                   " with index " << end_index
    208                << " higher than end index  " << source_size_ << VCD_ENDL;
    209     return;
    210   }
    211   const int last_index_added = last_block_added_ * kBlockSize;
    212   if (end_index <= last_index_added) {
    213     VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
    214                   " with index " << end_index
    215                << " <= last index added ( " << last_index_added
    216                << ")" << VCD_ENDL;
    217     return;
    218   }
    219   int end_limit = end_index;
    220   // Don't allow reading any indices at or past source_size_.
    221   // The Hash function extends (kBlockSize - 1) bytes past the index,
    222   // so leave a margin of that size.
    223   int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
    224   if (end_limit > last_legal_hash_index) {
    225     end_limit = last_legal_hash_index + 1;
    226   }
    227   const char* block_ptr = source_data() + NextIndexToAdd();
    228   const char* const end_ptr = source_data() + end_limit;
    229   while (block_ptr < end_ptr) {
    230     AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
    231     block_ptr += kBlockSize;
    232   }
    233 }
    234 
    235 VCD_COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
    236                    kBlockSize_must_be_a_multiple_of_machine_word_size);
    237 
    238 // A recursive template to compare a fixed number
    239 // of (possibly unaligned) machine words starting
    240 // at addresses block1 and block2.  Returns true or false
    241 // depending on whether an exact match was found.
    242 template<int number_of_words>
    243 inline bool CompareWholeWordValues(const char* block1,
    244                                    const char* block2) {
    245   return CompareWholeWordValues<1>(block1, block2) &&
    246          CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
    247                                                      block2 + sizeof(uword_t));
    248 }
    249 
    250 // The base of the recursive template: compare one pair of machine words.
    251 template<>
    252 inline bool CompareWholeWordValues<1>(const char* word1,
    253                                       const char* word2) {
    254   uword_t aligned_word1, aligned_word2;
    255   memcpy(&aligned_word1, word1, sizeof(aligned_word1));
    256   memcpy(&aligned_word2, word2, sizeof(aligned_word2));
    257   return aligned_word1 == aligned_word2;
    258 }
    259 
    260 // A block must be composed of an integral number of machine words
    261 // (uword_t values.)  This function takes advantage of that fact
    262 // by comparing the blocks as series of (possibly unaligned) word values.
    263 // A word-sized comparison can be performed as a single
    264 // machine instruction.  Comparing words instead of bytes means that,
    265 // on a 64-bit platform, this function will use 8 times fewer test-and-branch
    266 // instructions than a byte-by-byte comparison.  Even with the extra
    267 // cost of the calls to memcpy, this method is still at least twice as fast
    268 // as memcmp (measured using gcc on a 64-bit platform, with a block size
    269 // of 32.)  For blocks with identical contents (a common case), this method
    270 // is over six times faster than memcmp.
    271 inline bool BlockCompareWordsInline(const char* block1, const char* block2) {
    272   static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
    273   return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
    274 }
    275 
    276 bool BlockHash::BlockCompareWords(const char* block1, const char* block2) {
    277   return BlockCompareWordsInline(block1, block2);
    278 }
    279 
    280 inline bool BlockContentsMatchInline(const char* block1, const char* block2) {
    281   // Optimize for mismatch in first byte.  Since this function is called only
    282   // when the hash values of the two blocks match, it is very likely that either
    283   // the blocks are identical, or else the first byte does not match.
    284   if (*block1 != *block2) {
    285     return false;
    286   }
    287 #ifdef VCDIFF_USE_BLOCK_COMPARE_WORDS
    288   return BlockCompareWordsInline(block1, block2);
    289 #else  // !VCDIFF_USE_BLOCK_COMPARE_WORDS
    290   return memcmp(block1, block2, BlockHash::kBlockSize) == 0;
    291 #endif  // VCDIFF_USE_BLOCK_COMPARE_WORDS
    292 }
    293 
    294 bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
    295   return BlockContentsMatchInline(block1, block2);
    296 }
    297 
    298 inline int BlockHash::SkipNonMatchingBlocks(int block_number,
    299                                             const char* block_ptr) const {
    300   int probes = 0;
    301   while ((block_number >= 0) &&
    302          !BlockContentsMatchInline(block_ptr,
    303                                    &source_data_[block_number * kBlockSize])) {
    304     if (++probes > kMaxProbes) {
    305       return -1;  // Avoid too much chaining
    306     }
    307     block_number = next_block_table_[block_number];
    308   }
    309   return block_number;
    310 }
    311 
    312 // Init() must have been called and returned true before using
    313 // FirstMatchingBlock or NextMatchingBlock.  No check is performed
    314 // for this condition; the code will crash if this condition is violated.
    315 inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
    316                                                const char* block_ptr) const {
    317   return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
    318                                block_ptr);
    319 }
    320 
    321 int BlockHash::FirstMatchingBlock(uint32_t hash_value,
    322                                   const char* block_ptr) const {
    323   return FirstMatchingBlockInline(hash_value, block_ptr);
    324 }
    325 
    326 int BlockHash::NextMatchingBlock(int block_number,
    327                                  const char* block_ptr) const {
    328   if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
    329     VCD_DFATAL << "NextMatchingBlock called for invalid block number "
    330                << block_number << VCD_ENDL;
    331     return -1;
    332   }
    333   return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
    334 }
    335 
    336 // Keep a count of the number of matches found.  This will throttle the
    337 // number of iterations in FindBestMatch.  For example, if the entire
    338 // dictionary is made up of spaces (' ') and the search string is also
    339 // made up of spaces, there will be one match for each block in the
    340 // dictionary.
    341 inline bool BlockHash::TooManyMatches(int* match_counter) {
    342   ++(*match_counter);
    343   return (*match_counter) > kMaxMatchesToCheck;
    344 }
    345 
    346 // Returns the number of bytes to the left of source_match_start
    347 // that match the corresponding bytes to the left of target_match_start.
    348 // Will not examine more than max_bytes bytes, which is to say that
    349 // the return value will be in the range [0, max_bytes] inclusive.
    350 int BlockHash::MatchingBytesToLeft(const char* source_match_start,
    351                                    const char* target_match_start,
    352                                    int max_bytes) {
    353   const char* source_ptr = source_match_start;
    354   const char* target_ptr = target_match_start;
    355   int bytes_found = 0;
    356   while (bytes_found < max_bytes) {
    357     --source_ptr;
    358     --target_ptr;
    359     if (*source_ptr != *target_ptr) {
    360       break;
    361     }
    362     ++bytes_found;
    363   }
    364   return bytes_found;
    365 }
    366 
    367 // Returns the number of bytes starting at source_match_end
    368 // that match the corresponding bytes starting at target_match_end.
    369 // Will not examine more than max_bytes bytes, which is to say that
    370 // the return value will be in the range [0, max_bytes] inclusive.
    371 int BlockHash::MatchingBytesToRight(const char* source_match_end,
    372                                     const char* target_match_end,
    373                                     int max_bytes) {
    374   const char* source_ptr = source_match_end;
    375   const char* target_ptr = target_match_end;
    376   int bytes_found = 0;
    377   while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
    378     ++bytes_found;
    379     ++source_ptr;
    380     ++target_ptr;
    381   }
    382   return bytes_found;
    383 }
    384 
    385 // No NULL checks are performed on the pointer arguments.  The caller
    386 // must guarantee that none of the arguments is NULL, or a crash will occur.
    387 //
    388 // The vast majority of calls to FindBestMatch enter the loop *zero* times,
    389 // which is to say that most candidate blocks find no matches in the dictionary.
    390 // The important sections for optimization are therefore the code outside the
    391 // loop and the code within the loop conditions.  Keep this to a minimum.
    392 void BlockHash::FindBestMatch(uint32_t hash_value,
    393                               const char* target_candidate_start,
    394                               const char* target_start,
    395                               size_t target_size,
    396                               Match* best_match) const {
    397   int match_counter = 0;
    398   for (int block_number = FirstMatchingBlockInline(hash_value,
    399                                                    target_candidate_start);
    400        (block_number >= 0) && !TooManyMatches(&match_counter);
    401        block_number = NextMatchingBlock(block_number, target_candidate_start)) {
    402     int source_match_offset = block_number * kBlockSize;
    403     const int source_match_end = source_match_offset + kBlockSize;
    404 
    405     int target_match_offset =
    406         static_cast<int>(target_candidate_start - target_start);
    407     const int target_match_end = target_match_offset + kBlockSize;
    408 
    409     size_t match_size = kBlockSize;
    410     {
    411       // Extend match start towards beginning of unencoded data
    412       const int limit_bytes_to_left = std::min(source_match_offset,
    413                                                target_match_offset);
    414       const int matching_bytes_to_left =
    415           MatchingBytesToLeft(source_data_ + source_match_offset,
    416                               target_start + target_match_offset,
    417                               limit_bytes_to_left);
    418       source_match_offset -= matching_bytes_to_left;
    419       target_match_offset -= matching_bytes_to_left;
    420       match_size += matching_bytes_to_left;
    421     }
    422     {
    423       // Extend match end towards end of unencoded data
    424       const size_t source_bytes_to_right = source_size_ - source_match_end;
    425       const size_t target_bytes_to_right = target_size - target_match_end;
    426       const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
    427                                                    target_bytes_to_right);
    428       match_size +=
    429           MatchingBytesToRight(source_data_ + source_match_end,
    430                                target_start + target_match_end,
    431                                static_cast<int>(limit_bytes_to_right));
    432     }
    433     // Update in/out parameter if the best match found was better
    434     // than any match already stored in *best_match.
    435     best_match->ReplaceIfBetterMatch(match_size,
    436                                      source_match_offset + starting_offset_,
    437                                      target_match_offset);
    438   }
    439 }
    440 
    441 }  // namespace open_vcdiff
    442