Home | History | Annotate | Download | only in src
      1 // Copyright 2007, 2008 Google Inc.
      2 // Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #ifndef OPEN_VCDIFF_ROLLING_HASH_H_
     17 #define OPEN_VCDIFF_ROLLING_HASH_H_
     18 
     19 #include <config.h>
     20 #include <stdint.h>  // uint32_t
     21 #include "logging.h"
     22 
     23 namespace open_vcdiff {
     24 
     25 // Rabin-Karp hasher module -- this is a faster version with different
     26 // constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is
     27 // close enough for most applications.
     28 
     29 // Definitions common to all hash window sizes.
     30 class RollingHashUtil {
     31  public:
     32   // Multiplier for incremental hashing.  The compiler should be smart enough to
     33   // convert (val * kMult) into ((val << 8) + val).
     34   static const uint32_t kMult = 257;
     35 
     36   // All hashes are returned modulo "kBase".  Current implementation requires
     37   // kBase <= 2^32/kMult to avoid overflow.  Also, kBase must be a power of two
     38   // so that we can compute modulus efficiently.
     39   static const uint32_t kBase = (1 << 23);
     40 
     41   // Returns operand % kBase, assuming that kBase is a power of two.
     42   static inline uint32_t ModBase(uint32_t operand) {
     43     return operand & (kBase - 1);
     44   }
     45 
     46   // Given an unsigned integer "operand", returns an unsigned integer "result"
     47   // such that
     48   //     result < kBase
     49   // and
     50   //     ModBase(operand + result) == 0
     51   static inline uint32_t FindModBaseInverse(uint32_t operand) {
     52     // The subtraction (0 - operand) produces an unsigned underflow for any
     53     // operand except 0.  The underflow results in a (very large) unsigned
     54     // number.  Binary subtraction is used instead of unary negation because
     55     // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned
     56     // value is negated.
     57     //
     58     // The C++ mod operation (operand % kBase) may produce different results for
     59     // different compilers if operand is negative.  That is not a problem in
     60     // this case, since all numbers used are unsigned, and ModBase does its work
     61     // using bitwise arithmetic rather than the % operator.
     62     return ModBase(uint32_t(0) - operand);
     63   }
     64 
     65   // Here's the heart of the hash algorithm.  Start with a partial_hash value of
     66   // 0, and run this HashStep once against each byte in the data window to be
     67   // hashed.  The result will be the hash value for the entire data window.  The
     68   // Hash() function, below, does exactly this, albeit with some refinements.
     69   static inline uint32_t HashStep(uint32_t partial_hash,
     70                                   unsigned char next_byte) {
     71     return ModBase((partial_hash * kMult) + next_byte);
     72   }
     73 
     74   // Use this function to start computing a new hash value based on the first
     75   // two bytes in the window.  It is equivalent to calling
     76   //     HashStep(HashStep(0, ptr[0]), ptr[1])
     77   // but takes advantage of the fact that the maximum value of
     78   // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus
     79   // avoiding an unnecessary ModBase operation.
     80   static inline uint32_t HashFirstTwoBytes(const char* ptr) {
     81     return (static_cast<unsigned char>(ptr[0]) * kMult)
     82         + static_cast<unsigned char>(ptr[1]);
     83   }
     84  private:
     85   // Making these private avoids copy constructor and assignment operator.
     86   // No objects of this type should be constructed.
     87   RollingHashUtil();
     88   RollingHashUtil(const RollingHashUtil&);  // NOLINT
     89   void operator=(const RollingHashUtil&);
     90 };
     91 
     92 // window_size must be >= 2.
     93 template<int window_size>
     94 class RollingHash {
     95  public:
     96   // Perform global initialization that is required in order to instantiate a
     97   // RollingHash.  This function *must* be called (preferably on startup) by any
     98   // program that uses a RollingHash.  It is harmless to call this function more
     99   // than once.  It is not thread-safe, but calling it from two different
    100   // threads at the same time can only cause a memory leak, not incorrect
    101   // behavior.  Make sure to call it before spawning any threads that could use
    102   // RollingHash.  The function returns true if initialization succeeds, or
    103   // false if initialization fails, in which case the caller should not proceed
    104   // to construct any objects of type RollingHash.
    105   static bool Init();
    106 
    107   // Initialize hasher to maintain a window of the specified size.  You need an
    108   // instance of this type to use UpdateHash(), but Hash() does not depend on
    109   // remove_table_, so it is static.
    110   RollingHash() {
    111     if (!remove_table_) {
    112       LOG(DFATAL) << "RollingHash object instantiated"
    113                      " before calling RollingHash::Init()" << LOG_ENDL;
    114     }
    115   }
    116 
    117   // Compute a hash of the window "ptr[0, window_size - 1]".
    118   static uint32_t Hash(const char* ptr) {
    119     uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr);
    120     for (int i = 2; i < window_size; ++i) {
    121       h = RollingHashUtil::HashStep(h, ptr[i]);
    122     }
    123     return h;
    124   }
    125 
    126   // Update a hash by removing the oldest byte and adding a new byte.
    127   //
    128   // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1]
    129   // along with the value of buffer[0] (the "old_first_byte" argument)
    130   // and the value of buffer[window_size] (the "new_last_byte" argument).
    131   // It quickly computes the hash value of buffer[1] ... buffer[window_size]
    132   // without having to run Hash() on the entire window.
    133   //
    134   // The larger the window, the more advantage comes from using UpdateHash()
    135   // (which runs in time independent of window_size) instead of Hash().
    136   // Each time window_size doubles, the time to execute Hash() also doubles,
    137   // while the time to execute UpdateHash() remains constant.  Empirical tests
    138   // have borne out this statement.
    139   uint32_t UpdateHash(uint32_t old_hash,
    140                       const char old_first_byte,
    141                       const char new_last_byte) const {
    142     uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte);
    143     return RollingHashUtil::HashStep(partial_hash, new_last_byte);
    144   }
    145 
    146  protected:
    147   // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the
    148   // value of the first byte buffer[0], this function returns a *partial* hash
    149   // value for buffer[1] ... buffer[window_size -1].  See the comments in
    150   // Init(), below, for a description of how the contents of remove_table_ are
    151   // computed.
    152   static uint32_t RemoveFirstByteFromHash(uint32_t full_hash,
    153                                           unsigned char first_byte) {
    154     return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]);
    155   }
    156 
    157  private:
    158   // We keep a table that maps from any byte "b" to
    159   //    (- b * pow(kMult, window_size - 1)) % kBase
    160   static const uint32_t* remove_table_;
    161 };
    162 
    163 // For each window_size, fill a 256-entry table such that
    164 //        the hash value of buffer[0] ... buffer[window_size - 1]
    165 //      + remove_table_[buffer[0]]
    166 //     == the hash value of buffer[1] ... buffer[window_size - 1]
    167 // See the comments in Init(), below, for a description of how the contents of
    168 // remove_table_ are computed.
    169 template<int window_size>
    170 const uint32_t* RollingHash<window_size>::remove_table_ = NULL;
    171 
    172 // Init() checks to make sure that the static object remove_table_ has been
    173 // initialized; if not, it does the considerable work of populating it.  Once
    174 // it's ready, the table can be used for any number of RollingHash objects of
    175 // the same window_size.
    176 //
    177 template<int window_size>
    178 bool RollingHash<window_size>::Init() {
    179   if (window_size < 2) {
    180     LOG(ERROR) << "RollingHash window size " << window_size
    181                << " is too small" << LOG_ENDL;
    182     return false;
    183   }
    184   if (remove_table_ == NULL) {
    185     // The new object is placed into a local pointer instead of directly into
    186     // remove_table_, for two reasons:
    187     //   1. remove_table_ is a pointer to const.  The table is populated using
    188     //      the non-const local pointer and then assigned to the global const
    189     //      pointer once it's ready.
    190     //   2. No other thread will ever see remove_table_ pointing to a
    191     //      partially-initialized table.  If two threads happen to call Init()
    192     //      at the same time, two tables with the same contents may be created
    193     //      (causing a memory leak), but the results will be consistent
    194     //      no matter which of the two tables is used.
    195     uint32_t* new_remove_table = new uint32_t[256];
    196     // Compute multiplier.  Concisely, it is:
    197     //     pow(kMult, (window_size - 1)) % kBase,
    198     // but we compute the power in integer form.
    199     uint32_t multiplier = 1;
    200     for (int i = 0; i < window_size - 1; ++i) {
    201       multiplier =
    202           RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult);
    203     }
    204     // For each character removed_byte, compute
    205     //     remove_table_[removed_byte] ==
    206     //         (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase
    207     // where the power operator "pow" is taken in integer form.
    208     //
    209     // If you take a hash value fp representing the hash of
    210     //     buffer[0] ... buffer[window_size - 1]
    211     // and add the value of remove_table_[buffer[0]] to it, the result will be
    212     // a partial hash value for
    213     //     buffer[1] ... buffer[window_size - 1]
    214     // that is to say, it no longer includes buffer[0].
    215     //
    216     // The following byte at buffer[window_size] can then be merged with this
    217     // partial hash value to arrive quickly at the hash value for a window that
    218     // has advanced by one byte, to
    219     //     buffer[1] ... buffer[window_size]
    220     // In fact, that is precisely what happens in UpdateHash, above.
    221     uint32_t byte_times_multiplier = 0;
    222     for (int removed_byte = 0; removed_byte < 256; ++removed_byte) {
    223       new_remove_table[removed_byte] =
    224           RollingHashUtil::FindModBaseInverse(byte_times_multiplier);
    225       // Iteratively adding the multiplier in this loop is equivalent to
    226       // computing (removed_byte * multiplier), and is faster
    227       byte_times_multiplier =
    228           RollingHashUtil::ModBase(byte_times_multiplier + multiplier);
    229     }
    230     remove_table_ = new_remove_table;
    231   }
    232   return true;
    233 }
    234 
    235 }  // namespace open_vcdiff
    236 
    237 #endif  // OPEN_VCDIFF_ROLLING_HASH_H_
    238