1 // Copyright 2007, 2008 Google Inc. 2 // Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef OPEN_VCDIFF_ROLLING_HASH_H_ 17 #define OPEN_VCDIFF_ROLLING_HASH_H_ 18 19 #include <config.h> 20 #include <stdint.h> // uint32_t 21 #include "compile_assert.h" 22 #include "logging.h" 23 24 namespace open_vcdiff { 25 26 // Rabin-Karp hasher module -- this is a faster version with different 27 // constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is 28 // close enough for most applications. 29 30 // Definitions common to all hash window sizes. 31 class RollingHashUtil { 32 public: 33 // Multiplier for incremental hashing. The compiler should be smart enough to 34 // convert (val * kMult) into ((val << 8) + val). 35 static const uint32_t kMult = 257; 36 37 // All hashes are returned modulo "kBase". Current implementation requires 38 // kBase <= 2^32/kMult to avoid overflow. Also, kBase must be a power of two 39 // so that we can compute modulus efficiently. 40 static const uint32_t kBase = (1 << 23); 41 42 // Returns operand % kBase, assuming that kBase is a power of two. 43 static inline uint32_t ModBase(uint32_t operand) { 44 return operand & (kBase - 1); 45 } 46 47 // Given an unsigned integer "operand", returns an unsigned integer "result" 48 // such that 49 // result < kBase 50 // and 51 // ModBase(operand + result) == 0 52 static inline uint32_t FindModBaseInverse(uint32_t operand) { 53 // The subtraction (0 - operand) produces an unsigned underflow for any 54 // operand except 0. The underflow results in a (very large) unsigned 55 // number. Binary subtraction is used instead of unary negation because 56 // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned 57 // value is negated. 58 // 59 // The C++ mod operation (operand % kBase) may produce different results for 60 // different compilers if operand is negative. That is not a problem in 61 // this case, since all numbers used are unsigned, and ModBase does its work 62 // using bitwise arithmetic rather than the % operator. 63 return ModBase(uint32_t(0) - operand); 64 } 65 66 // Here's the heart of the hash algorithm. Start with a partial_hash value of 67 // 0, and run this HashStep once against each byte in the data window to be 68 // hashed. The result will be the hash value for the entire data window. The 69 // Hash() function, below, does exactly this, albeit with some refinements. 70 static inline uint32_t HashStep(uint32_t partial_hash, 71 unsigned char next_byte) { 72 return ModBase((partial_hash * kMult) + next_byte); 73 } 74 75 // Use this function to start computing a new hash value based on the first 76 // two bytes in the window. It is equivalent to calling 77 // HashStep(HashStep(0, ptr[0]), ptr[1]) 78 // but takes advantage of the fact that the maximum value of 79 // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus 80 // avoiding an unnecessary ModBase operation. 81 static inline uint32_t HashFirstTwoBytes(const char* ptr) { 82 return (static_cast<unsigned char>(ptr[0]) * kMult) 83 + static_cast<unsigned char>(ptr[1]); 84 } 85 private: 86 // Making these private avoids copy constructor and assignment operator. 87 // No objects of this type should be constructed. 88 RollingHashUtil(); 89 RollingHashUtil(const RollingHashUtil&); // NOLINT 90 void operator=(const RollingHashUtil&); 91 }; 92 93 // window_size must be >= 2. 94 template<int window_size> 95 class RollingHash { 96 public: 97 // Perform global initialization that is required in order to instantiate a 98 // RollingHash. This function *must* be called (preferably on startup) by any 99 // program that uses a RollingHash. It is harmless to call this function more 100 // than once. It is not thread-safe, but calling it from two different 101 // threads at the same time can only cause a memory leak, not incorrect 102 // behavior. Make sure to call it before spawning any threads that could use 103 // RollingHash. 104 static void Init(); 105 106 // Initialize hasher to maintain a window of the specified size. You need an 107 // instance of this type to use UpdateHash(), but Hash() does not depend on 108 // remove_table_, so it is static. 109 RollingHash() { 110 if (!remove_table_) { 111 VCD_DFATAL << "RollingHash object instantiated" 112 " before calling RollingHash::Init()" << VCD_ENDL; 113 } 114 } 115 116 // Compute a hash of the window "ptr[0, window_size - 1]". 117 static uint32_t Hash(const char* ptr) { 118 uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr); 119 for (int i = 2; i < window_size; ++i) { 120 h = RollingHashUtil::HashStep(h, ptr[i]); 121 } 122 return h; 123 } 124 125 // Update a hash by removing the oldest byte and adding a new byte. 126 // 127 // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1] 128 // along with the value of buffer[0] (the "old_first_byte" argument) 129 // and the value of buffer[window_size] (the "new_last_byte" argument). 130 // It quickly computes the hash value of buffer[1] ... buffer[window_size] 131 // without having to run Hash() on the entire window. 132 // 133 // The larger the window, the more advantage comes from using UpdateHash() 134 // (which runs in time independent of window_size) instead of Hash(). 135 // Each time window_size doubles, the time to execute Hash() also doubles, 136 // while the time to execute UpdateHash() remains constant. Empirical tests 137 // have borne out this statement. 138 uint32_t UpdateHash(uint32_t old_hash, 139 const char old_first_byte, 140 const char new_last_byte) const { 141 uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte); 142 return RollingHashUtil::HashStep(partial_hash, new_last_byte); 143 } 144 145 protected: 146 // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the 147 // value of the first byte buffer[0], this function returns a *partial* hash 148 // value for buffer[1] ... buffer[window_size -1]. See the comments in 149 // Init(), below, for a description of how the contents of remove_table_ are 150 // computed. 151 static uint32_t RemoveFirstByteFromHash(uint32_t full_hash, 152 unsigned char first_byte) { 153 return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]); 154 } 155 156 private: 157 // We keep a table that maps from any byte "b" to 158 // (- b * pow(kMult, window_size - 1)) % kBase 159 static const uint32_t* remove_table_; 160 }; 161 162 // For each window_size, fill a 256-entry table such that 163 // the hash value of buffer[0] ... buffer[window_size - 1] 164 // + remove_table_[buffer[0]] 165 // == the hash value of buffer[1] ... buffer[window_size - 1] 166 // See the comments in Init(), below, for a description of how the contents of 167 // remove_table_ are computed. 168 template<int window_size> 169 const uint32_t* RollingHash<window_size>::remove_table_ = NULL; 170 171 // Init() checks to make sure that the static object remove_table_ has been 172 // initialized; if not, it does the considerable work of populating it. Once 173 // it's ready, the table can be used for any number of RollingHash objects of 174 // the same window_size. 175 // 176 template<int window_size> 177 void RollingHash<window_size>::Init() { 178 VCD_COMPILE_ASSERT(window_size >= 2, 179 RollingHash_window_size_must_be_at_least_2); 180 if (remove_table_ == NULL) { 181 // The new object is placed into a local pointer instead of directly into 182 // remove_table_, for two reasons: 183 // 1. remove_table_ is a pointer to const. The table is populated using 184 // the non-const local pointer and then assigned to the global const 185 // pointer once it's ready. 186 // 2. No other thread will ever see remove_table_ pointing to a 187 // partially-initialized table. If two threads happen to call Init() 188 // at the same time, two tables with the same contents may be created 189 // (causing a memory leak), but the results will be consistent 190 // no matter which of the two tables is used. 191 uint32_t* new_remove_table = new uint32_t[256]; 192 // Compute multiplier. Concisely, it is: 193 // pow(kMult, (window_size - 1)) % kBase, 194 // but we compute the power in integer form. 195 uint32_t multiplier = 1; 196 for (int i = 0; i < window_size - 1; ++i) { 197 multiplier = 198 RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult); 199 } 200 // For each character removed_byte, compute 201 // remove_table_[removed_byte] == 202 // (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase 203 // where the power operator "pow" is taken in integer form. 204 // 205 // If you take a hash value fp representing the hash of 206 // buffer[0] ... buffer[window_size - 1] 207 // and add the value of remove_table_[buffer[0]] to it, the result will be 208 // a partial hash value for 209 // buffer[1] ... buffer[window_size - 1] 210 // that is to say, it no longer includes buffer[0]. 211 // 212 // The following byte at buffer[window_size] can then be merged with this 213 // partial hash value to arrive quickly at the hash value for a window that 214 // has advanced by one byte, to 215 // buffer[1] ... buffer[window_size] 216 // In fact, that is precisely what happens in UpdateHash, above. 217 uint32_t byte_times_multiplier = 0; 218 for (int removed_byte = 0; removed_byte < 256; ++removed_byte) { 219 new_remove_table[removed_byte] = 220 RollingHashUtil::FindModBaseInverse(byte_times_multiplier); 221 // Iteratively adding the multiplier in this loop is equivalent to 222 // computing (removed_byte * multiplier), and is faster 223 byte_times_multiplier = 224 RollingHashUtil::ModBase(byte_times_multiplier + multiplier); 225 } 226 remove_table_ = new_remove_table; 227 } 228 } 229 230 } // namespace open_vcdiff 231 232 #endif // OPEN_VCDIFF_ROLLING_HASH_H_ 233