1 /* 2 * Copyright (C) 2005, 2006, 2008, 2010, 2013 Apple Inc. All rights reserved. 3 * Copyright (C) 2010 Patrick Gansterer <paroga (at) paroga.com> 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22 #ifndef WTF_StringHasher_h 23 #define WTF_StringHasher_h 24 25 #include "wtf/unicode/Unicode.h" 26 27 namespace WTF { 28 29 // Paul Hsieh's SuperFastHash 30 // http://www.azillionmonkeys.com/qed/hash.html 31 32 // LChar data is interpreted as Latin-1-encoded (zero extended to 16 bits). 33 34 // NOTE: The hash computation here must stay in sync with the create_hash_table script in 35 // JavaScriptCore and the CodeGeneratorJS.pm script in WebCore. 36 37 // Golden ratio. Arbitrary start value to avoid mapping all zeros to a hash value of zero. 38 static const unsigned stringHashingStartValue = 0x9E3779B9U; 39 40 class StringHasher { 41 public: 42 static const unsigned flagCount = 8; // Save 8 bits for StringImpl to use as flags. 43 44 StringHasher() 45 : m_hash(stringHashingStartValue) 46 , m_hasPendingCharacter(false) 47 , m_pendingCharacter(0) 48 { 49 } 50 51 // The hasher hashes two characters at a time, and thus an "aligned" hasher is one 52 // where an even number of characters have been added. Callers that always add 53 // characters two at a time can use the "assuming aligned" functions. 54 void addCharactersAssumingAligned(UChar a, UChar b) 55 { 56 ASSERT(!m_hasPendingCharacter); 57 m_hash += a; 58 m_hash = (m_hash << 16) ^ ((b << 11) ^ m_hash); 59 m_hash += m_hash >> 11; 60 } 61 62 void addCharacter(UChar character) 63 { 64 if (m_hasPendingCharacter) { 65 m_hasPendingCharacter = false; 66 addCharactersAssumingAligned(m_pendingCharacter, character); 67 return; 68 } 69 70 m_pendingCharacter = character; 71 m_hasPendingCharacter = true; 72 } 73 74 void addCharacters(UChar a, UChar b) 75 { 76 if (m_hasPendingCharacter) { 77 #if !ASSERT_DISABLED 78 m_hasPendingCharacter = false; 79 #endif 80 addCharactersAssumingAligned(m_pendingCharacter, a); 81 m_pendingCharacter = b; 82 #if !ASSERT_DISABLED 83 m_hasPendingCharacter = true; 84 #endif 85 return; 86 } 87 88 addCharactersAssumingAligned(a, b); 89 } 90 91 template<typename T, UChar Converter(T)> void addCharactersAssumingAligned(const T* data, unsigned length) 92 { 93 ASSERT(!m_hasPendingCharacter); 94 95 bool remainder = length & 1; 96 length >>= 1; 97 98 while (length--) { 99 addCharactersAssumingAligned(Converter(data[0]), Converter(data[1])); 100 data += 2; 101 } 102 103 if (remainder) 104 addCharacter(Converter(*data)); 105 } 106 107 template<typename T> void addCharactersAssumingAligned(const T* data, unsigned length) 108 { 109 addCharactersAssumingAligned<T, defaultConverter>(data, length); 110 } 111 112 template<typename T, UChar Converter(T)> void addCharactersAssumingAligned(const T* data) 113 { 114 ASSERT(!m_hasPendingCharacter); 115 116 while (T a = *data++) { 117 T b = *data++; 118 if (!b) { 119 addCharacter(Converter(a)); 120 break; 121 } 122 addCharactersAssumingAligned(Converter(a), Converter(b)); 123 } 124 } 125 126 template<typename T> void addCharactersAssumingAligned(const T* data) 127 { 128 addCharactersAssumingAligned<T, defaultConverter>(data); 129 } 130 131 template<typename T, UChar Converter(T)> void addCharacters(const T* data, unsigned length) 132 { 133 if (m_hasPendingCharacter && length) { 134 m_hasPendingCharacter = false; 135 addCharactersAssumingAligned(m_pendingCharacter, Converter(*data++)); 136 --length; 137 } 138 addCharactersAssumingAligned<T, Converter>(data, length); 139 } 140 141 template<typename T> void addCharacters(const T* data, unsigned length) 142 { 143 addCharacters<T, defaultConverter>(data, length); 144 } 145 146 template<typename T, UChar Converter(T)> void addCharacters(const T* data) 147 { 148 if (m_hasPendingCharacter && *data) { 149 m_hasPendingCharacter = false; 150 addCharactersAssumingAligned(m_pendingCharacter, Converter(*data++)); 151 } 152 addCharactersAssumingAligned<T, Converter>(data); 153 } 154 155 template<typename T> void addCharacters(const T* data) 156 { 157 addCharacters<T, defaultConverter>(data); 158 } 159 160 unsigned hashWithTop8BitsMasked() const 161 { 162 unsigned result = avalancheBits(); 163 164 // Reserving space from the high bits for flags preserves most of the hash's 165 // value, since hash lookup typically masks out the high bits anyway. 166 result &= (1U << (sizeof(result) * 8 - flagCount)) - 1; 167 168 // This avoids ever returning a hash code of 0, since that is used to 169 // signal "hash not computed yet". Setting the high bit maintains 170 // reasonable fidelity to a hash code of 0 because it is likely to yield 171 // exactly 0 when hash lookup masks out the high bits. 172 if (!result) 173 result = 0x80000000 >> flagCount; 174 175 return result; 176 } 177 178 unsigned hash() const 179 { 180 unsigned result = avalancheBits(); 181 182 // This avoids ever returning a hash code of 0, since that is used to 183 // signal "hash not computed yet". Setting the high bit maintains 184 // reasonable fidelity to a hash code of 0 because it is likely to yield 185 // exactly 0 when hash lookup masks out the high bits. 186 if (!result) 187 result = 0x80000000; 188 189 return result; 190 } 191 192 template<typename T, UChar Converter(T)> static unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length) 193 { 194 StringHasher hasher; 195 hasher.addCharactersAssumingAligned<T, Converter>(data, length); 196 return hasher.hashWithTop8BitsMasked(); 197 } 198 199 template<typename T, UChar Converter(T)> static unsigned computeHashAndMaskTop8Bits(const T* data) 200 { 201 StringHasher hasher; 202 hasher.addCharactersAssumingAligned<T, Converter>(data); 203 return hasher.hashWithTop8BitsMasked(); 204 } 205 206 template<typename T> static unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length) 207 { 208 return computeHashAndMaskTop8Bits<T, defaultConverter>(data, length); 209 } 210 211 template<typename T> static unsigned computeHashAndMaskTop8Bits(const T* data) 212 { 213 return computeHashAndMaskTop8Bits<T, defaultConverter>(data); 214 } 215 216 template<typename T, UChar Converter(T)> static unsigned computeHash(const T* data, unsigned length) 217 { 218 StringHasher hasher; 219 hasher.addCharactersAssumingAligned<T, Converter>(data, length); 220 return hasher.hash(); 221 } 222 223 template<typename T, UChar Converter(T)> static unsigned computeHash(const T* data) 224 { 225 StringHasher hasher; 226 hasher.addCharactersAssumingAligned<T, Converter>(data); 227 return hasher.hash(); 228 } 229 230 template<typename T> static unsigned computeHash(const T* data, unsigned length) 231 { 232 return computeHash<T, defaultConverter>(data, length); 233 } 234 235 template<typename T> static unsigned computeHash(const T* data) 236 { 237 return computeHash<T, defaultConverter>(data); 238 } 239 240 static unsigned hashMemory(const void* data, unsigned length) 241 { 242 // FIXME: Why does this function use the version of the hash that drops the top 8 bits? 243 // We want that for all string hashing so we can use those bits in StringImpl and hash 244 // strings consistently, but I don't see why we'd want that for general memory hashing. 245 ASSERT(!(length % 2)); 246 return computeHashAndMaskTop8Bits<UChar>(static_cast<const UChar*>(data), length / sizeof(UChar)); 247 } 248 249 template<size_t length> static unsigned hashMemory(const void* data) 250 { 251 COMPILE_ASSERT(!(length % 2), length_must_be_a_multiple_of_two); 252 return hashMemory(data, length); 253 } 254 255 private: 256 static UChar defaultConverter(UChar character) 257 { 258 return character; 259 } 260 261 static UChar defaultConverter(LChar character) 262 { 263 return character; 264 } 265 266 unsigned avalancheBits() const 267 { 268 unsigned result = m_hash; 269 270 // Handle end case. 271 if (m_hasPendingCharacter) { 272 result += m_pendingCharacter; 273 result ^= result << 11; 274 result += result >> 17; 275 } 276 277 // Force "avalanching" of final 31 bits. 278 result ^= result << 3; 279 result += result >> 5; 280 result ^= result << 2; 281 result += result >> 15; 282 result ^= result << 10; 283 284 return result; 285 } 286 287 unsigned m_hash; 288 bool m_hasPendingCharacter; 289 UChar m_pendingCharacter; 290 }; 291 292 } // namespace WTF 293 294 using WTF::StringHasher; 295 296 #endif // WTF_StringHasher_h 297