1 /* Set of hash utility functions to help maintaining the invariant that 2 if a==b then hash(a)==hash(b) 3 4 All the utility functions (_Py_Hash*()) return "-1" to signify an error. 5 */ 6 #include "Python.h" 7 8 #ifdef __APPLE__ 9 # include <libkern/OSByteOrder.h> 10 #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H) 11 # include <endian.h> 12 #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H) 13 # include <sys/endian.h> 14 #endif 15 16 #ifdef __cplusplus 17 extern "C" { 18 #endif 19 20 _Py_HashSecret_t _Py_HashSecret = {{0}}; 21 22 #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL 23 extern PyHash_FuncDef PyHash_Func; 24 #else 25 static PyHash_FuncDef PyHash_Func; 26 #endif 27 28 /* Count _Py_HashBytes() calls */ 29 #ifdef Py_HASH_STATS 30 #define Py_HASH_STATS_MAX 32 31 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0}; 32 #endif 33 34 /* For numeric types, the hash of a number x is based on the reduction 35 of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that 36 hash(x) == hash(y) whenever x and y are numerically equal, even if 37 x and y have different types. 38 39 A quick summary of the hashing strategy: 40 41 (1) First define the 'reduction of x modulo P' for any rational 42 number x; this is a standard extension of the usual notion of 43 reduction modulo P for integers. If x == p/q (written in lowest 44 terms), the reduction is interpreted as the reduction of p times 45 the inverse of the reduction of q, all modulo P; if q is exactly 46 divisible by P then define the reduction to be infinity. So we've 47 got a well-defined map 48 49 reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }. 50 51 (2) Now for a rational number x, define hash(x) by: 52 53 reduce(x) if x >= 0 54 -reduce(-x) if x < 0 55 56 If the result of the reduction is infinity (this is impossible for 57 integers, floats and Decimals) then use the predefined hash value 58 _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead. 59 _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the 60 hashes of float and Decimal infinities and nans. 61 62 A selling point for the above strategy is that it makes it possible 63 to compute hashes of decimal and binary floating-point numbers 64 efficiently, even if the exponent of the binary or decimal number 65 is large. The key point is that 66 67 reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS) 68 69 provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a 70 binary or decimal float is never infinity, since the denominator is a power 71 of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have, 72 for nonnegative x, 73 74 reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS 75 76 reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS 77 78 and reduce(10**e) can be computed efficiently by the usual modular 79 exponentiation algorithm. For reduce(2**e) it's even better: since 80 P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication 81 by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits. 82 83 */ 84 85 Py_hash_t 86 _Py_HashDouble(double v) 87 { 88 int e, sign; 89 double m; 90 Py_uhash_t x, y; 91 92 if (!Py_IS_FINITE(v)) { 93 if (Py_IS_INFINITY(v)) 94 return v > 0 ? _PyHASH_INF : -_PyHASH_INF; 95 else 96 return _PyHASH_NAN; 97 } 98 99 m = frexp(v, &e); 100 101 sign = 1; 102 if (m < 0) { 103 sign = -1; 104 m = -m; 105 } 106 107 /* process 28 bits at a time; this should work well both for binary 108 and hexadecimal floating point. */ 109 x = 0; 110 while (m) { 111 x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28); 112 m *= 268435456.0; /* 2**28 */ 113 e -= 28; 114 y = (Py_uhash_t)m; /* pull out integer part */ 115 m -= y; 116 x += y; 117 if (x >= _PyHASH_MODULUS) 118 x -= _PyHASH_MODULUS; 119 } 120 121 /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */ 122 e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS); 123 x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e); 124 125 x = x * sign; 126 if (x == (Py_uhash_t)-1) 127 x = (Py_uhash_t)-2; 128 return (Py_hash_t)x; 129 } 130 131 Py_hash_t 132 _Py_HashPointer(void *p) 133 { 134 Py_hash_t x; 135 size_t y = (size_t)p; 136 /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid 137 excessive hash collisions for dicts and sets */ 138 y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4)); 139 x = (Py_hash_t)y; 140 if (x == -1) 141 x = -2; 142 return x; 143 } 144 145 Py_hash_t 146 _Py_HashBytes(const void *src, Py_ssize_t len) 147 { 148 Py_hash_t x; 149 /* 150 We make the hash of the empty string be 0, rather than using 151 (prefix ^ suffix), since this slightly obfuscates the hash secret 152 */ 153 if (len == 0) { 154 return 0; 155 } 156 157 #ifdef Py_HASH_STATS 158 hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++; 159 #endif 160 161 #if Py_HASH_CUTOFF > 0 162 if (len < Py_HASH_CUTOFF) { 163 /* Optimize hashing of very small strings with inline DJBX33A. */ 164 Py_uhash_t hash; 165 const unsigned char *p = src; 166 hash = 5381; /* DJBX33A starts with 5381 */ 167 168 switch(len) { 169 /* ((hash << 5) + hash) + *p == hash * 33 + *p */ 170 case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ 171 case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ 172 case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ 173 case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ 174 case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ 175 case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ 176 case 1: hash = ((hash << 5) + hash) + *p++; break; 177 default: 178 Py_UNREACHABLE(); 179 } 180 hash ^= len; 181 hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix; 182 x = (Py_hash_t)hash; 183 } 184 else 185 #endif /* Py_HASH_CUTOFF */ 186 x = PyHash_Func.hash(src, len); 187 188 if (x == -1) 189 return -2; 190 return x; 191 } 192 193 void 194 _PyHash_Fini(void) 195 { 196 #ifdef Py_HASH_STATS 197 int i; 198 Py_ssize_t total = 0; 199 const char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n"; 200 201 fprintf(stderr, "len calls total\n"); 202 for (i = 1; i <= Py_HASH_STATS_MAX; i++) { 203 total += hashstats[i]; 204 fprintf(stderr, fmt, i, hashstats[i], total); 205 } 206 total += hashstats[0]; 207 fprintf(stderr, "> %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n", 208 hashstats[0], total); 209 #endif 210 } 211 212 PyHash_FuncDef * 213 PyHash_GetFuncDef(void) 214 { 215 return &PyHash_Func; 216 } 217 218 /* Optimized memcpy() for Windows */ 219 #ifdef _MSC_VER 220 # if SIZEOF_PY_UHASH_T == 4 221 # define PY_UHASH_CPY(dst, src) do { \ 222 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \ 223 } while(0) 224 # elif SIZEOF_PY_UHASH_T == 8 225 # define PY_UHASH_CPY(dst, src) do { \ 226 dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \ 227 dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \ 228 } while(0) 229 # else 230 # error SIZEOF_PY_UHASH_T must be 4 or 8 231 # endif /* SIZEOF_PY_UHASH_T */ 232 #else /* not Windows */ 233 # define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T) 234 #endif /* _MSC_VER */ 235 236 237 #if Py_HASH_ALGORITHM == Py_HASH_FNV 238 /* ************************************************************************** 239 * Modified Fowler-Noll-Vo (FNV) hash function 240 */ 241 static Py_hash_t 242 fnv(const void *src, Py_ssize_t len) 243 { 244 const unsigned char *p = src; 245 Py_uhash_t x; 246 Py_ssize_t remainder, blocks; 247 union { 248 Py_uhash_t value; 249 unsigned char bytes[SIZEOF_PY_UHASH_T]; 250 } block; 251 252 #ifdef Py_DEBUG 253 assert(_Py_HashSecret_Initialized); 254 #endif 255 remainder = len % SIZEOF_PY_UHASH_T; 256 if (remainder == 0) { 257 /* Process at least one block byte by byte to reduce hash collisions 258 * for strings with common prefixes. */ 259 remainder = SIZEOF_PY_UHASH_T; 260 } 261 blocks = (len - remainder) / SIZEOF_PY_UHASH_T; 262 263 x = (Py_uhash_t) _Py_HashSecret.fnv.prefix; 264 x ^= (Py_uhash_t) *p << 7; 265 while (blocks--) { 266 PY_UHASH_CPY(block.bytes, p); 267 x = (_PyHASH_MULTIPLIER * x) ^ block.value; 268 p += SIZEOF_PY_UHASH_T; 269 } 270 /* add remainder */ 271 for (; remainder > 0; remainder--) 272 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++; 273 x ^= (Py_uhash_t) len; 274 x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix; 275 if (x == (Py_uhash_t) -1) { 276 x = (Py_uhash_t) -2; 277 } 278 return x; 279 } 280 281 static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T, 282 16 * SIZEOF_PY_HASH_T}; 283 284 #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */ 285 286 287 /* ************************************************************************** 288 <MIT License> 289 Copyright (c) 2013 Marek Majkowski <marek (at) popcount.org> 290 291 Permission is hereby granted, free of charge, to any person obtaining a copy 292 of this software and associated documentation files (the "Software"), to deal 293 in the Software without restriction, including without limitation the rights 294 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 295 copies of the Software, and to permit persons to whom the Software is 296 furnished to do so, subject to the following conditions: 297 298 The above copyright notice and this permission notice shall be included in 299 all copies or substantial portions of the Software. 300 301 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 302 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 303 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 304 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 305 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 306 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 307 THE SOFTWARE. 308 </MIT License> 309 310 Original location: 311 https://github.com/majek/csiphash/ 312 313 Solution inspired by code from: 314 Samuel Neves (supercop/crypto_auth/siphash24/little) 315 djb (supercop/crypto_auth/siphash24/little2) 316 Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) 317 318 Modified for Python by Christian Heimes: 319 - C89 / MSVC compatibility 320 - _rotl64() on Windows 321 - letoh64() fallback 322 */ 323 324 /* byte swap little endian to host endian 325 * Endian conversion not only ensures that the hash function returns the same 326 * value on all platforms. It is also required to for a good dispersion of 327 * the hash values' least significant bits. 328 */ 329 #if PY_LITTLE_ENDIAN 330 # define _le64toh(x) ((uint64_t)(x)) 331 #elif defined(__APPLE__) 332 # define _le64toh(x) OSSwapLittleToHostInt64(x) 333 #elif defined(HAVE_LETOH64) 334 # define _le64toh(x) le64toh(x) 335 #else 336 # define _le64toh(x) (((uint64_t)(x) << 56) | \ 337 (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \ 338 (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \ 339 (((uint64_t)(x) << 8) & 0xff00000000ULL) | \ 340 (((uint64_t)(x) >> 8) & 0xff000000ULL) | \ 341 (((uint64_t)(x) >> 24) & 0xff0000ULL) | \ 342 (((uint64_t)(x) >> 40) & 0xff00ULL) | \ 343 ((uint64_t)(x) >> 56)) 344 #endif 345 346 347 #ifdef _MSC_VER 348 # define ROTATE(x, b) _rotl64(x, b) 349 #else 350 # define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) ) 351 #endif 352 353 #define HALF_ROUND(a,b,c,d,s,t) \ 354 a += b; c += d; \ 355 b = ROTATE(b, s) ^ a; \ 356 d = ROTATE(d, t) ^ c; \ 357 a = ROTATE(a, 32); 358 359 #define DOUBLE_ROUND(v0,v1,v2,v3) \ 360 HALF_ROUND(v0,v1,v2,v3,13,16); \ 361 HALF_ROUND(v2,v1,v0,v3,17,21); \ 362 HALF_ROUND(v0,v1,v2,v3,13,16); \ 363 HALF_ROUND(v2,v1,v0,v3,17,21); 364 365 366 static uint64_t 367 siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) { 368 uint64_t b = (uint64_t)src_sz << 56; 369 const uint8_t *in = (uint8_t*)src; 370 371 uint64_t v0 = k0 ^ 0x736f6d6570736575ULL; 372 uint64_t v1 = k1 ^ 0x646f72616e646f6dULL; 373 uint64_t v2 = k0 ^ 0x6c7967656e657261ULL; 374 uint64_t v3 = k1 ^ 0x7465646279746573ULL; 375 376 uint64_t t; 377 uint8_t *pt; 378 379 while (src_sz >= 8) { 380 uint64_t mi; 381 memcpy(&mi, in, sizeof(mi)); 382 mi = _le64toh(mi); 383 in += sizeof(mi); 384 src_sz -= sizeof(mi); 385 v3 ^= mi; 386 DOUBLE_ROUND(v0,v1,v2,v3); 387 v0 ^= mi; 388 } 389 390 t = 0; 391 pt = (uint8_t *)&t; 392 switch (src_sz) { 393 case 7: pt[6] = in[6]; /* fall through */ 394 case 6: pt[5] = in[5]; /* fall through */ 395 case 5: pt[4] = in[4]; /* fall through */ 396 case 4: memcpy(pt, in, sizeof(uint32_t)); break; 397 case 3: pt[2] = in[2]; /* fall through */ 398 case 2: pt[1] = in[1]; /* fall through */ 399 case 1: pt[0] = in[0]; /* fall through */ 400 } 401 b |= _le64toh(t); 402 403 v3 ^= b; 404 DOUBLE_ROUND(v0,v1,v2,v3); 405 v0 ^= b; 406 v2 ^= 0xff; 407 DOUBLE_ROUND(v0,v1,v2,v3); 408 DOUBLE_ROUND(v0,v1,v2,v3); 409 410 /* modified */ 411 t = (v0 ^ v1) ^ (v2 ^ v3); 412 return t; 413 } 414 415 static Py_hash_t 416 pysiphash(const void *src, Py_ssize_t src_sz) { 417 return (Py_hash_t)siphash24( 418 _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1), 419 src, src_sz); 420 } 421 422 uint64_t 423 _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz) 424 { 425 return siphash24(key, 0, src, src_sz); 426 } 427 428 429 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 430 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128}; 431 #endif 432 433 #ifdef __cplusplus 434 } 435 #endif 436