Home | History | Annotate | Download | only in Python
      1 /* Set of hash utility functions to help maintaining the invariant that
      2     if a==b then hash(a)==hash(b)
      3 
      4    All the utility functions (_Py_Hash*()) return "-1" to signify an error.
      5 */
      6 #include "Python.h"
      7 
      8 #ifdef __APPLE__
      9 #  include <libkern/OSByteOrder.h>
     10 #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
     11 #  include <endian.h>
     12 #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
     13 #  include <sys/endian.h>
     14 #endif
     15 
     16 #ifdef __cplusplus
     17 extern "C" {
     18 #endif
     19 
     20 _Py_HashSecret_t _Py_HashSecret = {{0}};
     21 
     22 #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
     23 extern PyHash_FuncDef PyHash_Func;
     24 #else
     25 static PyHash_FuncDef PyHash_Func;
     26 #endif
     27 
     28 /* Count _Py_HashBytes() calls */
     29 #ifdef Py_HASH_STATS
     30 #define Py_HASH_STATS_MAX 32
     31 static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
     32 #endif
     33 
     34 /* For numeric types, the hash of a number x is based on the reduction
     35    of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
     36    hash(x) == hash(y) whenever x and y are numerically equal, even if
     37    x and y have different types.
     38 
     39    A quick summary of the hashing strategy:
     40 
     41    (1) First define the 'reduction of x modulo P' for any rational
     42    number x; this is a standard extension of the usual notion of
     43    reduction modulo P for integers.  If x == p/q (written in lowest
     44    terms), the reduction is interpreted as the reduction of p times
     45    the inverse of the reduction of q, all modulo P; if q is exactly
     46    divisible by P then define the reduction to be infinity.  So we've
     47    got a well-defined map
     48 
     49       reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
     50 
     51    (2) Now for a rational number x, define hash(x) by:
     52 
     53       reduce(x)   if x >= 0
     54       -reduce(-x) if x < 0
     55 
     56    If the result of the reduction is infinity (this is impossible for
     57    integers, floats and Decimals) then use the predefined hash value
     58    _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
     59    _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
     60    hashes of float and Decimal infinities and nans.
     61 
     62    A selling point for the above strategy is that it makes it possible
     63    to compute hashes of decimal and binary floating-point numbers
     64    efficiently, even if the exponent of the binary or decimal number
     65    is large.  The key point is that
     66 
     67       reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
     68 
     69    provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
     70    binary or decimal float is never infinity, since the denominator is a power
     71    of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
     72    for nonnegative x,
     73 
     74       reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
     75 
     76       reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
     77 
     78    and reduce(10**e) can be computed efficiently by the usual modular
     79    exponentiation algorithm.  For reduce(2**e) it's even better: since
     80    P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
     81    by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
     82 
     83    */
     84 
     85 Py_hash_t
     86 _Py_HashDouble(double v)
     87 {
     88     int e, sign;
     89     double m;
     90     Py_uhash_t x, y;
     91 
     92     if (!Py_IS_FINITE(v)) {
     93         if (Py_IS_INFINITY(v))
     94             return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
     95         else
     96             return _PyHASH_NAN;
     97     }
     98 
     99     m = frexp(v, &e);
    100 
    101     sign = 1;
    102     if (m < 0) {
    103         sign = -1;
    104         m = -m;
    105     }
    106 
    107     /* process 28 bits at a time;  this should work well both for binary
    108        and hexadecimal floating point. */
    109     x = 0;
    110     while (m) {
    111         x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
    112         m *= 268435456.0;  /* 2**28 */
    113         e -= 28;
    114         y = (Py_uhash_t)m;  /* pull out integer part */
    115         m -= y;
    116         x += y;
    117         if (x >= _PyHASH_MODULUS)
    118             x -= _PyHASH_MODULUS;
    119     }
    120 
    121     /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
    122     e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
    123     x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
    124 
    125     x = x * sign;
    126     if (x == (Py_uhash_t)-1)
    127         x = (Py_uhash_t)-2;
    128     return (Py_hash_t)x;
    129 }
    130 
    131 Py_hash_t
    132 _Py_HashPointer(void *p)
    133 {
    134     Py_hash_t x;
    135     size_t y = (size_t)p;
    136     /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
    137        excessive hash collisions for dicts and sets */
    138     y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
    139     x = (Py_hash_t)y;
    140     if (x == -1)
    141         x = -2;
    142     return x;
    143 }
    144 
    145 Py_hash_t
    146 _Py_HashBytes(const void *src, Py_ssize_t len)
    147 {
    148     Py_hash_t x;
    149     /*
    150       We make the hash of the empty string be 0, rather than using
    151       (prefix ^ suffix), since this slightly obfuscates the hash secret
    152     */
    153     if (len == 0) {
    154         return 0;
    155     }
    156 
    157 #ifdef Py_HASH_STATS
    158     hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
    159 #endif
    160 
    161 #if Py_HASH_CUTOFF > 0
    162     if (len < Py_HASH_CUTOFF) {
    163         /* Optimize hashing of very small strings with inline DJBX33A. */
    164         Py_uhash_t hash;
    165         const unsigned char *p = src;
    166         hash = 5381; /* DJBX33A starts with 5381 */
    167 
    168         switch(len) {
    169             /* ((hash << 5) + hash) + *p == hash * 33 + *p */
    170             case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
    171             case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
    172             case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
    173             case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
    174             case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
    175             case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
    176             case 1: hash = ((hash << 5) + hash) + *p++; break;
    177             default:
    178                 Py_UNREACHABLE();
    179         }
    180         hash ^= len;
    181         hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
    182         x = (Py_hash_t)hash;
    183     }
    184     else
    185 #endif /* Py_HASH_CUTOFF */
    186         x = PyHash_Func.hash(src, len);
    187 
    188     if (x == -1)
    189         return -2;
    190     return x;
    191 }
    192 
    193 void
    194 _PyHash_Fini(void)
    195 {
    196 #ifdef Py_HASH_STATS
    197     int i;
    198     Py_ssize_t total = 0;
    199     const char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n";
    200 
    201     fprintf(stderr, "len   calls    total\n");
    202     for (i = 1; i <= Py_HASH_STATS_MAX; i++) {
    203         total += hashstats[i];
    204         fprintf(stderr, fmt, i, hashstats[i], total);
    205     }
    206     total += hashstats[0];
    207     fprintf(stderr, ">  %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n",
    208             hashstats[0], total);
    209 #endif
    210 }
    211 
    212 PyHash_FuncDef *
    213 PyHash_GetFuncDef(void)
    214 {
    215     return &PyHash_Func;
    216 }
    217 
    218 /* Optimized memcpy() for Windows */
    219 #ifdef _MSC_VER
    220 #  if SIZEOF_PY_UHASH_T == 4
    221 #    define PY_UHASH_CPY(dst, src) do {                                    \
    222        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
    223        } while(0)
    224 #  elif SIZEOF_PY_UHASH_T == 8
    225 #    define PY_UHASH_CPY(dst, src) do {                                    \
    226        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
    227        dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
    228        } while(0)
    229 #  else
    230 #    error SIZEOF_PY_UHASH_T must be 4 or 8
    231 #  endif /* SIZEOF_PY_UHASH_T */
    232 #else /* not Windows */
    233 #  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
    234 #endif /* _MSC_VER */
    235 
    236 
    237 #if Py_HASH_ALGORITHM == Py_HASH_FNV
    238 /* **************************************************************************
    239  * Modified Fowler-Noll-Vo (FNV) hash function
    240  */
    241 static Py_hash_t
    242 fnv(const void *src, Py_ssize_t len)
    243 {
    244     const unsigned char *p = src;
    245     Py_uhash_t x;
    246     Py_ssize_t remainder, blocks;
    247     union {
    248         Py_uhash_t value;
    249         unsigned char bytes[SIZEOF_PY_UHASH_T];
    250     } block;
    251 
    252 #ifdef Py_DEBUG
    253     assert(_Py_HashSecret_Initialized);
    254 #endif
    255     remainder = len % SIZEOF_PY_UHASH_T;
    256     if (remainder == 0) {
    257         /* Process at least one block byte by byte to reduce hash collisions
    258          * for strings with common prefixes. */
    259         remainder = SIZEOF_PY_UHASH_T;
    260     }
    261     blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
    262 
    263     x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
    264     x ^= (Py_uhash_t) *p << 7;
    265     while (blocks--) {
    266         PY_UHASH_CPY(block.bytes, p);
    267         x = (_PyHASH_MULTIPLIER * x) ^ block.value;
    268         p += SIZEOF_PY_UHASH_T;
    269     }
    270     /* add remainder */
    271     for (; remainder > 0; remainder--)
    272         x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
    273     x ^= (Py_uhash_t) len;
    274     x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
    275     if (x == (Py_uhash_t) -1) {
    276         x = (Py_uhash_t) -2;
    277     }
    278     return x;
    279 }
    280 
    281 static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
    282                                      16 * SIZEOF_PY_HASH_T};
    283 
    284 #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
    285 
    286 
    287 /* **************************************************************************
    288  <MIT License>
    289  Copyright (c) 2013  Marek Majkowski <marek (at) popcount.org>
    290 
    291  Permission is hereby granted, free of charge, to any person obtaining a copy
    292  of this software and associated documentation files (the "Software"), to deal
    293  in the Software without restriction, including without limitation the rights
    294  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    295  copies of the Software, and to permit persons to whom the Software is
    296  furnished to do so, subject to the following conditions:
    297 
    298  The above copyright notice and this permission notice shall be included in
    299  all copies or substantial portions of the Software.
    300 
    301  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    302  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    303  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    304  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    305  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    306  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    307  THE SOFTWARE.
    308  </MIT License>
    309 
    310  Original location:
    311     https://github.com/majek/csiphash/
    312 
    313  Solution inspired by code from:
    314     Samuel Neves (supercop/crypto_auth/siphash24/little)
    315     djb (supercop/crypto_auth/siphash24/little2)
    316     Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
    317 
    318  Modified for Python by Christian Heimes:
    319     - C89 / MSVC compatibility
    320     - _rotl64() on Windows
    321     - letoh64() fallback
    322 */
    323 
    324 /* byte swap little endian to host endian
    325  * Endian conversion not only ensures that the hash function returns the same
    326  * value on all platforms. It is also required to for a good dispersion of
    327  * the hash values' least significant bits.
    328  */
    329 #if PY_LITTLE_ENDIAN
    330 #  define _le64toh(x) ((uint64_t)(x))
    331 #elif defined(__APPLE__)
    332 #  define _le64toh(x) OSSwapLittleToHostInt64(x)
    333 #elif defined(HAVE_LETOH64)
    334 #  define _le64toh(x) le64toh(x)
    335 #else
    336 #  define _le64toh(x) (((uint64_t)(x) << 56) | \
    337                       (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
    338                       (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
    339                       (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
    340                       (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
    341                       (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
    342                       (((uint64_t)(x) >> 40) & 0xff00ULL) | \
    343                       ((uint64_t)(x)  >> 56))
    344 #endif
    345 
    346 
    347 #ifdef _MSC_VER
    348 #  define ROTATE(x, b)  _rotl64(x, b)
    349 #else
    350 #  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
    351 #endif
    352 
    353 #define HALF_ROUND(a,b,c,d,s,t)         \
    354     a += b; c += d;             \
    355     b = ROTATE(b, s) ^ a;           \
    356     d = ROTATE(d, t) ^ c;           \
    357     a = ROTATE(a, 32);
    358 
    359 #define DOUBLE_ROUND(v0,v1,v2,v3)       \
    360     HALF_ROUND(v0,v1,v2,v3,13,16);      \
    361     HALF_ROUND(v2,v1,v0,v3,17,21);      \
    362     HALF_ROUND(v0,v1,v2,v3,13,16);      \
    363     HALF_ROUND(v2,v1,v0,v3,17,21);
    364 
    365 
    366 static uint64_t
    367 siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
    368     uint64_t b = (uint64_t)src_sz << 56;
    369     const uint8_t *in = (uint8_t*)src;
    370 
    371     uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
    372     uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
    373     uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
    374     uint64_t v3 = k1 ^ 0x7465646279746573ULL;
    375 
    376     uint64_t t;
    377     uint8_t *pt;
    378 
    379     while (src_sz >= 8) {
    380         uint64_t mi;
    381         memcpy(&mi, in, sizeof(mi));
    382         mi = _le64toh(mi);
    383         in += sizeof(mi);
    384         src_sz -= sizeof(mi);
    385         v3 ^= mi;
    386         DOUBLE_ROUND(v0,v1,v2,v3);
    387         v0 ^= mi;
    388     }
    389 
    390     t = 0;
    391     pt = (uint8_t *)&t;
    392     switch (src_sz) {
    393         case 7: pt[6] = in[6]; /* fall through */
    394         case 6: pt[5] = in[5]; /* fall through */
    395         case 5: pt[4] = in[4]; /* fall through */
    396         case 4: memcpy(pt, in, sizeof(uint32_t)); break;
    397         case 3: pt[2] = in[2]; /* fall through */
    398         case 2: pt[1] = in[1]; /* fall through */
    399         case 1: pt[0] = in[0]; /* fall through */
    400     }
    401     b |= _le64toh(t);
    402 
    403     v3 ^= b;
    404     DOUBLE_ROUND(v0,v1,v2,v3);
    405     v0 ^= b;
    406     v2 ^= 0xff;
    407     DOUBLE_ROUND(v0,v1,v2,v3);
    408     DOUBLE_ROUND(v0,v1,v2,v3);
    409 
    410     /* modified */
    411     t = (v0 ^ v1) ^ (v2 ^ v3);
    412     return t;
    413 }
    414 
    415 static Py_hash_t
    416 pysiphash(const void *src, Py_ssize_t src_sz) {
    417     return (Py_hash_t)siphash24(
    418         _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
    419         src, src_sz);
    420 }
    421 
    422 uint64_t
    423 _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
    424 {
    425     return siphash24(key, 0, src, src_sz);
    426 }
    427 
    428 
    429 #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
    430 static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
    431 #endif
    432 
    433 #ifdef __cplusplus
    434 }
    435 #endif
    436