Home | History | Annotate | Download | only in src
      1 #pragma once
      2 
      3 #include "Types.h"
      4 
      5 #include <math.h>
      6 #include <vector>
      7 #include <map>
      8 #include <algorithm>   // for std::sort
      9 #include <string.h>    // for memset
     10 #include <stdio.h>     // for printf
     11 
     12 double calcScore ( const int * bins, const int bincount, const int ballcount );
     13 
     14 void plot ( double n );
     15 
     16 inline double ExpectedCollisions ( double balls, double bins )
     17 {
     18   return balls - bins + bins * pow(1 - 1/bins,balls);
     19 }
     20 
     21 double chooseK ( int b, int k );
     22 double chooseUpToK ( int n, int k );
     23 
     24 //-----------------------------------------------------------------------------
     25 
     26 inline uint32_t f3mix ( uint32_t k )
     27 {
     28   k ^= k >> 16;
     29   k *= 0x85ebca6b;
     30   k ^= k >> 13;
     31   k *= 0xc2b2ae35;
     32   k ^= k >> 16;
     33 
     34   return k;
     35 }
     36 
     37 //-----------------------------------------------------------------------------
     38 // Sort the hash list, count the total number of collisions and return
     39 // the first N collisions for further processing
     40 
     41 template< typename hashtype >
     42 int FindCollisions ( std::vector<hashtype> & hashes,
     43                      HashSet<hashtype> & collisions,
     44                      int maxCollisions )
     45 {
     46   int collcount = 0;
     47 
     48   std::sort(hashes.begin(),hashes.end());
     49 
     50   for(size_t i = 1; i < hashes.size(); i++)
     51   {
     52     if(hashes[i] == hashes[i-1])
     53     {
     54       collcount++;
     55 
     56       if((int)collisions.size() < maxCollisions)
     57       {
     58         collisions.insert(hashes[i]);
     59       }
     60     }
     61   }
     62 
     63   return collcount;
     64 }
     65 
     66 //-----------------------------------------------------------------------------
     67 
     68 template < class keytype, typename hashtype >
     69 int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
     70 {
     71   int collcount = 0;
     72 
     73   typedef std::map<hashtype,keytype> htab;
     74   htab tab;
     75 
     76   for(size_t i = 1; i < keys.size(); i++)
     77   {
     78     keytype & k1 = keys[i];
     79 
     80     hashtype h = hash(&k1,sizeof(keytype),0);
     81 
     82     typename htab::iterator it = tab.find(h);
     83 
     84     if(it != tab.end())
     85     {
     86       keytype & k2 = (*it).second;
     87 
     88       printf("A: ");
     89       printbits(&k1,sizeof(keytype));
     90       printf("B: ");
     91       printbits(&k2,sizeof(keytype));
     92     }
     93     else
     94     {
     95       tab.insert( std::make_pair(h,k1) );
     96     }
     97   }
     98 
     99   return collcount;
    100 }
    101 
    102 //----------------------------------------------------------------------------
    103 // Measure the distribution "score" for each possible N-bit span up to 20 bits
    104 
    105 template< typename hashtype >
    106 double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
    107 {
    108   printf("Testing distribution - ");
    109 
    110   if(drawDiagram) printf("\n");
    111 
    112   const int hashbits = sizeof(hashtype) * 8;
    113 
    114   int maxwidth = 20;
    115 
    116   // We need at least 5 keys per bin to reliably test distribution biases
    117   // down to 1%, so don't bother to test sparser distributions than that
    118 
    119   while(double(hashes.size()) / double(1 << maxwidth) < 5.0)
    120   {
    121     maxwidth--;
    122   }
    123 
    124   std::vector<int> bins;
    125   bins.resize(1 << maxwidth);
    126 
    127   double worst = 0;
    128   int worstStart = -1;
    129   int worstWidth = -1;
    130 
    131   for(int start = 0; start < hashbits; start++)
    132   {
    133     int width = maxwidth;
    134     int bincount = (1 << width);
    135 
    136     memset(&bins[0],0,sizeof(int)*bincount);
    137 
    138     for(size_t j = 0; j < hashes.size(); j++)
    139     {
    140       hashtype & hash = hashes[j];
    141 
    142       uint32_t index = window(&hash,sizeof(hash),start,width);
    143 
    144       bins[index]++;
    145     }
    146 
    147     // Test the distribution, then fold the bins in half,
    148     // repeat until we're down to 256 bins
    149 
    150     if(drawDiagram) printf("[");
    151 
    152     while(bincount >= 256)
    153     {
    154       double n = calcScore(&bins[0],bincount,(int)hashes.size());
    155 
    156       if(drawDiagram) plot(n);
    157 
    158       if(n > worst)
    159       {
    160         worst = n;
    161         worstStart = start;
    162         worstWidth = width;
    163       }
    164 
    165       width--;
    166       bincount /= 2;
    167 
    168       if(width < 8) break;
    169 
    170       for(int i = 0; i < bincount; i++)
    171       {
    172         bins[i] += bins[i+bincount];
    173       }
    174     }
    175 
    176     if(drawDiagram) printf("]\n");
    177   }
    178 
    179   double pct = worst * 100.0;
    180 
    181   printf("Worst bias is the %3d-bit window at bit %3d - %5.3f%%",worstWidth,worstStart,pct);
    182   if(pct >= 1.0) printf(" !!!!! ");
    183   printf("\n");
    184 
    185   return worst;
    186 }
    187 
    188 //----------------------------------------------------------------------------
    189 
    190 template < typename hashtype >
    191 bool TestHashList ( std::vector<hashtype> & hashes, std::vector<hashtype> & collisions, bool testDist, bool drawDiagram )
    192 {
    193   bool result = true;
    194 
    195   {
    196     size_t count = hashes.size();
    197 
    198     double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1));
    199 
    200     printf("Testing collisions   - Expected %8.2f, ",expected);
    201 
    202     double collcount = 0;
    203 
    204     HashSet<hashtype> collisions;
    205 
    206     collcount = FindCollisions(hashes,collisions,1000);
    207 
    208     printf("actual %8.2f (%5.2fx)",collcount, collcount / expected);
    209 
    210     if(sizeof(hashtype) == sizeof(uint32_t))
    211     {
    212     // 2x expected collisions = fail
    213 
    214     // #TODO - collision failure cutoff needs to be expressed as a standard deviation instead
    215     // of a scale factor, otherwise we fail erroneously if there are a small expected number
    216     // of collisions
    217 
    218     if(double(collcount) / double(expected) > 2.0)
    219     {
    220       printf(" !!!!! ");
    221       result = false;
    222     }
    223     }
    224     else
    225     {
    226       // For all hashes larger than 32 bits, _any_ collisions are a failure.
    227 
    228       if(collcount > 0)
    229       {
    230         printf(" !!!!! ");
    231         result = false;
    232       }
    233     }
    234 
    235     printf("\n");
    236   }
    237 
    238   //----------
    239 
    240   if(testDist)
    241   {
    242     TestDistribution(hashes,drawDiagram);
    243   }
    244 
    245   return result;
    246 }
    247 
    248 //----------
    249 
    250 template < typename hashtype >
    251 bool TestHashList ( std::vector<hashtype> & hashes, bool /*testColl*/, bool testDist, bool drawDiagram )
    252 {
    253   std::vector<hashtype> collisions;
    254 
    255   return TestHashList(hashes,collisions,testDist,drawDiagram);
    256 }
    257 
    258 //-----------------------------------------------------------------------------
    259 
    260 template < class keytype, typename hashtype >
    261 bool TestKeyList ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram )
    262 {
    263   int keycount = (int)keys.size();
    264 
    265   std::vector<hashtype> hashes;
    266 
    267   hashes.resize(keycount);
    268 
    269   printf("Hashing");
    270 
    271   for(int i = 0; i < keycount; i++)
    272   {
    273     if(i % (keycount / 10) == 0) printf(".");
    274 
    275     keytype & k = keys[i];
    276 
    277     hash(&k,sizeof(k),0,&hashes[i]);
    278   }
    279 
    280   printf("\n");
    281 
    282   bool result = TestHashList(hashes,testColl,testDist,drawDiagram);
    283 
    284   printf("\n");
    285 
    286   return result;
    287 }
    288 
    289 //-----------------------------------------------------------------------------
    290 // Bytepair test - generate 16-bit indices from all possible non-overlapping
    291 // 8-bit sections of the hash value, check distribution on all of them.
    292 
    293 // This is a very good test for catching weak intercorrelations between bits -
    294 // much harder to pass than the normal distribution test. However, it doesn't
    295 // really model the normal usage of hash functions in hash table lookup, so
    296 // I'm not sure it's that useful (and hash functions that fail this test but
    297 // pass the normal distribution test still work well in practice)
    298 
    299 template < typename hashtype >
    300 double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram )
    301 {
    302   const int nbytes = sizeof(hashtype);
    303   const int hashbits = nbytes * 8;
    304 
    305   const int nbins = 65536;
    306 
    307   std::vector<int> bins(nbins,0);
    308 
    309   double worst = 0;
    310 
    311   for(int a = 0; a < hashbits; a++)
    312   {
    313     if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n");
    314 
    315     if(drawDiagram) printf("[");
    316 
    317     for(int b = 0; b < hashbits; b++)
    318     {
    319       if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" ");
    320 
    321       bins.clear();
    322       bins.resize(nbins,0);
    323 
    324       for(size_t i = 0; i < hashes.size(); i++)
    325       {
    326         hashtype & hash = hashes[i];
    327 
    328         uint32_t pa = window(&hash,sizeof(hash),a,8);
    329         uint32_t pb = window(&hash,sizeof(hash),b,8);
    330 
    331         bins[pa | (pb << 8)]++;
    332       }
    333 
    334       double s = calcScore(bins,bins.size(),hashes.size());
    335 
    336       if(drawDiagram) plot(s);
    337 
    338       if(s > worst)
    339       {
    340         worst = s;
    341       }
    342     }
    343 
    344     if(drawDiagram) printf("]\n");
    345   }
    346 
    347   return worst;
    348 }
    349 
    350 //-----------------------------------------------------------------------------
    351 // Simplified test - only check 64k distributions, and only on byte boundaries
    352 
    353 template < typename hashtype >
    354 void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg )
    355 {
    356   const int hashbits = sizeof(hashtype) * 8;
    357   const int nbins = 65536;
    358 
    359   std::vector<int> bins(nbins,0);
    360 
    361   dworst = -1.0e90;
    362   davg = 0;
    363 
    364   for(int start = 0; start < hashbits; start += 8)
    365   {
    366     bins.clear();
    367     bins.resize(nbins,0);
    368 
    369     for(size_t j = 0; j < hashes.size(); j++)
    370     {
    371       hashtype & hash = hashes[j];
    372 
    373       uint32_t index = window(&hash,sizeof(hash),start,16);
    374 
    375       bins[index]++;
    376     }
    377 
    378     double n = calcScore(&bins.front(),(int)bins.size(),(int)hashes.size());
    379 
    380     davg += n;
    381 
    382     if(n > dworst) dworst = n;
    383   }
    384 
    385   davg /= double(hashbits/8);
    386 }
    387 
    388 //-----------------------------------------------------------------------------
    389