1 #pragma once 2 3 #include "Types.h" 4 5 #include <math.h> 6 #include <vector> 7 #include <map> 8 #include <algorithm> // for std::sort 9 #include <string.h> // for memset 10 #include <stdio.h> // for printf 11 12 double calcScore ( const int * bins, const int bincount, const int ballcount ); 13 14 void plot ( double n ); 15 16 inline double ExpectedCollisions ( double balls, double bins ) 17 { 18 return balls - bins + bins * pow(1 - 1/bins,balls); 19 } 20 21 double chooseK ( int b, int k ); 22 double chooseUpToK ( int n, int k ); 23 24 //----------------------------------------------------------------------------- 25 26 inline uint32_t f3mix ( uint32_t k ) 27 { 28 k ^= k >> 16; 29 k *= 0x85ebca6b; 30 k ^= k >> 13; 31 k *= 0xc2b2ae35; 32 k ^= k >> 16; 33 34 return k; 35 } 36 37 //----------------------------------------------------------------------------- 38 // Sort the hash list, count the total number of collisions and return 39 // the first N collisions for further processing 40 41 template< typename hashtype > 42 int FindCollisions ( std::vector<hashtype> & hashes, 43 HashSet<hashtype> & collisions, 44 int maxCollisions ) 45 { 46 int collcount = 0; 47 48 std::sort(hashes.begin(),hashes.end()); 49 50 for(size_t i = 1; i < hashes.size(); i++) 51 { 52 if(hashes[i] == hashes[i-1]) 53 { 54 collcount++; 55 56 if((int)collisions.size() < maxCollisions) 57 { 58 collisions.insert(hashes[i]); 59 } 60 } 61 } 62 63 return collcount; 64 } 65 66 //----------------------------------------------------------------------------- 67 68 template < class keytype, typename hashtype > 69 int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys ) 70 { 71 int collcount = 0; 72 73 typedef std::map<hashtype,keytype> htab; 74 htab tab; 75 76 for(size_t i = 1; i < keys.size(); i++) 77 { 78 keytype & k1 = keys[i]; 79 80 hashtype h = hash(&k1,sizeof(keytype),0); 81 82 typename htab::iterator it = tab.find(h); 83 84 if(it != tab.end()) 85 { 86 keytype & k2 = (*it).second; 87 88 printf("A: "); 89 printbits(&k1,sizeof(keytype)); 90 printf("B: "); 91 printbits(&k2,sizeof(keytype)); 92 } 93 else 94 { 95 tab.insert( std::make_pair(h,k1) ); 96 } 97 } 98 99 return collcount; 100 } 101 102 //---------------------------------------------------------------------------- 103 // Measure the distribution "score" for each possible N-bit span up to 20 bits 104 105 template< typename hashtype > 106 double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram ) 107 { 108 printf("Testing distribution - "); 109 110 if(drawDiagram) printf("\n"); 111 112 const int hashbits = sizeof(hashtype) * 8; 113 114 int maxwidth = 20; 115 116 // We need at least 5 keys per bin to reliably test distribution biases 117 // down to 1%, so don't bother to test sparser distributions than that 118 119 while(double(hashes.size()) / double(1 << maxwidth) < 5.0) 120 { 121 maxwidth--; 122 } 123 124 std::vector<int> bins; 125 bins.resize(1 << maxwidth); 126 127 double worst = 0; 128 int worstStart = -1; 129 int worstWidth = -1; 130 131 for(int start = 0; start < hashbits; start++) 132 { 133 int width = maxwidth; 134 int bincount = (1 << width); 135 136 memset(&bins[0],0,sizeof(int)*bincount); 137 138 for(size_t j = 0; j < hashes.size(); j++) 139 { 140 hashtype & hash = hashes[j]; 141 142 uint32_t index = window(&hash,sizeof(hash),start,width); 143 144 bins[index]++; 145 } 146 147 // Test the distribution, then fold the bins in half, 148 // repeat until we're down to 256 bins 149 150 if(drawDiagram) printf("["); 151 152 while(bincount >= 256) 153 { 154 double n = calcScore(&bins[0],bincount,(int)hashes.size()); 155 156 if(drawDiagram) plot(n); 157 158 if(n > worst) 159 { 160 worst = n; 161 worstStart = start; 162 worstWidth = width; 163 } 164 165 width--; 166 bincount /= 2; 167 168 if(width < 8) break; 169 170 for(int i = 0; i < bincount; i++) 171 { 172 bins[i] += bins[i+bincount]; 173 } 174 } 175 176 if(drawDiagram) printf("]\n"); 177 } 178 179 double pct = worst * 100.0; 180 181 printf("Worst bias is the %3d-bit window at bit %3d - %5.3f%%",worstWidth,worstStart,pct); 182 if(pct >= 1.0) printf(" !!!!! "); 183 printf("\n"); 184 185 return worst; 186 } 187 188 //---------------------------------------------------------------------------- 189 190 template < typename hashtype > 191 bool TestHashList ( std::vector<hashtype> & hashes, std::vector<hashtype> & collisions, bool testDist, bool drawDiagram ) 192 { 193 bool result = true; 194 195 { 196 size_t count = hashes.size(); 197 198 double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1)); 199 200 printf("Testing collisions - Expected %8.2f, ",expected); 201 202 double collcount = 0; 203 204 HashSet<hashtype> collisions; 205 206 collcount = FindCollisions(hashes,collisions,1000); 207 208 printf("actual %8.2f (%5.2fx)",collcount, collcount / expected); 209 210 if(sizeof(hashtype) == sizeof(uint32_t)) 211 { 212 // 2x expected collisions = fail 213 214 // #TODO - collision failure cutoff needs to be expressed as a standard deviation instead 215 // of a scale factor, otherwise we fail erroneously if there are a small expected number 216 // of collisions 217 218 if(double(collcount) / double(expected) > 2.0) 219 { 220 printf(" !!!!! "); 221 result = false; 222 } 223 } 224 else 225 { 226 // For all hashes larger than 32 bits, _any_ collisions are a failure. 227 228 if(collcount > 0) 229 { 230 printf(" !!!!! "); 231 result = false; 232 } 233 } 234 235 printf("\n"); 236 } 237 238 //---------- 239 240 if(testDist) 241 { 242 TestDistribution(hashes,drawDiagram); 243 } 244 245 return result; 246 } 247 248 //---------- 249 250 template < typename hashtype > 251 bool TestHashList ( std::vector<hashtype> & hashes, bool /*testColl*/, bool testDist, bool drawDiagram ) 252 { 253 std::vector<hashtype> collisions; 254 255 return TestHashList(hashes,collisions,testDist,drawDiagram); 256 } 257 258 //----------------------------------------------------------------------------- 259 260 template < class keytype, typename hashtype > 261 bool TestKeyList ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram ) 262 { 263 int keycount = (int)keys.size(); 264 265 std::vector<hashtype> hashes; 266 267 hashes.resize(keycount); 268 269 printf("Hashing"); 270 271 for(int i = 0; i < keycount; i++) 272 { 273 if(i % (keycount / 10) == 0) printf("."); 274 275 keytype & k = keys[i]; 276 277 hash(&k,sizeof(k),0,&hashes[i]); 278 } 279 280 printf("\n"); 281 282 bool result = TestHashList(hashes,testColl,testDist,drawDiagram); 283 284 printf("\n"); 285 286 return result; 287 } 288 289 //----------------------------------------------------------------------------- 290 // Bytepair test - generate 16-bit indices from all possible non-overlapping 291 // 8-bit sections of the hash value, check distribution on all of them. 292 293 // This is a very good test for catching weak intercorrelations between bits - 294 // much harder to pass than the normal distribution test. However, it doesn't 295 // really model the normal usage of hash functions in hash table lookup, so 296 // I'm not sure it's that useful (and hash functions that fail this test but 297 // pass the normal distribution test still work well in practice) 298 299 template < typename hashtype > 300 double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram ) 301 { 302 const int nbytes = sizeof(hashtype); 303 const int hashbits = nbytes * 8; 304 305 const int nbins = 65536; 306 307 std::vector<int> bins(nbins,0); 308 309 double worst = 0; 310 311 for(int a = 0; a < hashbits; a++) 312 { 313 if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n"); 314 315 if(drawDiagram) printf("["); 316 317 for(int b = 0; b < hashbits; b++) 318 { 319 if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" "); 320 321 bins.clear(); 322 bins.resize(nbins,0); 323 324 for(size_t i = 0; i < hashes.size(); i++) 325 { 326 hashtype & hash = hashes[i]; 327 328 uint32_t pa = window(&hash,sizeof(hash),a,8); 329 uint32_t pb = window(&hash,sizeof(hash),b,8); 330 331 bins[pa | (pb << 8)]++; 332 } 333 334 double s = calcScore(bins,bins.size(),hashes.size()); 335 336 if(drawDiagram) plot(s); 337 338 if(s > worst) 339 { 340 worst = s; 341 } 342 } 343 344 if(drawDiagram) printf("]\n"); 345 } 346 347 return worst; 348 } 349 350 //----------------------------------------------------------------------------- 351 // Simplified test - only check 64k distributions, and only on byte boundaries 352 353 template < typename hashtype > 354 void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg ) 355 { 356 const int hashbits = sizeof(hashtype) * 8; 357 const int nbins = 65536; 358 359 std::vector<int> bins(nbins,0); 360 361 dworst = -1.0e90; 362 davg = 0; 363 364 for(int start = 0; start < hashbits; start += 8) 365 { 366 bins.clear(); 367 bins.resize(nbins,0); 368 369 for(size_t j = 0; j < hashes.size(); j++) 370 { 371 hashtype & hash = hashes[j]; 372 373 uint32_t index = window(&hash,sizeof(hash),start,16); 374 375 bins[index]++; 376 } 377 378 double n = calcScore(&bins.front(),(int)bins.size(),(int)hashes.size()); 379 380 davg += n; 381 382 if(n > dworst) dworst = n; 383 } 384 385 davg /= double(hashbits/8); 386 } 387 388 //----------------------------------------------------------------------------- 389