1 #include "SpeedTest.h" 2 3 #include "Random.h" 4 5 #include <stdio.h> // for printf 6 #include <memory.h> // for memset 7 #include <math.h> // for sqrt 8 #include <algorithm> // for sort 9 10 //----------------------------------------------------------------------------- 11 // We view our timing values as a series of random variables V that has been 12 // contaminated with occasional outliers due to cache misses, thread 13 // preemption, etcetera. To filter out the outliers, we search for the largest 14 // subset of V such that all its values are within three standard deviations 15 // of the mean. 16 17 double CalcMean ( std::vector<double> & v ) 18 { 19 double mean = 0; 20 21 for(int i = 0; i < (int)v.size(); i++) 22 { 23 mean += v[i]; 24 } 25 26 mean /= double(v.size()); 27 28 return mean; 29 } 30 31 double CalcMean ( std::vector<double> & v, int a, int b ) 32 { 33 double mean = 0; 34 35 for(int i = a; i <= b; i++) 36 { 37 mean += v[i]; 38 } 39 40 mean /= (b-a+1); 41 42 return mean; 43 } 44 45 double CalcStdv ( std::vector<double> & v, int a, int b ) 46 { 47 double mean = CalcMean(v,a,b); 48 49 double stdv = 0; 50 51 for(int i = a; i <= b; i++) 52 { 53 double x = v[i] - mean; 54 55 stdv += x*x; 56 } 57 58 stdv = sqrt(stdv / (b-a+1)); 59 60 return stdv; 61 } 62 63 // Return true if the largest value in v[0,len) is more than three 64 // standard deviations from the mean 65 66 bool ContainsOutlier ( std::vector<double> & v, size_t len ) 67 { 68 double mean = 0; 69 70 for(size_t i = 0; i < len; i++) 71 { 72 mean += v[i]; 73 } 74 75 mean /= double(len); 76 77 double stdv = 0; 78 79 for(size_t i = 0; i < len; i++) 80 { 81 double x = v[i] - mean; 82 stdv += x*x; 83 } 84 85 stdv = sqrt(stdv / double(len)); 86 87 double cutoff = mean + stdv*3; 88 89 return v[len-1] > cutoff; 90 } 91 92 // Do a binary search to find the largest subset of v that does not contain 93 // outliers. 94 95 void FilterOutliers ( std::vector<double> & v ) 96 { 97 std::sort(v.begin(),v.end()); 98 99 size_t len = 0; 100 101 for(size_t x = 0x40000000; x; x = x >> 1 ) 102 { 103 if((len | x) >= v.size()) continue; 104 105 if(!ContainsOutlier(v,len | x)) 106 { 107 len |= x; 108 } 109 } 110 111 v.resize(len); 112 } 113 114 // Iteratively tighten the set to find a subset that does not contain 115 // outliers. I'm not positive this works correctly in all cases. 116 117 void FilterOutliers2 ( std::vector<double> & v ) 118 { 119 std::sort(v.begin(),v.end()); 120 121 int a = 0; 122 int b = (int)(v.size() - 1); 123 124 for(int i = 0; i < 10; i++) 125 { 126 //printf("%d %d\n",a,b); 127 128 double mean = CalcMean(v,a,b); 129 double stdv = CalcStdv(v,a,b); 130 131 double cutA = mean - stdv*3; 132 double cutB = mean + stdv*3; 133 134 while((a < b) && (v[a] < cutA)) a++; 135 while((b > a) && (v[b] > cutB)) b--; 136 } 137 138 std::vector<double> v2; 139 140 v2.insert(v2.begin(),v.begin()+a,v.begin()+b+1); 141 142 v.swap(v2); 143 } 144 145 //----------------------------------------------------------------------------- 146 // We really want the rdtsc() calls to bracket the function call as tightly 147 // as possible, but that's hard to do portably. We'll try and get as close as 148 // possible by marking the function as NEVER_INLINE (to keep the optimizer from 149 // moving it) and marking the timing variables as "volatile register". 150 151 NEVER_INLINE int64_t timehash ( pfHash hash, const void * key, int len, int seed ) 152 { 153 volatile register int64_t begin,end; 154 155 uint32_t temp[16]; 156 157 begin = rdtsc(); 158 159 hash(key,len,seed,temp); 160 161 end = rdtsc(); 162 163 return end-begin; 164 } 165 166 //----------------------------------------------------------------------------- 167 168 double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int blocksize, const int align ) 169 { 170 Rand r(seed); 171 172 uint8_t * buf = new uint8_t[blocksize + 512]; 173 174 uint64_t t1 = reinterpret_cast<uint64_t>(buf); 175 176 t1 = (t1 + 255) & BIG_CONSTANT(0xFFFFFFFFFFFFFF00); 177 t1 += align; 178 179 uint8_t * block = reinterpret_cast<uint8_t*>(t1); 180 181 r.rand_p(block,blocksize); 182 183 //---------- 184 185 std::vector<double> times; 186 times.reserve(trials); 187 188 for(int itrial = 0; itrial < trials; itrial++) 189 { 190 r.rand_p(block,blocksize); 191 192 double t = (double)timehash(hash,block,blocksize,itrial); 193 194 if(t > 0) times.push_back(t); 195 } 196 197 //---------- 198 199 std::sort(times.begin(),times.end()); 200 201 FilterOutliers(times); 202 203 delete [] buf; 204 205 return CalcMean(times); 206 } 207 208 //----------------------------------------------------------------------------- 209 // 256k blocks seem to give the best results. 210 211 void BulkSpeedTest ( pfHash hash, uint32_t seed ) 212 { 213 const int trials = 2999; 214 const int blocksize = 256 * 1024; 215 216 printf("Bulk speed test - %d-byte keys\n",blocksize); 217 218 for(int align = 0; align < 8; align++) 219 { 220 double cycles = SpeedTest(hash,seed,trials,blocksize,align); 221 222 double bestbpc = double(blocksize)/cycles; 223 224 double bestbps = (bestbpc * 3000000000.0 / 1048576.0); 225 printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps); 226 } 227 } 228 229 //----------------------------------------------------------------------------- 230 231 void TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose, double & /*outCycles*/ ) 232 { 233 const int trials = 999999; 234 235 if(verbose) printf("Small key speed test - %4d-byte keys - ",keysize); 236 237 double cycles = SpeedTest(hash,seed,trials,keysize,0); 238 239 printf("%8.2f cycles/hash\n",cycles); 240 } 241 242 //----------------------------------------------------------------------------- 243