1 #include <iostream> 2 #include <chrono> 3 #include <vector> 4 #include <algorithm> 5 #include <numeric> 6 #include <stdlib.h> 7 #include <memory> 8 #include <cmath> 9 #include <string> 10 #include <thread> 11 12 #define CACHE_HIT_SIZE 1 << 17 13 14 using namespace std; 15 16 size_t size_start = 64; 17 size_t size_end = 16 * (1ull << 20); 18 size_t samples = 2048; 19 size_t size_per_test = 64 * (1ull << 20); 20 size_t tot_sum = 0; 21 size_t delay = 0; 22 float speed = 0; 23 bool dummy = false; 24 25 void __attribute__((noinline)) memcpy_noinline(void *dst, void *src, size_t size); 26 void __attribute__((noinline)) memset_noinline(void *dst, int value, size_t size); 27 uint64_t __attribute__((noinline)) sum(volatile void *src, size_t size); 28 29 enum BenchType { 30 MemcpyBench, 31 MemsetBench, 32 SumBench, 33 }; 34 35 static void usage(char* p) { 36 printf("Usage: %s <test> <options>\n" 37 "<test> is one of the following:\n" 38 " --memcpy\n" 39 " --memset\n" 40 " --sum\n" 41 "<options> are optional and apply to all tests:\n" 42 " --dummy\n" 43 " Simulates cpu-only load of a test. Guaranteed to use L2\n" 44 " instead. Not supported on --sum test.\n" 45 " --delay DELAY_DIVISOR\n" 46 " --start START_SIZE_MB\n" 47 " --end END_SIZE_MB (requires start, optional)\n" 48 " --samples NUM_SAMPLES\n" 49 , p); 50 } 51 52 int main(int argc, char *argv[]) 53 { 54 BenchType type = MemcpyBench; 55 if (argc <= 1) { 56 usage(argv[0]); 57 return 0; 58 } 59 for (int i = 1; i < argc; i++) { 60 if (string(argv[i]) == string("--memcpy")) { 61 type = MemcpyBench; 62 } else if (string(argv[i]) == string("--memset")) { 63 type = MemsetBench; 64 } else if (string(argv[i]) == string("--sum")) { 65 type = SumBench; 66 } else if (string(argv[i]) == string("--dummy")) { 67 dummy = true; 68 } else if (i + 1 < argc) { 69 if (string(argv[i]) == string("--delay")) { 70 delay = atoi(argv[++i]); 71 } else if (string(argv[i]) == string("--start")) { 72 size_start = atoi(argv[++i]) * (1ull << 20); 73 size_end = size_start; 74 } else if (string(argv[i]) == string("--end")) { 75 size_t end = atoi(argv[++i]) * (1ull << 20); 76 if (end > size_start && i > 3 77 && string(argv[i-3]) == string("--start")) { 78 size_end = end; 79 } else { 80 printf("Cannot specify --end without --start.\n"); 81 return 0; 82 } 83 } else if (string(argv[i]) == string("--samples")) { 84 samples = atoi(argv[++i]); 85 } else { 86 printf("Unknown argument %s\n", argv[i]); 87 return 0; 88 } 89 } else { 90 printf("The %s option requires a single argument.\n", argv[i]); 91 return 0; 92 } 93 } 94 95 unique_ptr<uint8_t[]> src(new uint8_t[size_end]); 96 unique_ptr<uint8_t[]> dst(new uint8_t[size_end]); 97 memset(src.get(), 1, size_end); 98 99 double start_pow = log10(size_start); 100 double end_pow = log10(size_end); 101 double pow_inc = (end_pow - start_pow) / samples; 102 103 //cout << "src: " << (uintptr_t)src.get() << endl; 104 //cout << "dst: " << (uintptr_t)dst.get() << endl; 105 106 for (double cur_pow = start_pow; cur_pow <= end_pow && samples > 0; 107 cur_pow += pow_inc) { 108 chrono::time_point<chrono::high_resolution_clock> 109 copy_start, copy_end, pre_wait; 110 111 size_t cur_size = (size_t)pow(10.0, cur_pow); 112 size_t iter_per_size = size_per_test / cur_size; 113 114 // run benchmark 115 switch (type) { 116 case MemsetBench: { 117 memcpy_noinline(src.get(), dst.get(), cur_size); 118 memset_noinline(dst.get(), 0xdeadbeef, cur_size); 119 size_t hit_size = CACHE_HIT_SIZE; 120 copy_start = chrono::high_resolution_clock::now(); 121 for (int i = 0; i < iter_per_size; i++) { 122 if (!dummy) { 123 memset_noinline(dst.get(), 0xdeadbeef, cur_size); 124 } else { 125 while (hit_size < cur_size) { 126 memset_noinline 127 (dst.get(), 0xdeadbeef, CACHE_HIT_SIZE); 128 hit_size += 1 << 17; 129 } 130 } 131 if (delay != 0) 132 this_thread::sleep_for(chrono 133 ::nanoseconds(size_per_test / delay)); 134 } 135 copy_end = chrono::high_resolution_clock::now(); 136 break; 137 } 138 case MemcpyBench: { 139 memcpy_noinline(dst.get(), src.get(), cur_size); 140 memcpy_noinline(src.get(), dst.get(), cur_size); 141 size_t hit_size = CACHE_HIT_SIZE; 142 copy_start = chrono::high_resolution_clock::now(); 143 for (int i = 0; i < iter_per_size; i++) { 144 if (!dummy) { 145 memcpy_noinline(dst.get(), src.get(), cur_size); 146 } else { 147 while (hit_size < cur_size) { 148 memcpy_noinline 149 (dst.get(), src.get(), CACHE_HIT_SIZE); 150 hit_size += CACHE_HIT_SIZE; 151 } 152 } 153 if (delay != 0) 154 this_thread::sleep_for(chrono 155 ::nanoseconds(size_per_test / delay)); 156 } 157 copy_end = chrono::high_resolution_clock::now(); 158 break; 159 } 160 case SumBench: { 161 uint64_t s = 0; 162 s += sum(src.get(), cur_size); 163 copy_start = chrono::high_resolution_clock::now(); 164 for (int i = 0; i < iter_per_size; i++) { 165 s += sum(src.get(), cur_size); 166 if (delay != 0) 167 this_thread::sleep_for(chrono 168 ::nanoseconds(size_per_test / delay)); 169 } 170 copy_end = chrono::high_resolution_clock::now(); 171 tot_sum += s; 172 break; 173 } 174 } 175 176 samples--; 177 double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size); 178 double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9); 179 if (type == MemcpyBench) 180 gb_per_sec *= 2.0; 181 double percent_waiting = 0; 182 if (delay != 0) { 183 percent_waiting = (size_per_test / delay) / ns_per_copy * 100; 184 } 185 cout << "size: " << cur_size << ", perf: " << gb_per_sec 186 << "GB/s, iter: " << iter_per_size << ", \% time spent waiting: " 187 << percent_waiting << endl; 188 } 189 return 0; 190 } 191