Home | History | Annotate | Download | only in memcpy-perf
#include <stdlib.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
     11 
     12 #define CACHE_HIT_SIZE 1 << 17
     13 
     14 using namespace std;
     15 
     16 size_t size_start = 64;
     17 size_t size_end = 16 * (1ull << 20);
     18 size_t samples = 2048;
     19 size_t size_per_test = 64 * (1ull << 20);
     20 size_t tot_sum = 0;
     21 size_t delay = 0;
     22 float speed = 0;
     23 bool dummy = false;
     24 
     25 void __attribute__((noinline)) memcpy_noinline(void *dst, void *src, size_t size);
     26 void __attribute__((noinline)) memset_noinline(void *dst, int value, size_t size);
     27 uint64_t __attribute__((noinline)) sum(volatile void *src, size_t size);
     28 
     29 enum BenchType {
     30     MemcpyBench,
     31     MemsetBench,
     32     SumBench,
     33 };
     34 
     35 static void usage(char* p) {
     36     printf("Usage: %s <test> <options>\n"
     37            "<test> is one of the following:\n"
     38            "  --memcpy\n"
     39            "  --memset\n"
     40            "  --sum\n"
     41            "<options> are optional and apply to all tests:\n"
     42            "  --dummy\n"
     43            "    Simulates cpu-only load of a test. Guaranteed to use L2\n"
     44            "    instead.  Not supported on --sum test.\n"
     45            "  --delay DELAY_DIVISOR\n"
     46            "  --start START_SIZE_MB\n"
     47            "    --end END_SIZE_MB (requires start, optional)\n"
     48            "  --samples NUM_SAMPLES\n"
     49            , p);
     50 }
     51 
     52 int main(int argc, char *argv[])
     53 {
     54     BenchType type = MemcpyBench;
     55     if (argc <= 1) {
     56         usage(argv[0]);
     57         return 0;
     58     }
     59     for (int i = 1; i < argc; i++) {
     60       if (string(argv[i]) == string("--memcpy")) {
     61          type = MemcpyBench;
     62       } else if (string(argv[i]) == string("--memset")) {
     63          type = MemsetBench;
     64       } else if (string(argv[i]) == string("--sum")) {
     65          type = SumBench;
     66       } else if (string(argv[i]) == string("--dummy")) {
     67          dummy = true;
     68       } else if (i + 1 < argc) {
     69           if (string(argv[i]) == string("--delay")) {
     70              delay = atoi(argv[++i]);
     71           } else if (string(argv[i]) == string("--start")) {
     72              size_start = atoi(argv[++i]) * (1ull << 20);
     73              size_end = size_start;
     74           } else if (string(argv[i]) == string("--end")) {
     75              size_t end = atoi(argv[++i]) * (1ull << 20);
     76              if (end > size_start && i > 3
     77                  && string(argv[i-3]) == string("--start")) {
     78                  size_end = end;
     79              } else {
     80                  printf("Cannot specify --end without --start.\n");
     81                  return 0;
     82              }
     83           } else if (string(argv[i]) == string("--samples")) {
     84              samples = atoi(argv[++i]);
     85           } else {
     86              printf("Unknown argument %s\n", argv[i]);
     87              return 0;
     88           }
     89        } else {
     90           printf("The %s option requires a single argument.\n", argv[i]);
     91           return 0;
     92        }
     93     }
     94 
     95     unique_ptr<uint8_t[]> src(new uint8_t[size_end]);
     96     unique_ptr<uint8_t[]> dst(new uint8_t[size_end]);
     97     memset(src.get(), 1, size_end);
     98 
     99     double start_pow = log10(size_start);
    100     double end_pow = log10(size_end);
    101     double pow_inc = (end_pow - start_pow) / samples;
    102 
    103     //cout << "src: " << (uintptr_t)src.get() << endl;
    104     //cout << "dst: " <<  (uintptr_t)dst.get() << endl;
    105 
    106     for (double cur_pow = start_pow; cur_pow <= end_pow && samples > 0;
    107             cur_pow += pow_inc) {
    108         chrono::time_point<chrono::high_resolution_clock>
    109             copy_start, copy_end, pre_wait;
    110 
    111         size_t cur_size = (size_t)pow(10.0, cur_pow);
    112         size_t iter_per_size = size_per_test / cur_size;
    113 
    114         // run benchmark
    115         switch (type) {
    116             case MemsetBench: {
    117                 memcpy_noinline(src.get(), dst.get(), cur_size);
    118                 memset_noinline(dst.get(), 0xdeadbeef, cur_size);
    119                 size_t hit_size = CACHE_HIT_SIZE;
    120                 copy_start = chrono::high_resolution_clock::now();
    121                 for (int i = 0; i < iter_per_size; i++) {
    122                     if (!dummy) {
    123                         memset_noinline(dst.get(), 0xdeadbeef, cur_size);
    124                     } else {
    125                         while (hit_size < cur_size) {
    126                             memset_noinline
    127                                 (dst.get(), 0xdeadbeef, CACHE_HIT_SIZE);
    128                             hit_size += 1 << 17;
    129                         }
    130                     }
    131                     if (delay != 0)
    132                         this_thread::sleep_for(chrono
    133                             ::nanoseconds(size_per_test / delay));
    134                 }
    135                 copy_end = chrono::high_resolution_clock::now();
    136                 break;
    137             }
    138             case MemcpyBench: {
    139                 memcpy_noinline(dst.get(), src.get(), cur_size);
    140                 memcpy_noinline(src.get(), dst.get(), cur_size);
    141                 size_t hit_size = CACHE_HIT_SIZE;
    142                 copy_start = chrono::high_resolution_clock::now();
    143                 for (int i = 0; i < iter_per_size; i++) {
    144                     if (!dummy) {
    145                         memcpy_noinline(dst.get(), src.get(), cur_size);
    146                     } else {
    147                         while (hit_size < cur_size) {
    148                             memcpy_noinline
    149                                 (dst.get(), src.get(), CACHE_HIT_SIZE);
    150                             hit_size += CACHE_HIT_SIZE;
    151                         }
    152                     }
    153                     if (delay != 0)
    154                         this_thread::sleep_for(chrono
    155                             ::nanoseconds(size_per_test / delay));
    156                 }
    157                 copy_end = chrono::high_resolution_clock::now();
    158                 break;
    159             }
    160             case SumBench: {
    161                 uint64_t s = 0;
    162                 s += sum(src.get(), cur_size);
    163                 copy_start = chrono::high_resolution_clock::now();
    164                 for (int i = 0; i < iter_per_size; i++) {
    165                     s += sum(src.get(), cur_size);
    166                     if (delay != 0)
    167                         this_thread::sleep_for(chrono
    168                             ::nanoseconds(size_per_test / delay));
    169                 }
    170                 copy_end = chrono::high_resolution_clock::now();
    171                 tot_sum += s;
    172                 break;
    173             }
    174         }
    175 
    176         samples--;
    177         double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size);
    178         double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9);
    179         if (type == MemcpyBench)
    180             gb_per_sec *= 2.0;
    181         double percent_waiting = 0;
    182         if (delay != 0) {
    183             percent_waiting = (size_per_test / delay) / ns_per_copy * 100;
    184         }
    185         cout << "size: " << cur_size << ", perf: " << gb_per_sec
    186              << "GB/s, iter: " << iter_per_size << ", \% time spent waiting: "
    187              << percent_waiting << endl;
    188     }
    189     return 0;
    190 }
    191