Home | History | Annotate | Download | only in bench
      1 /*
      2  * Copyright 2016 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can
      5  * be found in the LICENSE file.
      6  *
      7  */
      8 
      9 //
     10 //
     11 //
     12 
     13 #include <stdlib.h>
     14 #include <stdio.h>
     15 #include <string.h>
     16 #include <inttypes.h>
     17 
     18 //
     19 // squelch OpenCL 1.2 deprecation warning
     20 //
     21 
     22 #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS
     23 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
     24 #endif
     25 
     26 #include "common/macros.h"
     27 #include "common/cl/assert_cl.h"
     28 #include "common/cl/find_cl.h"
     29 //
     30 //
     31 //
     32 
     33 #include "hs_cl.h"
     34 
     35 //
     36 // FIXME -- LIMITED TO INTEL / GEN8+ FOR NOW
     37 //
     38 
     39 #include "intel/gen8/u32/hs_target.h"
     40 #include "intel/gen8/u64/hs_target.h"
     41 
     42 // #include "intel/gen9lp/u32/hs_target.h"
     43 // #include "intel/gen9lp/u64/hs_target.h"
     44 
     45 //
     46 // The quality of the RNG doesn't matter.  The same number of
     47 // instructions will be run no matter what the key distribution looks
     48 // like.  So here is something small and fast.
     49 //
     50 
     51 static
     52 uint32_t
     53 hs_rand_u32()
     54 {
     55   static uint32_t seed = 0xDEADBEEF;
     56 
     57   // Numerical Recipes
     58   seed = seed * 1664525 + 1013904223;
     59 
     60   return seed;
     61 }
     62 
     63 //
     64 //
     65 //
     66 
     67 static
     68 void
     69 hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
     70 {
     71 #if   1
     72   for (uint32_t ii=0; ii<count*words; ii++)
     73     vin_h[ii] = hs_rand_u32();
     74 #elif 0 // in-order
     75   memset(vin_h,0,count*words*sizeof(uint32_t));
     76   for (uint32_t ii=0; ii<count; ii++)
     77     vin_h[ii*words] = ii;
     78 #else   // reverse order
     79   memset(vin_h,0,count*words*sizeof(uint32_t));
     80   for (uint32_t ii=0; ii<count; ii++)
     81     vin_h[ii*words] = count - 1 - ii;
     82 #endif
     83 }
     84 
     85 //
     86 //
     87 //
     88 
     89 char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
     90 char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
     91 
     92 //
     93 //
     94 //
     95 
     96 static
     97 char const *
     98 hs_cpu_sort(void     *       sorted_h,
     99             uint32_t   const hs_words,
    100             uint32_t   const count,
    101             double   * const cpu_ns)
    102 {
    103   if (hs_words == 1)
    104     return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
    105   else
    106     return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
    107 }
    108 
    109 static
    110 void
    111 hs_transpose_slabs_u32(uint32_t const hs_words,
    112                        uint32_t const hs_width,
    113                        uint32_t const hs_height,
    114                        uint32_t *     vout_h,
    115                        uint32_t const count)
    116 {
    117   uint32_t   const slab_keys  = hs_width * hs_height;
    118   size_t     const slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
    119   uint32_t * const slab       = ALLOCA_MACRO(slab_size);
    120   uint32_t         slab_count = count / slab_keys;
    121 
    122   while (slab_count-- > 0)
    123     {
    124       memcpy(slab,vout_h,slab_size);
    125 
    126       for (uint32_t row=0; row<hs_height; row++)
    127         for (uint32_t col=0; col<hs_width; col++)
    128           vout_h[col * hs_height + row] = slab[row * hs_width + col];
    129 
    130       vout_h += slab_keys;
    131     }
    132 }
    133 
    134 static
    135 void
    136 hs_transpose_slabs_u64(uint32_t const hs_words,
    137                        uint32_t const hs_width,
    138                        uint32_t const hs_height,
    139                        uint64_t *     vout_h,
    140                        uint32_t const count)
    141 {
    142   uint32_t   const slab_keys  = hs_width * hs_height;
    143   size_t     const slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
    144   uint64_t * const slab       = ALLOCA_MACRO(slab_size);
    145   uint32_t         slab_count = count / slab_keys;
    146 
    147   while (slab_count-- > 0)
    148     {
    149       memcpy(slab,vout_h,slab_size);
    150 
    151       for (uint32_t row=0; row<hs_height; row++)
    152         for (uint32_t col=0; col<hs_width; col++)
    153           vout_h[col * hs_height + row] = slab[row * hs_width + col];
    154 
    155       vout_h += slab_keys;
    156     }
    157 }
    158 
    159 static
    160 void
    161 hs_transpose_slabs(uint32_t const hs_words,
    162                    uint32_t const hs_width,
    163                    uint32_t const hs_height,
    164                    void   *       vout_h,
    165                    uint32_t const count)
    166 {
    167   if (hs_words == 1)
    168     hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
    169   else
    170     hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
    171 }
    172 
    173 //
    174 //
    175 //
    176 
    177 static
    178 void
    179 hs_debug_u32(uint32_t const   hs_width,
    180              uint32_t const   hs_height,
    181              uint32_t const * vout_h,
    182              uint32_t const   count)
    183 {
    184   uint32_t const slab_keys = hs_width * hs_height;
    185   uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;
    186 
    187   for (uint32_t ss=0; ss<slabs; ss++) {
    188     fprintf(stderr,"%u\n",ss);
    189     for (uint32_t cc=0; cc<hs_height; cc++) {
    190       for (uint32_t rr=0; rr<hs_width; rr++)
    191         fprintf(stderr,"%8" PRIX32 " ",*vout_h++);
    192       fprintf(stderr,"\n");
    193     }
    194   }
    195 }
    196 
    197 static
    198 void
    199 hs_debug_u64(uint32_t const   hs_width,
    200              uint32_t const   hs_height,
    201              uint64_t const * vout_h,
    202              uint32_t const   count)
    203 {
    204   uint32_t const slab_keys = hs_width * hs_height;
    205   uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;
    206 
    207   for (uint32_t ss=0; ss<slabs; ss++) {
    208     fprintf(stderr,"%u\n",ss);
    209     for (uint32_t cc=0; cc<hs_height; cc++) {
    210       for (uint32_t rr=0; rr<hs_width; rr++)
    211         fprintf(stderr,"%16" PRIX64 " ",*vout_h++);
    212       fprintf(stderr,"\n");
    213     }
    214   }
    215 }
    216 
    217 //
    218 // Used for benchmarking on out-of-order queues.  Attaching an event
    219 // to a kernel on an OOQ with profiling enabled will result in a
    220 // synchronization point and block concurrent execution of kernels.
    221 //
    222 // The workaround that enables measuring the entire runtime of the
    223 // sort is to launch a dummy kernel with an event, a barrier without
    224 // an event, then the call to hs_sort(), followed by a final dummy
    225 // kernel with an event.
    226 //
    227 // The end time of the first dummy and start time of the second dummy
    228 // will provide a conservative estimate of the total execution time of
    229 // the hs_sort() routine.
    230 //
    231 // Note that once kernels are enqueued they are scheduled with only
    232 // microseconds between them so this should only be a small number of
    233 // microseconds longer than the true hs_sort() execution time.
    234 //
    235 
    236 #define HS_DUMMY_KERNEL_PROGRAM "kernel void hs_dummy_kernel() { ; }"
    237 
    238 static cl_kernel hs_dummy_kernel;
    239 
    240 static
    241 void
    242 hs_dummy_kernel_create(cl_context context, cl_device_id device_id)
    243 {
    244   cl_int err;
    245 
    246   char   const * strings[]        = { HS_DUMMY_KERNEL_PROGRAM         };
    247   size_t const   strings_sizeof[] = { sizeof(HS_DUMMY_KERNEL_PROGRAM) };
    248 
    249   cl_program program = clCreateProgramWithSource(context,
    250                                                  1,
    251                                                  strings,
    252                                                  strings_sizeof,
    253                                                  &err); cl_ok(err);
    254   cl(BuildProgram(program,
    255                   1,
    256                   &device_id,
    257                   NULL,
    258                   NULL,
    259                   NULL));
    260 
    261   hs_dummy_kernel = clCreateKernel(program,"hs_dummy_kernel",&err); cl_ok(err);
    262 
    263   cl(ReleaseProgram(program));
    264 }
    265 
    266 static
    267 void
    268 hs_dummy_kernel_release()
    269 {
    270   cl(ReleaseKernel(hs_dummy_kernel));
    271 }
    272 
    273 static
    274 void
    275 hs_dummy_kernel_enqueue(cl_command_queue cq,
    276                         uint32_t         wait_list_size,
    277                         cl_event const * wait_list,
    278                         cl_event       * event)
    279 {
    280   size_t const global_work_size = 1;
    281 
    282   cl(EnqueueNDRangeKernel(cq,
    283                           hs_dummy_kernel,
    284                           1,
    285                           NULL,
    286                           &global_work_size,
    287                           NULL,
    288                           wait_list_size,
    289                           wait_list,
    290                           event));
    291 }
    292 
    293 //
    294 //
    295 //
    296 
    297 static
    298 void
    299 hs_bench(cl_context                   context,
    300          cl_command_queue             cq,
    301          cl_command_queue             cq_profile,
    302          char           const * const device_name,
    303          char           const * const driver_version,
    304          uint32_t               const hs_words,
    305          uint32_t               const hs_width,
    306          uint32_t               const hs_height,
    307          struct hs_cl   const * const hs,
    308          uint32_t               const count_lo,
    309          uint32_t               const count_hi,
    310          uint32_t               const count_step,
    311          uint32_t               const loops,
    312          uint32_t               const warmup,
    313          bool                   const linearize)
    314 {
    315   //
    316   // return if nothing to do
    317   //
    318   if (count_hi <= 1)
    319     return;
    320 
    321   //
    322   // size the arrays
    323   //
    324   uint32_t count_hi_padded_in, count_hi_padded_out;
    325 
    326   hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out);
    327 
    328   //
    329   // SIZE
    330   //
    331   size_t const key_size    = sizeof(uint32_t)    * hs_words;
    332 
    333   size_t const size_hi_in  = count_hi_padded_in  * key_size;
    334   size_t const size_hi_out = count_hi_padded_out * key_size;
    335 
    336   //
    337   // ALLOCATE
    338   //
    339   cl_int cl_err;
    340 
    341   void * sorted_h = malloc(size_hi_in);
    342 
    343   cl_mem random   = clCreateBuffer(context,
    344                                    CL_MEM_READ_ONLY  | CL_MEM_ALLOC_HOST_PTR,
    345                                    size_hi_in,
    346                                    NULL,&cl_err); cl_ok(cl_err);
    347 
    348   cl_mem vin      = clCreateBuffer(context,
    349                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
    350                                    size_hi_in,
    351                                    NULL,&cl_err); cl_ok(cl_err);
    352 
    353   cl_mem vout     = clCreateBuffer(context,
    354                                    CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
    355                                    size_hi_out,
    356                                    NULL,&cl_err); cl_ok(cl_err);
    357   //
    358   // BLOCKING MAP AND INIT KEYS
    359   //
    360   {
    361     void * random_h = clEnqueueMapBuffer(cq,
    362                                          random,
    363                                          CL_TRUE,
    364                                          CL_MAP_WRITE_INVALIDATE_REGION,
    365                                          0,size_hi_in,
    366                                          0,NULL,NULL,
    367                                          &cl_err); cl_ok(cl_err);
    368 
    369     // fill with random numbers
    370     hs_fill_rand(random_h,count_hi,hs_words);
    371 
    372     //
    373     // UNMAP
    374     //
    375     cl(EnqueueUnmapMemObject(cq,random,random_h,0,NULL,NULL));
    376   }
    377 
    378   //
    379   // BENCHMARK
    380   //
    381   for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
    382     {
    383       // compute padding before sorting
    384       uint32_t count_padded_in, count_padded_out;
    385 
    386       hs_cl_pad(hs,count,&count_padded_in,&count_padded_out);
    387 
    388       cl_ulong elapsed_ns_min = UINT64_MAX;
    389       cl_ulong elapsed_ns_max = 0;
    390       cl_ulong elapsed_ns_sum = 0;
    391 
    392       cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
    393       cl(Finish(cq));
    394 
    395       for (uint32_t ii=0; ii<warmup+loops; ii++)
    396         {
    397           if (ii == warmup)
    398             {
    399               elapsed_ns_min = UINT64_MAX;
    400               elapsed_ns_max = 0;
    401               elapsed_ns_sum = 0;
    402             }
    403 
    404 #if 0
    405           //
    406           // optionally, initialize vin on every loop -- no need
    407           //
    408           cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL));
    409           cl(Finish(cq));
    410 #endif
    411 
    412           //
    413           // sort vin
    414           //
    415           cl_event start, complete, end;
    416 
    417           hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start);
    418 
    419           // note hs_sort enqueues a final barrier
    420           hs_cl_sort(hs,
    421                      cq,
    422                      1,&start,&complete,
    423                      vin,vout,
    424                      count,
    425                      count_padded_in,
    426                      count_padded_out,
    427                      linearize);
    428 
    429           hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end);
    430 
    431           cl(Finish(cq_profile));
    432 
    433           //
    434           // measure duration
    435           //
    436           cl_ulong t_start=0, t_end=0;
    437 
    438           // start
    439           cl(GetEventProfilingInfo(start,
    440                                    CL_PROFILING_COMMAND_END,
    441                                    sizeof(cl_ulong),
    442                                    &t_start,
    443                                    NULL));
    444 
    445           // end
    446           cl(GetEventProfilingInfo(end,
    447                                    CL_PROFILING_COMMAND_START,
    448                                    sizeof(cl_ulong),
    449                                    &t_end,
    450                                    NULL));
    451 
    452           cl_ulong const t = t_end - t_start;
    453 
    454           elapsed_ns_min  = MIN_MACRO(elapsed_ns_min,t);
    455           elapsed_ns_max  = MAX_MACRO(elapsed_ns_max,t);
    456           elapsed_ns_sum += t;
    457 
    458           cl(ReleaseEvent(start));
    459           cl(ReleaseEvent(complete));
    460           cl(ReleaseEvent(end));
    461         }
    462 
    463       //
    464       // COPY KEYS BACK FOR VERIFICATION
    465       //
    466       size_t const size_padded_in = count_padded_in * key_size;
    467 
    468       void * vin_h = clEnqueueMapBuffer(cq,
    469                                         vin,
    470                                         CL_FALSE,
    471                                         CL_MAP_READ,
    472                                         0,size_padded_in,
    473                                         0,NULL,NULL,
    474                                         &cl_err); cl_ok(cl_err);
    475 
    476       void * vout_h = clEnqueueMapBuffer(cq,
    477                                          vout,
    478                                          CL_FALSE,
    479                                          CL_MAP_READ,
    480                                          0,size_padded_in,
    481                                          0,NULL,NULL,
    482                                          &cl_err); cl_ok(cl_err);
    483       cl(Finish(cq));
    484 
    485       //
    486       // SORT THE UNTOUCHED RANDOM INPUT
    487       //
    488       memcpy(sorted_h,vin_h,size_padded_in);
    489 
    490       double cpu_ns;
    491 
    492       char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns);
    493 
    494       //
    495       // EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING
    496       //
    497       if (!linearize)
    498         hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in);
    499 
    500       //
    501       // VERIFY
    502       //
    503       bool const verified = memcmp(sorted_h,vout_h,size_padded_in) == 0;
    504 
    505 #ifndef NDEBUG
    506       if (!verified)
    507         {
    508           if (hs_words == 1)
    509             hs_debug_u32(hs_width,hs_height,vout_h,count);
    510           else // ulong
    511             hs_debug_u64(hs_width,hs_height,vout_h,count);
    512         }
    513 #endif
    514 
    515       cl(EnqueueUnmapMemObject(cq,vin, vin_h, 0,NULL,NULL));
    516       cl(EnqueueUnmapMemObject(cq,vout,vout_h,0,NULL,NULL));
    517 
    518       cl(Finish(cq));
    519 
    520       //
    521       // REPORT
    522       //
    523       fprintf(stdout,"%s, %s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
    524               device_name,
    525               driver_version,
    526               (hs_words == 1) ? "uint" : "ulong",
    527               linearize ? "linear" : "slab",
    528               verified ? "  OK  " : "*FAIL*",
    529               count,
    530               count_padded_in,
    531               count_padded_out,
    532               // CPU
    533               algo,
    534               cpu_ns / 1000000.0,                       // milliseconds
    535               1000.0 * count / cpu_ns,                  // mkeys / sec
    536               // GPU
    537               loops,
    538               elapsed_ns_sum / 1000000.0 / loops,       // avg msecs
    539               elapsed_ns_min / 1000000.0,               // min msecs
    540               elapsed_ns_max / 1000000.0,               // max msecs
    541               1000.0 * count * loops / elapsed_ns_sum,  // mkeys / sec - avg
    542               1000.0 * count         / elapsed_ns_min); // mkeys / sec - max
    543 
    544       // quit early if not verified
    545       if (!verified)
    546         break;
    547     }
    548 
    549   //
    550   // dispose
    551   //
    552   cl(ReleaseMemObject(vout));
    553   cl(ReleaseMemObject(vin));
    554   cl(ReleaseMemObject(random));
    555   free(sorted_h);
    556 }
    557 
    558 //
    559 //
    560 //
    561 
    562 int
    563 main(int argc, char const * argv[])
    564 {
    565   char const * const target_platform_substring = "Intel";
    566   char const * const target_device_substring   = "Graphics";
    567 
    568   //
    569   // find platform and device ids
    570   //
    571   cl_platform_id platform_id;
    572   cl_device_id   device_id;
    573 
    574 #define HS_DEVICE_NAME_SIZE  64
    575 
    576   char   device_name[HS_DEVICE_NAME_SIZE];
    577   size_t device_name_size;
    578 
    579   cl(FindIdsByName(target_platform_substring,
    580                    target_device_substring,
    581                    &platform_id,
    582                    &device_id,
    583                    HS_DEVICE_NAME_SIZE,
    584                    device_name,
    585                    &device_name_size,
    586                    true));
    587   //
    588   // create context
    589   //
    590   cl_context_properties context_properties[] =
    591     {
    592       CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id,
    593       0
    594     };
    595 
    596   cl_int     cl_err;
    597   cl_context context = clCreateContext(context_properties,
    598                                        1,
    599                                        &device_id,
    600                                        NULL,
    601                                        NULL,
    602                                        &cl_err);
    603   cl_ok(cl_err);
    604 
    605   //
    606   // create command queue
    607   //
    608 #if 0 // OPENCL 2.0
    609 
    610   cl_queue_properties props[] = {
    611     CL_QUEUE_PROPERTIES,
    612     (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
    613 #ifndef NDEBUG
    614     (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
    615 #endif
    616     0
    617   };
    618 
    619   cl_queue_properties props_profile[] = {
    620     CL_QUEUE_PROPERTIES,
    621     (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE,
    622     0
    623   };
    624 
    625   cl_command_queue cq = clCreateCommandQueueWithProperties(context,
    626                                                            device_id,
    627                                                            props,
    628                                                            &cl_err); cl_ok(cl_err);
    629 
    630   cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context,
    631                                                                    device_id,
    632                                                                    props_profile,
    633                                                                    &cl_err); cl_ok(cl_err);
    634 #else // OPENCL 1.2
    635 
    636   cl_command_queue cq = clCreateCommandQueue(context,
    637                                              device_id,
    638 #ifndef NDEBUG
    639                                              CL_QUEUE_PROFILING_ENABLE |
    640 #endif
    641                                              CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
    642                                              &cl_err); cl_ok(cl_err);
    643 
    644   cl_command_queue cq_profile = clCreateCommandQueue(context,
    645                                                      device_id,
    646                                                      CL_QUEUE_PROFILING_ENABLE,
    647                                                      &cl_err); cl_ok(cl_err);
    648 #endif
    649 
    650   //
    651   // Intel GEN workaround -- create dummy kernel for semi-accurate
    652   // profiling on an out-of-order queue.
    653   //
    654   hs_dummy_kernel_create(context,device_id);
    655 
    656   //
    657   // select the target
    658   //
    659 
    660   uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0);
    661 
    662   struct hs_cl_target const * hs_target;
    663 
    664   if (key_val_words == 1)
    665     hs_target = &hs_intel_gen8_u32;
    666   else
    667     hs_target = &hs_intel_gen8_u64;
    668 
    669   //
    670   // create kernels
    671   //
    672   fprintf(stdout,"Creating... ");
    673 
    674   struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id);
    675 
    676   fprintf(stdout,"done.\n");
    677 
    678   //
    679   //
    680   //
    681 
    682 #ifdef NDEBUG
    683 #define HS_BENCH_LOOPS   100
    684 #define HS_BENCH_WARMUP  100
    685 #else
    686 #define HS_BENCH_LOOPS   1
    687 #define HS_BENCH_WARMUP  0
    688 #endif
    689 
    690   //
    691   // sort sizes and loops
    692   //
    693   uint32_t const kpb        = hs_target->config.slab.height << hs_target->config.slab.width_log2;
    694 
    695   uint32_t const count_lo   = (argc <= 2) ? kpb             : strtoul(argv[2],NULL,0);
    696   uint32_t const count_hi   = (argc <= 3) ? count_lo        : strtoul(argv[3],NULL,0);
    697   uint32_t const count_step = (argc <= 4) ? count_lo        : strtoul(argv[4],NULL,0);
    698   uint32_t const loops      = (argc <= 5) ? HS_BENCH_LOOPS  : strtoul(argv[5],NULL,0);
    699   uint32_t const warmup     = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0);
    700   bool     const linearize  = (argc <= 7) ? true            : strtoul(argv[7],NULL,0);
    701 
    702   //
    703   // labels
    704   //
    705   fprintf(stdout,
    706           "Device, "
    707           "Driver, "
    708           "Type, "
    709           "Slab/Linear, "
    710           "Verified?, "
    711           "Keys, "
    712           "Keys Padded In, "
    713           "Keys Padded Out, "
    714           "CPU Algorithm, "
    715           "CPU Msecs, "
    716           "CPU Mkeys/s, "
    717           "Trials, "
    718           "Avg. Msecs, "
    719           "Min Msecs, "
    720           "Max Msecs, "
    721           "Avg. Mkeys/s, "
    722           "Max. Mkeys/s\n");
    723 
    724   //
    725   // we want to track driver versions
    726   //
    727   size_t driver_version_size;
    728 
    729   cl(GetDeviceInfo(device_id,
    730                    CL_DRIVER_VERSION,
    731                    0,
    732                    NULL,
    733                    &driver_version_size));
    734 
    735   char * const driver_version = ALLOCA_MACRO(driver_version_size);
    736 
    737   cl(GetDeviceInfo(device_id,
    738                    CL_DRIVER_VERSION,
    739                    driver_version_size,
    740                    driver_version,
    741                    NULL));
    742   //
    743   // benchmark
    744   //
    745   hs_bench(context,
    746            cq,cq_profile,
    747            device_name,
    748            driver_version,
    749            hs_target->config.words.key + hs_target->config.words.val,
    750            1 << hs_target->config.slab.width_log2,
    751            hs_target->config.slab.height,
    752            hs,
    753            count_lo,
    754            count_hi,
    755            count_step,
    756            loops,
    757            warmup,
    758            linearize);
    759 
    760   //
    761   // release everything
    762   //
    763   hs_cl_release(hs);
    764 
    765   hs_dummy_kernel_release();
    766 
    767   cl(ReleaseCommandQueue(cq));
    768   cl(ReleaseCommandQueue(cq_profile));
    769 
    770   cl(ReleaseContext(context));
    771 
    772   return 0;
    773 }
    774