Home | History | Annotate | Download | only in bench
      1 /*
      2  * Copyright 2016 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can
      5  * be found in the LICENSE file.
      6  *
      7  */
      8 
      9 //
     10 //
     11 //
     12 
     13 #include <stdlib.h>
     14 #include <stdio.h>
     15 #include <string.h>
     16 #include <inttypes.h>
     17 
     18 //
     19 //
     20 
     21 #include "common/macros.h"
     22 #include "common/vk/assert_vk.h"
     23 #include "common/vk/host_alloc.h"
     24 #include "common/vk/cache_vk.h"
     25 
     26 //
     27 //
     28 //
     29 
     30 #include "hs_vk.h"
     31 
     32 //
     33 // Compile-time images of HotSort targets
     34 //
     35 
     36 #include "hs/vk/intel/gen8/u32/hs_target.h"
     37 #include "hs/vk/intel/gen8/u64/hs_target.h"
     38 
     39 #include "hs/vk/nvidia/sm_35/u32/hs_target.h"
     40 #include "hs/vk/nvidia/sm_35/u64/hs_target.h"
     41 
     42 #include "hs/vk/amd/gcn/u32/hs_target.h"
     43 #include "hs/vk/amd/gcn/u64/hs_target.h"
     44 
     45 //
     46 //
     47 //
     48 
     49 char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
     50 char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);
     51 
     52 //
     53 //
     54 //
     55 
     56 static
     57 char const *
     58 hs_cpu_sort(void     *       sorted_h,
     59             uint32_t   const hs_words,
     60             uint32_t   const count,
     61             double   * const cpu_ns)
     62 {
     63   if (hs_words == 1)
     64     return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
     65   else
     66     return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
     67 }
     68 
     69 static
     70 void
     71 hs_transpose_slabs_u32(uint32_t const hs_words,
     72                        uint32_t const hs_width,
     73                        uint32_t const hs_height,
     74                        uint32_t *     vout_h,
     75                        uint32_t const count)
     76 {
     77   uint32_t   const slab_keys  = hs_width * hs_height;
     78   size_t     const slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
     79   uint32_t * const slab       = ALLOCA_MACRO(slab_size);
     80   uint32_t         slab_count = count / slab_keys;
     81 
     82   while (slab_count-- > 0)
     83     {
     84       memcpy(slab,vout_h,slab_size);
     85 
     86       for (uint32_t row=0; row<hs_height; row++)
     87         for (uint32_t col=0; col<hs_width; col++)
     88           vout_h[col * hs_height + row] = slab[row * hs_width + col];
     89 
     90       vout_h += slab_keys;
     91     }
     92 }
     93 
     94 static
     95 void
     96 hs_transpose_slabs_u64(uint32_t const hs_words,
     97                        uint32_t const hs_width,
     98                        uint32_t const hs_height,
     99                        uint64_t *     vout_h,
    100                        uint32_t const count)
    101 {
    102   uint32_t   const slab_keys  = hs_width * hs_height;
    103   size_t     const slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
    104   uint64_t * const slab       = ALLOCA_MACRO(slab_size);
    105   uint32_t         slab_count = count / slab_keys;
    106 
    107   while (slab_count-- > 0)
    108     {
    109       memcpy(slab,vout_h,slab_size);
    110 
    111       for (uint32_t row=0; row<hs_height; row++)
    112         for (uint32_t col=0; col<hs_width; col++)
    113           vout_h[col * hs_height + row] = slab[row * hs_width + col];
    114 
    115       vout_h += slab_keys;
    116     }
    117 }
    118 
    119 static
    120 void
    121 hs_transpose_slabs(uint32_t const hs_words,
    122                    uint32_t const hs_width,
    123                    uint32_t const hs_height,
    124                    void   *       vout_h,
    125                    uint32_t const count)
    126 {
    127   if (hs_words == 1)
    128     hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
    129   else
    130     hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
    131 }
    132 
    133 //
    134 //
    135 //
    136 
    137 #ifndef NDEBUG
    138 
    139 static
    140 VkBool32
    141 VKAPI_PTR
    142 vk_debug_report_cb(VkDebugReportFlagsEXT      flags,
    143                    VkDebugReportObjectTypeEXT objectType,
    144                    uint64_t                   object,
    145                    size_t                     location,
    146                    int32_t                    messageCode,
    147                    const char*                pLayerPrefix,
    148                    const char*                pMessage,
    149                    void*                      pUserData)
    150 {
    151   char const * flag_str = "";
    152   bool         is_error = false;
    153 
    154 #define VK_FLAG_CASE_TO_STRING(c)               \
    155   case c:                                       \
    156     flag_str = #c;                              \
    157     is_error = true;                            \
    158     break
    159 
    160   switch (flags)
    161     {
    162       // VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_INFORMATION_BIT_EXT);
    163       VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_WARNING_BIT_EXT);
    164       VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT);
    165       VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_ERROR_BIT_EXT);
    166       VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_DEBUG_BIT_EXT);
    167     }
    168 
    169   if (is_error)
    170     {
    171       fprintf(stderr,"%s  %s  %s\n",
    172               flag_str,
    173               pLayerPrefix,
    174               pMessage);
    175     }
    176 
    177   return VK_FALSE;
    178 }
    179 
    180 #endif
    181 
    182 //
    183 //
    184 //
    185 
    186 static
    187 uint32_t
    188 hs_rand_u32()
    189 {
    190   static uint32_t seed = 0xDEADBEEF;
    191 
    192   // Numerical Recipes
    193   seed = seed * 1664525 + 1013904223;
    194 
    195   return seed;
    196 }
    197 
    198 //
    199 //
    200 //
    201 
    202 static
    203 void
    204 hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
    205 {
    206 #if   1
    207   for (uint32_t ii=0; ii<count*words; ii++)
    208     vin_h[ii] = hs_rand_u32();
    209 #elif 0 // in-order
    210   memset(vin_h,0,count*words*sizeof(uint32_t));
    211   for (uint32_t ii=0; ii<count; ii++)
    212     vin_h[ii*words] = ii;
    213 #else   // reverse order
    214   memset(vin_h,0,count*words*sizeof(uint32_t));
    215   for (uint32_t ii=0; ii<count; ii++)
    216     vin_h[ii*words] = count - 1 - ii;
    217 #endif
    218 }
    219 
    220 
    221 //
    222 //
    223 //
    224 
    225 static
    226 void
    227 hs_debug_u32(uint32_t const   hs_width,
    228              uint32_t const   hs_height,
    229              uint32_t const * vout_h,
    230              uint32_t const   count)
    231 {
    232   uint32_t const slab_keys = hs_width * hs_height;
    233   uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;
    234 
    235   for (uint32_t ss=0; ss<slabs; ss++) {
    236     fprintf(stderr,"%u\n",ss);
    237     for (uint32_t cc=0; cc<hs_height; cc++) {
    238       for (uint32_t rr=0; rr<hs_width; rr++)
    239         fprintf(stderr,"%8" PRIX32 " ",*vout_h++);
    240       fprintf(stderr,"\n");
    241     }
    242   }
    243 }
    244 
    245 static
    246 void
    247 hs_debug_u64(uint32_t const   hs_width,
    248              uint32_t const   hs_height,
    249              uint64_t const * vout_h,
    250              uint32_t const   count)
    251 {
    252   uint32_t const slab_keys = hs_width * hs_height;
    253   uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;
    254 
    255   for (uint32_t ss=0; ss<slabs; ss++) {
    256     fprintf(stderr,"%u\n",ss);
    257     for (uint32_t cc=0; cc<hs_height; cc++) {
    258       for (uint32_t rr=0; rr<hs_width; rr++)
    259         fprintf(stderr,"%16" PRIX64 " ",*vout_h++);
    260       fprintf(stderr,"\n");
    261     }
    262   }
    263 }
    264 
    265 //
    266 //
    267 //
    268 
    269 bool
    270 is_matching_device(VkPhysicalDeviceProperties const * const phy_device_props,
    271                    struct hs_vk_target const *      * const hs_target,
    272                    uint32_t                           const vendor_id,
    273                    uint32_t                           const device_id,
    274                    uint32_t                           const key_val_words)
    275 {
    276   if ((phy_device_props->vendorID != vendor_id) || (phy_device_props->deviceID != device_id))
    277     return false;
    278 
    279   if (phy_device_props->vendorID == 0x10DE)
    280     {
    281       //
    282       // FIXME -- for now, the kernels in this app are targeting
    283       // sm_35+ devices.  You could add some rigorous rejection by
    284       // device id here...
    285       //
    286       if (key_val_words == 1)
    287         *hs_target = &hs_nvidia_sm35_u32;
    288       else
    289         *hs_target = &hs_nvidia_sm35_u64;
    290     }
    291   else if (phy_device_props->vendorID == 0x8086)
    292     {
    293       //
    294       // FIXME -- for now, the kernels in this app are targeting GEN8+
    295       // devices -- this does *not* include variants of GEN9LP+
    296       // "Apollo Lake" because that device has a different
    297       // architectural "shape" than GEN8 GTx.  You could add some
    298       // rigorous rejection by device id here...
    299       //
    300       if (key_val_words == 1)
    301         *hs_target = &hs_intel_gen8_u32;
    302       else
    303         *hs_target = &hs_intel_gen8_u64;
    304     }
    305   else if (phy_device_props->vendorID == 0x1002)
    306     {
    307       //
    308       // AMD GCN
    309       //
    310       if (key_val_words == 1)
    311         *hs_target = &hs_amd_gcn_u32;
    312       else
    313         *hs_target = &hs_amd_gcn_u64;
    314     }
    315   else
    316     {
    317       return false;
    318     }
    319 
    320   return true;
    321 }
    322 
    323 //
    324 //
    325 //
    326 
    327 uint32_t
    328 vk_find_mem_type_idx(VkPhysicalDeviceMemoryProperties const * phy_device_mem_props,
    329                      uint32_t                         const   compatible_mem_types,
    330                      VkMemoryPropertyFlags            const   required_mem_props,
    331                      bool                             const   abort)
    332 {
    333   //
    334   // FIXME -- jump between indices in the memoryTypeBits mask
    335   //
    336   uint32_t const count = phy_device_mem_props->memoryTypeCount;
    337 
    338   for (uint32_t index=0; index<count; index++)
    339     {
    340       // acceptable memory type for this resource?
    341       if ((compatible_mem_types & (1<<index)) == 0)
    342         continue;
    343 
    344       // otherwise, find first match...
    345       VkMemoryPropertyFlags const common_props =
    346         phy_device_mem_props->memoryTypes[index].propertyFlags & required_mem_props;
    347 
    348       if (common_props == required_mem_props)
    349         return index;
    350     }
    351 
    352   if (abort)
    353     {
    354       fprintf(stderr,"Memory type not found: %X\n",required_mem_props);
    355       exit(EXIT_FAILURE);
    356     }
    357 
    358   return UINT32_MAX;
    359 }
    360 
    361 //
    362 //
    363 //
    364 
    365 #ifdef NDEBUG
    366 #define HS_BENCH_LOOPS   100
    367 #define HS_BENCH_WARMUP  100
    368 #else
    369 #define HS_BENCH_LOOPS   1
    370 #define HS_BENCH_WARMUP  0
    371 #endif
    372 
    373 //
    374 //
    375 //
    376 
    377 int
    378 main(int argc, char const * argv[])
    379 {
    380   //
    381   // select the target by vendor and device id
    382   //
    383   uint32_t const vendor_id     = (argc <= 1) ? UINT32_MAX : strtoul(argv[1],NULL,16);
    384   uint32_t const device_id     = (argc <= 2) ? UINT32_MAX : strtoul(argv[2],NULL,16);
    385   uint32_t const key_val_words = (argc <= 3) ? 1          : strtoul(argv[3],NULL,0);
    386 
    387   if ((key_val_words != 1) && (key_val_words != 2))
    388     {
    389       fprintf(stderr,"Key/Val Words must be 1 or 2\n");
    390       exit(EXIT_FAILURE);
    391     }
    392 
    393   //
    394   // create a Vulkan instances
    395   //
    396   VkApplicationInfo const app_info = {
    397       .sType                 = VK_STRUCTURE_TYPE_APPLICATION_INFO,
    398       .pNext                 = NULL,
    399       .pApplicationName      = "Google HotSort Bench",
    400       .applicationVersion    = 0,
    401       .pEngineName           = "Google HotSort Gen",
    402       .engineVersion         = 0,
    403       .apiVersion            = VK_API_VERSION_1_1
    404   };
    405 
    406   char const * const instance_enabled_layers[] = {
    407     "VK_LAYER_LUNARG_standard_validation"
    408   };
    409 
    410   char const * const instance_enabled_extensions[] = {
    411     VK_EXT_DEBUG_REPORT_EXTENSION_NAME
    412   };
    413 
    414   uint32_t const instance_enabled_layer_count =
    415 #ifndef NDEBUG
    416     ARRAY_LENGTH_MACRO(instance_enabled_layers)
    417 #else
    418     0
    419 #endif
    420     ;
    421 
    422   uint32_t const instance_enabled_extension_count =
    423 #ifndef NDEBUG
    424     ARRAY_LENGTH_MACRO(instance_enabled_extensions)
    425 #else
    426     0
    427 #endif
    428     ;
    429 
    430   VkInstanceCreateInfo const instance_info = {
    431     .sType                   = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
    432     .pNext                   = NULL,
    433     .flags                   = 0,
    434     .pApplicationInfo        = &app_info,
    435     .enabledLayerCount       = instance_enabled_layer_count,
    436     .ppEnabledLayerNames     = instance_enabled_layers,
    437     .enabledExtensionCount   = instance_enabled_extension_count,
    438     .ppEnabledExtensionNames = instance_enabled_extensions
    439   };
    440 
    441   VkInstance instance;
    442 
    443   vk(CreateInstance(&instance_info,NULL,&instance));
    444 
    445   //
    446   //
    447   //
    448 #ifndef NDEBUG
    449   PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT =
    450     (PFN_vkCreateDebugReportCallbackEXT)
    451     vkGetInstanceProcAddr(instance,"vkCreateDebugReportCallbackEXT");
    452 
    453   PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT =
    454     (PFN_vkDestroyDebugReportCallbackEXT)
    455     vkGetInstanceProcAddr(instance,"vkDestroyDebugReportCallbackEXT");
    456 
    457   struct VkDebugReportCallbackCreateInfoEXT const drcci = {
    458     .sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT,
    459     .pNext       = NULL,
    460     .flags       = UINT32_MAX, // enable everything for now
    461     .pfnCallback = vk_debug_report_cb,
    462     .pUserData   = NULL
    463   };
    464 
    465   VkDebugReportCallbackEXT drc;
    466 
    467   vk(CreateDebugReportCallbackEXT(instance,
    468                                   &drcci,
    469                                   NULL,
    470                                   &drc));
    471 #endif
    472 
    473   //
    474   // acquire all physical devices and select a match
    475   //
    476   uint32_t phy_device_count;
    477 
    478   vk(EnumeratePhysicalDevices(instance,
    479                               &phy_device_count,
    480                               NULL));
    481 
    482   VkPhysicalDevice * phy_devices = vk_host_alloc(NULL,phy_device_count * sizeof(*phy_devices));
    483 
    484   vk(EnumeratePhysicalDevices(instance,
    485                               &phy_device_count,
    486                               phy_devices));
    487 
    488   VkPhysicalDevice           phy_device = VK_NULL_HANDLE;
    489   VkPhysicalDeviceProperties phy_device_props;
    490 
    491   struct hs_vk_target const * hs_target;
    492 
    493   for (uint32_t ii=0; ii<phy_device_count; ii++)
    494     {
    495       VkPhysicalDeviceProperties tmp;
    496 
    497       vkGetPhysicalDeviceProperties(phy_devices[ii],&tmp);
    498 
    499       bool const is_match = is_matching_device(&tmp,
    500                                                &hs_target,
    501                                                vendor_id,
    502                                                device_id,
    503                                                key_val_words);
    504 
    505       fprintf(stdout,"%c %4X : %4X : %s\n",
    506               is_match ? '*' : ' ',
    507               tmp.vendorID,
    508               tmp.deviceID,
    509               tmp.deviceName);
    510 
    511       if (is_match)
    512         {
    513           phy_device = phy_devices[ii];
    514           memcpy(&phy_device_props,&tmp,sizeof(tmp));
    515         }
    516 
    517     }
    518 
    519   if (phy_device == VK_NULL_HANDLE)
    520     {
    521       fprintf(stderr,"Device %4X:%4X not found.\n",
    522               vendor_id & 0xFFFF,
    523               device_id & 0xFFFF);
    524 
    525       return EXIT_FAILURE;
    526     }
    527 
    528   vk_host_free(NULL,phy_devices);
    529 
    530   //
    531   // Get rest of command line
    532   //
    533   uint32_t const slab_size    = hs_target->config.slab.height << hs_target->config.slab.width_log2;
    534 
    535   uint32_t const count_lo     = (argc <=  4) ? slab_size       : strtoul(argv[ 4],NULL,0);
    536   uint32_t const count_hi     = (argc <=  5) ? count_lo        : strtoul(argv[ 5],NULL,0);
    537   uint32_t const count_step   = (argc <=  6) ? count_lo        : strtoul(argv[ 6],NULL,0);
    538   uint32_t const loops        = (argc <=  7) ? HS_BENCH_LOOPS  : strtoul(argv[ 7],NULL,0);
    539   uint32_t const warmup       = (argc <=  8) ? HS_BENCH_WARMUP : strtoul(argv[ 8],NULL,0);
    540   bool     const linearize    = (argc <=  9) ? true            : strtoul(argv[ 9],NULL,0) != 0;
    541   bool     const verify       = (argc <= 10) ? true            : strtoul(argv[10],NULL,0) != 0;
    542 
    543   //
    544   // get the physical device's memory props
    545   //
    546   VkPhysicalDeviceMemoryProperties phy_device_mem_props;
    547 
    548   vkGetPhysicalDeviceMemoryProperties(phy_device,&phy_device_mem_props);
    549 
    550   //
    551   // get queue properties
    552   //
    553   VkQueueFamilyProperties queue_fam_props[2];
    554   uint32_t                queue_fam_count = ARRAY_LENGTH_MACRO(queue_fam_props);
    555 
    556   vkGetPhysicalDeviceQueueFamilyProperties(phy_device,&queue_fam_count,queue_fam_props);
    557 
    558   //
    559   // create device
    560   //
    561   float const queue_priorities[] = { 1.0f };
    562 
    563   VkDeviceQueueCreateInfo const queue_info = {
    564     .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
    565     .pNext            = NULL,
    566     .flags            = 0,
    567     .queueFamilyIndex = 0,
    568     .queueCount       = 1,
    569     .pQueuePriorities = queue_priorities
    570   };
    571 
    572   //
    573   // clumsily enable AMD GCN shader info extension
    574   //
    575   char const * const device_enabled_extensions[] = {
    576 #if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
    577     VK_AMD_SHADER_INFO_EXTENSION_NAME
    578 #endif
    579   };
    580 
    581   uint32_t device_enabled_extension_count = 0;
    582 
    583 #if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
    584   if (phy_device_props.vendorID == 0x1002)
    585     device_enabled_extension_count = 1;
    586 #endif
    587 
    588   //
    589   //
    590   //
    591   VkPhysicalDeviceFeatures device_features = { false };
    592 
    593   if (key_val_words == 2)
    594     {
    595       device_features.shaderInt64 = true;
    596     }
    597 
    598   VkDeviceCreateInfo const device_info = {
    599     .sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
    600     .pNext                   = NULL,
    601     .flags                   = 0,
    602     .queueCreateInfoCount    = 1,
    603     .pQueueCreateInfos       = &queue_info,
    604     .enabledLayerCount       = 0,
    605     .ppEnabledLayerNames     = NULL,
    606     .enabledExtensionCount   = device_enabled_extension_count,
    607     .ppEnabledExtensionNames = device_enabled_extensions,
    608     .pEnabledFeatures        = &device_features
    609   };
    610 
    611   VkDevice device;
    612 
    613   vk(CreateDevice(phy_device,&device_info,NULL,&device));
    614 
    615   //
    616   // get a queue
    617   //
    618   VkQueue queue;
    619 
    620   vkGetDeviceQueue(device,0,0,&queue);
    621 
    622   //
    623   // get the pipeline cache
    624   //
    625   VkPipelineCache pipeline_cache;
    626 
    627   vk_pipeline_cache_create(device,NULL,".vk_cache",&pipeline_cache);
    628 
    629   //
    630   // create a descriptor set pool
    631   //
    632   VkDescriptorPoolSize const dps[] = {
    633     {
    634       .type            = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    635       .descriptorCount = 2
    636     }
    637   };
    638 
    639   VkDescriptorPoolCreateInfo const dpci = {
    640     .sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
    641     .pNext         = NULL,
    642     .flags         = 0, // VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
    643     .maxSets       = 1,
    644     .poolSizeCount = ARRAY_LENGTH_MACRO(dps),
    645     .pPoolSizes    = dps
    646   };
    647 
    648   VkDescriptorPool desc_pool;
    649 
    650   vk(CreateDescriptorPool(device,
    651                           &dpci,
    652                           NULL, // allocator
    653                           &desc_pool));
    654 
    655   //
    656   // create HotSort device instance
    657   //
    658   struct hs_vk * hs = hs_vk_create(hs_target,
    659                                    device,
    660                                    NULL,
    661                                    pipeline_cache);
    662   //
    663   // create a HotSort descriptor set for this thread
    664   //
    665   VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool);
    666 
    667   //
    668   // create a command pool for this thread
    669   //
    670   VkCommandPoolCreateInfo const cmd_pool_info = {
    671     .sType            = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
    672     .pNext            = NULL,
    673     .flags            = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
    674     .queueFamilyIndex = 0,
    675   };
    676 
    677   VkCommandPool cmd_pool;
    678 
    679   vk(CreateCommandPool(device,
    680                        &cmd_pool_info,
    681                        NULL,
    682                        &cmd_pool));
    683 
    684   //
    685   // create a query pool for benchmarking
    686   //
    687   static VkQueryPoolCreateInfo const query_pool_info = {
    688     .sType              = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
    689     .pNext              = NULL,
    690     .flags              = 0,
    691     .queryType          = VK_QUERY_TYPE_TIMESTAMP,
    692     .queryCount         = 4,
    693     .pipelineStatistics = 0
    694   };
    695 
    696   VkQueryPool query_pool;
    697 
    698   vk(CreateQueryPool(device,
    699                      &query_pool_info,
    700                      NULL,
    701                      &query_pool));
    702 
    703   //
    704   // create two big buffers -- buffer_out_count is always the largest
    705   //
    706   uint32_t buffer_in_count, buffer_out_count;
    707 
    708   hs_vk_pad(hs,count_hi,&buffer_in_count,&buffer_out_count);
    709 
    710   size_t const buffer_out_size = buffer_out_count * key_val_words * sizeof(uint32_t);
    711 
    712   VkBufferCreateInfo bci = {
    713     .sType                 = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
    714     .pNext                 = NULL,
    715     .flags                 = 0,
    716     .size                  = buffer_out_size,
    717     .usage                 = 0,
    718     .sharingMode           = VK_SHARING_MODE_EXCLUSIVE,
    719     .queueFamilyIndexCount = 0,
    720     .pQueueFamilyIndices   = NULL
    721   };
    722 
    723   VkBuffer vin, vout, sorted, rand;
    724 
    725   bci.usage =
    726     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    727     VK_BUFFER_USAGE_TRANSFER_DST_BIT,
    728 
    729   vk(CreateBuffer(device,
    730                   &bci,
    731                   NULL,
    732                   &vin));
    733 
    734   vk(CreateBuffer(device,
    735                   &bci,
    736                   NULL,
    737                   &sorted));
    738 
    739   bci.usage =
    740     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    741     VK_BUFFER_USAGE_TRANSFER_SRC_BIT   |
    742     VK_BUFFER_USAGE_TRANSFER_DST_BIT;
    743 
    744   vk(CreateBuffer(device,
    745                   &bci,
    746                   NULL,
    747                   &vout));
    748 
    749   bci.usage =
    750     VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    751     VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
    752 
    753   vk(CreateBuffer(device,
    754                   &bci,
    755                   NULL,
    756                   &rand));
    757 
    758   //
    759   // get memory requirements for one of the buffers
    760   //
    761   VkMemoryRequirements mr_vin, mr_vout, mr_sorted, mr_rand;
    762 
    763   vkGetBufferMemoryRequirements(device,vin, &mr_vin);
    764   vkGetBufferMemoryRequirements(device,vout,&mr_vout);
    765 
    766   vkGetBufferMemoryRequirements(device,rand,&mr_sorted);
    767   vkGetBufferMemoryRequirements(device,rand,&mr_rand);
    768 
    769   //
    770   // allocate memory for the buffers
    771   //
    772   // for simplicity, all buffers are the same size
    773   //
    774   // vin and vout have the same usage
    775   //
    776   VkMemoryAllocateInfo const mai_vin_vout = {
    777     .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    778     .pNext           = NULL,
    779     .allocationSize  = mr_vin.size,
    780     .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
    781                                             mr_vin.memoryTypeBits,
    782                                             VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
    783                                             true)
    784   };
    785 
    786   VkMemoryAllocateInfo const mai_sorted_rand = {
    787     .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    788     .pNext           = NULL,
    789     .allocationSize  = mr_sorted.size,
    790     .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
    791                                             mr_sorted.memoryTypeBits,
    792                                             VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
    793                                             VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
    794                                             true)
    795   };
    796 
    797   VkDeviceMemory mem_vin, mem_vout, mem_sorted, mem_rand;
    798 
    799   vk(AllocateMemory(device,
    800                     &mai_vin_vout,
    801                     NULL,
    802                     &mem_vin));
    803 
    804   vk(AllocateMemory(device,
    805                     &mai_vin_vout,
    806                     NULL,
    807                     &mem_vout));
    808 
    809   vk(AllocateMemory(device,
    810                     &mai_sorted_rand,
    811                     NULL,
    812                     &mem_sorted));
    813 
    814   vk(AllocateMemory(device,
    815                     &mai_sorted_rand,
    816                     NULL,
    817                     &mem_rand));
    818 
    819   //
    820   // bind backing memory to the virtual allocations
    821   //
    822   vk(BindBufferMemory(device,vin,   mem_vin,   0));
    823   vk(BindBufferMemory(device,vout,  mem_vout,  0));
    824 
    825   vk(BindBufferMemory(device,sorted,mem_sorted,0));
    826   vk(BindBufferMemory(device,rand,  mem_rand,  0));
    827 
    828   //
    829   // map and fill the rand buffer with random values
    830   //
    831   void * rand_h   = vk_host_alloc(NULL,buffer_out_size);
    832   void * sorted_h = vk_host_alloc(NULL,buffer_out_size);
    833 
    834   hs_fill_rand(rand_h,buffer_out_count,key_val_words);
    835 
    836   void * rand_map;
    837 
    838   vk(MapMemory(device,mem_rand,0,VK_WHOLE_SIZE,0,&rand_map));
    839 
    840   memcpy(rand_map,rand_h,buffer_out_size);
    841 
    842   vkUnmapMemory(device,mem_rand);
    843 
    844   //
    845   // create a single command buffer for this thread
    846   //
    847   VkCommandBufferAllocateInfo const cmd_buffer_info = {
    848     .sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
    849     .pNext              = NULL,
    850     .commandPool        = cmd_pool,
    851     .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
    852     .commandBufferCount = 1
    853   };
    854 
    855   VkCommandBuffer cb;
    856 
    857   vk(AllocateCommandBuffers(device,
    858                             &cmd_buffer_info,
    859                             &cb));
    860 
    861   //
    862   //
    863   //
    864   static VkCommandBufferBeginInfo const cb_begin_info = {
    865     .sType            = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
    866     .pNext            = NULL,
    867     .flags            = 0, // VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
    868     .pInheritanceInfo = NULL
    869   };
    870 
    871   struct VkSubmitInfo const submit_info = {
    872     .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
    873     .pNext                = NULL,
    874     .waitSemaphoreCount   = 0,
    875     .pWaitSemaphores      = NULL,
    876     .pWaitDstStageMask    = NULL,
    877     .commandBufferCount   = 1,
    878     .pCommandBuffers      = &cb,
    879     .signalSemaphoreCount = 0,
    880     .pSignalSemaphores    = NULL
    881   };
    882 
    883   //
    884   // labels
    885   //
    886   fprintf(stdout,
    887           "Device, "
    888           "Driver, "
    889           "Type, "
    890           "Slab/Linear, "
    891           "Verified?, "
    892           "Keys, "
    893           "Keys Padded In, "
    894           "Keys Padded Out, "
    895           "CPU, "
    896           "Algorithm, "
    897           "CPU Msecs, "
    898           "CPU Mkeys/s, "
    899           "GPU, "
    900           "Trials, "
    901           "Avg. Msecs, "
    902           "Min Msecs, "
    903           "Max Msecs, "
    904           "Avg. Mkeys/s, "
    905           "Max. Mkeys/s\n");
    906 
    907   //
    908   // test a range
    909   //
    910   for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
    911     {
    912       //
    913       // size the vin and vout arrays
    914       //
    915       uint32_t count_padded_in, count_padded_out;
    916 
    917       hs_vk_pad(hs,count,&count_padded_in,&count_padded_out);
    918 
    919       //
    920       // initialize vin with 'count' random keys
    921       //
    922       vkBeginCommandBuffer(cb,&cb_begin_info);
    923 
    924       VkBufferCopy const copy_rand = {
    925         .srcOffset = 0,
    926         .dstOffset = 0,
    927         .size      = count * key_val_words * sizeof(uint32_t)
    928       };
    929 
    930       vkCmdCopyBuffer(cb,
    931                       rand,
    932                       vin,
    933                       1,
    934                       &copy_rand);
    935 
    936       vk(EndCommandBuffer(cb));
    937 
    938       vk(QueueSubmit(queue,
    939                      1,
    940                      &submit_info,
    941                      VK_NULL_HANDLE)); // FIXME -- put a fence here
    942 
    943       // wait for queue to drain
    944       vk(QueueWaitIdle(queue));
    945       vk(ResetCommandBuffer(cb,0));
    946 
    947       //
    948       // build the sorting command buffer
    949       //
    950       vkBeginCommandBuffer(cb,&cb_begin_info);
    951 
    952       //
    953       // starting timestamp
    954       //
    955       vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,query_pool,0);
    956 
    957       //
    958       // bind the vin/vout buffers early
    959       //
    960       hs_vk_ds_bind(hs,hs_ds,cb,vin,vout);
    961 
    962       //
    963       // append sorting commands
    964       //
    965       hs_vk_sort(hs,
    966                  cb,
    967                  vin,0,0,
    968                  vout,0,0,
    969                  count,
    970                  count_padded_in,
    971                  count_padded_out,
    972                  linearize);
    973 
    974       //
    975       // end timestamp
    976       //
    977       vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,query_pool,1);
    978 
    979       //
    980       // end the command buffer
    981       //
    982       vk(EndCommandBuffer(cb));
    983 
    984       //
    985       // measure the min/max/avg execution time
    986       //
    987       uint64_t elapsed_ns_min = UINT64_MAX;
    988       uint64_t elapsed_ns_max = 0;
    989       uint64_t elapsed_ns_sum = 0;
    990 
    991       for (uint32_t ii=0; ii<warmup+loops; ii++)
    992         {
    993           if (ii == warmup)
    994             {
    995               elapsed_ns_min = UINT64_MAX;
    996               elapsed_ns_max = 0;
    997               elapsed_ns_sum = 0;
    998             }
    999 
   1000           vk(QueueSubmit(queue,
   1001                          1,
   1002                          &submit_info,
   1003                          VK_NULL_HANDLE)); // FIXME -- put a fence here
   1004 
   1005           // wait for queue to drain
   1006           vk(QueueWaitIdle(queue));
   1007 
   1008           // get results
   1009           uint64_t timestamps[2];
   1010 
   1011           vk(GetQueryPoolResults(device,query_pool,
   1012                                  0,ARRAY_LENGTH_MACRO(timestamps),
   1013                                  sizeof(timestamps),
   1014                                  timestamps,
   1015                                  sizeof(timestamps[0]),
   1016                                  VK_QUERY_RESULT_64_BIT |
   1017                                  VK_QUERY_RESULT_WAIT_BIT));
   1018 
   1019           uint64_t const t = timestamps[1] - timestamps[0];
   1020 
   1021           elapsed_ns_min  = MIN_MACRO(elapsed_ns_min,t);
   1022           elapsed_ns_max  = MAX_MACRO(elapsed_ns_max,t);
   1023           elapsed_ns_sum += t;
   1024         }
   1025 
   1026       vk(ResetCommandBuffer(cb,0));
   1027 
   1028       //
   1029       // copy the results back and, optionally, verify them
   1030       //
   1031       char const * cpu_algo = NULL;
   1032       double       cpu_ns   = 0.0;
   1033       bool         verified = false;
   1034 
   1035       if (verify)
   1036         {
   1037           size_t const size_padded_in = count_padded_in * key_val_words * sizeof(uint32_t);
   1038 
   1039           vkBeginCommandBuffer(cb,&cb_begin_info);
   1040 
   1041           VkBufferCopy const copy_vout = {
   1042             .srcOffset = 0,
   1043             .dstOffset = 0,
   1044             .size      = size_padded_in
   1045           };
   1046 
   1047           vkCmdCopyBuffer(cb,
   1048                           vout,
   1049                           sorted,
   1050                           1,
   1051                           &copy_vout);
   1052 
   1053           vk(EndCommandBuffer(cb));
   1054 
   1055           vk(QueueSubmit(queue,
   1056                          1,
   1057                          &submit_info,
   1058                          VK_NULL_HANDLE)); // FIXME -- put a fence here
   1059 
   1060           // wait for queue to drain
   1061           vk(QueueWaitIdle(queue));
   1062           vk(ResetCommandBuffer(cb,0));
   1063 
   1064           size_t const size_sorted_h = count * key_val_words * sizeof(uint32_t);
   1065 
   1066           // copy and sort random data
   1067           memcpy(sorted_h,rand_h,size_sorted_h);
   1068           memset((uint8_t*)sorted_h + size_sorted_h,-1,size_padded_in-size_sorted_h);
   1069 
   1070           cpu_algo = hs_cpu_sort(sorted_h,key_val_words,count_padded_in,&cpu_ns);
   1071 
   1072           void * sorted_map;
   1073 
   1074           vk(MapMemory(device,mem_sorted,0,VK_WHOLE_SIZE,0,&sorted_map));
   1075 
   1076           if (!linearize) {
   1077             hs_transpose_slabs(key_val_words,
   1078                                1u<<hs_target->config.slab.width_log2,
   1079                                hs_target->config.slab.height,
   1080                                sorted_map,
   1081                                count_padded_in);
   1082           }
   1083 
   1084           // verify
   1085           verified = memcmp(sorted_h,sorted_map,size_padded_in) == 0;
   1086 
   1087 #ifndef NDEBUG
   1088           if (!verified)
   1089             {
   1090               if (key_val_words == 1)
   1091                 {
   1092                   hs_debug_u32(1u<<hs_target->config.slab.width_log2,
   1093                                hs_target->config.slab.height,
   1094                                sorted_h,
   1095                                count);
   1096 
   1097                   hs_debug_u32(1u<<hs_target->config.slab.width_log2,
   1098                                hs_target->config.slab.height,
   1099                                sorted_map,
   1100                                count);
   1101                 }
   1102               else // ulong
   1103                 {
   1104                   hs_debug_u64(1u<<hs_target->config.slab.width_log2,
   1105                                hs_target->config.slab.height,
   1106                                sorted_h,
   1107                                count);
   1108 
   1109                   hs_debug_u64(1u<<hs_target->config.slab.width_log2,
   1110                                hs_target->config.slab.height,
   1111                                sorted_map,
   1112                                count);
   1113                 }
   1114             }
   1115 #endif
   1116 
   1117           vkUnmapMemory(device,mem_sorted);
   1118         }
   1119 
   1120       //
   1121       // REPORT
   1122       //
   1123       float const timestamp_period = phy_device_props.limits.timestampPeriod;
   1124 
   1125       fprintf(stdout,"%s, %u.%u.%u.%u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
   1126               phy_device_props.deviceName,
   1127               (phy_device_props.driverVersion>>24)&0xFF,
   1128               (phy_device_props.driverVersion>>16)&0xFF,
   1129               (phy_device_props.driverVersion>> 8)&0xFF,
   1130               (phy_device_props.driverVersion    )&0xFF,
   1131               (key_val_words == 1) ? "uint" : "ulong",
   1132               linearize ? "linear" : "slab",
   1133               verify ? (verified ? "  OK  " : "*FAIL*") : "UNVERIFIED",
   1134               count,
   1135               count_padded_in,
   1136               count_padded_out,
   1137               // CPU
   1138               verify ? cpu_algo : "UNVERIFIED",
   1139               verify ? (cpu_ns / 1000000.0)      : 0.0,                      // milliseconds
   1140               verify ? (1000.0 * count / cpu_ns) : 0.0,                      // mkeys / sec
   1141               // GPU
   1142               loops,
   1143               timestamp_period * elapsed_ns_sum / 1e6 / loops,               // avg msecs
   1144               timestamp_period * elapsed_ns_min / 1e6,                       // min msecs
   1145               timestamp_period * elapsed_ns_max / 1e6,                       // max msecs
   1146               1000.0 * count * loops / (timestamp_period * elapsed_ns_sum),  // mkeys / sec - avg
   1147               1000.0 * count         / (timestamp_period * elapsed_ns_min)); // mkeys / sec - max
   1148     }
   1149 
   1150   // reset the descriptor pool
   1151   vk(ResetDescriptorPool(device,desc_pool,0));
   1152 
   1153   //
   1154   // cleanup
   1155   //
   1156 
   1157   // release shared HotSort state
   1158   hs_vk_release(hs);
   1159 
   1160   // destroy the vin/vout buffers (before device memory)
   1161   vkDestroyBuffer(device,vin,   NULL);
   1162   vkDestroyBuffer(device,vout,  NULL);
   1163   vkDestroyBuffer(device,sorted,NULL);
   1164   vkDestroyBuffer(device,rand,  NULL);
   1165 
   1166   // free device memory
   1167   vkFreeMemory(device,mem_vin,   NULL);
   1168   vkFreeMemory(device,mem_vout,  NULL);
   1169   vkFreeMemory(device,mem_sorted,NULL);
   1170   vkFreeMemory(device,mem_rand,  NULL);
   1171 
   1172   // free host memory
   1173   vk_host_free(NULL,rand_h);
   1174   vk_host_free(NULL,sorted_h);
   1175 
   1176   // destroy the descriptor pool
   1177   vkDestroyDescriptorPool(device,desc_pool,NULL);
   1178 
   1179   // destroy remaining...
   1180   vkDestroyQueryPool(device,query_pool,NULL);
   1181   vkFreeCommandBuffers(device,cmd_pool,1,&cb);
   1182   vkDestroyCommandPool(device,cmd_pool,NULL);
   1183 
   1184   vk_pipeline_cache_destroy(device,NULL,".vk_cache",pipeline_cache);
   1185 
   1186   vkDestroyDevice(device,NULL);
   1187 
   1188 #ifndef NDEBUG
   1189   vkDestroyDebugReportCallbackEXT(instance,drc,NULL);
   1190 #endif
   1191 
   1192   vkDestroyInstance(instance,NULL);
   1193 
   1194   return EXIT_SUCCESS;
   1195 }
   1196 
   1197 //
   1198 //
   1199 //
   1200