/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

//
//

#include "common/macros.h"
#include "common/vk/assert_vk.h"
#include "common/vk/host_alloc.h"
#include "common/vk/cache_vk.h"

//
//
//

#include "hs_vk.h"

//
// Compile-time images of HotSort targets
//

#include "hs/vk/intel/gen8/u32/hs_target.h"
#include "hs/vk/intel/gen8/u64/hs_target.h"

#include "hs/vk/nvidia/sm_35/u32/hs_target.h"
#include "hs/vk/nvidia/sm_35/u64/hs_target.h"

#include "hs/vk/amd/gcn/u32/hs_target.h"
#include "hs/vk/amd/gcn/u64/hs_target.h"

//
//
//

char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns);
char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns);

//
//
//

static
char const *
hs_cpu_sort(void   *       sorted_h,
            uint32_t const hs_words,
            uint32_t const count,
            double * const cpu_ns)
{
  if (hs_words == 1)
    return hs_cpu_sort_u32(sorted_h,count,cpu_ns);
  else
    return hs_cpu_sort_u64(sorted_h,count,cpu_ns);
}

static
void
hs_transpose_slabs_u32(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint32_t *     vout_h,
                       uint32_t const count)
{
  uint32_t const   slab_keys  = hs_width * hs_height;
  size_t   const   slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
  uint32_t * const slab       = ALLOCA_MACRO(slab_size);
  uint32_t         slab_count = count / slab_keys;

  while (slab_count-- > 0)
    {
      memcpy(slab,vout_h,slab_size);

      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }
}

static
void
hs_transpose_slabs_u64(uint32_t const hs_words,
                       uint32_t const hs_width,
                       uint32_t const hs_height,
                       uint64_t *     vout_h,
                       uint32_t const count)
{
  uint32_t const   slab_keys  = hs_width * hs_height;
  size_t   const   slab_size  = sizeof(uint32_t) * hs_words * slab_keys;
  uint64_t * const slab       = ALLOCA_MACRO(slab_size);
  uint32_t         slab_count = count / slab_keys;

  while (slab_count-- > 0)
    {
      memcpy(slab,vout_h,slab_size);

      for (uint32_t row=0; row<hs_height; row++)
        for (uint32_t col=0; col<hs_width; col++)
          vout_h[col * hs_height + row] = slab[row * hs_width + col];

      vout_h += slab_keys;
    }
}

static
void
hs_transpose_slabs(uint32_t const hs_words,
                   uint32_t const hs_width,
                   uint32_t const hs_height,
                   void   *       vout_h,
                   uint32_t const count)
{
  if (hs_words == 1)
    hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count);
  else
    hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count);
}
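//
// Example: with hs_width = 2 and hs_height = 2, the transpose above
// maps a row-major slab { a, b, c, d } to the column-major order
// { a, c, b, d } -- vout_h[col * hs_height + row] picks up
// slab[row * hs_width + col].
//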
//
//
//

#ifndef NDEBUG

static
VkBool32
VKAPI_PTR
vk_debug_report_cb(VkDebugReportFlagsEXT      flags,
                   VkDebugReportObjectTypeEXT objectType,
                   uint64_t                   object,
                   size_t                     location,
                   int32_t                    messageCode,
                   const char*                pLayerPrefix,
                   const char*                pMessage,
                   void*                      pUserData)
{
  char const * flag_str = "";
  bool         is_error = false;

#define VK_FLAG_CASE_TO_STRING(c)               \
  case c:                                       \
    flag_str = #c;                              \
    is_error = true;                            \
    break

  switch (flags)
    {
      // VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_INFORMATION_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_WARNING_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_ERROR_BIT_EXT);
      VK_FLAG_CASE_TO_STRING(VK_DEBUG_REPORT_DEBUG_BIT_EXT);
    }

  if (is_error)
    {
      fprintf(stderr,"%s %s %s\n",
              flag_str,
              pLayerPrefix,
              pMessage);
    }

  return VK_FALSE;
}

#endif

//
//
//

static
uint32_t
hs_rand_u32()
{
  static uint32_t seed = 0xDEADBEEF;

  // Numerical Recipes LCG: seed' = seed * 1664525 + 1013904223 (mod 2^32)
  seed = seed * 1664525 + 1013904223;

  return seed;
}

//
//
//

static
void
hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words)
{
#if   1
  for (uint32_t ii=0; ii<count*words; ii++)
    vin_h[ii] = hs_rand_u32();
#elif 0 // in-order
  memset(vin_h,0,count*words*sizeof(uint32_t));
  for (uint32_t ii=0; ii<count; ii++)
    vin_h[ii*words] = ii;
#else   // reverse order
  memset(vin_h,0,count*words*sizeof(uint32_t));
  for (uint32_t ii=0; ii<count; ii++)
    vin_h[ii*words] = count - 1 - ii;
#endif
}

//
//
//

static
void
hs_debug_u32(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint32_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const slab_keys = hs_width * hs_height;
  uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;

  for (uint32_t ss=0; ss<slabs; ss++) {
    fprintf(stderr,"%u\n",ss);
    for (uint32_t cc=0; cc<hs_height; cc++) {
      for (uint32_t rr=0; rr<hs_width; rr++)
        fprintf(stderr,"%8" PRIX32 " ",*vout_h++);
      fprintf(stderr,"\n");
    }
  }
}

static
void
hs_debug_u64(uint32_t const   hs_width,
             uint32_t const   hs_height,
             uint64_t const * vout_h,
             uint32_t const   count)
{
  uint32_t const slab_keys = hs_width * hs_height;
  uint32_t const slabs     = (count + slab_keys - 1) / slab_keys;

  for (uint32_t ss=0; ss<slabs; ss++) {
    fprintf(stderr,"%u\n",ss);
    for (uint32_t cc=0; cc<hs_height; cc++) {
      for (uint32_t rr=0; rr<hs_width; rr++)
        fprintf(stderr,"%16" PRIX64 " ",*vout_h++);
      fprintf(stderr,"\n");
    }
  }
}

//
//
//
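//
// Select a compile-time HotSort target by PCI vendor id:
// 0x10DE = NVIDIA, 0x8086 = Intel, 0x1002 = AMD.
//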
bool
is_matching_device(VkPhysicalDeviceProperties const * const phy_device_props,
                   struct hs_vk_target const * *      const hs_target,
                   uint32_t                           const vendor_id,
                   uint32_t                           const device_id,
                   uint32_t                           const key_val_words)
{
  if ((phy_device_props->vendorID != vendor_id) || (phy_device_props->deviceID != device_id))
    return false;

  if (phy_device_props->vendorID == 0x10DE)
    {
      //
      // FIXME -- for now, the kernels in this app are targeting
      // sm_35+ devices.  You could add some rigorous rejection by
      // device id here...
      //
      if (key_val_words == 1)
        *hs_target = &hs_nvidia_sm35_u32;
      else
        *hs_target = &hs_nvidia_sm35_u64;
    }
  else if (phy_device_props->vendorID == 0x8086)
    {
      //
      // FIXME -- for now, the kernels in this app are targeting GEN8+
      // devices -- this does *not* include variants of GEN9LP+
      // "Apollo Lake" because that device has a different
      // architectural "shape" than GEN8 GTx.  You could add some
      // rigorous rejection by device id here...
      //
      if (key_val_words == 1)
        *hs_target = &hs_intel_gen8_u32;
      else
        *hs_target = &hs_intel_gen8_u64;
    }
  else if (phy_device_props->vendorID == 0x1002)
    {
      //
      // AMD GCN
      //
      if (key_val_words == 1)
        *hs_target = &hs_amd_gcn_u32;
      else
        *hs_target = &hs_amd_gcn_u64;
    }
  else
    {
      return false;
    }

  return true;
}

//
//
//

uint32_t
vk_find_mem_type_idx(VkPhysicalDeviceMemoryProperties const * phy_device_mem_props,
                     uint32_t              const              compatible_mem_types,
                     VkMemoryPropertyFlags const              required_mem_props,
                     bool                  const              abort)
{
  //
  // FIXME -- jump between indices in the memoryTypeBits mask
  //
  uint32_t const count = phy_device_mem_props->memoryTypeCount;

  for (uint32_t index=0; index<count; index++)
    {
      // acceptable memory type for this resource?
      if ((compatible_mem_types & (1<<index)) == 0)
        continue;

      // otherwise, find first match...
      VkMemoryPropertyFlags const common_props =
        phy_device_mem_props->memoryTypes[index].propertyFlags & required_mem_props;

      if (common_props == required_mem_props)
        return index;
    }

  if (abort)
    {
      fprintf(stderr,"Memory type not found: %X\n",required_mem_props);
      exit(EXIT_FAILURE);
    }

  return UINT32_MAX;
}

//
//
//

#ifdef NDEBUG
#define HS_BENCH_LOOPS   100
#define HS_BENCH_WARMUP  100
#else
#define HS_BENCH_LOOPS   1
#define HS_BENCH_WARMUP  0
#endif

//
//
//
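//
// Command line (all arguments are optional and positional -- see the
// parsing at the top of main()):
//
//   <vendor id (hex)> <device id (hex)> <key/val words (1|2)>
//   <count lo> <count hi> <count step> <loops> <warmup>
//   <linearize (0|1)> <verify (0|1)>
//

//
// The queue submissions below are followed by vkQueueWaitIdle() and
// carry a "FIXME -- put a fence here" note.  A minimal sketch of the
// fence-based wait those FIXMEs allude to (hypothetical helper, not
// used by this benchmark):
//
static
void
hs_queue_submit_with_fence(VkDevice             device,
                           VkQueue              queue,
                           VkSubmitInfo const * submit_info)
{
  static VkFenceCreateInfo const fci = {
    .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
    .pNext = NULL,
    .flags = 0
  };

  VkFence fence;

  vk(CreateFence(device,&fci,NULL,&fence));

  vk(QueueSubmit(queue,1,submit_info,fence));

  // block until this submission completes instead of draining the queue
  vk(WaitForFences(device,1,&fence,VK_TRUE,UINT64_MAX));

  vkDestroyFence(device,fence,NULL);
}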
int
main(int argc, char const * argv[])
{
  //
  // select the target by vendor and device id
  //
  uint32_t const vendor_id     = (argc <= 1) ? UINT32_MAX : strtoul(argv[1],NULL,16);
  uint32_t const device_id     = (argc <= 2) ? UINT32_MAX : strtoul(argv[2],NULL,16);
  uint32_t const key_val_words = (argc <= 3) ? 1          : strtoul(argv[3],NULL,0);

  if ((key_val_words != 1) && (key_val_words != 2))
    {
      fprintf(stderr,"Key/Val Words must be 1 or 2\n");
      exit(EXIT_FAILURE);
    }

  //
  // create a Vulkan instance
  //
  VkApplicationInfo const app_info = {
    .sType              = VK_STRUCTURE_TYPE_APPLICATION_INFO,
    .pNext              = NULL,
    .pApplicationName   = "Google HotSort Bench",
    .applicationVersion = 0,
    .pEngineName        = "Google HotSort Gen",
    .engineVersion      = 0,
    .apiVersion         = VK_API_VERSION_1_1
  };

  char const * const instance_enabled_layers[] = {
    "VK_LAYER_LUNARG_standard_validation"
  };

  char const * const instance_enabled_extensions[] = {
    VK_EXT_DEBUG_REPORT_EXTENSION_NAME
  };

  uint32_t const instance_enabled_layer_count =
#ifndef NDEBUG
    ARRAY_LENGTH_MACRO(instance_enabled_layers)
#else
    0
#endif
    ;

  uint32_t const instance_enabled_extension_count =
#ifndef NDEBUG
    ARRAY_LENGTH_MACRO(instance_enabled_extensions)
#else
    0
#endif
    ;

  VkInstanceCreateInfo const instance_info = {
    .sType                   = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
    .pNext                   = NULL,
    .flags                   = 0,
    .pApplicationInfo        = &app_info,
    .enabledLayerCount       = instance_enabled_layer_count,
    .ppEnabledLayerNames     = instance_enabled_layers,
    .enabledExtensionCount   = instance_enabled_extension_count,
    .ppEnabledExtensionNames = instance_enabled_extensions
  };

  VkInstance instance;

  vk(CreateInstance(&instance_info,NULL,&instance));

  //
  //
  //
#ifndef NDEBUG
  PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT =
    (PFN_vkCreateDebugReportCallbackEXT)
    vkGetInstanceProcAddr(instance,"vkCreateDebugReportCallbackEXT");

  PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT =
    (PFN_vkDestroyDebugReportCallbackEXT)
    vkGetInstanceProcAddr(instance,"vkDestroyDebugReportCallbackEXT");

  struct VkDebugReportCallbackCreateInfoEXT const drcci = {
    .sType       = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT,
    .pNext       = NULL,
    .flags       = UINT32_MAX, // enable everything for now
    .pfnCallback = vk_debug_report_cb,
    .pUserData   = NULL
  };

  VkDebugReportCallbackEXT drc;

  vk(CreateDebugReportCallbackEXT(instance,
                                  &drcci,
                                  NULL,
                                  &drc));
#endif
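  //
  // Naming the flag bits would be equivalent to the UINT32_MAX above
  // for reporting purposes (a sketch -- only the defined bits matter):
  //
  //   .flags = VK_DEBUG_REPORT_INFORMATION_BIT_EXT         |
  //            VK_DEBUG_REPORT_WARNING_BIT_EXT             |
  //            VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT |
  //            VK_DEBUG_REPORT_ERROR_BIT_EXT               |
  //            VK_DEBUG_REPORT_DEBUG_BIT_EXT
  //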
  //
  // acquire all physical devices and select a match
  //
  uint32_t phy_device_count;

  vk(EnumeratePhysicalDevices(instance,
                              &phy_device_count,
                              NULL));

  VkPhysicalDevice * phy_devices = vk_host_alloc(NULL,phy_device_count * sizeof(*phy_devices));

  vk(EnumeratePhysicalDevices(instance,
                              &phy_device_count,
                              phy_devices));

  VkPhysicalDevice           phy_device = VK_NULL_HANDLE;
  VkPhysicalDeviceProperties phy_device_props;

  struct hs_vk_target const * hs_target;

  for (uint32_t ii=0; ii<phy_device_count; ii++)
    {
      VkPhysicalDeviceProperties tmp;

      vkGetPhysicalDeviceProperties(phy_devices[ii],&tmp);

      bool const is_match = is_matching_device(&tmp,
                                               &hs_target,
                                               vendor_id,
                                               device_id,
                                               key_val_words);

      fprintf(stdout,"%c %4X : %4X : %s\n",
              is_match ? '*' : ' ',
              tmp.vendorID,
              tmp.deviceID,
              tmp.deviceName);

      if (is_match)
        {
          phy_device = phy_devices[ii];
          memcpy(&phy_device_props,&tmp,sizeof(tmp));
        }
    }

  if (phy_device == VK_NULL_HANDLE)
    {
      fprintf(stderr,"Device %4X:%4X not found.\n",
              vendor_id & 0xFFFF,
              device_id & 0xFFFF);

      return EXIT_FAILURE;
    }

  vk_host_free(NULL,phy_devices);

  //
  // get the rest of the command line
  //
  uint32_t const slab_size = hs_target->config.slab.height << hs_target->config.slab.width_log2;

  uint32_t const count_lo   = (argc <=  4) ? slab_size       : strtoul(argv[ 4],NULL,0);
  uint32_t const count_hi   = (argc <=  5) ? count_lo        : strtoul(argv[ 5],NULL,0);
  uint32_t const count_step = (argc <=  6) ? count_lo        : strtoul(argv[ 6],NULL,0);
  uint32_t const loops      = (argc <=  7) ? HS_BENCH_LOOPS  : strtoul(argv[ 7],NULL,0);
  uint32_t const warmup     = (argc <=  8) ? HS_BENCH_WARMUP : strtoul(argv[ 8],NULL,0);
  bool     const linearize  = (argc <=  9) ? true            : strtoul(argv[ 9],NULL,0) != 0;
  bool     const verify     = (argc <= 10) ? true            : strtoul(argv[10],NULL,0) != 0;

  //
  // get the physical device's memory props
  //
  VkPhysicalDeviceMemoryProperties phy_device_mem_props;

  vkGetPhysicalDeviceMemoryProperties(phy_device,&phy_device_mem_props);

  //
  // get queue properties
  //
  VkQueueFamilyProperties queue_fam_props[2];
  uint32_t                queue_fam_count = ARRAY_LENGTH_MACRO(queue_fam_props);

  vkGetPhysicalDeviceQueueFamilyProperties(phy_device,&queue_fam_count,queue_fam_props);

  //
  // create device
  //
  float const queue_priorities[] = { 1.0f };

  VkDeviceQueueCreateInfo const queue_info = {
    .sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
    .pNext            = NULL,
    .flags            = 0,
    .queueFamilyIndex = 0,
    .queueCount       = 1,
    .pQueuePriorities = queue_priorities
  };

  //
  // clumsily enable the AMD GCN shader info extension
  //
  char const * const device_enabled_extensions[] = {
#if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
    VK_AMD_SHADER_INFO_EXTENSION_NAME
#endif
  };

  uint32_t device_enabled_extension_count = 0;

#if defined( HS_VK_VERBOSE_STATISTICS_AMD ) || defined( HS_VK_VERBOSE_DISASSEMBLY_AMD )
  if (phy_device_props.vendorID == 0x1002)
    device_enabled_extension_count = 1;
#endif

  //
  // 64-bit keys require the shaderInt64 device feature
  //
  VkPhysicalDeviceFeatures device_features = { false };

  if (key_val_words == 2)
    {
      device_features.shaderInt64 = true;
    }

  VkDeviceCreateInfo const device_info = {
    .sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
    .pNext                   = NULL,
    .flags                   = 0,
    .queueCreateInfoCount    = 1,
    .pQueueCreateInfos       = &queue_info,
    .enabledLayerCount       = 0,
    .ppEnabledLayerNames     = NULL,
    .enabledExtensionCount   = device_enabled_extension_count,
    .ppEnabledExtensionNames = device_enabled_extensions,
    .pEnabledFeatures        = &device_features
  };

  VkDevice device;

  vk(CreateDevice(phy_device,&device_info,NULL,&device));

  //
  // get a queue
  //
  VkQueue queue;

  vkGetDeviceQueue(device,0,0,&queue);

  //
  // get the pipeline cache
  //
  VkPipelineCache pipeline_cache;

  vk_pipeline_cache_create(device,NULL,".vk_cache",&pipeline_cache);
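  //
  // Note: vk_pipeline_cache_create() presumably seeds the pipeline
  // cache from the ".vk_cache" file if it exists, and the matching
  // vk_pipeline_cache_destroy() at the end of main() writes it back
  // out -- see common/vk/cache_vk.h.
  //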
  //
  // create a descriptor set pool
  //
  VkDescriptorPoolSize const dps[] = {
    {
      .type            = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount = 2
    }
  };

  VkDescriptorPoolCreateInfo const dpci = {
    .sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
    .pNext         = NULL,
    .flags         = 0, // VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
    .maxSets       = 1,
    .poolSizeCount = ARRAY_LENGTH_MACRO(dps),
    .pPoolSizes    = dps
  };

  VkDescriptorPool desc_pool;

  vk(CreateDescriptorPool(device,
                          &dpci,
                          NULL, // allocator
                          &desc_pool));

  //
  // create HotSort device instance
  //
  struct hs_vk * hs = hs_vk_create(hs_target,
                                   device,
                                   NULL,
                                   pipeline_cache);

  //
  // create a HotSort descriptor set for this thread
  //
  VkDescriptorSet hs_ds = hs_vk_ds_alloc(hs,desc_pool);

  //
  // create a command pool for this thread
  //
  VkCommandPoolCreateInfo const cmd_pool_info = {
    .sType            = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
    .pNext            = NULL,
    .flags            = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
                        VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
    .queueFamilyIndex = 0,
  };

  VkCommandPool cmd_pool;

  vk(CreateCommandPool(device,
                       &cmd_pool_info,
                       NULL,
                       &cmd_pool));

  //
  // create a query pool for benchmarking
  //
  static VkQueryPoolCreateInfo const query_pool_info = {
    .sType              = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
    .pNext              = NULL,
    .flags              = 0,
    .queryType          = VK_QUERY_TYPE_TIMESTAMP,
    .queryCount         = 4,
    .pipelineStatistics = 0
  };

  VkQueryPool query_pool;

  vk(CreateQueryPool(device,
                     &query_pool_info,
                     NULL,
                     &query_pool));

  //
  // create four big buffers -- buffer_out_count is always the largest
  //
  uint32_t buffer_in_count, buffer_out_count;

  hs_vk_pad(hs,count_hi,&buffer_in_count,&buffer_out_count);

  size_t const buffer_out_size = buffer_out_count * key_val_words * sizeof(uint32_t);

  VkBufferCreateInfo bci = {
    .sType                 = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
    .pNext                 = NULL,
    .flags                 = 0,
    .size                  = buffer_out_size,
    .usage                 = 0,
    .sharingMode           = VK_SHARING_MODE_EXCLUSIVE,
    .queueFamilyIndexCount = 0,
    .pQueueFamilyIndices   = NULL
  };

  VkBuffer vin, vout, sorted, rand;

  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_DST_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &vin));

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &sorted));

  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_SRC_BIT   |
    VK_BUFFER_USAGE_TRANSFER_DST_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &vout));

  bci.usage =
    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
    VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

  vk(CreateBuffer(device,
                  &bci,
                  NULL,
                  &rand));

  //
  // get the memory requirements for each buffer
  //
  VkMemoryRequirements mr_vin, mr_vout, mr_sorted, mr_rand;

  vkGetBufferMemoryRequirements(device,vin,   &mr_vin);
  vkGetBufferMemoryRequirements(device,vout,  &mr_vout);

  vkGetBufferMemoryRequirements(device,sorted,&mr_sorted);
  vkGetBufferMemoryRequirements(device,rand,  &mr_rand);
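  //
  // Placement: vin/vout are GPU-only working buffers and go in
  // DEVICE_LOCAL memory; rand and sorted are mapped on the host -- to
  // upload random keys and read back results for verification -- and
  // go in HOST_VISIBLE|HOST_COHERENT memory.
  //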
  //
  // allocate memory for the buffers
  //
  // for simplicity, all buffers are the same size
  //
  // vin and vout have the same usage
  //
  VkMemoryAllocateInfo const mai_vin_vout = {
    .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    .pNext           = NULL,
    .allocationSize  = mr_vin.size,
    .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
                                            mr_vin.memoryTypeBits,
                                            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                                            true)
  };

  VkMemoryAllocateInfo const mai_sorted_rand = {
    .sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
    .pNext           = NULL,
    .allocationSize  = mr_sorted.size,
    .memoryTypeIndex = vk_find_mem_type_idx(&phy_device_mem_props,
                                            mr_sorted.memoryTypeBits,
                                            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                                            VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
                                            true)
  };

  VkDeviceMemory mem_vin, mem_vout, mem_sorted, mem_rand;

  vk(AllocateMemory(device,
                    &mai_vin_vout,
                    NULL,
                    &mem_vin));

  vk(AllocateMemory(device,
                    &mai_vin_vout,
                    NULL,
                    &mem_vout));

  vk(AllocateMemory(device,
                    &mai_sorted_rand,
                    NULL,
                    &mem_sorted));

  vk(AllocateMemory(device,
                    &mai_sorted_rand,
                    NULL,
                    &mem_rand));

  //
  // bind backing memory to the buffers
  //
  vk(BindBufferMemory(device,vin,   mem_vin,   0));
  vk(BindBufferMemory(device,vout,  mem_vout,  0));

  vk(BindBufferMemory(device,sorted,mem_sorted,0));
  vk(BindBufferMemory(device,rand,  mem_rand,  0));

  //
  // map and fill the rand buffer with random values
  //
  void * rand_h   = vk_host_alloc(NULL,buffer_out_size);
  void * sorted_h = vk_host_alloc(NULL,buffer_out_size);

  hs_fill_rand(rand_h,buffer_out_count,key_val_words);

  void * rand_map;

  vk(MapMemory(device,mem_rand,0,VK_WHOLE_SIZE,0,&rand_map));

  memcpy(rand_map,rand_h,buffer_out_size);

  vkUnmapMemory(device,mem_rand);

  //
  // create a single command buffer for this thread
  //
  VkCommandBufferAllocateInfo const cmd_buffer_info = {
    .sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
    .pNext              = NULL,
    .commandPool        = cmd_pool,
    .level              = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
    .commandBufferCount = 1
  };

  VkCommandBuffer cb;

  vk(AllocateCommandBuffers(device,
                            &cmd_buffer_info,
                            &cb));

  //
  //
  //
  static VkCommandBufferBeginInfo const cb_begin_info = {
    .sType            = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
    .pNext            = NULL,
    .flags            = 0, // VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
    .pInheritanceInfo = NULL
  };

  struct VkSubmitInfo const submit_info = {
    .sType                = VK_STRUCTURE_TYPE_SUBMIT_INFO,
    .pNext                = NULL,
    .waitSemaphoreCount   = 0,
    .pWaitSemaphores      = NULL,
    .pWaitDstStageMask    = NULL,
    .commandBufferCount   = 1,
    .pCommandBuffers      = &cb,
    .signalSemaphoreCount = 0,
    .pSignalSemaphores    = NULL
  };

  //
  // CSV column labels
  //
  fprintf(stdout,
          "Device, "
          "Driver, "
          "Type, "
          "Slab/Linear, "
          "Verified?, "
          "Keys, "
          "Keys Padded In, "
          "Keys Padded Out, "
          "CPU, "
          "Algorithm, "
          "CPU Msecs, "
          "CPU Mkeys/s, "
          "GPU, "
          "Trials, "
          "Avg. Msecs, "
          "Min Msecs, "
          "Max Msecs, "
          "Avg. Mkeys/s, "
          "Max. Mkeys/s\n");
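  //
  // Note: the command pool was created with
  // VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, so the single
  // command buffer (cb) can be individually reset with
  // vkResetCommandBuffer() and rerecorded between phases below.
  //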
  //
  // test a range
  //
  for (uint32_t count=count_lo; count<=count_hi; count+=count_step)
    {
      //
      // size the vin and vout arrays
      //
      uint32_t count_padded_in, count_padded_out;

      hs_vk_pad(hs,count,&count_padded_in,&count_padded_out);

      //
      // initialize vin with 'count' random keys
      //
      vkBeginCommandBuffer(cb,&cb_begin_info);

      VkBufferCopy const copy_rand = {
        .srcOffset = 0,
        .dstOffset = 0,
        .size      = count * key_val_words * sizeof(uint32_t)
      };

      vkCmdCopyBuffer(cb,
                      rand,
                      vin,
                      1,
                      &copy_rand);

      vk(EndCommandBuffer(cb));

      vk(QueueSubmit(queue,
                     1,
                     &submit_info,
                     VK_NULL_HANDLE)); // FIXME -- put a fence here

      // wait for queue to drain
      vk(QueueWaitIdle(queue));
      vk(ResetCommandBuffer(cb,0));

      //
      // build the sorting command buffer
      //
      vkBeginCommandBuffer(cb,&cb_begin_info);

      //
      // starting timestamp
      //
      vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,query_pool,0);

      //
      // bind the vin/vout buffers early
      //
      hs_vk_ds_bind(hs,hs_ds,cb,vin,vout);

      //
      // append sorting commands
      //
      hs_vk_sort(hs,
                 cb,
                 vin,0,0,
                 vout,0,0,
                 count,
                 count_padded_in,
                 count_padded_out,
                 linearize);

      //
      // end timestamp
      //
      vkCmdWriteTimestamp(cb,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,query_pool,1);

      //
      // end the command buffer
      //
      vk(EndCommandBuffer(cb));

      //
      // measure the min/max/avg execution time
      //
      uint64_t elapsed_ns_min = UINT64_MAX;
      uint64_t elapsed_ns_max = 0;
      uint64_t elapsed_ns_sum = 0;

      for (uint32_t ii=0; ii<warmup+loops; ii++)
        {
          if (ii == warmup)
            {
              elapsed_ns_min = UINT64_MAX;
              elapsed_ns_max = 0;
              elapsed_ns_sum = 0;
            }

          vk(QueueSubmit(queue,
                         1,
                         &submit_info,
                         VK_NULL_HANDLE)); // FIXME -- put a fence here

          // wait for queue to drain
          vk(QueueWaitIdle(queue));

          // get results
          uint64_t timestamps[2];

          vk(GetQueryPoolResults(device,query_pool,
                                 0,ARRAY_LENGTH_MACRO(timestamps),
                                 sizeof(timestamps),
                                 timestamps,
                                 sizeof(timestamps[0]),
                                 VK_QUERY_RESULT_64_BIT |
                                 VK_QUERY_RESULT_WAIT_BIT));

          uint64_t const t = timestamps[1] - timestamps[0];

          elapsed_ns_min  = MIN_MACRO(elapsed_ns_min,t);
          elapsed_ns_max  = MAX_MACRO(elapsed_ns_max,t);
          elapsed_ns_sum += t;
        }

      vk(ResetCommandBuffer(cb,0));
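      //
      // The timestamp deltas accumulated above are in GPU "ticks",
      // not nanoseconds -- the REPORT below scales them by
      // VkPhysicalDeviceLimits.timestampPeriod (nanoseconds per
      // tick), e.g.:
      //
      //   double const ns = (double)(timestamps[1] - timestamps[0]) * timestamp_period;
      //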
      //
      // copy the results back and, optionally, verify them
      //
      char const * cpu_algo = NULL;
      double       cpu_ns   = 0.0;
      bool         verified = false;

      if (verify)
        {
          size_t const size_padded_in = count_padded_in * key_val_words * sizeof(uint32_t);

          vkBeginCommandBuffer(cb,&cb_begin_info);

          VkBufferCopy const copy_vout = {
            .srcOffset = 0,
            .dstOffset = 0,
            .size      = size_padded_in
          };

          vkCmdCopyBuffer(cb,
                          vout,
                          sorted,
                          1,
                          &copy_vout);

          vk(EndCommandBuffer(cb));

          vk(QueueSubmit(queue,
                         1,
                         &submit_info,
                         VK_NULL_HANDLE)); // FIXME -- put a fence here

          // wait for queue to drain
          vk(QueueWaitIdle(queue));
          vk(ResetCommandBuffer(cb,0));

          size_t const size_sorted_h = count * key_val_words * sizeof(uint32_t);

          // copy the random keys and sort them on the CPU -- the
          // padded tail is filled with all-ones "max" keys
          memcpy(sorted_h,rand_h,size_sorted_h);
          memset((uint8_t*)sorted_h + size_sorted_h,-1,size_padded_in-size_sorted_h);

          cpu_algo = hs_cpu_sort(sorted_h,key_val_words,count_padded_in,&cpu_ns);

          void * sorted_map;

          vk(MapMemory(device,mem_sorted,0,VK_WHOLE_SIZE,0,&sorted_map));

          if (!linearize) {
            hs_transpose_slabs(key_val_words,
                               1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count_padded_in);
          }

          // verify
          verified = memcmp(sorted_h,sorted_map,size_padded_in) == 0;

#ifndef NDEBUG
          if (!verified)
            {
              if (key_val_words == 1)
                {
                  hs_debug_u32(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_h,
                               count);

                  hs_debug_u32(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count);
                }
              else // ulong
                {
                  hs_debug_u64(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_h,
                               count);

                  hs_debug_u64(1u<<hs_target->config.slab.width_log2,
                               hs_target->config.slab.height,
                               sorted_map,
                               count);
                }
            }
#endif

          vkUnmapMemory(device,mem_sorted);
        }

      //
      // REPORT
      //
      float const timestamp_period = phy_device_props.limits.timestampPeriod;

      fprintf(stdout,"%s, %u.%u.%u.%u, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n",
              phy_device_props.deviceName,
              (phy_device_props.driverVersion>>24)&0xFF,
              (phy_device_props.driverVersion>>16)&0xFF,
              (phy_device_props.driverVersion>> 8)&0xFF,
              (phy_device_props.driverVersion    )&0xFF,
              (key_val_words == 1) ? "uint" : "ulong",
              linearize ? "linear" : "slab",
              verify ? (verified ? " OK " : "*FAIL*") : "UNVERIFIED",
              count,
              count_padded_in,
              count_padded_out,
              // CPU
              verify ? cpu_algo : "UNVERIFIED",
              verify ? (cpu_ns / 1000000.0)      : 0.0, // milliseconds
              verify ? (1000.0 * count / cpu_ns) : 0.0, // mkeys / sec
              // GPU
              loops,
              timestamp_period * elapsed_ns_sum / 1e6 / loops, // avg msecs
              timestamp_period * elapsed_ns_min / 1e6,         // min msecs
              timestamp_period * elapsed_ns_max / 1e6,         // max msecs
              1000.0 * count * loops / (timestamp_period * elapsed_ns_sum), // mkeys / sec - avg
              1000.0 * count / (timestamp_period * elapsed_ns_min));        // mkeys / sec - max
    }

  // reset the descriptor pool
  vk(ResetDescriptorPool(device,desc_pool,0));
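  //
  // Note: the pool was created without
  // VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, so hs_ds
  // can't be freed individually -- resetting the pool returns all of
  // its sets at once.
  //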
  //
  // cleanup
  //

  // release shared HotSort state
  hs_vk_release(hs);

  // destroy the buffers (before freeing their device memory)
  vkDestroyBuffer(device,vin,   NULL);
  vkDestroyBuffer(device,vout,  NULL);
  vkDestroyBuffer(device,sorted,NULL);
  vkDestroyBuffer(device,rand,  NULL);

  // free device memory
  vkFreeMemory(device,mem_vin,   NULL);
  vkFreeMemory(device,mem_vout,  NULL);
  vkFreeMemory(device,mem_sorted,NULL);
  vkFreeMemory(device,mem_rand,  NULL);

  // free host memory
  vk_host_free(NULL,rand_h);
  vk_host_free(NULL,sorted_h);

  // destroy the descriptor pool
  vkDestroyDescriptorPool(device,desc_pool,NULL);

  // destroy the remaining Vulkan objects
  vkDestroyQueryPool(device,query_pool,NULL);

  vkFreeCommandBuffers(device,cmd_pool,1,&cb);
  vkDestroyCommandPool(device,cmd_pool,NULL);

  vk_pipeline_cache_destroy(device,NULL,".vk_cache",pipeline_cache);

  vkDestroyDevice(device,NULL);

#ifndef NDEBUG
  vkDestroyDebugReportCallbackEXT(instance,drc,NULL);
#endif

  vkDestroyInstance(instance,NULL);

  return EXIT_SUCCESS;
}

//
//
//