1 /* 2 * Copyright 2016 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can 5 * be found in the LICENSE file. 6 * 7 */ 8 9 // 10 // 11 // 12 13 #include <stdlib.h> 14 #include <stdio.h> 15 #include <string.h> 16 #include <inttypes.h> 17 18 // 19 // squelch OpenCL 1.2 deprecation warning 20 // 21 22 #ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS 23 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 24 #endif 25 26 #include "common/macros.h" 27 #include "common/cl/assert_cl.h" 28 #include "common/cl/find_cl.h" 29 // 30 // 31 // 32 33 #include "hs_cl.h" 34 35 // 36 // FIXME -- LIMITED TO INTEL / GEN8+ FOR NOW 37 // 38 39 #include "intel/gen8/u32/hs_target.h" 40 #include "intel/gen8/u64/hs_target.h" 41 42 // #include "intel/gen9lp/u32/hs_target.h" 43 // #include "intel/gen9lp/u64/hs_target.h" 44 45 // 46 // The quality of the RNG doesn't matter. The same number of 47 // instructions will be run no matter what the key distribution looks 48 // like. So here is something small and fast. 49 // 50 51 static 52 uint32_t 53 hs_rand_u32() 54 { 55 static uint32_t seed = 0xDEADBEEF; 56 57 // Numerical Recipes 58 seed = seed * 1664525 + 1013904223; 59 60 return seed; 61 } 62 63 // 64 // 65 // 66 67 static 68 void 69 hs_fill_rand(uint32_t * vin_h, uint32_t const count, uint32_t const words) 70 { 71 #if 1 72 for (uint32_t ii=0; ii<count*words; ii++) 73 vin_h[ii] = hs_rand_u32(); 74 #elif 0 // in-order 75 memset(vin_h,0,count*words*sizeof(uint32_t)); 76 for (uint32_t ii=0; ii<count; ii++) 77 vin_h[ii*words] = ii; 78 #else // reverse order 79 memset(vin_h,0,count*words*sizeof(uint32_t)); 80 for (uint32_t ii=0; ii<count; ii++) 81 vin_h[ii*words] = count - 1 - ii; 82 #endif 83 } 84 85 // 86 // 87 // 88 89 char const * hs_cpu_sort_u32(uint32_t * a, uint32_t const count, double * const cpu_ns); 90 char const * hs_cpu_sort_u64(uint64_t * a, uint32_t const count, double * const cpu_ns); 91 92 // 93 // 94 // 95 96 static 97 char const * 98 hs_cpu_sort(void * sorted_h, 99 uint32_t const hs_words, 100 uint32_t const count, 101 double * const cpu_ns) 102 { 103 if (hs_words == 1) 104 return hs_cpu_sort_u32(sorted_h,count,cpu_ns); 105 else 106 return hs_cpu_sort_u64(sorted_h,count,cpu_ns); 107 } 108 109 static 110 void 111 hs_transpose_slabs_u32(uint32_t const hs_words, 112 uint32_t const hs_width, 113 uint32_t const hs_height, 114 uint32_t * vout_h, 115 uint32_t const count) 116 { 117 uint32_t const slab_keys = hs_width * hs_height; 118 size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; 119 uint32_t * const slab = ALLOCA_MACRO(slab_size); 120 uint32_t slab_count = count / slab_keys; 121 122 while (slab_count-- > 0) 123 { 124 memcpy(slab,vout_h,slab_size); 125 126 for (uint32_t row=0; row<hs_height; row++) 127 for (uint32_t col=0; col<hs_width; col++) 128 vout_h[col * hs_height + row] = slab[row * hs_width + col]; 129 130 vout_h += slab_keys; 131 } 132 } 133 134 static 135 void 136 hs_transpose_slabs_u64(uint32_t const hs_words, 137 uint32_t const hs_width, 138 uint32_t const hs_height, 139 uint64_t * vout_h, 140 uint32_t const count) 141 { 142 uint32_t const slab_keys = hs_width * hs_height; 143 size_t const slab_size = sizeof(uint32_t) * hs_words * slab_keys; 144 uint64_t * const slab = ALLOCA_MACRO(slab_size); 145 uint32_t slab_count = count / slab_keys; 146 147 while (slab_count-- > 0) 148 { 149 memcpy(slab,vout_h,slab_size); 150 151 for (uint32_t row=0; row<hs_height; row++) 152 for (uint32_t col=0; col<hs_width; col++) 153 vout_h[col * hs_height + row] = slab[row * hs_width + col]; 154 155 vout_h += slab_keys; 156 } 157 } 158 159 static 160 void 161 hs_transpose_slabs(uint32_t const hs_words, 162 uint32_t const hs_width, 163 uint32_t const hs_height, 164 void * vout_h, 165 uint32_t const count) 166 { 167 if (hs_words == 1) 168 hs_transpose_slabs_u32(hs_words,hs_width,hs_height,vout_h,count); 169 else 170 hs_transpose_slabs_u64(hs_words,hs_width,hs_height,vout_h,count); 171 } 172 173 // 174 // 175 // 176 177 static 178 void 179 hs_debug_u32(uint32_t const hs_width, 180 uint32_t const hs_height, 181 uint32_t const * vout_h, 182 uint32_t const count) 183 { 184 uint32_t const slab_keys = hs_width * hs_height; 185 uint32_t const slabs = (count + slab_keys - 1) / slab_keys; 186 187 for (uint32_t ss=0; ss<slabs; ss++) { 188 fprintf(stderr,"%u\n",ss); 189 for (uint32_t cc=0; cc<hs_height; cc++) { 190 for (uint32_t rr=0; rr<hs_width; rr++) 191 fprintf(stderr,"%8" PRIX32 " ",*vout_h++); 192 fprintf(stderr,"\n"); 193 } 194 } 195 } 196 197 static 198 void 199 hs_debug_u64(uint32_t const hs_width, 200 uint32_t const hs_height, 201 uint64_t const * vout_h, 202 uint32_t const count) 203 { 204 uint32_t const slab_keys = hs_width * hs_height; 205 uint32_t const slabs = (count + slab_keys - 1) / slab_keys; 206 207 for (uint32_t ss=0; ss<slabs; ss++) { 208 fprintf(stderr,"%u\n",ss); 209 for (uint32_t cc=0; cc<hs_height; cc++) { 210 for (uint32_t rr=0; rr<hs_width; rr++) 211 fprintf(stderr,"%16" PRIX64 " ",*vout_h++); 212 fprintf(stderr,"\n"); 213 } 214 } 215 } 216 217 // 218 // Used for benchmarking on out-of-order queues. Attaching an event 219 // to a kernel on an OOQ with profiling enabled will result in a 220 // synchronization point and block concurrent execution of kernels. 221 // 222 // The workaround that enables measuring the entire runtime of the 223 // sort is to launch a dummy kernel with an event, a barrier without 224 // an event, then the call to hs_sort(), followed by a final dummy 225 // kernel with an event. 226 // 227 // The end time of the first dummy and start time of the second dummy 228 // will provide a conservative estimate of the total execution time of 229 // the hs_sort() routine. 230 // 231 // Note that once kernels are enqueued they are scheduled with only 232 // microseconds between them so this should only be a small number of 233 // microseconds longer than the true hs_sort() execution time. 234 // 235 236 #define HS_DUMMY_KERNEL_PROGRAM "kernel void hs_dummy_kernel() { ; }" 237 238 static cl_kernel hs_dummy_kernel; 239 240 static 241 void 242 hs_dummy_kernel_create(cl_context context, cl_device_id device_id) 243 { 244 cl_int err; 245 246 char const * strings[] = { HS_DUMMY_KERNEL_PROGRAM }; 247 size_t const strings_sizeof[] = { sizeof(HS_DUMMY_KERNEL_PROGRAM) }; 248 249 cl_program program = clCreateProgramWithSource(context, 250 1, 251 strings, 252 strings_sizeof, 253 &err); cl_ok(err); 254 cl(BuildProgram(program, 255 1, 256 &device_id, 257 NULL, 258 NULL, 259 NULL)); 260 261 hs_dummy_kernel = clCreateKernel(program,"hs_dummy_kernel",&err); cl_ok(err); 262 263 cl(ReleaseProgram(program)); 264 } 265 266 static 267 void 268 hs_dummy_kernel_release() 269 { 270 cl(ReleaseKernel(hs_dummy_kernel)); 271 } 272 273 static 274 void 275 hs_dummy_kernel_enqueue(cl_command_queue cq, 276 uint32_t wait_list_size, 277 cl_event const * wait_list, 278 cl_event * event) 279 { 280 size_t const global_work_size = 1; 281 282 cl(EnqueueNDRangeKernel(cq, 283 hs_dummy_kernel, 284 1, 285 NULL, 286 &global_work_size, 287 NULL, 288 wait_list_size, 289 wait_list, 290 event)); 291 } 292 293 // 294 // 295 // 296 297 static 298 void 299 hs_bench(cl_context context, 300 cl_command_queue cq, 301 cl_command_queue cq_profile, 302 char const * const device_name, 303 char const * const driver_version, 304 uint32_t const hs_words, 305 uint32_t const hs_width, 306 uint32_t const hs_height, 307 struct hs_cl const * const hs, 308 uint32_t const count_lo, 309 uint32_t const count_hi, 310 uint32_t const count_step, 311 uint32_t const loops, 312 uint32_t const warmup, 313 bool const linearize) 314 { 315 // 316 // return if nothing to do 317 // 318 if (count_hi <= 1) 319 return; 320 321 // 322 // size the arrays 323 // 324 uint32_t count_hi_padded_in, count_hi_padded_out; 325 326 hs_cl_pad(hs,count_hi,&count_hi_padded_in,&count_hi_padded_out); 327 328 // 329 // SIZE 330 // 331 size_t const key_size = sizeof(uint32_t) * hs_words; 332 333 size_t const size_hi_in = count_hi_padded_in * key_size; 334 size_t const size_hi_out = count_hi_padded_out * key_size; 335 336 // 337 // ALLOCATE 338 // 339 cl_int cl_err; 340 341 void * sorted_h = malloc(size_hi_in); 342 343 cl_mem random = clCreateBuffer(context, 344 CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, 345 size_hi_in, 346 NULL,&cl_err); cl_ok(cl_err); 347 348 cl_mem vin = clCreateBuffer(context, 349 CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 350 size_hi_in, 351 NULL,&cl_err); cl_ok(cl_err); 352 353 cl_mem vout = clCreateBuffer(context, 354 CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 355 size_hi_out, 356 NULL,&cl_err); cl_ok(cl_err); 357 // 358 // BLOCKING MAP AND INIT KEYS 359 // 360 { 361 void * random_h = clEnqueueMapBuffer(cq, 362 random, 363 CL_TRUE, 364 CL_MAP_WRITE_INVALIDATE_REGION, 365 0,size_hi_in, 366 0,NULL,NULL, 367 &cl_err); cl_ok(cl_err); 368 369 // fill with random numbers 370 hs_fill_rand(random_h,count_hi,hs_words); 371 372 // 373 // UNMAP 374 // 375 cl(EnqueueUnmapMemObject(cq,random,random_h,0,NULL,NULL)); 376 } 377 378 // 379 // BENCHMARK 380 // 381 for (uint32_t count=count_lo; count<=count_hi; count+=count_step) 382 { 383 // compute padding before sorting 384 uint32_t count_padded_in, count_padded_out; 385 386 hs_cl_pad(hs,count,&count_padded_in,&count_padded_out); 387 388 cl_ulong elapsed_ns_min = UINT64_MAX; 389 cl_ulong elapsed_ns_max = 0; 390 cl_ulong elapsed_ns_sum = 0; 391 392 cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); 393 cl(Finish(cq)); 394 395 for (uint32_t ii=0; ii<warmup+loops; ii++) 396 { 397 if (ii == warmup) 398 { 399 elapsed_ns_min = UINT64_MAX; 400 elapsed_ns_max = 0; 401 elapsed_ns_sum = 0; 402 } 403 404 #if 0 405 // 406 // optionally, initialize vin on every loop -- no need 407 // 408 cl(EnqueueCopyBuffer(cq,random,vin,0,0,count * key_size,0,NULL,NULL)); 409 cl(Finish(cq)); 410 #endif 411 412 // 413 // sort vin 414 // 415 cl_event start, complete, end; 416 417 hs_dummy_kernel_enqueue(cq_profile,0,NULL,&start); 418 419 // note hs_sort enqueues a final barrier 420 hs_cl_sort(hs, 421 cq, 422 1,&start,&complete, 423 vin,vout, 424 count, 425 count_padded_in, 426 count_padded_out, 427 linearize); 428 429 hs_dummy_kernel_enqueue(cq_profile,1,&complete,&end); 430 431 cl(Finish(cq_profile)); 432 433 // 434 // measure duration 435 // 436 cl_ulong t_start=0, t_end=0; 437 438 // start 439 cl(GetEventProfilingInfo(start, 440 CL_PROFILING_COMMAND_END, 441 sizeof(cl_ulong), 442 &t_start, 443 NULL)); 444 445 // end 446 cl(GetEventProfilingInfo(end, 447 CL_PROFILING_COMMAND_START, 448 sizeof(cl_ulong), 449 &t_end, 450 NULL)); 451 452 cl_ulong const t = t_end - t_start; 453 454 elapsed_ns_min = MIN_MACRO(elapsed_ns_min,t); 455 elapsed_ns_max = MAX_MACRO(elapsed_ns_max,t); 456 elapsed_ns_sum += t; 457 458 cl(ReleaseEvent(start)); 459 cl(ReleaseEvent(complete)); 460 cl(ReleaseEvent(end)); 461 } 462 463 // 464 // COPY KEYS BACK FOR VERIFICATION 465 // 466 size_t const size_padded_in = count_padded_in * key_size; 467 468 void * vin_h = clEnqueueMapBuffer(cq, 469 vin, 470 CL_FALSE, 471 CL_MAP_READ, 472 0,size_padded_in, 473 0,NULL,NULL, 474 &cl_err); cl_ok(cl_err); 475 476 void * vout_h = clEnqueueMapBuffer(cq, 477 vout, 478 CL_FALSE, 479 CL_MAP_READ, 480 0,size_padded_in, 481 0,NULL,NULL, 482 &cl_err); cl_ok(cl_err); 483 cl(Finish(cq)); 484 485 // 486 // SORT THE UNTOUCHED RANDOM INPUT 487 // 488 memcpy(sorted_h,vin_h,size_padded_in); 489 490 double cpu_ns; 491 492 char const * const algo = hs_cpu_sort(sorted_h,hs_words,count_padded_in,&cpu_ns); 493 494 // 495 // EXPLICITLY TRANSPOSE THE CPU SORTED SLABS IF NOT LINEARIZING 496 // 497 if (!linearize) 498 hs_transpose_slabs(hs_words,hs_width,hs_height,vout_h,count_padded_in); 499 500 // 501 // VERIFY 502 // 503 bool const verified = memcmp(sorted_h,vout_h,size_padded_in) == 0; 504 505 #ifndef NDEBUG 506 if (!verified) 507 { 508 if (hs_words == 1) 509 hs_debug_u32(hs_width,hs_height,vout_h,count); 510 else // ulong 511 hs_debug_u64(hs_width,hs_height,vout_h,count); 512 } 513 #endif 514 515 cl(EnqueueUnmapMemObject(cq,vin, vin_h, 0,NULL,NULL)); 516 cl(EnqueueUnmapMemObject(cq,vout,vout_h,0,NULL,NULL)); 517 518 cl(Finish(cq)); 519 520 // 521 // REPORT 522 // 523 fprintf(stdout,"%s, %s, %s, %s, %s, %8u, %8u, %8u, CPU, %s, %9.2f, %6.2f, GPU, %9u, %7.3f, %7.3f, %7.3f, %6.2f, %6.2f\n", 524 device_name, 525 driver_version, 526 (hs_words == 1) ? "uint" : "ulong", 527 linearize ? "linear" : "slab", 528 verified ? " OK " : "*FAIL*", 529 count, 530 count_padded_in, 531 count_padded_out, 532 // CPU 533 algo, 534 cpu_ns / 1000000.0, // milliseconds 535 1000.0 * count / cpu_ns, // mkeys / sec 536 // GPU 537 loops, 538 elapsed_ns_sum / 1000000.0 / loops, // avg msecs 539 elapsed_ns_min / 1000000.0, // min msecs 540 elapsed_ns_max / 1000000.0, // max msecs 541 1000.0 * count * loops / elapsed_ns_sum, // mkeys / sec - avg 542 1000.0 * count / elapsed_ns_min); // mkeys / sec - max 543 544 // quit early if not verified 545 if (!verified) 546 break; 547 } 548 549 // 550 // dispose 551 // 552 cl(ReleaseMemObject(vout)); 553 cl(ReleaseMemObject(vin)); 554 cl(ReleaseMemObject(random)); 555 free(sorted_h); 556 } 557 558 // 559 // 560 // 561 562 int 563 main(int argc, char const * argv[]) 564 { 565 char const * const target_platform_substring = "Intel"; 566 char const * const target_device_substring = "Graphics"; 567 568 // 569 // find platform and device ids 570 // 571 cl_platform_id platform_id; 572 cl_device_id device_id; 573 574 #define HS_DEVICE_NAME_SIZE 64 575 576 char device_name[HS_DEVICE_NAME_SIZE]; 577 size_t device_name_size; 578 579 cl(FindIdsByName(target_platform_substring, 580 target_device_substring, 581 &platform_id, 582 &device_id, 583 HS_DEVICE_NAME_SIZE, 584 device_name, 585 &device_name_size, 586 true)); 587 // 588 // create context 589 // 590 cl_context_properties context_properties[] = 591 { 592 CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 593 0 594 }; 595 596 cl_int cl_err; 597 cl_context context = clCreateContext(context_properties, 598 1, 599 &device_id, 600 NULL, 601 NULL, 602 &cl_err); 603 cl_ok(cl_err); 604 605 // 606 // create command queue 607 // 608 #if 0 // OPENCL 2.0 609 610 cl_queue_properties props[] = { 611 CL_QUEUE_PROPERTIES, 612 (cl_queue_properties)CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 613 #ifndef NDEBUG 614 (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, 615 #endif 616 0 617 }; 618 619 cl_queue_properties props_profile[] = { 620 CL_QUEUE_PROPERTIES, 621 (cl_queue_properties)CL_QUEUE_PROFILING_ENABLE, 622 0 623 }; 624 625 cl_command_queue cq = clCreateCommandQueueWithProperties(context, 626 device_id, 627 props, 628 &cl_err); cl_ok(cl_err); 629 630 cl_command_queue cq_profile = clCreateCommandQueueWithProperties(context, 631 device_id, 632 props_profile, 633 &cl_err); cl_ok(cl_err); 634 #else // OPENCL 1.2 635 636 cl_command_queue cq = clCreateCommandQueue(context, 637 device_id, 638 #ifndef NDEBUG 639 CL_QUEUE_PROFILING_ENABLE | 640 #endif 641 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 642 &cl_err); cl_ok(cl_err); 643 644 cl_command_queue cq_profile = clCreateCommandQueue(context, 645 device_id, 646 CL_QUEUE_PROFILING_ENABLE, 647 &cl_err); cl_ok(cl_err); 648 #endif 649 650 // 651 // Intel GEN workaround -- create dummy kernel for semi-accurate 652 // profiling on an out-of-order queue. 653 // 654 hs_dummy_kernel_create(context,device_id); 655 656 // 657 // select the target 658 // 659 660 uint32_t const key_val_words = (argc == 1) ? 2 : strtoul(argv[1],NULL,0); 661 662 struct hs_cl_target const * hs_target; 663 664 if (key_val_words == 1) 665 hs_target = &hs_intel_gen8_u32; 666 else 667 hs_target = &hs_intel_gen8_u64; 668 669 // 670 // create kernels 671 // 672 fprintf(stdout,"Creating... "); 673 674 struct hs_cl * const hs = hs_cl_create(hs_target,context,device_id); 675 676 fprintf(stdout,"done.\n"); 677 678 // 679 // 680 // 681 682 #ifdef NDEBUG 683 #define HS_BENCH_LOOPS 100 684 #define HS_BENCH_WARMUP 100 685 #else 686 #define HS_BENCH_LOOPS 1 687 #define HS_BENCH_WARMUP 0 688 #endif 689 690 // 691 // sort sizes and loops 692 // 693 uint32_t const kpb = hs_target->config.slab.height << hs_target->config.slab.width_log2; 694 695 uint32_t const count_lo = (argc <= 2) ? kpb : strtoul(argv[2],NULL,0); 696 uint32_t const count_hi = (argc <= 3) ? count_lo : strtoul(argv[3],NULL,0); 697 uint32_t const count_step = (argc <= 4) ? count_lo : strtoul(argv[4],NULL,0); 698 uint32_t const loops = (argc <= 5) ? HS_BENCH_LOOPS : strtoul(argv[5],NULL,0); 699 uint32_t const warmup = (argc <= 6) ? HS_BENCH_WARMUP : strtoul(argv[6],NULL,0); 700 bool const linearize = (argc <= 7) ? true : strtoul(argv[7],NULL,0); 701 702 // 703 // labels 704 // 705 fprintf(stdout, 706 "Device, " 707 "Driver, " 708 "Type, " 709 "Slab/Linear, " 710 "Verified?, " 711 "Keys, " 712 "Keys Padded In, " 713 "Keys Padded Out, " 714 "CPU Algorithm, " 715 "CPU Msecs, " 716 "CPU Mkeys/s, " 717 "Trials, " 718 "Avg. Msecs, " 719 "Min Msecs, " 720 "Max Msecs, " 721 "Avg. Mkeys/s, " 722 "Max. Mkeys/s\n"); 723 724 // 725 // we want to track driver versions 726 // 727 size_t driver_version_size; 728 729 cl(GetDeviceInfo(device_id, 730 CL_DRIVER_VERSION, 731 0, 732 NULL, 733 &driver_version_size)); 734 735 char * const driver_version = ALLOCA_MACRO(driver_version_size); 736 737 cl(GetDeviceInfo(device_id, 738 CL_DRIVER_VERSION, 739 driver_version_size, 740 driver_version, 741 NULL)); 742 // 743 // benchmark 744 // 745 hs_bench(context, 746 cq,cq_profile, 747 device_name, 748 driver_version, 749 hs_target->config.words.key + hs_target->config.words.val, 750 1 << hs_target->config.slab.width_log2, 751 hs_target->config.slab.height, 752 hs, 753 count_lo, 754 count_hi, 755 count_step, 756 loops, 757 warmup, 758 linearize); 759 760 // 761 // release everything 762 // 763 hs_cl_release(hs); 764 765 hs_dummy_kernel_release(); 766 767 cl(ReleaseCommandQueue(cq)); 768 cl(ReleaseCommandQueue(cq_profile)); 769 770 cl(ReleaseContext(context)); 771 772 return 0; 773 } 774