/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "path.h"
#include "block_pool_cl.h"
#include "path_builder_cl_12.h"
#include "kernel_cl_12.h"

//
//
//

#if 0

//
// SIMD AVX2
//

#define SKC_PATHS_COPY_ELEM_WORDS     8
#define SKC_PATHS_COPY_SUBGROUP_SIZE  1
#define SKC_PATHS_COPY_KERNEL_ATTRIBS

typedef skc_uint8 skc_paths_copy_elem;
typedef skc_uint8 skc_pb_idx_v;

#define SKC_PATHS_COPY_ELEM_EXPAND() SKC_EXPAND_8()

#define SKC_IS_NOT_PATH_HEAD(sg,I) ((sg) + I >= SKC_PATH_HEAD_WORDS)

#endif

//
//
//

#define SKC_PATHS_COPY_SUBGROUP_SIZE_MASK (SKC_PATHS_COPY_SUBGROUP_SIZE - 1)
#define SKC_PATHS_COPY_ELEMS_PER_BLOCK    (SKC_DEVICE_BLOCK_WORDS    / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK (SKC_DEVICE_SUBBLOCK_WORDS / SKC_PATHS_COPY_ELEM_WORDS)
#define SKC_PATHS_COPY_ELEMS_PER_THREAD   (SKC_PATHS_COPY_ELEMS_PER_BLOCK / SKC_PATHS_COPY_SUBGROUP_SIZE)

// FIXME -- use SUBGROUP terminology everywhere
#define SKC_PATHS_COPY_SUBGROUP_WORDS     (SKC_PATHS_COPY_SUBGROUP_SIZE * SKC_PATHS_COPY_ELEM_WORDS)

//
//
//

#define SKC_PATHS_COPY_ELEMS_BEFORE_HEADER                              \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS / SKC_PATHS_COPY_ELEM_WORDS) / SKC_PATHS_COPY_SUBGROUP_WORDS))

#define SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER                           \
  (SKC_PATHS_COPY_SUBGROUP_SIZE * ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_SUBGROUP_WORDS - 1) / SKC_PATHS_COPY_SUBGROUP_WORDS))

// #define SKC_PATHS_COPY_HEAD_ELEMS ((SKC_PATH_HEAD_WORDS + SKC_PATHS_COPY_ELEM_WORDS - 1) / SKC_PATHS_COPY_ELEM_WORDS)

//
//
//

//
// BIT-FIELD EXTRACT/INSERT ARE NOT AVAILABLE IN OPENCL
//

#define SKC_CMD_PATHS_COPY_ONE_BITS (SKC_TAGGED_BLOCK_ID_BITS_TAG + SKC_DEVICE_SUBBLOCK_WORDS_LOG2)

#define SKC_CMD_PATHS_COPY_ONE_MASK SKC_BITS_TO_MASK(SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_ONE      (1u << SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_GET_TAG(ti)          SKC_TAGGED_BLOCK_ID_GET_TAG(ti)

#define SKC_CMD_PATHS_COPY_GET_ROLLING(ti)      ((ti) >> SKC_CMD_PATHS_COPY_ONE_BITS)

#define SKC_CMD_PATHS_COPY_UPDATE_ROLLING(ti,b) (((ti) & SKC_CMD_PATHS_COPY_ONE_MASK) | ((b) << SKC_TAGGED_BLOCK_ID_BITS_TAG))

//
//
//

skc_uint
skc_sub_group_local_id()
{
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  return get_sub_group_local_id();
#else
  return 0;
#endif
}

//
// convert an atomic read counter offset to a block id
//

skc_block_id_t
skc_bp_off_to_id(__global skc_block_id_t const * const bp_ids,
                 skc_uint                  const       bp_idx_mask,
                 skc_uint                  const       bp_reads,
                 skc_uint                  const       bp_off)
{
  skc_uint const bp_idx = (bp_reads + bp_off) & bp_idx_mask;

  return bp_ids[bp_idx];
}
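
//
// Worked example of the rolling-offset-to-block-id mapping above.  The
// numbers are illustrative assumptions, not the actual configuration:
//
//   - a copy command's rolling allocation count lives in the bits above
//     SKC_CMD_PATHS_COPY_ONE_BITS, so its offset from the batch base is
//     bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling)
//   - suppose the block pool id ring holds 8 entries (bp_idx_mask = 7),
//     bp_reads = 13 and bp_off = 2
//   - then bp_idx = (13 + 2) & 7 = 7 and the command's block id is
//     bp_ids[7]
//
// The '&' with the pow2 mask is the modulo wrap onto the ring (see the
// bp_idx_mask comment in the kernel signature below).
//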

//
//
//

void
skc_copy_segs(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx)
{
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      (bp_elems+bp_elems_idx)[ii] = (pb_elems+pb_elems_idx)[ii];
    }

#if 0
  //
  // NOTE THIS IS PRINTING 8 ROWS
  //
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[0*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[1*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[2*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[3*SKC_PATHS_COPY_SUBGROUP_SIZE]));
  printf("%5u : (%8u) : { { %5.0f, %5.0f }, { %5.0f, %5.0f } },\n",
         (skc_uint)get_global_id(0),pb_elems_idx,
         as_float((pb_elems+pb_elems_idx)[4*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[5*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[6*SKC_PATHS_COPY_SUBGROUP_SIZE]),
         as_float((pb_elems+pb_elems_idx)[7*SKC_PATHS_COPY_SUBGROUP_SIZE]));
#endif
}

//
//
//

void
skc_copy_node(__global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // remap block id tags from the host-side rolling counter to a
  // device-side block pool id
  //
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem = (pb_elems + pb_elems_idx)[ii];

      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // only convert if original elem is not invalid

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: < %8X, %8X, %8X >\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}
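
//
// Sketch of what the SKC_EXPAND_X expansion above amounts to.  These
// are assumptions about the shared expansion macros in the skc
// headers, shown for illustration only:
//
//   - when SKC_PATHS_COPY_ELEM_WORDS == 1 the elem is a scalar, the
//     component suffix C is empty, and the body runs once per loop
//     iteration:
//
//       if (elem != SKC_TAGGED_BLOCK_ID_INVALID) {
//         skc_block_id_t const b = bp_ids[bp_idx];
//         elem = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem,b);
//       }
//
//   - when the elem is a vector (e.g. the skc_uint8 of the SIMD block
//     at the top of this file), SKC_PATHS_COPY_ELEM_EXPAND() repeats
//     the body once per component with C = .s0 .. .s7
//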

//
//
//

void
skc_host_map_update(__global skc_uint * const host_map,
                    skc_uint            const block,
                    skc_paths_copy_elem const elem)
{
  //
  // write first elem to map -- FIXME -- this is a little nasty
  // because it relies on the host handle always being the first
  // word in the path header.
  //
  // OTOH, this is not unreasonable.  The alternative is to have a
  // separate kernel initializing the map.
  //
#if SKC_PATHS_COPY_SUBGROUP_SIZE > 1
  if (get_sub_group_local_id() == SKC_PATH_HEAD_OFFSET_HANDLE)
#endif
    {
#if SKC_PATHS_COPY_ELEM_WORDS == 1
      host_map[elem] = block;
#if 0
      printf("[%u] = %u\n",elem,block);
#endif
#else
      host_map[elem.SKC_CONCAT(s,SKC_PATH_HEAD_OFFSET_HANDLE)] = block;
#endif
    }
}

//
//
//

void
skc_copy_head(__global skc_uint                  * const host_map,
              skc_uint                             const block,
              __global skc_paths_copy_elem       * const bp_elems, // to
              skc_uint                             const bp_elems_idx,
              __global skc_block_id_t      const * const bp_ids,
              skc_uint                             const bp_reads,
              skc_uint                             const bp_idx_mask,
              __global skc_paths_copy_elem const * const pb_elems, // from
              skc_uint                             const pb_elems_idx,
              skc_uint                             const pb_rolling)
{
  //
  // if there are more path header words than there are
  // threads-per-block then we can just copy the initial header words
  //
#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER > 0 )
  for (skc_uint ii=0; ii<SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem const elem = (pb_elems+pb_elems_idx)[ii];

      (bp_elems+bp_elems_idx)[ii] = elem;

      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
    }
#endif

  //
  // this is similar to copy node but the first H words of the path
  // header are not modified and simply copied
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_BEFORE_HEADER; ii<SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];

#if ( SKC_PATHS_COPY_ELEMS_BEFORE_HEADER == 0 )
      if (ii == 0) {
        skc_host_map_update(host_map,block,elem);
      }
#endif
      // calculate ahead of time -- if elem was invalid then bp_idx is definitely invalid
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // FIXME -- MIX MIX MIX MIX / SELECT

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      if (SKC_IS_NOT_PATH_HEAD(ii,I) && (elem C != SKC_TAGGED_BLOCK_ID_INVALID)) { \
        skc_block_id_t const b = bp_ids[bp_idx C];                      \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);           \
      }

      // printf("%2u: ( %8X, %8X, %8X )\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem back
      (bp_elems+bp_elems_idx)[ii] = elem;
    }

  //
  // the remaining words are treated like a node
  //
  for (skc_uint ii=SKC_PATHS_COPY_ELEMS_INCLUDING_HEADER; ii<SKC_PATHS_COPY_ELEMS_PER_BLOCK; ii+=SKC_PATHS_COPY_SUBGROUP_SIZE)
    {
      // load block_id_tag words
      skc_paths_copy_elem elem = (pb_elems+pb_elems_idx)[ii];

      // calculate ahead of time
      skc_pb_idx_v const bp_idx = (bp_reads + SKC_CMD_PATHS_COPY_GET_ROLLING(elem - pb_rolling)) & bp_idx_mask;

      //
      // FIXME -- SIMD can be fully parallelized since a bp_ids[] load
      // will _always_ be safe as long as we don't use the loaded
      // value!  So... fix UPDATE_ROLLING to be SIMD-friendly instead
      // of iterating over the vector components.
      //

      // FIXME ^^^^^ THE IDX PROBABLY DOESN'T NEED TO BE SHIFTED TWICE AND WE CAN SAVE A FEW INSTRUCTIONS

      // only convert if original elem is not invalid
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
      if (elem C != SKC_TAGGED_BLOCK_ID_INVALID) {              \
        skc_block_id_t const b = bp_ids[bp_idx C];              \
        elem C = SKC_CMD_PATHS_COPY_UPDATE_ROLLING(elem C,b);   \
      }

      // printf("%2u: [ %8X, %8X, %8X ]\n",ii,bp_idx,b,elem C);

      SKC_PATHS_COPY_ELEM_EXPAND();

      // store the elem
      (bp_elems+bp_elems_idx)[ii] = elem;
    }
}
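
//
// Worked example of the three loops above.  The values are
// illustrative assumptions, not the actual configuration:
//
//   SKC_PATHS_COPY_ELEM_WORDS    = 1
//   SKC_PATHS_COPY_SUBGROUP_SIZE = 4   ->  SUBGROUP_WORDS = 4
//   SKC_PATH_HEAD_WORDS          = 7
//
//   ELEMS_BEFORE_HEADER    = 4 * ((7 / 1) / 4)     = 4
//   ELEMS_INCLUDING_HEADER = 4 * ((7 + 4 - 1) / 4) = 8
//
// So words 0..3 are copied verbatim by the first loop.  Words 4..7 go
// through the second loop, where SKC_IS_NOT_PATH_HEAD() masks off the
// remap for header words 4..6 (they are copied unchanged) while word 7,
// the first non-header word, is remapped if it holds a valid tagged
// block id.  All remaining words in the block are handled by the
// node-style third loop.
//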

//
// FIXME -- pack some of these constant integer args in a vec or struct
//

__kernel
SKC_PATHS_COPY_KERNEL_ATTRIBS
void
skc_kernel_paths_copy
(__global skc_uint                        * const host_map,

 __global skc_block_id_t            const * const bp_ids,
 __global skc_paths_copy_elem             * const bp_elems,
 skc_uint                                   const bp_idx_mask,  // pow2 modulo mask for block pool ring

 __global skc_uint                  const * const bp_alloc,     // block pool ring base
 skc_uint                                   const bp_alloc_idx, // which subbuf

 __global union skc_tagged_block_id const * const pb_cmds,
 __global skc_paths_copy_elem       const * const pb_elems,

 skc_uint                                   const pb_size,      // # of commands/blocks in buffer
 skc_uint                                   const pb_rolling,   // shifted rolling counter base

 skc_uint                                   const pb_prev_from,
 skc_uint                                   const pb_prev_span,
 skc_uint                                   const pb_curr_from)
{
  //
  // THERE ARE 3 TYPES OF PATH COPYING COMMANDS:
  //
  // - HEAD
  // - NODE
  // - SEGS
  //
  // THESE ARE SUBGROUP ORIENTED KERNELS
  //
  // A SUBGROUP CAN OPERATE ON [1,N] BLOCKS
  //

  //
  // It's likely that peak bandwidth is achievable with a single
  // workgroup.
  //
  // So let's keep the grids modestly sized and, for simplicity and
  // portability, let's assume that a single workgroup can perform all
  // steps in the copy.
  //
  // Launch as large a workgroup as possible.
  //
  // 1. ATOMICALLY ALLOCATE BLOCKS FROM THE POOL
  // 2. CONVERT COMMANDS INTO BLOCK OFFSETS
  // 3. FOR EACH COMMAND:
  //      - HEAD: SAVE HEAD ID TO MAP. CONVERT AND COPY H INDICES.
  //      - NODE: CONVERT AND COPY B INDICES
  //      - SEGS: BULK COPY
  //
  // B : number of words in block -- always pow2
  // W : intelligently/arbitrarily chosen factor of B -- always pow2
  //

  //
  // There are several approaches to processing the commands:
  //
  // 1. B threads are responsible for one block.  All threads broadcast
  //    load a single command word.  Workgroup size must be a factor of
  //    B.
  //
  // 2. W threads process an entire block.  W will typically be the
  //    device's subgroup/warp/wave width.  W threads broadcast load a
  //    single command word.
  //
  // 3. W threads process W blocks.  W threads load W command words and
  //    process W blocks.
  //
  // Clearly (1) has low I/O intensity but will achieve high
  // parallelism by activating as many threads as possible.  The
  // downside of this kind of approach is that the kernel will occupy
  // even a large GPU with low intensity work and reduce opportunities
  // for concurrent kernel execution (of other kernels).
  //
  // See Vasily Volkov's CUDA presentation describing these tradeoffs.
  //
  // Note that there are many other approaches.  For example, similar
  // to (1) but each thread loads a pow2 vector of block data.
  //
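
  //
  // Worked example of the indexing below.  Illustrative assumptions
  // only: SKC_PATHS_COPY_SUBGROUP_SIZE = 8 and the host launches one
  // subgroup of 8 work-items per path builder command:
  //
  //   gid = 19  ->  sg_idx = 19 / 8 = 2  and  tid = 19 & 7 = 3
  //
  //   with pb_prev_span = 4, sg_idx 2 falls in the previous span, so
  //   pb_idx = 2 + pb_prev_from (wrapped back by pb_size if it runs
  //   past the end of the ring)
  //
  //   the command at pb_cmds[pb_idx] is then broadcast across the
  //   subgroup, and in the copy loops lane 3 reads from
  //   pb_elems[pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK + 3 + ii]
  //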

  // load the copied atomic read "base" from gmem
  skc_uint const bp_reads = bp_alloc[bp_alloc_idx];
  // will always be less than 2^32
  skc_uint const gid      = get_global_id(0);
  // every subgroup/simd that will work on the block loads the same command
  skc_uint const sg_idx   = gid / SKC_PATHS_COPY_SUBGROUP_SIZE;
  // path builder data can be spread across two spans
  skc_uint       pb_idx   = sg_idx + ((sg_idx < pb_prev_span) ? pb_prev_from : pb_curr_from);

  // no need to make this branchless
  if (pb_idx >= pb_size)
    pb_idx -= pb_size;

  // broadcast load the command
  union skc_tagged_block_id const pb_cmd = pb_cmds[pb_idx];

  // what do we want to do with this block?
  skc_cmd_paths_copy_tag const tag = SKC_CMD_PATHS_COPY_GET_TAG(pb_cmd.u32);

  // compute offset from rolling base to get index into block pool ring allocation
  skc_uint const bp_off = SKC_CMD_PATHS_COPY_GET_ROLLING(pb_cmd.u32 - pb_rolling);

  // convert the pb_cmd's offset counter to a block id
  skc_block_id_t const block = skc_bp_off_to_id(bp_ids,bp_idx_mask,bp_reads,bp_off);

#if 0
  if (get_sub_group_local_id() == 0) {
    printf("bp_off/reads = %u / %u\n",bp_off,bp_reads);
    printf("< %8u >\n",block);
  }
#endif

  // FIXME -- could make this 0 for SIMD, gid&mask or get_sub_group_local_id()
  skc_uint const tid = gid & SKC_PATHS_COPY_SUBGROUP_SIZE_MASK;

  // calculate bp_elems (to) / pb_elems (from)
  skc_uint const bp_elems_idx = block  * SKC_PATHS_COPY_ELEMS_PER_SUBBLOCK + tid;
  skc_uint const pb_elems_idx = pb_idx * SKC_PATHS_COPY_ELEMS_PER_BLOCK    + tid;

  if (tag == SKC_CMD_PATHS_COPY_TAG_SEGS)
    {
#if 0
      if (tid == 0)
        printf("%3u, SEGS\n",bp_off);
#endif
      skc_copy_segs(bp_elems,
                    bp_elems_idx,
                    pb_elems,
                    pb_elems_idx);
    }
  else if (tag == SKC_CMD_PATHS_COPY_TAG_NODE)
    {
#if 0
      if (tid == 0)
        printf("%3u, NODE\n",bp_off);
#endif
      skc_copy_node(bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
  else // ( tag == SKC_CMD_PATHS_COPY_TAG_HEAD )
    {
#if 0
      if (tid == 0)
        printf("%3u, HEAD\n",bp_off);
#endif
      skc_copy_head(host_map,
                    block,
                    bp_elems, // to
                    bp_elems_idx,
                    bp_ids,
                    bp_reads,
                    bp_idx_mask,
                    pb_elems, // from
                    pb_elems_idx,
                    pb_rolling);
    }
}

//
//
//

__kernel
SKC_PATHS_ALLOC_KERNEL_ATTRIBS
void
skc_kernel_paths_alloc(__global skc_uint volatile * const bp_atomics,
                       __global skc_uint          * const bp_alloc,
                       skc_uint                     const bp_alloc_idx,
                       skc_uint                     const pb_cmd_count)
{
  //
  // allocate blocks in block pool
  //
  skc_uint const reads = atomic_add(bp_atomics+SKC_BP_ATOMIC_OFFSET_READS,pb_cmd_count);

  // store in slot
  bp_alloc[bp_alloc_idx] = reads;

#if 0
  printf("pc: %8u + %u\n",reads,pb_cmd_count);
#endif
}

//
//
//
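
//
// How the two kernels fit together -- a worked example with assumed
// numbers (the sequencing itself is presumably driven by the host-side
// path builder):
//
//   1. skc_kernel_paths_alloc runs with pb_cmd_count = 5 while the
//      pool's READS counter is 96: it stores 96 into
//      bp_alloc[bp_alloc_idx], bumps READS to 101, and thereby reserves
//      the ids at ring slots (96..100) & bp_idx_mask
//
//   2. skc_kernel_paths_copy later reads bp_reads = 96 back from the
//      same bp_alloc slot, so a command whose rolling offset is 3
//      resolves to the block id at bp_ids[(96 + 3) & bp_idx_mask]
//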