1 /* 2 * Copyright 2017 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can 5 * be found in the LICENSE file. 6 * 7 */ 8 9 // 10 // FIXME -- a pre-allocation step could load the path header quads and 11 // total up the number of blocks in the workgroup or subgroup 12 // minimizing the number of later atomics adds. 13 // 14 15 #include "block.h" 16 #include "path.h" 17 #include "common.h" 18 #include "atomic_cl.h" 19 #include "block_pool_cl.h" 20 #include "kernel_cl_12.h" 21 22 // 23 // 24 // 25 26 #define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) 27 28 #define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS) 29 30 #define SKC_PATHS_RECLAIM_X (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS) 31 32 // 33 // 34 // 35 36 #if ( SKC_PATHS_RECLAIM_X == 1 ) 37 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_1() 38 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 0 39 40 #elif ( SKC_PATHS_RECLAIM_X == 2 ) 41 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_2() 42 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 1 43 44 #elif ( SKC_PATHS_RECLAIM_X == 4 ) 45 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_4() 46 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 3 47 48 #elif ( SKC_PATHS_RECLAIM_X == 8 ) 49 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_8() 50 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 7 51 52 #elif ( SKC_PATHS_RECLAIM_X == 16) 53 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND() SKC_EXPAND_16() 54 #define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST 15 55 56 #else 57 #error "MISSING SKC_PATHS_RECLAIM_X" 58 #endif 59 60 // 61 // FIXME -- slate these for replacement 62 // 63 64 #define SKC_BROADCAST(E,S,I) \ 65 sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) 66 67 #define SKC_BROADCAST_LAST_HELPER(E,I) \ 68 sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) 69 70 #define SKC_BROADCAST_LAST(E,I) \ 71 SKC_BROADCAST_LAST_HELPER(E,I) 72 73 // 74 // COMPILE-TIME PREDICATES 75 // 76 77 #define SKC_PATHS_RECLAIM_ELEM_GTE(X,I) \ 78 SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) 79 80 #define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I) \ 81 (skc_bool)SKC_GTE_MACRO(X, I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) && \ 82 (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) 83 84 #define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I) \ 85 SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I) 86 87 #define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I) \ 88 SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I) 89 90 // 91 // RUN-TIME PREDICATES 92 // 93 94 #define SKC_PATHS_RECLAIM_IS_HEADER(I) \ 95 (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS) 96 97 // 98 // FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL 99 // POSSIBLE PRACTICAL POWER-OF-TWO SUBGROUP AND SUBBLOCKS-PER-BLOCK 100 // COMBOS (NOT NECESSARILY POW2) 101 // 102 // FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR 103 // UINT TYPE INSTEAD OF A ULONG. 104 // 105 106 #define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2 107 #define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE skc_uint 108 109 // 110 // 111 // 112 113 #define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS) 114 115 #define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I) \ 116 (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK) \ 117 ? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) 118 119 #define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C) \ 120 S = sub_group_scan_exclusive_add(C) 121 122 #define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I) \ 123 (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK) 124 125 // 126 // 127 // 128 129 struct skc_reclaim 130 { 131 skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE]; 132 }; 133 134 __kernel 135 SKC_PATHS_RECLAIM_KERNEL_ATTRIBS 136 void 137 skc_kernel_paths_reclaim(__global skc_block_id_t * const bp_ids, // block pool ids ring 138 __global skc_uint * const bp_elems, // block pool blocks 139 __global skc_uint volatile * const bp_atomics, // read/write atomics 140 skc_uint const bp_mask, // pow2 modulo mask for block pool ring 141 __global skc_block_id_t const * const map, // path host-to-device map 142 struct skc_reclaim const reclaim) // array of host path ids 143 { 144 #if (__OPENCL_VERSION__ < 200) 145 skc_uint const reclaim_stride = get_num_sub_groups(); 146 #else 147 skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups 148 #endif 149 skc_uint reclaim_idx = get_group_id(0) * reclaim_stride + get_sub_group_id(); 150 151 #if 0 152 // 153 // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED SIZE GRIDS BUT 154 // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL 155 // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE 156 // RECLAMATION JOB ON THE REST OF THE PIPELINE. 157 // 158 for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx+=reclaim_stride) 159 #endif 160 { 161 // get host path id 162 skc_path_h const path = reclaim.aN[reclaim_idx]; 163 164 // get the path header block from the map 165 skc_block_id_t id = map[path]; 166 167 // 168 // blindly load all of the head elements into registers 169 // 170 skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); 171 172 #undef SKC_EXPAND_X 173 #define SKC_EXPAND_X(I,S,C,P,R) \ 174 skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; 175 176 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 177 178 // 179 // pick out count.nodes and count.prims from the header 180 // 181 skc_uint count_blocks, count_nodes; 182 183 #undef SKC_EXPAND_X 184 #define SKC_EXPAND_X(I,S,C,P,R) \ 185 if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \ 186 count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \ 187 } \ 188 if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \ 189 count_nodes = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \ 190 } 191 192 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 193 194 #if 0 195 if (get_sub_group_local_id() == 0) { 196 printf("reclaim paths: %9u / %5u / %5u\n",path,count_blocks,count_nodes); 197 } 198 #endif 199 200 // 201 // acquire a span in the block pool ids ring for reclaimed ids 202 // 203 // FIXME count_blocks and atomic add can be done in same lane 204 // 205 skc_uint bp_ids_base = 0; 206 207 if (get_sub_group_local_id() == 0) { 208 bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks); 209 210 #if 0 211 printf("paths: bp_ids_base = %u\n",bp_ids_base); 212 #endif 213 } 214 215 bp_ids_base = sub_group_broadcast(bp_ids_base,0); 216 217 // 218 // shift away the tagged block id's tag 219 // 220 #undef SKC_EXPAND_X 221 #define SKC_EXPAND_X(I,S,C,P,R) \ 222 if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ 223 h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; \ 224 } 225 226 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 227 228 // 229 // swap current id with next 230 // 231 if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) 232 { 233 skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); 234 235 SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; 236 237 id = next; 238 } 239 240 // 241 // - we'll skip subgroups that are entirely header 242 // 243 // - but we need to mark any header elements that partially fill 244 // a subgroup as invalid tagged block ids 245 // 246 #undef SKC_EXPAND_X 247 #define SKC_EXPAND_X(I,S,C,P,R) \ 248 if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ 249 if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) { \ 250 if (SKC_PATHS_RECLAIM_IS_HEADER(I)) { \ 251 h##I = SKC_TAGGED_BLOCK_ID_INVALID; \ 252 } \ 253 } \ 254 } 255 256 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 257 258 { 259 // 260 // count reclaimable blocks in each lane 261 // 262 SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); 263 264 #undef SKC_EXPAND_X 265 #define SKC_EXPAND_X(I,S,C,P,R) \ 266 if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ 267 packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \ 268 } 269 270 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 271 272 // 273 // scan to find index of each block 274 // 275 SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); 276 277 SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); 278 279 // 280 // store blocks back to ring 281 // 282 #undef SKC_EXPAND_X 283 #define SKC_EXPAND_X(I,S,C,P,R) \ 284 if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) { \ 285 skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ 286 skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ 287 skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ 288 if (count > 0) { \ 289 bp_ids[bp_ids_idx] = h##I; \ 290 } \ 291 skc_uint const total = index + count; \ 292 bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ 293 } 294 295 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 296 297 // printf("P %7u ! %u\n",bp_ids_idx,h##I); 298 } 299 300 // 301 // we're done if it was just the header 302 // 303 if (count_nodes == 0) 304 return; 305 306 // 307 // otherwise, walk the nodes 308 // 309 do { 310 // id of next block is in last lane 311 id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); 312 313 // get index of each element 314 skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id(); 315 316 // 317 // blindly load all of the node elements into registers 318 // 319 #undef SKC_EXPAND_X 320 #define SKC_EXPAND_X(I,S,C,P,R) \ 321 skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE]; 322 323 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 324 325 // 326 // shift away the tagged block id's tag 327 // 328 #undef SKC_EXPAND_X 329 #define SKC_EXPAND_X(I,S,C,P,R) \ 330 n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG; 331 332 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 333 334 // 335 // swap current id with next 336 // 337 if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1) 338 { 339 skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST); 340 341 SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id; 342 343 id = next; 344 } 345 346 // 347 // count reclaimable blocks in each lane 348 // 349 SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 ); 350 351 #undef SKC_EXPAND_X 352 #define SKC_EXPAND_X(I,S,C,P,R) \ 353 packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I); 354 355 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 356 357 // 358 // scan to find index of each block 359 // 360 SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 ); 361 362 SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count); 363 364 // 365 // store blocks back to ring 366 // 367 #undef SKC_EXPAND_X 368 #define SKC_EXPAND_X(I,S,C,P,R) { \ 369 skc_uint const index = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \ 370 skc_uint const count = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \ 371 skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask; \ 372 if (count > 0) { \ 373 bp_ids[bp_ids_idx] = n##I; \ 374 } \ 375 skc_uint const total = index + count; \ 376 bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \ 377 } 378 379 SKC_PATHS_RECLAIM_BLOCK_EXPAND(); 380 381 // printf("P %7u ! %u\n",bp_ids_idx,n##I); 382 383 // any more nodes? 384 } while (--count_nodes > 0); 385 } 386 } 387 388 // 389 // 390 // 391