/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
// FIXME -- a pre-allocation step could load the path header quads and
// total up the number of blocks in the workgroup or subgroup,
// minimizing the number of later atomic adds.
//

#include "block.h"
#include "path.h"
#include "common.h"
#include "atomic_cl.h"
#include "block_pool_cl.h"
#include "kernel_cl_12.h"

//
//
//

#define SKC_PATHS_RECLAIM_SUBGROUP_SIZE_MASK (SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)

#define SKC_PATHS_RECLAIM_SUBGROUP_ELEMS     (SKC_PATHS_RECLAIM_SUBGROUP_SIZE * SKC_PATHS_RECLAIM_LOCAL_ELEMS)

#define SKC_PATHS_RECLAIM_X                  (SKC_DEVICE_BLOCK_WORDS / SKC_PATHS_RECLAIM_SUBGROUP_ELEMS)
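
//
// SKC_PATHS_RECLAIM_X is the number of subgroup-wide strided loads
// needed to cover one block.  For example -- assuming a hypothetical
// config with a 256-word block, a 16-lane subgroup and 1 local elem
// per lane -- X = 256 / 16 = 16, so each lane holds 16 block words in
// registers h0..h15.
//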

//
//
//

#if   ( SKC_PATHS_RECLAIM_X == 1 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_1()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  0

#elif ( SKC_PATHS_RECLAIM_X == 2 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_2()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  1

#elif ( SKC_PATHS_RECLAIM_X == 4 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_4()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  3

#elif ( SKC_PATHS_RECLAIM_X == 8 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_8()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  7

#elif ( SKC_PATHS_RECLAIM_X == 16 )
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND()       SKC_EXPAND_16()
#define SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST  15

#else
#error "MISSING SKC_PATHS_RECLAIM_X"
#endif

//
// FIXME -- slate these for replacement
//

#define SKC_BROADCAST(E,S,I)                                            \
  sub_group_broadcast(E,S - I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)

#define SKC_BROADCAST_LAST_HELPER(E,I)                          \
  sub_group_broadcast(E,SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)

#define SKC_BROADCAST_LAST(E,I)                 \
  SKC_BROADCAST_LAST_HELPER(E,I)

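//
// Block word S lives in register slice I = S / SUBGROUP_SIZE of lane
// S % SUBGROUP_SIZE, so SKC_BROADCAST(E,S,I) fetches word S by
// broadcasting slice I's register from lane S - I * SUBGROUP_SIZE.
//
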
//
// COMPILE-TIME PREDICATES
//

#define SKC_PATHS_RECLAIM_ELEM_GTE(X,I)                         \
  SKC_GTE_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)

#define SKC_PATHS_RECLAIM_ELEM_IN_RANGE(X,I)                            \
  (skc_bool)SKC_GTE_MACRO(X, I   * SKC_PATHS_RECLAIM_SUBGROUP_SIZE) &&  \
  (skc_bool)SKC_LT_MACRO(X,(I+1) * SKC_PATHS_RECLAIM_SUBGROUP_SIZE)

#define SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)            \
  SKC_PATHS_RECLAIM_ELEM_GTE(SKC_PATH_HEAD_WORDS,I)

#define SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)                   \
  SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_WORDS,I)

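//
// Example -- if SKC_PATH_HEAD_WORDS were 6 with a 4-lane subgroup,
// slice 0 (words 0-3) would be entirely header and slice 1 (words
// 4-7) only partially header.  Both predicates resolve at compile
// time, so entirely-header slices cost nothing at run time.
//
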
//
// RUN-TIME PREDICATES
//

#define SKC_PATHS_RECLAIM_IS_HEADER(I)                                  \
  (get_sub_group_local_id() + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE < SKC_PATH_HEAD_WORDS)

//
// FIXME -- THIS BITFIELD SCAN APPROACH CAN BE PARAMETERIZED FOR ALL
// PRACTICAL POWER-OF-TWO SUBGROUP SIZES AND SUBBLOCKS-PER-BLOCK
// COMBOS (THE COMBOS THEMSELVES NEED NOT BE POW2)
//
// FOR WIDER SUBGROUPS WITH BIG BLOCKS, WE WILL WANT TO USE A VECTOR
// UINT TYPE INSTEAD OF A ULONG.
//

#define SKC_PATHS_RECLAIM_PACKED_COUNT_BITS     SKC_PATHS_RECLAIM_SUBGROUP_SIZE_LOG2
#define SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE  skc_uint

//
//
//

#define SKC_PATHS_RECLAIM_PACKED_COUNT_MASK  SKC_BITS_TO_MASK(SKC_PATHS_RECLAIM_PACKED_COUNT_BITS)

#define SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(E,I)            \
  (((E) & SKC_DEVICE_SUBBLOCKS_PER_BLOCK_MASK)                  \
   ? 0 : (1u << SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I))

#define SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(S,C)  \
  S = sub_group_scan_exclusive_add(C)

#define SKC_PATHS_RECLAIM_PACKED_COUNT_GET(C,I)                         \
  (((C) >> (SKC_PATHS_RECLAIM_PACKED_COUNT_BITS * I)) & SKC_PATHS_RECLAIM_PACKED_COUNT_MASK)
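
//
// The packed-count trick: each lane packs a 0/1 "is this subblock id
// block-aligned?" flag for slice I into a log2(SUBGROUP_SIZE)-bit
// field at bit offset BITS * I, so a single exclusive scan over the
// packed uint yields a per-slice index for every lane at once.  For
// example -- with a hypothetical 8-lane subgroup -- each field is 3
// bits wide and one skc_uint can hold up to 10 slices.
//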

//
//
//

struct skc_reclaim
{
  skc_path_h aN[SKC_RECLAIM_ARRAY_SIZE];
};
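
//
// The reclaim array of host path handles is passed by value as a
// kernel argument; each subgroup processes one handle (see the
// disabled loop below for a possible grid-limited variant).
//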

__kernel
SKC_PATHS_RECLAIM_KERNEL_ATTRIBS
void
skc_kernel_paths_reclaim(__global skc_block_id_t          * const bp_ids,      // block pool ids ring
                         __global skc_uint                * const bp_elems,    // block pool blocks
                         __global skc_uint       volatile * const bp_atomics,  // read/write atomics
                         skc_uint                           const bp_mask,     // pow2 modulo mask for block pool ring
                         __global skc_block_id_t const    * const map,         // path host-to-device map
                         struct   skc_reclaim               const reclaim)     // array of host path ids
{
#if (__OPENCL_VERSION__ < 200)
  skc_uint const reclaim_stride = get_num_sub_groups();
#else
  skc_uint const reclaim_stride = get_enqueued_num_sub_groups(); // 2.0 supports non-uniform workgroups
#endif
  skc_uint       reclaim_idx    = get_group_id(0) * reclaim_stride + get_sub_group_id();

#if 0
  //
  // NOTE -- FOR NOW, THIS KERNEL ALWAYS LAUNCHES FIXED-SIZE GRIDS BUT
  // WE MIGHT WANT TO HAVE THE GRID LIMIT ITSELF TO A FRACTIONAL
  // MULTIPROCESSOR IN ORDER TO MINIMIZE THE IMPACT OF A LARGE
  // RECLAMATION JOB ON THE REST OF THE PIPELINE.
  //
  for (; reclaim_idx < SKC_RECLAIM_ARRAY_SIZE; reclaim_idx += reclaim_stride)
#endif
    {
      // get host path id
      skc_path_h const path = reclaim.aN[reclaim_idx];

      // get the path header block from the map
      skc_block_id_t   id   = map[path];

      //
      // blindly load all of the head elements into registers
      //
      skc_uint const head_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      skc_uint h##I = bp_elems[head_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();
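
      //
      // after this expansion, word w of the head block sits in
      // register h(w / SUBGROUP_SIZE) of lane (w % SUBGROUP_SIZE)
      //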

      //
      // pick out count.blocks and count.nodes from the header
      //
      skc_uint count_blocks, count_nodes;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_BLOCKS,I)) { \
        count_blocks = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_BLOCKS,I); \
      }                                                                 \
      if (SKC_PATHS_RECLAIM_ELEM_IN_RANGE(SKC_PATH_HEAD_OFFSET_NODES,I)) { \
        count_nodes  = SKC_BROADCAST(h##I,SKC_PATH_HEAD_OFFSET_NODES,I); \
      }

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

#if 0
      if (get_sub_group_local_id() == 0) {
        printf("reclaim paths:   %9u / %5u / %5u\n",path,count_blocks,count_nodes);
      }
#endif

      //
      // acquire a span in the block pool ids ring for reclaimed ids
      //
      // FIXME -- count_blocks and the atomic add can be done in the same lane
      //
      skc_uint bp_ids_base = 0;

      if (get_sub_group_local_id() == 0) {
        bp_ids_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_SUBGROUP(bp_atomics+SKC_BP_ATOMIC_OFFSET_WRITES,count_blocks);

#if 0
        printf("paths: bp_ids_base = %u\n",bp_ids_base);
#endif
      }

      bp_ids_base = sub_group_broadcast(bp_ids_base,0);
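
      //
      // single-lane atomic + broadcast: lane 0 reserves count_blocks
      // slots in the ids ring with one relaxed atomic add, then the
      // base offset is made uniform across the subgroup
      //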

      //
      // shift away the tagged block id's tag
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                         \
      if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {      \
        h##I = h##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;    \
      }

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();

      //
      // swap current id with next
      //
      if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
        {
          skc_block_id_t const next = SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);

          SKC_CONCAT(h,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;

          id = next;
        }
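
      //
      // the last word of the block holds the tagged id of the next
      // node, so this swap both queues the current block's own id for
      // reclamation below and leaves 'id' pointing at the next node
      //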

      //
      // - we'll skip slices that are entirely header
      //
      // - but we need to mark any header words that only partially
      //   fill a slice as invalid tagged block ids
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                         \
      if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {      \
        if (SKC_PATHS_RECLAIM_PARTIALLY_HEADER(I)) {    \
          if (SKC_PATHS_RECLAIM_IS_HEADER(I)) {         \
            h##I = SKC_TAGGED_BLOCK_ID_INVALID;         \
          }                                             \
        }                                               \
      }

      SKC_PATHS_RECLAIM_BLOCK_EXPAND();
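
      //
      // this marking presumably relies on the invalid id having
      // nonzero low subblock bits so that PACKED_COUNT_IS_BLOCK()
      // scores those lanes as zero and nothing is stored for them
      //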

      {
        //
        // count reclaimable blocks in each lane
        //
        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
        if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                    \
          packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(h##I,I); \
        }

        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

        //
        // scan to find index of each block
        //
        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );

        SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);
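
        //
        // the packed fields scan independently -- at lane j each
        // field sums at most j <= SUBGROUP_SIZE-1 one-bit flags,
        // which fits the field's log2(SUBGROUP_SIZE)-bit mask
        // exactly, so no carry spills into the next slice's field
        //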

        //
        // store blocks back to ring
        //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
        if (!SKC_PATHS_RECLAIM_ENTIRELY_HEADER(I)) {                    \
          skc_uint const index      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
          skc_uint const count      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \
          if (count > 0) {                                              \
            bp_ids[bp_ids_idx] = h##I;                                  \
          }                                                             \
          skc_uint const total = index + count;                         \
          bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
        }

        SKC_PATHS_RECLAIM_BLOCK_EXPAND();
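
        //
        // each slice advances bp_ids_base by its block total
        // (broadcast from the last lane) and the ring index wraps via
        // the pow2 bp_mask, consuming the span acquired above
        //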

        // printf("P %7u ! %u\n",bp_ids_idx,h##I);
      }

      //
      // we're done if it was just the header
      //
      if (count_nodes == 0)
        return;

      //
      // otherwise, walk the nodes
      //
      do {
        // id of next block is in last lane
        id = sub_group_broadcast(id,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1);
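
        //
        // node walk -- each node block also ends with the tagged id
        // of its successor, and count_nodes from the header bounds
        // this loop
        //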

        // get index of each element
        skc_uint const node_idx = id * SKC_DEVICE_SUBBLOCK_WORDS + get_sub_group_local_id();

        //
        // blindly load all of the node elements into registers
        //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
        skc_uint n##I = bp_elems[node_idx + I * SKC_PATHS_RECLAIM_SUBGROUP_SIZE];

        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

        //
        // shift away the tagged block id's tag
        //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                         \
        n##I = n##I >> SKC_TAGGED_BLOCK_ID_BITS_TAG;

        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

        //
        // swap current id with next
        //
        if (get_sub_group_local_id() == SKC_PATHS_RECLAIM_SUBGROUP_SIZE - 1)
          {
            skc_block_id_t const next = SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST);

            SKC_CONCAT(n,SKC_PATHS_RECLAIM_BLOCK_EXPAND_I_LAST) = id;

            id = next;
          }

        //
        // count reclaimable blocks in each lane
        //
        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_count = ( 0 );

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
        packed_count |= SKC_PATHS_RECLAIM_PACKED_COUNT_IS_BLOCK(n##I,I);

        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

        //
        // scan to find index of each block
        //
        SKC_PATHS_RECLAIM_PACKED_COUNT_DECLARE packed_index = ( 0 );

        SKC_PATHS_RECLAIM_PACKED_COUNT_SCAN_EXCLUSIVE_ADD(packed_index,packed_count);

        //
        // store blocks back to ring
        //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
          skc_uint const index      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_index,I); \
          skc_uint const count      = SKC_PATHS_RECLAIM_PACKED_COUNT_GET(packed_count,I); \
          skc_uint const bp_ids_idx = (bp_ids_base + index) & bp_mask;  \
          if (count > 0) {                                              \
            bp_ids[bp_ids_idx] = n##I;                                  \
          }                                                             \
          skc_uint const total = index + count;                         \
          bp_ids_base += sub_group_broadcast(total,SKC_PATHS_RECLAIM_SUBGROUP_SIZE-1); \
        }

        SKC_PATHS_RECLAIM_BLOCK_EXPAND();

        // printf("P %7u ! %u\n",bp_ids_idx,n##I);

        // any more nodes?
      } while (--count_nodes > 0);
    }
}

//
//
//