/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can
 * be found in the LICENSE file.
 *
 */

//
//
//

#include "tile.h"
#include "common.h"
#include "raster.h"
#include "atomic_cl.h"
#include "kernel_cl_12.h"

//
//
//

#define SKC_PLACE_SUBGROUP_MASK      (SKC_PLACE_SUBGROUP_SIZE - 1)
#define SKC_PLACE_SUBGROUP_LAST      (SKC_PLACE_SUBGROUP_SIZE - 1)

//
//
//

#define SKC_PLACE_SMEM_COUNT_TTSK    SKC_MAX_MACRO(SKC_RASTER_NODE_MAX_TTSK,SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SMEM_COUNT_TTPK    SKC_RASTER_NODE_MAX_TTPK

//
//
//

#define SKC_PLACE_X                  (SKC_DEVICE_BLOCK_DWORDS / SKC_PLACE_SUBGROUP_SIZE)

//
//
//

#if   ( SKC_PLACE_X == 1 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_1()
#define SKC_PLACE_EXPAND_I_LAST      0

#elif ( SKC_PLACE_X == 2 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_2()
#define SKC_PLACE_EXPAND_I_LAST      1

#elif ( SKC_PLACE_X == 4 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_4()
#define SKC_PLACE_EXPAND_I_LAST      3

#elif ( SKC_PLACE_X == 8 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_8()
#define SKC_PLACE_EXPAND_I_LAST      7

#elif ( SKC_PLACE_X == 16 )
#define SKC_PLACE_EXPAND()           SKC_EXPAND_16()
#define SKC_PLACE_EXPAND_I_LAST      15
#endif

//
// PREFIX STORES THE 64-BIT KEYS WITH TWO 32-BIT SUBGROUP-WIDE
// COALESCED WRITES.  LO FIRST, FOLLOWED BY HI.
//
// THIS SLIGHTLY COMPLICATES LOADING BY THE PLACE KERNEL IF THE
// KERNELS USE DIFFERENT SUBGROUP SIZES.
//
// THE BENEFIT IS THAT THE RASTER RECLAIM KERNEL ONLY HAS TO LOAD THE
// LO WORD OF THE KEY SINCE IT CONTAINS THE BLOCK ID.
//
// NOTE: AT THIS POINT, ONLY INTEL'S HD GRAPHICS ARCHITECTURE UNDER
// OPENCL SUPPORTS SELECTING A SUBGROUP SIZE (8/16/32). VULKAN MAY
// ONLY SUPPORT A SUBGROUP SIZE OF 16.
//

#if    ( SKC_PREFIX_SUBGROUP_SIZE == SKC_PLACE_SUBGROUP_SIZE )

#define SKC_PLACE_STRIDE_H(L)              (L)
#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE)

#elif  ( SKC_PREFIX_SUBGROUP_SIZE >  SKC_PLACE_SUBGROUP_SIZE ) // reduces to the case above when the ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PREFIX_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_RATIO - 1)
#define SKC_PLACE_SUBGROUP_RATIO_SCALE(I)  ((I / SKC_PLACE_SUBGROUP_RATIO) * 2 * SKC_PLACE_SUBGROUP_RATIO + (I & SKC_PLACE_SUBGROUP_RATIO_MASK))

#define SKC_PLACE_STRIDE_H(L)              (L)
#define SKC_PLACE_STRIDE_V_LO(I)           (SKC_PLACE_SUBGROUP_RATIO_SCALE(I) * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_RATIO * SKC_PLACE_SUBGROUP_SIZE)

#elif  ( SKC_PREFIX_SUBGROUP_SIZE <  SKC_PLACE_SUBGROUP_SIZE ) // reduces to the case above when the ratio equals 1

#define SKC_PLACE_SUBGROUP_RATIO           (SKC_PLACE_SUBGROUP_SIZE / SKC_PREFIX_SUBGROUP_SIZE)
#define SKC_PLACE_SUBGROUP_RATIO_MASK      (SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO - 1) // equal to the prefix subgroup mask

#define SKC_PLACE_STRIDE_H(L)              (((L) & ~SKC_PLACE_SUBGROUP_RATIO_MASK) * 2 + ((L) & SKC_PLACE_SUBGROUP_RATIO_MASK))
#define SKC_PLACE_STRIDE_V_LO(I)           (I * 2 * SKC_PLACE_SUBGROUP_SIZE)
#define SKC_PLACE_STRIDE_V_HI(I)           (SKC_PLACE_STRIDE_V_LO(I) + SKC_PLACE_SUBGROUP_SIZE / SKC_PLACE_SUBGROUP_RATIO)

#endif

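//
// worked example (editorial, not in the original source): if the
// prefix kernel ran with a subgroup of 16 and place runs with a
// subgroup of 8, then SKC_PLACE_SUBGROUP_RATIO is 2,
// SKC_PLACE_SUBGROUP_RATIO_SCALE(I) maps rows { 0, 1, 2, 3 } to
// { 0, 1, 4, 5 }, and SKC_PLACE_STRIDE_V_LO(I) yields dword offsets
// { 0, 8, 32, 40 } with SKC_PLACE_STRIDE_V_HI(I) adding 16 -- exactly
// the lo-then-hi interleaving the wider prefix subgroup wrote
//
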
//
// A COARSE COMPILE-TIME GUARD -- WILL ONLY MATTER WHEN SUBGROUP SIZE
// IS EQUAL TO THE RASTER HEADER SIZE (CURRENTLY 8)
//

#define SKC_PLACE_IS_ALL_HEADER_ROW(i)   (((i)+1) * SKC_PLACE_SUBGROUP_SIZE <= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_NOT_HEADER_ROW(i)   ( (i)    * SKC_PLACE_SUBGROUP_SIZE >= SKC_RASTER_HEAD_DWORDS)

#define SKC_PLACE_IS_TRAILING_ROW(i)     (((i)+1) * SKC_PLACE_SUBGROUP_SIZE == SKC_DEVICE_BLOCK_DWORDS)

// note: captures 'k' from the expansion site rather than taking it as
// a parameter -- identical to SKC_PLACE_HEADER_LESS_THAN(i,k) below
#define SKC_PLACE_IS_HEADER_ROW_KEY(i)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))

//
// Note: HEADER_LESS_THAN purposefully relies on unsigned wraparound --
// lanes still inside the header underflow toward UINT_MAX and fail
// the comparison
//
#define SKC_PLACE_HEADER_LESS_THAN(i,k) ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS < (k))
#define SKC_PLACE_NODE_LESS_THAN(i,k)   ((i) * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id()                          < (k))

//
// TTSK v2:
//
//  0                                       63
//  | TTSB ID | PREFIX |  SPAN   |  X  |  Y  |
//  +---------+--------+---------+-----+-----+
//  |    27   | 1 (=0) | 12 (=0) | 12  | 12  |
//
//
// TTPK v2:
//
//  0                                    63
//  | TTPB ID | PREFIX | SPAN |  X  |  Y  |
//  +---------+--------+------+-----+-----+
//  |    27   | 1 (=1) |  12  | 12  | 12  |
//
//

//
// TTCK (32-BIT COMPARE) v1:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   18  |  7  |  7  |
//
//
// TTCK (32-BIT COMPARE) v2:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          30          |    1   |    1   |   15  |  9  |  8  |
//
//
// TTCK (64-BIT COMPARE) -- achieves 4K x 4K with an 8x16 tile:
//
//  0                                                           63
//  | PAYLOAD/TTSB/TTPB ID | PREFIX | ESCAPE | LAYER |  X  |  Y  |
//  +----------------------+--------+--------+-------+-----+-----+
//  |          27          |    1   |    1   |   18  |  9  |  8  |
//

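//
// field offsets implied by the 64-bit compare layout above (editorial
// note, derived from the widths): ID bits [0,27), PREFIX bit 27,
// ESCAPE bit 28, LAYER bits [29,47), X bits [47,56), Y bits [56,64).
// with an 8-wide x 16-high tile, 9 bits of X and 8 bits of Y cover
// 512*8 = 4096 by 256*16 = 4096 pixels -- hence "4K x 4K"
//
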
union skc_subgroup_smem
{
  skc_uint scratch[SKC_PLACE_SUBGROUP_SIZE]; // will only use SKC_PLACE_SUBGROUP_SIZE

  struct {
    struct {
      skc_ttsk_lo_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_lo_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } lo;

    struct {
      skc_ttsk_hi_t sk[SKC_PLACE_SMEM_COUNT_TTSK];
      skc_ttpk_hi_t pk[SKC_PLACE_SMEM_COUNT_TTPK];
    } hi;

    // skc_uint span[SKC_PLACE_SMEM_COUNT_TTPK];
  };

};
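
//
// note (editorial): the lo/hi halves of each staged key are kept in
// separate arrays, mirroring the prefix kernel's two subgroup-wide
// coalesced writes described at the top of this file.  scratch[]
// overlays the front of lo.sk[], which is safe because the sk keys
// are always flushed before skc_scatter_scan_max() touches scratch
//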

//
// scatter scan max
//
static
skc_int_v_t
skc_scatter_scan_max(__local union skc_subgroup_smem  volatile * const smem,
                     skc_int_v_t                                 const iss,
                     skc_int_v_t                                 const ess)
{
  //
  // prefix sums determine which lanes we're going to work on next
  //
  skc_pred_v_t const is_scratch_store = (iss > 0) && (ess < SKC_PLACE_SUBGROUP_SIZE);
  skc_int_v_t  const scratch_idx      = max(ess,0);

  //
  // SIMT
  //

  //
  // zero the volatile smem scratchpad
  //
  smem->scratch[get_sub_group_local_id()] = 0;

  //
  // store source lane at starting lane
  //
  if (is_scratch_store) {
    smem->scratch[scratch_idx] = get_sub_group_local_id();
  }

  //
  // propagate lanes to right using max scan
  //
  skc_int_v_t const scratch = smem->scratch[get_sub_group_local_id()];
  skc_int_v_t const source  = sub_group_scan_inclusive_max(scratch);

  return source;
}
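
//
// worked example (editorial): with a subgroup of 8 and ttpk spans
// { 3, 5, 2 } in lanes 0..2, the scans are iss = { 3, 8, 10 } and
// ess = { 0, 3, 8 }.  lanes 0 and 1 store their ids at scratch[0]
// and scratch[3] (lane 2's ess of 8 falls outside this pass), and
// the max scan returns source = { 0, 0, 0, 1, 1, 1, 1, 1 } -- each
// output lane now knows which pk key it expands
//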

//
//
//

static
skc_bool
skc_xk_clip(union skc_tile_clip const * const tile_clip,
            skc_ttxk_t                * const xk)
{
  //
  // clip the sk and pk keys
  //
  // if fully clipped then return false
  //
  // alternatively -- we can expand all these keys in place
  //
  // alternatively -- keep sk and pk keys segregated because sk
  // represents the vast majority of keys and is easier to process.
  // don't mess with the fastpath!
  //
  // NOTE: currently a stub -- no key is ever reported as fully
  // clipped
  //
  return false;
}

//
//
//

static
skc_ttck_t
skc_sk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
             union skc_cmd_place              const    * const cmd,
             skc_uint                                    const sk_idx)
{
  skc_uint const lo = smem->lo.sk[sk_idx]; // assumes prefix bit is 0
  skc_uint const hi = smem->hi.sk[sk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty + SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}

static
skc_ttck_t
skc_pk_to_ck(__local union skc_subgroup_smem  volatile * const smem,
             union skc_cmd_place              const    * const cmd,
             skc_uint                                    const pk_idx,
             skc_uint                                    const dx)
{
  skc_uint const lo = smem->lo.pk[pk_idx] & SKC_TTXK_LO_MASK_ID_PREFIX; // assumes prefix bit is 1
  skc_uint const hi = smem->hi.pk[pk_idx];

  skc_ttck_t ck;

  ck.lo = lo | (cmd->layer_id << SKC_TTCK_LO_BITS_ID_PREFIX_ESCAPE); // FIXME -- preshift the layer id

  // FIXME -- x and y should already be clipped and shifted
  skc_uint const x = (cmd->tx + dx + SKC_BFE(hi,SKC_TTXK_HI_BITS_X,SKC_TTXK_HI_OFFSET_X)) << SKC_TTCK_HI_OFFSET_X;
  skc_uint const y = (cmd->ty +      SKC_BFE(hi,SKC_TTXK_HI_BITS_Y,SKC_TTXK_HI_OFFSET_Y)) << SKC_TTCK_HI_OFFSET_Y;

  ck.hi = (cmd->layer_id >> SKC_TTCK_HI_SHR_LAYER) | x | y;

  return ck;
}
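
//
// worked example (editorial): a pk key spanning 3 tiles at tile
// (x,y) is expanded by skc_ttpk_flush() into three ck keys at
// (x+0,y), (x+1,y) and (x+2,y) -- the dx argument above supplies the
// per-lane offset along the scanline
//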

//
//
//

static
void
skc_ttsk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
               __global skc_ttck_t                       * const ck_extent,
               __local union skc_subgroup_smem  volatile * const smem,
               union skc_cmd_place              const    * const cmd,
               skc_uint                         const            sk)
{
  //
  // an sk count of 0 should be impossible here
  //
  skc_uint ck_base = 0;

  // first lane atomically allocates space in the ck extent
  if (get_sub_group_local_id() == 0) {
    ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,sk);
  }

  // broadcast base to all lanes
  ck_base = sub_group_broadcast(ck_base,0);

  // convert sk keys to ck keys
  for (skc_uint ii=get_sub_group_local_id(); ii<sk; ii+=SKC_PLACE_SUBGROUP_SIZE)
    {
      ck_extent[ck_base+ii] = skc_sk_to_ck(smem,cmd,ii);
    }
}

//
//
//

static
skc_int
skc_ttpk_get_span(__local union skc_subgroup_smem  volatile * const smem,
                  skc_uint                                    const idx)
{
  skc_uint const lo      = smem->lo.pk[idx];
  skc_uint const hi      = smem->hi.pk[idx];

  skc_uint const span_lo = lo >> SKC_TTXK_LO_OFFSET_SPAN;
  skc_uint const span_hi = (hi & SKC_BITS_TO_MASK(SKC_TTXK_HI_BITS_SPAN)) << SKC_TTXK_LO_BITS_SPAN;

  return (span_lo | span_hi) + 1;
}
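
//
// note (editorial): the ttpk span field straddles the lo and hi words
// of the key and is stored biased by -1 -- the +1 above recovers the
// true tile count since a pk key always spans at least one tile
//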

//
//
//

static
void
skc_ttpk_flush(__global SKC_ATOMIC_UINT         volatile * const place_atomics,
               __global skc_ttck_t                       * const ck_extent,
               __local union skc_subgroup_smem  volatile * const smem,
               union skc_cmd_place              const    * const cmd,
               skc_uint                         const            pk)
{
  // bail out if pk queue is empty
  if (pk == 0)
    return;

#if 0
  if (get_sub_group_local_id() == 0)
    printf("%u\n",pk);
#endif

  //
  // FIXME -- this nested loop iterates over the queue processing a
  // subgroup of 64-bit keys at a time.  This is probably not the most
  // efficient approach so investigate how to store and iterate over a
  // wider-than-subgroup (node-sized) queue of keys.
  //

  // round up so we work with full subgroups
  skc_uint const pk_ru = (pk + SKC_PLACE_SUBGROUP_SIZE - 1) & ~SKC_PLACE_SUBGROUP_MASK;
  skc_uint       ii    = 0;

  // nested loop that expands all ttpk keys
#if (SKC_PLACE_SMEM_COUNT_TTPK > SKC_PLACE_SUBGROUP_SIZE)
  for (; ii<pk_ru; ii+=SKC_PLACE_SUBGROUP_SIZE)
#endif
    {
      skc_uint idx  = ii + get_sub_group_local_id();
      skc_int  span = 0;

      // how many tiles does this ttpk span?
      if (idx < pk)
        span = skc_ttpk_get_span(smem,idx);

      // we need inclusive, exclusive and total
      skc_int iss = sub_group_scan_inclusive_add(span);
      skc_int ess = iss - span;
      skc_int rem = sub_group_broadcast(iss,SKC_PLACE_SUBGROUP_SIZE-1);

      // printf("%u : %u\n",span,iss);
      // continue;

      // atomically allocate space for the expanded pk keys
      skc_uint ck_base = 0;

      // first lane atomically allocates space in the ck extent
      if (get_sub_group_local_id() == 0) {
        ck_base = SKC_ATOMIC_ADD_GLOBAL_RELAXED_DEVICE(place_atomics,rem);
      }

      // broadcast atomically allocated extent base to all lanes
      skc_uint ck_idx = sub_group_broadcast(ck_base,0) + get_sub_group_local_id();

      //
      // FIXME -- this loop would probably be faster if the ttpk keys
      // were held in registers and accessed with shuffles instead of
      // SMEM loads
      //

      //
      // loop until there are no more expanded pk keys
      //
      while (true)
        {
          skc_int const source = skc_scatter_scan_max(smem,iss,ess);
          skc_int const dx     = get_sub_group_local_id() - intel_sub_group_shuffle(ess,source);

          // store valid ck keys to gmem
          if (get_sub_group_local_id() < rem) {
            ck_extent[ck_idx] = skc_pk_to_ck(smem,cmd,ii+source,dx);
          }

          // decrement remainder
          rem -= SKC_PLACE_SUBGROUP_SIZE;

          if (rem <= 0)
            break;

          // advance the output index and shift the scans down a subgroup
          ck_idx += SKC_PLACE_SUBGROUP_SIZE;
          iss    -= SKC_PLACE_SUBGROUP_SIZE;
          ess    -= SKC_PLACE_SUBGROUP_SIZE;
        }
    }
}

//
//
//

static
skc_uint
skc_ballot(skc_uint * const xk, skc_uint const is_xk)
{
#if 0
  //
  // FIXME -- when available, this should use the idiom:
  //
  //   ballot() + lane_mask_less_than_or_equal + popcount()
  //
  // Supported by:
  //
  //   - Vulkan 1.1 / SPIR-V 1.3
  //   - CUDA
  //   - AVX2 (SSE*?)
  //
#else
  //
  // otherwise, emulate with an inclusive scan (yuk)
  //
  skc_uint const prefix = sub_group_scan_inclusive_add(is_xk);

  skc_uint const xk_idx = *xk + prefix - is_xk;

  *xk += sub_group_broadcast(prefix,SKC_PLACE_SUBGROUP_LAST);

#if 0
  printf("< %3u >\n",xk_idx);
#endif

  return xk_idx;
#endif
}
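
//
// a sketch of the ballot idiom the FIXME above describes, assuming
// the (newer) cl_khr_subgroup_ballot extension were available --
// editorial, not part of the original source:
//
//   uint4 const ballot = sub_group_ballot(is_xk);
//
//   skc_uint const xk_idx = *xk + sub_group_ballot_exclusive_scan(ballot);
//
//   *xk += sub_group_ballot_bit_count(ballot);
//
//   return xk_idx;
//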

//
//
//
__kernel
SKC_PLACE_KERNEL_ATTRIBS
void
skc_kernel_place(__global skc_bp_elem_t                * const bp_elems,
                 __global SKC_ATOMIC_UINT     volatile * const place_atomics,
                 __global skc_ttck_t                   * const ck_extent,
                 __global union skc_cmd_place const    * const cmds,
                 __global skc_block_id_t               * const map,
                 skc_uint4                               const clip,
                 skc_uint                                const count)
{
  //
  // declare shared memory block
  //
#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  __local union skc_subgroup_smem  volatile                smem[1];
#else
  __local union skc_subgroup_smem  volatile                smem_wg[SKC_PLACE_WORKGROUP_SUBGROUPS];
  __local union skc_subgroup_smem  volatile * const smem = smem_wg + get_sub_group_id();
#endif

  //
  // This is a subgroup-centric kernel
  //
  // Which subgroup in the grid is this?
  //
  // TAKE NOTE: the Intel GEN compiler appears to recognize
  // get_group_id(0) as a uniform, but the alternative calculation used
  // when there are multiple subgroups per workgroup is not recognized
  // as uniform and drives register spillage elsewhere.
  //
  // Test the raster's translated bounds against the composition's
  // tile clip
  //
  // There are 3 cases:
  //
  //   - the raster is completely clipped -> return
  //   - the raster is partially  clipped -> all keys must be clipped
  //   - the raster is not        clipped -> no keys are tested
  //
  //
  // There are at least 4 implementations of place and we want to
  // special-case them as much as possible so that, at the least, the
  // fastpath remains fast.
  //
  //  - implement NO CLIP + NO TRANSLATION fastpath -- CAN ATOMICALLY ALLOCATE SK+PK KEYS IN ONE STEP
  //
  //  - implement CLIPPED + NO TRANSLATION path
  //
  //  - implement NO CLIP +    TRANSLATION path
  //
  //  - implement CLIPPED +    TRANSLATION path
  //
  //
  // FIXME/OPTIMIZATION: split the scan accumulator into a triple-bin
  // 12:12:8 integer where:
  //
  //  12: ttsk
  //  12: ttpk
  //   8: /dev/null -- clipped or invalid key
  //
  // Three kinds of nodes in a raster's list:
  //
  //  - the head node
  //  - an internal node
  //  - the final node
  //

#if ( SKC_PLACE_WORKGROUP_SUBGROUPS == 1 )
  skc_uint const cmd_idx = get_group_id(0);
#else
  skc_uint const cmd_idx = get_group_id(0) * SKC_PLACE_WORKGROUP_SUBGROUPS + get_sub_group_id();
#endif

  // load command
  union skc_cmd_place const cmd = cmds[cmd_idx];

  // get the raster header from the raster host id -- scalar
  skc_block_id_t            id  = map[cmd.raster_h];

  //
  // load all of the head block ttxk keys into registers
  //
  // FIXME -- this pattern lends itself to using the higher
  // performance Intel GEN block load instructions
  //
  skc_uint const head_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
  union skc_raster_node_elem const h##I = {                     \
    .u32v2 = { bp_elems[head_id + SKC_PLACE_STRIDE_V_LO(I)],    \
               bp_elems[head_id + SKC_PLACE_STRIDE_V_HI(I)]  }  \
  };

  SKC_PLACE_EXPAND();

  //
  // load raster header counts -- we only need the "nodes" and "keys"
  // words but the keys we loaded are doublewords.
  //
  // FIXME -- this can be made portable with compile-time macro expansion
  //
  skc_uint nodes = sub_group_broadcast(h0.u32v2.lo,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_NODES
  skc_uint keys  = sub_group_broadcast(h0.u32v2.hi,1); // SKC_RASTER_HEAD_OFFSET_COUNTS_KEYS

  //
  //
  //
#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                 \
  printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",             \
         nodes,keys,                                            \
         I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),  \
         h##I.u32v2.hi,h##I.u32v2.lo,                           \
         h##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

  SKC_PLACE_EXPAND();
#endif

  //
#if 0
  if (get_sub_group_local_id() == 0) {
    printf("place: %u / %u / %u\n",head_id,nodes,keys);
  }
#endif

  {
    //
    // classify every key in the header
    //
    // keys: 0 is not a key / 1 is a key
    // skpk: 0 is sk        / 1 is pk
    //
    skc_uint bits_keys = 0;
    skc_uint bits_skpk = 0;

    //
    // calculate bits_keys
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
      skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id() - SKC_RASTER_HEAD_DWORDS; \
      if (idx < keys) {                                                 \
        bits_keys |= (1u << I);                                         \
      }                                                                 \
      if (SKC_PLACE_IS_TRAILING_ROW(I)) {                               \
        if (keys > SKC_RASTER_HEAD_COUNT_KEYS) {                        \
          if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {    \
            bits_keys &= ~(1u << I);                                    \
          }                                                             \
        }                                                               \
      }                                                                 \
    }

    SKC_PLACE_EXPAND();

    //
    // blindly calculate bits_skpk
    //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {                              \
      bits_skpk |= (h##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

    //
    // the next pointer is the last element of the last row.  save it
    // now because this might be recognized as a subgroup-uniform/scalar.
    //
    id = sub_group_broadcast(SKC_CONCAT(h,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

    //
    // append SK keys first
    //
    skc_uint const bits_sk = bits_keys & ~bits_skpk;
    skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                 \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
      skc_uint is_sk  = (bits_sk >> I) & 1;     \
      skc_uint sk_idx = skc_ballot(&sk,is_sk);  \
      if (is_sk) {                              \
        smem->lo.sk[sk_idx] = h##I.xk.lo;       \
        smem->hi.sk[sk_idx] = h##I.xk.hi;       \
      }                                         \
    }

    SKC_PLACE_EXPAND();

    //
    // append PK keys next
    //
    skc_uint const bits_pk = bits_keys & bits_skpk;
    skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                 \
    if (!SKC_PLACE_IS_ALL_HEADER_ROW(I)) {      \
      skc_uint is_pk  = (bits_pk >> I) & 1;     \
      skc_uint pk_idx = skc_ballot(&pk,is_pk);  \
      if (is_pk) {                              \
        smem->lo.pk[pk_idx] = h##I.xk.lo;       \
        smem->hi.pk[pk_idx] = h##I.xk.hi;       \
      }                                         \
    }

    SKC_PLACE_EXPAND();

#if 0
    printf("%2u * %2u\n",sk,pk);
#endif
    //
    // flush the keys
    //
    skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
    skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);
  }

  //
  // we're done if there was only a head node
  //
  if (nodes == 0)
    return;

  //
  // decrement keys
  //
  keys -= SKC_RASTER_HEAD_COUNT_KEYS;

  //
  // otherwise, append keys in trailing nodes to smem
  //
  while (true)
    {
      //
      // load all of the node block ttxk keys into registers
      //
      // FIXME -- this pattern lends itself to using the higher
      // performance Intel GEN block load instructions
      //
      skc_uint const node_id = id * SKC_DEVICE_SUBBLOCK_WORDS + SKC_PLACE_STRIDE_H(get_sub_group_local_id());

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      union skc_raster_node_elem const n##I = {                         \
        .u32v2 = { bp_elems[node_id + SKC_PLACE_STRIDE_V_LO(I)],        \
                   bp_elems[node_id + SKC_PLACE_STRIDE_V_HI(I)]  }      \
      };

      SKC_PLACE_EXPAND();

#if 0
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R)                                         \
      printf("%5u :  %6u : %3u : %08X . %08X - %08X\n",                 \
             nodes,keys,                                                \
             I*SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(),      \
             n##I.u32v2.hi,n##I.u32v2.lo,                               \
             n##I.u32v2.lo & SKC_TTXK_LO_MASK_PREFIX);

      SKC_PLACE_EXPAND();
#endif

      //
      // classify every key in the node
      //
      // keys: 0 is not a key / 1 is a key
      // skpk: 0 is sk        / 1 is pk
      //
      skc_uint bits_keys = 0;
      skc_uint bits_skpk = 0;

      //
      // calculate bits_keys
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
        skc_uint const idx = I * SKC_PLACE_SUBGROUP_SIZE + get_sub_group_local_id(); \
        if (idx < keys) {                                               \
          bits_keys |= (1u << I);                                       \
        }                                                               \
        if (SKC_PLACE_IS_TRAILING_ROW(I)) {                             \
          if (keys > SKC_RASTER_NODE_COUNT_KEYS) {                      \
            if (get_sub_group_local_id() == SKC_PLACE_SUBGROUP_LAST) {  \
              bits_keys &= ~(1u << I);                                  \
            }                                                           \
          }                                                             \
        }                                                               \
      }

      SKC_PLACE_EXPAND();

      //
      // blindly calculate bits_skpk
      //
#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                                       \
        bits_skpk |= (n##I.xk.lo & SKC_TTXK_LO_MASK_PREFIX) >> (SKC_TTXK_LO_OFFSET_PREFIX - I); \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2X : %2X\n",bits_keys,bits_skpk);
#endif

      //
      // the next pointer is the last element of the last row.  save it
      // now because this might be recognized as a subgroup-uniform/scalar.
      //
      id = sub_group_broadcast(SKC_CONCAT(n,SKC_PLACE_EXPAND_I_LAST).next.node,SKC_PLACE_SUBGROUP_LAST);

      //
      // append SK keys first
      //
      skc_uint const bits_sk = bits_keys & ~bits_skpk;
      skc_uint       sk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                       \
        skc_uint is_sk  = (bits_sk >> I) & 1;           \
        skc_uint sk_idx = skc_ballot(&sk,is_sk);        \
        if (is_sk) {                                    \
          smem->lo.sk[sk_idx] = n##I.xk.lo;             \
          smem->hi.sk[sk_idx] = n##I.xk.hi;             \
        }                                               \
      }

      SKC_PLACE_EXPAND();

      //
      // append PK keys next
      //
      skc_uint const bits_pk = bits_keys & bits_skpk;
      skc_uint       pk      = 0;

#undef  SKC_EXPAND_X
#define SKC_EXPAND_X(I,S,C,P,R) {                       \
        skc_uint is_pk  = (bits_pk >> I) & 1;           \
        skc_uint pk_idx = skc_ballot(&pk,is_pk);        \
        if (is_pk) {                                    \
          smem->lo.pk[pk_idx] = n##I.xk.lo;             \
          smem->hi.pk[pk_idx] = n##I.xk.hi;             \
        }                                               \
      }

      SKC_PLACE_EXPAND();

#if 0
      printf("%2u * %2u\n",sk,pk);
#endif
      //
      // flush the sk and pk keys accumulated from this node
      //
      skc_ttsk_flush(place_atomics,ck_extent,smem,&cmd,sk);
      skc_ttpk_flush(place_atomics,ck_extent,smem,&cmd,pk);

      //
      // if this was the last node then we're done
      //
      if (--nodes == 0)
        return;

      //
      // otherwise decrement keys
      //
      keys -= SKC_RASTER_NODE_COUNT_KEYS;
    }
}

//
//
//