//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//

#ifndef HS_CL_MACROS_ONCE
#define HS_CL_MACROS_ONCE

//
// Define the type based on key and val sizes
//

#if   HS_KEY_WORDS == 1
#if   HS_VAL_WORDS == 0
#define HS_KEY_TYPE  uint
#endif
#elif HS_KEY_WORDS == 2
#define HS_KEY_TYPE  ulong
#endif

//
// FYI, restrict shouldn't have any impact on these kernels, and
// benchmarks appear to confirm that.
//

#define HS_RESTRICT restrict

//
//
//

#define HS_REQD_SUBGROUP_SIZE()                 \
  __attribute__((intel_reqd_sub_group_size(HS_SLAB_THREADS)))

//
// KERNEL PROTOS
//

#define HS_BS_KERNEL_PROTO(slab_count,slab_count_ru_log2)               \
  __kernel                                                              \
  HS_REQD_SUBGROUP_SIZE()                                               \
  __attribute__((reqd_work_group_size(HS_SLAB_THREADS*slab_count,1,1))) \
  void                                                                  \
  hs_kernel_bs_##slab_count_ru_log2(__global HS_KEY_TYPE const * const HS_RESTRICT vin, \
                                    __global HS_KEY_TYPE       * const HS_RESTRICT vout)

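// For example, with a hypothetical HS_SLAB_THREADS of 8,
// HS_BS_KERNEL_PROTO(16,4) declares a kernel named hs_kernel_bs_4
// that requires a work-group of 8*16 = 128 work-items -- i.e. 16
// slabs sorted per work-group.
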
#define HS_BC_KERNEL_PROTO(slab_count,slab_count_log2)                  \
  __kernel                                                              \
  HS_REQD_SUBGROUP_SIZE()                                               \
  __attribute__((reqd_work_group_size(HS_SLAB_THREADS*slab_count,1,1))) \
  void                                                                  \
  hs_kernel_bc_##slab_count_log2(__global HS_KEY_TYPE * const HS_RESTRICT vout)

#define HS_FM_KERNEL_PROTO(s,r)                                         \
  __kernel                                                              \
  HS_REQD_SUBGROUP_SIZE()                                               \
  void                                                                  \
  hs_kernel_fm_##s##_##r(__global HS_KEY_TYPE * const HS_RESTRICT vout)

#define HS_HM_KERNEL_PROTO(s)                                           \
  __kernel                                                              \
  HS_REQD_SUBGROUP_SIZE()                                               \
  void                                                                  \
  hs_kernel_hm_##s(__global HS_KEY_TYPE * const HS_RESTRICT vout)

#define HS_TRANSPOSE_KERNEL_PROTO()                                     \
  __kernel                                                              \
  HS_REQD_SUBGROUP_SIZE()                                               \
  void                                                                  \
  hs_kernel_transpose(__global HS_KEY_TYPE * const HS_RESTRICT vout)

//
// BLOCK LOCAL MEMORY DECLARATION
//

#define HS_BLOCK_LOCAL_MEM_DECL(width,height)   \
  __local struct {                              \
    HS_KEY_TYPE m[width * height];              \
  } shared

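// For example, HS_BLOCK_LOCAL_MEM_DECL(8,32) -- hypothetical
// arguments -- reserves __local scratch space for 8*32 = 256 keys,
// addressable as "shared.m[]".
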
//
//
//

#define HS_SUBGROUP_ID()                        \
  get_sub_group_id()

//
// BLOCK BARRIER
//

#define HS_BLOCK_BARRIER()                      \
  barrier(CLK_LOCAL_MEM_FENCE)

//
// SLAB GLOBAL
//

#define HS_SLAB_GLOBAL_PREAMBLE()                                       \
  uint const gmem_idx =                                                 \
    (get_global_id(0) & ~(HS_SLAB_THREADS-1)) * HS_SLAB_HEIGHT +        \
    (get_local_id(0) & (HS_SLAB_THREADS-1))

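// A worked example, assuming a hypothetical HS_SLAB_THREADS of 8 and
// HS_SLAB_HEIGHT of 16 (so each slab spans 8*16 = 128 keys):
//
//   global id 19 -> slab 2 (19 & ~7 = 16), lane 3 (19 & 7 = 3)
//   gmem_idx     = 16 * 16 + 3 = 259
//
// i.e. lane 3's column of slab 2, whose keys then sit at a stride of
// HS_SLAB_THREADS: 259, 267, 275, ...
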
#define HS_SLAB_GLOBAL_LOAD(extent,row_idx)     \
  extent[gmem_idx + HS_SLAB_THREADS * row_idx]

#define HS_SLAB_GLOBAL_STORE(row_idx,reg)               \
  vout[gmem_idx + HS_SLAB_THREADS * row_idx] = reg

//
// SLAB LOCAL
//

#define HS_SLAB_LOCAL_L(offset)                 \
  shared.m[smem_l_idx + (offset)]

#define HS_SLAB_LOCAL_R(offset)                 \
  shared.m[smem_r_idx + (offset)]

//
// SLAB LOCAL VERTICAL LOADS
//

#define HS_BX_LOCAL_V(offset)                   \
  shared.m[get_local_id(0) + (offset)]

//
// BLOCK SORT MERGE HORIZONTAL
//

#define HS_BS_MERGE_H_PREAMBLE(slab_count)                      \
  uint const smem_l_idx =                                       \
    get_sub_group_id() * (HS_SLAB_THREADS * slab_count) +       \
    get_sub_group_local_id();                                   \
  uint const smem_r_idx =                                       \
    (get_sub_group_id() ^ 1) * (HS_SLAB_THREADS * slab_count) + \
    (get_sub_group_local_id() ^ (HS_SLAB_THREADS - 1))

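// A worked example, assuming hypothetical values HS_SLAB_THREADS = 8
// and slab_count = 2 (so each subgroup owns a 16-key row of shared):
//
//   subgroup 0, lane 3:  smem_l_idx = 0*16 + 3     = 3
//                        smem_r_idx = 1*16 + (3^7) = 20
//
// i.e. paired subgroups index each other's rows with the lane order
// reversed -- the head-to-tail access pattern of a flip merge.
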
//
// BLOCK CLEAN MERGE HORIZONTAL
//

#define HS_BC_MERGE_H_PREAMBLE(slab_count)                              \
  uint const gmem_l_idx =                                               \
    (get_global_id(0) & ~(HS_SLAB_THREADS*slab_count-1)) *              \
    HS_SLAB_HEIGHT + get_local_id(0);                                   \
  uint const smem_l_idx =                                               \
    get_sub_group_id() * (HS_SLAB_THREADS * slab_count) +               \
    get_sub_group_local_id()

#define HS_BC_GLOBAL_LOAD_L(slab_idx)                   \
  vout[gmem_l_idx + (HS_SLAB_THREADS * slab_idx)]

//
// SLAB FLIP AND HALF PREAMBLES
//

#define HS_SLAB_FLIP_PREAMBLE(mask)                                     \
  uint const flip_lane_idx = get_sub_group_local_id() ^ mask;           \
  int  const t_lt          = get_sub_group_local_id() < flip_lane_idx;

#define HS_SLAB_HALF_PREAMBLE(mask)                                     \
  uint const half_lane_idx = get_sub_group_local_id() ^ mask;           \
  int  const t_lt          = get_sub_group_local_id() < half_lane_idx;

//
// Inter-lane compare exchange
//

// default
#define HS_CMP_XCHG_V0(a,b)                     \
  {                                             \
    HS_KEY_TYPE const t = min(a,b);             \
    b = max(a,b);                               \
    a = t;                                      \
  }

// super slow
#define HS_CMP_XCHG_V1(a,b)                     \
  {                                             \
    HS_KEY_TYPE const tmp = a;                  \
    a  = (a < b) ? a : b;                       \
    b ^= a ^ tmp;                               \
  }

// best
#define HS_CMP_XCHG_V2(a,b)                     \
  if (a >= b) {                                 \
    HS_KEY_TYPE const t = a;                    \
    a = b;                                      \
    b = t;                                      \
  }

// good
#define HS_CMP_XCHG_V3(a,b)                     \
  {                                             \
    int         const ge = a >= b;              \
    HS_KEY_TYPE const t  = a;                   \
    a = ge ? b : a;                             \
    b = ge ? t : b;                             \
  }

//
//
//

#if   (HS_KEY_WORDS == 1)
#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V0(a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_CMP_XCHG(a,b)  HS_CMP_XCHG_V2(a,b)
#endif

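// As an illustration (not code emitted by the generator), three
// registers can be sorted with a three-element sorting network built
// from this primitive:
//
//   HS_CMP_XCHG(r0,r1);   // order the first pair
//   HS_CMP_XCHG(r1,r2);   // max of all three settles in r2
//   HS_CMP_XCHG(r0,r1);   // min settles in r0 -- r0 <= r1 <= r2
//
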
//
// The flip/half comparisons rely on a "conditional min/max":
//
//  - if the flag is false, return max(a,b)
//  - otherwise, return min(a,b)
//
// What's a little surprising is that sequence V1 below is faster
// than V0 for 32-bit keys.
//
// I suspect either a code generation problem or that the sequence
// maps well to the GEN instruction set.
//
// We mostly care about 64-bit keys and, unsurprisingly, sequence V0
// is fastest for this wider type.
//

// this is what you would normally use
#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) ^ lt) ? b : a

// this may be faster for 32-bit keys on Intel GEN
#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))

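// A quick sanity check of V0 with a = 3, b = 7:
//
//   lt = 0 : ((3 <= 7) ^ 0) != 0  -> returns b = 7 = max(a,b)
//   lt = 1 : ((3 <= 7) ^ 1) == 0  -> returns a = 3 = min(a,b)
//
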
//
// FIXME -- EVENTUALLY HANDLE KEY+VAL
//

#if   (HS_KEY_WORDS == 1)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
#elif (HS_KEY_WORDS == 2)
#define HS_COND_MIN_MAX(lt,a,b) HS_COND_MIN_MAX_V0(lt,a,b)
#endif

//
// Conditional inter-subgroup flip/half compare exchange
//

#define HS_CMP_FLIP(i,a,b)                                              \
  {                                                                     \
    HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,flip_lane_idx);    \
    HS_KEY_TYPE const tb = intel_sub_group_shuffle(b,flip_lane_idx);    \
    a = HS_COND_MIN_MAX(t_lt,a,tb);                                     \
    b = HS_COND_MIN_MAX(t_lt,b,ta);                                     \
  }

#define HS_CMP_HALF(i,a)                                                \
  {                                                                     \
    HS_KEY_TYPE const ta = intel_sub_group_shuffle(a,half_lane_idx);    \
    a = HS_COND_MIN_MAX(t_lt,a,ta);                                     \
  }

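// A worked example of HS_CMP_FLIP with mask = 1 across two lanes,
// each holding registers (a,b):
//
//   lane 0: (5,9)   lane 1: (2,7)        t_lt = {1,0}
//
// The flip pairs a0<->b1 and b0<->a1, so lane 0 keeps the mins and
// lane 1 keeps the maxes:
//
//   lane 0: (5,2)   lane 1: (9,7)
//
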
//
// The device's comparison operator might return what we actually
// want.  For example, it appears GEN 'cmp' returns {true:-1,false:0}.
//

#define HS_CMP_IS_ZERO_ONE

#ifdef HS_CMP_IS_ZERO_ONE
// OpenCL requires a {true: +1, false: 0} scalar result
// (a <= b) -> { +1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) (HS_KEY_TYPE)(-(a <= b))
#define HS_CMP_TO_MASK(a)   (HS_KEY_TYPE)(-a)
#else
// However, OpenCL requires { -1, 0 } for vectors
// (a <= b) -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
#define HS_CMP_TO_MASK(a)   (a)
#endif

//
// The "flip-merge" and "half-merge" preambles are very similar
//
// For now, we're only using the .y dimension for the span idx
//

#define HS_HM_PREAMBLE(half_span)                       \
  uint const span_idx    = get_global_id(1);            \
  uint const span_stride = get_global_size(0);          \
  uint const span_size   = span_stride * half_span * 2; \
  uint const span_base   = span_idx * span_size;        \
  uint const span_off    = get_global_id(0);            \
  uint const span_l      = span_base + span_off

#define HS_FM_PREAMBLE(half_span)                                       \
  HS_HM_PREAMBLE(half_span);                                            \
  uint const span_r      = span_base + span_stride * (half_span + 1) - span_off - 1

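// A worked example of the flip-merge indexing, with hypothetical
// values half_span = 4, span_stride (global x-size) = 1024,
// span_idx = 0, and span_off = 5:
//
//   span_l = 5                        -> row 0, column 5
//   span_r = 1024 * 5 - 5 - 1 = 5114  -> row 4, column 1018
//
// The left and right indices are mirrored within the span, which is
// the access pattern of a bitonic "flip" merge.
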
//
//
//

#define HS_XM_GLOBAL_L(stride_idx)              \
  vout[span_l + span_stride * stride_idx]

#define HS_XM_GLOBAL_LOAD_L(stride_idx)         \
  HS_XM_GLOBAL_L(stride_idx)

#define HS_XM_GLOBAL_STORE_L(stride_idx,reg)    \
  HS_XM_GLOBAL_L(stride_idx) = reg

#define HS_FM_GLOBAL_R(stride_idx)              \
  vout[span_r + span_stride * stride_idx]

#define HS_FM_GLOBAL_LOAD_R(stride_idx)         \
  HS_FM_GLOBAL_R(stride_idx)

#define HS_FM_GLOBAL_STORE_R(stride_idx,reg)    \
  HS_FM_GLOBAL_R(stride_idx) = reg

//
// This snarl of macros is for transposing a "slab" of sorted elements
// into linear order.
//
// This can occur as the last step in hs_sort() or via a custom kernel
// that inspects the slab and then transposes and stores it to memory.
//
// The slab format can be inspected more efficiently than a linear
// arrangement.
//
// The prime example is detecting when adjacent keys (in sort order)
// have differing high order bits ("key changes").  The index of each
// change is recorded to an auxiliary array.
//
// A post-processing step like this needs to be able to navigate the
// slab and eventually transpose and store the slab in linear order.
//

#define HS_SUBGROUP_SHUFFLE_XOR(v,m)   intel_sub_group_shuffle_xor(v,m)

#define HS_TRANSPOSE_REG(prefix,row)   prefix##row
#define HS_TRANSPOSE_DECL(prefix,row)  HS_KEY_TYPE const HS_TRANSPOSE_REG(prefix,row)
#define HS_TRANSPOSE_PRED(level)       is_lo_##level

#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)       \
  prefix_curr##row_ll##_##row_ur

#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur)      \
  HS_KEY_TYPE const HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)

#define HS_TRANSPOSE_STAGE(level)                       \
  bool const HS_TRANSPOSE_PRED(level) =                 \
    (get_sub_group_local_id() & (1 << (level-1))) == 0;

#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
  HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) =                    \
    HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ?                  \
                            HS_TRANSPOSE_REG(prefix_prev,row_ll) :      \
                            HS_TRANSPOSE_REG(prefix_prev,row_ur),       \
                            1<<(level-1));                              \
                                                                        \
  HS_TRANSPOSE_DECL(prefix_curr,row_ll) =                               \
    HS_TRANSPOSE_PRED(level)                  ?                         \
    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) :                   \
    HS_TRANSPOSE_REG(prefix_prev,row_ll);                               \
                                                                        \
  HS_TRANSPOSE_DECL(prefix_curr,row_ur) =                               \
    HS_TRANSPOSE_PRED(level)                  ?                         \
    HS_TRANSPOSE_REG(prefix_prev,row_ur)      :                         \
    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);

#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to)      \
  vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \
    HS_TRANSPOSE_REG(prefix,row_from);

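// A rough sketch of how the generator might compose these macros for
// a hypothetical 4-lane slab with two registers (r1,r2) per lane --
// one stage and one blend per level, then a remap of each row:
//
//   HS_TRANSPOSE_STAGE(1)            // is_lo_1: lanes with bit 0 clear
//   HS_TRANSPOSE_BLEND(r,s,1,1,2)    // exchange across lanes 1 apart
//   HS_TRANSPOSE_STAGE(2)            // is_lo_2: lanes with bit 1 clear
//   HS_TRANSPOSE_BLEND(s,t,2,1,2)    // exchange across lanes 2 apart
//   HS_TRANSPOSE_REMAP(t,1,1)        // store register rows to linear
//   HS_TRANSPOSE_REMAP(t,2,2)        // rows (actual row permutation
//                                    // is generator-determined)
//
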
//
//
//

#endif

//
//
//