//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//

#ifndef HS_GLSL_MACROS_ONCE
#define HS_GLSL_MACROS_ONCE

//
// Define the key type based on the key and val sizes
//

#if   HS_KEY_WORDS == 1
#if   HS_VAL_WORDS == 0
#define HS_KEY_TYPE  uint
#endif
#elif HS_KEY_WORDS == 2 // FIXME -- might want to use uvec2
#define HS_KEY_TYPE  uint64_t // GL_ARB_gpu_shader_int64
#endif

//
// FYI, restrict shouldn't have any impact on these kernels, and
// benchmarks appear to bear that out.
//

#define HS_RESTRICT restrict

//
//
//

#define HS_GLSL_BINDING(n) \
  layout( binding = n)

#define HS_GLSL_WORKGROUP_SIZE(x,y,z) \
  layout( local_size_x = x,           \
          local_size_y = y,           \
          local_size_z = z) in

//
// KERNEL PROTOS
//

#define HS_BS_KERNEL_PROTO(slab_count,slab_count_ru_log2)           \
  HS_GLSL_SUBGROUP_SIZE()                                           \
  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_THREADS*slab_count,1,1);           \
  HS_GLSL_BINDING(0) writeonly buffer Vout { HS_KEY_TYPE vout[]; }; \
  HS_GLSL_BINDING(1) readonly  buffer Vin  { HS_KEY_TYPE vin[];  }; \
  void main()

#define HS_BC_KERNEL_PROTO(slab_count,slab_count_log2)              \
  HS_GLSL_SUBGROUP_SIZE()                                           \
  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_THREADS*slab_count,1,1);           \
  HS_GLSL_BINDING(0) buffer Vout { HS_KEY_TYPE vout[]; };           \
  void main()

#define HS_FM_KERNEL_PROTO(s,r)                                     \
  HS_GLSL_SUBGROUP_SIZE()                                           \
  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_THREADS,1,1);                      \
  HS_GLSL_BINDING(0) buffer Vout { HS_KEY_TYPE vout[]; };           \
  void main()

#define HS_HM_KERNEL_PROTO(s)                                       \
  HS_GLSL_SUBGROUP_SIZE()                                           \
  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_THREADS,1,1);                      \
  HS_GLSL_BINDING(0) buffer Vout { HS_KEY_TYPE vout[]; };           \
  void main()

#define HS_TRANSPOSE_KERNEL_PROTO()                                 \
  HS_GLSL_SUBGROUP_SIZE()                                           \
  HS_GLSL_WORKGROUP_SIZE(HS_SLAB_THREADS,1,1);                      \
  HS_GLSL_BINDING(0) buffer Vout { HS_KEY_TYPE vout[]; };           \
  void main()

//
// BLOCK LOCAL MEMORY DECLARATION
//

#define HS_BLOCK_LOCAL_MEM_DECL(width,height) \
  shared struct {                             \
    HS_KEY_TYPE m[width * height];            \
  } smem

//
// BLOCK BARRIER
//

#define HS_BLOCK_BARRIER() \
  barrier()

//
// SHUFFLES
//

#if   (HS_KEY_WORDS == 1)
#define HS_SHUFFLE_CAST_TO(v)   v
#define HS_SHUFFLE_CAST_FROM(v) v
#elif (HS_KEY_WORDS == 2)
#define HS_SHUFFLE_CAST_TO(v)   uint64BitsToDouble(v)
#define HS_SHUFFLE_CAST_FROM(v) doubleBitsToUint64(v)
#endif

#define HS_SUBGROUP_SHUFFLE(v,i)      HS_SHUFFLE_CAST_FROM(subgroupShuffle(HS_SHUFFLE_CAST_TO(v),i))
#define HS_SUBGROUP_SHUFFLE_XOR(v,m)  HS_SHUFFLE_CAST_FROM(subgroupShuffleXor(HS_SHUFFLE_CAST_TO(v),m))
#define HS_SUBGROUP_SHUFFLE_UP(v,d)   HS_SHUFFLE_CAST_FROM(subgroupShuffleUp(HS_SHUFFLE_CAST_TO(v),d))
#define HS_SUBGROUP_SHUFFLE_DOWN(v,d) HS_SHUFFLE_CAST_FROM(subgroupShuffleDown(HS_SHUFFLE_CAST_TO(v),d))
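
//
// For example, with HS_KEY_WORDS == 2 the expression:
//
//   HS_SUBGROUP_SHUFFLE_XOR(r0,1)
//
// expands to:
//
//   doubleBitsToUint64(subgroupShuffleXor(uint64BitsToDouble(r0),1))
//
// The round trip through double is a pure bitcast -- presumably
// because the subgroup shuffle intrinsics can't be assumed to accept
// 64-bit integer operands on all targets.
//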
//
// SLAB GLOBAL
//

#define HS_SLAB_GLOBAL_PREAMBLE()                      \
  const uint gmem_idx =                                \
    (gl_GlobalInvocationID.x & ~(HS_SLAB_THREADS-1)) * \
    HS_SLAB_HEIGHT +                                   \
    (gl_LocalInvocationID.x & (HS_SLAB_THREADS-1))

#define HS_SLAB_GLOBAL_LOAD(extent,row_idx) \
  extent[gmem_idx + HS_SLAB_THREADS * row_idx]

#define HS_SLAB_GLOBAL_STORE(row_idx,reg) \
  vout[gmem_idx + HS_SLAB_THREADS * row_idx] = reg

//
// SLAB LOCAL
//

#define HS_SLAB_LOCAL_L(offset) \
  smem.m[smem_l_idx + (offset)]

#define HS_SLAB_LOCAL_R(offset) \
  smem.m[smem_r_idx + (offset)]

//
// SLAB LOCAL VERTICAL LOADS
//

#define HS_BX_LOCAL_V(offset) \
  smem.m[gl_LocalInvocationID.x + (offset)]

//
// BLOCK SORT MERGE HORIZONTAL
//

#define HS_BS_MERGE_H_PREAMBLE(slab_count)                    \
  const uint smem_l_idx =                                     \
    HS_SUBGROUP_ID() * (HS_SLAB_THREADS * slab_count) +       \
    HS_SUBGROUP_LANE_ID();                                    \
  const uint smem_r_idx =                                     \
    (HS_SUBGROUP_ID() ^ 1) * (HS_SLAB_THREADS * slab_count) + \
    (HS_SUBGROUP_LANE_ID() ^ (HS_SLAB_THREADS - 1))

//
// BLOCK CLEAN MERGE HORIZONTAL
//

#define HS_BC_MERGE_H_PREAMBLE(slab_count)                          \
  const uint gmem_l_idx =                                           \
    (gl_GlobalInvocationID.x & ~(HS_SLAB_THREADS * slab_count - 1)) \
    * HS_SLAB_HEIGHT + gl_LocalInvocationID.x;                      \
  const uint smem_l_idx =                                           \
    HS_SUBGROUP_ID() * (HS_SLAB_THREADS * slab_count) +             \
    HS_SUBGROUP_LANE_ID()

#define HS_BC_GLOBAL_LOAD_L(slab_idx) \
  vout[gmem_l_idx + (HS_SLAB_THREADS * slab_idx)]

//
// SLAB FLIP AND HALF PREAMBLES
//

#if 0

#define HS_SLAB_FLIP_PREAMBLE(mask)                        \
  const uint flip_lane_idx = HS_SUBGROUP_LANE_ID() ^ mask; \
  const bool t_lt          = HS_SUBGROUP_LANE_ID() < flip_lane_idx

#define HS_SLAB_HALF_PREAMBLE(mask)                        \
  const uint half_lane_idx = HS_SUBGROUP_LANE_ID() ^ mask; \
  const bool t_lt          = HS_SUBGROUP_LANE_ID() < half_lane_idx

#else

#define HS_SLAB_FLIP_PREAMBLE(mask) \
  const uint flip_lane_mask = mask; \
  const bool t_lt           = gl_LocalInvocationID.x < (gl_LocalInvocationID.x ^ mask)

#define HS_SLAB_HALF_PREAMBLE(mask) \
  const uint half_lane_mask = mask; \
  const bool t_lt           = gl_LocalInvocationID.x < (gl_LocalInvocationID.x ^ mask)

#endif

//
// Inter-lane compare exchange
//

// best on 32-bit keys
#define HS_CMP_XCHG_V0(a,b)         \
  {                                 \
    const HS_KEY_TYPE t = min(a,b); \
    b = max(a,b);                   \
    a = t;                          \
  }

// good on Intel GEN 32-bit keys
#define HS_CMP_XCHG_V1(a,b)    \
  {                            \
    const HS_KEY_TYPE tmp = a; \
    a  = (a < b) ? a : b;      \
    b ^= a ^ tmp;              \
  }

// best on 64-bit keys
#define HS_CMP_XCHG_V2(a,b)  \
  if (a >= b) {              \
    const HS_KEY_TYPE t = a; \
    a = b;                   \
    b = t;                   \
  }

// ok
#define HS_CMP_XCHG_V3(a,b)        \
  {                                \
    const bool ge = a >= b;        \
    const HS_KEY_TYPE t = a;       \
    a = ge ? b : a;                \
    b = ge ? t : b;                \
  }

//
// The flip/half comparisons rely on a "conditional min/max":
//
//   - if the flag is true,  return min(a,b)
//   - otherwise,            return max(a,b)
//
// What's a little surprising is that the bit-twiddling sequence (V1)
// is faster than the select sequence (V0) for 32-bit keys.
//
// I suspect either a code generation problem or that the sequence
// maps well to the GEN instruction set.
//
// We mostly care about 64-bit keys and, unsurprisingly, the select
// sequence (V0) is fastest for this wider type.
//

#define HS_LOGICAL_XOR() !=

// this is what you would normally use
#define HS_COND_MIN_MAX_V0(lt,a,b) ((a <= b) HS_LOGICAL_XOR() lt) ? b : a

// this seems to be faster for 32-bit keys on Intel GEN
#define HS_COND_MIN_MAX_V1(lt,a,b) (lt ? b : a) ^ ((a ^ b) & HS_LTE_TO_MASK(a,b))
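
//
// Spelled out (illustrative expansion, 32-bit keys):
//
//   HS_COND_MIN_MAX_V0(true, 1u,2u) -> ((1u <= 2u) != true ) ? 2u : 1u -> 1u // min
//   HS_COND_MIN_MAX_V0(false,1u,2u) -> ((1u <= 2u) != false) ? 2u : 1u -> 2u // max
//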
//
// Conditional inter-subgroup flip/half compare exchange
//

#if 0

#define HS_CMP_FLIP(i,a,b)                                       \
  {                                                              \
    const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,flip_lane_idx); \
    const HS_KEY_TYPE tb = HS_SUBGROUP_SHUFFLE(b,flip_lane_idx); \
    a = HS_COND_MIN_MAX(t_lt,a,tb);                              \
    b = HS_COND_MIN_MAX(t_lt,b,ta);                              \
  }

#define HS_CMP_HALF(i,a)                                         \
  {                                                              \
    const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE(a,half_lane_idx); \
    a = HS_COND_MIN_MAX(t_lt,a,ta);                              \
  }

#else

#define HS_CMP_FLIP(i,a,b)                                            \
  {                                                                   \
    const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE_XOR(a,flip_lane_mask); \
    const HS_KEY_TYPE tb = HS_SUBGROUP_SHUFFLE_XOR(b,flip_lane_mask); \
    a = HS_COND_MIN_MAX(t_lt,a,tb);                                   \
    b = HS_COND_MIN_MAX(t_lt,b,ta);                                   \
  }

#define HS_CMP_HALF(i,a)                                              \
  {                                                                   \
    const HS_KEY_TYPE ta = HS_SUBGROUP_SHUFFLE_XOR(a,half_lane_mask); \
    a = HS_COND_MIN_MAX(t_lt,a,ta);                                   \
  }

#endif

//
// The device's comparison operator might return what we actually
// want. For example, it appears GEN 'cmp' returns {true:-1,false:0}.
//

#define HS_CMP_IS_ZERO_ONE

#ifdef HS_CMP_IS_ZERO_ONE
// the comparison yields a {true: 1, false: 0} scalar result
// (a <= b) -> { 1, 0 } -> NEGATE -> { 0xFFFFFFFF, 0 }
// note: GLSL has no C-style casts, so use constructor syntax
#define HS_LTE_TO_MASK(a,b) (-HS_KEY_TYPE(a <= b))
#define HS_CMP_TO_MASK(a)   (-HS_KEY_TYPE(a))
#else
// otherwise, the comparison already returns a { -1, 0 } mask
// (a <= b) -> { 0xFFFFFFFF, 0 }
#define HS_LTE_TO_MASK(a,b) (a <= b) // FIXME for uint64
#define HS_CMP_TO_MASK(a)   (a)
#endif

//
// The "flip-merge" and "half-merge" preambles are very similar
//
// For now, we're only using the .y dimension for the span idx
//

#define HS_HM_PREAMBLE(half_span)                                   \
  const uint span_idx    = gl_WorkGroupID.y;                        \
  const uint span_stride = gl_NumWorkGroups.x * gl_WorkGroupSize.x; \
  const uint span_size   = span_stride * half_span * 2;             \
  const uint span_base   = span_idx * span_size;                    \
  const uint span_off    = gl_GlobalInvocationID.x;                 \
  const uint span_l      = span_base + span_off

#define HS_FM_PREAMBLE(half_span)                                   \
  HS_HM_PREAMBLE(half_span);                                        \
  const uint span_r = span_base + span_stride * (half_span + 1) - span_off - 1
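
//
// Worked example with illustrative numbers: for half_span = 2,
// gl_NumWorkGroups.x = 2 and gl_WorkGroupSize.x = 8, HS_FM_PREAMBLE(2)
// computes span_stride = 16 and span_size = 64, so for span_idx = 0
// and span_off = 3:
//
//   span_l = 0 + 3              = 3
//   span_r = 0 + 16 * 3 - 3 - 1 = 44
//
// i.e. span_l and span_r index mirrored elements of the flip merge.
//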
//
//
//

#define HS_XM_GLOBAL_L(stride_idx) \
  vout[span_l + span_stride * stride_idx]

#define HS_XM_GLOBAL_LOAD_L(stride_idx) \
  HS_XM_GLOBAL_L(stride_idx)

#define HS_XM_GLOBAL_STORE_L(stride_idx,reg) \
  HS_XM_GLOBAL_L(stride_idx) = reg

#define HS_FM_GLOBAL_R(stride_idx) \
  vout[span_r + span_stride * stride_idx]

#define HS_FM_GLOBAL_LOAD_R(stride_idx) \
  HS_FM_GLOBAL_R(stride_idx)

#define HS_FM_GLOBAL_STORE_R(stride_idx,reg) \
  HS_FM_GLOBAL_R(stride_idx) = reg

//
// This snarl of macros is for transposing a "slab" of sorted elements
// into linear order.
//
// This can occur as the last step in hs_sort() or via a custom kernel
// that inspects the slab and then transposes and stores it to memory.
//
// The slab format can be inspected more efficiently than a linear
// arrangement.
//
// The prime example is detecting when adjacent keys (in sort order)
// have differing high-order bits ("key changes"). The index of each
// change is recorded to an auxiliary array.
//
// A post-processing step like this needs to be able to navigate the
// slab and eventually transpose and store the slab in linear order.
//

#define HS_TRANSPOSE_REG(prefix,row)  prefix##row
#define HS_TRANSPOSE_DECL(prefix,row) const HS_KEY_TYPE HS_TRANSPOSE_REG(prefix,row)
#define HS_TRANSPOSE_PRED(level)      is_lo_##level

#define HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) \
  prefix_curr##row_ll##_##row_ur

#define HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) \
  const HS_KEY_TYPE HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur)

#define HS_TRANSPOSE_STAGE(level)       \
  const bool HS_TRANSPOSE_PRED(level) = \
    (HS_SUBGROUP_LANE_ID() & (1 << (level-1))) == 0;

#define HS_TRANSPOSE_BLEND(prefix_prev,prefix_curr,level,row_ll,row_ur) \
  HS_TRANSPOSE_TMP_DECL(prefix_curr,row_ll,row_ur) =                    \
    HS_SUBGROUP_SHUFFLE_XOR(HS_TRANSPOSE_PRED(level) ?                  \
                            HS_TRANSPOSE_REG(prefix_prev,row_ll) :      \
                            HS_TRANSPOSE_REG(prefix_prev,row_ur),       \
                            1<<(level-1));                              \
                                                                        \
  HS_TRANSPOSE_DECL(prefix_curr,row_ll) =                               \
    HS_TRANSPOSE_PRED(level) ?                                          \
    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur) :                   \
    HS_TRANSPOSE_REG(prefix_prev,row_ll);                               \
                                                                        \
  HS_TRANSPOSE_DECL(prefix_curr,row_ur) =                               \
    HS_TRANSPOSE_PRED(level) ?                                          \
    HS_TRANSPOSE_REG(prefix_prev,row_ur) :                              \
    HS_TRANSPOSE_TMP_REG(prefix_curr,row_ll,row_ur);

#define HS_TRANSPOSE_REMAP(prefix,row_from,row_to)      \
  vout[gmem_idx + ((row_to-1) << HS_SLAB_WIDTH_LOG2)] = \
    HS_TRANSPOSE_REG(prefix,row_from);

//
//
//

#endif

//
//
//
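
//
// A condensed sketch of how a generated kernel strings these macros
// together (hand-written illustration only -- real kernels are emitted
// by the code generator, and HS_SLAB_THREADS, HS_SLAB_HEIGHT and
// HS_GLSL_SUBGROUP_SIZE() are supplied by the target configuration):
//
#if 0
HS_BS_KERNEL_PROTO(2,1)
{
  HS_SLAB_GLOBAL_PREAMBLE();                   // compute gmem_idx
  HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin,0); // load slab row 0
  HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin,1); // load slab row 1
  HS_CMP_XCHG_V0(r1,r2);                       // one compare-exchange
  HS_SLAB_GLOBAL_STORE(0,r1);                  // store rows back
  HS_SLAB_GLOBAL_STORE(1,r2);
}
#endif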