1 /* 2 * Copyright 2018 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can 5 * be found in the LICENSE file. 6 * 7 */ 8 9 // 10 // NOTE THAT THE SEGMENT TTCK KERNEL IS ENTIRELY DEPENDENT ON THE 11 // LAYOUT OF THE TTCK KEY. IF THE TTCK KEY IS ALTERED THEN THIS 12 // KERNEL WILL NEED TO BE UPDATED 13 // 14 15 #include "tile.h" 16 #include "atomic_cl.h" 17 #include "kernel_cl_12.h" 18 #include "hs/cl/intel/gen8/u64/hs_config.h" 19 #include "hs/cl/intel/hs_cl_macros.h" 20 21 // 22 // 23 // 24 25 #define HS_LANE_MASK (HS_SLAB_WIDTH - 1) 26 27 // 28 // 29 // 30 31 #define SKC_YX_NEQ(row,prev) \ 32 (((as_uint2(r##row).hi ^ as_uint2(r##prev).hi) & SKC_TTCK_HI_MASK_YX) != 0) 33 34 // 35 // 36 // 37 38 __kernel 39 __attribute__((intel_reqd_sub_group_size(HS_SLAB_WIDTH))) 40 void 41 skc_kernel_segment_ttck(__global HS_KEY_TYPE * SKC_RESTRICT const vout, 42 __global uint * SKC_RESTRICT const indices, 43 __global SKC_ATOMIC_UINT volatile * SKC_RESTRICT const atomics) 44 { 45 uint const global_id = get_global_id(0); 46 uint const gmem_base = (global_id >> HS_SLAB_WIDTH_LOG2) * HS_SLAB_KEYS; 47 uint const gmem_idx = gmem_base + (global_id & HS_LANE_MASK); 48 uint const lane_idx = gmem_base + (global_id & HS_LANE_MASK) * HS_SLAB_HEIGHT; 49 50 // 51 // LOAD ALL THE ROWS 52 // 53 #undef HS_SLAB_ROW 54 #define HS_SLAB_ROW(row,prev) \ 55 HS_KEY_TYPE const r##row = (vout + gmem_idx)[prev * HS_SLAB_WIDTH]; 56 57 HS_SLAB_ROWS(); 58 59 // 60 // LOAD LAST REGISTER FROM COLUMN TO LEFT 61 // 62 uint diffs = 0; 63 uint2 r0 = r1; 64 65 if (gmem_base > 0) { 66 // if this is the first key in any slab but the first then it 67 // broadcast loads the last key in previous slab 68 r0.hi = as_uint2(vout[gmem_base - 1]).hi; 69 } else if (get_sub_group_local_id() == 0) { 70 // if this is the first lane in the first slab 71 diffs = 1; 72 } 73 74 // now shuffle in the last key from the column to the left 75 r0.hi = intel_sub_group_shuffle_up(r0.hi,as_uint2(HS_REG_LAST(r)).hi,1); 76 77 // 78 // FIND ALL DIFFERENCES IN SLAB 79 // 80 uint valid = 0; 81 82 #undef HS_SLAB_ROW 83 #define HS_SLAB_ROW(row,prev) \ 84 valid |= ((r##row != SKC_ULONG_MAX) << prev); 85 86 HS_SLAB_ROWS(); 87 88 #undef HS_SLAB_ROW 89 #define HS_SLAB_ROW(row,prev) \ 90 diffs |= (SKC_YX_NEQ(row,prev) << prev); 91 92 HS_SLAB_ROWS(); 93 94 // 95 // SUM UP THE DIFFERENCES 96 // 97 uint const valid_diffs = valid & diffs; 98 uint const count = popcount(valid_diffs); 99 uint const inclusive = sub_group_scan_inclusive_add(count); 100 uint const exclusive = inclusive - count; 101 102 // 103 // RESERVE SPACE IN THE INDICES ARRAY 104 // 105 uint next = 0; 106 107 if (get_sub_group_local_id() == HS_SLAB_WIDTH-1) 108 next = atomic_add(atomics+1,inclusive); // FIXME -- need a symbolic offset 109 110 // distribute base across subgroup 111 next = exclusive + sub_group_broadcast(next,HS_SLAB_WIDTH-1); 112 113 // 114 // STORE THE INDICES 115 // 116 #undef HS_SLAB_ROW 117 #define HS_SLAB_ROW(row,prev) \ 118 if (valid_diffs & (1 << prev)) \ 119 indices[next++] = lane_idx + prev; 120 121 HS_SLAB_ROWS(); 122 123 // 124 // TRANSPOSE THE SLAB AND STORE IT 125 // 126 HS_TRANSPOSE_SLAB(); 127 } 128 129 // 130 // 131 // 132