; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; FIXME: Fails with -enable-var-scope

; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
;
; Each test computes (x >> C) & mask on an i64 load and checks that the
; DAG combiner narrows it to a 32-bit extract of whichever dword holds
; the requested bits, storing a constant-zero high result dword.
; NOTE(review): some kernels use workgroup.id.x and others workitem.id.x
; for the per-lane index — presumably intentional variation; confirm.

; Extract the high bit of the low half
; GCN-LABEL: {{^}}v_uextract_bit_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  ; Bit 31 lives entirely in the low dword: only the low half is loaded
  ; (no offset) and a plain 32-bit shift suffices — no bfe needed since
  ; the shift already leaves just the one bit.
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Extract the high bit of the high half
; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  ; Bit 63 is bit 31 of the high dword: only the high half is loaded
  ; (offset:4) and shifted by 31.
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 63
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Extract bit 1 of the low half: narrows to a 32-bit bfe on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_1_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Extract bit 20 of the low half: bfe offset 20, width 1.
; GCN-LABEL: {{^}}v_uextract_bit_20_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Extract bit 32 (bit 0 of the high half): shift folds away entirely,
; leaving just an AND with 1 of the high dword (offset:4).
; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 32
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Extract bit 33 (bit 1 of the high half): 32-bit bfe on the high dword.
; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Multi-bit field [20,21] within the low half: bfe offset 20, width 2.
; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 3
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; 30-bit field [1,30] within the low half: bfe offset 1, width 30.
; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 1073741823   ; 0x3fffffff: bits [1,30]
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Field [1,31] covers the rest of the low dword, so the mask is redundant
; after the shift and a plain 32-bit shift is enough (no bfe).
; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 2147483647   ; 0x7fffffff: bits [1,31]
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Spans the dword boundary, so requires full shift.
; Truncated after the shift, so only low shift result is used.
; Field [31,32] spans the dword boundary: needs both dwords and a funnel
; shift (v_alignbit_b32) to form the low result, then a 2-bit mask.
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64:
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %ld.gep.placeholder.never ; (see note)
  %srl = lshr i64 %ld.64, 31
  %bit = and i64 %srl, 3
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Entirely within the high half: bfe offset 1, width 2 on the high dword.
; NOTE(review): shift amount is 33, so this extracts bits [33,34]; the
; name suggests [32,33] — the CHECK lines are consistent with the code
; as written, but confirm the intended field against upstream history.
; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 3
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Field [30,59] spans the boundary: alignbit then a 30-bit mask.
; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 30
  %bit = and i64 %srl, 1073741823   ; 0x3fffffff
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Field [33,62] is entirely within the high half: 32-bit bfe on it.
; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 1073741823
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Field [31,62] masked to 32 bits: the mask is absorbed by the alignbit
; result, so no v_and is expected.
; NOTE(review): stores to %out rather than %out.gep (%out.gep is unused)
; — matches the CHECK lines as-is; confirm this is intentional.
; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %and = and i64 %srl, 4294967295
  store i64 %and, i64 addrspace(1)* %out
  ret void
}

; trunc applied before and mask
; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword v[[SHIFT]]
define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  ; Result is truncated to i32 before masking; still reduces to a single
  ; 32-bit shift of the low dword.
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}

; As above but bit 3, so a bfe is needed after the trunc.
; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 3
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  ; Bit 33 with trunc-to-i32 result: only the high dword is loaded and a
  ; 32-bit bfe extracts bit 1 of it.
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}

; Boundary-spanning field with i32 result: alignbit + mask, nothing else.
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32:
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
; GCN-NOT: v[[SHRLO]]
; GCN: buffer_store_dword v[[SHRLO]]
define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 3
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}

; Mask (4) is not a low-bit run, so this must stay shift + and rather
; than become a bfe.
; GCN-LABEL: {{^}}and_not_mask_i64:
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 4
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; The instruction count is the same with/without hasOneUse, but
; keeping the 32-bit and has a smaller encoding size than the bfe.

; Shift result has a second (volatile-stored) use, so the full 64-bit
; shift must be kept; the extract reuses its low half with an and.
; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 27
  %bit = and i64 %srl, 3
  store volatile i64 %srl, i64 addrspace(1)* %out
  store volatile i64 %bit, i64 addrspace(1)* %out
  ret void
}

; Multi-use shift of >=32: both uses narrow to the high dword, so one
; load feeds both a 32-bit shift (for the raw shift) and a bfe.
; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 34
  %bit = and i64 %srl, 7
  store volatile i64 %srl, i64 addrspace(1)* %out
  store volatile i64 %bit, i64 addrspace(1)* %out
  ret void
}

; The shift's upper half is also used (it is known zero), so the extract
; still narrows to a bfe and the stored high word is the zero register.
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
; GCN: buffer_store_dword v[[ZERO]]
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x
  %out1.gep = getelementptr i32, i32 addrspace(1)* %out1, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 7
  store volatile i64 %bit, i64 addrspace(1)* %out0.gep

  ; Second use: the (always-zero) upper half of the 64-bit shift result.
  %srl.srl32 = lshr i64 %srl, 32
  %srl.hi = trunc i64 %srl.srl32 to i32
  store volatile i32 %srl.hi, i32 addrspace(1)* %out1.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

declare i32 @llvm.amdgcn.workgroup.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }