; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]

; At most 2 digits. Make sure src_shared_base is not counted as a high
; number SGPR.

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}
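
; The checks above encode the generic lowering of a group-to-flat cast: the
; 32-bit group pointer is compared against the group null value (-1), and the
; result selects either the flat null pointer or a 64-bit address whose high
; half is the shared aperture base (read from the queue pointer on CI, from
; HW_REG_SH_MEM_BASES on GFX9) and whose low half is the original pointer.
; A rough IR-level sketch of that select, for illustration only (the names
; below are hypothetical; the backend emits the v_cmp/v_cndmask sequence
; checked above rather than this IR):
;   %src     = ptrtoint i32 addrspace(3)* %ptr to i32
;   %is.null = icmp eq i32 %src, -1
;   %lo      = zext i32 %src to i64
;   %full    = or i64 %aperture.base, %lo        ; aperture.base = shared aperture << 32
;   %flat    = select i1 %is.null, i64 0, i64 %full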

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]

; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base

; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
  %ld = load volatile i32, i32* %stof
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}
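
; The reverse direction checked above needs no aperture: a flat-to-group cast
; truncates the 64-bit flat pointer to its low 32 bits and maps the flat null
; pointer to the group null value (-1). A rough IR-level sketch of what the
; v_cmp_ne_u64/v_cndmask pair implements, for illustration only (hypothetical
; names, not IR the backend produces):
;   %src     = ptrtoint i32* %ptr to i64
;   %is.null = icmp eq i64 %src, 0
;   %lo      = trunc i64 %src to i32
;   %group   = select i1 %is.null, i32 -1, i32 %lo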

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
  store volatile i32 0, i32 addrspace(5)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
  load volatile i32, i32 addrspace(4)* %ftos
  ret void
}

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}
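
; Taken together, the constant casts above show how the segment null values
; fold (a summary derived from the stores checked above):
;   group 0 (IR null)        -> flat : LO = 0, HI = shared aperture base
;   group -1 (segment null)  -> flat : folds to the flat null pointer (0)
;   flat 0 (IR null)         -> group: folds to the segment null value (-1)
;   flat -1                  -> group: folds to -1 (the low 32 bits)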

; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.

; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: {{flat|global}}_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %local, label %global

local:
  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32*
  br label %end

global:
  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32*
  br label %end

end:
  %fptr = phi i32* [ %flat_local, %local ], [ %flat_global, %global ]
  store volatile i32 %x, i32* %fptr, align 4
;  %val = load i32, i32* %fptr, align 4
;  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Check for prologue initializing special SGPRs pointing to scratch.
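; Concretely, the prologue expected below is (restating the checks; the
; specific SGPR numbers are a property of how this kernel's preloaded
; arguments happen to be assigned):
;   CI:    flat_scratch_lo = s9
;          flat_scratch_hi = (s8 + s11) >> 8
;   GFX9:  {flat_scratch_hi, flat_scratch_lo} = {s7, s6} + s9   (64-bit add)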
; HSA-LABEL: {{^}}store_flat_scratch:
; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8

; GFX9: s_add_u32 flat_scratch_lo, s6, s9
; GFX9: s_addc_u32 flat_scratch_hi, s7, 0

; HSA: {{flat|global}}_store_dword
; HSA: s_barrier
; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4, addrspace(5)
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32 addrspace(5)* %alloca, i32 %x
  %fptr = addrspacecast i32 addrspace(5)* %pptr to i32*
  store volatile i32 %x, i32* %fptr
  ; Dummy call
  call void @llvm.amdgcn.s.barrier() #1
  %reload = load volatile i32, i32* %fptr, align 4
  store volatile i32 %reload, i32 addrspace(1)* %out, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind readnone }