; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
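
; A flat (addrspace(4)) pointer is a full 64-bit address, while group and
; private pointers are 32-bit offsets into their segment apertures. The
; expansion the checks below expect builds the flat pointer from the aperture
; base, loaded via s[4:5] (hence the enable_sgpr_queue_ptr = 1 requirement),
; and maps the segment's invalid pointer (-1) to the flat null pointer.
; Roughly, for group -> flat (private -> flat differs only in the aperture
; offset, 0x11 instead of 0x10):
;
;   vcc = (ptr != -1)
;   hi  = vcc ? aperture : 0
;   lo  = vcc ? ptr      : 0
;   store 7 to the flat address v[lo:hi]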

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1

; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}

; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]

; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1

; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}

; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]

; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
  %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}

; No-op cast: global pointers are already valid flat addresses, so no
; aperture (and therefore no queue pointer) is needed.
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}

; No-op cast: constant pointers are likewise already valid flat addresses.
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
  %ld = load volatile i32, i32 addrspace(4)* %stof
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
  store volatile i32 0, i32* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
  %ld = load volatile i32, i32 addrspace(2)* %ftos
  ret void
}

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
  store i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_neg1_group_to_flat_addrspacecast() #0 {
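  ; -1 is the invalid group pointer, so the cast should fold to the flat null
  ; pointer; the checks above expect both halves of the store address to be 0.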
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
  store i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
define void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* null to i32*
  store i32 7, i32* %cast
  ret void
}

; Disable optimizations in case optimizations are added later that would
; specialize away the generic pointer accesses.

; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %local, label %global

local:
  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
  br label %end

global:
  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
  br label %end

end:
  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
  store i32 %x, i32 addrspace(4)* %fptr, align 4
;  %val = load i32, i32 addrspace(4)* %fptr, align 4
;  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Check for prologue initializing special SGPRs pointing to scratch.
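; Specifically, the checks expect flat_scratch_lo to be copied from s9 and
; flat_scratch_hi to be computed as (s8 + s11) >> 8, presumably the scratch
; base plus the wave's scratch offset, in the 256-byte units the register's
; high half is encoded in.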
; HSA-LABEL: {{^}}store_flat_scratch:
; HSA-DAG: s_mov_b32 flat_scratch_lo, s9
; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
; HSA: flat_store_dword
; HSA: s_barrier
; HSA: flat_load_dword
define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32* %alloca, i32 %x
  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
  store i32 %x, i32 addrspace(4)* %fptr
  ; Dummy call
  call void @llvm.amdgcn.s.barrier() #1
  %reload = load i32, i32 addrspace(4)* %fptr, align 4
  store i32 %reload, i32 addrspace(1)* %out, align 4
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind readnone }