; Tests that the SI load/store optimizer forms ds_read2st64_b32/b64 for pairs
; of LDS loads whose offsets differ by a multiple of 64 elements, and that it
; correctly declines to do so for misaligned, odd-stride, or out-of-range cases.
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8


; GCN-LABEL: @simple_read2st64_f32_0_1
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; CI: buffer_store_dword [[RESULT]]
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; GCN-LABEL: @simple_read2st64_f32_1_2
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; CI: buffer_store_dword [[RESULT]]
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 128
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; GCN-LABEL: @simple_read2st64_f32_max_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; CI: buffer_store_dword [[RESULT]]
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 16320
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; GCN-LABEL: @simple_read2st64_f32_over_max_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b32
; GCN-DAG: v_add_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], {{(vcc, )?}}0x10000, {{v[0-9]+}}
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 16384
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; GCN-LABEL: @odd_invalid_read2st64_f32_0
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b32
; GCN: s_endpgm
define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 63
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; GCN-LABEL: @odd_invalid_read2st64_f32_1
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b32
; GCN: s_endpgm
define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 127
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}

; GCN-LABEL: @simple_read2st64_f64_0_1
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; CI: buffer_store_dwordx2 [[RESULT]]
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

; GCN-LABEL: @simple_read2st64_f64_1_2
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}

; CI: buffer_store_dwordx2 [[RESULT]]
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 128
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

; Alignment only

; GCN-LABEL: @misaligned_read2st64_f64
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
; GCN: s_endpgm
define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 4
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 4
  ret void
}

; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
; GCN-LABEL: @simple_read2st64_f64_max_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
; GCN: s_waitcnt lgkmcnt(0)
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}

; CI: buffer_store_dwordx2 [[RESULT]]
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 256
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8128
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

; GCN-LABEL: @simple_read2st64_f64_over_max_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b64
; GCN-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
; GCN-DAG: v_add_{{i|u}}32_e32 [[BIGADD:v[0-9]+]], {{(vcc, )?}}0x10000, {{v[0-9]+}}
; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
; GCN: s_endpgm
define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8192
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

; GCN-LABEL: @invalid_read2st64_f64_odd_offset
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b64
; GCN: s_endpgm
define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8129
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}

; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
; stride in elements, not bytes, is a multiple of 64.

; GCN-LABEL: @byte_size_only_divisible_64_read2_f64
; CI: s_mov_b32 m0
; GFX9-NOT: m0

; GCN-NOT: ds_read2st64_b64
; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
; GCN: s_endpgm
define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }