; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s

; Tests for combining pairs of LDS (addrspace(3)) stores into ds_write2
; instructions when the load-store-opt feature is enabled. Each test has one
; work-item store values at two (or more) LDS locations and checks whether
; the stores are emitted as a single ds_write2_* or as separate ds_write_*.

@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8


; The same loaded float is stored to two slots 8 elements apart.
; SI-LABEL: @simple_write2_one_val_f32
; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %val = load float, float addrspace(1)* %in.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Two different loaded floats are stored to slots 8 elements apart.
; SI-LABEL: @simple_write2_two_val_f32
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The first store is volatile; the checks expect two separate ds_write_b32.
; SI-LABEL: @simple_write2_two_val_f32_volatile_0
; SI-NOT: ds_write2_b32
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The second store is volatile; the checks expect two separate ds_write_b32.
; SI-LABEL: @simple_write2_two_val_f32_volatile_1
; SI-NOT: ds_write2_b32
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
; SI: s_endpgm
define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; 2 data subregisters from different super registers.
; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
  %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
  %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
  %val0.0 = extractelement <2 x float> %val0, i32 0
  %val1.1 = extractelement <2 x float> %val1, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Both stored values are elements of the same loaded <2 x float>.
; SI-LABEL: @simple_write2_two_val_subreg2_f32
; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
  %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
  %val0 = extractelement <2 x float> %val, i32 0
  %val1 = extractelement <2 x float> %val, i32 1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The stored values are elements 0 and 3 of the same loaded <4 x float>.
; SI-LABEL: @simple_write2_two_val_subreg4_f32
; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
  %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
  %val0 = extractelement <4 x float> %val, i32 0
  %val1 = extractelement <4 x float> %val, i32 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The second slot is 255 elements past the first; the checks still expect a
; single ds_write2_b32, with offset1:255.
; SI-LABEL: @simple_write2_two_val_max_offset_f32
; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
; SI: s_endpgm
define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %val0 = load float, float addrspace(1)* %in.gep.0, align 4
  %val1 = load float, float addrspace(1)* %in.gep.1, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 255
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; The second slot is 257 elements (1028 bytes) past the first; the checks
; expect two separate ds_write_b32 instructions.
; SI-LABEL: @simple_write2_two_val_too_far_f32
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
; SI: s_endpgm
define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  store float %val0, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 257
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  store float %val1, float addrspace(3)* %arrayidx1, align 4
  ret void
}

; Four stores at element offsets 0, 8, 11 and 27 from the same base; the
; checks expect two back-to-back ds_write2_b32 sharing the base register.
; SI-LABEL: @simple_write2_two_val_f32_x2
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 0
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4

  ret void
}

; Same as the previous test, but the first store is at element offset 3
; instead of 0, so the first ds_write2_b32 carries a nonzero offset0.
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  %idx.0 = add nsw i32 %tid.x, 3
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
  store float %val0, float addrspace(3)* %arrayidx0, align 4

  %idx.1 = add nsw i32 %tid.x, 8
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
  store float %val1, float addrspace(3)* %arrayidx1, align 4

  %idx.2 = add nsw i32 %tid.x, 11
  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
  store float %val0, float addrspace(3)* %arrayidx2, align 4

  %idx.3 = add nsw i32 %tid.x, 27
  %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
  store float %val1, float addrspace(3)* %arrayidx3, align 4

  ret void
}

; The two LDS pointers are the elements of a vector-of-pointers argument;
; the checks expect two separate ds_write_b32 instructions.
; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
; SI-NOT: ds_write2_b32
; SI: ds_write_b32
; SI: ds_write_b32
; SI: s_endpgm
define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
  %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
  %val0 = load float, float addrspace(1)* %in0.gep, align 4
  %val1 = load float, float addrspace(1)* %in1.gep, align 4

  ; NOTE(review): both insertelement instructions write lane 0, so lane 1 of
  ; %index.1 stays undef. Left unchanged to keep the IR under test identical;
  ; confirm whether the second insert was meant to target lane 1.
  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1

  ; Apply an additional offset after the vector that will be more obviously folded.
  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
  store float %val0, float addrspace(3)* %gep.0, align 4

  ; %add.x has no users in this function.
  %add.x = add nsw i32 %x.i, 8
  store float %val1, float addrspace(3)* %gep.1.offset, align 4
  ret void
}

; The same loaded double is stored to two slots 8 elements apart.
; SI-LABEL: @simple_write2_one_val_f64
; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
; SI: s_endpgm
define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 8
  ret void
}

; Double stores with only 4-byte alignment; the checks expect each one to be
; emitted as a ds_write2_b32 of its two dword halves.
; SI-LABEL: @misaligned_simple_write2_one_val_f64
; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
; SI: s_endpgm
define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %val = load double, double addrspace(1)* %in.gep, align 8
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  store double %val, double addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 7
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  store double %val, double addrspace(3)* %arrayidx1, align 4
  ret void
}

; Two different loaded doubles are stored to slots 8 elements apart.
; SI-LABEL: @simple_write2_two_val_f64
; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
; SI: s_endpgm
define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
  %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
  %val0 = load double, double addrspace(1)* %in.gep.0, align 8
  %val1 = load double, double addrspace(1)* %in.gep.1, align 8
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  store double %val0, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  store double %val1, double addrspace(3)* %arrayidx1, align 8
  ret void
}

@foo = addrspace(3) global [4 x i32] undef, align 4

; Constant stores to elements 0 and 1 of @foo.
; SI-LABEL: @store_constant_adjacent_offsets
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
define void @store_constant_adjacent_offsets() {
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
  ret void
}

; Constant stores to elements 0 and 2 of @foo.
; SI-LABEL: @store_constant_disjoint_offsets
; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
define void @store_constant_disjoint_offsets() {
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
  ret void
}

@bar = addrspace(3) global [4 x i64] undef, align 4

; 4-byte-aligned i64 constant stores to elements 0 and 1 of @bar.
; SI-LABEL: @store_misaligned64_constant_offsets
; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
; SI: s_endpgm
define void @store_misaligned64_constant_offsets() {
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
  ret void
}

@bar.large = addrspace(3) global [4096 x i64] undef, align 4

; 4-byte-aligned i64 constant stores to elements 2048 and 4095 of @bar.large;
; the checks expect each store to use its own materialized base address.
; SI-LABEL: @store_misaligned64_constant_large_offsets
; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; SI: s_endpgm
define void @store_misaligned64_constant_large_offsets() {
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
  ret void
}

@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4

; Ten float stores: indices x, x+1, x+16, x+17 into @sgemm.lA and indices
; y, y+1, y+32, y+33, y+64, y+65 into @sgemm.lB.
; NOTE(review): this function previously had no check lines at all. The checks
; below assume the ten stores pair up into five ds_write2_b32 instructions;
; confirm against llc output.
; SI-LABEL: @write2_sgemm_sequence
; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: ds_write2_b32
; SI: s_endpgm
define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
  %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
  %val = load float, float addrspace(1)* %in
  %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
  store float %val, float addrspace(3)* %arrayidx44, align 4
  %add47 = add nsw i32 %x.i, 1
  %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
  store float %val, float addrspace(3)* %arrayidx48, align 4
  %add51 = add nsw i32 %x.i, 16
  %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
  store float %val, float addrspace(3)* %arrayidx52, align 4
  %add55 = add nsw i32 %x.i, 17
  %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
  store float %val, float addrspace(3)* %arrayidx56, align 4
  %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
  store float %val, float addrspace(3)* %arrayidx60, align 4
  %add63 = add nsw i32 %y.i, 1
  %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
  store float %val, float addrspace(3)* %arrayidx64, align 4
  %add67 = add nsw i32 %y.i, 32
  %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
  store float %val, float addrspace(3)* %arrayidx68, align 4
  %add71 = add nsw i32 %y.i, 33
  %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
  store float %val, float addrspace(3)* %arrayidx72, align 4
  %add75 = add nsw i32 %y.i, 64
  %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
  store float %val, float addrspace(3)* %arrayidx76, align 4
  %add79 = add nsw i32 %y.i, 65
  %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
  store float %val, float addrspace(3)* %arrayidx80, align 4
  ret void
}

; A 4-byte-aligned <4 x float> store to LDS.
; NOTE(review): these checks were previously written under a prefix that no
; run line enables, so FileCheck never evaluated them. They now use the
; file's active prefix; confirm the expected offsets against llc output.
; SI-LABEL: {{^}}simple_write2_v4f32_superreg_align4:
; SI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
; SI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
; SI: s_endpgm
define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
  %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i
  store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.y() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1

; Function Attrs: convergent nounwind
declare void @llvm.AMDGPU.barrier.local() #2

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { convergent nounwind }