; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s

; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s

; The invocations above cover the default subtarget and bonaire (devices with
; different unaligned access restrictions), each with and without
; -combiner-alias-analysis. Expectations that only hold for one of the two
; alias-analysis settings use the GCN-NOAA / GCN-AA prefixes.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; Two constant i8 stores to adjacent global addresses, the lower one declared
; 2-byte aligned. The checks expect two separate byte stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  ; 0xC8, spelled as an in-range signed i8 literal.
  store i8 -56, i8 addrspace(1)* %out, align 2
  ret void
}

; Same as above but with no explicit alignment on either store. The checks
; again expect two separate byte stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  ; 0xC8, spelled as an in-range signed i8 literal.
  store i8 -56, i8 addrspace(1)* %out
  ret void
}

; Two constant i16 stores to adjacent global addresses, the lower one declared
; 4-byte aligned. The check expects a single dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
; Two zero-valued i16 stores to adjacent global addresses, the lower one
; declared 4-byte aligned. The check above expects a single dword store.
define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %dst1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %dst1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; Adjacent constant i16 stores with no explicit alignment. The checks expect
; two separate short stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %dst1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %dst1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; Adjacent constant i32 stores, higher address written first. The checks
; expect 456 (0x1c8) and 123 (0x7b) in the low/high registers of one dwordx2
; store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %dst1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed element types: a float constant goes to element 1 (through a bitcast
; pointer) and an i32 constant to element 0. A single dwordx2 store is
; expected.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst1.cast = bitcast i32 addrspace(1)* %dst1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %dst1.cast
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mirror image of the previous test: an i32 constant goes to element 1
; (through a bitcast pointer) and a float constant to element 0. The checks
; expect 4.0 and 0x7b in the low/high registers of one dwordx2 store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %dst1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst1.cast = bitcast float addrspace(1)* %dst1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %dst1.cast
  store float 4.0, float addrspace(1)* %out
  ret void
}

; Four adjacent constant i32 stores, element 0 written last. The checks expect
; one dwordx4 store whose first register holds 1234 (0x4d2) and whose last
; register holds 333 (0x14d).
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %dst1
  store i32 456, i32 addrspace(1)* %dst2
  store i32 333, i32 addrspace(1)* %dst3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Four adjacent constant float stores issued in ascending address order. A
; single dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %dst1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst2 = getelementptr float, float addrspace(1)* %out, i32 2
  %dst3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %dst1
  store float 2.0, float addrspace(1)* %dst2
  store float 4.0, float addrspace(1)* %dst3
  ret void
}

; The store to the lowest address is issued last, i.e. out of address order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %dst1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst2 = getelementptr float, float addrspace(1)* %out, i32 2
  %dst3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %dst1
  store float 2.0, float addrspace(1)* %dst2
  store float 4.0, float addrspace(1)* %dst3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Alternating i32 / float constants across four adjacent elements. Without
; combiner alias analysis four dword stores are expected; with it, one
; dwordx2 store plus two dword stores.
; FIXME: Should be able to merge this
; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v

; GCN-AA: buffer_store_dwordx2
; GCN-AA: buffer_store_dword v
; GCN-AA: buffer_store_dword v

; GCN: s_endpgm
define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %dst1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst2 = getelementptr float, float addrspace(1)* %out, i32 2
  %dst3 = getelementptr float, float addrspace(1)* %out, i32 3

  %dst1.cast = bitcast float addrspace(1)* %dst1 to i32 addrspace(1)*
  %dst3.cast = bitcast float addrspace(1)* %dst3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %dst1.cast
  store float 2.0, float addrspace(1)* %dst2
  store i32 17, i32 addrspace(1)* %dst3.cast
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Three adjacent constant i32 stores. One dwordx2 store and one dword store
; are expected, in either order, and nothing more.
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %dst1
  store i32 456, i32 addrspace(1)* %dst2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Two adjacent constant i64 stores. A single dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %dst1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %dst1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; Four adjacent constant i64 stores. Two dwordx4 stores are expected.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %dst1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %dst2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %dst3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %dst1
  store i64 456, i64 addrspace(1)* %dst2
  store i64 333, i64 addrspace(1)* %dst3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; Copy of two adjacent i32 elements from %in to %out in matching order. One
; dwordx2 load feeding one dwordx2 store is expected.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %src1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %elt0 = load i32, i32 addrspace(1)* %in
  %elt1 = load i32, i32 addrspace(1)* %src1

  store i32 %elt0, i32 addrspace(1)* %out
  store i32 %elt1, i32 addrspace(1)* %dst1
  ret void
}

; Same copy, but both the source and the destination start at element 2. The
; checks expect the merged load and store to carry offset:8.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %src0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %src1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %dst0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %elt0 = load i32, i32 addrspace(1)* %src0
  %elt1 = load i32, i32 addrspace(1)* %src1

  store i32 %elt0, i32 addrspace(1)* %dst0
  store i32 %elt1, i32 addrspace(1)* %dst1
  ret void
}

; The two loaded elements are stored back swapped. The checks expect the
; loads and stores to stay as individual dword accesses.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %src1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %elt0 = load i32, i32 addrspace(1)* %in
  %elt1 = load i32, i32 addrspace(1)* %src1

  store i32 %elt1, i32 addrspace(1)* %out
  store i32 %elt0, i32 addrspace(1)* %dst1
  ret void
}

; Copy of four adjacent i32 elements in matching order. One dwordx4 load
; feeding one dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %src1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %src2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %src3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %elt0 = load i32, i32 addrspace(1)* %in
  %elt1 = load i32, i32 addrspace(1)* %src1
  %elt2 = load i32, i32 addrspace(1)* %src2
  %elt3 = load i32, i32 addrspace(1)* %src3

  store i32 %elt0, i32 addrspace(1)* %out
  store i32 %elt1, i32 addrspace(1)* %dst1
  store i32 %elt2, i32 addrspace(1)* %dst2
  store i32 %elt3, i32 addrspace(1)* %dst3
  ret void
}

; Copy of three adjacent i32 elements. A dwordx2 plus a dword access is
; expected on both the load side and the store side.
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %src1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %src2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %elt0 = load i32, i32 addrspace(1)* %in
  %elt1 = load i32, i32 addrspace(1)* %src1
  %elt2 = load i32, i32 addrspace(1)* %src2

  store i32 %elt0, i32 addrspace(1)* %out
  store i32 %elt1, i32 addrspace(1)* %dst1
  store i32 %elt2, i32 addrspace(1)* %dst2
  ret void
}

; Float variant of the four-element copy. One dwordx4 load feeding one
; dwordx4 store is expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %dst1 = getelementptr float, float addrspace(1)* %out, i32 1
  %dst2 = getelementptr float, float addrspace(1)* %out, i32 2
  %dst3 = getelementptr float, float addrspace(1)* %out, i32 3
  %src1 = getelementptr float, float addrspace(1)* %in, i32 1
  %src2 = getelementptr float, float addrspace(1)* %in, i32 2
  %src3 = getelementptr float, float addrspace(1)* %in, i32 3

  %elt0 = load float, float addrspace(1)* %in
  %elt1 = load float, float addrspace(1)* %src1
  %elt2 = load float, float addrspace(1)* %src2
  %elt3 = load float, float addrspace(1)* %src3

  store float %elt0, float addrspace(1)* %out
  store float %elt1, float addrspace(1)* %dst1
  store float %elt2, float addrspace(1)* %dst2
  store float %elt3, float addrspace(1)* %dst3
  ret void
}

; Four-element copy reading elements 11..14 of %in and writing elements 7..10
; of %out. The checks expect offset:44 on the merged load and offset:28 on the
; merged store.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %src0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %src1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %src2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %src3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %dst0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %dst2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %dst3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %elt0 = load i32, i32 addrspace(1)* %src0
  %elt1 = load i32, i32 addrspace(1)* %src1
  %elt2 = load i32, i32 addrspace(1)* %src2
  %elt3 = load i32, i32 addrspace(1)* %src3

  store i32 %elt0, i32 addrspace(1)* %dst0
  store i32 %elt1, i32 addrspace(1)* %dst1
  store i32 %elt2, i32 addrspace(1)* %dst2
  store i32 %elt3, i32 addrspace(1)* %dst3
  ret void
}

; Four-element copy where the stores are issued in descending address order
; after a barrier. Each element still lands at its own index, and one dwordx4
; load, the barrier, then one dwordx4 store are expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %src1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %src2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %src3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %elt0 = load i32, i32 addrspace(1)* %in
  %elt1 = load i32, i32 addrspace(1)* %src1
  %elt2 = load i32, i32 addrspace(1)* %src2
  %elt3 = load i32, i32 addrspace(1)* %src3

  ; The barrier between the loads and the stores must not prevent the merge.
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %elt3, i32 addrspace(1)* %dst3
  store i32 %elt2, i32 addrspace(1)* %dst2
  store i32 %elt1, i32 addrspace(1)* %dst1
  store i32 %elt0, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; Four loaded elements are written back in reversed element order after a
; barrier. The checks expect four dword loads, the barrier, then four dword
; stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: s_barrier
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %dst1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %dst2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %dst3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %src1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %src2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %src3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %elt0 = load i32, i32 addrspace(1)* %in
  %elt1 = load i32, i32 addrspace(1)* %src1
  %elt2 = load i32, i32 addrspace(1)* %src2
  %elt3 = load i32, i32 addrspace(1)* %src3

  ; Barrier placed between the loads and the stores.
  tail call void @llvm.AMDGPU.barrier.local() #1

  store i32 %elt3, i32 addrspace(1)* %out
  store i32 %elt2, i32 addrspace(1)* %dst1
  store i32 %elt1, i32 addrspace(1)* %dst2
  store i32 %elt0, i32 addrspace(1)* %dst3

  ret void
}

; Copy of four adjacent i8 elements where the first load and the first store
; are declared 4-byte aligned. One dword load feeding one dword store is
; expected.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %dst1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %dst2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %dst3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %src1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %src2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %src3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %elt0 = load i8, i8 addrspace(1)* %in, align 4
  %elt1 = load i8, i8 addrspace(1)* %src1
  %elt2 = load i8, i8 addrspace(1)* %src2
  %elt3 = load i8, i8 addrspace(1)* %src3

  store i8 %elt0, i8 addrspace(1)* %out, align 4
  store i8 %elt1, i8 addrspace(1)* %dst1
  store i8 %elt2, i8 addrspace(1)* %dst2
  store i8 %elt3, i8 addrspace(1)* %dst3
  ret void
}

; Same i8 copy with no explicit alignment anywhere. The checks expect four
; byte loads and four byte stores.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %dst1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %dst2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %dst3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %src1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %src2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %src3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %elt0 = load i8, i8 addrspace(1)* %in
  %elt1 = load i8, i8 addrspace(1)* %src1
  %elt2 = load i8, i8 addrspace(1)* %src2
  %elt3 = load i8, i8 addrspace(1)* %src3

  store i8 %elt0, i8 addrspace(1)* %out
  store i8 %elt1, i8 addrspace(1)* %dst1
  store i8 %elt2, i8 addrspace(1)* %dst2
  store i8 %elt3, i8 addrspace(1)* %dst3
  ret void
}

; This works once AA is enabled on the subtarget
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]

; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v
; GCN-NOAA: buffer_store_dword v

; GCN-AA: buffer_store_dwordx4 [[LOAD]]

; GCN: s_endpgm
; Loads one <4 x i32>, extracts each lane and stores the lanes to four
; adjacent i32 slots in lane order.
define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Local (addrspace 3) variant of the two-constant i8 test. The checks expect
; two separate byte writes.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  ; 0xC8, spelled as an in-range signed i8 literal.
  store i8 -56, i8 addrspace(3)* %out, align 2
  ret void
}

; Two adjacent constant i32 stores to local memory. A single ds_write2_b32
; with 456 (0x1c8) first and 123 (0x7b) second is expected.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; Four adjacent constant i32 stores to local memory. Two ds_write2_b32
; instructions are expected, one for elements 2 and 3 and one for elements 0
; and 1.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; Five adjacent constant i32 stores. A dwordx4 store covering elements 0..3
; (9 first, -12 last) followed by a dword store of 11 is expected.
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; Six adjacent constant i32 stores. A dwordx4 store followed by a dwordx2
; store is expected.
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; Seven adjacent constant i32 stores. A dwordx4, a dwordx2 and a dword store
; are expected, in that order.
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; Eight adjacent constant i32 stores. Two dwordx4 stores are expected.
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; Copies a 4-byte-aligned <3 x i32> between noalias pointers. The checks
; expect a dwordx2 + dword pair for the load and for the store, no offen
; addressing, and a scratch size of zero.
; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8

; GCN: ScratchSize: 0{{$}}
define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
  ret void
}

; Copies a 4-byte-aligned <3 x i64> between noalias pointers. The checks
; expect a dwordx4 + dwordx2 pair for the load and for the store, no offen
; addressing, and a scratch size of zero.
; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
  ret void
}

; Loads a 4-byte-aligned <3 x float>, adds a constant vector and stores the
; result (not a pure copy, despite the name). The expected memory accesses
; match the <3 x i32> case.
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
  ret void
}

; Loads a 4-byte-aligned <3 x double>, adds a constant vector and stores the
; result (not a pure copy, despite the name). The expected memory accesses
; match the <3 x i64> case.
; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
  ret void
}

; Barrier intrinsic used by the tests that place a barrier between their loads
; and stores.
declare void @llvm.AMDGPU.barrier.local() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }