; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; Two adjacent i8 stores, base aligned to 2: expect a single 16-bit store.
; NOTE(review): `i8 456` wraps to 200 (456 mod 256); kept as written.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out, align 2
  ret void
}

; With only natural (1-byte) alignment the two i8 stores must stay separate.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out
  ret void
}

; Two adjacent i16 stores, base aligned to 4: expect a single dword store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}

; Same as above with zero values (covers the constant-zero merge path).
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}

; Natural (2-byte) alignment: the i16 stores must not be merged to a dword.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}

; Two adjacent i32 stores merge to buffer_store_dwordx2; the lo register
; holds 456 (0x1c8) and the hi register 123 (0x7b).
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; Mixed i32/float stores through a bitcast still merge.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}

; As above with the element types swapped (float at the base, i32 above it).
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}

; Four adjacent i32 stores merge to a single dwordx4.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Four adjacent float stores, already in address order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}

; First store is out of order.
; Merging must still happen when the base store appears last in program order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Mixed i32/f32 element types across the run of four stores.
; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-AA: buffer_store_dwordx4 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %out.gep.1.bc
  store float 2.0, float addrspace(1)* %out.gep.2
  store i32 17, i32 addrspace(1)* %out.gep.3.bc
  store float 8.0, float addrspace(1)* %out
  ret void
}

; Three stores: expect a dwordx2 plus one dword, and nothing further.
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword
; SI-NOT: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}

; Two adjacent i64 stores merge into one dwordx4.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}

; Four i64 stores merge into two dwordx4 stores.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}

; Load/store pairs: both the loads and the stores should each merge.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; Same with a non-zero base: the constant offset lands in the MUBUF offset field.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}

; The stored values are swapped relative to the loads; merging still expected.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword v
; GCN: s_waitcnt
; SI-DAG: buffer_store_dword v
; SI-DAG: buffer_store_dwordx2 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}

; Non-zero bases on both sides (11*4 = 44 load offset, 7*4 = 28 store offset).
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; Stores issued in reverse address order, with a barrier between loads and
; stores; the merge should still occur across the barrier.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}

; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v
; GCN: s_barrier
; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}

; Four adjacent i8 load/store pairs with dword-aligned bases merge to dword ops.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; Only natural alignment: the i8 accesses must remain byte ops.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}

; Scalar stores of extracted vector elements should re-merge to a dwordx4.
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}

; LDS (addrspace 3) variant: two i8 stores merge to ds_write_b16.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b16
; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  store i8 456, i8 addrspace(3)* %out, align 2
  ret void
}

; LDS i32 pair becomes a single ds_write2_b32.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}

; Four LDS i32 stores become two ds_write2_b32 instructions.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}

; Five stores: a dwordx4 plus a trailing dword.
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}

; Six stores: dwordx4 + dwordx2.
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}

; Seven stores: dwordx4 + dwordx2 + dword.
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}

; Eight stores: two dwordx4 stores.
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}

; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; 3-element copies: check the odd-sized access splits into x2 + x1 pieces
; without spilling to scratch.
; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8

; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
  ret void
}

; Same shape with an fadd between load and store, so the value is live in VGPRs.
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
  ret void
}

; Workgroup barrier intrinsic used by the inverse/shuffle tests above.
declare void @llvm.amdgcn.s.barrier() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }