; Tests lowering of AMDGPU kernel arguments of various scalar, vector,
; struct, and array types across SI, VI (Mesa and HSA ABIs), and R600
; (Evergreen/Cayman) targets, checking kernarg segment sizes and the
; scalar loads / VTX reads used to fetch each argument.
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


; EGCM: VTX_READ_8{{.*}} #3
; EGCM: KC0[2].Y
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


; EG: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb

; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword


; EG: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
  %ext = sext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb

; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword

; EGCM: VTX_READ_16
; EGCM: KC0[2].Y
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword

; EG: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c


; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword

; EG: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
  %ext = sext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; GCN-DAG: s_load_dwordx2 s
; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c

; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; SI-NOT: {{buffer|flat|global}}_load
; SI: s_load_dwordx2 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx2 s
; VI-NEXT: s_load_dwordx2 s
; VI-NOT: lshl
; VI-NOT: _or
; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dwordx4
; SI-NEXT: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34

; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X

; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Pack/repack on VI

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; SI: s_load_dwordx4 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; VI: s_load_dwordx4 s
; VI-NOT: shr
; VI-NOT: shl
; VI-NOT: _sdwa
; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dwordx8 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44

; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0

; MESA-GCN: buffer_store_dwordx2
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-GCN: buffer_store_dwordx2

; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i65_arg:
; HSA-VI: kernarg_segment_byte_size = 24
; HSA-VI: kernarg_segment_alignment = 4
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
entry:
  store i65 %in, i65 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: s_bfe_i64
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-VI: kernarg_segment_byte_size = 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
  ret void
}

; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32

; With the SelectionDAG argument lowering, the alignments for the
; struct members is not properly considered, making these wrong.

; FIXME: Total argument size is computed wrong
; FUNC-LABEL: {{^}}struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 40
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 28
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}struct_argument_alignment_after:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}array_3xi32:
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  store volatile i16 %arg0, i16 addrspace(1)* undef
  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
  ret void
}

; FIXME: Why not all scalar loads?
; GCN-LABEL: {{^}}array_3xi16:
; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
; HSA-VI: flat_load_ushort
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  store volatile i8 %arg0, i8 addrspace(1)* undef
  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
  ret void
}