Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
      2 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
      3 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
      4 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s
      5 ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s
      6 
      7 ; FUNC-LABEL: {{^}}i8_arg:
      8 ; HSA-VI: kernarg_segment_byte_size = 12
      9 ; HSA-VI: kernarg_segment_alignment = 4
     10 
     11 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
     12 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
     13 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
     14 
     15 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
     16 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
     17 
     18 
     19 ; EGCM: VTX_READ_8{{.*}} #3
     20 ; EGCM: KC0[2].Y
     21 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
          ; %in carries no zeroext/signext attribute; it is widened to i32 with an
          ; explicit zext and the 32-bit result is stored to the global pointer %out.
     22   %ext = zext i8 %in to i32
     23   store i32 %ext, i32 addrspace(1)* %out, align 4
     24   ret void
     25 }
     26 
     27 ; FUNC-LABEL: {{^}}i8_zext_arg:
     28 ; HSA-VI: kernarg_segment_byte_size = 12
     29 ; HSA-VI: kernarg_segment_alignment = 4
     30 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
     31 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
     32 
     33 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
     34 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
     35 
     36 
     37 ; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
     38 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
     39 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
     40 
     41 ; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
     42 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
     43 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
     44 ; CM-NEXT:	2(2.802597e-45), 0(0.000000e+00)
     45 define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
          ; Same body as the plain i8 case, but %in is marked zeroext in the
          ; signature; the value is still zero-extended explicitly before the store.
     46   %ext = zext i8 %in to i32
     47   store i32 %ext, i32 addrspace(1)* %out, align 4
     48   ret void
     49 }
     50 
     51 ; FUNC-LABEL: {{^}}i8_sext_arg:
     52 ; HSA-VI: kernarg_segment_byte_size = 12
     53 ; HSA-VI: kernarg_segment_alignment = 4
     54 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
     55 
     56 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
     57 
     58 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
     59 ; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
     60 ; HSA-VI: flat_store_dword
     61 
     62 
     63 ; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
     64 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
     65 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
     66 
     67 ; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
     68 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
     69 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
     70 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
     71 define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
          ; %in is marked signext; it is sign-extended to i32 and stored to %out.
     72   %ext = sext i8 %in to i32
     73   store i32 %ext, i32 addrspace(1)* %out, align 4
     74   ret void
     75 }
     76 
     77 ; FUNC-LABEL: {{^}}i16_arg:
     78 ; HSA-VI: kernarg_segment_byte_size = 12
     79 ; HSA-VI: kernarg_segment_alignment = 4
     80 
     81 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
     82 
     83 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
        ; An i16 argument is masked with exactly 0xffff. FileCheck matches
        ; substrings, so the end-of-line anchor is needed to reject any longer
        ; immediate that merely starts with the expected mask.
     84 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
     85 
     86 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
     87 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
     88 ; HSA-VI: flat_store_dword
     89 
     90 ; EGCM: VTX_READ_16
     91 ; EGCM: KC0[2].Y
     92 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
          ; %in carries no extension attribute; it is zero-extended to i32 and the
          ; result is stored to the global pointer %out.
     93   %ext = zext i16 %in to i32
     94   store i32 %ext, i32 addrspace(1)* %out, align 4
     95   ret void
     96 }
     97 
     98 ; FUNC-LABEL: {{^}}i16_zext_arg:
     99 ; HSA-VI: kernarg_segment_byte_size = 12
    100 ; HSA-VI: kernarg_segment_alignment = 4
    101 
    102 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
    103 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
    104 
    105 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
    106 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
    107 ; HSA-VI: flat_store_dword
    108 
    109 ; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
    110 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
    111 ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
    112 
    113 ; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
    114 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
    115 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
    116 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
    117 define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
          ; %in is marked zeroext; it is zero-extended to i32 and stored to %out.
    118   %ext = zext i16 %in to i32
    119   store i32 %ext, i32 addrspace(1)* %out, align 4
    120   ret void
    121 }
    122 
    123 ; FUNC-LABEL: {{^}}i16_sext_arg:
    124 ; HSA-VI: kernarg_segment_byte_size = 12
    125 ; HSA-VI: kernarg_segment_alignment = 4
    126 
    127 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
    128 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
    129 
    130 
    131 ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
    132 ; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
    133 ; HSA-VI: flat_store_dword
    134 
    135 ; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
    136 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
    137 ; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
    138 
    139 ; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
    140 ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
    141 ; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
    142 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
    143 define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
          ; %in is marked signext; it is sign-extended to i32 and stored to %out.
    144   %ext = sext i16 %in to i32
    145   store i32 %ext, i32 addrspace(1)* %out, align 4
    146   ret void
    147 }
    148 
    149 ; FUNC-LABEL: {{^}}i32_arg:
    150 ; HSA-VI: kernarg_segment_byte_size = 12
    151 ; HSA-VI: kernarg_segment_alignment = 4
    152 
    153 ; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
    154 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
    155 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
    156 ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
    157 define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
    158 entry:
          ; The i32 argument is stored to %out unchanged (no extension needed).
    159   store i32 %in, i32 addrspace(1)* %out, align 4
    160   ret void
    161 }
    162 
    163 ; FUNC-LABEL: {{^}}f32_arg:
    164 ; HSA-VI: kernarg_segment_byte_size = 12
    165 ; HSA-VI: kernarg_segment_alignment = 4
    166 ; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
    167 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
    168 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
    169 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
    170 define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
    171 entry:
          ; Floating-point counterpart of the i32 case; the float argument is
          ; stored to %out unchanged.
    172   store float %in, float addrspace(1)* %out, align 4
    173   ret void
    174 }
    175 
    176 ; FUNC-LABEL: {{^}}v2i8_arg:
    177 ; HSA-VI: kernarg_segment_byte_size = 12
    178 ; HSA-VI: kernarg_segment_alignment = 4
    179 
    180 ; EGCM: VTX_READ_8
    181 ; EGCM: VTX_READ_8
    182 
    183 ; GCN: s_load_dword s
    184 ; GCN-NOT: {{buffer|flat|global}}_load_
    185 define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
    186 entry:
          ; Stores the whole <2 x i8> argument to %out; the store carries no
          ; explicit alignment.
    187   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
    188   ret void
    189 }
    190 
    191 ; FUNC-LABEL: {{^}}v2i16_arg:
    192 ; HSA-VI: kernarg_segment_byte_size = 12
    193 ; HSA-VI: kernarg_segment_alignment = 4
    194 
    195 ; EGCM: VTX_READ_16
    196 ; EGCM: VTX_READ_16
    197 
    198 ; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
    199 ; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
    200 ; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
    201 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
    202 entry:
          ; Stores the whole <2 x i16> argument to %out; the store carries no
          ; explicit alignment.
    203   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
    204   ret void
    205 }
    206 
    207 ; FUNC-LABEL: {{^}}v2i32_arg:
    208 ; HSA-VI: kernarg_segment_byte_size = 16
    209 ; HSA-VI: kernarg_segment_alignment = 4
    210 
    211 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
    212 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
    213 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
    214 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
    215 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
    216 define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
    217 entry:
          ; Stores the <2 x i32> argument to %out with an explicit 4-byte alignment.
    218   store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
    219   ret void
    220 }
    221 
    222 ; FUNC-LABEL: {{^}}v2f32_arg:
    223 ; HSA-VI: kernarg_segment_byte_size = 16
    224 ; HSA-VI: kernarg_segment_alignment = 4
    225 
    226 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
    227 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
    228 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
    229 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
    230 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
    231 define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
    232 entry:
          ; Floating-point counterpart of the <2 x i32> case; stores the vector
          ; argument to %out with an explicit 4-byte alignment.
    233   store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
    234   ret void
    235 }
    236 
    237 ; FUNC-LABEL: {{^}}v3i8_arg:
    238 ; HSA-VI: kernarg_segment_byte_size = 12
    239 ; HSA-VI: kernarg_segment_alignment = 4
    240 
    241 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
    242 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
    243 ; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
    244 
    245 ; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
    246 
        ; The two checks below use the prefix spellings declared on the RUN lines
        ; so that FileCheck actually evaluates them.
    247 ; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
    248 ; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
    249 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
    250 entry:
          ; Stores the three-element i8 vector argument to %out with an explicit
          ; 4-byte alignment.
    251   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
    252   ret void
    253 }
    254 
    255 ; FUNC-LABEL: {{^}}v3i16_arg:
    256 ; HSA-VI: kernarg_segment_byte_size = 16
    257 ; HSA-VI: kernarg_segment_alignment = 4
    258 
    259 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
    260 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
    261 ; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
    262 
    263 ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
    264 
        ; The two checks below use the prefix spellings declared on the RUN lines
        ; so that FileCheck actually evaluates them.
    265 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
    266 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
    267 define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
    268 entry:
          ; Stores the three-element i16 vector argument to %out with an explicit
          ; 4-byte alignment.
    269   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
    270   ret void
    271 }
    272 
    273 ; FUNC-LABEL: {{^}}v3i32_arg:
    274 ; HSA-VI: kernarg_segment_byte_size = 32
    275 ; HSA-VI: kernarg_segment_alignment = 4
    276 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
    277 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
    278 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
    279 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
    280 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
    281 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
    282 define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
    283 entry:
          ; Stores the three-element i32 vector argument to %out with an explicit
          ; 4-byte alignment.
    284   store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
    285   ret void
    286 }
    287 
    288 ; FUNC-LABEL: {{^}}v3f32_arg:
    289 ; HSA-VI: kernarg_segment_byte_size = 32
    290 ; HSA-VI: kernarg_segment_alignment = 4
    291 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
    292 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
    293 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
    294 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
    295 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
    296 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
    297 define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
    298 entry:
          ; Floating-point counterpart of the <3 x i32> case; stores the vector
          ; argument to %out with an explicit 4-byte alignment.
    299   store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
    300   ret void
    301 }
    302 
    303 ; FUNC-LABEL: {{^}}v4i8_arg:
    304 ; HSA-VI: kernarg_segment_byte_size = 12
    305 ; HSA-VI: kernarg_segment_alignment = 4
    306 ; EGCM: VTX_READ_8
    307 ; EGCM: VTX_READ_8
    308 ; EGCM: VTX_READ_8
    309 ; EGCM: VTX_READ_8
    310 
    311 ; GCN-DAG: s_load_dwordx2 s
    312 ; GCN-DAG: s_load_dword s
    313 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
    314 entry:
          ; Stores the whole <4 x i8> argument to %out; the store carries no
          ; explicit alignment.
    315   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
    316   ret void
    317 }
    318 
    319 ; FUNC-LABEL: {{^}}v4i16_arg:
    320 ; HSA-VI: kernarg_segment_byte_size = 16
    321 ; HSA-VI: kernarg_segment_alignment = 4
    322 ; EGCM: VTX_READ_16
    323 ; EGCM: VTX_READ_16
    324 ; EGCM: VTX_READ_16
    325 ; EGCM: VTX_READ_16
    326 
    327 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
    328 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
    329 
    330 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
    331 ; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
    332 
    337 ; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
    338 ; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
    339 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
    340 entry:
          ; Stores the whole <4 x i16> argument to %out; the store carries no
          ; explicit alignment.
    341   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
    342   ret void
    343 }
    344 
    345 ; FUNC-LABEL: {{^}}v4i32_arg:
    346 ; HSA-VI: kernarg_segment_byte_size = 32
    347 ; HSA-VI: kernarg_segment_alignment = 4
    348 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
    349 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
    350 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
    351 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
    352 
    353 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
    354 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
    355 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
    356 define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
    357 entry:
          ; Stores the <4 x i32> argument to %out with an explicit 4-byte alignment.
    358   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
    359   ret void
    360 }
    361 
    362 ; FUNC-LABEL: {{^}}v4f32_arg:
    363 ; HSA-VI: kernarg_segment_byte_size = 32
    364 ; HSA-VI: kernarg_segment_alignment = 4
    365 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
    366 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
    367 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
    368 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
    369 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
    370 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
    371 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
    372 define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
    373 entry:
          ; Floating-point counterpart of the <4 x i32> case; stores the vector
          ; argument to %out with an explicit 4-byte alignment.
    374   store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
    375   ret void
    376 }
    377 
    378 ; FIXME: Lots of unpack and re-pack junk on VI
    379 ; FUNC-LABEL: {{^}}v8i8_arg:
    380 ; HSA-VI: kernarg_segment_byte_size = 16
    381 ; HSA-VI: kernarg_segment_alignment = 4
    382 ; EGCM: VTX_READ_8
    383 ; EGCM: VTX_READ_8
    384 ; EGCM: VTX_READ_8
    385 ; EGCM: VTX_READ_8
    386 ; EGCM: VTX_READ_8
    387 ; EGCM: VTX_READ_8
    388 ; EGCM: VTX_READ_8
    389 ; EGCM: VTX_READ_8
    390 
    391 ; SI-NOT: {{buffer|flat|global}}_load
    392 ; SI: s_load_dwordx2 s
    393 ; SI-NEXT: s_load_dwordx2 s
    394 ; SI-NOT: {{buffer|flat|global}}_load
    395 
    396 ; VI: s_load_dwordx2 s
    397 ; VI-NEXT: s_load_dwordx2 s
    398 ; VI-NOT: lshl
    399 ; VI-NOT: _or
    400 ; VI-NOT: _sdwa
    401 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
    402 entry:
          ; Stores the whole <8 x i8> argument to %out; the store carries no
          ; explicit alignment.
    403   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
    404   ret void
    405 }
    406 
    407 ; FUNC-LABEL: {{^}}v8i16_arg:
    408 ; HSA-VI: kernarg_segment_byte_size = 32
    409 ; HSA-VI: kernarg_segment_alignment = 4
    410 ; EGCM: VTX_READ_16
    411 ; EGCM: VTX_READ_16
    412 ; EGCM: VTX_READ_16
    413 ; EGCM: VTX_READ_16
    414 ; EGCM: VTX_READ_16
    415 ; EGCM: VTX_READ_16
    416 ; EGCM: VTX_READ_16
    417 ; EGCM: VTX_READ_16
    418 
    419 ; SI: s_load_dwordx4
    420 ; SI-NEXT: s_load_dwordx2
    421 ; SI-NOT: {{buffer|flat|global}}_load
    422 
    423 
    424 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
    425 
    426 ; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
    427 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
    428 entry:
          ; Stores the whole <8 x i16> argument to %out; the store carries no
          ; explicit alignment.
    429   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
    430   ret void
    431 }
    432 
    433 ; FUNC-LABEL: {{^}}v8i32_arg:
    434 ; HSA-VI: kernarg_segment_byte_size = 64
    435 ; HSA-VI: kernarg_segment_alignment = 5
    436 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
    437 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
    438 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
    439 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
    440 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
    441 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
    442 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
    443 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
    444 
    445 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
    446 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
    447 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
    448 define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
    449 entry:
          ; Stores the <8 x i32> argument to %out with an explicit 4-byte alignment.
    450   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
    451   ret void
    452 }
    453 
    454 ; FUNC-LABEL: {{^}}v8f32_arg:
    455 ; HSA-VI: kernarg_segment_byte_size = 64
    456 ; HSA-VI: kernarg_segment_alignment = 5
    457 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
    458 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
    459 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
    460 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
    461 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
    462 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
    463 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
    464 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
    465 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
        ; The VI expectations mirror the integer <8 x i32> test, which has the
        ; same argument size and kernarg segment layout.
        ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
        ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
    466 define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
    467 entry:
          ; Floating-point counterpart of the <8 x i32> case; stores the vector
          ; argument to %out with an explicit 4-byte alignment.
    468   store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
    469   ret void
    470 }
    471 
    472 ; FIXME: Pack/repack on VI
    473 
    474 ; FUNC-LABEL: {{^}}v16i8_arg:
    475 ; HSA-VI: kernarg_segment_byte_size = 32
    476 ; HSA-VI: kernarg_segment_alignment = 4
    477 ; EGCM: VTX_READ_8
    478 ; EGCM: VTX_READ_8
    479 ; EGCM: VTX_READ_8
    480 ; EGCM: VTX_READ_8
    481 ; EGCM: VTX_READ_8
    482 ; EGCM: VTX_READ_8
    483 ; EGCM: VTX_READ_8
    484 ; EGCM: VTX_READ_8
    485 ; EGCM: VTX_READ_8
    486 ; EGCM: VTX_READ_8
    487 ; EGCM: VTX_READ_8
    488 ; EGCM: VTX_READ_8
    489 ; EGCM: VTX_READ_8
    490 ; EGCM: VTX_READ_8
    491 ; EGCM: VTX_READ_8
    492 ; EGCM: VTX_READ_8
    493 
    494 ; SI: s_load_dwordx4 s
    495 ; SI-NEXT: s_load_dwordx2 s
    496 ; SI-NOT: {{buffer|flat|global}}_load
    497 
    498 
    499 ; VI: s_load_dwordx4 s
    500 ; VI-NOT: shr
    501 ; VI-NOT: shl
    502 ; VI-NOT: _sdwa
    503 ; VI-NOT: _or_
    504 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
    505 entry:
          ; Stores the whole <16 x i8> argument to %out; the store carries no
          ; explicit alignment.
    506   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
    507   ret void
    508 }
    509 
    510 ; FUNC-LABEL: {{^}}v16i16_arg:
    511 ; HSA-VI: kernarg_segment_byte_size = 64
    512 ; HSA-VI: kernarg_segment_alignment = 5
    513 ; EGCM: VTX_READ_16
    514 ; EGCM: VTX_READ_16
    515 ; EGCM: VTX_READ_16
    516 ; EGCM: VTX_READ_16
    517 ; EGCM: VTX_READ_16
    518 
    519 ; EGCM: VTX_READ_16
    520 ; EGCM: VTX_READ_16
    521 ; EGCM: VTX_READ_16
    522 ; EGCM: VTX_READ_16
    523 ; EGCM: VTX_READ_16
    524 ; EGCM: VTX_READ_16
    525 ; EGCM: VTX_READ_16
    526 ; EGCM: VTX_READ_16
    527 ; EGCM: VTX_READ_16
    528 ; EGCM: VTX_READ_16
    529 ; EGCM: VTX_READ_16
    530 
    531 ; SI: s_load_dwordx8 s
    532 ; SI-NEXT: s_load_dwordx2 s
    533 ; SI-NOT: {{buffer|flat|global}}_load
    534 
    535 
    536 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
    537 
    538 ; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
    539 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
    540 entry:
          ; Stores the whole <16 x i16> argument to %out; the store carries no
          ; explicit alignment.
    541   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
    542   ret void
    543 }
    544 
    545 ; FUNC-LABEL: {{^}}v16i32_arg:
    546 ; HSA-VI: kernarg_segment_byte_size = 128
    547 ; HSA-VI: kernarg_segment_alignment = 6
    548 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
    549 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
    550 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
    551 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
    552 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
    553 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
    554 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
    555 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
    556 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
    557 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
    558 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
    559 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
    560 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
    561 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
    562 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
    563 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
    564 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
    565 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
    566 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
    567 define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
    568 entry:
          ; Stores the <16 x i32> argument to %out with an explicit 4-byte alignment.
    569   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
    570   ret void
    571 }
    572 
    573 ; FUNC-LABEL: {{^}}v16f32_arg:
    574 ; HSA-VI: kernarg_segment_byte_size = 128
    575 ; HSA-VI: kernarg_segment_alignment = 6
    576 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
    577 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
    578 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
    579 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
    580 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
    581 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
    582 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
    583 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
    584 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
    585 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
    586 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
    587 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
    588 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
    589 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
    590 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
    591 ; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
    592 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
    593 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
    594 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
    595 define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
    596 entry:
          ; Floating-point counterpart of the <16 x i32> case; stores the vector
          ; argument to %out with an explicit 4-byte alignment.
    597   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
    598   ret void
    599 }
    600 
    601 ; FUNC-LABEL: {{^}}kernel_arg_i64:
    602 ; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
    603 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
    604 
    605 ; MESA-GCN: buffer_store_dwordx2
    606 define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
          ; Stores the 64-bit integer argument %a to %out with 8-byte alignment.
    607   store i64 %a, i64 addrspace(1)* %out, align 8
    608   ret void
    609 }
    610 
    611 ; FUNC-LABEL: {{^}}f64_kernel_arg:
    612 ; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
    613 ; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
    614 ; MESA-GCN: buffer_store_dwordx2
    615 
    616 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
    617 define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
    618 entry:
          ; Stores the double argument to %out; the store carries no explicit
          ; alignment.
    619   store double %in, double addrspace(1)* %out
    620   ret void
    621 }
    622 
    623 ; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
    624 ; XGCN: s_load_dwordx2
    625 ; XGCN: s_load_dwordx2
    626 ; XGCN: buffer_store_dwordx2
    627 ; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
    628 ;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
    629 ;   ret void
    630 ; }
    631 
    632 ; FUNC-LABEL: {{^}}i65_arg:
    633 ; HSA-VI: kernarg_segment_byte_size = 24
    634 ; HSA-VI: kernarg_segment_alignment = 4
    635 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
    636 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
    637 define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
    638 entry:
          ; Exercises a non-power-of-two integer width; the i65 argument is stored
          ; to %out with an explicit 4-byte alignment.
    639   store i65 %in, i65 addrspace(1)* %out, align 4
    640   ret void
    641 }
    642 
    643 ; FUNC-LABEL: {{^}}i1_arg:
    644 ; HSA-VI: kernarg_segment_byte_size = 12
    645 ; HSA-VI: kernarg_segment_alignment = 4
    646 
    647 ; GCN: s_load_dword s
    648 ; GCN: s_and_b32
    649 ; GCN: {{buffer|flat}}_store_byte
    650 define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
          ; Stores the i1 argument directly to %out with 1-byte alignment.
    651   store i1 %x, i1 addrspace(1)* %out, align 1
    652   ret void
    653 }
    654 
    655 ; FUNC-LABEL: {{^}}i1_arg_zext_i32:
    656 ; HSA-VI: kernarg_segment_byte_size = 12
    657 ; HSA-VI: kernarg_segment_alignment = 4
    658 
    659 ; GCN: s_load_dword
        ; The store check uses a prefix declared on the RUN lines and accepts
        ; either a buffer or a flat store, matching the sibling i1 tests.
    660 ; GCN: {{buffer|flat}}_store_dword
    661 define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
          ; Zero-extends the i1 argument to i32 and stores the result to %out.
    662   %ext = zext i1 %x to i32
    663   store i32 %ext, i32 addrspace(1)* %out, align 4
    664   ret void
    665 }
    666 
    667 ; FUNC-LABEL: {{^}}i1_arg_zext_i64:
    668 ; HSA-VI: kernarg_segment_byte_size = 12
    669 ; HSA-VI: kernarg_segment_alignment = 4
    670 
    671 ; GCN: s_load_dword s
    672 ; GCN: {{buffer|flat}}_store_dwordx2
    673 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
          ; Zero-extends the i1 argument to i64 and stores the result to %out with
          ; 8-byte alignment.
    674   %ext = zext i1 %x to i64
    675   store i64 %ext, i64 addrspace(1)* %out, align 8
    676   ret void
    677 }
    678 
    679 ; FUNC-LABEL: {{^}}i1_arg_sext_i32:
    680 ; HSA-VI: kernarg_segment_byte_size = 12
    681 ; HSA-VI: kernarg_segment_alignment = 4
    682 
    683 ; GCN: s_load_dword
    684 ; GCN: {{buffer|flat}}_store_dword
    685 define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
          ; Sign-extends the i1 argument to i32 and stores the result to %out.
    686   %ext = sext i1 %x to i32
    687   store i32 %ext, i32 addrspace(1)* %out, align 4
    688   ret void
    689 }
    690 
    691 ; FUNC-LABEL: {{^}}i1_arg_sext_i64:
    692 ; HSA-VI: kernarg_segment_byte_size = 12
    693 ; HSA-VI: kernarg_segment_alignment = 4
    694 
    695 ; GCN: s_load_dword
    696 ; GCN: s_bfe_i64
    697 ; GCN: {{buffer|flat}}_store_dwordx2
    698 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
          ; Sign-extends the i1 argument to i64 and stores the result to %out with
          ; 8-byte alignment.
    699   %ext = sext i1 %x to i64
    700   store i64 %ext, i64 addrspace(1)* %out, align 8
    701   ret void
    702 }
    703 
    704 ; FUNC-LABEL: {{^}}empty_struct_arg:
    705 ; HSA-VI: kernarg_segment_byte_size = 0
    706 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
          ; The only argument is an empty struct and the body does nothing; the
          ; accompanying check expects a zero-byte kernarg segment.
    707   ret void
    708 }
    709 
    710 ; The correct load offsets for these:
    711 ; load 4 from 0,
    712 ; load 8 from 8
    713 ; load 4 from 24
    714 ; load 8 from 32
    715 
    716 ; With the SelectionDAG argument lowering, the alignments for the
    717 ; struct members are not properly considered, making these wrong.
    718 
    719 ; FIXME: Total argument size is computed wrong
    720 ; FUNC-LABEL: {{^}}struct_argument_alignment:
    721 ; HSA-VI: kernarg_segment_byte_size = 40
    722 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
    723 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
    724 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
    725 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
    726 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
          ; Two non-packed {i32, i64} struct arguments are separated by an unnamed
          ; i8 argument. Both fields of each struct are extracted ...
    727   %val0 = extractvalue {i32, i64} %arg0, 0
    728   %val1 = extractvalue {i32, i64} %arg0, 1
    729   %val2 = extractvalue {i32, i64} %arg1, 0
    730   %val3 = extractvalue {i32, i64} %arg1, 1
          ; ... and each one is written with a volatile store to a null global
          ; pointer, in field order.
    731   store volatile i32 %val0, i32 addrspace(1)* null
    732   store volatile i64 %val1, i64 addrspace(1)* null
    733   store volatile i32 %val2, i32 addrspace(1)* null
    734   store volatile i64 %val3, i64 addrspace(1)* null
    735   ret void
    736 }
    737 
    738 ; No padding between i8 and next struct, but round up at end to 4 byte
    739 ; multiple.
    740 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
    741 ; HSA-VI: kernarg_segment_byte_size = 28
    742 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
    743 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
    744 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
    745 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
    746 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
          ; Packed-struct variant: two <{i32, i64}> arguments are separated by an
          ; unnamed i8 argument. Both fields of each struct are extracted ...
    747   %val0 = extractvalue <{i32, i64}> %arg0, 0
    748   %val1 = extractvalue <{i32, i64}> %arg0, 1
    749   %val2 = extractvalue <{i32, i64}> %arg1, 0
    750   %val3 = extractvalue <{i32, i64}> %arg1, 1
          ; ... and each one is written with a volatile store to a null global
          ; pointer, in field order.
    751   store volatile i32 %val0, i32 addrspace(1)* null
    752   store volatile i64 %val1, i64 addrspace(1)* null
    753   store volatile i32 %val2, i32 addrspace(1)* null
    754   store volatile i64 %val3, i64 addrspace(1)* null
    755   ret void
    756 }
    757 
    758 ; GCN-LABEL: {{^}}struct_argument_alignment_after:
    759 ; HSA-VI: kernarg_segment_byte_size = 64
    760 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
    761 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
    762 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
    763 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
    764 ; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
    765 define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
          ; Same struct / unnamed-i8 / struct pattern as the non-packed test, then a
          ; second unnamed i8 followed by a <4 x i32> argument.
    766   %val0 = extractvalue {i32, i64} %arg0, 0
    767   %val1 = extractvalue {i32, i64} %arg0, 1
    768   %val2 = extractvalue {i32, i64} %arg2, 0
    769   %val3 = extractvalue {i32, i64} %arg2, 1
          ; Every extracted field and finally the whole vector argument are written
          ; with volatile stores to a null global pointer.
    770   store volatile i32 %val0, i32 addrspace(1)* null
    771   store volatile i64 %val1, i64 addrspace(1)* null
    772   store volatile i32 %val2, i32 addrspace(1)* null
    773   store volatile i64 %val3, i64 addrspace(1)* null
    774   store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
    775   ret void
    776 }
    777 
    778 ; GCN-LABEL: {{^}}array_3xi32:
    779 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
    780 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
    781 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
    782 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
    783 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
          ; An i16 argument is followed by a [3 x i32] array argument; both are
          ; written with volatile stores to an undef global pointer.
    784   store volatile i16 %arg0, i16 addrspace(1)* undef
    785   store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
    786   ret void
    787 }
    788 
    789 ; FIXME: Why not all scalar loads?
    790 ; GCN-LABEL: {{^}}array_3xi16:
    791 ; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
    792 ; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
    793 ; HSA-VI: flat_load_ushort
    794 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
    795 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
    796 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
          ; An i8 argument is followed by a [3 x i16] array argument; both are
          ; written with volatile stores to an undef global pointer.
    797   store volatile i8 %arg0, i8 addrspace(1)* undef
    798   store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
    799   ret void
    800 }
    801