; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0

; In this test both the pointer and the offset operands to the
; BUFFER_LOAD instructions end up being stored in VGPRs.  This
; requires us to add the pointer and offset together, store the
; result in the offset operand (vaddr), and then store 0 in an
; SGPR register pair and use that for the pointer operand
; (low 64 bits of srsrc).
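;
; A rough sketch of the expected addressing sequence (the exact register
; numbers are illustrative only and are not checked):
;   v_add_i32_e32  v4, vcc, v0, v2        ; vaddr lo = pointer lo + offset lo
;   v_addc_u32_e32 v5, vcc, v1, v3, vcc   ; vaddr hi, with carry
;   s_mov_b64      s[0:1], 0              ; low 64 bits of srsrc = 0
;   buffer_load_ubyte v0, v[4:5], s[0:3], 0 addr64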

; GCN-LABEL: {{^}}mubuf:

; Make sure we aren't using VGPRs for the source operand of s_mov_b64
; GCN-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v

; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
; instructions
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]

define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
  %tmp2 = sext i32 %tmp to i64
  %tmp3 = sext i32 %tmp1 to i64
  br label %loop

loop:                                             ; preds = %loop, %entry
  %tmp4 = phi i64 [ 0, %entry ], [ %tmp5, %loop ]
  %tmp5 = add i64 %tmp2, %tmp4
  %tmp6 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp5
  %tmp7 = load i8, i8 addrspace(1)* %tmp6, align 1
  %tmp8 = or i64 %tmp5, 1
  %tmp9 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp8
  %tmp10 = load i8, i8 addrspace(1)* %tmp9, align 1
  %tmp11 = add i8 %tmp7, %tmp10
  %tmp12 = sext i8 %tmp11 to i32
  store i32 %tmp12, i32 addrspace(1)* %out
  %tmp13 = icmp slt i64 %tmp5, 10
  br i1 %tmp13, label %loop, label %done

done:                                             ; preds = %loop
  ret void
}

; Test moving an SMRD instruction to the VALU
; FIXME: movs can be moved before nop to reduce count
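;
; The constant offsets checked below come from the `getelementptr ... i32 3000`
; in the IR: on SI the offset is held in an SGPR as a byte offset,
; 3000 * 4 = 12000 = 0x2ee0, while CI folds it into the 32-bit literal, which
; counts dwords, giving 0xbb8 = 3000.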

; GCN-LABEL: {{^}}smrd_valu:
; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
; SI: s_nop 3
; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
; SI: s_mov_b32

; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
; GCN-NOHSA: buffer_store_dword [[V_OUT]]
; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
entry:
  %tmp = icmp ne i32 %a, 0
  br i1 %tmp, label %if, label %else

if:                                               ; preds = %entry
  %tmp1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
  br label %endif

else:                                             ; preds = %entry
  %tmp2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
  %tmp3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %tmp2
  br label %endif

endif:                                            ; preds = %else, %if
  %tmp4 = phi i32 addrspace(2)* [ %tmp1, %if ], [ %tmp3, %else ]
  %tmp5 = getelementptr i32, i32 addrspace(2)* %tmp4, i32 3000
  %tmp6 = load i32, i32 addrspace(2)* %tmp5
  store i32 %tmp6, i32 addrspace(1)* %out
  ret void
}

; Test moving an SMRD with an immediate offset to the VALU
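;
; With `getelementptr [8 x i32], ... i32 %tmp, i32 4` the byte offset is
; %tmp * 32 + 16, so the variable part ends up in vaddr and the constant 16
; is folded into the MUBUF immediate (offset:16 below).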

; GCN-LABEL: {{^}}smrd_valu2:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4
  %tmp3 = load i32, i32 addrspace(2)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

; Use a big offset that will use the SMRD literal offset on CI
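; The element offset is 5000 * 4 = 20000 = 0x4e20 bytes, too large for the
; 12-bit MUBUF immediate, so it is materialized into [[OFFSET]] below.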
; GCN-LABEL: {{^}}smrd_valu_ci_offset:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp
  %tmp3 = getelementptr i32, i32 addrspace(2)* %tmp2, i32 5000
  %tmp4 = load i32, i32 addrspace(2)* %tmp3
  %tmp5 = add i32 %tmp4, %c
  store i32 %tmp5, i32 addrspace(1)* %out
  ret void
}

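; Same pattern for i64: the element offset is 5000 * 8 = 40000 = 0x9c40 bytes,
; which is the [[OFFSET]] constant checked below.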
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx2
; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp
  %tmp3 = getelementptr i64, i64 addrspace(2)* %tmp2, i32 5000
  %tmp4 = load i64, i64 addrspace(2)* %tmp3
  %tmp5 = or i64 %tmp4, %c
  store i64 %tmp5, i64 addrspace(1)* %out
  ret void
}

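; For <4 x i32> the element offset is 1234 * 16 = 19744 = 0x4d20 bytes, the
; [[OFFSET]] constant checked below.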
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp
  %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %tmp2, i32 1234
  %tmp4 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp3
  %tmp5 = or <4 x i32> %tmp4, %c
  store <4 x i32> %tmp5, <4 x i32> addrspace(1)* %out
  ret void
}

; The original scalar load uses an SGPR offset on SI and a 32-bit literal
; offset on CI.
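;
; For <8 x i32> the element offset is 1234 * 32 = 39488 = 0x9a40 bytes; the
; load is split into two dwordx4 pieces, so the second piece uses
; 0x9a40 + 16 = 0x9a50.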

; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}

; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp
  %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %tmp2, i32 1234
  %tmp4 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp3
  %tmp5 = or <8 x i32> %tmp4, %c
  store <8 x i32> %tmp5, <8 x i32> addrspace(1)* %out
  ret void
}

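; For <16 x i32> the element offset is 1234 * 64 = 78976 = 0x13480 bytes, and
; the four dwordx4 pieces land at 0x13480, 0x13490, 0x134a0 and 0x134b0.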
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:

; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}

; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; GCN: s_endpgm
define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp
  %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %tmp2, i32 1234
  %tmp4 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp3
  %tmp5 = or <16 x i32> %tmp4, %c
  store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}smrd_valu2_salu_user:
; GCN-NOHSA: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
; GCN-NOHSA: buffer_store_dword [[ADD]]
; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %tmp, i32 4
  %tmp3 = load i32, i32 addrspace(2)* %tmp2
  %tmp4 = add i32 %tmp3, %a
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}

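; The inner index 255 gives a byte offset of 255 * 4 = 1020, the largest value
; the 8-bit SMRD dword immediate can encode; it also still fits the 12-bit
; MUBUF immediate, hence offset:1020 below.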
; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 255
  %tmp3 = load i32, i32 addrspace(2)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

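; The inner index 256 gives a byte offset of 256 * 4 = 1024, one dword past the
; SMRD immediate range, but it still fits the 12-bit MUBUF immediate, so no
; separate v_add is needed (offset:1024 below).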
; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %tmp, i32 256
  %tmp3 = load i32, i32 addrspace(2)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v8i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
  %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
  store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
  %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4

  %elt0 = extractelement <8 x i32> %tmp3, i32 0
  %elt1 = extractelement <8 x i32> %tmp3, i32 1
  %elt2 = extractelement <8 x i32> %tmp3, i32 2
  %elt3 = extractelement <8 x i32> %tmp3, i32 3
  %elt4 = extractelement <8 x i32> %tmp3, i32 4
  %elt5 = extractelement <8 x i32> %tmp3, i32 5
  %elt6 = extractelement <8 x i32> %tmp3, i32 6
  %elt7 = extractelement <8 x i32> %tmp3, i32 7

  %add0 = add i32 %elt0, %elt1
  %add1 = add i32 %add0, %elt2
  %add2 = add i32 %add1, %elt3
  %add3 = add i32 %add2, %elt4
  %add4 = add i32 %add3, %elt5
  %add5 = add i32 %add4, %elt6
  %add6 = add i32 %add5, %elt7

  store i32 %add6, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v16i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
  %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4
  store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
  %tmp3 = load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4

  %elt0 = extractelement <16 x i32> %tmp3, i32 0
  %elt1 = extractelement <16 x i32> %tmp3, i32 1
  %elt2 = extractelement <16 x i32> %tmp3, i32 2
  %elt3 = extractelement <16 x i32> %tmp3, i32 3
  %elt4 = extractelement <16 x i32> %tmp3, i32 4
  %elt5 = extractelement <16 x i32> %tmp3, i32 5
  %elt6 = extractelement <16 x i32> %tmp3, i32 6
  %elt7 = extractelement <16 x i32> %tmp3, i32 7
  %elt8 = extractelement <16 x i32> %tmp3, i32 8
  %elt9 = extractelement <16 x i32> %tmp3, i32 9
  %elt10 = extractelement <16 x i32> %tmp3, i32 10
  %elt11 = extractelement <16 x i32> %tmp3, i32 11
  %elt12 = extractelement <16 x i32> %tmp3, i32 12
  %elt13 = extractelement <16 x i32> %tmp3, i32 13
  %elt14 = extractelement <16 x i32> %tmp3, i32 14
  %elt15 = extractelement <16 x i32> %tmp3, i32 15

  %add0 = add i32 %elt0, %elt1
  %add1 = add i32 %add0, %elt2
  %add2 = add i32 %add1, %elt3
  %add3 = add i32 %add2, %elt4
  %add4 = add i32 %add3, %elt5
  %add5 = add i32 %add4, %elt6
  %add6 = add i32 %add5, %elt7
  %add7 = add i32 %add6, %elt8
  %add8 = add i32 %add7, %elt9
  %add9 = add i32 %add8, %elt10
  %add10 = add i32 %add9, %elt11
  %add11 = add i32 %add10, %elt12
  %add12 = add i32 %add11, %elt13
  %add13 = add i32 %add12, %elt14
  %add14 = add i32 %add13, %elt15

  store i32 %add14, i32 addrspace(1)* %out
  ret void
}

; Make sure we legalize VOPC operands after moving an SOPC to the VALU.
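;
; The branch is emitted on the inverse of the IR condition:
;   !(%tmp2 <u %cond)  ==  (%cond <=u %tmp2)
; which is why the checks below expect v_cmp_le_u32 with the uniform operand
; ([[SGPR]]) commuted into src0, the only VOPC source operand that may be an
; SGPR in the e32 encoding.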

; GCN-LABEL: {{^}}sopc_vopc_legalize_bug:
; GCN: s_load_dword [[SGPR:s[0-9]+]]
; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
; GCN: s_and_b64 vcc, exec, vcc
; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-NOHSA: buffer_store_dword [[ONE]]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
; GCN: {{^}}[[EXIT]]:
; GCN: s_endpgm
define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
bb3:
  %tmp0 = bitcast i32 %cond to float
  %tmp1 = fadd float %tmp0, 2.500000e-01
  %tmp2 = bitcast float %tmp1 to i32
  %tmp3 = icmp ult i32 %tmp2, %cond
  br i1 %tmp3, label %bb6, label %bb7

bb6:                                              ; preds = %bb3
  store i32 1, i32 addrspace(1)* %out
  br label %bb7

bb7:                                              ; preds = %bb6, %bb3
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }