; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0

; In this test both the pointer and the offset operands to the
; BUFFER_LOAD instructions end up being stored in vgprs.  This
; requires us to add the pointer and offset together, store the
; result in the offset operand (vaddr), and then store 0 in an
; sgpr register pair and use that for the pointer operand
; (low 64-bits of srsrc).
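;
; A sketch of the expected selection (register numbers are illustrative,
; not checked by FileCheck):
;   v_add_i32_e32  v0, vcc, v2, v0          ; vaddr = pointer + offset
;   v_addc_u32_e32 v1, vcc, v3, v1, vcc
;   s_mov_b64      s[0:1], 0                ; zero base in low 64 bits of srsrc
;   buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64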

; GCN-LABEL: {{^}}mubuf:

; Make sure we aren't using VGPRs for the source operand of s_mov_b64
; GCN-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v

; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
; instructions
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]

define amdgpu_kernel void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
  %tmp2 = sext i32 %tmp to i64
  %tmp3 = sext i32 %tmp1 to i64
  br label %loop

loop:                                             ; preds = %loop, %entry
  %tmp4 = phi i64 [ 0, %entry ], [ %tmp5, %loop ]
  %tmp5 = add i64 %tmp2, %tmp4
  %tmp6 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp5
  %tmp7 = load i8, i8 addrspace(1)* %tmp6, align 1
  %tmp8 = or i64 %tmp5, 1
  %tmp9 = getelementptr i8, i8 addrspace(1)* %in, i64 %tmp8
  %tmp10 = load i8, i8 addrspace(1)* %tmp9, align 1
  %tmp11 = add i8 %tmp7, %tmp10
  %tmp12 = sext i8 %tmp11 to i32
  store i32 %tmp12, i32 addrspace(1)* %out
  %tmp13 = icmp slt i64 %tmp5, 10
  br i1 %tmp13, label %loop, label %done

done:                                             ; preds = %loop
  ret void
}

; Test moving an SMRD instruction to the VALU
; FIXME: movs can be moved before nop to reduce count
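;
; The load below is 3000 i32 elements past the pointer. On SI the offset
; goes in an SGPR as a byte count (3000 * 4 = 0x2ee0); on CI it fits the
; 32-bit SMRD literal, which is encoded in dwords (3000 = 0xbb8).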

; GCN-LABEL: {{^}}smrd_valu:
; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
; SI: s_mov_b32
; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
; SI: s_nop 3
; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]

; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
; GCN-NOHSA: buffer_store_dword [[V_OUT]]
; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
define amdgpu_kernel void @smrd_valu(i32 addrspace(4)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
entry:
  %tmp = icmp ne i32 %a, 0
  br i1 %tmp, label %if, label %else

if:                                               ; preds = %entry
  %tmp1 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %in
  br label %endif

else:                                             ; preds = %entry
  %tmp2 = getelementptr i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %in
  %tmp3 = load i32 addrspace(4)*, i32 addrspace(4)* addrspace(1)* %tmp2
  br label %endif

endif:                                            ; preds = %else, %if
  %tmp4 = phi i32 addrspace(4)* [ %tmp1, %if ], [ %tmp3, %else ]
  %tmp5 = getelementptr i32, i32 addrspace(4)* %tmp4, i32 3000
  %tmp6 = load i32, i32 addrspace(4)* %tmp5
  store i32 %tmp6, i32 addrspace(1)* %out
  ret void
}

; Test moving an SMRD with an immediate offset to the VALU
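;
; The constant part of the address (element 4 = 16 bytes) folds into the
; MUBUF immediate offset field (offset:16), so no separate v_add is needed.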

; GCN-LABEL: {{^}}smrd_valu2:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(4)* %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(4)* %in, i32 %tmp, i32 4
  %tmp3 = load i32, i32 addrspace(4)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

; Use an offset big enough to require the SMRD literal offset on CI.
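; The element offset of 5000 i32s is 5000 * 4 = 20000 = 0x4e20 bytes; it is
; materialized into an SGPR and used as the MUBUF soffset once the load is
; moved to the VALU.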
; GCN-LABEL: {{^}}smrd_valu_ci_offset:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr i32, i32 addrspace(4)* %in, i32 %tmp
  %tmp3 = getelementptr i32, i32 addrspace(4)* %tmp2, i32 5000
  %tmp4 = load i32, i32 addrspace(4)* %tmp3
  %tmp5 = add i32 %tmp4, %c
  store i32 %tmp5, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx2
; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(4)* %in, i64 %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr i64, i64 addrspace(4)* %in, i32 %tmp
  %tmp3 = getelementptr i64, i64 addrspace(4)* %tmp2, i32 5000
  %tmp4 = load i64, i64 addrspace(4)* %tmp3
  %tmp5 = or i64 %tmp4, %c
  store i64 %tmp5, i64 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(4)* %in, <4 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %in, i32 %tmp
  %tmp3 = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %tmp2, i32 1234
  %tmp4 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp3
  %tmp5 = or <4 x i32> %tmp4, %c
  store <4 x i32> %tmp5, <4 x i32> addrspace(1)* %out
  ret void
}

; Original scalar load uses SGPR offset on SI and 32-bit literal on CI.
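; Element offset 1234 of <8 x i32> is 1234 * 32 = 39488 = 0x9a40 bytes; the
; 32-byte load is split into two dwordx4 loads at 0x9a40 and 0x9a40 + 16 = 0x9a50.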

; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}

; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(4)* %in, <8 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %in, i32 %tmp
  %tmp3 = getelementptr <8 x i32>, <8 x i32> addrspace(4)* %tmp2, i32 1234
  %tmp4 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp3
  %tmp5 = or <8 x i32> %tmp4, %c
  store <8 x i32> %tmp5, <8 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:

; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
; GCN-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}

; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4
; GCN-NOHSA: buffer_store_dwordx4

; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4

; GCN: s_endpgm
define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(4)* %in, <16 x i32> %c) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %in, i32 %tmp
  %tmp3 = getelementptr <16 x i32>, <16 x i32> addrspace(4)* %tmp2, i32 1234
  %tmp4 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp3
  %tmp5 = or <16 x i32> %tmp4, %c
  store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}smrd_valu2_salu_user:
; GCN-NOHSA: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
; GCN-NOHSA: buffer_store_dword [[ADD]]
; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(4)* %in, i32 %a) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [8 x i32], [8 x i32] addrspace(4)* %in, i32 %tmp, i32 4
  %tmp3 = load i32, i32 addrspace(4)* %tmp2
  %tmp4 = add i32 %tmp3, %a
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}

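; Index 255 gives a byte offset of 1020, the largest that fits the 8-bit,
; dword-granular immediate offset of an SI SMRD instruction; it also fits
; the MUBUF immediate offset once the load is moved to the VALU.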
; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(4)* %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(4)* %in, i32 %tmp, i32 255
  %tmp3 = load i32, i32 addrspace(4)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

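; Index 256 (byte offset 1024) no longer fits the SMRD immediate offset on
; SI, but it still fits the 12-bit MUBUF immediate offset.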
; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset:
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(4)* %in) #1 {
entry:
  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = add i32 %tmp, 4
  %tmp2 = getelementptr [1024 x i32], [1024 x i32] addrspace(4)* %in, i32 %tmp, i32 256
  %tmp3 = load i32, i32 addrspace(4)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v8i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <8 x i32> addrspace(4)*
  %tmp3 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp2, align 4
  store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <8 x i32> addrspace(4)*
  %tmp3 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp2, align 4

  %elt0 = extractelement <8 x i32> %tmp3, i32 0
  %elt1 = extractelement <8 x i32> %tmp3, i32 1
  %elt2 = extractelement <8 x i32> %tmp3, i32 2
  %elt3 = extractelement <8 x i32> %tmp3, i32 3
  %elt4 = extractelement <8 x i32> %tmp3, i32 4
  %elt5 = extractelement <8 x i32> %tmp3, i32 5
  %elt6 = extractelement <8 x i32> %tmp3, i32 6
  %elt7 = extractelement <8 x i32> %tmp3, i32 7

  %add0 = add i32 %elt0, %elt1
  %add1 = add i32 %add0, %elt2
  %add2 = add i32 %add1, %elt3
  %add3 = add i32 %add2, %elt4
  %add4 = add i32 %add3, %elt5
  %add5 = add i32 %add4, %elt6
  %add6 = add i32 %add5, %elt7

  store i32 %add6, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v16i32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <16 x i32> addrspace(4)*
  %tmp3 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp2, align 4
  store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32
  ret void
}

; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: v_add_i32_e32
; GCN-NOHSA: buffer_store_dword
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(4)* nocapture readonly %in) #1 {
entry:
  %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tmp0
  %tmp2 = bitcast i32 addrspace(4)* %tmp1 to <16 x i32> addrspace(4)*
  %tmp3 = load <16 x i32>, <16 x i32> addrspace(4)* %tmp2, align 4

  %elt0 = extractelement <16 x i32> %tmp3, i32 0
  %elt1 = extractelement <16 x i32> %tmp3, i32 1
  %elt2 = extractelement <16 x i32> %tmp3, i32 2
  %elt3 = extractelement <16 x i32> %tmp3, i32 3
  %elt4 = extractelement <16 x i32> %tmp3, i32 4
  %elt5 = extractelement <16 x i32> %tmp3, i32 5
  %elt6 = extractelement <16 x i32> %tmp3, i32 6
  %elt7 = extractelement <16 x i32> %tmp3, i32 7
  %elt8 = extractelement <16 x i32> %tmp3, i32 8
  %elt9 = extractelement <16 x i32> %tmp3, i32 9
  %elt10 = extractelement <16 x i32> %tmp3, i32 10
  %elt11 = extractelement <16 x i32> %tmp3, i32 11
  %elt12 = extractelement <16 x i32> %tmp3, i32 12
  %elt13 = extractelement <16 x i32> %tmp3, i32 13
  %elt14 = extractelement <16 x i32> %tmp3, i32 14
  %elt15 = extractelement <16 x i32> %tmp3, i32 15

  %add0 = add i32 %elt0, %elt1
  %add1 = add i32 %add0, %elt2
  %add2 = add i32 %add1, %elt3
  %add3 = add i32 %add2, %elt4
  %add4 = add i32 %add3, %elt5
  %add5 = add i32 %add4, %elt6
  %add6 = add i32 %add5, %elt7
  %add7 = add i32 %add6, %elt8
  %add8 = add i32 %add7, %elt9
  %add9 = add i32 %add8, %elt10
  %add10 = add i32 %add9, %elt11
  %add11 = add i32 %add10, %elt12
  %add12 = add i32 %add11, %elt13
  %add13 = add i32 %add12, %elt14
  %add14 = add i32 %add13, %elt15

  store i32 %add14, i32 addrspace(1)* %out
  ret void
}

; Make sure we legalize VOPC operands after moving an SOPC to the VALU.
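;
; A sketch of the legalization (illustrative operands): the scalar compare
;   s_cmp_le_u32 s0, v0            ; illegal: SOPC cannot read a VGPR
; is replaced by its vector form, with the mask ANDed with exec for the
; scalar branch:
;   v_cmp_le_u32_e32 vcc, s0, v0
;   s_and_b64 vcc, exec, vcc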

; GCN-LABEL: {{^}}sopc_vopc_legalize_bug:
; GCN: s_load_dword [[SGPR:s[0-9]+]]
; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
; GCN: s_and_b64 vcc, exec, vcc
; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-NOHSA: buffer_store_dword [[ONE]]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
; GCN: {{^}}[[EXIT]]:
; GCN: s_endpgm
define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
bb3:
  %tmp0 = bitcast i32 %cond to float
  %tmp1 = fadd float %tmp0, 2.500000e-01
  %tmp2 = bitcast float %tmp1 to i32
  %tmp3 = icmp ult i32 %tmp2, %cond
  br i1 %tmp3, label %bb6, label %bb7

bb6:
  store i32 1, i32 addrspace(1)* %out
  br label %bb7

bb7:                                              ; preds = %bb6, %bb3
  ret void
}

; GCN-LABEL: {{^}}phi_visit_order:
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}}
define amdgpu_kernel void @phi_visit_order() {
bb:
  br label %bb1

bb1:
  %tmp = phi i32 [ 0, %bb ], [ %tmp5, %bb4 ]
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cnd = icmp eq i32 %tid, 0
  br i1 %cnd, label %bb4, label %bb2

bb2:
  %tmp3 = add nsw i32 %tmp, 1
  br label %bb4

bb4:
  %tmp5 = phi i32 [ %tmp3, %bb2 ], [ %tmp, %bb1 ]
  br label %bb1
}

; GCN-LABEL: {{^}}phi_imm_in_sgprs:
; GCN: s_movk_i32 [[A:s[0-9]+]], 0x400
; GCN: s_movk_i32 [[B:s[0-9]+]], 0x400
; GCN: [[LOOP_LABEL:[0-9a-zA-Z_]+]]:
; GCN: s_xor_b32 [[B]], [[B]], [[A]]
; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]]
define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
entry:
  br label %loop

loop:
  %i = phi i32 [0, %entry], [%i.add, %loop]
  %offset = phi i32 [1024, %entry], [%offset.xor, %loop]
  %offset.xor = xor i32 %offset, 1024
  %offset.i = add i32 %offset.xor, %i
  %ptr = getelementptr i32, i32 addrspace(3)* %out, i32 %offset.i
  store i32 0, i32 addrspace(3)* %ptr
  %i.add = add i32 %i, 1
  %cmp = icmp ult i32 %i.add, %cond
  br i1 %cmp, label %loop, label %exit

exit:
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }