Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
      2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
      3 
      4 ; FIXME: Broken on evergreen
      5 ; FIXME: For some reason the 8- and 16-element vectors are being stored
      6 ; as individual elements instead of with 128-bit stores.
      7 
      8 
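      9 ; FIXME: Why is the constant moved into an intermediate register and
     10 ; not directly into the vector component? The instruction patterns for
     11 ; insertelement_v4f32_0 below have no check prefix, so FileCheck skips them.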
     12 ; SI-LABEL: {{^}}insertelement_v4f32_0:
     13 ; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]:
     14 ; v_mov_b32_e32
     15 ; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
     16 ; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
     17 ; buffer_store_dwordx4 v{{[}}[[LOW_REG]]:
     18 define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
     19   %lane0_set = insertelement <4 x float> %a, float 5.0, i32 0 ; constant index: overwrite element 0 with 5.0
     20   store <4 x float> %lane0_set, <4 x float> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
     21   ret void
     22 }
     23 
     24 ; SI-LABEL: {{^}}insertelement_v4f32_1:
     25 define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
     26   %lane1_set = insertelement <4 x float> %a, float 5.0, i32 1 ; constant index: overwrite element 1 with 5.0
     27   store <4 x float> %lane1_set, <4 x float> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
     28   ret void
     29 }
     30 
     31 ; SI-LABEL: {{^}}insertelement_v4f32_2:
     32 define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
     33   %lane2_set = insertelement <4 x float> %a, float 5.0, i32 2 ; constant index: overwrite element 2 with 5.0
     34   store <4 x float> %lane2_set, <4 x float> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
     35   ret void
     36 }
     37 
     38 ; SI-LABEL: {{^}}insertelement_v4f32_3:
     39 define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
     40   %lane3_set = insertelement <4 x float> %a, float 5.0, i32 3 ; constant index: overwrite element 3 with 5.0
     41   store <4 x float> %lane3_set, <4 x float> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
     42   ret void
     43 }
     44 
     45 ; SI-LABEL: {{^}}insertelement_v4i32_0:
     46 define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
     47   %lane0_set = insertelement <4 x i32> %a, i32 999, i32 0 ; constant index: overwrite element 0 with 999
     48   store <4 x i32> %lane0_set, <4 x i32> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
     49   ret void
     50 }
     51 
     52 ; SI-LABEL: {{^}}dynamic_insertelement_v2f32:
     53 ; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
     54 ; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
     55 ; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
     56 define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
     57   %updated = insertelement <2 x float> %a, float 5.0, i32 %b ; element index %b is only known at run time
     58   store <2 x float> %updated, <2 x float> addrspace(1)* %out, align 8 ; write the whole updated vector to %out
     59   ret void
     60 }
     61 
     62 ; SI-LABEL: {{^}}dynamic_insertelement_v4f32:
     63 ; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
     64 ; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
     65 ; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
     66 define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
     67   %updated = insertelement <4 x float> %a, float 5.0, i32 %b ; element index %b is only known at run time
     68   store <4 x float> %updated, <4 x float> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
     69   ret void
     70 }
     71 
     72 ; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
     73 ; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
     74 ; SI: buffer_store_dwordx4
     75 ; SI: buffer_store_dwordx4
     76 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
     77   %updated = insertelement <8 x float> %a, float 5.0, i32 %b ; element index %b is only known at run time
     78   store <8 x float> %updated, <8 x float> addrspace(1)* %out, align 32 ; write the whole updated vector to %out
     79   ret void
     80 }
     81 
     82 ; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
     83 ; SI: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
     84 ; SI: buffer_store_dwordx4
     85 ; SI: buffer_store_dwordx4
     86 ; SI: buffer_store_dwordx4
     87 ; SI: buffer_store_dwordx4
     88 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
     89   %updated = insertelement <16 x float> %a, float 5.0, i32 %b ; element index %b is only known at run time
     90   store <16 x float> %updated, <16 x float> addrspace(1)* %out, align 64 ; write the whole updated vector to %out
     91   ret void
     92 }
     93 
     94 ; SI-LABEL: {{^}}dynamic_insertelement_v2i32:
     95 ; SI: buffer_store_dwordx2
     96 define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
     97   %updated = insertelement <2 x i32> %a, i32 5, i32 %b ; write the constant 5 at run-time index %b
     98   store <2 x i32> %updated, <2 x i32> addrspace(1)* %out, align 8 ; write the whole updated vector to %out
     99   ret void
    100 }
    101 
    102 ; SI-LABEL: {{^}}dynamic_insertelement_v4i32:
    103 ; SI: buffer_store_dwordx4
    104 define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
    105   %updated = insertelement <4 x i32> %a, i32 5, i32 %b ; write the constant 5 at run-time index %b
    106   store <4 x i32> %updated, <4 x i32> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
    107   ret void
    108 }
    109 
    110 ; SI-LABEL: {{^}}dynamic_insertelement_v8i32:
    111 ; FIXMESI: buffer_store_dwordx4
    112 ; FIXMESI: buffer_store_dwordx4
    113 define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
    114   %updated = insertelement <8 x i32> %a, i32 5, i32 %b ; write the constant 5 at run-time index %b
    115   store <8 x i32> %updated, <8 x i32> addrspace(1)* %out, align 32 ; write the whole updated vector to %out
    116   ret void
    117 }
    118 
    119 ; SI-LABEL: {{^}}dynamic_insertelement_v16i32:
    120 ; FIXMESI: buffer_store_dwordx4
    121 ; FIXMESI: buffer_store_dwordx4
    122 ; FIXMESI: buffer_store_dwordx4
    123 ; FIXMESI: buffer_store_dwordx4
    124 define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
    125   %updated = insertelement <16 x i32> %a, i32 5, i32 %b ; write the constant 5 at run-time index %b
    126   store <16 x i32> %updated, <16 x i32> addrspace(1)* %out, align 64 ; write the whole updated vector to %out
    127   ret void
    128 }
    129 
    130 
    131 ; SI-LABEL: {{^}}dynamic_insertelement_v2i16:
    132 ; FIXMESI: buffer_store_dwordx2
    133 define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
    134   %updated = insertelement <2 x i16> %a, i16 5, i32 %b ; 16-bit elements; index %b is a run-time value
    135   store <2 x i16> %updated, <2 x i16> addrspace(1)* %out, align 8 ; write the whole updated vector to %out
    136   ret void
    137 }
    138 
    139 ; SI-LABEL: {{^}}dynamic_insertelement_v4i16:
    140 ; FIXMESI: buffer_store_dwordx4
    141 define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
    142   %updated = insertelement <4 x i16> %a, i16 5, i32 %b ; 16-bit elements; index %b is a run-time value
    143   store <4 x i16> %updated, <4 x i16> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
    144   ret void
    145 }
    146 
    147 
    148 ; SI-LABEL: {{^}}dynamic_insertelement_v2i8:
    149 ; FIXMESI: BUFFER_STORE_USHORT
    150 define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
    151   %updated = insertelement <2 x i8> %a, i8 5, i32 %b ; 8-bit elements; index %b is a run-time value
    152   store <2 x i8> %updated, <2 x i8> addrspace(1)* %out, align 8 ; write the whole updated vector to %out
    153   ret void
    154 }
    155 
    156 ; SI-LABEL: {{^}}dynamic_insertelement_v4i8:
    157 ; FIXMESI: buffer_store_dword
    158 define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
    159   %updated = insertelement <4 x i8> %a, i8 5, i32 %b ; 8-bit elements; index %b is a run-time value
    160   store <4 x i8> %updated, <4 x i8> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
    161   ret void
    162 }
    163 
    164 ; SI-LABEL: {{^}}dynamic_insertelement_v8i8:
    165 ; FIXMESI: buffer_store_dwordx2
    166 define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
    167   %updated = insertelement <8 x i8> %a, i8 5, i32 %b ; 8-bit elements; index %b is a run-time value
    168   store <8 x i8> %updated, <8 x i8> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
    169   ret void
    170 }
    171 
    172 ; SI-LABEL: {{^}}dynamic_insertelement_v16i8:
    173 ; FIXMESI: buffer_store_dwordx4
    174 define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
    175   %updated = insertelement <16 x i8> %a, i8 5, i32 %b ; 8-bit elements; index %b is a run-time value
    176   store <16 x i8> %updated, <16 x i8> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
    177   ret void
    178 }
    179 
    180 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
    181 ; the compiler doesn't crash.
    182 ; SI-LABEL: {{^}}insert_split_bb:
    183 define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
    184 entry:
    185   %vec.lo = insertelement <2 x i32> undef, i32 %a, i32 0 ; element 0 is filled in before the branch
    186   %a.is.zero = icmp eq i32 %a, 0
    187   br i1 %a.is.zero, label %if, label %else
    188 
    189 if:
    190   %in.val0 = load i32, i32 addrspace(1)* %in ; element 1 comes from in[0] on this path
    191   %vec.if = insertelement <2 x i32> %vec.lo, i32 %in.val0, i32 1
    192   br label %endif
    193 
    194 else:
    195   %in.gep1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 ; element 1 comes from in[1] on this path
    196   %in.val1 = load i32, i32 addrspace(1)* %in.gep1
    197   %vec.else = insertelement <2 x i32> %vec.lo, i32 %in.val1, i32 1
    198   br label %endif
    199 
    200 endif:
    201   %vec.merged = phi <2 x i32> [%vec.if, %if], [%vec.else, %else] ; both paths share element 0 from entry
    202   store <2 x i32> %vec.merged, <2 x i32> addrspace(1)* %out
    203   ret void
    204 }
    205 
    206 ; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
    207 ; SI: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
    208 ; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
    209 ; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}
    210 
    211 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
    212 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
    213 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
    214 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
    215 
    216 ; SI: s_mov_b32 m0, [[SCALEDIDX]]
    217 ; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
    218 
    219 ; Advance to the next 32-bit element (the high half of the double).
    220 ; FIXME: Should be able to manipulate m0 directly instead of using a
    221 ; separate s_or_b32 and copy.
    222 
    223 ; SI: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
    224 ; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
    225 ; SI-DAG: s_mov_b32 m0, [[IDX1]]
    226 ; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
    227 
    228 ; SI: buffer_store_dwordx4
    229 ; SI: s_endpgm
    230 define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
    231   %updated = insertelement <2 x double> %a, double 8.000000e+00, i32 %b ; 64-bit elements; index %b is a run-time value
    232   store <2 x double> %updated, <2 x double> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
    233   ret void
    234 }
    235 
    236 ; FIXME: Inline immediate should be folded into v_movreld_b32.
    237 ; SI-LABEL: {{^}}dynamic_insertelement_v2i64:
    238 
    239 ; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
    240 ; SI-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}
    241 
    242 ; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
    243 ; SI-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
    244 
    245 ; SI: buffer_store_dwordx4
    246 ; SI: s_endpgm
    247 define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
    248   %updated = insertelement <2 x i64> %a, i64 5, i32 %b ; 64-bit elements; index %b is a run-time value
    249   store <2 x i64> %updated, <2 x i64> addrspace(1)* %out, align 8 ; write the whole updated vector to %out
    250   ret void
    251 }
    252 
    253 ; FIXME: Should be able to do without stack access. The used stack
    254 ; space is also 2x what should be required.
    255 
    256 ; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
    257 ; SI: SCRATCH_RSRC_DWORD
    258 
    259 ; Stack store
    260 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    261 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    262 
    263 ; Write element
    264 ; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    265 
    266 ; Stack reload
    267 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    268 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    269 
    270 ; Store result
    271 ; SI: buffer_store_dwordx4
    272 ; SI: buffer_store_dwordx4
    273 ; SI: s_endpgm
    274 ; SI: ScratchSize: 64
    275 
    276 define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
    277   %updated = insertelement <4 x double> %a, double 8.000000e+00, i32 %b ; 64-bit elements; index %b is a run-time value
    278   store <4 x double> %updated, <4 x double> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
    279   ret void
    280 }
    281 
    282 ; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
    283 ; SI: SCRATCH_RSRC_DWORD
    284 
    285 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    286 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    287 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
    288 ; SI-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}
    289 
    290 ; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
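    291 ; NOTE(review): the four reload patterns below repeat offsets 0 and 16, while the spill stores above use 0/16/32/48 - confirm this is intended.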
    292 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    293 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    294 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    295 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    296 
    297 ; SI: buffer_store_dwordx4
    298 ; SI: buffer_store_dwordx4
    299 ; SI: buffer_store_dwordx4
    300 ; SI: buffer_store_dwordx4
    301 ; SI: s_endpgm
    302 ; SI: ScratchSize: 128
    303 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
    304   %updated = insertelement <8 x double> %a, double 8.000000e+00, i32 %b ; 64-bit elements; index %b is a run-time value
    305   store <8 x double> %updated, <8 x double> addrspace(1)* %out, align 16 ; write the whole updated vector to %out
    306   ret void
    307 }
    308