Home | History | Annotate | Download | only in AMDGPU
      1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
      2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
        ; Check prefixes used by the two invocations: the shared GCN prefix applies
        ; to both; SI is reserved for the default-subtarget invocation and VI for
        ; the tonga invocation.
      3 
      4 ; FIXME: Broken on evergreen
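      5 ; FIXME: For some reason the 8 and 16 vectors are being stored as
      6 ; individual elements instead of 128-bit stores.
        ; NOTE(review): the v8/v16 tests in this file expect buffer_store_dwordx4,
        ; so this FIXME may be stale - confirm before relying on it.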
      7 
      8 
      9 ; FIXME: Why is the constant moved into the intermediate register and
     10 ; not just directly into the vector component?
     11 
     12 ; GCN-LABEL: {{^}}insertelement_v4f32_0:
     13 ; GCN: s_load_dwordx4
     14 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
     15 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
     16 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
     17 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
     18 ; GCN-DAG: v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 0x40a00000
     19 ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
     20 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
     21 define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
          ; Constant-index insert: overwrite element 0 of %a with 5.0 and store the
          ; whole vector to %out.
     22   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
     23   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
     24   ret void
     25 }
     26 
     27 ; GCN-LABEL: {{^}}insertelement_v4f32_1:
     28 define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
          ; Constant-index insert: overwrite element 1 of %a with 5.0 and store the
          ; whole vector to %out. Only the function label is checked for this case.
     29   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
     30   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
     31   ret void
     32 }
     33 
     34 ; GCN-LABEL: {{^}}insertelement_v4f32_2:
     35 define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
          ; Constant-index insert: overwrite element 2 of %a with 5.0 and store the
          ; whole vector to %out. Only the function label is checked for this case.
     36   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
     37   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
     38   ret void
     39 }
     40 
     41 ; GCN-LABEL: {{^}}insertelement_v4f32_3:
     42 define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
          ; Constant-index insert: overwrite element 3 (the last one) of %a with 5.0
          ; and store the whole vector to %out. Only the function label is checked.
     43   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
     44   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
     45   ret void
     46 }
     47 
     48 ; GCN-LABEL: {{^}}insertelement_v4i32_0:
     49 define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
          ; Integer counterpart of the v4f32 case: overwrite element 0 of %a with
          ; 999 and store the whole vector to %out. Only the label is checked.
     50   %vecins = insertelement <4 x i32> %a, i32 999, i32 0
     51   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
     52   ret void
     53 }
     54 
     55 ; GCN-LABEL: {{^}}insertelement_v3f32_1:
     56 define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
          ; Three-element vector: overwrite element 1 of %a with 5.0 and store the
          ; <3 x float> result to %out (declared align 16). Only the label is checked.
     57   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
     58   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
     59   ret void
     60 }
     61 
     62 ; GCN-LABEL: {{^}}insertelement_v3f32_2:
     63 define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
          ; Three-element vector: overwrite element 2 (the last valid index) of %a
          ; with 5.0 and store the result to %out. Only the label is checked.
     64   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
     65   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
     66   ret void
     67 }
     68 
     69 ; GCN-LABEL: {{^}}insertelement_v3f32_3:
     70 define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
          ; Index 3 is out of range for a <3 x float> (valid indices are 0..2).
          ; NOTE(review): nothing beyond the label is checked, so this presumably
          ; only verifies that codegen accepts the out-of-range index - confirm.
     71   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
     72   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
     73   ret void
     74 }
     75 
     76 ; GCN-LABEL: {{^}}insertelement_to_sgpr:
     77 ; GCN-NOT: v_readfirstlane
     78 define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {
          ; Loads a <4 x i32> through an undef addrspace(2) pointer, overwrites its
          ; element 0 with 0, and passes the vector as the third argument of the
          ; gather4.lz intrinsic, whose result is returned. The check preceding this
          ; function rejects any v_readfirstlane in the generated code.
     79   %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef
     80   %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
     81   %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
     82   ret <4 x float> %tmp2
     83 }
     84 
     85 ; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
     86 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
     87 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
     88 ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
     89 define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with 5.0, then the <2 x float> is stored to %out.
     90   %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
     91   store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
     92   ret void
     93 }
     94 
     95 ; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
     96 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
     97 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
     98 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
     99 ; GCN-DAG: buffer_store_dword v
    100 define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with 5.0, then the <3 x float> is stored to %out (declared align 16).
    101   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
    102   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
    103   ret void
    104 }
    105 
    106 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
    107 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
    108 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
    109 ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
    110 define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with 5.0, then the <4 x float> is stored to %out.
    111   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
    112   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
    113   ret void
    114 }
    115 
    116 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
    117 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
    118 ; GCN: buffer_store_dwordx4
    119 ; GCN: buffer_store_dwordx4
    120 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with 5.0, then the <8 x float> is stored to %out.
    121   %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
    122   store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
    123   ret void
    124 }
    125 
    126 ; GCN-LABEL: {{^}}dynamic_insertelement_v16f32:
    127 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
    128 ; GCN: buffer_store_dwordx4
    129 ; GCN: buffer_store_dwordx4
    130 ; GCN: buffer_store_dwordx4
    131 ; GCN: buffer_store_dwordx4
    132 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with 5.0, then the <16 x float> is stored to %out.
    133   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
    134   store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
    135   ret void
    136 }
    137 
    138 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
    139 ; GCN: v_movreld_b32
    140 ; GCN: buffer_store_dwordx2
    141 define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with the integer 5, then the <2 x i32> is stored to %out.
    142   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
    143   store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
    144   ret void
    145 }
    146 
    147 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
    148 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 5
    149 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
    150 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
    151 ; GCN-DAG: buffer_store_dword v
    152 define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with the integer 5, then the <3 x i32> is stored to %out (align 16).
    153   %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
    154   store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
    155   ret void
    156 }
    157 
    158 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
    159 ; GCN: v_movreld_b32
    160 ; GCN: buffer_store_dwordx4
    161 define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with the integer 5, then the <4 x i32> is stored to %out.
    162   %vecins = insertelement <4 x i32> %a, i32 5, i32 %b
    163   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
    164   ret void
    165 }
    166 
    167 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
    168 ; GCN: v_movreld_b32
    169 ; GCN: buffer_store_dwordx4
    170 ; GCN: buffer_store_dwordx4
    171 define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with the integer 5, then the <8 x i32> is stored to %out.
    172   %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
    173   store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
    174   ret void
    175 }
    176 
    177 ; GCN-LABEL: {{^}}dynamic_insertelement_v16i32:
    178 ; GCN: v_movreld_b32
    179 ; GCN: buffer_store_dwordx4
    180 ; GCN: buffer_store_dwordx4
    181 ; GCN: buffer_store_dwordx4
    182 ; GCN: buffer_store_dwordx4
    183 define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
          ; Runtime-index insert: the element of %a selected by %b is overwritten
          ; with the integer 5, then the <16 x i32> is stored to %out.
    184   %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
    185   store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
    186   ret void
    187 }
    188 
    189 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i16:
    190 define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i16 constant 5 into a <2 x i16>; the result
          ; is stored to %out. Only the function label is checked for this case.
    191   %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
    192   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
    193   ret void
    194 }
    195 
    196 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
    197 define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i16 constant 5 into a <3 x i16>; the result
          ; is stored to %out. Only the function label is checked for this case.
    198   %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
    199   store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
    200   ret void
    201 }
    202 
    203 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i16:
    204 ; GCN: buffer_load_ushort v{{[0-9]+}}, off
    205 ; GCN: buffer_load_ushort v{{[0-9]+}}, off
    206 ; GCN: buffer_load_ushort v{{[0-9]+}}, off
    207 ; GCN: buffer_load_ushort v{{[0-9]+}}, off
    208 
    209 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:6
    210 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4
    211 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
    212 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
    213 ; GCN: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
    214 
    215 ; GCN: buffer_load_ushort
    216 ; GCN: buffer_load_ushort
    217 ; GCN: buffer_load_ushort
    218 ; GCN: buffer_load_ushort
    219 
    220 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
    221 define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i16 constant 5 into a <4 x i16>; the result
          ; is stored to %out. The checks preceding this function expect 16-bit
          ; "offen" buffer stores and loads around the insert.
    222   %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
    223   store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
    224   ret void
    225 }
    226 
    227 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
    228 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    229 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    230 
    231 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
    232 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
    233 
    234 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
    235 
    236 ; GCN: buffer_load_ubyte
    237 ; GCN: buffer_load_ubyte
    238 
    239 ; GCN: buffer_store_short v{{[0-9]+}}, off
    240 define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i8 constant 5 into a <2 x i8>; the result is
          ; stored to %out. The checks preceding this function expect byte-sized
          ; "offen" buffer stores and loads around the insert.
    241   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
    242   store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
    243   ret void
    244 }
    245 
    246 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
    247 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    248 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    249 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    250 
    251 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
    252 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
    253 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
    254 
    255 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
    256 
    257 ; GCN: buffer_load_ubyte
    258 ; GCN: buffer_load_ubyte
    259 ; GCN: buffer_load_ubyte
    260 
    261 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off
    262 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
    263 define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i8 constant 5 into a <3 x i8>; the result is
          ; stored to %out. The checks preceding this function expect the final
          ; result to be written with one byte store and one short store.
    264   %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
    265   store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
    266   ret void
    267 }
    268 
    269 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
    270 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    271 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    272 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    273 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
    274 
    275 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:3
    276 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:2
    277 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:1
    278 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
    279 
    280 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
    281 
    282 ; GCN: buffer_load_ubyte
    283 ; GCN: buffer_load_ubyte
    284 ; GCN: buffer_load_ubyte
    285 ; GCN: buffer_load_ubyte
    286 
    287 ; GCN: buffer_store_dword v{{[0-9]+}}, off
    288 define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i8 constant 5 into a <4 x i8>; the result is
          ; stored to %out. The checks preceding this function expect the final
          ; result to be written with a single dword store.
    289   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
    290   store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
    291   ret void
    292 }
    293 
    294 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
    295 define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i8 constant 5 into an <8 x i8>; the result
          ; is stored to %out. Only the function label is checked for this case.
    296   %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
    297   store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
    298   ret void
    299 }
    300 
    301 ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
    302 define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i8 constant 5 into a <16 x i8>; the result
          ; is stored to %out. Only the function label is checked for this case.
    303   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
    304   store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
    305   ret void
    306 }
    307 
    308 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
    309 ; the compiler doesn't crash.
    310 ; GCN-LABEL: {{^}}insert_split_bb:
    311 define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
          ; Builds a <2 x i32> across basic blocks: element 0 is set to %a in the
          ; entry block, element 1 is filled in one of two successor blocks, and the
          ; two variants are merged with a phi before the store to %out.
          ; The %b argument is not used by the body.
    312 entry:
    313   %0 = insertelement <2 x i32> undef, i32 %a, i32 0
    314   %1 = icmp eq i32 %a, 0
    315   br i1 %1, label %if, label %else ; %a == 0 selects the 'if' block
    316 
    317 if:
    318   %2 = load i32, i32 addrspace(1)* %in ; element 1 comes from %in[0]
    319   %3 = insertelement <2 x i32> %0, i32 %2, i32 1
    320   br label %endif
    321 
    322 else:
    323   %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
    324   %5 = load i32, i32 addrspace(1)* %4 ; element 1 comes from %in[1]
    325   %6 = insertelement <2 x i32> %0, i32 %5, i32 1
    326   br label %endif
    327 
    328 endif:
    329   %7 = phi <2 x i32> [%3, %if], [%6, %else] ; pick the vector built on the taken path
    330   store <2 x i32> %7, <2 x i32> addrspace(1)* %out
    331   ret void
    332 }
    333 
    334 ; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
    335 ; GCN: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
    336 ; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
    337 ; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}}
    338 
    339 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
    340 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
    341 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
    342 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
    343 
    344 ; GCN: s_mov_b32 m0, [[SCALEDIDX]]
    345 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
    346 
    347 ; Increment to next element.
    348 ; FIXME: Should be able to manipulate m0 directly instead of add and
    349 ; copy.
    350 
    351 ; FIXME: Should avoid resetting m0 to same value
    352 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
    353 ; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
    354 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
    355 
    356 ; GCN: buffer_store_dwordx4
    357 ; GCN: s_endpgm
    358 define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
          ; Runtime-index insert of the double constant 8.0 into a <2 x double>; the
          ; result is stored to %out. The checks preceding this function expect the
          ; 64-bit element to be written as two 32-bit v_movreld moves.
    359   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
    360   store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
    361   ret void
    362 }
    363 
    364 ; FIXME: Inline immediate should be folded into v_movreld_b32.
    365 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:
    366 
    367 ; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 5{{$}}
    368 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0{{$}}
    369 
    370 ; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]]
    371 ; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
    372 
    373 ; GCN: buffer_store_dwordx4
    374 ; GCN: s_endpgm
    375 define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i64 constant 5 into a <2 x i64>; the result
          ; is stored to %out. The store is declared with align 8.
    376   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
    377   store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
    378   ret void
    379 }
    380 
    381 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
    382 define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
          ; Runtime-index insert of the i64 constant 5 into a <3 x i64>; the result
          ; is stored to %out. Only the function label is checked for this case.
    383   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
    384   store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
    385   ret void
    386 }
    387 
    388 ; FIXME: Should be able to do without stack access. The used stack
    389 ; space is also 2x what should be required.
    390 
    391 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
    392 ; GCN: SCRATCH_RSRC_DWORD
    393 
    394 ; Stack store
    395 
    396 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    397 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    398 
    399 ; Write element
    400 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    401 
    402 ; Stack reload
    403 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    404 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    405 
    406 ; Store result
    407 ; GCN: buffer_store_dwordx4
    408 ; GCN: buffer_store_dwordx4
    409 ; GCN: s_endpgm
    410 ; GCN: ScratchSize: 64
    411 
    412 define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
          ; Runtime-index insert of the double constant 8.0 into a <4 x double>; the
          ; result is stored to %out. The checks preceding this function expect the
          ; insert to go through scratch memory (64 bytes of scratch).
    413   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
    414   store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
    415   ret void
    416 }
    417 
    418 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
    419 ; GCN: SCRATCH_RSRC_DWORD
    420 
        ; Stack store (four 16-byte pieces at offsets 0, 16, 32 and 48)
    421 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    422 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    423 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:32{{$}}
    424 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:48{{$}}
    425 
        ; Write element
    426 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    427 
        ; Stack reload
        ; NOTE(review): the four reload patterns repeat "offset:16" and the
        ; no-offset form twice, whereas the stack stores above cover offsets 0, 16,
        ; 32 and 48. Confirm against llc output whether the last two patterns were
        ; meant to be offset:32 and offset:48.
    428 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    429 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    430 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:16{{$}}
    431 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
    432 
        ; Store result
    433 ; GCN: buffer_store_dwordx4
    434 ; GCN: buffer_store_dwordx4
    435 ; GCN: buffer_store_dwordx4
    436 ; GCN: buffer_store_dwordx4
    437 ; GCN: s_endpgm
    438 ; GCN: ScratchSize: 128
    439 define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
          ; Runtime-index insert of the double constant 8.0 into an <8 x double>;
          ; the result is stored to %out with align 16.
    440   %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
    441   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
    442   ret void
    443 }
    444 
    445 declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone ; intrinsic called by @insertelement_to_sgpr
    446