; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s

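; Broadcast a scalar into both halves of the FMA addend. The scalar's low
; half should be selected for both lanes via op_sel_hi:[1,1,0], with no
; pack/and/shl/or instructions emitted (see the GCN-NOT checks below).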
; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg to broadcasted vector
; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
  %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before broadcast
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before and after broadcast; the two negations should cancel out.
; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
  %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Add scalar, but negate low component
; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Add scalar, but negate high component
; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before broadcast with bitcast
; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.bc = bitcast half %neg.scalar0 to i16

  %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer

  %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

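; Pack two different scalars, negating only the high one. The negation is
; currently done with a v_xor and the halves are packed with v_lshl_or_b32
; rather than being folded into source modifiers (see the FIXME below).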
; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]

; FIXME: Remove and
; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4

  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2

  %neg.scalar1 = fsub half -0.0, %scalar1
  %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
  %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

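; Negate the packed pair after building it from two scalars; the negation
; should fold into neg_lo/neg_hi on the packed operand instead of a xor.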
; FIXME: Can we avoid waitcnt between the two halves?
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
; GCN: s_waitcnt
; GCN: ds_read_u16_d16_hi [[PACKED]]

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4

  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2

  %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
  %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

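; Broadcast the negated high element of vec2; expected to fold into
; op_sel:[0,0,1] plus neg_lo/neg_hi with no shuffle or xor code.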
; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1>

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

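; Negate only the high element of vec2; expected to fold into neg_hi:[0,0,1].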
; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %vec2.elt1 = extractelement <2 x half> %vec2, i32 1
  %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1

  %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

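; Broadcast the high element of vec1; expected to fold into op_sel:[0,1].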
; GCN-LABEL: {{^}}add_vector_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4

  %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
  %result = add <2 x i16> %vec0, %vec1.elt1.broadcast

  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

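; Broadcast the high element of vec2; expected to fold into op_sel:[0,0,1].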
; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1>

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

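; Negate the vector, then re-negate the high element before inserting it back
; into the original vector; the negations cancel, so no modifiers are expected.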
; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1
  %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1
  %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

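; Swap the two halves of vec2; expected to fold into
; op_sel:[0,0,1] op_sel_hi:[1,1,0] with no shuffle code.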
; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

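; Negate and swap vec2; both should fold into op_sel/op_sel_hi plus
; neg_lo/neg_hi, with no xor emitted.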
; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2

  %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

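; Blend the negated high element into the low lane and the unmodified low
; element into the high lane; expected to fold into op_sel/op_sel_hi with
; neg_lo:[0,0,1].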
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

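; Only the low lane takes the negated element; expected to fold into
; neg_lo:[0,0,1] with no lane swap.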
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

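; Only the high lane takes the negated element; expected to fold into
; neg_hi:[0,0,1] with no lane swap.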
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

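; The low lane takes the negated high element and the high lane keeps the
; high element; expected to fold into op_sel:[0,0,1] neg_lo:[0,0,1].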
; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or
; GCN-NOT: xor

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

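; An fneg performed on the f32 before the bitcast to <2 x half> is not
; folded into per-lane source modifiers on the packed add.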
; GCN-LABEL: {{^}}bitcast_fneg_f32:
; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %f32 = load volatile float, float addrspace(3)* undef, align 4
  %neg.f32 = fsub float -0.0, %f32
  %bc = bitcast float %neg.f32 to <2 x half>
  %result = fadd <2 x half> %vec0, %bc

  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

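; The lane swap still folds into op_sel/op_sel_hi even through the f32
; bitcast; the f32 fneg itself is not folded into neg modifiers.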
; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4

  %f32 = load volatile float, float addrspace(3)* undef, align 4
  %neg.f32 = fsub float -0.0, %f32
  %bc = bitcast float %neg.f32 to <2 x half>
  %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x half> %vec0, %shuf
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

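; Repacking halves extracted from an i64 is not recognized as a lane select;
; a v_lshl_or_b32 pack remains before the packed add.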
; GCN-LABEL: {{^}}extract_from_i64:
; GCN: v_lshl_or_b32
; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  %i64 = load volatile i64, i64 addrspace(1)* undef

  %elt0 = trunc i64 %i64 to i16
  %hi = lshr i64 %i64, 16
  %elt1 = trunc i64 %hi to i16

  %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0
  %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1
  %result = add <2 x i16> %vec0, %ins1
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  ret void
}


; Bitcast is the final obstacle to identifying the same source register
; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: _or

; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %scalar0 = load volatile i16, i16 addrspace(1)* undef
  %shl = shl i16 %scalar0, 1
  %shl.bc = bitcast i16 %shl to half

  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
  %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}


; Bitcast is the final obstacle to identifying the same source register
; GCN-LABEL: {{^}}mix_elt_types_op_sel:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_b32 [[VEC2:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: _or

; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4

  %scalar0 = load volatile i16, i16 addrspace(1)* undef
  %scalar1 = load volatile half, half addrspace(1)* undef
  %shl = shl i16 %scalar0, 1
  %shl.bc = bitcast i16 %shl to half

  %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0

  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
  %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0>

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }