Home | History | Annotate | Download | only in AArch64
      1 ; RUN: llc < %s -asm-verbose=false -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
      2 
      3 
      4 define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
        ; Loads two <8 x i8> vectors and passes them to the smull.v8i16 intrinsic;
        ; FileCheck looks for smull.8h in the emitted assembly.
      5 ;CHECK-LABEL: smull8h:
      6 ;CHECK: smull.8h
      7   %lhs = load <8 x i8>, <8 x i8>* %A
      8   %rhs = load <8 x i8>, <8 x i8>* %B
      9   %prod = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
     10   ret <8 x i16> %prod
     11 }
     12 
     13 define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; Loads two <4 x i16> vectors and passes them to the smull.v4i32 intrinsic;
        ; FileCheck looks for smull.4s.
     14 ;CHECK-LABEL: smull4s:
     15 ;CHECK: smull.4s
     16   %lhs = load <4 x i16>, <4 x i16>* %A
     17   %rhs = load <4 x i16>, <4 x i16>* %B
     18   %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
     19   ret <4 x i32> %prod
     20 }
     21 
     22 define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; Loads two <2 x i32> vectors and passes them to the smull.v2i64 intrinsic;
        ; FileCheck looks for smull.2d.
     23 ;CHECK-LABEL: smull2d:
     24 ;CHECK: smull.2d
     25   %lhs = load <2 x i32>, <2 x i32>* %A
     26   %rhs = load <2 x i32>, <2 x i32>* %B
     27   %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
     28   ret <2 x i64> %prod
     29 }
     30 
     31 declare <8 x i16>  @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
        ; Declarations of the smull intrinsic overloads (v8i16, v4i32, v2i64) called by
        ; the tests in this file; each takes two vectors and returns one with elements
        ; twice as wide.
     32 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
     33 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
     34 
     35 define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
        ; Loads two <8 x i8> vectors and passes them to the umull.v8i16 intrinsic;
        ; FileCheck looks for umull.8h.
     36 ;CHECK-LABEL: umull8h:
     37 ;CHECK: umull.8h
     38   %lhs = load <8 x i8>, <8 x i8>* %A
     39   %rhs = load <8 x i8>, <8 x i8>* %B
     40   %prod = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
     41   ret <8 x i16> %prod
     42 }
     43 
     44 define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; Loads two <4 x i16> vectors and passes them to the umull.v4i32 intrinsic;
        ; FileCheck looks for umull.4s.
     45 ;CHECK-LABEL: umull4s:
     46 ;CHECK: umull.4s
     47   %lhs = load <4 x i16>, <4 x i16>* %A
     48   %rhs = load <4 x i16>, <4 x i16>* %B
     49   %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
     50   ret <4 x i32> %prod
     51 }
     52 
     53 define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; Loads two <2 x i32> vectors and passes them to the umull.v2i64 intrinsic;
        ; FileCheck looks for umull.2d.
     54 ;CHECK-LABEL: umull2d:
     55 ;CHECK: umull.2d
     56   %lhs = load <2 x i32>, <2 x i32>* %A
     57   %rhs = load <2 x i32>, <2 x i32>* %B
     58   %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
     59   ret <2 x i64> %prod
     60 }
     61 
     62 declare <8 x i16>  @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
        ; Declarations of the umull intrinsic overloads (v8i16, v4i32, v2i64) called by
        ; the tests in this file; each takes two vectors and returns one with elements
        ; twice as wide.
     63 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
     64 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
     65 
     66 define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; Loads two <4 x i16> vectors and passes them to the sqdmull.v4i32 intrinsic;
        ; FileCheck looks for sqdmull.4s.
     67 ;CHECK-LABEL: sqdmull4s:
     68 ;CHECK: sqdmull.4s
     69   %lhs = load <4 x i16>, <4 x i16>* %A
     70   %rhs = load <4 x i16>, <4 x i16>* %B
     71   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
     72   ret <4 x i32> %prod
     73 }
     74 
     75 define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; Loads two <2 x i32> vectors and passes them to the sqdmull.v2i64 intrinsic;
        ; FileCheck looks for sqdmull.2d.
     76 ;CHECK-LABEL: sqdmull2d:
     77 ;CHECK: sqdmull.2d
     78   %lhs = load <2 x i32>, <2 x i32>* %A
     79   %rhs = load <2 x i32>, <2 x i32>* %B
     80   %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
     81   ret <2 x i64> %prod
     82 }
     83 
     84 define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Loads two <8 x i16> vectors, extracts elements 4..7 of each with a
        ; shufflevector, and passes those halves to sqdmull.v4i32.
        ; The CHECK line expects the mnemonic sqdmull.4s.
     85 ;CHECK-LABEL: sqdmull2_4s:
     86 ;CHECK: sqdmull.4s
     87   %full.lhs = load <8 x i16>, <8 x i16>* %A
     88   %full.rhs = load <8 x i16>, <8 x i16>* %B
     89   %hi.lhs = shufflevector <8 x i16> %full.lhs, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     90   %hi.rhs = shufflevector <8 x i16> %full.rhs, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     91   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi.lhs, <4 x i16> %hi.rhs)
     92   ret <4 x i32> %prod
     93 }
     94 
     95 define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; Loads two <4 x i32> vectors, extracts elements 2..3 of each with a
        ; shufflevector, and passes those halves to sqdmull.v2i64.
        ; The CHECK line expects the mnemonic sqdmull.2d.
     96 ;CHECK-LABEL: sqdmull2_2d:
     97 ;CHECK: sqdmull.2d
     98   %full.lhs = load <4 x i32>, <4 x i32>* %A
     99   %full.rhs = load <4 x i32>, <4 x i32>* %B
    100   %hi.lhs = shufflevector <4 x i32> %full.lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    101   %hi.rhs = shufflevector <4 x i32> %full.rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    102   %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi.lhs, <2 x i32> %hi.rhs)
    103   ret <2 x i64> %prod
    104 }
    105 
    106 
    107 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
        ; Declarations of the sqdmull intrinsic overloads (v4i32, v2i64) called by the
        ; sqdmull*, sqdmlal* and sqdmlsl* tests in this file.
    108 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
    109 
    110 define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
        ; Loads two <8 x i8> vectors and passes them to the pmull.v8i16 intrinsic;
        ; FileCheck looks for pmull.8h.
    111 ;CHECK-LABEL: pmull8h:
    112 ;CHECK: pmull.8h
    113   %lhs = load <8 x i8>, <8 x i8>* %A
    114   %rhs = load <8 x i8>, <8 x i8>* %B
    115   %prod = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs, <8 x i8> %rhs)
    116   ret <8 x i16> %prod
    117 }
    118 
    119 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
        ; Declaration of the pmull intrinsic (v8i16 overload) called by @pmull8h.
    120 
    121 define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; Loads two <4 x i16> vectors and passes them to the sqdmulh.v4i16 intrinsic;
        ; FileCheck looks for sqdmulh.4h.
    122 ;CHECK-LABEL: sqdmulh_4h:
    123 ;CHECK: sqdmulh.4h
    124   %lhs = load <4 x i16>, <4 x i16>* %A
    125   %rhs = load <4 x i16>, <4 x i16>* %B
    126   %res = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
    127   ret <4 x i16> %res
    128 }
    129 
    130 define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Loads two <8 x i16> vectors and passes them to the sqdmulh.v8i16 intrinsic;
        ; FileCheck looks for sqdmulh.8h.
    131 ;CHECK-LABEL: sqdmulh_8h:
    132 ;CHECK: sqdmulh.8h
    133   %lhs = load <8 x i16>, <8 x i16>* %A
    134   %rhs = load <8 x i16>, <8 x i16>* %B
    135   %res = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
    136   ret <8 x i16> %res
    137 }
    138 
    139 define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; Loads two <2 x i32> vectors and passes them to the sqdmulh.v2i32 intrinsic;
        ; FileCheck looks for sqdmulh.2s.
    140 ;CHECK-LABEL: sqdmulh_2s:
    141 ;CHECK: sqdmulh.2s
    142   %lhs = load <2 x i32>, <2 x i32>* %A
    143   %rhs = load <2 x i32>, <2 x i32>* %B
    144   %res = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
    145   ret <2 x i32> %res
    146 }
    147 
    148 define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; Loads two <4 x i32> vectors and passes them to the sqdmulh.v4i32 intrinsic;
        ; FileCheck looks for sqdmulh.4s.
    149 ;CHECK-LABEL: sqdmulh_4s:
    150 ;CHECK: sqdmulh.4s
    151   %lhs = load <4 x i32>, <4 x i32>* %A
    152   %rhs = load <4 x i32>, <4 x i32>* %B
    153   %res = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
    154   ret <4 x i32> %res
    155 }
    156 
    157 define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
        ; Scalar variant: loads two i32 values and passes them to sqdmulh.i32.
        ; The CHECK line expects the s-register form with s0 as the destination.
    158 ;CHECK-LABEL: sqdmulh_1s:
    159 ;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
    160   %lhs = load i32, i32* %A
    161   %rhs = load i32, i32* %B
    162   %res = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %lhs, i32 %rhs)
    163   ret i32 %res
    164 }
    165 
    166 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
        ; Declarations of the sqdmulh intrinsic overloads (four vector types plus the
        ; scalar i32 form) called by the sqdmulh_* tests; result type equals operand type.
    167 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    168 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    169 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    170 declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
    171 
    172 define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; Loads two <4 x i16> vectors and passes them to the sqrdmulh.v4i16 intrinsic;
        ; FileCheck looks for sqrdmulh.4h.
    173 ;CHECK-LABEL: sqrdmulh_4h:
    174 ;CHECK: sqrdmulh.4h
    175   %lhs = load <4 x i16>, <4 x i16>* %A
    176   %rhs = load <4 x i16>, <4 x i16>* %B
    177   %res = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
    178   ret <4 x i16> %res
    179 }
    180 
    181 define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Loads two <8 x i16> vectors and passes them to the sqrdmulh.v8i16 intrinsic;
        ; FileCheck looks for sqrdmulh.8h.
    182 ;CHECK-LABEL: sqrdmulh_8h:
    183 ;CHECK: sqrdmulh.8h
    184   %lhs = load <8 x i16>, <8 x i16>* %A
    185   %rhs = load <8 x i16>, <8 x i16>* %B
    186   %res = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
    187   ret <8 x i16> %res
    188 }
    189 
    190 define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; Loads two <2 x i32> vectors and passes them to the sqrdmulh.v2i32 intrinsic;
        ; FileCheck looks for sqrdmulh.2s.
    191 ;CHECK-LABEL: sqrdmulh_2s:
    192 ;CHECK: sqrdmulh.2s
    193   %lhs = load <2 x i32>, <2 x i32>* %A
    194   %rhs = load <2 x i32>, <2 x i32>* %B
    195   %res = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
    196   ret <2 x i32> %res
    197 }
    198 
    199 define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; Loads two <4 x i32> vectors and passes them to the sqrdmulh.v4i32 intrinsic;
        ; FileCheck looks for sqrdmulh.4s.
    200 ;CHECK-LABEL: sqrdmulh_4s:
    201 ;CHECK: sqrdmulh.4s
    202   %lhs = load <4 x i32>, <4 x i32>* %A
    203   %rhs = load <4 x i32>, <4 x i32>* %B
    204   %res = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
    205   ret <4 x i32> %res
    206 }
    207 
    208 define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
        ; Scalar variant: loads two i32 values and passes them to sqrdmulh.i32.
        ; The CHECK line expects the s-register form with s0 as the destination.
    209 ;CHECK-LABEL: sqrdmulh_1s:
    210 ;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
    211   %lhs = load i32, i32* %A
    212   %rhs = load i32, i32* %B
    213   %res = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %lhs, i32 %rhs)
    214   ret i32 %res
    215 }
    216 
    217 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
        ; Declarations of the sqrdmulh intrinsic overloads (four vector types plus the
        ; scalar i32 form) called by the sqrdmulh_* tests; result type equals operand type.
    218 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    219 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    220 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    221 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
    222 
    223 define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
        ; Loads two <2 x float> vectors and passes them to the fmulx.v2f32 intrinsic;
        ; FileCheck looks for fmulx.2s.
    224 ;CHECK-LABEL: fmulx_2s:
    225 ;CHECK: fmulx.2s
    226   %lhs = load <2 x float>, <2 x float>* %A
    227   %rhs = load <2 x float>, <2 x float>* %B
    228   %res = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
    229   ret <2 x float> %res
    230 }
    231 
    232 define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
        ; Loads two <4 x float> vectors and passes them to the fmulx.v4f32 intrinsic;
        ; FileCheck looks for fmulx.4s.
    233 ;CHECK-LABEL: fmulx_4s:
    234 ;CHECK: fmulx.4s
    235   %lhs = load <4 x float>, <4 x float>* %A
    236   %rhs = load <4 x float>, <4 x float>* %B
    237   %res = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
    238   ret <4 x float> %res
    239 }
    240 
    241 define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
        ; Loads two <2 x double> vectors and passes them to the fmulx.v2f64 intrinsic;
        ; FileCheck looks for fmulx.2d.
    242 ;CHECK-LABEL: fmulx_2d:
    243 ;CHECK: fmulx.2d
    244   %lhs = load <2 x double>, <2 x double>* %A
    245   %rhs = load <2 x double>, <2 x double>* %B
    246   %res = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
    247   ret <2 x double> %res
    248 }
    249 
    250 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
        ; Declarations of the fmulx intrinsic overloads (v2f32, v4f32, v2f64) called by
        ; the fmulx tests in this file.
    251 declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
    252 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
    253 
    254 define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; smull.v4i32 of the vectors at %A and %B, added to the accumulator loaded
        ; from %C; FileCheck looks for smlal.4s.
    255 ;CHECK-LABEL: smlal4s:
    256 ;CHECK: smlal.4s
    257   %lhs = load <4 x i16>, <4 x i16>* %A
    258   %rhs = load <4 x i16>, <4 x i16>* %B
    259   %acc = load <4 x i32>, <4 x i32>* %C
    260   %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
    261   %sum = add <4 x i32> %acc, %prod
    262   ret <4 x i32> %sum
    263 }
    264 
    265 define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; smull.v2i64 of the vectors at %A and %B, added to the accumulator loaded
        ; from %C; FileCheck looks for smlal.2d.
    266 ;CHECK-LABEL: smlal2d:
    267 ;CHECK: smlal.2d
    268   %lhs = load <2 x i32>, <2 x i32>* %A
    269   %rhs = load <2 x i32>, <2 x i32>* %B
    270   %acc = load <2 x i64>, <2 x i64>* %C
    271   %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
    272   %sum = add <2 x i64> %acc, %prod
    273   ret <2 x i64> %sum
    274 }
    275 
    276 define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; smull.v4i32 of the vectors at %A and %B, subtracted from the accumulator
        ; loaded from %C; FileCheck looks for smlsl.4s.
    277 ;CHECK-LABEL: smlsl4s:
    278 ;CHECK: smlsl.4s
    279   %lhs = load <4 x i16>, <4 x i16>* %A
    280   %rhs = load <4 x i16>, <4 x i16>* %B
    281   %acc = load <4 x i32>, <4 x i32>* %C
    282   %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
    283   %diff = sub <4 x i32> %acc, %prod
    284   ret <4 x i32> %diff
    285 }
    286 
    287 define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; smull.v2i64 of the vectors at %A and %B, subtracted from the accumulator
        ; loaded from %C; FileCheck looks for smlsl.2d.
    288 ;CHECK-LABEL: smlsl2d:
    289 ;CHECK: smlsl.2d
    290   %lhs = load <2 x i32>, <2 x i32>* %A
    291   %rhs = load <2 x i32>, <2 x i32>* %B
    292   %acc = load <2 x i64>, <2 x i64>* %C
    293   %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
    294   %diff = sub <2 x i64> %acc, %prod
    295   ret <2 x i64> %diff
    296 }
    297 
    298 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
        ; Declarations of the sqadd/sqsub intrinsic overloads (v4i32, v2i64) that the
        ; sqdmlal*/sqdmlsl* tests apply to an accumulator and an sqdmull result.
        ; NOTE(review): unlike the other declarations in this file these carry no
        ; "nounwind readnone" attributes.
    299 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
    300 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
    301 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
    302 
    303 define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; sqdmull.v4i32 of the vectors at %A and %B, combined with the accumulator
        ; from %C through sqadd.v4i32; FileCheck looks for sqdmlal.4s.
    304 ;CHECK-LABEL: sqdmlal4s:
    305 ;CHECK: sqdmlal.4s
    306   %lhs = load <4 x i16>, <4 x i16>* %A
    307   %rhs = load <4 x i16>, <4 x i16>* %B
    308   %acc = load <4 x i32>, <4 x i32>* %C
    309   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
    310   %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
    311   ret <4 x i32> %res
    312 }
    313 
    314 define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; sqdmull.v2i64 of the vectors at %A and %B, combined with the accumulator
        ; from %C through sqadd.v2i64; FileCheck looks for sqdmlal.2d.
    315 ;CHECK-LABEL: sqdmlal2d:
    316 ;CHECK: sqdmlal.2d
    317   %lhs = load <2 x i32>, <2 x i32>* %A
    318   %rhs = load <2 x i32>, <2 x i32>* %B
    319   %acc = load <2 x i64>, <2 x i64>* %C
    320   %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
    321   %res = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %prod)
    322   ret <2 x i64> %res
    323 }
    324 
    325 define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
        ; Extracts elements 4..7 of the <8 x i16> vectors at %A and %B, multiplies
        ; them with sqdmull.v4i32 and combines the product with the accumulator from
        ; %C through sqadd.v4i32. The CHECK line expects sqdmlal.4s.
    326 ;CHECK-LABEL: sqdmlal2_4s:
    327 ;CHECK: sqdmlal.4s
    328   %full.lhs = load <8 x i16>, <8 x i16>* %A
    329   %full.rhs = load <8 x i16>, <8 x i16>* %B
    330   %acc = load <4 x i32>, <4 x i32>* %C
    331   %hi.lhs = shufflevector <8 x i16> %full.lhs, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    332   %hi.rhs = shufflevector <8 x i16> %full.rhs, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    333   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi.lhs, <4 x i16> %hi.rhs)
    334   %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
    335   ret <4 x i32> %res
    336 }
    337 
    338 define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
        ; Extracts elements 2..3 of the <4 x i32> vectors at %A and %B, multiplies
        ; them with sqdmull.v2i64 and combines the product with the accumulator from
        ; %C through sqadd.v2i64. The CHECK line expects sqdmlal.2d.
    339 ;CHECK-LABEL: sqdmlal2_2d:
    340 ;CHECK: sqdmlal.2d
    341   %full.lhs = load <4 x i32>, <4 x i32>* %A
    342   %full.rhs = load <4 x i32>, <4 x i32>* %B
    343   %acc = load <2 x i64>, <2 x i64>* %C
    344   %hi.lhs = shufflevector <4 x i32> %full.lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    345   %hi.rhs = shufflevector <4 x i32> %full.rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    346   %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi.lhs, <2 x i32> %hi.rhs)
    347   %res = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %prod)
    348   ret <2 x i64> %res
    349 }
    350 
    351 define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; sqdmull.v4i32 of the vectors at %A and %B, combined with the accumulator
        ; from %C through sqsub.v4i32; FileCheck looks for sqdmlsl.4s.
    352 ;CHECK-LABEL: sqdmlsl4s:
    353 ;CHECK: sqdmlsl.4s
    354   %lhs = load <4 x i16>, <4 x i16>* %A
    355   %rhs = load <4 x i16>, <4 x i16>* %B
    356   %acc = load <4 x i32>, <4 x i32>* %C
    357   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
    358   %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
    359   ret <4 x i32> %res
    360 }
    361 
    362 define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; sqdmull.v2i64 of the vectors at %A and %B, combined with the accumulator
        ; from %C through sqsub.v2i64; FileCheck looks for sqdmlsl.2d.
    363 ;CHECK-LABEL: sqdmlsl2d:
    364 ;CHECK: sqdmlsl.2d
    365   %lhs = load <2 x i32>, <2 x i32>* %A
    366   %rhs = load <2 x i32>, <2 x i32>* %B
    367   %acc = load <2 x i64>, <2 x i64>* %C
    368   %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
    369   %res = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %prod)
    370   ret <2 x i64> %res
    371 }
    372 
    373 define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
        ; Extracts elements 4..7 of the <8 x i16> vectors at %A and %B, multiplies
        ; them with sqdmull.v4i32 and combines the product with the accumulator from
        ; %C through sqsub.v4i32. The CHECK line expects sqdmlsl.4s.
    374 ;CHECK-LABEL: sqdmlsl2_4s:
    375 ;CHECK: sqdmlsl.4s
    376   %full.lhs = load <8 x i16>, <8 x i16>* %A
    377   %full.rhs = load <8 x i16>, <8 x i16>* %B
    378   %acc = load <4 x i32>, <4 x i32>* %C
    379   %hi.lhs = shufflevector <8 x i16> %full.lhs, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    380   %hi.rhs = shufflevector <8 x i16> %full.rhs, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    381   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi.lhs, <4 x i16> %hi.rhs)
    382   %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
    383   ret <4 x i32> %res
    384 }
    385 
    386 define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
        ; Extracts elements 2..3 of the <4 x i32> vectors at %A and %B, multiplies
        ; them with sqdmull.v2i64 and combines the product with the accumulator from
        ; %C through sqsub.v2i64. The CHECK line expects sqdmlsl.2d.
    387 ;CHECK-LABEL: sqdmlsl2_2d:
    388 ;CHECK: sqdmlsl.2d
    389   %full.lhs = load <4 x i32>, <4 x i32>* %A
    390   %full.rhs = load <4 x i32>, <4 x i32>* %B
    391   %acc = load <2 x i64>, <2 x i64>* %C
    392   %hi.lhs = shufflevector <4 x i32> %full.lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    393   %hi.rhs = shufflevector <4 x i32> %full.rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    394   %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi.lhs, <2 x i32> %hi.rhs)
    395   %res = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %prod)
    396   ret <2 x i64> %res
    397 }
    398 
    399 define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; umull.v4i32 of the vectors at %A and %B, added to the accumulator loaded
        ; from %C; FileCheck looks for umlal.4s.
    400 ;CHECK-LABEL: umlal4s:
    401 ;CHECK: umlal.4s
    402   %lhs = load <4 x i16>, <4 x i16>* %A
    403   %rhs = load <4 x i16>, <4 x i16>* %B
    404   %acc = load <4 x i32>, <4 x i32>* %C
    405   %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
    406   %sum = add <4 x i32> %acc, %prod
    407   ret <4 x i32> %sum
    408 }
    409 
    410 define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; umull.v2i64 of the vectors at %A and %B, added to the accumulator loaded
        ; from %C; FileCheck looks for umlal.2d.
    411 ;CHECK-LABEL: umlal2d:
    412 ;CHECK: umlal.2d
    413   %lhs = load <2 x i32>, <2 x i32>* %A
    414   %rhs = load <2 x i32>, <2 x i32>* %B
    415   %acc = load <2 x i64>, <2 x i64>* %C
    416   %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
    417   %sum = add <2 x i64> %acc, %prod
    418   ret <2 x i64> %sum
    419 }
    420 
    421 define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; umull.v4i32 of the vectors at %A and %B, subtracted from the accumulator
        ; loaded from %C; FileCheck looks for umlsl.4s.
    422 ;CHECK-LABEL: umlsl4s:
    423 ;CHECK: umlsl.4s
    424   %lhs = load <4 x i16>, <4 x i16>* %A
    425   %rhs = load <4 x i16>, <4 x i16>* %B
    426   %acc = load <4 x i32>, <4 x i32>* %C
    427   %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
    428   %diff = sub <4 x i32> %acc, %prod
    429   ret <4 x i32> %diff
    430 }
    431 
    432 define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; umull.v2i64 of the vectors at %A and %B, subtracted from the accumulator
        ; loaded from %C; FileCheck looks for umlsl.2d.
    433 ;CHECK-LABEL: umlsl2d:
    434 ;CHECK: umlsl.2d
    435   %lhs = load <2 x i32>, <2 x i32>* %A
    436   %rhs = load <2 x i32>, <2 x i32>* %B
    437   %acc = load <2 x i64>, <2 x i64>* %C
    438   %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs)
    439   %diff = sub <2 x i64> %acc, %prod
    440   ret <2 x i64> %diff
    441 }
    442 
    443 define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
        ; llvm.fma.v2f32 with multiplicands loaded from %A and %B and the addend
        ; loaded from %C; FileCheck looks for fmla.2s.
    444 ;CHECK-LABEL: fmla_2s:
    445 ;CHECK: fmla.2s
    446   %mul.a = load <2 x float>, <2 x float>* %A
    447   %mul.b = load <2 x float>, <2 x float>* %B
    448   %addend = load <2 x float>, <2 x float>* %C
    449   %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %mul.a, <2 x float> %mul.b, <2 x float> %addend)
    450   ret <2 x float> %fused
    451 }
    452 
    453 define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
        ; llvm.fma.v4f32 with multiplicands loaded from %A and %B and the addend
        ; loaded from %C; FileCheck looks for fmla.4s.
    454 ;CHECK-LABEL: fmla_4s:
    455 ;CHECK: fmla.4s
    456   %mul.a = load <4 x float>, <4 x float>* %A
    457   %mul.b = load <4 x float>, <4 x float>* %B
    458   %addend = load <4 x float>, <4 x float>* %C
    459   %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %mul.a, <4 x float> %mul.b, <4 x float> %addend)
    460   ret <4 x float> %fused
    461 }
    462 
    463 define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
        ; llvm.fma.v2f64 with multiplicands loaded from %A and %B and the addend
        ; loaded from %C; FileCheck looks for fmla.2d.
    464 ;CHECK-LABEL: fmla_2d:
    465 ;CHECK: fmla.2d
    466   %mul.a = load <2 x double>, <2 x double>* %A
    467   %mul.b = load <2 x double>, <2 x double>* %B
    468   %addend = load <2 x double>, <2 x double>* %C
    469   %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %mul.a, <2 x double> %mul.b, <2 x double> %addend)
    470   ret <2 x double> %fused
    471 }
    472 
    473 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
        ; Declarations of the generic llvm.fma overloads (v2f32, v4f32, v2f64) called
        ; by the fmla*/fmls* tests in this file.
    474 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
    475 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
    476 
    477 define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
        ; Negates the vector from %B (fsub from -0.0) and uses it as the second
        ; multiplicand of llvm.fma.v2f32; FileCheck looks for fmls.2s.
    478 ;CHECK-LABEL: fmls_2s:
    479 ;CHECK: fmls.2s
    480   %mul.a = load <2 x float>, <2 x float>* %A
    481   %mul.b = load <2 x float>, <2 x float>* %B
    482   %addend = load <2 x float>, <2 x float>* %C
    483   %neg.b = fsub <2 x float> <float -0.0, float -0.0>, %mul.b
    484   %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %mul.a, <2 x float> %neg.b, <2 x float> %addend)
    485   ret <2 x float> %fused
    486 }
    487 
    488 define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
        ; Negates the vector from %B (fsub from -0.0) and uses it as the second
        ; multiplicand of llvm.fma.v4f32; FileCheck looks for fmls.4s.
    489 ;CHECK-LABEL: fmls_4s:
    490 ;CHECK: fmls.4s
    491   %mul.a = load <4 x float>, <4 x float>* %A
    492   %mul.b = load <4 x float>, <4 x float>* %B
    493   %addend = load <4 x float>, <4 x float>* %C
    494   %neg.b = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul.b
    495   %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %mul.a, <4 x float> %neg.b, <4 x float> %addend)
    496   ret <4 x float> %fused
    497 }
    498 
    499 define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
        ; Negates the vector from %B (fsub from -0.0) and uses it as the second
        ; multiplicand of llvm.fma.v2f64; FileCheck looks for fmls.2d.
    500 ;CHECK-LABEL: fmls_2d:
    501 ;CHECK: fmls.2d
    502   %mul.a = load <2 x double>, <2 x double>* %A
    503   %mul.b = load <2 x double>, <2 x double>* %B
    504   %addend = load <2 x double>, <2 x double>* %C
    505   %neg.b = fsub <2 x double> <double -0.0, double -0.0>, %mul.b
    506   %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %mul.a, <2 x double> %neg.b, <2 x double> %addend)
    507   ret <2 x double> %fused
    508 }
    509 
    510 define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
        ; Same as fmls_2s but the negated vector is passed as the FIRST multiplicand
        ; of llvm.fma.v2f32; FileCheck still looks for fmls.2s.
    511 ;CHECK-LABEL: fmls_commuted_neg_2s:
    512 ;CHECK: fmls.2s
    513   %mul.a = load <2 x float>, <2 x float>* %A
    514   %mul.b = load <2 x float>, <2 x float>* %B
    515   %addend = load <2 x float>, <2 x float>* %C
    516   %neg.b = fsub <2 x float> <float -0.0, float -0.0>, %mul.b
    517   %fused = call <2 x float> @llvm.fma.v2f32(<2 x float> %neg.b, <2 x float> %mul.a, <2 x float> %addend)
    518   ret <2 x float> %fused
    519 }
    520 
    521 define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
        ; Same as fmls_4s but the negated vector is passed as the FIRST multiplicand
        ; of llvm.fma.v4f32; FileCheck still looks for fmls.4s.
    522 ;CHECK-LABEL: fmls_commuted_neg_4s:
    523 ;CHECK: fmls.4s
    524   %mul.a = load <4 x float>, <4 x float>* %A
    525   %mul.b = load <4 x float>, <4 x float>* %B
    526   %addend = load <4 x float>, <4 x float>* %C
    527   %neg.b = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul.b
    528   %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %neg.b, <4 x float> %mul.a, <4 x float> %addend)
    529   ret <4 x float> %fused
    530 }
    531 
    532 define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
        ; Same as fmls_2d but the negated vector is passed as the FIRST multiplicand
        ; of llvm.fma.v2f64; FileCheck still looks for fmls.2d.
    533 ;CHECK-LABEL: fmls_commuted_neg_2d:
    534 ;CHECK: fmls.2d
    535   %mul.a = load <2 x double>, <2 x double>* %A
    536   %mul.b = load <2 x double>, <2 x double>* %B
    537   %addend = load <2 x double>, <2 x double>* %C
    538   %neg.b = fsub <2 x double> <double -0.0, double -0.0>, %mul.b
    539   %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %neg.b, <2 x double> %mul.a, <2 x double> %addend)
    540   ret <2 x double> %fused
    541 }
    542 
    543 define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
        ; Negates %c, broadcasts lane 0 of %b (zeroinitializer shuffle mask) and
        ; feeds both to llvm.fma.v2f32 with %a as the addend; CHECK expects fmls.2s.
    544 ;CHECK-LABEL: fmls_indexed_2s:
    545 ;CHECK: fmls.2s
    546 entry:
    547   %neg.c = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
    548   %splat.b = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
    549   %fused = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %neg.c, <2 x float> %splat.b, <2 x float> %a)
    550   ret <2 x float> %fused
    551 }
    552 
    553 define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
        ; Negates %c, broadcasts lane 0 of %b (zeroinitializer shuffle mask) and
        ; feeds both to llvm.fma.v4f32 with %a as the addend; CHECK expects fmls.4s.
    554 ;CHECK-LABEL: fmls_indexed_4s:
    555 ;CHECK: fmls.4s
    556 entry:
    557   %neg.c = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    558   %splat.b = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
    559   %fused = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %neg.c, <4 x float> %splat.b, <4 x float> %a)
    560   ret <4 x float> %fused
    561 }
    562 
    563 define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
        ; Negates %c, broadcasts lane 0 of %b (zeroinitializer shuffle mask) and
        ; feeds both to llvm.fma.v2f64 with %a as the addend; CHECK expects fmls.2d.
    564 ;CHECK-LABEL: fmls_indexed_2d:
    565 ;CHECK: fmls.2d
    566 entry:
    567   %neg.c = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
    568   %splat.b = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
    569   %fused = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %neg.c, <2 x double> %splat.b, <2 x double> %a)
    570   ret <2 x double> %fused
    571 }
    572 
    573 define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
        ; Builds a two-lane splat of the scalar %c with insertelement and uses it as
        ; the first multiplicand of llvm.fma.v2f32 (%b is the other multiplicand, %a
        ; the addend). The CHECK-NEXT lines require fmla.2s immediately followed by ret.
        ; The fma consumes %v2, the fully populated splat, matching the 4s and 2d
        ; variants; %v1 has only lane 0 inserted and lane 1 still undef.
    574 entry:
    575 ; CHECK-LABEL: fmla_indexed_scalar_2s:
    576 ; CHECK-NEXT: fmla.2s
    577 ; CHECK-NEXT: ret
    578   %v1 = insertelement <2 x float> undef, float %c, i32 0
    579   %v2 = insertelement <2 x float> %v1, float %c, i32 1
    580   %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a) nounwind
    581   ret <2 x float> %fmla1
    582 }
    583 
    584 define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
        ; Builds a four-lane splat of the scalar %c with insertelement and uses it as
        ; the first multiplicand of llvm.fma.v4f32 (%b is the other multiplicand, %a
        ; the addend). The CHECK-NEXT lines require fmla.4s immediately followed by ret.
    585 entry:
    586 ; CHECK-LABEL: fmla_indexed_scalar_4s:
    587 ; CHECK-NEXT: fmla.4s
    588 ; CHECK-NEXT: ret
    589   %splat.0 = insertelement <4 x float> undef, float %c, i32 0
    590   %splat.1 = insertelement <4 x float> %splat.0, float %c, i32 1
    591   %splat.2 = insertelement <4 x float> %splat.1, float %c, i32 2
    592   %splat.3 = insertelement <4 x float> %splat.2, float %c, i32 3
    593   %fused = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %splat.3, <4 x float> %b, <4 x float> %a) nounwind
    594   ret <4 x float> %fused
    595 }
    596 
    597 define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
        ; Builds a two-lane splat of the scalar %c with insertelement and uses it as
        ; the first multiplicand of llvm.fma.v2f64 (%b is the other multiplicand, %a
        ; the addend). The CHECK-NEXT lines require fmla.2d immediately followed by ret.
    598 ; CHECK-LABEL: fmla_indexed_scalar_2d:
    599 ; CHECK-NEXT: fmla.2d
    600 ; CHECK-NEXT: ret
    601 entry:
    602   %splat.0 = insertelement <2 x double> undef, double %c, i32 0
    603   %splat.1 = insertelement <2 x double> %splat.0, double %c, i32 1
    604   %fused = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %splat.1, <2 x double> %b, <2 x double> %a) nounwind
    605   ret <2 x double> %fused
    606 }
    607 
    608 define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; Multiplies the vector from %A by a broadcast of lane 1 of the vector from %B.
        ; FileCheck requires mul.4h with no "dup" between the label and that match.
    609 ;CHECK-LABEL: mul_4h:
    610 ;CHECK-NOT: dup
    611 ;CHECK: mul.4h
    612   %vec = load <4 x i16>, <4 x i16>* %A
    613   %src = load <4 x i16>, <4 x i16>* %B
    614   %lane1 = shufflevector <4 x i16> %src, <4 x i16> %src, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    615   %prod = mul <4 x i16> %vec, %lane1
    616   ret <4 x i16> %prod
    617 }
    618 
    619 define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Multiplies the vector from %A by a broadcast of lane 1 of the vector from %B.
        ; FileCheck requires mul.8h with no "dup" between the label and that match.
    620 ;CHECK-LABEL: mul_8h:
    621 ;CHECK-NOT: dup
    622 ;CHECK: mul.8h
    623   %vec = load <8 x i16>, <8 x i16>* %A
    624   %src = load <8 x i16>, <8 x i16>* %B
    625   %lane1 = shufflevector <8 x i16> %src, <8 x i16> %src, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    626   %prod = mul <8 x i16> %vec, %lane1
    627   ret <8 x i16> %prod
    628 }
    629 
    630 define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; Multiplies the vector from %A by a broadcast of lane 1 of the vector from %B.
        ; FileCheck requires mul.2s with no "dup" between the label and that match.
    631 ;CHECK-LABEL: mul_2s:
    632 ;CHECK-NOT: dup
    633 ;CHECK: mul.2s
    634   %vec = load <2 x i32>, <2 x i32>* %A
    635   %src = load <2 x i32>, <2 x i32>* %B
    636   %lane1 = shufflevector <2 x i32> %src, <2 x i32> %src, <2 x i32> <i32 1, i32 1>
    637   %prod = mul <2 x i32> %vec, %lane1
    638   ret <2 x i32> %prod
    639 }
    640 
    641 define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; Multiplies the vector from %A by a broadcast of lane 1 of the vector from %B.
        ; FileCheck requires mul.4s with no "dup" between the label and that match.
    642 ;CHECK-LABEL: mul_4s:
    643 ;CHECK-NOT: dup
    644 ;CHECK: mul.4s
    645   %vec = load <4 x i32>, <4 x i32>* %A
    646   %src = load <4 x i32>, <4 x i32>* %B
    647   %lane1 = shufflevector <4 x i32> %src, <4 x i32> %src, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    648   %prod = mul <4 x i32> %vec, %lane1
    649   ret <4 x i32> %prod
    650 }
    651 
    652 define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
        ; Plain <2 x i64> multiply of the two arguments. The two CHECK lines require
        ; the text "mul" to appear twice, in order, after the label.
    653 ; CHECK-LABEL: mul_2d:
    654 ; CHECK: mul
    655 ; CHECK: mul
    656   %prod = mul <2 x i64> %A, %B
    657   ret <2 x i64> %prod
    658 }
    659 
    660 define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
        ; fmul of the vector from %A by a broadcast of lane 1 of the vector from %B.
        ; FileCheck requires fmul.2s with no "dup" between the label and that match.
    661 ;CHECK-LABEL: fmul_lane_2s:
    662 ;CHECK-NOT: dup
    663 ;CHECK: fmul.2s
    664   %vec = load <2 x float>, <2 x float>* %A
    665   %src = load <2 x float>, <2 x float>* %B
    666   %lane1 = shufflevector <2 x float> %src, <2 x float> %src, <2 x i32> <i32 1, i32 1>
    667   %prod = fmul <2 x float> %vec, %lane1
    668   ret <2 x float> %prod
    669 }
    670 
    671 define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
        ; Floating-point multiply of %A by lane 1 of %B splatted to all four
        ; lanes; the checks expect fmul.4s with no separate dup.
    672 ;CHECK-LABEL: fmul_lane_4s:
    673 ;CHECK-NOT: dup
    674 ;CHECK: fmul.4s
    675   %lhs = load <4 x float>, <4 x float>* %A
    676   %rhs = load <4 x float>, <4 x float>* %B
    677   %splat = shufflevector <4 x float> %rhs, <4 x float> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    678   %prod = fmul <4 x float> %lhs, %splat
    679   ret <4 x float> %prod
    680 }
    681 
    682 define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
        ; Floating-point multiply of %A by lane 1 of %B splatted to both lanes;
        ; the checks expect fmul.2d with no separate dup.
    683 ;CHECK-LABEL: fmul_lane_2d:
    684 ;CHECK-NOT: dup
    685 ;CHECK: fmul.2d
    686   %lhs = load <2 x double>, <2 x double>* %A
    687   %rhs = load <2 x double>, <2 x double>* %B
    688   %splat = shufflevector <2 x double> %rhs, <2 x double> %rhs, <2 x i32> <i32 1, i32 1>
    689   %prod = fmul <2 x double> %lhs, %splat
    690   ret <2 x double> %prod
    691 }
    692 
    693 define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
        ; Scalar float multiply of %A by element 3 of %vec; the checks expect
        ; the by-element scalar form reading v1[3] with no dup.
    694 ;CHECK-LABEL: fmul_lane_s:
    695 ;CHECK-NOT: dup
    696 ;CHECK: fmul.s s0, s0, v1[3]
    697   %lane = extractelement <4 x float> %vec, i32 3
    698   %prod = fmul float %A, %lane
    699   ret float %prod
    700 }
    701 
    702 define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
        ; Scalar double multiply of %A by element 1 of %vec; the checks expect
        ; the by-element scalar form reading v1[1] with no dup.
    703 ;CHECK-LABEL: fmul_lane_d:
    704 ;CHECK-NOT: dup
    705 ;CHECK: fmul.d d0, d0, v1[1]
    706   %lane = extractelement <2 x double> %vec, i32 1
    707   %prod = fmul double %A, %lane
    708   ret double %prod
    709 }
    710 
    711 
    712 
    713 define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
        ; fmulx intrinsic on %A and lane 1 of %B splatted to both lanes; the
        ; checks expect fmulx.2s with no separate dup.
    714 ;CHECK-LABEL: fmulx_lane_2s:
    715 ;CHECK-NOT: dup
    716 ;CHECK: fmulx.2s
    717   %lhs = load <2 x float>, <2 x float>* %A
    718   %rhs = load <2 x float>, <2 x float>* %B
    719   %splat = shufflevector <2 x float> %rhs, <2 x float> %rhs, <2 x i32> <i32 1, i32 1>
    720   %res = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %splat)
    721   ret <2 x float> %res
    722 }
    723 
    724 define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
        ; fmulx intrinsic on %A and lane 1 of %B splatted to all four lanes;
        ; the checks expect fmulx.4s with no separate dup.
    725 ;CHECK-LABEL: fmulx_lane_4s:
    726 ;CHECK-NOT: dup
    727 ;CHECK: fmulx.4s
    728   %lhs = load <4 x float>, <4 x float>* %A
    729   %rhs = load <4 x float>, <4 x float>* %B
    730   %splat = shufflevector <4 x float> %rhs, <4 x float> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    731   %res = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %splat)
    732   ret <4 x float> %res
    733 }
    734 
    735 define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
        ; fmulx intrinsic on %A and lane 1 of %B splatted to both lanes; the
        ; checks expect fmulx.2d with no separate dup.
    736 ;CHECK-LABEL: fmulx_lane_2d:
    737 ;CHECK-NOT: dup
    738 ;CHECK: fmulx.2d
    739   %lhs = load <2 x double>, <2 x double>* %A
    740   %rhs = load <2 x double>, <2 x double>* %B
    741   %splat = shufflevector <2 x double> %rhs, <2 x double> %rhs, <2 x i32> <i32 1, i32 1>
    742   %res = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %splat)
    743   ret <2 x double> %res
    744 }
    745 
    746 define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; sqdmulh intrinsic on %A and lane 1 of %B splatted to all four lanes;
        ; the checks expect sqdmulh.4h with no separate dup.
    747 ;CHECK-LABEL: sqdmulh_lane_4h:
    748 ;CHECK-NOT: dup
    749 ;CHECK: sqdmulh.4h
    750   %lhs = load <4 x i16>, <4 x i16>* %A
    751   %rhs = load <4 x i16>, <4 x i16>* %B
    752   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    753   %res = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %splat)
    754   ret <4 x i16> %res
    755 }
    756 
    757 define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; sqdmulh intrinsic on %A and lane 1 of %B splatted to all eight lanes;
        ; the checks expect sqdmulh.8h with no separate dup.
    758 ;CHECK-LABEL: sqdmulh_lane_8h:
    759 ;CHECK-NOT: dup
    760 ;CHECK: sqdmulh.8h
    761   %lhs = load <8 x i16>, <8 x i16>* %A
    762   %rhs = load <8 x i16>, <8 x i16>* %B
    763   %splat = shufflevector <8 x i16> %rhs, <8 x i16> %rhs, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    764   %res = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %splat)
    765   ret <8 x i16> %res
    766 }
    767 
    768 define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; sqdmulh intrinsic on %A and lane 1 of %B splatted to both lanes; the
        ; checks expect sqdmulh.2s with no separate dup.
    769 ;CHECK-LABEL: sqdmulh_lane_2s:
    770 ;CHECK-NOT: dup
    771 ;CHECK: sqdmulh.2s
    772   %lhs = load <2 x i32>, <2 x i32>* %A
    773   %rhs = load <2 x i32>, <2 x i32>* %B
    774   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
    775   %res = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %splat)
    776   ret <2 x i32> %res
    777 }
    778 
    779 define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; sqdmulh intrinsic on %A and lane 1 of %B splatted to all four lanes;
        ; the checks expect sqdmulh.4s with no separate dup.
    780 ;CHECK-LABEL: sqdmulh_lane_4s:
    781 ;CHECK-NOT: dup
    782 ;CHECK: sqdmulh.4s
    783   %lhs = load <4 x i32>, <4 x i32>* %A
    784   %rhs = load <4 x i32>, <4 x i32>* %B
    785   %splat = shufflevector <4 x i32> %rhs, <4 x i32> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    786   %res = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %splat)
    787   ret <4 x i32> %res
    788 }
    789 
    790 define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
        ; Scalar sqdmulh intrinsic on %A and element 1 of %B; the checks expect
        ; the by-element scalar form indexing lane [1] with no dup.
    791 ;CHECK-LABEL: sqdmulh_lane_1s:
    792 ;CHECK-NOT: dup
    793 ;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
    794   %lane = extractelement <4 x i32> %B, i32 1
    795   %res = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %lane)
    796   ret i32 %res
    797 }
    798 
    799 define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; sqrdmulh intrinsic on %A and lane 1 of %B splatted to all four lanes;
        ; the checks expect sqrdmulh.4h with no separate dup.
    800 ;CHECK-LABEL: sqrdmulh_lane_4h:
    801 ;CHECK-NOT: dup
    802 ;CHECK: sqrdmulh.4h
    803   %lhs = load <4 x i16>, <4 x i16>* %A
    804   %rhs = load <4 x i16>, <4 x i16>* %B
    805   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    806   %res = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %splat)
    807   ret <4 x i16> %res
    808 }
    809 
    810 define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; sqrdmulh intrinsic on %A and lane 1 of %B splatted to all eight
        ; lanes; the checks expect sqrdmulh.8h with no separate dup.
    811 ;CHECK-LABEL: sqrdmulh_lane_8h:
    812 ;CHECK-NOT: dup
    813 ;CHECK: sqrdmulh.8h
    814   %lhs = load <8 x i16>, <8 x i16>* %A
    815   %rhs = load <8 x i16>, <8 x i16>* %B
    816   %splat = shufflevector <8 x i16> %rhs, <8 x i16> %rhs, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
    817   %res = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %splat)
    818   ret <8 x i16> %res
    819 }
    820 
    821 define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; sqrdmulh intrinsic on %A and lane 1 of %B splatted to both lanes; the
        ; checks expect sqrdmulh.2s with no separate dup.
    822 ;CHECK-LABEL: sqrdmulh_lane_2s:
    823 ;CHECK-NOT: dup
    824 ;CHECK: sqrdmulh.2s
    825   %lhs = load <2 x i32>, <2 x i32>* %A
    826   %rhs = load <2 x i32>, <2 x i32>* %B
    827   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
    828   %res = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %splat)
    829   ret <2 x i32> %res
    830 }
    831 
    832 define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; sqrdmulh intrinsic on %A and lane 1 of %B splatted to all four lanes;
        ; the checks expect sqrdmulh.4s with no separate dup.
    833 ;CHECK-LABEL: sqrdmulh_lane_4s:
    834 ;CHECK-NOT: dup
    835 ;CHECK: sqrdmulh.4s
    836   %lhs = load <4 x i32>, <4 x i32>* %A
    837   %rhs = load <4 x i32>, <4 x i32>* %B
    838   %splat = shufflevector <4 x i32> %rhs, <4 x i32> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    839   %res = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %splat)
    840   ret <4 x i32> %res
    841 }
    842 
    843 define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
        ; Scalar sqrdmulh intrinsic on %A and element 1 of %B; the checks
        ; expect the by-element scalar form indexing lane [1] with no dup.
    844 ;CHECK-LABEL: sqrdmulh_lane_1s:
    845 ;CHECK-NOT: dup
    846 ;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
    847   %lane = extractelement <4 x i32> %B, i32 1
    848   %res = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %lane)
    849   ret i32 %res
    850 }
    851 
    852 define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; sqdmull intrinsic on %A and lane 1 of %B splatted to all four lanes;
        ; the checks expect sqdmull.4s with no separate dup.
    853 ;CHECK-LABEL: sqdmull_lane_4s:
    854 ;CHECK-NOT: dup
    855 ;CHECK: sqdmull.4s
    856   %lhs = load <4 x i16>, <4 x i16>* %A
    857   %rhs = load <4 x i16>, <4 x i16>* %B
    858   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    859   %res = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
    860   ret <4 x i32> %res
    861 }
    862 
    863 define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; sqdmull intrinsic on %A and lane 1 of %B splatted to both lanes; the
        ; checks expect sqdmull.2d with no separate dup.
    864 ;CHECK-LABEL: sqdmull_lane_2d:
    865 ;CHECK-NOT: dup
    866 ;CHECK: sqdmull.2d
    867   %lhs = load <2 x i32>, <2 x i32>* %A
    868   %rhs = load <2 x i32>, <2 x i32>* %B
    869   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
    870   %res = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
    871   ret <2 x i64> %res
    872 }
    873 
    874 define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; sqdmull of the upper four lanes of %A with lane 1 of %B splatted.
        ; The first operand is the high half of a 128-bit vector, so the
        ; expected instruction is the high-half ("2") by-element form; the
        ; pattern names sqdmull2 because "sqdmull.4s" is not a substring of it.
    875 ;CHECK-LABEL: sqdmull2_lane_4s:
    876 ;CHECK-NOT: dup
    877 ;CHECK: sqdmull2.4s
    878   %load1 = load <8 x i16>, <8 x i16>* %A
    879   %load2 = load <8 x i16>, <8 x i16>* %B
    880   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    881   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    882   %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
    883   ret <4 x i32> %tmp4
    884 }
    885 
    886 define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; sqdmull of the upper two lanes of %A with lane 1 of %B splatted.
        ; The first operand is the high half of a 128-bit vector, so the
        ; expected instruction is the high-half ("2") by-element form; the
        ; pattern names sqdmull2 because "sqdmull.2d" is not a substring of it.
    887 ;CHECK-LABEL: sqdmull2_lane_2d:
    888 ;CHECK-NOT: dup
    889 ;CHECK: sqdmull2.2d
    890   %load1 = load <4 x i32>, <4 x i32>* %A
    891   %load2 = load <4 x i32>, <4 x i32>* %B
    892   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    893   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
    894   %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
    895   ret <2 x i64> %tmp4
    896 }
    897 
    898 define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; umull intrinsic on %A and lane 1 of %B splatted to all four lanes;
        ; the checks expect umull.4s with no separate dup.
    899 ;CHECK-LABEL: umull_lane_4s:
    900 ;CHECK-NOT: dup
    901 ;CHECK: umull.4s
    902   %lhs = load <4 x i16>, <4 x i16>* %A
    903   %rhs = load <4 x i16>, <4 x i16>* %B
    904   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    905   %res = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
    906   ret <4 x i32> %res
    907 }
    908 
    909 define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; umull intrinsic on %A and lane 1 of %B splatted to both lanes; the
        ; checks expect umull.2d with no separate dup.
    910 ;CHECK-LABEL: umull_lane_2d:
    911 ;CHECK-NOT: dup
    912 ;CHECK: umull.2d
    913   %lhs = load <2 x i32>, <2 x i32>* %A
    914   %rhs = load <2 x i32>, <2 x i32>* %B
    915   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
    916   %res = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
    917   ret <2 x i64> %res
    918 }
    919 
    920 define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
        ; smull intrinsic on %A and lane 1 of %B splatted to all four lanes;
        ; the checks expect smull.4s with no separate dup.
    921 ;CHECK-LABEL: smull_lane_4s:
    922 ;CHECK-NOT: dup
    923 ;CHECK: smull.4s
    924   %lhs = load <4 x i16>, <4 x i16>* %A
    925   %rhs = load <4 x i16>, <4 x i16>* %B
    926   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    927   %res = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
    928   ret <4 x i32> %res
    929 }
    930 
    931 define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
        ; smull intrinsic on %A and lane 1 of %B splatted to both lanes; the
        ; checks expect smull.2d with no separate dup.
    932 ;CHECK-LABEL: smull_lane_2d:
    933 ;CHECK-NOT: dup
    934 ;CHECK: smull.2d
    935   %lhs = load <2 x i32>, <2 x i32>* %A
    936   %rhs = load <2 x i32>, <2 x i32>* %B
    937   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
    938   %res = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
    939   ret <2 x i64> %res
    940 }
    941 
    942 define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; Adds smull(%A, splat of %B lane 1) to the accumulator loaded from %C;
        ; the checks expect smlal.4s with no separate dup.
    943 ;CHECK-LABEL: smlal_lane_4s:
    944 ;CHECK-NOT: dup
    945 ;CHECK: smlal.4s
    946   %lhs = load <4 x i16>, <4 x i16>* %A
    947   %rhs = load <4 x i16>, <4 x i16>* %B
    948   %acc = load <4 x i32>, <4 x i32>* %C
    949   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    950   %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
    951   %sum = add <4 x i32> %acc, %prod
    952   ret <4 x i32> %sum
    953 }
    954 
    955 define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; Adds smull(%A, splat of %B lane 1) to the accumulator loaded from %C;
        ; the checks expect smlal.2d with no separate dup.
    956 ;CHECK-LABEL: smlal_lane_2d:
    957 ;CHECK-NOT: dup
    958 ;CHECK: smlal.2d
    959   %lhs = load <2 x i32>, <2 x i32>* %A
    960   %rhs = load <2 x i32>, <2 x i32>* %B
    961   %acc = load <2 x i64>, <2 x i64>* %C
    962   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
    963   %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
    964   %sum = add <2 x i64> %acc, %prod
    965   ret <2 x i64> %sum
    966 }
    967 
    968 define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; sqadd of the accumulator from %C with sqdmull(%A, splat of %B lane 1);
        ; the checks expect sqdmlal.4s with no separate dup.
    969 ;CHECK-LABEL: sqdmlal_lane_4s:
    970 ;CHECK-NOT: dup
    971 ;CHECK: sqdmlal.4s
    972   %lhs = load <4 x i16>, <4 x i16>* %A
    973   %rhs = load <4 x i16>, <4 x i16>* %B
    974   %acc = load <4 x i32>, <4 x i32>* %C
    975   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    976   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
    977   %sum = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
    978   ret <4 x i32> %sum
    979 }
    980 
    981 define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; sqadd of the accumulator from %C with sqdmull(%A, splat of %B lane 1);
        ; the checks expect sqdmlal.2d with no separate dup.
    982 ;CHECK-LABEL: sqdmlal_lane_2d:
    983 ;CHECK-NOT: dup
    984 ;CHECK: sqdmlal.2d
    985   %lhs = load <2 x i32>, <2 x i32>* %A
    986   %rhs = load <2 x i32>, <2 x i32>* %B
    987   %acc = load <2 x i64>, <2 x i64>* %C
    988   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
    989   %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
    990   %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %prod)
    991   ret <2 x i64> %sum
    992 }
    993 
    994 define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
        ; sqadd of the accumulator from %C with sqdmull(upper four lanes of %A,
        ; splat of %B lane 1). The multiplicand is the high half of a 128-bit
        ; vector, so the expected instruction is the high-half ("2") form; the
        ; pattern names sqdmlal2 because "sqdmlal.4s" is not a substring of it.
    995 ;CHECK-LABEL: sqdmlal2_lane_4s:
    996 ;CHECK-NOT: dup
    997 ;CHECK: sqdmlal2.4s
    998   %load1 = load <8 x i16>, <8 x i16>* %A
    999   %load2 = load <8 x i16>, <8 x i16>* %B
   1000   %tmp3 = load <4 x i32>, <4 x i32>* %C
   1001   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1002   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1003   %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   1004   %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
   1005   ret <4 x i32> %tmp6
   1006 }
   1007 
   1008 define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
        ; sqadd of the accumulator from %C with sqdmull(upper two lanes of %A,
        ; splat of %B lane 1). The multiplicand is the high half of a 128-bit
        ; vector, so the expected instruction is the high-half ("2") form; the
        ; pattern names sqdmlal2 because "sqdmlal.2d" is not a substring of it.
   1009 ;CHECK-LABEL: sqdmlal2_lane_2d:
   1010 ;CHECK-NOT: dup
   1011 ;CHECK: sqdmlal2.2d
   1012   %load1 = load <4 x i32>, <4 x i32>* %A
   1013   %load2 = load <4 x i32>, <4 x i32>* %B
   1014   %tmp3 = load <2 x i64>, <2 x i64>* %C
   1015   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1016   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
   1017   %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   1018   %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
   1019   ret <2 x i64> %tmp6
   1020 }
   1021 
   1022 define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
        ; Scalar accumulate built from vector pieces: %B is inserted into lane 0,
        ; lane 1 of %C is shuffled into lane 0, the two are sqdmull'ed, and
        ; element 0 of the product is sqadd'ed to %A. Expects sqdmlal.4s.
   1023 ;CHECK-LABEL: sqdmlal_lane_1s:
   1024 ;CHECK: sqdmlal.4s
   1025   %b.vec = insertelement <4 x i16> undef, i16 %B, i32 0
   1026   %c.lane = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   1027   %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b.vec, <4 x i16> %c.lane)
   1028   %wide0 = extractelement <4 x i32> %wide, i32 0
   1029   %sum = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %wide0)
   1030   ret i32 %sum
   1031 }
   1032 declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
   1033 
   1034 define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
        ; Scalar subtract built from vector pieces: %B is inserted into lane 0,
        ; lane 1 of %C is shuffled into lane 0, the two are sqdmull'ed, and
        ; element 0 of the product is sqsub'ed from %A. Expects sqdmlsl.4s.
   1035 ;CHECK-LABEL: sqdmlsl_lane_1s:
   1036 ;CHECK: sqdmlsl.4s
   1037   %b.vec = insertelement <4 x i16> undef, i16 %B, i32 0
   1038   %c.lane = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   1039   %wide = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b.vec, <4 x i16> %c.lane)
   1040   %wide0 = extractelement <4 x i32> %wide, i32 0
   1041   %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %wide0)
   1042   ret i32 %diff
   1043 }
   1044 declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
   1045 
   1046 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
        ; Scalar sqdmull of %B with element 1 of %C, then sqadd onto %A; the
        ; check expects the scalar by-element sqdmlal.s form.
   1047 ;CHECK-LABEL: sqdmlal_lane_1d:
   1048 ;CHECK: sqdmlal.s
   1049   %lane = extractelement <2 x i32> %C, i32 1
   1050   %wide = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %lane)
   1051   %sum = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %wide)
   1052   ret i64 %sum
   1053 }
   1054 declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
   1055 declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
   1056 
   1057 define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
        ; Scalar sqdmull of %B with element 1 of %C, then sqsub from %A; the
        ; check expects the scalar by-element sqdmlsl.s form.
   1058 ;CHECK-LABEL: sqdmlsl_lane_1d:
   1059 ;CHECK: sqdmlsl.s
   1060   %lane = extractelement <2 x i32> %C, i32 1
   1061   %wide = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %lane)
   1062   %diff = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %wide)
   1063   ret i64 %diff
   1064 }
   1065 declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
   1066 
   1067 
   1068 define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; Adds umull(%A, splat of %B lane 1) to the accumulator loaded from %C;
        ; the checks expect umlal.4s with no separate dup.
   1069 ;CHECK-LABEL: umlal_lane_4s:
   1070 ;CHECK-NOT: dup
   1071 ;CHECK: umlal.4s
   1072   %lhs = load <4 x i16>, <4 x i16>* %A
   1073   %rhs = load <4 x i16>, <4 x i16>* %B
   1074   %acc = load <4 x i32>, <4 x i32>* %C
   1075   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1076   %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
   1077   %sum = add <4 x i32> %acc, %prod
   1078   ret <4 x i32> %sum
   1079 }
   1080 
   1081 define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; Adds umull(%A, splat of %B lane 1) to the accumulator loaded from %C;
        ; the checks expect umlal.2d with no separate dup.
   1082 ;CHECK-LABEL: umlal_lane_2d:
   1083 ;CHECK-NOT: dup
   1084 ;CHECK: umlal.2d
   1085   %lhs = load <2 x i32>, <2 x i32>* %A
   1086   %rhs = load <2 x i32>, <2 x i32>* %B
   1087   %acc = load <2 x i64>, <2 x i64>* %C
   1088   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
   1089   %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
   1090   %sum = add <2 x i64> %acc, %prod
   1091   ret <2 x i64> %sum
   1092 }
   1093 
   1094 
   1095 define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; Subtracts smull(%A, splat of %B lane 1) from the accumulator loaded
        ; from %C; the checks expect smlsl.4s with no separate dup.
   1096 ;CHECK-LABEL: smlsl_lane_4s:
   1097 ;CHECK-NOT: dup
   1098 ;CHECK: smlsl.4s
   1099   %lhs = load <4 x i16>, <4 x i16>* %A
   1100   %rhs = load <4 x i16>, <4 x i16>* %B
   1101   %acc = load <4 x i32>, <4 x i32>* %C
   1102   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1103   %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
   1104   %diff = sub <4 x i32> %acc, %prod
   1105   ret <4 x i32> %diff
   1106 }
   1107 
   1108 define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; Subtracts smull(%A, splat of %B lane 1) from the accumulator loaded
        ; from %C; the checks expect smlsl.2d with no separate dup.
   1109 ;CHECK-LABEL: smlsl_lane_2d:
   1110 ;CHECK-NOT: dup
   1111 ;CHECK: smlsl.2d
   1112   %lhs = load <2 x i32>, <2 x i32>* %A
   1113   %rhs = load <2 x i32>, <2 x i32>* %B
   1114   %acc = load <2 x i64>, <2 x i64>* %C
   1115   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
   1116   %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
   1117   %diff = sub <2 x i64> %acc, %prod
   1118   ret <2 x i64> %diff
   1119 }
   1120 
   1121 define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; sqsub of sqdmull(%A, splat of %B lane 1) from the accumulator loaded
        ; from %C; the checks expect sqdmlsl.4s with no separate dup.
   1122 ;CHECK-LABEL: sqdmlsl_lane_4s:
   1123 ;CHECK-NOT: dup
   1124 ;CHECK: sqdmlsl.4s
   1125   %lhs = load <4 x i16>, <4 x i16>* %A
   1126   %rhs = load <4 x i16>, <4 x i16>* %B
   1127   %acc = load <4 x i32>, <4 x i32>* %C
   1128   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1129   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
   1130   %diff = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
   1131   ret <4 x i32> %diff
   1132 }
   1133 
   1134 define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; sqsub of sqdmull(%A, splat of %B lane 1) from the accumulator loaded
        ; from %C; the checks expect sqdmlsl.2d with no separate dup.
   1135 ;CHECK-LABEL: sqdmlsl_lane_2d:
   1136 ;CHECK-NOT: dup
   1137 ;CHECK: sqdmlsl.2d
   1138   %lhs = load <2 x i32>, <2 x i32>* %A
   1139   %rhs = load <2 x i32>, <2 x i32>* %B
   1140   %acc = load <2 x i64>, <2 x i64>* %C
   1141   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
   1142   %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
   1143   %diff = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %prod)
   1144   ret <2 x i64> %diff
   1145 }
   1146 
   1147 define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
        ; sqsub of sqdmull(upper four lanes of %A, splat of %B lane 1) from the
        ; accumulator loaded from %C. The multiplicand is the high half of a
        ; 128-bit vector, so the expected instruction is the high-half ("2") form;
        ; the pattern names sqdmlsl2 because "sqdmlsl.4s" is not a substring of it.
   1148 ;CHECK-LABEL: sqdmlsl2_lane_4s:
   1149 ;CHECK-NOT: dup
   1150 ;CHECK: sqdmlsl2.4s
   1151   %load1 = load <8 x i16>, <8 x i16>* %A
   1152   %load2 = load <8 x i16>, <8 x i16>* %B
   1153   %tmp3 = load <4 x i32>, <4 x i32>* %C
   1154   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1155   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1156   %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   1157   %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
   1158   ret <4 x i32> %tmp6
   1159 }
   1160 
   1161 define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
        ; sqsub of sqdmull(upper two lanes of %A, splat of %B lane 1) from the
        ; accumulator loaded from %C. The multiplicand is the high half of a
        ; 128-bit vector, so the expected instruction is the high-half ("2") form;
        ; the pattern names sqdmlsl2 because "sqdmlsl.2d" is not a substring of it.
   1162 ;CHECK-LABEL: sqdmlsl2_lane_2d:
   1163 ;CHECK-NOT: dup
   1164 ;CHECK: sqdmlsl2.2d
   1165   %load1 = load <4 x i32>, <4 x i32>* %A
   1166   %load2 = load <4 x i32>, <4 x i32>* %B
   1167   %tmp3 = load <2 x i64>, <2 x i64>* %C
   1168   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1169   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
   1170   %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   1171   %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
   1172   ret <2 x i64> %tmp6
   1173 }
   1174 
   1175 define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
        ; Subtracts umull(%A, splat of %B lane 1) from the accumulator loaded
        ; from %C; the checks expect umlsl.4s with no separate dup.
   1176 ;CHECK-LABEL: umlsl_lane_4s:
   1177 ;CHECK-NOT: dup
   1178 ;CHECK: umlsl.4s
   1179   %lhs = load <4 x i16>, <4 x i16>* %A
   1180   %rhs = load <4 x i16>, <4 x i16>* %B
   1181   %acc = load <4 x i32>, <4 x i32>* %C
   1182   %splat = shufflevector <4 x i16> %rhs, <4 x i16> %rhs, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1183   %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
   1184   %diff = sub <4 x i32> %acc, %prod
   1185   ret <4 x i32> %diff
   1186 }
   1187 
   1188 define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
        ; Subtracts umull(%A, splat of %B lane 1) from the accumulator loaded
        ; from %C; the checks expect umlsl.2d with no separate dup.
   1189 ;CHECK-LABEL: umlsl_lane_2d:
   1190 ;CHECK-NOT: dup
   1191 ;CHECK: umlsl.2d
   1192   %lhs = load <2 x i32>, <2 x i32>* %A
   1193   %rhs = load <2 x i32>, <2 x i32>* %B
   1194   %acc = load <2 x i64>, <2 x i64>* %C
   1195   %splat = shufflevector <2 x i32> %rhs, <2 x i32> %rhs, <2 x i32> <i32 1, i32 1>
   1196   %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
   1197   %diff = sub <2 x i64> %acc, %prod
   1198   ret <2 x i64> %diff
   1199 }
   1200 
   1201 ; Scalar FMULX
   1202 define float @fmulxs(float %a, float %b) nounwind {
        ; Scalar fmulx intrinsic on two float arguments; the output is expected
        ; to be exactly fmulx s0, s0, s1 followed by ret.
   1203 ; CHECK-LABEL: fmulxs:
   1204 ; CHECK-NEXT: fmulx s0, s0, s1
   1205   %res = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
   1206 ; CHECK-NEXT: ret
   1207   ret float %res
   1208 }
   1209 
   1210 define double @fmulxd(double %a, double %b) nounwind {
        ; Scalar fmulx intrinsic on two double arguments; the output is expected
        ; to be exactly fmulx d0, d0, d1 followed by ret.
   1211 ; CHECK-LABEL: fmulxd:
   1212 ; CHECK-NEXT: fmulx d0, d0, d1
   1213   %res = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
   1214 ; CHECK-NEXT: ret
   1215   ret double %res
   1216 }
   1217 
   1218 define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
        ; Scalar fmulx of %a with element 3 of %vec; the output is expected to
        ; be the by-element form reading v1[3], followed by ret.
   1219 ; CHECK-LABEL: fmulxs_lane:
   1220 ; CHECK-NEXT: fmulx.s s0, s0, v1[3]
   1221   %lane = extractelement <4 x float> %vec, i32 3
   1222   %res = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %lane) nounwind
   1223 ; CHECK-NEXT: ret
   1224   ret float %res
   1225 }
   1226 
   1227 define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
        ; Scalar fmulx of %a with element 1 of %vec; the output is expected to
        ; be the by-element form reading v1[1], followed by ret.
   1228 ; CHECK-LABEL: fmulxd_lane:
   1229 ; CHECK-NEXT: fmulx.d d0, d0, v1[1]
   1230   %lane = extractelement <2 x double> %vec, i32 1
   1231   %res = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %lane) nounwind
   1232 ; CHECK-NEXT: ret
   1233   ret double %res
   1234 }
   1235 
   1236 declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
   1237 declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
   1238 
   1239 
   1240 define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
        ; smull of lanes 8..15 of %a and %b, extracted with shufflevector; the
        ; output is expected to be a single smull2.8h followed by ret.
   1241 ; CHECK-LABEL: smull2_8h_simple:
   1242 ; CHECK-NEXT: smull2.8h v0, v0, v1
   1243 ; CHECK-NEXT: ret
   1244   %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1245   %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1246   %res = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi) #2
   1247   ret <8 x i16> %res
   1248 }
   1249 
   1250 define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
        ; Takes element 1 of each argument viewed as <2 x i64>, reinterprets it
        ; as <8 x i8>, and smulls the two; expects smull2.8h v0, v0, v1.
   1251 ; CHECK-LABEL: foo0:
   1252 ; CHECK: smull2.8h v0, v0, v1
   1253   %a.i64 = bitcast <16 x i8> %a to <2 x i64>
   1254   %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1255   %a.hi = bitcast <1 x i64> %a.hi64 to <8 x i8>
   1256   %b.i64 = bitcast <16 x i8> %b to <2 x i64>
   1257   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1258   %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
   1259   %res = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi) nounwind
   1260   ret <8 x i16> %res
   1261 }
   1262 
   1263 define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
        ; Takes element 1 of each argument viewed as <2 x i64>, reinterprets it
        ; as <4 x i16>, and smulls the two; expects smull2.4s v0, v0, v1.
   1264 ; CHECK-LABEL: foo1:
   1265 ; CHECK: smull2.4s v0, v0, v1
   1266   %a.i64 = bitcast <8 x i16> %a to <2 x i64>
   1267   %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1268   %a.hi = bitcast <1 x i64> %a.hi64 to <4 x i16>
   1269   %b.i64 = bitcast <8 x i16> %b to <2 x i64>
   1270   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1271   %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
   1272   %res = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.hi) nounwind
   1273   ret <4 x i32> %res
   1274 }
   1275 
   1276 define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
        ; Takes element 1 of each argument viewed as <2 x i64>, reinterprets it
        ; as <2 x i32>, and smulls the two; expects smull2.2d v0, v0, v1.
   1277 ; CHECK-LABEL: foo2:
   1278 ; CHECK: smull2.2d v0, v0, v1
   1279   %a.i64 = bitcast <4 x i32> %a to <2 x i64>
   1280   %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1281   %a.hi = bitcast <1 x i64> %a.hi64 to <2 x i32>
   1282   %b.i64 = bitcast <4 x i32> %b to <2 x i64>
   1283   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1284   %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
   1285   %res = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.hi) nounwind
   1286   ret <2 x i64> %res
   1287 }
   1288 
   1289 define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
        ; Takes element 1 of each argument viewed as <2 x i64>, reinterprets it
        ; as <8 x i8>, and umulls the two; expects umull2.8h v0, v0, v1.
   1290 ; CHECK-LABEL: foo3:
   1291 ; CHECK: umull2.8h v0, v0, v1
   1292   %a.i64 = bitcast <16 x i8> %a to <2 x i64>
   1293   %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1294   %a.hi = bitcast <1 x i64> %a.hi64 to <8 x i8>
   1295   %b.i64 = bitcast <16 x i8> %b to <2 x i64>
   1296   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1297   %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
   1298   %res = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi) nounwind
   1299   ret <8 x i16> %res
   1300 }
   1301 
   1302 define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
        ; Takes element 1 of each argument viewed as <2 x i64>, reinterprets it
        ; as <4 x i16>, and umulls the two; expects umull2.4s v0, v0, v1.
   1303 ; CHECK-LABEL: foo4:
   1304 ; CHECK: umull2.4s v0, v0, v1
   1305   %a.i64 = bitcast <8 x i16> %a to <2 x i64>
   1306   %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1307   %a.hi = bitcast <1 x i64> %a.hi64 to <4 x i16>
   1308   %b.i64 = bitcast <8 x i16> %b to <2 x i64>
   1309   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1310   %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
   1311   %res = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.hi) nounwind
   1312   ret <4 x i32> %res
   1313 }
   1314 
   1315 define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
        ; Takes element 1 of each argument viewed as <2 x i64>, reinterprets it
        ; as <2 x i32>, and umulls the two; expects umull2.2d v0, v0, v1.
   1316 ; CHECK-LABEL: foo5:
   1317 ; CHECK: umull2.2d v0, v0, v1
   1318   %a.i64 = bitcast <4 x i32> %a to <2 x i64>
   1319   %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1320   %a.hi = bitcast <1 x i64> %a.hi64 to <2 x i32>
   1321   %b.i64 = bitcast <4 x i32> %b to <2 x i64>
   1322   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1323   %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
   1324   %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.hi) nounwind
   1325   ret <2 x i64> %res
   1326 }
   1327 
   1328 define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {  ; smull of upper half of %b by splat(%c[1]); %a is unused
   1329 ; CHECK-LABEL: foo6:
   1330 ; CHECK-NEXT: smull2.4s v0, v1, v2[1]
   1331 ; CHECK-NEXT: ret
   1332 entry:
   1333   %b.i64 = bitcast <8 x i16> %b to <2 x i64>
   1334   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1335   %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
   1336   %c.splat = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>  ; broadcast lane 1
   1337   %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.splat) nounwind
   1338   ret <4 x i32> %prod
   1339 }
   1340 
   1341 define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {  ; smull of upper half of %b by splat(%c[1]); %a is unused
   1342 ; CHECK-LABEL: foo7:
   1343 ; CHECK-NEXT: smull2.2d v0, v1, v2[1]
   1344 ; CHECK-NEXT: ret
   1345 entry:
   1346   %b.i64 = bitcast <4 x i32> %b to <2 x i64>
   1347   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1348   %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
   1349   %c.splat = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>  ; broadcast lane 1
   1350   %prod = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.splat) nounwind
   1351   ret <2 x i64> %prod
   1352 }
   1353 
   1354 define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {  ; umull of upper half of %b by splat(%c[1]); %a is unused
   1355 ; CHECK-LABEL: foo8:
   1356 ; CHECK-NEXT: umull2.4s v0, v1, v2[1]
   1357 ; CHECK-NEXT: ret
   1358 entry:
   1359   %b.i64 = bitcast <8 x i16> %b to <2 x i64>
   1360   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1361   %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
   1362   %c.splat = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>  ; broadcast lane 1
   1363   %prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.splat) nounwind
   1364   ret <4 x i32> %prod
   1365 }
   1366 
   1367 define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {  ; umull of upper half of %b by splat(%c[1]); %a is unused
   1368 ; CHECK-LABEL: foo9:
   1369 ; CHECK-NEXT: umull2.2d v0, v1, v2[1]
   1370 ; CHECK-NEXT: ret
   1371 entry:
   1372   %b.i64 = bitcast <4 x i32> %b to <2 x i64>
   1373   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1374   %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
   1375   %c.splat = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>  ; broadcast lane 1
   1376   %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.splat) nounwind
   1377   ret <2 x i64> %prod
   1378 }
   1379 
   1380 define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {  ; %a + smull(upper half of %b, upper half of %c)
   1381 ; CHECK-LABEL: bar0:
   1382 ; CHECK: smlal2.8h v0, v1, v2
   1383 ; CHECK-NEXT: ret
   1384 
   1385   %b.i64 = bitcast <16 x i8> %b to <2 x i64>
   1386   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1387   %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
   1388   %c.i64 = bitcast <16 x i8> %c to <2 x i64>
   1389   %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1390   %c.hi = bitcast <1 x i64> %c.hi64 to <8 x i8>
   1391   %prod = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b.hi, <8 x i8> %c.hi) nounwind
   1392   %acc = add <8 x i16> %prod, %a  ; the add is expected to fold into the accumulating form
   1393   ret <8 x i16> %acc
   1394 }
   1395 
   1396 define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {  ; %a + smull(upper half of %b, upper half of %c)
   1397 ; CHECK-LABEL: bar1:
   1398 ; CHECK: smlal2.4s v0, v1, v2
   1399 ; CHECK-NEXT: ret
   1400 
   1401   %b.i64 = bitcast <8 x i16> %b to <2 x i64>
   1402   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1403   %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
   1404   %c.i64 = bitcast <8 x i16> %c to <2 x i64>
   1405   %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1406   %c.hi = bitcast <1 x i64> %c.hi64 to <4 x i16>
   1407   %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi) nounwind
   1408   %acc = add <4 x i32> %prod, %a  ; the add is expected to fold into the accumulating form
   1409   ret <4 x i32> %acc
   1410 }
   1411 
   1412 define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {  ; %a + smull(upper half of %b, upper half of %c)
   1413 ; CHECK-LABEL: bar2:
   1414 ; CHECK: smlal2.2d v0, v1, v2
   1415 ; CHECK-NEXT: ret
   1416 
   1417   %b.i64 = bitcast <4 x i32> %b to <2 x i64>
   1418   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1419   %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
   1420   %c.i64 = bitcast <4 x i32> %c to <2 x i64>
   1421   %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1422   %c.hi = bitcast <1 x i64> %c.hi64 to <2 x i32>
   1423   %prod = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi) nounwind
   1424   %acc = add <2 x i64> %prod, %a  ; the add is expected to fold into the accumulating form
   1425   ret <2 x i64> %acc
   1426 }
   1427 
   1428 define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {  ; %a + umull(upper half of %b, upper half of %c)
   1429 ; CHECK-LABEL: bar3:
   1430 ; CHECK: umlal2.8h v0, v1, v2
   1431 ; CHECK-NEXT: ret
   1432 
   1433   %b.i64 = bitcast <16 x i8> %b to <2 x i64>
   1434   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1435   %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
   1436   %c.i64 = bitcast <16 x i8> %c to <2 x i64>
   1437   %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1438   %c.hi = bitcast <1 x i64> %c.hi64 to <8 x i8>
   1439   %prod = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b.hi, <8 x i8> %c.hi) nounwind
   1440   %acc = add <8 x i16> %prod, %a  ; the add is expected to fold into the accumulating form
   1441   ret <8 x i16> %acc
   1442 }
   1443 
   1444 define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {  ; %a + umull(upper half of %b, upper half of %c)
   1445 ; CHECK-LABEL: bar4:
   1446 ; CHECK: umlal2.4s v0, v1, v2
   1447 ; CHECK-NEXT: ret
   1448 
   1449   %b.i64 = bitcast <8 x i16> %b to <2 x i64>
   1450   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1451   %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
   1452   %c.i64 = bitcast <8 x i16> %c to <2 x i64>
   1453   %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1454   %c.hi = bitcast <1 x i64> %c.hi64 to <4 x i16>
   1455   %prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi) nounwind
   1456   %acc = add <4 x i32> %prod, %a  ; the add is expected to fold into the accumulating form
   1457   ret <4 x i32> %acc
   1458 }
   1459 
   1460 define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {  ; %a + umull(upper half of %b, upper half of %c)
   1461 ; CHECK-LABEL: bar5:
   1462 ; CHECK: umlal2.2d v0, v1, v2
   1463 ; CHECK-NEXT: ret
   1464 
   1465   %b.i64 = bitcast <4 x i32> %b to <2 x i64>
   1466   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1467   %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
   1468   %c.i64 = bitcast <4 x i32> %c to <2 x i64>
   1469   %c.hi64 = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1470   %c.hi = bitcast <1 x i64> %c.hi64 to <2 x i32>
   1471   %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi) nounwind
   1472   %acc = add <2 x i64> %prod, %a  ; the add is expected to fold into the accumulating form
   1473   ret <2 x i64> %acc
   1474 }
   1475 
   1476 define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {  ; %a + smull(upper half of %b, splat(%c[3]))
   1477 ; CHECK-LABEL: mlal2_1:
   1478 ; CHECK: smlal2.4s v0, v1, v2[3]
   1479 ; CHECK-NEXT: ret
   1480   %c.splat = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>  ; lane 3 widened to 8 lanes
   1481   %b.i64 = bitcast <8 x i16> %b to <2 x i64>
   1482   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1483   %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
   1484   %splat.i64 = bitcast <8 x i16> %c.splat to <2 x i64>
   1485   %splat.hi64 = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>  ; upper half of the splat is still the splat
   1486   %splat.hi = bitcast <1 x i64> %splat.hi64 to <4 x i16>
   1487   %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %splat.hi) nounwind
   1488   %acc = add <4 x i32> %prod, %a
   1489   ret <4 x i32> %acc
   1490 }
   1491 
   1492 define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {  ; %a + smull(upper half of %b, splat(%c[1]))
   1493 ; CHECK-LABEL: mlal2_2:
   1494 ; CHECK: smlal2.2d v0, v1, v2[1]
   1495 ; CHECK-NEXT: ret
   1496   %c.splat = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>  ; lane 1 widened to 4 lanes
   1497   %b.i64 = bitcast <4 x i32> %b to <2 x i64>
   1498   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1499   %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
   1500   %splat.i64 = bitcast <4 x i32> %c.splat to <2 x i64>
   1501   %splat.hi64 = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>  ; upper half of the splat is still the splat
   1502   %splat.hi = bitcast <1 x i64> %splat.hi64 to <2 x i32>
   1503   %prod = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %splat.hi) nounwind
   1504   %acc = add <2 x i64> %prod, %a
   1505   ret <2 x i64> %acc
   1506 }
   1507 
   1508 define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {  ; %a + umull(upper half of %b, splat(%c[2]))
   1509 ; CHECK-LABEL: mlal2_4:
   1510 ; CHECK: umlal2.4s v0, v1, v2[2]
   1511 ; CHECK-NEXT: ret
   1512 
   1513   %c.splat = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>  ; lane 2 widened to 8 lanes
   1514   %b.i64 = bitcast <8 x i16> %b to <2 x i64>
   1515   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1516   %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
   1517   %splat.i64 = bitcast <8 x i16> %c.splat to <2 x i64>
   1518   %splat.hi64 = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>  ; upper half of the splat is still the splat
   1519   %splat.hi = bitcast <1 x i64> %splat.hi64 to <4 x i16>
   1520   %prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %splat.hi) nounwind
   1521   %acc = add <4 x i32> %prod, %a
   1522   ret <4 x i32> %acc
   1523 }
   1524 
   1525 define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {  ; %a + umull(upper half of %b, splat(%c[0]))
   1526 ; CHECK-LABEL: mlal2_5:
   1527 ; CHECK: umlal2.2d v0, v1, v2[0]
   1528 ; CHECK-NEXT: ret
   1529   %c.splat = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer  ; all-zero mask broadcasts lane 0
   1530   %b.i64 = bitcast <4 x i32> %b to <2 x i64>
   1531   %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
   1532   %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
   1533   %splat.i64 = bitcast <4 x i32> %c.splat to <2 x i64>
   1534   %splat.hi64 = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>  ; upper half of the splat is still the splat
   1535   %splat.hi = bitcast <1 x i64> %splat.hi64 to <2 x i32>
   1536   %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %splat.hi) nounwind
   1537   %acc = add <2 x i64> %prod, %a
   1538   ret <2 x i64> %acc
   1539 }
   1540 
   1541 ; rdar://12328502
   1542 define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {  ; %x times scalar %y; no separate dup.2d may be emitted
   1543 entry:
   1544 ; CHECK-LABEL: vmulq_n_f64:
   1545 ; CHECK-NOT: dup.2d
   1546 ; CHECK: fmul.2d v0, v0, v1[0]
   1547   %y.v0 = insertelement <2 x double> undef, double %y, i32 0
   1548   %y.splat = insertelement <2 x double> %y.v0, double %y, i32 1  ; both lanes now hold %y
   1549   %prod = fmul <2 x double> %y.splat, %x
   1550   ret <2 x double> %prod
   1551 }
   1552 
   1553 define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {  ; %x times scalar %y; no separate dup.4s may be emitted
   1554 entry:
   1555 ; CHECK-LABEL: vmulq_n_f32:
   1556 ; CHECK-NOT: dup.4s
   1557 ; CHECK: fmul.4s v0, v0, v1[0]
   1558   %y.v0 = insertelement <4 x float> undef, float %y, i32 0
   1559   %y.v1 = insertelement <4 x float> %y.v0, float %y, i32 1
   1560   %y.v2 = insertelement <4 x float> %y.v1, float %y, i32 2
   1561   %y.splat = insertelement <4 x float> %y.v2, float %y, i32 3  ; all four lanes now hold %y
   1562   %prod = fmul <4 x float> %y.splat, %x
   1563   ret <4 x float> %prod
   1564 }
   1565 
   1566 define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {  ; %x times scalar %y; no separate dup.2s may be emitted
   1567 entry:
   1568 ; CHECK-LABEL: vmul_n_f32:
   1569 ; CHECK-NOT: dup.2s
   1570 ; CHECK: fmul.2s v0, v0, v1[0]
   1571   %y.v0 = insertelement <2 x float> undef, float %y, i32 0
   1572   %y.splat = insertelement <2 x float> %y.v0, float %y, i32 1  ; both lanes now hold %y
   1573   %prod = fmul <2 x float> %y.splat, %x
   1574   ret <2 x float> %prod
   1575 }
   1576 
   1577 define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {  ; %a + %b * splat(%c[6]), lane taken from a 128-bit vector
   1578 entry:
   1579 ; CHECK-LABEL: vmla_laneq_s16_test:
   1580 ; CHECK-NOT: ext
   1581 ; CHECK: mla.4h v0, v1, v2[6]
   1582 ; CHECK-NEXT: ret
   1583   %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>  ; broadcast lane 6 into a 64-bit vector
   1584   %mul = mul <4 x i16> %shuffle, %b
   1585   %add = add <4 x i16> %mul, %a
   1586   ret <4 x i16> %add
   1587 }
   1588 
   1589 define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {  ; %a + %b * splat(%c[3]), lane taken from a 128-bit vector
   1590 entry:
   1591 ; CHECK-LABEL: vmla_laneq_s32_test:
   1592 ; CHECK-NOT: ext
   1593 ; CHECK: mla.2s v0, v1, v2[3]
   1594 ; CHECK-NEXT: ret
   1595   %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>  ; broadcast lane 3 into a 64-bit vector
   1596   %mul = mul <2 x i32> %shuffle, %b
   1597   %add = add <2 x i32> %mul, %a
   1598   ret <2 x i32> %add
   1599 }
   1600 
   1601 define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {  ; %a + %b * splat(%c[5]), splat built from the extracted upper half
   1602 entry:
   1603 ; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
   1604 ; CHECK-NOT: ext
   1605 ; CHECK: mla.8h v0, v1, v2[5]
   1606 ; CHECK-NEXT: ret
   1607   %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>  ; lanes 4..7 of %c
   1608   %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>  ; element 1 of that half, i.e. lane 5 of %c
   1609   %mul = mul <8 x i16> %shuffle2, %b
   1610   %add = add <8 x i16> %mul, %a
   1611   ret <8 x i16> %add
   1612 }
   1613 
   1614 define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {  ; %a + %b * splat(%c[3]), splat built from the extracted upper half
   1615 entry:
   1616 ; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
   1617 ; CHECK-NOT: ext
   1618 ; CHECK: mla.4s v0, v1, v2[3]
   1619 ; CHECK-NEXT: ret
   1620   %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>  ; lanes 2..3 of %c
   1621   %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>  ; element 1 of that half, i.e. lane 3 of %c
   1622   %mul = mul <4 x i32> %shuffle2, %b
   1623   %add = add <4 x i32> %mul, %a
   1624   ret <4 x i32> %add
   1625 }
   1626 
   1627 define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {  ; smull of %a by splat(%b[6]), lane taken from a 128-bit vector
   1628 entry:
   1629 ; CHECK-LABEL: vmull_laneq_s16_test:
   1630 ; CHECK-NOT: ext
   1631 ; CHECK: smull.4s v0, v0, v1[6]
   1632 ; CHECK-NEXT: ret
   1633   %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>  ; broadcast lane 6 into a 64-bit vector
   1634   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
   1635   ret <4 x i32> %vmull2.i
   1636 }
   1637 
   1638 define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {  ; smull of %a by splat(%b[2]), lane taken from a 128-bit vector
   1639 entry:
   1640 ; CHECK-LABEL: vmull_laneq_s32_test:
   1641 ; CHECK-NOT: ext
   1642 ; CHECK: smull.2d v0, v0, v1[2]
   1643 ; CHECK-NEXT: ret
   1644   %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>  ; broadcast lane 2 into a 64-bit vector
   1645   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
   1646   ret <2 x i64> %vmull2.i
   1647 }
   1648 define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {  ; umull of %a by splat(%b[6]), lane taken from a 128-bit vector
   1649 entry:
   1650 ; CHECK-LABEL: vmull_laneq_u16_test:
   1651 ; CHECK-NOT: ext
   1652 ; CHECK: umull.4s v0, v0, v1[6]
   1653 ; CHECK-NEXT: ret
   1654   %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>  ; broadcast lane 6 into a 64-bit vector
   1655   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
   1656   ret <4 x i32> %vmull2.i
   1657 }
   1658 
   1659 define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {  ; umull of %a by splat(%b[2]), lane taken from a 128-bit vector
   1660 entry:
   1661 ; CHECK-LABEL: vmull_laneq_u32_test:
   1662 ; CHECK-NOT: ext
   1663 ; CHECK: umull.2d v0, v0, v1[2]
   1664 ; CHECK-NEXT: ret
   1665   %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>  ; broadcast lane 2 into a 64-bit vector
   1666   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
   1667   ret <2 x i64> %vmull2.i
   1668 }
   1669 
   1670 define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {  ; smull of upper half of %b by a splat of trunc(%d); %a and %c are unused
   1671 entry:
   1672 ; CHECK-LABEL: vmull_high_n_s16_test:
   1673 ; CHECK-NOT: ext
   1674 ; CHECK: smull2.4s
   1675 ; CHECK-NEXT: ret
   1676   %conv = trunc i32 %d to i16
   1677   %0 = bitcast <8 x i16> %b to <2 x i64>
   1678   %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>  ; element 1 of the <2 x i64> view
   1679   %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
   1680   %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
   1681   %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
   1682   %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
   1683   %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3  ; all four lanes now hold %conv
   1684   %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
   1685   ret <4 x i32> %vmull2.i.i
   1686 }
   1687 
   1688 define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {  ; smull of upper half of %b by a splat of %d; %a and %c are unused
   1689 entry:
   1690 ; CHECK-LABEL: vmull_high_n_s32_test:
   1691 ; CHECK-NOT: ext
   1692 ; CHECK: smull2.2d
   1693 ; CHECK-NEXT: ret
   1694   %0 = bitcast <4 x i32> %b to <2 x i64>
   1695   %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>  ; element 1 of the <2 x i64> view
   1696   %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
   1697   %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
   1698   %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1  ; both lanes now hold %d
   1699   %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
   1700   ret <2 x i64> %vmull2.i.i
   1701 }
   1702 
   1703 define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {  ; umull of upper half of %b by a splat of trunc(%d); %a and %c are unused
   1704 entry:
   1705 ; CHECK-LABEL: vmull_high_n_u16_test:
   1706 ; CHECK-NOT: ext
   1707 ; CHECK: umull2.4s
   1708 ; CHECK-NEXT: ret
   1709   %conv = trunc i32 %d to i16
   1710   %0 = bitcast <8 x i16> %b to <2 x i64>
   1711   %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>  ; element 1 of the <2 x i64> view
   1712   %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
   1713   %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
   1714   %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
   1715   %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
   1716   %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3  ; all four lanes now hold %conv
   1717   %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
   1718   ret <4 x i32> %vmull2.i.i
   1719 }
   1720 
   1721 define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {  ; umull of upper half of %b by a splat of %d; %a and %c are unused
   1722 entry:
   1723 ; CHECK-LABEL: vmull_high_n_u32_test:
   1724 ; CHECK-NOT: ext
   1725 ; CHECK: umull2.2d
   1726 ; CHECK-NEXT: ret
   1727   %0 = bitcast <4 x i32> %b to <2 x i64>
   1728   %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>  ; element 1 of the <2 x i64> view
   1729   %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
   1730   %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
   1731   %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1  ; both lanes now hold %d
   1732   %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
   1733   ret <2 x i64> %vmull2.i.i
   1734 }
   1735 
   1736 define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {  ; %a * splat(%b[1]) where the splat is built by extract + inserts
   1737 ; CHECK-LABEL: vmul_built_dup_test:
   1738 ; CHECK-NOT: ins
   1739 ; CHECK-NOT: dup
   1740 ; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1]
   1741   %lane = extractelement <4 x i32> %b, i32 1
   1742   %dup0 = insertelement <4 x i32> undef, i32 %lane, i32 0
   1743   %dup1 = insertelement <4 x i32> %dup0, i32 %lane, i32 1
   1744   %dup2 = insertelement <4 x i32> %dup1, i32 %lane, i32 2
   1745   %splat = insertelement <4 x i32> %dup2, i32 %lane, i32 3  ; every lane now holds %b[1]
   1746   %res = mul <4 x i32> %a, %splat
   1747   ret <4 x i32> %res
   1748 }
   1749 
   1750 define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {  ; %a * splat(%b[3]) with 64-bit source and result
   1751 ; CHECK-LABEL: vmul_built_dup_fromsmall_test:
   1752 ; CHECK-NOT: ins
   1753 ; CHECK-NOT: dup
   1754 ; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3]
   1755   %lane = extractelement <4 x i16> %b, i32 3
   1756   %dup0 = insertelement <4 x i16> undef, i16 %lane, i32 0
   1757   %dup1 = insertelement <4 x i16> %dup0, i16 %lane, i32 1
   1758   %dup2 = insertelement <4 x i16> %dup1, i16 %lane, i32 2
   1759   %splat = insertelement <4 x i16> %dup2, i16 %lane, i32 3  ; every lane now holds %b[3]
   1760   %res = mul <4 x i16> %a, %splat
   1761   ret <4 x i16> %res
   1762 }
   1763 
   1764 define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {  ; 128-bit %a * splat(%b[0]) where %b is a 64-bit vector
   1765 ; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
   1766 ; CHECK-NOT: ins
   1767 ; CHECK-NOT: dup
   1768 ; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
   1769   %lane = extractelement <4 x i16> %b, i32 0
   1770   %dup0 = insertelement <8 x i16> undef, i16 %lane, i32 0
   1771   %dup1 = insertelement <8 x i16> %dup0, i16 %lane, i32 1
   1772   %dup2 = insertelement <8 x i16> %dup1, i16 %lane, i32 2
   1773   %dup3 = insertelement <8 x i16> %dup2, i16 %lane, i32 3
   1774   %dup4 = insertelement <8 x i16> %dup3, i16 %lane, i32 4
   1775   %dup5 = insertelement <8 x i16> %dup4, i16 %lane, i32 5
   1776   %dup6 = insertelement <8 x i16> %dup5, i16 %lane, i32 6
   1777   %splat = insertelement <8 x i16> %dup6, i16 %lane, i32 7  ; every lane now holds %b[0]
   1778   %res = mul <8 x i16> %a, %splat
   1779   ret <8 x i16> %res
   1780 }
   1781 
   1782 define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {  ; sqdmull of the upper halves of both operands, no ext expected
   1783 ; CHECK-LABEL: mull_from_two_extracts:
   1784 ; CHECK-NOT: ext
   1785 ; CHECK: sqdmull2.2d
   1786 
   1787   %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1788   %rhs.hi = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1789 
   1790   %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.hi) nounwind
   1791   ret <2 x i64> %prod
   1792 }
   1793 
   1794 define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {  ; sqadd(%accum, sqdmull(upper halves)), no ext expected
   1795 ; CHECK-LABEL: mlal_from_two_extracts:
   1796 ; CHECK-NOT: ext
   1797 ; CHECK: sqdmlal2.2d
   1798 
   1799   %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1800   %rhs.hi = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1801 
   1802   %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.hi) nounwind
   1803   %acc = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %prod)
   1804   ret <2 x i64> %acc
   1805 }
   1806 
   1807 define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {  ; sqdmull of upper half of %lhs by a splat of scalar %rhs
   1808 ; CHECK-LABEL: mull_from_extract_dup:
   1809 ; CHECK-NOT: ext
   1810 ; CHECK: sqdmull2.2d
   1811   %rhs.v0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
   1812   %rhs.splat = insertelement <2 x i32> %rhs.v0, i32 %rhs, i32 1
   1813 
   1814   %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1815 
   1816   %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.splat) nounwind
   1817   ret <2 x i64> %prod
   1818 }
   1819 
   1820 define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {  ; pmull of upper half of %lhs by a splat of scalar %rhs
   1821 ; CHECK-LABEL: pmull_from_extract_dup:
   1822 ; CHECK-NOT: ext
   1823 ; CHECK: pmull2.8h
   1824   %rhs.v0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
   1825   %rhs.splat = shufflevector <8 x i8> %rhs.v0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>  ; broadcast lane 0
   1826 
   1827   %lhs.hi = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1828 
   1829   %prod = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.hi, <8 x i8> %rhs.splat) nounwind
   1830   ret <8 x i16> %prod
   1831 }
   1832 
   1833 define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {  ; pmull of upper half of %lhs by splat(%rhs[0])
   1834 ; CHECK-LABEL: pmull_from_extract_duplane:
   1835 ; CHECK-NOT: ext
   1836 ; CHECK: pmull2.8h
   1837 
   1838   %lhs.hi = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1839   %rhs.splat = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>  ; broadcast lane 0
   1840 
   1841   %prod = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.hi, <8 x i8> %rhs.splat) nounwind
   1842   ret <8 x i16> %prod
   1843 }
   1844 
   1845 define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {  ; sqdmull of upper half of %lhs by splat(%rhs[0])
   1846 ; CHECK-LABEL: sqdmull_from_extract_duplane:
   1847 ; CHECK-NOT: ext
   1848 ; CHECK: sqdmull2.2d
   1849 
   1850   %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1851   %rhs.splat = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>  ; broadcast lane 0
   1852 
   1853   %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.splat) nounwind
   1854   ret <2 x i64> %prod
   1855 }
   1856 
   1857 define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {  ; sqadd(%accum, sqdmull(upper half of %lhs, splat(%rhs[0])))
   1858 ; CHECK-LABEL: sqdmlal_from_extract_duplane:
   1859 ; CHECK-NOT: ext
   1860 ; CHECK: sqdmlal2.2d
   1861 
   1862   %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1863   %rhs.splat = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>  ; broadcast lane 0
   1864 
   1865   %prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.splat) nounwind
   1866   %acc = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %prod)
   1867   ret <2 x i64> %acc
   1868 }
   1869 
   1870 define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {  ; %accum + umull(upper half of %lhs, splat(%rhs[0]))
   1871 ; CHECK-LABEL: umlal_from_extract_duplane:
   1872 ; CHECK-NOT: ext
   1873 ; CHECK: umlal2.2d
   1874 
   1875   %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1876   %rhs.splat = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>  ; broadcast lane 0
   1877 
   1878   %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.hi, <2 x i32> %rhs.splat) nounwind
   1879   %acc = add <2 x i64> %accum, %prod
   1880   ret <2 x i64> %acc
   1881 }
   1882 
   1883 define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {  ; fma(%lhs, %rvec[3], %accum) as a scalar by-element fmla
   1884 ; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
   1885 ; CHECK: fmla.s s0, s1, v2[3]
   1886   %lane = extractelement <4 x float> %rvec, i32 3
   1887   %fma = call float @llvm.fma.f32(float %lhs, float %lane, float %accum)
   1888   ret float %fma
   1889 }
   1890 
   1891 define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {  ; fma(%lhs, %rvec[1], %accum) with a 64-bit source vector
   1892 ; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
   1893 ; CHECK: fmla.s s0, s1, v2[1]
   1894   %lane = extractelement <2 x float> %rvec, i32 1
   1895   %fma = call float @llvm.fma.f32(float %lhs, float %lane, float %accum)
   1896   ret float %fma
   1897 }
   1898 
   1899 define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {  ; fma(%lhs, -%rvec[3], %accum) as a scalar by-element fmls
   1900 ; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
   1901 ; CHECK: fmls.s s0, s1, v2[3]
   1902   %lane = extractelement <4 x float> %rvec, i32 3
   1903   %lane.neg = fsub float -0.0, %lane  ; negation written as a subtraction from -0.0
   1904   %fma = call float @llvm.fma.f32(float %lhs, float %lane.neg, float %accum)
   1905   ret float %fma
   1906 }
   1907 
   1908 define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {  ; fma(%lhs, -%rvec[1], %accum) with a 64-bit source vector
   1909 ; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
   1910 ; CHECK: fmls.s s0, s1, v2[1]
   1911   %lane = extractelement <2 x float> %rvec, i32 1
   1912   %lane.neg = fsub float -0.0, %lane  ; negation written as a subtraction from -0.0
   1913   %fma = call float @llvm.fma.f32(float %lhs, float %lane.neg, float %accum)
   1914   ret float %fma
   1915 }
   1916 
   1917 declare float @llvm.fma.f32(float, float, float)
   1918 
   1919 define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {  ; fma(%lhs, %rvec[1], %accum) as a scalar by-element fmla
   1920 ; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
   1921 ; CHECK: fmla.d d0, d1, v2[1]
   1922   %lane = extractelement <2 x double> %rvec, i32 1
   1923   %fma = call double @llvm.fma.f64(double %lhs, double %lane, double %accum)
   1924   ret double %fma
   1925 }
   1926 
   1927 define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {  ; fma(%lhs, -%rvec[1], %accum) as a scalar by-element fmls
   1928 ; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
   1929 ; CHECK: fmls.d d0, d1, v2[1]
   1930   %lane = extractelement <2 x double> %rvec, i32 1
   1931   %lane.neg = fsub double -0.0, %lane  ; negation written as a subtraction from -0.0
   1932   %fma = call double @llvm.fma.f64(double %lhs, double %lane.neg, double %accum)
   1933   ret double %fma
   1934 }
   1935 
   1936 declare double @llvm.fma.f64(double, double, double)
   1937 
   1938 define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {  ; whole vector negated first, then lane 3 splatted into the fma
   1939 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
   1940 ; CHECK: fmls.2s v0, v1, v2[3]
   1941   %neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
   1942   %neg.lane = shufflevector <4 x float> %neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   1943   %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %neg.lane, <2 x float> %accum)
   1944   ret <2 x float> %fma
   1945 }
   1946 
   1947 define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {  ; whole vector negated first, then lane 1 splatted into the fma
   1948 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
   1949 ; CHECK: fmls.2s v0, v1, v2[1]
   1950   %neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
   1951   %neg.lane = shufflevector <2 x float> %neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   1952   %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %neg.lane, <2 x float> %accum)
   1953   ret <2 x float> %fma
   1954 }
   1955 
   1956 define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
        ; 4-lane form: %rhs is negated, lane 3 is broadcast to all four lanes, and
        ; the result feeds llvm.fma.v4f32; fmls.4s on lane 3 is expected.
   1957 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
   1958 ; CHECK: fmls.4s v0, v1, v2[3]
   1959   %negated = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
   1960   %dup = shufflevector <4 x float> %negated, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1961   %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %dup, <4 x float> %accum)
   1962   ret <4 x float> %fused
   1963 }
   1964 
   1965 define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
        ; 4-lane fma fed from a 2-lane %rhs: negate, broadcast lane 1 into four
        ; lanes, then call llvm.fma.v4f32; fmls.4s on lane 1 is expected.
   1966 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
   1967 ; CHECK: fmls.4s v0, v1, v2[1]
   1968   %negated = fsub <2 x float> <float -0.0, float -0.0>, %rhs
   1969   %dup = shufflevector <2 x float> %negated, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1970   %fused = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %dup, <4 x float> %accum)
   1971   ret <4 x float> %fused
   1972 }
   1973 
   1974 define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
        ; Double-precision form: %rhs is negated, lane 1 is splatted, and the
        ; splat feeds llvm.fma.v2f64; fmls.2d on lane 1 is expected.
   1975 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
   1976 ; CHECK: fmls.2d v0, v1, v2[1]
   1977   %negated = fsub <2 x double> <double -0.0, double -0.0>, %rhs
   1978   %dup = shufflevector <2 x double> %negated, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   1979   %fused = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %dup, <2 x double> %accum)
   1980   ret <2 x double> %fused
   1981 }
   1982 
   1983 define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
        ; Multiplies two single-element double vectors; the output only needs to
        ; contain an fmul instruction.
   1984 ; CHECK-LABEL: test_fmul_v1f64:
   1985 ; CHECK: fmul
   1986   %product = fmul <1 x double> %L, %R
   1987   ret <1 x double> %product
   1988 }
   1989 
   1990 define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
        ; Divides two single-element double vectors; the output must contain an
        ; fdiv instruction. The instruction match is an ordinary match directive:
        ; label directives are reserved for unique block-delimiting text such as
        ; the function symbol, not for instruction mnemonics.
   1991 ; CHECK-LABEL: test_fdiv_v1f64:
   1992 ; CHECK: fdiv
   1993   %quot = fdiv <1 x double> %L, %R
   1994   ret <1 x double> %quot
   1995 }
   1996 
   1997 define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
        ; Calls the sqdmulls.scalar intrinsic on %A and %B, then combines the
        ; result with %C via the sqadd.i64 intrinsic; an sqdmlal is expected.
   1998 ;CHECK-LABEL: sqdmlal_d:
   1999 ;CHECK: sqdmlal
   2000   %dmul = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
   2001   %acc = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %dmul)
   2002   ret i64 %acc
   2003 }
   2004 
   2005 define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
        ; Calls the sqdmulls.scalar intrinsic on %A and %B, then combines the
        ; result with %C via the sqsub.i64 intrinsic; an sqdmlsl is expected.
   2006 ;CHECK-LABEL: sqdmlsl_d:
   2007 ;CHECK: sqdmlsl
   2008   %dmul = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
   2009   %acc = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %dmul)
   2010   ret i64 %acc
   2011 }
   2012 
   2013 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
        ; Passes two i64 scalars straight to the pmull64 intrinsic; the pmull.1q
        ; form is expected in the output.
   2014 ; CHECK-LABEL: test_pmull_64:
   2015 ; CHECK: pmull.1q
   2016   %poly = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
   2017   ret <16 x i8> %poly
   2018 }
   2019 
   2020 define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
        ; Both pmull64 operands are lane 1 of a two-lane i64 vector; the
        ; pmull2.1q form is expected in the output.
   2021 ; CHECK-LABEL: test_pmull_high_64:
   2022 ; CHECK: pmull2.1q
   2023   %l.lane1 = extractelement <2 x i64> %l, i32 1
   2024   %r.lane1 = extractelement <2 x i64> %r, i32 1
   2025   %poly = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l.lane1, i64 %r.lane1)
   2026   ret <16 x i8> %poly
   2027 }
   2028 
   2029 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
   2030 
   2031 define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
        ; Multiplies two single-element i64 vectors; the output only needs to
        ; contain a mul instruction.
   2032 ; CHECK-LABEL: test_mul_v1i64:
   2033 ; CHECK: mul
   2034   %product = mul <1 x i64> %lhs, %rhs
   2035   ret <1 x i64> %product
   2036 }
   2037