Home | History | Annotate | Download | only in AArch64
      1 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
      2 
      3 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
      4 
      5 declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
      6 
      7 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
      8 
      9 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
     10 
     11 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
     12 
     13 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
     14 
     15 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
     16 
     17 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
     18 
     19 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
     20 
     21 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
     22 
     23 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
     24 
     25 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
     26 
     27 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
     28 
     29 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
     30 
     31 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
     32 
     33 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
     34 
     35 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
     36 
     37 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
     38 
     39 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
     40 
     41 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
     42 
     43 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
     44 
      45 define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; %a + %b * splat(%v[3]); expects one by-element mla
      46 ; CHECK-LABEL: test_vmla_lane_s16:
      47 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
      48 ; CHECK-NEXT: ret
      49 entry:
      50   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
      51   %mul = mul <4 x i16> %shuffle, %b
      52   %add = add <4 x i16> %mul, %a ; accumulate the product into %a
      53   ret <4 x i16> %add
      54 }
     55 
      56 define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; %a + %b * splat(%v[3]) on 8 lanes; expects one by-element mla
      57 ; CHECK-LABEL: test_vmlaq_lane_s16:
      58 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
      59 ; CHECK-NEXT: ret
      60 entry:
      61   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 8 lanes
      62   %mul = mul <8 x i16> %shuffle, %b
      63   %add = add <8 x i16> %mul, %a ; accumulate the product into %a
      64   ret <8 x i16> %add
      65 }
     66 
      67 define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; %a + %b * splat(%v[1]); expects one by-element mla
      68 ; CHECK-LABEL: test_vmla_lane_s32:
      69 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
      70 ; CHECK-NEXT: ret
      71 entry:
      72   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of %v to both lanes
      73   %mul = mul <2 x i32> %shuffle, %b
      74   %add = add <2 x i32> %mul, %a ; accumulate the product into %a
      75   ret <2 x i32> %add
      76 }
     77 
      78 define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; %a + %b * splat(%v[1]) on 4 lanes; expects one by-element mla
      79 ; CHECK-LABEL: test_vmlaq_lane_s32:
      80 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
      81 ; CHECK-NEXT: ret
      82 entry:
      83   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of the 2-lane %v to 4 lanes
      84   %mul = mul <4 x i32> %shuffle, %b
      85   %add = add <4 x i32> %mul, %a ; accumulate the product into %a
      86   ret <4 x i32> %add
      87 }
     88 
      89 define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; %a + %b * splat(%v[7]); expects one by-element mla
      90 ; CHECK-LABEL: test_vmla_laneq_s16:
      91 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
      92 ; CHECK-NEXT: ret
      93 entry:
      94   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; broadcast lane 7 of the 8-lane %v to 4 lanes
      95   %mul = mul <4 x i16> %shuffle, %b
      96   %add = add <4 x i16> %mul, %a ; accumulate the product into %a
      97   ret <4 x i16> %add
      98 }
     99 
     100 define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; %a + %b * splat(%v[7]); expects one by-element mla
     101 ; CHECK-LABEL: test_vmlaq_laneq_s16:
     102 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
     103 ; CHECK-NEXT: ret
     104 entry:
     105   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; broadcast lane 7 of %v to all 8 lanes
     106   %mul = mul <8 x i16> %shuffle, %b
     107   %add = add <8 x i16> %mul, %a ; accumulate the product into %a
     108   ret <8 x i16> %add
     109 }
    110 
     111 define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; %a + %b * splat(%v[3]); expects one by-element mla
     112 ; CHECK-LABEL: test_vmla_laneq_s32:
     113 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
     114 ; CHECK-NEXT: ret
     115 entry:
     116   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 2 lanes
     117   %mul = mul <2 x i32> %shuffle, %b
     118   %add = add <2 x i32> %mul, %a ; accumulate the product into %a
     119   ret <2 x i32> %add
     120 }
    121 
     122 define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; %a + %b * splat(%v[3]); expects one by-element mla
     123 ; CHECK-LABEL: test_vmlaq_laneq_s32:
     124 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
     125 ; CHECK-NEXT: ret
     126 entry:
     127   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
     128   %mul = mul <4 x i32> %shuffle, %b
     129   %add = add <4 x i32> %mul, %a ; accumulate the product into %a
     130   ret <4 x i32> %add
     131 }
    132 
     133 define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; %a - %b * splat(%v[3]); expects one by-element mls
     134 ; CHECK-LABEL: test_vmls_lane_s16:
     135 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
     136 ; CHECK-NEXT: ret
     137 entry:
     138   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
     139   %mul = mul <4 x i16> %shuffle, %b
     140   %sub = sub <4 x i16> %a, %mul ; subtract the product from %a
     141   ret <4 x i16> %sub
     142 }
    143 
     144 define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; %a - %b * splat(%v[3]) on 8 lanes; expects one by-element mls
     145 ; CHECK-LABEL: test_vmlsq_lane_s16:
     146 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
     147 ; CHECK-NEXT: ret
     148 entry:
     149   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 8 lanes
     150   %mul = mul <8 x i16> %shuffle, %b
     151   %sub = sub <8 x i16> %a, %mul ; subtract the product from %a
     152   ret <8 x i16> %sub
     153 }
    154 
     155 define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; %a - %b * splat(%v[1]); expects one by-element mls
     156 ; CHECK-LABEL: test_vmls_lane_s32:
     157 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
     158 ; CHECK-NEXT: ret
     159 entry:
     160   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of %v to both lanes
     161   %mul = mul <2 x i32> %shuffle, %b
     162   %sub = sub <2 x i32> %a, %mul ; subtract the product from %a
     163   ret <2 x i32> %sub
     164 }
    165 
     166 define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; %a - %b * splat(%v[1]) on 4 lanes; expects one by-element mls
     167 ; CHECK-LABEL: test_vmlsq_lane_s32:
     168 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
     169 ; CHECK-NEXT: ret
     170 entry:
     171   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of the 2-lane %v to 4 lanes
     172   %mul = mul <4 x i32> %shuffle, %b
     173   %sub = sub <4 x i32> %a, %mul ; subtract the product from %a
     174   ret <4 x i32> %sub
     175 }
    176 
     177 define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; %a - %b * splat(%v[7]); expects one by-element mls
     178 ; CHECK-LABEL: test_vmls_laneq_s16:
     179 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
     180 ; CHECK-NEXT: ret
     181 entry:
     182   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; broadcast lane 7 of the 8-lane %v to 4 lanes
     183   %mul = mul <4 x i16> %shuffle, %b
     184   %sub = sub <4 x i16> %a, %mul ; subtract the product from %a
     185   ret <4 x i16> %sub
     186 }
    187 
     188 define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; %a - %b * splat(%v[7]); expects one by-element mls
     189 ; CHECK-LABEL: test_vmlsq_laneq_s16:
     190 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
     191 ; CHECK-NEXT: ret
     192 entry:
     193   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; broadcast lane 7 of %v to all 8 lanes
     194   %mul = mul <8 x i16> %shuffle, %b
     195   %sub = sub <8 x i16> %a, %mul ; subtract the product from %a
     196   ret <8 x i16> %sub
     197 }
    198 
     199 define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; %a - %b * splat(%v[3]); expects one by-element mls
     200 ; CHECK-LABEL: test_vmls_laneq_s32:
     201 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
     202 ; CHECK-NEXT: ret
     203 entry:
     204   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 2 lanes
     205   %mul = mul <2 x i32> %shuffle, %b
     206   %sub = sub <2 x i32> %a, %mul ; subtract the product from %a
     207   ret <2 x i32> %sub
     208 }
    209 
     210 define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; %a - %b * splat(%v[3]); expects one by-element mls
     211 ; CHECK-LABEL: test_vmlsq_laneq_s32:
     212 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
     213 ; CHECK-NEXT: ret
     214 entry:
     215   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
     216   %mul = mul <4 x i32> %shuffle, %b
     217   %sub = sub <4 x i32> %a, %mul ; subtract the product from %a
     218   ret <4 x i32> %sub
     219 }
    220 
     221 define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; %a * splat(%v[3]); expects one by-element mul
     222 ; CHECK-LABEL: test_vmul_lane_s16:
     223 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
     224 ; CHECK-NEXT: ret
     225 entry:
     226   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
     227   %mul = mul <4 x i16> %shuffle, %a
     228   ret <4 x i16> %mul
     229 }
    230 
     231 define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; %a * splat(%v[3]) on 8 lanes; expects one by-element mul
     232 ; CHECK-LABEL: test_vmulq_lane_s16:
     233 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
     234 ; CHECK-NEXT: ret
     235 entry:
     236   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 8 lanes
     237   %mul = mul <8 x i16> %shuffle, %a
     238   ret <8 x i16> %mul
     239 }
    240 
     241 define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; %a * splat(%v[1]); expects one by-element mul
     242 ; CHECK-LABEL: test_vmul_lane_s32:
     243 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
     244 ; CHECK-NEXT: ret
     245 entry:
     246   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of %v to both lanes
     247   %mul = mul <2 x i32> %shuffle, %a
     248   ret <2 x i32> %mul
     249 }
    250 
     251 define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; %a * splat(%v[1]) on 4 lanes; expects one by-element mul
     252 ; CHECK-LABEL: test_vmulq_lane_s32:
     253 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
     254 ; CHECK-NEXT: ret
     255 entry:
     256   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of the 2-lane %v to 4 lanes
     257   %mul = mul <4 x i32> %shuffle, %a
     258   ret <4 x i32> %mul
     259 }
    260 
     261 define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; %a * splat(%v[3]); body matches the _s16 variant, only the name differs
     262 ; CHECK-LABEL: test_vmul_lane_u16:
     263 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
     264 ; CHECK-NEXT: ret
     265 entry:
     266   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
     267   %mul = mul <4 x i16> %shuffle, %a
     268   ret <4 x i16> %mul
     269 }
    270 
     271 define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; %a * splat(%v[3]) on 8 lanes; body matches the _s16 variant, only the name differs
     272 ; CHECK-LABEL: test_vmulq_lane_u16:
     273 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
     274 ; CHECK-NEXT: ret
     275 entry:
     276   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 8 lanes
     277   %mul = mul <8 x i16> %shuffle, %a
     278   ret <8 x i16> %mul
     279 }
    280 
     281 define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; %a * splat(%v[1]); body matches the _s32 variant, only the name differs
     282 ; CHECK-LABEL: test_vmul_lane_u32:
     283 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
     284 ; CHECK-NEXT: ret
     285 entry:
     286   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of %v to both lanes
     287   %mul = mul <2 x i32> %shuffle, %a
     288   ret <2 x i32> %mul
     289 }
    290 
     291 define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; %a * splat(%v[1]) on 4 lanes; body matches the _s32 variant, only the name differs
     292 ; CHECK-LABEL: test_vmulq_lane_u32:
     293 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
     294 ; CHECK-NEXT: ret
     295 entry:
     296   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of the 2-lane %v to 4 lanes
     297   %mul = mul <4 x i32> %shuffle, %a
     298   ret <4 x i32> %mul
     299 }
    300 
     301 define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; %a * splat(%v[7]); expects one by-element mul
     302 ; CHECK-LABEL: test_vmul_laneq_s16:
     303 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
     304 ; CHECK-NEXT: ret
     305 entry:
     306   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; broadcast lane 7 of the 8-lane %v to 4 lanes
     307   %mul = mul <4 x i16> %shuffle, %a
     308   ret <4 x i16> %mul
     309 }
    310 
     311 define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; %a * splat(%v[7]); expects one by-element mul
     312 ; CHECK-LABEL: test_vmulq_laneq_s16:
     313 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
     314 ; CHECK-NEXT: ret
     315 entry:
     316   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; broadcast lane 7 of %v to all 8 lanes
     317   %mul = mul <8 x i16> %shuffle, %a
     318   ret <8 x i16> %mul
     319 }
    320 
     321 define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; %a * splat(%v[3]); expects one by-element mul
     322 ; CHECK-LABEL: test_vmul_laneq_s32:
     323 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
     324 ; CHECK-NEXT: ret
     325 entry:
     326   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 2 lanes
     327   %mul = mul <2 x i32> %shuffle, %a
     328   ret <2 x i32> %mul
     329 }
    330 
     331 define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; %a * splat(%v[3]); expects one by-element mul
     332 ; CHECK-LABEL: test_vmulq_laneq_s32:
     333 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
     334 ; CHECK-NEXT: ret
     335 entry:
     336   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
     337   %mul = mul <4 x i32> %shuffle, %a
     338   ret <4 x i32> %mul
     339 }
    340 
     341 define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; %a * splat(%v[7]); body matches the _s16 variant, only the name differs
     342 ; CHECK-LABEL: test_vmul_laneq_u16:
     343 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
     344 ; CHECK-NEXT: ret
     345 entry:
     346   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; broadcast lane 7 of the 8-lane %v to 4 lanes
     347   %mul = mul <4 x i16> %shuffle, %a
     348   ret <4 x i16> %mul
     349 }
    350 
     351 define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; %a * splat(%v[7]); body matches the _s16 variant, only the name differs
     352 ; CHECK-LABEL: test_vmulq_laneq_u16:
     353 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
     354 ; CHECK-NEXT: ret
     355 entry:
     356   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; broadcast lane 7 of %v to all 8 lanes
     357   %mul = mul <8 x i16> %shuffle, %a
     358   ret <8 x i16> %mul
     359 }
    360 
     361 define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; %a * splat(%v[3]); body matches the _s32 variant, only the name differs
     362 ; CHECK-LABEL: test_vmul_laneq_u32:
     363 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
     364 ; CHECK-NEXT: ret
     365 entry:
     366   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 2 lanes
     367   %mul = mul <2 x i32> %shuffle, %a
     368   ret <2 x i32> %mul
     369 }
    370 
     371 define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; %a * splat(%v[3]); body matches the _s32 variant, only the name differs
     372 ; CHECK-LABEL: test_vmulq_laneq_u32:
     373 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
     374 ; CHECK-NEXT: ret
     375 entry:
     376   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
     377   %mul = mul <4 x i32> %shuffle, %a
     378   ret <4 x i32> %mul
     379 }
    380 
     381 define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; fma(splat(%v[1]), %b, %a); expects one by-element fmla
     382 ; CHECK-LABEL: test_vfma_lane_f32:
     383 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
     384 ; CHECK-NEXT: ret
     385 entry:
     386   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of %v to both lanes
     387   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) ; splat is the first multiplicand, %a the addend
     388   ret <2 x float> %0
     389 }
    390 
    391 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
    392 
     393 define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; fma(splat(%v[1]), %b, %a) on 4 lanes; expects one by-element fmla
     394 ; CHECK-LABEL: test_vfmaq_lane_f32:
     395 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
     396 ; CHECK-NEXT: ret
     397 entry:
     398   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of the 2-lane %v to 4 lanes
     399   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) ; splat is the first multiplicand, %a the addend
     400   ret <4 x float> %0
     401 }
    402 
    403 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
    404 
     405 define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; fma(splat(%v[3]), %b, %a); expects one by-element fmla
     406 ; CHECK-LABEL: test_vfma_laneq_f32:
     407 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
     408 ; CHECK-NEXT: ret
     409 entry:
     410   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> ; broadcast lane 3 of the 4-lane %v to 2 lanes
     411   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) ; splat is the first multiplicand, %a the addend
     412   ret <2 x float> %0
     413 }
    414 
     415 define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; fma(splat(%v[3]), %b, %a); expects one by-element fmla
     416 ; CHECK-LABEL: test_vfmaq_laneq_f32:
     417 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
     418 ; CHECK-NEXT: ret
     419 entry:
     420   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of %v to all 4 lanes
     421   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) ; splat is the first multiplicand, %a the addend
     422   ret <4 x float> %0
     423 }
    424 
     425 define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; fma(splat((-%v)[1]), %b, %a); expects the negation folded into one by-element fmls
     426 ; CHECK-LABEL: test_vfms_lane_f32:
     427 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
     428 ; CHECK-NEXT: ret
     429 entry:
     430   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v ; -0.0 - %v: negates every lane of %v
     431   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of the negated vector
     432   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
     433   ret <2 x float> %0
     434 }
    435 
     436 define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; fma(splat((-%v)[1]), %b, %a) on 4 lanes; expects one by-element fmls
     437 ; CHECK-LABEL: test_vfmsq_lane_f32:
     438 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
     439 ; CHECK-NEXT: ret
     440 entry:
     441   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v ; -0.0 - %v: negates every lane of %v
     442   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast lane 1 of the negated 2-lane vector to 4 lanes
     443   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
     444   ret <4 x float> %0
     445 }
    446 
     447 define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; fma(splat((-%v)[3]), %b, %a); expects one by-element fmls
     448 ; CHECK-LABEL: test_vfms_laneq_f32:
     449 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
     450 ; CHECK-NEXT: ret
     451 entry:
     452   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v ; -0.0 - %v: negates every lane of %v
     453   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3> ; broadcast lane 3 of the negated 4-lane vector to 2 lanes
     454   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
     455   ret <2 x float> %0
     456 }
    457 
     458 define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; fma(splat((-%v)[3]), %b, %a); expects one by-element fmls
     459 ; CHECK-LABEL: test_vfmsq_laneq_f32:
     460 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
     461 ; CHECK-NEXT: ret
     462 entry:
     463   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v ; -0.0 - %v: negates every lane of %v
     464   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; broadcast lane 3 of the negated vector to all 4 lanes
     465   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
     466   ret <4 x float> %0
     467 }
    468 
     469 define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; fma(splat(%v[0]), %b, %a); expects one by-element fmla on d[0]
     470 ; CHECK-LABEL: test_vfmaq_lane_f64:
     471 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
     472 ; CHECK-NEXT: ret
     473 entry:
     474   %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer ; all-zero mask: the single lane of %v goes to both result lanes
     475   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
     476   ret <2 x double> %0
     477 }
    478 
    479 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
    480 
     481 define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; fma(splat(%v[1]), %b, %a); expects one by-element fmla on d[1]
     482 ; CHECK-LABEL: test_vfmaq_laneq_f64:
     483 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
     484 ; CHECK-NEXT: ret
     485 entry:
     486   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of %v to both lanes
     487   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
     488   ret <2 x double> %0
     489 }
    490 
     491 define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; fma(splat((-%v)[0]), %b, %a); expects one by-element fmls on d[0]
     492 ; CHECK-LABEL: test_vfmsq_lane_f64:
     493 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
     494 ; CHECK-NEXT: ret
     495 entry:
     496   %sub = fsub <1 x double> <double -0.000000e+00>, %v ; -0.0 - %v: negates the single lane of %v
     497   %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer ; all-zero mask: the negated lane goes to both result lanes
     498   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
     499   ret <2 x double> %0
     500 }
    501 
     502 define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; fma(splat((-%v)[1]), %b, %a); expects one by-element fmls on d[1]
     503 ; CHECK-LABEL: test_vfmsq_laneq_f64:
     504 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
     505 ; CHECK-NEXT: ret
     506 entry:
     507   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v ; -0.0 - %v: negates every lane of %v
     508   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1> ; broadcast lane 1 of the negated vector
     509   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
     510   ret <2 x double> %0
     511 }
    512 
    513 define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
    514 ; CHECK-LABEL: test_vfmas_laneq_f32
    515 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
    516 ; CHECK-NEXT: ret
    517 entry:
    518   %extract = extractelement <4 x float> %v, i32 3
    519   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
    520   ret float %0
    521 }
    522 
    523 declare float @llvm.fma.f32(float, float, float)
    524 
    525 define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
    526 ; CHECK-LABEL: test_vfmsd_lane_f64
    527 ; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
    528 ; CHECK-NEXT: ret
    529 entry:
    530   %extract.rhs = extractelement <1 x double> %v, i32 0
    531   %extract = fsub double -0.000000e+00, %extract.rhs
    532   %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
    533   ret double %0
    534 }
    535 
    536 declare double @llvm.fma.f64(double, double, double)
    537 
    538 define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
    539 ; CHECK-LABEL: test_vfmss_lane_f32
    540 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
    541 ; CHECK-NEXT: ret
    542 entry:
    543   %extract.rhs = extractelement <2 x float> %v, i32 1
    544   %extract = fsub float -0.000000e+00, %extract.rhs
    545   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
    546   ret float %0
    547 }
    548 
    549 define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
    550 ; CHECK-LABEL: test_vfmss_laneq_f32
    551 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
    552 ; CHECK-NEXT: ret
    553 entry:
    554   %extract.rhs = extractelement <4 x float> %v, i32 3
    555   %extract = fsub float -0.000000e+00, %extract.rhs
    556   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
    557   ret float %0
    558 }
    559 
    560 define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
    561 ; CHECK-LABEL: test_vfmsd_laneq_f64
    562 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
    563 ; CHECK-NEXT: ret
    564 entry:
    565   %extract.rhs = extractelement <2 x double> %v, i32 1
    566   %extract = fsub double -0.000000e+00, %extract.rhs
    567   %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
    568   ret double %0
    569 }
    570 
    571 define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) {
    572 ; CHCK-LABEL: test_vfmsd_lane_f64_0
    573 ; CHCK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
    574 ; CHCK-NEXT: ret
    575 entry:
    576   %tmp0 = fsub <1 x double> <double -0.000000e+00>, %v
    577   %tmp1 = extractelement <1 x double> %tmp0, i32 0
    578   %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
    579   ret double %0
    580 }
    581 
    582 define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
    583 ; CHECK-LABEL: test_vfmss_lane_f32_0
    584 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
    585 ; CHECK-NEXT: ret
    586 entry:
    587   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
    588   %tmp1 = extractelement <2 x float> %tmp0, i32 1
    589   %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
    590   ret float %0
    591 }
    592 
    593 define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
    594 ; CHECK-LABEL: test_vfmss_laneq_f32_0
    595 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
    596 ; CHECK-NEXT: ret
    597 entry:
    598   %tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
    599   %tmp1 = extractelement <4 x float> %tmp0, i32 3
    600   %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
    601   ret float %0
    602 }
    603 
    604 define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
    605 ; CHECK-LABEL: test_vfmsd_laneq_f64_0
    606 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
    607 ; CHECK-NEXT: ret
    608 entry:
    609   %tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
    610   %tmp1 = extractelement <2 x double> %tmp0, i32 1
    611   %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
    612   ret double %0
    613 }
    614 
    615 define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
    616 ; CHECK-LABEL: test_vmlal_lane_s16:
    617 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    618 ; CHECK-NEXT: ret
    619 entry:
    620   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    621   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    622   %add = add <4 x i32> %vmull2.i, %a
    623   ret <4 x i32> %add
    624 }
    625 
    626 define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
    627 ; CHECK-LABEL: test_vmlal_lane_s32:
    628 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    629 ; CHECK-NEXT: ret
    630 entry:
    631   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
    632   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    633   %add = add <2 x i64> %vmull2.i, %a
    634   ret <2 x i64> %add
    635 }
    636 
    637 define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
    638 ; CHECK-LABEL: test_vmlal_laneq_s16:
    639 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    640 ; CHECK-NEXT: ret
    641 entry:
    642   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
    643   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    644   %add = add <4 x i32> %vmull2.i, %a
    645   ret <4 x i32> %add
    646 }
    647 
    648 define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
    649 ; CHECK-LABEL: test_vmlal_laneq_s32:
    650 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    651 ; CHECK-NEXT: ret
    652 entry:
    653   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
    654   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    655   %add = add <2 x i64> %vmull2.i, %a
    656   ret <2 x i64> %add
    657 }
    658 
    659 define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
    660 ; CHECK-LABEL: test_vmlal_high_lane_s16:
    661 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    662 ; CHECK-NEXT: ret
    663 entry:
    664   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    665   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
    666   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    667   %add = add <4 x i32> %vmull2.i, %a
    668   ret <4 x i32> %add
    669 }
    670 
    671 define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
    672 ; CHECK-LABEL: test_vmlal_high_lane_s32:
    673 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    674 ; CHECK-NEXT: ret
    675 entry:
    676   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    677   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
    678   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    679   %add = add <2 x i64> %vmull2.i, %a
    680   ret <2 x i64> %add
    681 }
    682 
    683 define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; expects smlal2 by element, then ret: %a + smull(upper half of %b, %v[7])
    684 ; CHECK-LABEL: test_vmlal_high_laneq_s16:
    685 ; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    686 ; CHECK-NEXT: ret
    687 entry:
    688   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    689   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
    690   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    691   %add = add <4 x i32> %vmull2.i, %a
    692   ret <4 x i32> %add
    693 }
    694 
    695 define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; expects smlal2 by element, then ret: %a + smull(upper half of %b, %v[3])
    696 ; CHECK-LABEL: test_vmlal_high_laneq_s32:
    697 ; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    698 ; CHECK-NEXT: ret
    699 entry:
    700   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    701   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
    702   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    703   %add = add <2 x i64> %vmull2.i, %a
    704   ret <2 x i64> %add
    705 }
    706 
    707 define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; expects smlsl by element, then ret: %a - smull(%b, %v[3])
    708 ; CHECK-LABEL: test_vmlsl_lane_s16:
    709 ; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    710 ; CHECK-NEXT: ret
    711 entry:
    712   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
    713   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    714   %sub = sub <4 x i32> %a, %vmull2.i
    715   ret <4 x i32> %sub
    716 }
    717 
    718 define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; expects smlsl by element, then ret: %a - smull(%b, %v[1])
    719 ; CHECK-LABEL: test_vmlsl_lane_s32:
    720 ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    721 ; CHECK-NEXT: ret
    722 entry:
    723   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
    724   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    725   %sub = sub <2 x i64> %a, %vmull2.i
    726   ret <2 x i64> %sub
    727 }
    728 
    729 define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; expects smlsl by element, then ret: %a - smull(%b, %v[7])
    730 ; CHECK-LABEL: test_vmlsl_laneq_s16:
    731 ; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    732 ; CHECK-NEXT: ret
    733 entry:
    734   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
    735   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    736   %sub = sub <4 x i32> %a, %vmull2.i
    737   ret <4 x i32> %sub
    738 }
    739 
    740 define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; expects smlsl by element, then ret: %a - smull(%b, %v[3])
    741 ; CHECK-LABEL: test_vmlsl_laneq_s32:
    742 ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    743 ; CHECK-NEXT: ret
    744 entry:
    745   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
    746   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    747   %sub = sub <2 x i64> %a, %vmull2.i
    748   ret <2 x i64> %sub
    749 }
    750 
    751 define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; expects smlsl2 by element, then ret: %a - smull(upper half of %b, %v[3])
    752 ; CHECK-LABEL: test_vmlsl_high_lane_s16:
    753 ; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    754 ; CHECK-NEXT: ret
    755 entry:
    756   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    757   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
    758   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    759   %sub = sub <4 x i32> %a, %vmull2.i
    760   ret <4 x i32> %sub
    761 }
    762 
    763 define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; expects smlsl2 by element, then ret: %a - smull(upper half of %b, %v[1])
    764 ; CHECK-LABEL: test_vmlsl_high_lane_s32:
    765 ; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    766 ; CHECK-NEXT: ret
    767 entry:
    768   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    769   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
    770   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    771   %sub = sub <2 x i64> %a, %vmull2.i
    772   ret <2 x i64> %sub
    773 }
    774 
    775 define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; expects smlsl2 by element, then ret: %a - smull(upper half of %b, %v[7])
    776 ; CHECK-LABEL: test_vmlsl_high_laneq_s16:
    777 ; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    778 ; CHECK-NEXT: ret
    779 entry:
    780   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    781   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
    782   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    783   %sub = sub <4 x i32> %a, %vmull2.i
    784   ret <4 x i32> %sub
    785 }
    786 
    787 define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; expects smlsl2 by element, then ret: %a - smull(upper half of %b, %v[3])
    788 ; CHECK-LABEL: test_vmlsl_high_laneq_s32:
    789 ; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    790 ; CHECK-NEXT: ret
    791 entry:
    792   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    793   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
    794   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    795   %sub = sub <2 x i64> %a, %vmull2.i
    796   ret <2 x i64> %sub
    797 }
    798 
    799 define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; expects umlal by element, then ret: %a + umull(%b, %v[3])
    800 ; CHECK-LABEL: test_vmlal_lane_u16:
    801 ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    802 ; CHECK-NEXT: ret
    803 entry:
    804   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
    805   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    806   %add = add <4 x i32> %vmull2.i, %a
    807   ret <4 x i32> %add
    808 }
    809 
    810 define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; expects umlal by element, then ret: %a + umull(%b, %v[1])
    811 ; CHECK-LABEL: test_vmlal_lane_u32:
    812 ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    813 ; CHECK-NEXT: ret
    814 entry:
    815   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
    816   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    817   %add = add <2 x i64> %vmull2.i, %a
    818   ret <2 x i64> %add
    819 }
    820 
    821 define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; expects umlal by element, then ret: %a + umull(%b, %v[7])
    822 ; CHECK-LABEL: test_vmlal_laneq_u16:
    823 ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    824 ; CHECK-NEXT: ret
    825 entry:
    826   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
    827   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    828   %add = add <4 x i32> %vmull2.i, %a
    829   ret <4 x i32> %add
    830 }
    831 
    832 define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; expects umlal by element, then ret: %a + umull(%b, %v[3])
    833 ; CHECK-LABEL: test_vmlal_laneq_u32:
    834 ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    835 ; CHECK-NEXT: ret
    836 entry:
    837   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
    838   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    839   %add = add <2 x i64> %vmull2.i, %a
    840   ret <2 x i64> %add
    841 }
    842 
    843 define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; expects umlal2 by element, then ret: %a + umull(upper half of %b, %v[3])
    844 ; CHECK-LABEL: test_vmlal_high_lane_u16:
    845 ; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    846 ; CHECK-NEXT: ret
    847 entry:
    848   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    849   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
    850   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    851   %add = add <4 x i32> %vmull2.i, %a
    852   ret <4 x i32> %add
    853 }
    854 
    855 define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; expects umlal2 by element, then ret: %a + umull(upper half of %b, %v[1])
    856 ; CHECK-LABEL: test_vmlal_high_lane_u32:
    857 ; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    858 ; CHECK-NEXT: ret
    859 entry:
    860   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    861   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
    862   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    863   %add = add <2 x i64> %vmull2.i, %a
    864   ret <2 x i64> %add
    865 }
    866 
    867 define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; expects umlal2 by element, then ret: %a + umull(upper half of %b, %v[7])
    868 ; CHECK-LABEL: test_vmlal_high_laneq_u16:
    869 ; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    870 ; CHECK-NEXT: ret
    871 entry:
    872   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    873   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
    874   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    875   %add = add <4 x i32> %vmull2.i, %a
    876   ret <4 x i32> %add
    877 }
    878 
    879 define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; expects umlal2 by element, then ret: %a + umull(upper half of %b, %v[3])
    880 ; CHECK-LABEL: test_vmlal_high_laneq_u32:
    881 ; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    882 ; CHECK-NEXT: ret
    883 entry:
    884   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    885   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
    886   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    887   %add = add <2 x i64> %vmull2.i, %a
    888   ret <2 x i64> %add
    889 }
    890 
    891 define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; expects umlsl by element, then ret: %a - umull(%b, %v[3])
    892 ; CHECK-LABEL: test_vmlsl_lane_u16:
    893 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    894 ; CHECK-NEXT: ret
    895 entry:
    896   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
    897   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    898   %sub = sub <4 x i32> %a, %vmull2.i
    899   ret <4 x i32> %sub
    900 }
    901 
    902 define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; expects umlsl by element, then ret: %a - umull(%b, %v[1])
    903 ; CHECK-LABEL: test_vmlsl_lane_u32:
    904 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    905 ; CHECK-NEXT: ret
    906 entry:
    907   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
    908   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    909   %sub = sub <2 x i64> %a, %vmull2.i
    910   ret <2 x i64> %sub
    911 }
    912 
    913 define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; expects umlsl by element, then ret: %a - umull(%b, %v[7])
    914 ; CHECK-LABEL: test_vmlsl_laneq_u16:
    915 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    916 ; CHECK-NEXT: ret
    917 entry:
    918   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
    919   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    920   %sub = sub <4 x i32> %a, %vmull2.i
    921   ret <4 x i32> %sub
    922 }
    923 
    924 define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; expects umlsl by element, then ret: %a - umull(%b, %v[3])
    925 ; CHECK-LABEL: test_vmlsl_laneq_u32:
    926 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    927 ; CHECK-NEXT: ret
    928 entry:
    929   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
    930   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    931   %sub = sub <2 x i64> %a, %vmull2.i
    932   ret <2 x i64> %sub
    933 }
    934 
    935 define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; expects umlsl2 by element, then ret: %a - umull(upper half of %b, %v[3])
    936 ; CHECK-LABEL: test_vmlsl_high_lane_u16:
    937 ; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    938 ; CHECK-NEXT: ret
    939 entry:
    940   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    941   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
    942   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    943   %sub = sub <4 x i32> %a, %vmull2.i
    944   ret <4 x i32> %sub
    945 }
    946 
    947 define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; expects umlsl2 by element, then ret: %a - umull(upper half of %b, %v[1])
    948 ; CHECK-LABEL: test_vmlsl_high_lane_u32:
    949 ; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    950 ; CHECK-NEXT: ret
    951 entry:
    952   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    953   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
    954   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    955   %sub = sub <2 x i64> %a, %vmull2.i
    956   ret <2 x i64> %sub
    957 }
    958 
    959 define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; expects umlsl2 by element, then ret: %a - umull(upper half of %b, %v[7])
    960 ; CHECK-LABEL: test_vmlsl_high_laneq_u16:
    961 ; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    962 ; CHECK-NEXT: ret
    963 entry:
    964   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    965   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
    966   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    967   %sub = sub <4 x i32> %a, %vmull2.i
    968   ret <4 x i32> %sub
    969 }
    970 
    971 define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; expects umlsl2 by element, then ret: %a - umull(upper half of %b, %v[3])
    972 ; CHECK-LABEL: test_vmlsl_high_laneq_u32:
    973 ; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    974 ; CHECK-NEXT: ret
    975 entry:
    976   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    977   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
    978   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    979   %sub = sub <2 x i64> %a, %vmull2.i
    980   ret <2 x i64> %sub
    981 }
    982 
    983 define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; expects smull by element, then ret: smull(%a, %v[3])
    984 ; CHECK-LABEL: test_vmull_lane_s16:
    985 ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    986 ; CHECK-NEXT: ret
    987 entry:
    988   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
    989   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
    990   ret <4 x i32> %vmull2.i
    991 }
    992 
    993 define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; expects smull by element, then ret: smull(%a, %v[1])
    994 ; CHECK-LABEL: test_vmull_lane_s32:
    995 ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    996 ; CHECK-NEXT: ret
    997 entry:
    998   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
    999   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1000   ret <2 x i64> %vmull2.i
   1001 }
   1002 
   1003 define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; expects umull by element, then ret: umull(%a, %v[3])
   1004 ; CHECK-LABEL: test_vmull_lane_u16:
   1005 ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1006 ; CHECK-NEXT: ret
   1007 entry:
   1008   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1009   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1010   ret <4 x i32> %vmull2.i
   1011 }
   1012 
   1013 define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; expects umull by element, then ret: umull(%a, %v[1])
   1014 ; CHECK-LABEL: test_vmull_lane_u32:
   1015 ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1016 ; CHECK-NEXT: ret
   1017 entry:
   1018   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
   1019   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1020   ret <2 x i64> %vmull2.i
   1021 }
   1022 
   1023 define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; expects smull2 by element, then ret: smull(upper half of %a, %v[3])
   1024 ; CHECK-LABEL: test_vmull_high_lane_s16:
   1025 ; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1026 ; CHECK-NEXT: ret
   1027 entry:
   1028   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1029   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1030   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1031   ret <4 x i32> %vmull2.i
   1032 }
   1033 
   1034 define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; expects smull2 by element, then ret: smull(upper half of %a, %v[1])
   1035 ; CHECK-LABEL: test_vmull_high_lane_s32:
   1036 ; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1037 ; CHECK-NEXT: ret
   1038 entry:
   1039   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1040   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
   1041   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1042   ret <2 x i64> %vmull2.i
   1043 }
   1044 
   1045 define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; expects umull2 by element, then ret: umull(upper half of %a, %v[3])
   1046 ; CHECK-LABEL: test_vmull_high_lane_u16:
   1047 ; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1048 ; CHECK-NEXT: ret
   1049 entry:
   1050   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1051   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1052   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1053   ret <4 x i32> %vmull2.i
   1054 }
   1055 
   1056 define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; expects umull2 by element, then ret: umull(upper half of %a, %v[1])
   1057 ; CHECK-LABEL: test_vmull_high_lane_u32:
   1058 ; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1059 ; CHECK-NEXT: ret
   1060 entry:
   1061   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1062   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
   1063   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1064   ret <2 x i64> %vmull2.i
   1065 }
   1066 
   1067 define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; expects smull by element, then ret: smull(%a, %v[7])
   1068 ; CHECK-LABEL: test_vmull_laneq_s16:
   1069 ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
   1070 ; CHECK-NEXT: ret
   1071 entry:
   1072   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
   1073   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1074   ret <4 x i32> %vmull2.i
   1075 }
   1076 
   1077 define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; expects smull by element, then ret: smull(%a, %v[3])
   1078 ; CHECK-LABEL: test_vmull_laneq_s32:
   1079 ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1080 ; CHECK-NEXT: ret
   1081 entry:
   1082   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
   1083   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1084   ret <2 x i64> %vmull2.i
   1085 }
   1086 
   1087 define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; expects umull by element, then ret: umull(%a, %v[7])
   1088 ; CHECK-LABEL: test_vmull_laneq_u16:
   1089 ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
   1090 ; CHECK-NEXT: ret
   1091 entry:
   1092   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
   1093   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1094   ret <4 x i32> %vmull2.i
   1095 }
   1096 
   1097 define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; expects umull by element, then ret: umull(%a, %v[3])
   1098 ; CHECK-LABEL: test_vmull_laneq_u32:
   1099 ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1100 ; CHECK-NEXT: ret
   1101 entry:
   1102   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
   1103   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1104   ret <2 x i64> %vmull2.i
   1105 }
   1106 
   1107 define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; expects smull2 by element, then ret: smull(upper half of %a, %v[7])
   1108 ; CHECK-LABEL: test_vmull_high_laneq_s16:
   1109 ; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
   1110 ; CHECK-NEXT: ret
   1111 entry:
   1112   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1113   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
   1114   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1115   ret <4 x i32> %vmull2.i
   1116 }
   1117 
   1118 define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; expects smull2 by element, then ret: smull(upper half of %a, %v[3])
   1119 ; CHECK-LABEL: test_vmull_high_laneq_s32:
   1120 ; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1121 ; CHECK-NEXT: ret
   1122 entry:
   1123   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1124   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
   1125   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1126   ret <2 x i64> %vmull2.i
   1127 }
   1128 
   1129 define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; expects umull2 by element, then ret: umull(upper half of %a, %v[7])
   1130 ; CHECK-LABEL: test_vmull_high_laneq_u16:
   1131 ; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
   1132 ; CHECK-NEXT: ret
   1133 entry:
   1134   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1135   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element
   1136   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1137   ret <4 x i32> %vmull2.i
   1138 }
   1139 
   1140 define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; expects umull2 by element, then ret: umull(upper half of %a, %v[3])
   1141 ; CHECK-LABEL: test_vmull_high_laneq_u32:
   1142 ; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1143 ; CHECK-NEXT: ret
   1144 entry:
   1145   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1146   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
   1147   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1148   ret <2 x i64> %vmull2.i
   1149 }
   1150 
   1151 define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; expects sqdmlal by element, then ret: sqadd(%a, sqdmull(%b, %v[3]))
   1152 ; CHECK-LABEL: test_vqdmlal_lane_s16:
   1153 ; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1154 ; CHECK-NEXT: ret
   1155 entry:
   1156   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1157   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   1158   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   1159   ret <4 x i32> %vqdmlal4.i
   1160 }
   1161 
   1162 define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; expects sqdmlal by element, then ret: sqadd(%a, sqdmull(%b, %v[1]))
   1163 ; CHECK-LABEL: test_vqdmlal_lane_s32:
   1164 ; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1165 ; CHECK-NEXT: ret
   1166 entry:
   1167   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
   1168   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   1169   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   1170   ret <2 x i64> %vqdmlal4.i
   1171 }
   1172 
   1173 define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; expects sqdmlal2 by element, then ret: sqadd(%a, sqdmull(upper half of %b, %v[3]))
   1174 ; CHECK-LABEL: test_vqdmlal_high_lane_s16:
   1175 ; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1176 ; CHECK-NEXT: ret
   1177 entry:
   1178   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
   1179   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1180   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1181   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   1182   ret <4 x i32> %vqdmlal4.i
   1183 }
   1184 
   1185 define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; expects sqdmlal2 by element, then ret: sqadd(%a, sqdmull(upper half of %b, %v[1]))
   1186 ; CHECK-LABEL: test_vqdmlal_high_lane_s32:
   1187 ; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1188 ; CHECK-NEXT: ret
   1189 entry:
   1190   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
   1191   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
   1192   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1193   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   1194   ret <2 x i64> %vqdmlal4.i
   1195 }
   1196 
   1197 define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; expects sqdmlsl by element, then ret: sqsub(%a, sqdmull(%b, %v[3]))
   1198 ; CHECK-LABEL: test_vqdmlsl_lane_s16:
   1199 ; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1200 ; CHECK-NEXT: ret
   1201 entry:
   1202   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1203   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   1204   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   1205   ret <4 x i32> %vqdmlsl4.i
   1206 }
   1207 
   1208 define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; expects sqdmlsl by element, then ret: sqsub(%a, sqdmull(%b, %v[1]))
   1209 ; CHECK-LABEL: test_vqdmlsl_lane_s32:
   1210 ; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1211 ; CHECK-NEXT: ret
   1212 entry:
   1213   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
   1214   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   1215   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   1216   ret <2 x i64> %vqdmlsl4.i
   1217 }
   1218 
   1219 define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; expects sqdmlsl2 by element, then ret: sqsub(%a, sqdmull(upper half of %b, %v[3]))
   1220 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
   1221 ; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1222 ; CHECK-NEXT: ret
   1223 entry:
   1224   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
   1225   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1226   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1227   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   1228   ret <4 x i32> %vqdmlsl4.i
   1229 }
   1230 
   1231 define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; expects sqdmlsl2 by element, then ret: sqsub(%a, sqdmull(upper half of %b, %v[1]))
   1232 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
   1233 ; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1234 ; CHECK-NEXT: ret
   1235 entry:
   1236   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
   1237   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
   1238   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1239   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   1240   ret <2 x i64> %vqdmlsl4.i
   1241 }
   1242 
   1243 define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; expects sqdmull by element, then ret: sqdmull(%a, %v[3])
   1244 ; CHECK-LABEL: test_vqdmull_lane_s16:
   1245 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1246 ; CHECK-NEXT: ret
   1247 entry:
   1248   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1249   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1250   ret <4 x i32> %vqdmull2.i
   1251 }
   1252 
   1253 define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; expects sqdmull by element, then ret: sqdmull(%a, %v[1])
   1254 ; CHECK-LABEL: test_vqdmull_lane_s32:
   1255 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1256 ; CHECK-NEXT: ret
   1257 entry:
   1258   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; lane 1 of %v in every element
   1259   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1260   ret <2 x i64> %vqdmull2.i
   1261 }
   1262 
   1263 define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; expects sqdmull by element, then ret: sqdmull(%a, %v[7])
   1264 ; CHECK-LABEL: test_vqdmull_laneq_s16:
   1265 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
   1266 ; CHECK-NEXT: ret
   1267 entry:
   1268   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; lane 7 of %v in every element; index 7 is only reachable because %v has 8 lanes
   1269   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1270   ret <4 x i32> %vqdmull2.i
   1271 }
   1272 
   1273 define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; expects sqdmull by element, then ret: sqdmull(%a, %v[3])
   1274 ; CHECK-LABEL: test_vqdmull_laneq_s32:
   1275 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1276 ; CHECK-NEXT: ret
   1277 entry:
   1278   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; lane 3 of %v in every element
   1279   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1280   ret <2 x i64> %vqdmull2.i
   1281 }
   1282 
   1283 define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; expects sqdmull2 by element, then ret: sqdmull(upper half of %a, %v[3])
   1284 ; CHECK-LABEL: test_vqdmull_high_lane_s16:
   1285 ; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1286 ; CHECK-NEXT: ret
   1287 entry:
   1288   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1289   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; lane 3 of %v in every element
   1290   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1291   ret <4 x i32> %vqdmull2.i
   1292 }
   1293 
   1294 define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; sqdmull intrinsic of the upper half of %a by lane 1 of %v; the match needs the "2" (high-half) form with the .s[1] lane operand, then ret
   1295 ; CHECK-LABEL: test_vqdmull_high_lane_s32:
   1296 ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1297 ; CHECK-NEXT: ret
   1298 entry:
   1299   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; elements 2-3 (upper half) of %a
   1300   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1301   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1302   ret <2 x i64> %vqdmull2.i
   1303 }
   1304 
   1305 define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; sqdmull intrinsic of the upper half of %a by lane 7 of the 128-bit %v; the match needs the "2" (high-half) form with the .h[7] lane operand, then ret
   1306 ; CHECK-LABEL: test_vqdmull_high_laneq_s16:
   1307 ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
   1308 ; CHECK-NEXT: ret
   1309 entry:
   1310   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; elements 4-7 (upper half) of %a
   1311   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of %v into 4 elements
   1312   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1313   ret <4 x i32> %vqdmull2.i
   1314 }
   1315 
   1316 define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; sqdmull intrinsic of the upper half of %a by lane 3 of the 128-bit %v; the match needs the "2" (high-half) form with the .s[3] lane operand, then ret
   1317 ; CHECK-LABEL: test_vqdmull_high_laneq_s32:
   1318 ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1319 ; CHECK-NEXT: ret
   1320 entry:
   1321   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; elements 2-3 (upper half) of %a
   1322   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of %v into 2 elements
   1323   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1324   ret <2 x i64> %vqdmull2.i
   1325 }
   1326 
   1327 define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; sqdmulh intrinsic of %a by lane 3 of %v; the match needs the .h[3] lane operand, with ret on the next line
   1328 ; CHECK-LABEL: test_vqdmulh_lane_s16:
   1329 ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1330 ; CHECK-NEXT: ret
   1331 entry:
   1332   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1333   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   1334   ret <4 x i16> %vqdmulh2.i
   1335 }
   1336 
   1337 define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; 128-bit sqdmulh intrinsic of %a by lane 3 of the 64-bit %v; the match needs the .h[3] lane operand, with ret on the next line
   1338 ; CHECK-LABEL: test_vqdmulhq_lane_s16:
   1339 ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1340 ; CHECK-NEXT: ret
   1341 entry:
   1342   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into 8 elements
   1343   %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   1344   ret <8 x i16> %vqdmulh2.i
   1345 }
   1346 
   1347 define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; sqdmulh intrinsic of %a by lane 1 of %v; the match needs the .s[1] lane operand, with ret on the next line
   1348 ; CHECK-LABEL: test_vqdmulh_lane_s32:
   1349 ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1350 ; CHECK-NEXT: ret
   1351 entry:
   1352   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1353   %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   1354   ret <2 x i32> %vqdmulh2.i
   1355 }
   1356 
   1357 define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; 128-bit sqdmulh intrinsic of %a by lane 1 of the 64-bit %v; the match needs the .s[1] lane operand, with ret on the next line
   1358 ; CHECK-LABEL: test_vqdmulhq_lane_s32:
   1359 ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1360 ; CHECK-NEXT: ret
   1361 entry:
   1362   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of %v into 4 elements
   1363   %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   1364   ret <4 x i32> %vqdmulh2.i
   1365 }
   1366 
   1367 define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; sqrdmulh intrinsic of %a by lane 3 of %v; the match needs the .h[3] lane operand, with ret on the next line
   1368 ; CHECK-LABEL: test_vqrdmulh_lane_s16:
   1369 ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1370 ; CHECK-NEXT: ret
   1371 entry:
   1372   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1373   %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   1374   ret <4 x i16> %vqrdmulh2.i
   1375 }
   1376 
   1377 define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; 128-bit sqrdmulh intrinsic of %a by lane 3 of the 64-bit %v; the match needs the .h[3] lane operand, with ret on the next line
   1378 ; CHECK-LABEL: test_vqrdmulhq_lane_s16:
   1379 ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1380 ; CHECK-NEXT: ret
   1381 entry:
   1382   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into 8 elements
   1383   %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   1384   ret <8 x i16> %vqrdmulh2.i
   1385 }
   1386 
   1387 define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; sqrdmulh intrinsic of %a by lane 1 of %v; the match needs the .s[1] lane operand, with ret on the next line
   1388 ; CHECK-LABEL: test_vqrdmulh_lane_s32:
   1389 ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1390 ; CHECK-NEXT: ret
   1391 entry:
   1392   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1393   %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   1394   ret <2 x i32> %vqrdmulh2.i
   1395 }
   1396 
   1397 define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; 128-bit sqrdmulh intrinsic of %a by lane 1 of the 64-bit %v; the match needs the .s[1] lane operand, with ret on the next line
   1398 ; CHECK-LABEL: test_vqrdmulhq_lane_s32:
   1399 ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1400 ; CHECK-NEXT: ret
   1401 entry:
   1402   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of %v into 4 elements
   1403   %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   1404   ret <4 x i32> %vqrdmulh2.i
   1405 }
   1406 
   1407 define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) { ; plain IR fmul of %a by lane 1 of %v; the match needs a by-element fmul (.s[1]), with ret on the next line
   1408 ; CHECK-LABEL: test_vmul_lane_f32:
   1409 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1410 ; CHECK-NEXT: ret
   1411 entry:
   1412   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1413   %mul = fmul <2 x float> %shuffle, %a
   1414   ret <2 x float> %mul
   1415 }
   1416 
   1417 define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) { ; scalar double multiply of %a by element 0 of %v; the match needs a scalar fmul on d registers (no lane operand), with ret on the next line
   1418 ; CHECK-LABEL: test_vmul_lane_f64:
   1419 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
   1420 ; CHECK-NEXT: ret
   1421 entry:
   1422   %0 = bitcast <1 x double> %a to <8 x i8> ; %0/%1: reinterpret the one-element vector %a as a scalar double via <8 x i8>
   1423   %1 = bitcast <8 x i8> %0 to double
   1424   %extract = extractelement <1 x double> %v, i32 0 ; the only element of %v
   1425   %2 = fmul double %1, %extract
   1426   %3 = insertelement <1 x double> undef, double %2, i32 0 ; wrap the scalar product back into <1 x double>
   1427   ret <1 x double> %3
   1428 }
   1429 
   1430 define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) { ; plain IR fmul of the 128-bit %a by lane 1 of the 64-bit %v; the match needs a by-element fmul (.s[1]), with ret on the next line
   1431 ; CHECK-LABEL: test_vmulq_lane_f32:
   1432 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1433 ; CHECK-NEXT: ret
   1434 entry:
   1435   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of %v into 4 elements
   1436   %mul = fmul <4 x float> %shuffle, %a
   1437   ret <4 x float> %mul
   1438 }
   1439 
   1440 define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) { ; plain IR fmul of %a by the single element of %v; the match needs a by-element fmul (.d[0]), with ret on the next line
   1441 ; CHECK-LABEL: test_vmulq_lane_f64:
   1442 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   1443 ; CHECK-NEXT: ret
   1444 entry:
   1445   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer ; all-zero mask: splat element 0 of %v into 2 elements
   1446   %mul = fmul <2 x double> %shuffle, %a
   1447   ret <2 x double> %mul
   1448 }
   1449 
   1450 define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { ; plain IR fmul of %a by lane 3 of the 128-bit %v; the match needs a by-element fmul (.s[3]), with ret on the next line
   1451 ; CHECK-LABEL: test_vmul_laneq_f32:
   1452 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1453 ; CHECK-NEXT: ret
   1454 entry:
   1455   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of %v into 2 elements
   1456   %mul = fmul <2 x float> %shuffle, %a
   1457   ret <2 x float> %mul
   1458 }
   1459 
   1460 define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) { ; scalar double multiply of %a by element 1 of %v; the match needs a scalar fmul whose last operand is the lane .d[1], with ret on the next line
   1461 ; CHECK-LABEL: test_vmul_laneq_f64:
   1462 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   1463 ; CHECK-NEXT: ret
   1464 entry:
   1465   %0 = bitcast <1 x double> %a to <8 x i8> ; %0/%1: reinterpret the one-element vector %a as a scalar double via <8 x i8>
   1466   %1 = bitcast <8 x i8> %0 to double
   1467   %extract = extractelement <2 x double> %v, i32 1 ; element 1 of %v
   1468   %2 = fmul double %1, %extract
   1469   %3 = insertelement <1 x double> undef, double %2, i32 0 ; wrap the scalar product back into <1 x double>
   1470   ret <1 x double> %3
   1471 }
   1472 
   1473 define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; plain IR fmul of %a by lane 3 of %v; the match needs a by-element fmul (.s[3]), with ret on the next line
   1474 ; CHECK-LABEL: test_vmulq_laneq_f32:
   1475 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1476 ; CHECK-NEXT: ret
   1477 entry:
   1478   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1479   %mul = fmul <4 x float> %shuffle, %a
   1480   ret <4 x float> %mul
   1481 }
   1482 
   1483 define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; plain IR fmul of %a by lane 1 of %v; the match needs a by-element fmul (.d[1]), with ret on the next line
   1484 ; CHECK-LABEL: test_vmulq_laneq_f64:
   1485 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
   1486 ; CHECK-NEXT: ret
   1487 entry:
   1488   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1489   %mul = fmul <2 x double> %shuffle, %a
   1490   ret <2 x double> %mul
   1491 }
   1492 
   1493 define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { ; fmulx intrinsic of %a by lane 1 of %v; the match needs the .s[1] lane operand, with ret on the next line
   1494 ; CHECK-LABEL: test_vmulx_lane_f32:
   1495 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1496 ; CHECK-NEXT: ret
   1497 entry:
   1498   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1499   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   1500   ret <2 x float> %vmulx2.i
   1501 }
   1502 
   1503 define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { ; 128-bit fmulx intrinsic of %a by lane 1 of the 64-bit %v; the match needs the .s[1] lane operand, with ret on the next line
   1504 ; CHECK-LABEL: test_vmulxq_lane_f32:
   1505 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1506 ; CHECK-NEXT: ret
   1507 entry:
   1508   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of %v into 4 elements
   1509   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   1510   ret <4 x float> %vmulx2.i
   1511 }
   1512 
   1513 define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { ; fmulx intrinsic of %a by the single element of %v; the match needs the .d[0] lane operand, with ret on the next line
   1514 ; CHECK-LABEL: test_vmulxq_lane_f64:
   1515 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   1516 ; CHECK-NEXT: ret
   1517 entry:
   1518   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer ; all-zero mask: splat element 0 of %v into 2 elements
   1519   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   1520   ret <2 x double> %vmulx2.i
   1521 }
   1522 
   1523 define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { ; fmulx intrinsic of %a by lane 3 of the 128-bit %v; the match needs the .s[3] lane operand, with ret on the next line
   1524 ; CHECK-LABEL: test_vmulx_laneq_f32:
   1525 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1526 ; CHECK-NEXT: ret
   1527 entry:
   1528   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of %v into 2 elements
   1529   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   1530   ret <2 x float> %vmulx2.i
   1531 }
   1532 
   1533 define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; fmulx intrinsic of %a by lane 3 of %v; the match needs the .s[3] lane operand, with ret on the next line
   1534 ; CHECK-LABEL: test_vmulxq_laneq_f32:
   1535 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1536 ; CHECK-NEXT: ret
   1537 entry:
   1538   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1539   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   1540   ret <4 x float> %vmulx2.i
   1541 }
   1542 
   1543 define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; fmulx intrinsic of %a by lane 1 of %v; the match needs the .d[1] lane operand, with ret on the next line
   1544 ; CHECK-LABEL: test_vmulxq_laneq_f64:
   1545 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
   1546 ; CHECK-NEXT: ret
   1547 entry:
   1548   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1549   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   1550   ret <2 x double> %vmulx2.i
   1551 }
   1552 
   1553 define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; lane-0 case: %a + %b * (lane 0 of %v) as separate mul/add; the match needs a by-element mla (.h[0]), with ret on the next line
   1554 ; CHECK-LABEL: test_vmla_lane_s16_0:
   1555 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1556 ; CHECK-NEXT: ret
   1557 entry:
   1558   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1559   %mul = mul <4 x i16> %shuffle, %b
   1560   %add = add <4 x i16> %mul, %a
   1561   ret <4 x i16> %add
   1562 }
   1563 
   1564 define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; lane-0 case: %a + %b * (lane 0 of the 64-bit %v) as separate mul/add; the match needs a by-element mla (.h[0]), with ret on the next line
   1565 ; CHECK-LABEL: test_vmlaq_lane_s16_0:
   1566 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1567 ; CHECK-NEXT: ret
   1568 entry:
   1569   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 8 elements
   1570   %mul = mul <8 x i16> %shuffle, %b
   1571   %add = add <8 x i16> %mul, %a
   1572   ret <8 x i16> %add
   1573 }
   1574 
   1575 define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; lane-0 case: %a + %b * (lane 0 of %v) as separate mul/add; the match needs a by-element mla (.s[0]), with ret on the next line
   1576 ; CHECK-LABEL: test_vmla_lane_s32_0:
   1577 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1578 ; CHECK-NEXT: ret
   1579 entry:
   1580   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1581   %mul = mul <2 x i32> %shuffle, %b
   1582   %add = add <2 x i32> %mul, %a
   1583   ret <2 x i32> %add
   1584 }
   1585 
   1586 define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; lane-0 case: %a + %b * (lane 0 of the 64-bit %v) as separate mul/add; the match needs a by-element mla (.s[0]), with ret on the next line
   1587 ; CHECK-LABEL: test_vmlaq_lane_s32_0:
   1588 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1589 ; CHECK-NEXT: ret
   1590 entry:
   1591   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1592   %mul = mul <4 x i32> %shuffle, %b
   1593   %add = add <4 x i32> %mul, %a
   1594   ret <4 x i32> %add
   1595 }
   1596 
   1597 define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; lane-0 case: %a + %b * (lane 0 of the 128-bit %v) as separate mul/add; the match needs a by-element mla (.h[0]), with ret on the next line
   1598 ; CHECK-LABEL: test_vmla_laneq_s16_0:
   1599 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1600 ; CHECK-NEXT: ret
   1601 entry:
   1602   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1603   %mul = mul <4 x i16> %shuffle, %b
   1604   %add = add <4 x i16> %mul, %a
   1605   ret <4 x i16> %add
   1606 }
   1607 
   1608 define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; lane-0 case: %a + %b * (lane 0 of %v) as separate mul/add; the match needs a by-element mla (.h[0]), with ret on the next line
   1609 ; CHECK-LABEL: test_vmlaq_laneq_s16_0:
   1610 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1611 ; CHECK-NEXT: ret
   1612 entry:
   1613   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1614   %mul = mul <8 x i16> %shuffle, %b
   1615   %add = add <8 x i16> %mul, %a
   1616   ret <8 x i16> %add
   1617 }
   1618 
   1619 define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; lane-0 case: %a + %b * (lane 0 of the 128-bit %v) as separate mul/add; the match needs a by-element mla (.s[0]), with ret on the next line
   1620 ; CHECK-LABEL: test_vmla_laneq_s32_0:
   1621 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1622 ; CHECK-NEXT: ret
   1623 entry:
   1624   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 2 elements
   1625   %mul = mul <2 x i32> %shuffle, %b
   1626   %add = add <2 x i32> %mul, %a
   1627   ret <2 x i32> %add
   1628 }
   1629 
   1630 define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; lane-0 case: %a + %b * (lane 0 of %v) as separate mul/add; the match needs a by-element mla (.s[0]), with ret on the next line
   1631 ; CHECK-LABEL: test_vmlaq_laneq_s32_0:
   1632 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1633 ; CHECK-NEXT: ret
   1634 entry:
   1635   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1636   %mul = mul <4 x i32> %shuffle, %b
   1637   %add = add <4 x i32> %mul, %a
   1638   ret <4 x i32> %add
   1639 }
   1640 
   1641 define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; lane-0 case: %a - %b * (lane 0 of %v) as separate mul/sub; the match needs a by-element mls (.h[0]), with ret on the next line
   1642 ; CHECK-LABEL: test_vmls_lane_s16_0:
   1643 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1644 ; CHECK-NEXT: ret
   1645 entry:
   1646   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1647   %mul = mul <4 x i16> %shuffle, %b
   1648   %sub = sub <4 x i16> %a, %mul
   1649   ret <4 x i16> %sub
   1650 }
   1651 
   1652 define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; lane-0 case: %a - %b * (lane 0 of the 64-bit %v) as separate mul/sub; the match needs a by-element mls (.h[0]), with ret on the next line
   1653 ; CHECK-LABEL: test_vmlsq_lane_s16_0:
   1654 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1655 ; CHECK-NEXT: ret
   1656 entry:
   1657   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 8 elements
   1658   %mul = mul <8 x i16> %shuffle, %b
   1659   %sub = sub <8 x i16> %a, %mul
   1660   ret <8 x i16> %sub
   1661 }
   1662 
   1663 define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; lane-0 case: %a - %b * (lane 0 of %v) as separate mul/sub; the match needs a by-element mls (.s[0]), with ret on the next line
   1664 ; CHECK-LABEL: test_vmls_lane_s32_0:
   1665 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1666 ; CHECK-NEXT: ret
   1667 entry:
   1668   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1669   %mul = mul <2 x i32> %shuffle, %b
   1670   %sub = sub <2 x i32> %a, %mul
   1671   ret <2 x i32> %sub
   1672 }
   1673 
   1674 define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; lane-0 case: %a - %b * (lane 0 of the 64-bit %v) as separate mul/sub; the match needs a by-element mls (.s[0]), with ret on the next line
   1675 ; CHECK-LABEL: test_vmlsq_lane_s32_0:
   1676 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1677 ; CHECK-NEXT: ret
   1678 entry:
   1679   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1680   %mul = mul <4 x i32> %shuffle, %b
   1681   %sub = sub <4 x i32> %a, %mul
   1682   ret <4 x i32> %sub
   1683 }
   1684 
   1685 define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; lane-0 case: %a - %b * (lane 0 of the 128-bit %v) as separate mul/sub; the match needs a by-element mls (.h[0]), with ret on the next line
   1686 ; CHECK-LABEL: test_vmls_laneq_s16_0:
   1687 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1688 ; CHECK-NEXT: ret
   1689 entry:
   1690   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1691   %mul = mul <4 x i16> %shuffle, %b
   1692   %sub = sub <4 x i16> %a, %mul
   1693   ret <4 x i16> %sub
   1694 }
   1695 
   1696 define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; lane-0 case: %a - %b * (lane 0 of %v) as separate mul/sub; the match needs a by-element mls (.h[0]), with ret on the next line
   1697 ; CHECK-LABEL: test_vmlsq_laneq_s16_0:
   1698 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1699 ; CHECK-NEXT: ret
   1700 entry:
   1701   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1702   %mul = mul <8 x i16> %shuffle, %b
   1703   %sub = sub <8 x i16> %a, %mul
   1704   ret <8 x i16> %sub
   1705 }
   1706 
   1707 define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; lane-0 case: %a - %b * (lane 0 of the 128-bit %v) as separate mul/sub; the match needs a by-element mls (.s[0]), with ret on the next line
   1708 ; CHECK-LABEL: test_vmls_laneq_s32_0:
   1709 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1710 ; CHECK-NEXT: ret
   1711 entry:
   1712   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 2 elements
   1713   %mul = mul <2 x i32> %shuffle, %b
   1714   %sub = sub <2 x i32> %a, %mul
   1715   ret <2 x i32> %sub
   1716 }
   1717 
   1718 define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; lane-0 case: %a - %b * (lane 0 of %v) as separate mul/sub; the match needs a by-element mls (.s[0]), with ret on the next line
   1719 ; CHECK-LABEL: test_vmlsq_laneq_s32_0:
   1720 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1721 ; CHECK-NEXT: ret
   1722 entry:
   1723   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1724   %mul = mul <4 x i32> %shuffle, %b
   1725   %sub = sub <4 x i32> %a, %mul
   1726   ret <4 x i32> %sub
   1727 }
   1728 
   1729 define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; lane-0 case: integer mul of %a by lane 0 of %v; the match needs a by-element mul (.h[0]), with ret on the next line
   1730 ; CHECK-LABEL: test_vmul_lane_s16_0:
   1731 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1732 ; CHECK-NEXT: ret
   1733 entry:
   1734   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1735   %mul = mul <4 x i16> %shuffle, %a
   1736   ret <4 x i16> %mul
   1737 }
   1738 
   1739 define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; lane-0 case: integer mul of the 128-bit %a by lane 0 of the 64-bit %v; the match needs a by-element mul (.h[0]), with ret on the next line
   1740 ; CHECK-LABEL: test_vmulq_lane_s16_0:
   1741 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1742 ; CHECK-NEXT: ret
   1743 entry:
   1744   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 8 elements
   1745   %mul = mul <8 x i16> %shuffle, %a
   1746   ret <8 x i16> %mul
   1747 }
   1748 
   1749 define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; lane-0 case: integer mul of %a by lane 0 of %v; the match needs a by-element mul (.s[0]), with ret on the next line
   1750 ; CHECK-LABEL: test_vmul_lane_s32_0:
   1751 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1752 ; CHECK-NEXT: ret
   1753 entry:
   1754   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1755   %mul = mul <2 x i32> %shuffle, %a
   1756   ret <2 x i32> %mul
   1757 }
   1758 
   1759 define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; lane-0 case: integer mul of the 128-bit %a by lane 0 of the 64-bit %v; the match needs a by-element mul (.s[0]), with ret on the next line
   1760 ; CHECK-LABEL: test_vmulq_lane_s32_0:
   1761 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1762 ; CHECK-NEXT: ret
   1763 entry:
   1764   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1765   %mul = mul <4 x i32> %shuffle, %a
   1766   ret <4 x i32> %mul
   1767 }
   1768 
   1769 define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { ; lane-0 case, "u16" name: integer mul of %a by lane 0 of %v; the match needs a by-element mul (.h[0]), with ret on the next line
   1770 ; CHECK-LABEL: test_vmul_lane_u16_0:
   1771 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1772 ; CHECK-NEXT: ret
   1773 entry:
   1774   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1775   %mul = mul <4 x i16> %shuffle, %a
   1776   ret <4 x i16> %mul
   1777 }
   1778 
   1779 define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { ; lane-0 case, "u16" name: integer mul of the 128-bit %a by lane 0 of the 64-bit %v; the match needs a by-element mul (.h[0]), with ret on the next line
   1780 ; CHECK-LABEL: test_vmulq_lane_u16_0:
   1781 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1782 ; CHECK-NEXT: ret
   1783 entry:
   1784   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 8 elements
   1785   %mul = mul <8 x i16> %shuffle, %a
   1786   ret <8 x i16> %mul
   1787 }
   1788 
   1789 define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { ; lane-0 case, "u32" name: integer mul of %a by lane 0 of %v; the match needs a by-element mul (.s[0]), with ret on the next line
   1790 ; CHECK-LABEL: test_vmul_lane_u32_0:
   1791 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1792 ; CHECK-NEXT: ret
   1793 entry:
   1794   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1795   %mul = mul <2 x i32> %shuffle, %a
   1796   ret <2 x i32> %mul
   1797 }
   1798 
   1799 define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { ; lane-0 case, "u32" name: integer mul of the 128-bit %a by lane 0 of the 64-bit %v; the match needs a by-element mul (.s[0]), with ret on the next line
   1800 ; CHECK-LABEL: test_vmulq_lane_u32_0:
   1801 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1802 ; CHECK-NEXT: ret
   1803 entry:
   1804   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1805   %mul = mul <4 x i32> %shuffle, %a
   1806   ret <4 x i32> %mul
   1807 }
   1808 
   1809 define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; lane-0 case: integer mul of %a by lane 0 of the 128-bit %v; the match needs a by-element mul (.h[0]), with ret on the next line
   1810 ; CHECK-LABEL: test_vmul_laneq_s16_0:
   1811 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1812 ; CHECK-NEXT: ret
   1813 entry:
   1814   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1815   %mul = mul <4 x i16> %shuffle, %a
   1816   ret <4 x i16> %mul
   1817 }
   1818 
   1819 define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; lane-0 case: integer mul of %a by lane 0 of %v; the match needs a by-element mul (.h[0]), with ret on the next line
   1820 ; CHECK-LABEL: test_vmulq_laneq_s16_0:
   1821 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1822 ; CHECK-NEXT: ret
   1823 entry:
   1824   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1825   %mul = mul <8 x i16> %shuffle, %a
   1826   ret <8 x i16> %mul
   1827 }
   1828 
   1829 define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; lane-0 case: integer mul of %a by lane 0 of the 128-bit %v; the match needs a by-element mul (.s[0]), with ret on the next line
   1830 ; CHECK-LABEL: test_vmul_laneq_s32_0:
   1831 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1832 ; CHECK-NEXT: ret
   1833 entry:
   1834   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 2 elements
   1835   %mul = mul <2 x i32> %shuffle, %a
   1836   ret <2 x i32> %mul
   1837 }
   1838 
   1839 define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; lane-0 case: integer mul of %a by lane 0 of %v; the match needs a by-element mul (.s[0]), with ret on the next line
   1840 ; CHECK-LABEL: test_vmulq_laneq_s32_0:
   1841 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1842 ; CHECK-NEXT: ret
   1843 entry:
   1844   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1845   %mul = mul <4 x i32> %shuffle, %a
   1846   ret <4 x i32> %mul
   1847 }
   1848 
   1849 define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { ; lane-0 case, "u16" name: integer mul of %a by lane 0 of the 128-bit %v; the match needs a by-element mul (.h[0]), with ret on the next line
   1850 ; CHECK-LABEL: test_vmul_laneq_u16_0:
   1851 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1852 ; CHECK-NEXT: ret
   1853 entry:
   1854   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1855   %mul = mul <4 x i16> %shuffle, %a
   1856   ret <4 x i16> %mul
   1857 }
   1858 
   1859 define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; lane-0 case, "u16" name: integer mul of %a by lane 0 of %v; the match needs a by-element mul (.h[0]), with ret on the next line
   1860 ; CHECK-LABEL: test_vmulq_laneq_u16_0:
   1861 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1862 ; CHECK-NEXT: ret
   1863 entry:
   1864   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1865   %mul = mul <8 x i16> %shuffle, %a
   1866   ret <8 x i16> %mul
   1867 }
   1868 
   1869 define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { ; lane-0 case, "u32" name: integer mul of %a by lane 0 of the 128-bit %v; the match needs a by-element mul (.s[0]), with ret on the next line
   1870 ; CHECK-LABEL: test_vmul_laneq_u32_0:
   1871 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1872 ; CHECK-NEXT: ret
   1873 entry:
   1874   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 2 elements
   1875   %mul = mul <2 x i32> %shuffle, %a
   1876   ret <2 x i32> %mul
   1877 }
   1878 
   1879 define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; lane-0 case, "u32" name: integer mul of %a by lane 0 of %v; the match needs a by-element mul (.s[0]), with ret on the next line
   1880 ; CHECK-LABEL: test_vmulq_laneq_u32_0:
   1881 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1882 ; CHECK-NEXT: ret
   1883 entry:
   1884   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1885   %mul = mul <4 x i32> %shuffle, %a
   1886   ret <4 x i32> %mul
   1887 }
   1888 
   1889 define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; lane-0 case: llvm.fma of (lane 0 of %v) * %b + %a; the match needs a by-element fmla (.s[0]), with ret on the next line
   1890 ; CHECK-LABEL: test_vfma_lane_f32_0:
   1891 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1892 ; CHECK-NEXT: ret
   1893 entry:
   1894   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1895   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
   1896   ret <2 x float> %0
   1897 }
   1898 
   1899 define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; lane-0 case: llvm.fma of (lane 0 of the 64-bit %v) * %b + %a; the match needs a by-element fmla (.s[0]), with ret on the next line
   1900 ; CHECK-LABEL: test_vfmaq_lane_f32_0:
   1901 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1902 ; CHECK-NEXT: ret
   1903 entry:
   1904   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 4 elements
   1905   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
   1906   ret <4 x float> %0
   1907 }
   1908 
   1909 define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; lane-0 case: llvm.fma of (lane 0 of the 128-bit %v) * %b + %a; the match needs a by-element fmla (.s[0]), with ret on the next line
   1910 ; CHECK-LABEL: test_vfma_laneq_f32_0:
   1911 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1912 ; CHECK-NEXT: ret
   1913 entry:
   1914   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v into 2 elements
   1915   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
   1916   ret <2 x float> %0
   1917 }
   1918 
   1919 define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; lane-0 case: llvm.fma of (lane 0 of %v) * %b + %a; the match needs a by-element fmla (.s[0]), with ret on the next line
   1920 ; CHECK-LABEL: test_vfmaq_laneq_f32_0:
   1921 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1922 ; CHECK-NEXT: ret
   1923 entry:
   1924   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1925   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
   1926   ret <4 x float> %0
   1927 }
   1928 
   1929 define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; lane-0 case: llvm.fma of (lane 0 of negated %v) * %b + %a; the match needs a by-element fmls (.s[0]) followed directly by ret
   1930 ; CHECK-LABEL: test_vfms_lane_f32_0:
   1931 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1932 ; CHECK-NEXT: ret
   1933 entry:
   1934   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v ; -0.0 - %v, i.e. the whole vector %v negated before the lane is taken
   1935   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of the negated vector
   1936   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
   1937   ret <2 x float> %0
   1938 }
   1939 
   1940 define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; lane-0 case: llvm.fma of (lane 0 of negated 64-bit %v) * %b + %a; the match needs a by-element fmls (.s[0]) followed directly by ret
   1941 ; CHECK-LABEL: test_vfmsq_lane_f32_0:
   1942 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1943 ; CHECK-NEXT: ret
   1944 entry:
   1945   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v ; -0.0 - %v, i.e. the whole vector %v negated before the lane is taken
   1946   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of the negated vector into 4 elements
   1947   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
   1948   ret <4 x float> %0
   1949 }
   1950 
   1951 define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; lane-0 case: llvm.fma of (lane 0 of negated 128-bit %v) * %b + %a; the match needs a by-element fmls (.s[0]) followed directly by ret
   1952 ; CHECK-LABEL: test_vfms_laneq_f32_0:
   1953 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1954 ; CHECK-NEXT: ret
   1955 entry:
   1956   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v ; -0.0 - %v, i.e. the whole vector %v negated before the lane is taken
   1957   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of the negated vector into 2 elements
   1958   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
   1959   ret <2 x float> %0
   1960 }
   1961 
   1962 define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; lane-0 case: llvm.fma of (lane 0 of negated %v) * %b + %a; the match needs a by-element fmls (.s[0]) followed directly by ret
   1963 ; CHECK-LABEL: test_vfmsq_laneq_f32_0:
   1964 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1965 ; CHECK-NEXT: ret
   1966 entry:
   1967   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v ; -0.0 - %v, i.e. the whole vector %v negated before the lane is taken
   1968   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer ; all-zero mask: splat lane 0 of the negated vector
   1969   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
   1970   ret <4 x float> %0
   1971 }
   1972 
   1973 define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; lane-0 case: llvm.fma of (lane 0 of %v) * %b + %a; the match needs a by-element fmla (.d[0]), with ret on the next line
   1974 ; CHECK-LABEL: test_vfmaq_laneq_f64_0:
   1975 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   1976 ; CHECK-NEXT: ret
   1977 entry:
   1978   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer ; all-zero mask: splat lane 0 of %v
   1979   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
   1980   ret <2 x double> %0
   1981 }
   1982 
   1983 define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
        ; Negates %v (as -0.0 - %v), splats lane 0 across two doubles and feeds it
        ; to llvm.fma together with %b, accumulating into %a.
   1984 ; CHECK-LABEL: test_vfmsq_laneq_f64_0:
   1985 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   1986 ; CHECK-NEXT: ret
   1987 entry:
   1988   %neg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   1989   %splat = shufflevector <2 x double> %neg, <2 x double> undef, <2 x i32> zeroinitializer
   1990   %fmls = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %splat, <2 x double> %b, <2 x double> %a)
   1991   ret <2 x double> %fmls
   1992 }
   1993 
   1994 define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
        ; smull of %b with a splat of lane 0 of %v, added to %a.
        ; The signed mnemonic is matched in full so that umlal cannot satisfy the test.
   1995 ; CHECK-LABEL: test_vmlal_lane_s16_0:
   1996 ; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1997 ; CHECK-NEXT: ret
   1998 entry:
   1999   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2000   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2001   %add = add <4 x i32> %vmull2.i, %a
   2002   ret <4 x i32> %add
   2003 }
   2004 
   2005 define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
        ; smull of %b with a splat of lane 0 of %v, added to %a.
        ; The signed mnemonic is matched in full so that umlal cannot satisfy the test.
   2006 ; CHECK-LABEL: test_vmlal_lane_s32_0:
   2007 ; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2008 ; CHECK-NEXT: ret
   2009 entry:
   2010   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2011   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2012   %add = add <2 x i64> %vmull2.i, %a
   2013   ret <2 x i64> %add
   2014 }
   2015 
   2016 define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
        ; smull of %b with a splat of lane 0 of the 128-bit %v, added to %a.
        ; The signed mnemonic is matched in full so that umlal cannot satisfy the test.
   2017 ; CHECK-LABEL: test_vmlal_laneq_s16_0:
   2018 ; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2019 ; CHECK-NEXT: ret
   2020 entry:
   2021   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2022   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2023   %add = add <4 x i32> %vmull2.i, %a
   2024   ret <4 x i32> %add
   2025 }
   2026 
   2027 define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
        ; smull of %b with a splat of lane 0 of the 128-bit %v, added to %a.
        ; The signed mnemonic is matched in full so that umlal cannot satisfy the test.
   2028 ; CHECK-LABEL: test_vmlal_laneq_s32_0:
   2029 ; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2030 ; CHECK-NEXT: ret
   2031 entry:
   2032   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2033   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2034   %add = add <2 x i64> %vmull2.i, %a
   2035   ret <2 x i64> %add
   2036 }
   2037 
   2038 define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
        ; smull of elements 4..7 of %b with a splat of lane 0 of %v, added to %a.
        ; The signed mnemonic is matched in full so that umlal2 cannot satisfy the test.
   2039 ; CHECK-LABEL: test_vmlal_high_lane_s16_0:
   2040 ; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2041 ; CHECK-NEXT: ret
   2042 entry:
   2043   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2044   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2045   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2046   %add = add <4 x i32> %vmull2.i, %a
   2047   ret <4 x i32> %add
   2048 }
   2049 
   2050 define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
        ; smull of elements 2..3 of %b with a splat of lane 0 of %v, added to %a.
        ; The signed mnemonic is matched in full so that umlal2 cannot satisfy the test.
   2051 ; CHECK-LABEL: test_vmlal_high_lane_s32_0:
   2052 ; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2053 ; CHECK-NEXT: ret
   2054 entry:
   2055   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2056   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2057   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2058   %add = add <2 x i64> %vmull2.i, %a
   2059   ret <2 x i64> %add
   2060 }
   2061 
   2062 define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
        ; smull of elements 4..7 of %b with a splat of lane 0 of the 128-bit %v, added to %a.
        ; The signed mnemonic is matched in full so that umlal2 cannot satisfy the test.
   2063 ; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
   2064 ; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2065 ; CHECK-NEXT: ret
   2066 entry:
   2067   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2068   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2069   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2070   %add = add <4 x i32> %vmull2.i, %a
   2071   ret <4 x i32> %add
   2072 }
   2073 
   2074 define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
        ; smull of elements 2..3 of %b with a splat of lane 0 of the 128-bit %v, added to %a.
        ; The signed mnemonic is matched in full so that umlal2 cannot satisfy the test.
   2075 ; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
   2076 ; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2077 ; CHECK-NEXT: ret
   2078 entry:
   2079   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2080   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2081   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2082   %add = add <2 x i64> %vmull2.i, %a
   2083   ret <2 x i64> %add
   2084 }
   2085 
   2086 define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
        ; smull of %b with a splat of lane 0 of %v, subtracted from %a.
        ; The signed mnemonic is matched in full so that umlsl cannot satisfy the test.
   2087 ; CHECK-LABEL: test_vmlsl_lane_s16_0:
   2088 ; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2089 ; CHECK-NEXT: ret
   2090 entry:
   2091   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2092   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2093   %sub = sub <4 x i32> %a, %vmull2.i
   2094   ret <4 x i32> %sub
   2095 }
   2096 
   2097 define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
        ; smull of %b with a splat of lane 0 of %v, subtracted from %a.
        ; The signed mnemonic is matched in full so that umlsl cannot satisfy the test.
   2098 ; CHECK-LABEL: test_vmlsl_lane_s32_0:
   2099 ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2100 ; CHECK-NEXT: ret
   2101 entry:
   2102   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2103   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2104   %sub = sub <2 x i64> %a, %vmull2.i
   2105   ret <2 x i64> %sub
   2106 }
   2107 
   2108 define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
        ; smull of %b with a splat of lane 0 of the 128-bit %v, subtracted from %a.
        ; The signed mnemonic is matched in full so that umlsl cannot satisfy the test.
   2109 ; CHECK-LABEL: test_vmlsl_laneq_s16_0:
   2110 ; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2111 ; CHECK-NEXT: ret
   2112 entry:
   2113   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2114   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2115   %sub = sub <4 x i32> %a, %vmull2.i
   2116   ret <4 x i32> %sub
   2117 }
   2118 
   2119 define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
        ; smull of %b with a splat of lane 0 of the 128-bit %v, subtracted from %a.
        ; The signed mnemonic is matched in full so that umlsl cannot satisfy the test.
   2120 ; CHECK-LABEL: test_vmlsl_laneq_s32_0:
   2121 ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2122 ; CHECK-NEXT: ret
   2123 entry:
   2124   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2125   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2126   %sub = sub <2 x i64> %a, %vmull2.i
   2127   ret <2 x i64> %sub
   2128 }
   2129 
   2130 define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
        ; smull of elements 4..7 of %b with a splat of lane 0 of %v, subtracted from %a.
        ; The signed mnemonic is matched in full so that umlsl2 cannot satisfy the test.
   2131 ; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
   2132 ; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2133 ; CHECK-NEXT: ret
   2134 entry:
   2135   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2136   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2137   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2138   %sub = sub <4 x i32> %a, %vmull2.i
   2139   ret <4 x i32> %sub
   2140 }
   2141 
   2142 define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
        ; smull of elements 2..3 of %b with a splat of lane 0 of %v, subtracted from %a.
        ; The signed mnemonic is matched in full so that umlsl2 cannot satisfy the test.
   2143 ; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
   2144 ; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2145 ; CHECK-NEXT: ret
   2146 entry:
   2147   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2148   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2149   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2150   %sub = sub <2 x i64> %a, %vmull2.i
   2151   ret <2 x i64> %sub
   2152 }
   2153 
   2154 define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
        ; smull of elements 4..7 of %b with a splat of lane 0 of the 128-bit %v, subtracted from %a.
        ; The signed mnemonic is matched in full so that umlsl2 cannot satisfy the test.
   2155 ; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
   2156 ; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2157 ; CHECK-NEXT: ret
   2158 entry:
   2159   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2160   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2161   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2162   %sub = sub <4 x i32> %a, %vmull2.i
   2163   ret <4 x i32> %sub
   2164 }
   2165 
   2166 define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
        ; smull of elements 2..3 of %b with a splat of lane 0 of the 128-bit %v, subtracted from %a.
        ; The signed mnemonic is matched in full so that umlsl2 cannot satisfy the test.
   2167 ; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
   2168 ; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2169 ; CHECK-NEXT: ret
   2170 entry:
   2171   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2172   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2173   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2174   %sub = sub <2 x i64> %a, %vmull2.i
   2175   ret <2 x i64> %sub
   2176 }
   2177 
   2178 define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
        ; umull of %b with a splat of lane 0 of %v, added to %a.
        ; The unsigned mnemonic is matched in full so that smlal cannot satisfy the test.
   2179 ; CHECK-LABEL: test_vmlal_lane_u16_0:
   2180 ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2181 ; CHECK-NEXT: ret
   2182 entry:
   2183   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2184   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2185   %add = add <4 x i32> %vmull2.i, %a
   2186   ret <4 x i32> %add
   2187 }
   2188 
   2189 define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
        ; umull of %b with a splat of lane 0 of %v, added to %a.
        ; The unsigned mnemonic is matched in full so that smlal cannot satisfy the test.
   2190 ; CHECK-LABEL: test_vmlal_lane_u32_0:
   2191 ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2192 ; CHECK-NEXT: ret
   2193 entry:
   2194   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2195   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2196   %add = add <2 x i64> %vmull2.i, %a
   2197   ret <2 x i64> %add
   2198 }
   2199 
   2200 define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
        ; umull of %b with a splat of lane 0 of the 128-bit %v, added to %a.
        ; The unsigned mnemonic is matched in full so that smlal cannot satisfy the test.
   2201 ; CHECK-LABEL: test_vmlal_laneq_u16_0:
   2202 ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2203 ; CHECK-NEXT: ret
   2204 entry:
   2205   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2206   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2207   %add = add <4 x i32> %vmull2.i, %a
   2208   ret <4 x i32> %add
   2209 }
   2210 
   2211 define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
        ; umull of %b with a splat of lane 0 of the 128-bit %v, added to %a.
        ; The unsigned mnemonic is matched in full so that smlal cannot satisfy the test.
   2212 ; CHECK-LABEL: test_vmlal_laneq_u32_0:
   2213 ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2214 ; CHECK-NEXT: ret
   2215 entry:
   2216   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2217   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2218   %add = add <2 x i64> %vmull2.i, %a
   2219   ret <2 x i64> %add
   2220 }
   2221 
   2222 define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
        ; umull of elements 4..7 of %b with a splat of lane 0 of %v, added to %a.
        ; The unsigned mnemonic is matched in full so that smlal2 cannot satisfy the test.
   2223 ; CHECK-LABEL: test_vmlal_high_lane_u16_0:
   2224 ; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2225 ; CHECK-NEXT: ret
   2226 entry:
   2227   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2228   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2229   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2230   %add = add <4 x i32> %vmull2.i, %a
   2231   ret <4 x i32> %add
   2232 }
   2233 
   2234 define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
        ; umull of elements 2..3 of %b with a splat of lane 0 of %v, added to %a.
        ; The unsigned mnemonic is matched in full so that smlal2 cannot satisfy the test.
   2235 ; CHECK-LABEL: test_vmlal_high_lane_u32_0:
   2236 ; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2237 ; CHECK-NEXT: ret
   2238 entry:
   2239   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2240   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2241   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2242   %add = add <2 x i64> %vmull2.i, %a
   2243   ret <2 x i64> %add
   2244 }
   2245 
   2246 define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
        ; umull of elements 4..7 of %b with a splat of lane 0 of the 128-bit %v, added to %a.
        ; The unsigned mnemonic is matched in full so that smlal2 cannot satisfy the test.
   2247 ; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
   2248 ; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2249 ; CHECK-NEXT: ret
   2250 entry:
   2251   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2252   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2253   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2254   %add = add <4 x i32> %vmull2.i, %a
   2255   ret <4 x i32> %add
   2256 }
   2257 
   2258 define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
        ; umull of elements 2..3 of %b with a splat of lane 0 of the 128-bit %v, added to %a.
        ; The unsigned mnemonic is matched in full so that smlal2 cannot satisfy the test.
   2259 ; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
   2260 ; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2261 ; CHECK-NEXT: ret
   2262 entry:
   2263   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2264   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2265   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2266   %add = add <2 x i64> %vmull2.i, %a
   2267   ret <2 x i64> %add
   2268 }
   2269 
   2270 define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
        ; umull of %b with a splat of lane 0 of %v, subtracted from %a.
        ; The unsigned mnemonic is matched in full so that smlsl cannot satisfy the test.
   2271 ; CHECK-LABEL: test_vmlsl_lane_u16_0:
   2272 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2273 ; CHECK-NEXT: ret
   2274 entry:
   2275   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2276   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2277   %sub = sub <4 x i32> %a, %vmull2.i
   2278   ret <4 x i32> %sub
   2279 }
   2280 
   2281 define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
        ; umull of %b with a splat of lane 0 of %v, subtracted from %a.
        ; The unsigned mnemonic is matched in full so that smlsl cannot satisfy the test.
   2282 ; CHECK-LABEL: test_vmlsl_lane_u32_0:
   2283 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2284 ; CHECK-NEXT: ret
   2285 entry:
   2286   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2287   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2288   %sub = sub <2 x i64> %a, %vmull2.i
   2289   ret <2 x i64> %sub
   2290 }
   2291 
   2292 define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
        ; umull of %b with a splat of lane 0 of the 128-bit %v, subtracted from %a.
        ; The unsigned mnemonic is matched in full so that smlsl cannot satisfy the test.
   2293 ; CHECK-LABEL: test_vmlsl_laneq_u16_0:
   2294 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2295 ; CHECK-NEXT: ret
   2296 entry:
   2297   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2298   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2299   %sub = sub <4 x i32> %a, %vmull2.i
   2300   ret <4 x i32> %sub
   2301 }
   2302 
   2303 define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
        ; umull of %b with a splat of lane 0 of the 128-bit %v, subtracted from %a.
        ; The unsigned mnemonic is matched in full so that smlsl cannot satisfy the test.
   2304 ; CHECK-LABEL: test_vmlsl_laneq_u32_0:
   2305 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2306 ; CHECK-NEXT: ret
   2307 entry:
   2308   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2309   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2310   %sub = sub <2 x i64> %a, %vmull2.i
   2311   ret <2 x i64> %sub
   2312 }
   2313 
   2314 define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
        ; umull of elements 4..7 of %b with a splat of lane 0 of %v, subtracted from %a.
        ; The unsigned mnemonic is matched in full so that smlsl2 cannot satisfy the test.
   2315 ; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
   2316 ; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2317 ; CHECK-NEXT: ret
   2318 entry:
   2319   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2320   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2321   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2322   %sub = sub <4 x i32> %a, %vmull2.i
   2323   ret <4 x i32> %sub
   2324 }
   2325 
   2326 define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
        ; umull of elements 2..3 of %b with a splat of lane 0 of %v, subtracted from %a.
        ; The unsigned mnemonic is matched in full so that smlsl2 cannot satisfy the test.
   2327 ; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
   2328 ; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2329 ; CHECK-NEXT: ret
   2330 entry:
   2331   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2332   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2333   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2334   %sub = sub <2 x i64> %a, %vmull2.i
   2335   ret <2 x i64> %sub
   2336 }
   2337 
   2338 define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
        ; umull of elements 4..7 of %b with a splat of lane 0 of the 128-bit %v, subtracted from %a.
        ; The unsigned mnemonic is matched in full so that smlsl2 cannot satisfy the test.
   2339 ; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
   2340 ; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2341 ; CHECK-NEXT: ret
   2342 entry:
   2343   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2344   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2345   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2346   %sub = sub <4 x i32> %a, %vmull2.i
   2347   ret <4 x i32> %sub
   2348 }
   2349 
   2350 define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
        ; umull of elements 2..3 of %b with a splat of lane 0 of the 128-bit %v, subtracted from %a.
        ; The unsigned mnemonic is matched in full so that smlsl2 cannot satisfy the test.
   2351 ; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
   2352 ; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2353 ; CHECK-NEXT: ret
   2354 entry:
   2355   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2356   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2357   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2358   %sub = sub <2 x i64> %a, %vmull2.i
   2359   ret <2 x i64> %sub
   2360 }
   2361 
   2362 define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
        ; smull of %a with a splat of lane 0 of %v.
        ; The signed mnemonic is matched in full so that umull cannot satisfy the test.
   2363 ; CHECK-LABEL: test_vmull_lane_s16_0:
   2364 ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2365 ; CHECK-NEXT: ret
   2366 entry:
   2367   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2368   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2369   ret <4 x i32> %vmull2.i
   2370 }
   2371 
   2372 define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
        ; smull of %a with a splat of lane 0 of %v.
        ; The signed mnemonic is matched in full so that umull cannot satisfy the test.
   2373 ; CHECK-LABEL: test_vmull_lane_s32_0:
   2374 ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2375 ; CHECK-NEXT: ret
   2376 entry:
   2377   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2378   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2379   ret <2 x i64> %vmull2.i
   2380 }
   2381 
   2382 define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
        ; umull of %a with a splat of lane 0 of %v.
        ; The unsigned mnemonic is matched in full so that smull cannot satisfy the test.
   2383 ; CHECK-LABEL: test_vmull_lane_u16_0:
   2384 ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2385 ; CHECK-NEXT: ret
   2386 entry:
   2387   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2388   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2389   ret <4 x i32> %vmull2.i
   2390 }
   2391 
   2392 define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
        ; umull of %a with a splat of lane 0 of %v.
        ; The unsigned mnemonic is matched in full so that smull cannot satisfy the test.
   2393 ; CHECK-LABEL: test_vmull_lane_u32_0:
   2394 ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2395 ; CHECK-NEXT: ret
   2396 entry:
   2397   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2398   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2399   ret <2 x i64> %vmull2.i
   2400 }
   2401 
   2402 define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
        ; smull of elements 4..7 of %a with a splat of lane 0 of %v.
        ; The signed mnemonic is matched in full so that umull2 cannot satisfy the test.
   2403 ; CHECK-LABEL: test_vmull_high_lane_s16_0:
   2404 ; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2405 ; CHECK-NEXT: ret
   2406 entry:
   2407   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2408   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2409   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2410   ret <4 x i32> %vmull2.i
   2411 }
   2412 
   2413 define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
        ; smull of elements 2..3 of %a with a splat of lane 0 of %v.
        ; The signed mnemonic is matched in full so that umull2 cannot satisfy the test.
   2414 ; CHECK-LABEL: test_vmull_high_lane_s32_0:
   2415 ; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2416 ; CHECK-NEXT: ret
   2417 entry:
   2418   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2419   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2420   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2421   ret <2 x i64> %vmull2.i
   2422 }
   2423 
   2424 define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
        ; umull of elements 4..7 of %a with a splat of lane 0 of %v.
        ; The unsigned mnemonic is matched in full so that smull2 cannot satisfy the test.
   2425 ; CHECK-LABEL: test_vmull_high_lane_u16_0:
   2426 ; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2427 ; CHECK-NEXT: ret
   2428 entry:
   2429   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2430   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2431   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2432   ret <4 x i32> %vmull2.i
   2433 }
   2434 
   2435 define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
        ; umull of elements 2..3 of %a with a splat of lane 0 of %v.
        ; The unsigned mnemonic is matched in full so that smull2 cannot satisfy the test.
   2436 ; CHECK-LABEL: test_vmull_high_lane_u32_0:
   2437 ; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2438 ; CHECK-NEXT: ret
   2439 entry:
   2440   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2441   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2442   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2443   ret <2 x i64> %vmull2.i
   2444 }
   2445 
   2446 define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
        ; smull of %a with a splat of lane 0 of the 128-bit %v.
        ; The signed mnemonic is matched in full so that umull cannot satisfy the test.
   2447 ; CHECK-LABEL: test_vmull_laneq_s16_0:
   2448 ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2449 ; CHECK-NEXT: ret
   2450 entry:
   2451   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2452   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2453   ret <4 x i32> %vmull2.i
   2454 }
   2455 
   2456 define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
        ; smull of %a with a splat of lane 0 of the 128-bit %v.
        ; The signed mnemonic is matched in full so that umull cannot satisfy the test.
   2457 ; CHECK-LABEL: test_vmull_laneq_s32_0:
   2458 ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2459 ; CHECK-NEXT: ret
   2460 entry:
   2461   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2462   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2463   ret <2 x i64> %vmull2.i
   2464 }
   2465 
   2466 define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
        ; umull of %a with a splat of lane 0 of the 128-bit %v.
        ; The unsigned mnemonic is matched in full so that smull cannot satisfy the test.
   2467 ; CHECK-LABEL: test_vmull_laneq_u16_0:
   2468 ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2469 ; CHECK-NEXT: ret
   2470 entry:
   2471   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2472   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2473   ret <4 x i32> %vmull2.i
   2474 }
   2475 
   2476 define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
        ; umull of %a with a splat of lane 0 of the 128-bit %v.
        ; The unsigned mnemonic is matched in full so that smull cannot satisfy the test.
   2477 ; CHECK-LABEL: test_vmull_laneq_u32_0:
   2478 ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2479 ; CHECK-NEXT: ret
   2480 entry:
   2481   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2482   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2483   ret <2 x i64> %vmull2.i
   2484 }
   2485 
   2486 define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
        ; smull of elements 4..7 of %a with a splat of lane 0 of the 128-bit %v.
        ; The signed mnemonic is matched in full so that umull2 cannot satisfy the test.
   2487 ; CHECK-LABEL: test_vmull_high_laneq_s16_0:
   2488 ; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2489 ; CHECK-NEXT: ret
   2490 entry:
   2491   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2492   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2493   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2494   ret <4 x i32> %vmull2.i
   2495 }
   2496 
   2497 define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
        ; smull of elements 2..3 of %a with a splat of lane 0 of the 128-bit %v.
        ; The signed mnemonic is matched in full so that umull2 cannot satisfy the test.
   2498 ; CHECK-LABEL: test_vmull_high_laneq_s32_0:
   2499 ; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2500 ; CHECK-NEXT: ret
   2501 entry:
   2502   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2503   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2504   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2505   ret <2 x i64> %vmull2.i
   2506 }
   2507 
   2508 define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
        ; umull of elements 4..7 of %a with a splat of lane 0 of the 128-bit %v.
        ; The unsigned mnemonic is matched in full so that smull2 cannot satisfy the test.
   2509 ; CHECK-LABEL: test_vmull_high_laneq_u16_0:
   2510 ; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2511 ; CHECK-NEXT: ret
   2512 entry:
   2513   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2514   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2515   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2516   ret <4 x i32> %vmull2.i
   2517 }
   2518 
   2519 define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
        ; umull of elements 2..3 of %a with a splat of lane 0 of the 128-bit %v.
        ; The unsigned mnemonic is matched in full so that smull2 cannot satisfy the test.
   2520 ; CHECK-LABEL: test_vmull_high_laneq_u32_0:
   2521 ; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2522 ; CHECK-NEXT: ret
   2523 entry:
   2524   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2525   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2526   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2527   ret <2 x i64> %vmull2.i
   2528 }
   2529 
   2530 define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
        ; sqdmull of %b with a splat of lane 0 of %v, combined with %a through sqadd.
        ; The mnemonic is spelled in full rather than matched as a bare suffix.
   2531 ; CHECK-LABEL: test_vqdmlal_lane_s16_0:
   2532 ; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2533 ; CHECK-NEXT: ret
   2534 entry:
   2535   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2536   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2537   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   2538   ret <4 x i32> %vqdmlal4.i
   2539 }
   2540 
   2541 define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
        ; sqdmull of %b with a splat of lane 0 of %v, combined with %a through sqadd.
        ; The mnemonic is spelled in full rather than matched as a bare suffix.
   2542 ; CHECK-LABEL: test_vqdmlal_lane_s32_0:
   2543 ; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2544 ; CHECK-NEXT: ret
   2545 entry:
   2546   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2547   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2548   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   2549   ret <2 x i64> %vqdmlal4.i
   2550 }
   2551 
   2552 define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
        ; sqdmull of elements 4..7 of %b with a splat of lane 0 of %v, combined with
        ; %a through sqadd. The mnemonic is spelled in full rather than as a bare suffix.
   2553 ; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
   2554 ; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2555 ; CHECK-NEXT: ret
   2556 entry:
   2557   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2558   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2559   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2560   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   2561   ret <4 x i32> %vqdmlal4.i
   2562 }
   2563 
   2564 define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
        ; sqdmull of elements 2..3 of %b with a splat of lane 0 of %v, combined with
        ; %a through sqadd. The mnemonic is spelled in full rather than as a bare suffix.
   2565 ; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
   2566 ; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2567 ; CHECK-NEXT: ret
   2568 entry:
   2569   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2570   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2571   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2572   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   2573   ret <2 x i64> %vqdmlal4.i
   2574 }
   2575 
   2576 define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
        ; sqdmull of %b with a splat of lane 0 of %v, combined with %a through sqsub.
        ; The mnemonic is spelled in full rather than matched as a bare suffix.
   2577 ; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
   2578 ; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2579 ; CHECK-NEXT: ret
   2580 entry:
   2581   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2582   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2583   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   2584   ret <4 x i32> %vqdmlsl4.i
   2585 }
   2586 
   2587 define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
        ; Broadcasts lane 0 of %v, sqdmull-multiplies it with %b and sqsub-subtracts
        ; the product from %a. The expected output is one by-element sqdmlsl
        ; (full mnemonic) directly followed by ret.
   2588 ; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
   2589 ; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2590 ; CHECK-NEXT: ret
   2591 entry:
   2592   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2593   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2594   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   2595   ret <2 x i64> %vqdmlsl4.i
   2596 }
   2597 
   2598 define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
        ; Broadcasts lane 0 of %v, sqdmull-multiplies it with elements 4..7 of %b
        ; and sqsub-subtracts the product from %a. The expected output is one
        ; by-element sqdmlsl2 (full mnemonic) directly followed by ret.
   2599 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
   2600 ; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2601 ; CHECK-NEXT: ret
   2602 entry:
   2603   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2604   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2605   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2606   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   2607   ret <4 x i32> %vqdmlsl4.i
   2608 }
   2609 
   2610 define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
        ; Broadcasts lane 0 of %v, sqdmull-multiplies it with elements 2..3 of %b
        ; and sqsub-subtracts the product from %a. The expected output is one
        ; by-element sqdmlsl2 (full mnemonic) directly followed by ret.
   2611 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
   2612 ; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2613 ; CHECK-NEXT: ret
   2614 entry:
   2615   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2616   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2617   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2618   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   2619   ret <2 x i64> %vqdmlsl4.i
   2620 }
   2621 
   2622 define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
        ; Broadcasts lane 0 of %v and sqdmull-multiplies it with %a. The expected
        ; output is one by-element sqdmull (full mnemonic) directly followed by ret.
   2623 ; CHECK-LABEL: test_vqdmull_lane_s16_0:
   2624 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2625 ; CHECK-NEXT: ret
   2626 entry:
   2627   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2628   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2629   ret <4 x i32> %vqdmull2.i
   2630 }
   2631 
   2632 define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
        ; Broadcasts lane 0 of %v and sqdmull-multiplies it with %a. The expected
        ; output is one by-element sqdmull (full mnemonic) directly followed by ret.
   2633 ; CHECK-LABEL: test_vqdmull_lane_s32_0:
   2634 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2635 ; CHECK-NEXT: ret
   2636 entry:
   2637   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2638   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2639   ret <2 x i64> %vqdmull2.i
   2640 }
   2641 
   2642 define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
        ; Broadcasts lane 0 of the 8-element %v into four elements and
        ; sqdmull-multiplies it with %a. The expected output is one by-element
        ; sqdmull (full mnemonic) directly followed by ret.
   2643 ; CHECK-LABEL: test_vqdmull_laneq_s16_0:
   2644 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2645 ; CHECK-NEXT: ret
   2646 entry:
   2647   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2648   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2649   ret <4 x i32> %vqdmull2.i
   2650 }
   2651 
   2652 define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
        ; Broadcasts lane 0 of the 4-element %v into two elements and
        ; sqdmull-multiplies it with %a. The expected output is one by-element
        ; sqdmull (full mnemonic) directly followed by ret.
   2653 ; CHECK-LABEL: test_vqdmull_laneq_s32_0:
   2654 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2655 ; CHECK-NEXT: ret
   2656 entry:
   2657   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2658   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2659   ret <2 x i64> %vqdmull2.i
   2660 }
   2661 
   2662 define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
        ; Broadcasts lane 0 of %v and sqdmull-multiplies it with elements 4..7 of
        ; %a. The expected output is one by-element sqdmull2 (full mnemonic)
        ; directly followed by ret.
   2663 ; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
   2664 ; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2665 ; CHECK-NEXT: ret
   2666 entry:
   2667   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2668   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2669   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2670   ret <4 x i32> %vqdmull2.i
   2671 }
   2672 
   2673 define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
        ; Broadcasts lane 0 of %v and sqdmull-multiplies it with elements 2..3 of
        ; %a. The expected output is one by-element sqdmull2 (full mnemonic)
        ; directly followed by ret.
   2674 ; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
   2675 ; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2676 ; CHECK-NEXT: ret
   2677 entry:
   2678   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2679   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2680   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2681   ret <2 x i64> %vqdmull2.i
   2682 }
   2683 
   2684 define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
        ; Broadcasts lane 0 of the 8-element %v into four elements and
        ; sqdmull-multiplies it with elements 4..7 of %a. The expected output is
        ; one by-element sqdmull2 (full mnemonic) directly followed by ret.
   2685 ; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
   2686 ; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2687 ; CHECK-NEXT: ret
   2688 entry:
   2689   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2690   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2691   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2692   ret <4 x i32> %vqdmull2.i
   2693 }
   2694 
   2695 define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
        ; Broadcasts lane 0 of the 4-element %v into two elements and
        ; sqdmull-multiplies it with elements 2..3 of %a. The expected output is
        ; one by-element sqdmull2 (full mnemonic) directly followed by ret.
   2696 ; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
   2697 ; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2698 ; CHECK-NEXT: ret
   2699 entry:
   2700   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2701   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2702   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2703   ret <2 x i64> %vqdmull2.i
   2704 }
   2705 
   2706 define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
        ; Broadcasts lane 0 of %v and passes it with %a to the sqdmulh intrinsic.
        ; The expected output is one by-element sqdmulh (full mnemonic) directly
        ; followed by ret.
   2707 ; CHECK-LABEL: test_vqdmulh_lane_s16_0:
   2708 ; CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2709 ; CHECK-NEXT: ret
   2710 entry:
   2711   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2712   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   2713   ret <4 x i16> %vqdmulh2.i
   2714 }
   2715 
   2716 define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
        ; Broadcasts lane 0 of the 4-element %v into eight elements and passes it
        ; with %a to the sqdmulh intrinsic. The expected output is one by-element
        ; sqdmulh (full mnemonic) directly followed by ret.
   2717 ; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
   2718 ; CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2719 ; CHECK-NEXT: ret
   2720 entry:
   2721   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   2722   %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   2723   ret <8 x i16> %vqdmulh2.i
   2724 }
   2725 
   2726 define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
        ; Broadcasts lane 0 of %v and passes it with %a to the sqdmulh intrinsic.
        ; The expected output is one by-element sqdmulh (full mnemonic) directly
        ; followed by ret.
   2727 ; CHECK-LABEL: test_vqdmulh_lane_s32_0:
   2728 ; CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2729 ; CHECK-NEXT: ret
   2730 entry:
   2731   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2732   %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   2733   ret <2 x i32> %vqdmulh2.i
   2734 }
   2735 
   2736 define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
        ; Broadcasts lane 0 of the 2-element %v into four elements and passes it
        ; with %a to the sqdmulh intrinsic. The expected output is one by-element
        ; sqdmulh (full mnemonic) directly followed by ret.
   2737 ; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
   2738 ; CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2739 ; CHECK-NEXT: ret
   2740 entry:
   2741   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   2742   %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   2743   ret <4 x i32> %vqdmulh2.i
   2744 }
   2745 
   2746 define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
        ; Broadcasts lane 0 of %v and passes it with %a to the sqrdmulh intrinsic.
        ; The expected output is one by-element sqrdmulh (full mnemonic) directly
        ; followed by ret.
   2747 ; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
   2748 ; CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2749 ; CHECK-NEXT: ret
   2750 entry:
   2751   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2752   %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   2753   ret <4 x i16> %vqrdmulh2.i
   2754 }
   2755 
   2756 define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
        ; Broadcasts lane 0 of the 4-element %v into eight elements and passes it
        ; with %a to the sqrdmulh intrinsic. The expected output is one by-element
        ; sqrdmulh (full mnemonic) directly followed by ret.
   2757 ; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
   2758 ; CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2759 ; CHECK-NEXT: ret
   2760 entry:
   2761   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   2762   %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   2763   ret <8 x i16> %vqrdmulh2.i
   2764 }
   2765 
   2766 define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
        ; Broadcasts lane 0 of %v and passes it with %a to the sqrdmulh intrinsic.
        ; The expected output is one by-element sqrdmulh (full mnemonic) directly
        ; followed by ret.
   2767 ; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
   2768 ; CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2769 ; CHECK-NEXT: ret
   2770 entry:
   2771   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2772   %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   2773   ret <2 x i32> %vqrdmulh2.i
   2774 }
   2775 
   2776 define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
        ; Broadcasts lane 0 of the 2-element %v into four elements and passes it
        ; with %a to the sqrdmulh intrinsic. The expected output is one by-element
        ; sqrdmulh (full mnemonic) directly followed by ret.
   2777 ; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
   2778 ; CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2779 ; CHECK-NEXT: ret
   2780 entry:
   2781   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   2782   %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   2783   ret <4 x i32> %vqrdmulh2.i
   2784 }
   2785 
   2786 define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
        ; Multiplies %a by a broadcast of lane 0 of %v using a plain IR fmul; the
        ; expected output is one by-element fmul directly followed by ret.
   2787 ; CHECK-LABEL: test_vmul_lane_f32_0:
   2788 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2789 ; CHECK-NEXT: ret
   2790 entry:
   2791   %v.lane0 = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   2792   %prod = fmul <2 x float> %v.lane0, %a
   2793   ret <2 x float> %prod
   2794 }
   2795 
   2796 define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
        ; Multiplies %a by lane 0 of the 2-element %v broadcast to four elements,
        ; using a plain IR fmul; the expected output is one by-element fmul
        ; directly followed by ret.
   2797 ; CHECK-LABEL: test_vmulq_lane_f32_0:
   2798 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2799 ; CHECK-NEXT: ret
   2800 entry:
   2801   %v.lane0 = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   2802   %prod = fmul <4 x float> %v.lane0, %a
   2803   ret <4 x float> %prod
   2804 }
   2805 
   2806 define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
        ; Multiplies %a by lane 0 of the 4-element %v broadcast to two elements,
        ; using a plain IR fmul; the expected output is one by-element fmul
        ; directly followed by ret.
   2807 ; CHECK-LABEL: test_vmul_laneq_f32_0:
   2808 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2809 ; CHECK-NEXT: ret
   2810 entry:
   2811   %v.lane0 = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   2812   %prod = fmul <2 x float> %v.lane0, %a
   2813   ret <2 x float> %prod
   2814 }
   2815 
   2816 define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
        ; Scalar form: %a is reinterpreted as a double through an <8 x i8> bitcast,
        ; multiplied by element 0 of %v, and the product is inserted into a
        ; <1 x double>. The expected output is one scalar by-element fmul directly
        ; followed by ret.
   2817 ; CHECK-LABEL: test_vmul_laneq_f64_0:
   2818 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
   2819 ; CHECK-NEXT: ret
   2820 entry:
   2821   %a.bytes = bitcast <1 x double> %a to <8 x i8>
   2822   %a.scalar = bitcast <8 x i8> %a.bytes to double
   2823   %v.lane0 = extractelement <2 x double> %v, i32 0
   2824   %prod = fmul double %a.scalar, %v.lane0
   2825   %result = insertelement <1 x double> undef, double %prod, i32 0
   2826   ret <1 x double> %result
   2827 }
   2828 
   2829 define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
        ; Multiplies %a by a broadcast of lane 0 of %v using a plain IR fmul; the
        ; expected output is one by-element fmul directly followed by ret.
   2830 ; CHECK-LABEL: test_vmulq_laneq_f32_0:
   2831 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2832 ; CHECK-NEXT: ret
   2833 entry:
   2834   %v.lane0 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   2835   %prod = fmul <4 x float> %v.lane0, %a
   2836   ret <4 x float> %prod
   2837 }
   2838 
   2839 define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
        ; Multiplies %a by a broadcast of lane 0 of %v using a plain IR fmul; the
        ; expected output is one by-element fmul directly followed by ret.
   2840 ; CHECK-LABEL: test_vmulq_laneq_f64_0:
   2841 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   2842 ; CHECK-NEXT: ret
   2843 entry:
   2844   %v.lane0 = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   2845   %prod = fmul <2 x double> %v.lane0, %a
   2846   ret <2 x double> %prod
   2847 }
   2848 
   2849 define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
        ; Broadcasts lane 0 of %v and passes it with %a to the fmulx intrinsic.
        ; The expected output is one by-element fmulx (full mnemonic) directly
        ; followed by ret.
   2850 ; CHECK-LABEL: test_vmulx_lane_f32_0:
   2851 ; CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2852 ; CHECK-NEXT: ret
   2853 entry:
   2854   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   2855   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   2856   ret <2 x float> %vmulx2.i
   2857 }
   2858 
   2859 define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
        ; Broadcasts lane 0 of the 2-element %v into four elements and passes it
        ; with %a to the fmulx intrinsic. The expected output is one by-element
        ; fmulx (full mnemonic) directly followed by ret.
   2860 ; CHECK-LABEL: test_vmulxq_lane_f32_0:
   2861 ; CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2862 ; CHECK-NEXT: ret
   2863 entry:
   2864   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   2865   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   2866   ret <4 x float> %vmulx2.i
   2867 }
   2868 
   2869 define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
        ; Broadcasts the single element of %v into two elements and passes it with
        ; %a to the fmulx intrinsic. The expected output is one by-element fmulx
        ; (full mnemonic) directly followed by ret.
   2870 ; CHECK-LABEL: test_vmulxq_lane_f64_0:
   2871 ; CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   2872 ; CHECK-NEXT: ret
   2873 entry:
   2874   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   2875   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   2876   ret <2 x double> %vmulx2.i
   2877 }
   2878 
   2879 define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
        ; Broadcasts lane 0 of the 4-element %v into two elements and passes it
        ; with %a to the fmulx intrinsic. The expected output is one by-element
        ; fmulx (full mnemonic) directly followed by ret.
   2880 ; CHECK-LABEL: test_vmulx_laneq_f32_0:
   2881 ; CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2882 ; CHECK-NEXT: ret
   2883 entry:
   2884   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   2885   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   2886   ret <2 x float> %vmulx2.i
   2887 }
   2888 
   2889 define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
        ; Broadcasts lane 0 of %v and passes it with %a to the fmulx intrinsic.
        ; The expected output is one by-element fmulx (full mnemonic) directly
        ; followed by ret.
   2890 ; CHECK-LABEL: test_vmulxq_laneq_f32_0:
   2891 ; CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2892 ; CHECK-NEXT: ret
   2893 entry:
   2894   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   2895   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   2896   ret <4 x float> %vmulx2.i
   2897 }
   2898 
   2899 define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
        ; Broadcasts lane 0 of %v and passes it with %a to the fmulx intrinsic.
        ; The expected output is one by-element fmulx (full mnemonic) directly
        ; followed by ret.
   2900 ; CHECK-LABEL: test_vmulxq_laneq_f64_0:
   2901 ; CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   2902 ; CHECK-NEXT: ret
   2903 entry:
   2904   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   2905   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   2906   ret <2 x double> %vmulx2.i
   2907 }
   2908 
   2909