Home | History | Annotate | Download | only in AArch64
      1 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
      2 
      3 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
      4 
      5 declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
      6 
      7 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
      8 
      9 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
     10 
     11 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
     12 
     13 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
     14 
     15 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
     16 
     17 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
     18 
     19 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
     20 
     21 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
     22 
     23 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
     24 
     25 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
     26 
     27 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
     28 
     29 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
     30 
     31 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
     32 
     33 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
     34 
     35 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
     36 
     37 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
     38 
     39 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
     40 
     41 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
     42 
     43 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
     44 
     45 define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; by-element multiply-accumulate: %a + %b * splat(%v[3])
     46 ; CHECK-LABEL: test_vmla_lane_s16:
     47 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
     48 ; CHECK-NEXT: ret
     49 entry:
     50   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
     51   %mul = mul <4 x i16> %shuffle, %b ; lane-wise product with the splatted element
     52   %add = add <4 x i16> %mul, %a ; accumulate into %a; the pattern above expects one indexed mla
     53   ret <4 x i16> %add
     54 }
     55 
     56 define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; 128-bit multiply-accumulate by an element of a 64-bit vector: %a + %b * splat(%v[3])
     57 ; CHECK-LABEL: test_vmlaq_lane_s16:
     58 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
     59 ; CHECK-NEXT: ret
     60 entry:
     61   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 8 result lanes
     62   %mul = mul <8 x i16> %shuffle, %b ; lane-wise product with the splatted element
     63   %add = add <8 x i16> %mul, %a ; accumulate into %a; the pattern above expects one indexed mla
     64   ret <8 x i16> %add
     65 }
     66 
     67 define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; by-element multiply-accumulate: %a + %b * splat(%v[1])
     68 ; CHECK-LABEL: test_vmla_lane_s32:
     69 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
     70 ; CHECK-NEXT: ret
     71 entry:
     72   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
     73   %mul = mul <2 x i32> %shuffle, %b ; lane-wise product with the splatted element
     74   %add = add <2 x i32> %mul, %a ; accumulate into %a; the pattern above expects one indexed mla
     75   ret <2 x i32> %add
     76 }
     77 
     78 define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; 128-bit multiply-accumulate by an element of a 64-bit vector: %a + %b * splat(%v[1])
     79 ; CHECK-LABEL: test_vmlaq_lane_s32:
     80 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
     81 ; CHECK-NEXT: ret
     82 entry:
     83   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of the 2-lane %v into 4 result lanes
     84   %mul = mul <4 x i32> %shuffle, %b ; lane-wise product with the splatted element
     85   %add = add <4 x i32> %mul, %a ; accumulate into %a; the pattern above expects one indexed mla
     86   ret <4 x i32> %add
     87 }
     88 
     89 define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; 64-bit multiply-accumulate by an element of a 128-bit vector: %a + %b * splat(%v[7])
     90 ; CHECK-LABEL: test_vmla_laneq_s16:
     91 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
     92 ; CHECK-NEXT: ret
     93 entry:
     94   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v into 4 result lanes
     95   %mul = mul <4 x i16> %shuffle, %b ; lane-wise product with the splatted element
     96   %add = add <4 x i16> %mul, %a ; accumulate into %a; the pattern above expects one indexed mla
     97   ret <4 x i16> %add
     98 }
     99 
    100 define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; by-element multiply-accumulate: %a + %b * splat(%v[7])
    101 ; CHECK-LABEL: test_vmlaq_laneq_s16:
    102 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    103 ; CHECK-NEXT: ret
    104 entry:
    105   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of %v into all 8 result lanes
    106   %mul = mul <8 x i16> %shuffle, %b ; lane-wise product with the splatted element
    107   %add = add <8 x i16> %mul, %a ; accumulate into %a; the pattern above expects one indexed mla
    108   ret <8 x i16> %add
    109 }
    110 
    111 define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; 64-bit multiply-accumulate by an element of a 128-bit vector: %a + %b * splat(%v[3])
    112 ; CHECK-LABEL: test_vmla_laneq_s32:
    113 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    114 ; CHECK-NEXT: ret
    115 entry:
    116   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 2 result lanes
    117   %mul = mul <2 x i32> %shuffle, %b ; lane-wise product with the splatted element
    118   %add = add <2 x i32> %mul, %a ; accumulate into %a; the pattern above expects one indexed mla
    119   ret <2 x i32> %add
    120 }
    121 
    122 define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; by-element multiply-accumulate: %a + %b * splat(%v[3])
    123 ; CHECK-LABEL: test_vmlaq_laneq_s32:
    124 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    125 ; CHECK-NEXT: ret
    126 entry:
    127   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    128   %mul = mul <4 x i32> %shuffle, %b ; lane-wise product with the splatted element
    129   %add = add <4 x i32> %mul, %a ; accumulate into %a; the pattern above expects one indexed mla
    130   ret <4 x i32> %add
    131 }
    132 
    133 define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; by-element multiply-subtract: %a - %b * splat(%v[3])
    134 ; CHECK-LABEL: test_vmls_lane_s16:
    135 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    136 ; CHECK-NEXT: ret
    137 entry:
    138   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    139   %mul = mul <4 x i16> %shuffle, %b ; lane-wise product with the splatted element
    140   %sub = sub <4 x i16> %a, %mul ; subtract the product from %a; the pattern above expects one indexed mls
    141   ret <4 x i16> %sub
    142 }
    143 
    144 define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; 128-bit multiply-subtract by an element of a 64-bit vector: %a - %b * splat(%v[3])
    145 ; CHECK-LABEL: test_vmlsq_lane_s16:
    146 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    147 ; CHECK-NEXT: ret
    148 entry:
    149   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 8 result lanes
    150   %mul = mul <8 x i16> %shuffle, %b ; lane-wise product with the splatted element
    151   %sub = sub <8 x i16> %a, %mul ; subtract the product from %a; the pattern above expects one indexed mls
    152   ret <8 x i16> %sub
    153 }
    154 
    155 define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; by-element multiply-subtract: %a - %b * splat(%v[1])
    156 ; CHECK-LABEL: test_vmls_lane_s32:
    157 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    158 ; CHECK-NEXT: ret
    159 entry:
    160   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
    161   %mul = mul <2 x i32> %shuffle, %b ; lane-wise product with the splatted element
    162   %sub = sub <2 x i32> %a, %mul ; subtract the product from %a; the pattern above expects one indexed mls
    163   ret <2 x i32> %sub
    164 }
    165 
    166 define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; 128-bit multiply-subtract by an element of a 64-bit vector: %a - %b * splat(%v[1])
    167 ; CHECK-LABEL: test_vmlsq_lane_s32:
    168 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    169 ; CHECK-NEXT: ret
    170 entry:
    171   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of the 2-lane %v into 4 result lanes
    172   %mul = mul <4 x i32> %shuffle, %b ; lane-wise product with the splatted element
    173   %sub = sub <4 x i32> %a, %mul ; subtract the product from %a; the pattern above expects one indexed mls
    174   ret <4 x i32> %sub
    175 }
    176 
    177 define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; 64-bit multiply-subtract by an element of a 128-bit vector: %a - %b * splat(%v[7])
    178 ; CHECK-LABEL: test_vmls_laneq_s16:
    179 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    180 ; CHECK-NEXT: ret
    181 entry:
    182   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v into 4 result lanes
    183   %mul = mul <4 x i16> %shuffle, %b ; lane-wise product with the splatted element
    184   %sub = sub <4 x i16> %a, %mul ; subtract the product from %a; the pattern above expects one indexed mls
    185   ret <4 x i16> %sub
    186 }
    187 
    188 define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; by-element multiply-subtract: %a - %b * splat(%v[7])
    189 ; CHECK-LABEL: test_vmlsq_laneq_s16:
    190 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    191 ; CHECK-NEXT: ret
    192 entry:
    193   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of %v into all 8 result lanes
    194   %mul = mul <8 x i16> %shuffle, %b ; lane-wise product with the splatted element
    195   %sub = sub <8 x i16> %a, %mul ; subtract the product from %a; the pattern above expects one indexed mls
    196   ret <8 x i16> %sub
    197 }
    198 
    199 define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; 64-bit multiply-subtract by an element of a 128-bit vector: %a - %b * splat(%v[3])
    200 ; CHECK-LABEL: test_vmls_laneq_s32:
    201 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    202 ; CHECK-NEXT: ret
    203 entry:
    204   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 2 result lanes
    205   %mul = mul <2 x i32> %shuffle, %b ; lane-wise product with the splatted element
    206   %sub = sub <2 x i32> %a, %mul ; subtract the product from %a; the pattern above expects one indexed mls
    207   ret <2 x i32> %sub
    208 }
    209 
    210 define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; by-element multiply-subtract: %a - %b * splat(%v[3])
    211 ; CHECK-LABEL: test_vmlsq_laneq_s32:
    212 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    213 ; CHECK-NEXT: ret
    214 entry:
    215   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    216   %mul = mul <4 x i32> %shuffle, %b ; lane-wise product with the splatted element
    217   %sub = sub <4 x i32> %a, %mul ; subtract the product from %a; the pattern above expects one indexed mls
    218   ret <4 x i32> %sub
    219 }
    220 
    221 define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; by-element multiply: %a * splat(%v[3])
    222 ; CHECK-LABEL: test_vmul_lane_s16:
    223 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    224 ; CHECK-NEXT: ret
    225 entry:
    226   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    227   %mul = mul <4 x i16> %shuffle, %a ; the pattern above expects this to become one indexed mul
    228   ret <4 x i16> %mul
    229 }
    230 
    231 define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; 128-bit multiply by an element of a 64-bit vector: %a * splat(%v[3])
    232 ; CHECK-LABEL: test_vmulq_lane_s16:
    233 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    234 ; CHECK-NEXT: ret
    235 entry:
    236   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 8 result lanes
    237   %mul = mul <8 x i16> %shuffle, %a ; the pattern above expects this to become one indexed mul
    238   ret <8 x i16> %mul
    239 }
    240 
    241 define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; by-element multiply: %a * splat(%v[1])
    242 ; CHECK-LABEL: test_vmul_lane_s32:
    243 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    244 ; CHECK-NEXT: ret
    245 entry:
    246   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
    247   %mul = mul <2 x i32> %shuffle, %a ; the pattern above expects this to become one indexed mul
    248   ret <2 x i32> %mul
    249 }
    250 
    251 define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; 128-bit multiply by an element of a 64-bit vector: %a * splat(%v[1])
    252 ; CHECK-LABEL: test_vmulq_lane_s32:
    253 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    254 ; CHECK-NEXT: ret
    255 entry:
    256   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of the 2-lane %v into 4 result lanes
    257   %mul = mul <4 x i32> %shuffle, %a ; the pattern above expects this to become one indexed mul
    258   ret <4 x i32> %mul
    259 }
    260 
    261 define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; by-element multiply: %a * splat(%v[3]); body is the same IR as test_vmul_lane_s16
    262 ; CHECK-LABEL: test_vmul_lane_u16:
    263 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    264 ; CHECK-NEXT: ret
    265 entry:
    266   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    267   %mul = mul <4 x i16> %shuffle, %a ; the pattern above expects this to become one indexed mul
    268   ret <4 x i16> %mul
    269 }
    270 
    271 define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; %a * splat(%v[3]); body is the same IR as test_vmulq_lane_s16
    272 ; CHECK-LABEL: test_vmulq_lane_u16:
    273 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    274 ; CHECK-NEXT: ret
    275 entry:
    276   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 8 result lanes
    277   %mul = mul <8 x i16> %shuffle, %a ; the pattern above expects this to become one indexed mul
    278   ret <8 x i16> %mul
    279 }
    280 
    281 define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; %a * splat(%v[1]); body is the same IR as test_vmul_lane_s32
    282 ; CHECK-LABEL: test_vmul_lane_u32:
    283 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    284 ; CHECK-NEXT: ret
    285 entry:
    286   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
    287   %mul = mul <2 x i32> %shuffle, %a ; the pattern above expects this to become one indexed mul
    288   ret <2 x i32> %mul
    289 }
    290 
    291 define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; %a * splat(%v[1]); body is the same IR as test_vmulq_lane_s32
    292 ; CHECK-LABEL: test_vmulq_lane_u32:
    293 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    294 ; CHECK-NEXT: ret
    295 entry:
    296   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of the 2-lane %v into 4 result lanes
    297   %mul = mul <4 x i32> %shuffle, %a ; the pattern above expects this to become one indexed mul
    298   ret <4 x i32> %mul
    299 }
    300 
    301 define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; 64-bit multiply by an element of a 128-bit vector: %a * splat(%v[7])
    302 ; CHECK-LABEL: test_vmul_laneq_s16:
    303 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    304 ; CHECK-NEXT: ret
    305 entry:
    306   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v into 4 result lanes
    307   %mul = mul <4 x i16> %shuffle, %a ; the pattern above expects this to become one indexed mul
    308   ret <4 x i16> %mul
    309 }
    310 
    311 define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; by-element multiply: %a * splat(%v[7])
    312 ; CHECK-LABEL: test_vmulq_laneq_s16:
    313 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    314 ; CHECK-NEXT: ret
    315 entry:
    316   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of %v into all 8 result lanes
    317   %mul = mul <8 x i16> %shuffle, %a ; the pattern above expects this to become one indexed mul
    318   ret <8 x i16> %mul
    319 }
    320 
    321 define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; 64-bit multiply by an element of a 128-bit vector: %a * splat(%v[3])
    322 ; CHECK-LABEL: test_vmul_laneq_s32:
    323 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    324 ; CHECK-NEXT: ret
    325 entry:
    326   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 2 result lanes
    327   %mul = mul <2 x i32> %shuffle, %a ; the pattern above expects this to become one indexed mul
    328   ret <2 x i32> %mul
    329 }
    330 
    331 define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; by-element multiply: %a * splat(%v[3])
    332 ; CHECK-LABEL: test_vmulq_laneq_s32:
    333 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    334 ; CHECK-NEXT: ret
    335 entry:
    336   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    337   %mul = mul <4 x i32> %shuffle, %a ; the pattern above expects this to become one indexed mul
    338   ret <4 x i32> %mul
    339 }
    340 
    341 define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; %a * splat(%v[7]); body is the same IR as test_vmul_laneq_s16
    342 ; CHECK-LABEL: test_vmul_laneq_u16:
    343 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    344 ; CHECK-NEXT: ret
    345 entry:
    346   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v into 4 result lanes
    347   %mul = mul <4 x i16> %shuffle, %a ; the pattern above expects this to become one indexed mul
    348   ret <4 x i16> %mul
    349 }
    350 
    351 define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; %a * splat(%v[7]); body is the same IR as test_vmulq_laneq_s16
    352 ; CHECK-LABEL: test_vmulq_laneq_u16:
    353 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    354 ; CHECK-NEXT: ret
    355 entry:
    356   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of %v into all 8 result lanes
    357   %mul = mul <8 x i16> %shuffle, %a ; the pattern above expects this to become one indexed mul
    358   ret <8 x i16> %mul
    359 }
    360 
    361 define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; %a * splat(%v[3]); body is the same IR as test_vmul_laneq_s32
    362 ; CHECK-LABEL: test_vmul_laneq_u32:
    363 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    364 ; CHECK-NEXT: ret
    365 entry:
    366   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 2 result lanes
    367   %mul = mul <2 x i32> %shuffle, %a ; the pattern above expects this to become one indexed mul
    368   ret <2 x i32> %mul
    369 }
    370 
    371 define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; %a * splat(%v[3]); body is the same IR as test_vmulq_laneq_s32
    372 ; CHECK-LABEL: test_vmulq_laneq_u32:
    373 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    374 ; CHECK-NEXT: ret
    375 entry:
    376   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    377   %mul = mul <4 x i32> %shuffle, %a ; the pattern above expects this to become one indexed mul
    378   ret <4 x i32> %mul
    379 }
    380 
    381 define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; by-element fused multiply-add: fma(splat(%v[1]), %b, %a)
    382 ; CHECK-LABEL: test_vfma_lane_f32:
    383 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    384 ; CHECK-NEXT: ret
    385 entry:
    386   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
    387   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) ; %lane * %b + %a; the pattern above expects one indexed fmla
    388   ret <2 x float> %0
    389 }
    390 
    391 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
    392 
    393 define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; 128-bit fused multiply-add by an element of a 64-bit vector: fma(splat(%v[1]), %b, %a)
    394 ; CHECK-LABEL: test_vfmaq_lane_f32:
    395 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    396 ; CHECK-NEXT: ret
    397 entry:
    398   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of the 2-lane %v into 4 result lanes
    399   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) ; %lane * %b + %a; the pattern above expects one indexed fmla
    400   ret <4 x float> %0
    401 }
    402 
    403 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
    404 
    405 define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; 64-bit fused multiply-add by an element of a 128-bit vector: fma(splat(%v[3]), %b, %a)
    406 ; CHECK-LABEL: test_vfma_laneq_f32:
    407 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    408 ; CHECK-NEXT: ret
    409 entry:
    410   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 2 result lanes
    411   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) ; %lane * %b + %a; the pattern above expects one indexed fmla
    412   ret <2 x float> %0
    413 }
    414 
    415 define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; by-element fused multiply-add: fma(splat(%v[3]), %b, %a)
    416 ; CHECK-LABEL: test_vfmaq_laneq_f32:
    417 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    418 ; CHECK-NEXT: ret
    419 entry:
    420   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    421   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) ; %lane * %b + %a; the pattern above expects one indexed fmla
    422   ret <4 x float> %0
    423 }
    424 
    425 define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; by-element fused multiply-subtract: fma(splat((-%v)[1]), %b, %a)
    426 ; CHECK-LABEL: test_vfms_lane_f32:
    427 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    428 ; CHECK-NEXT: ret
    429 entry:
    430   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v ; negate every lane of %v, written as -0.0 - %v
    431   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of the negated vector
    432   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) ; fma with the negated splat; the pattern above expects one indexed fmls
    433   ret <2 x float> %0
    434 }
    435 
    436 define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; 128-bit fused multiply-subtract by an element of a 64-bit vector: fma(splat((-%v)[1]), %b, %a)
    437 ; CHECK-LABEL: test_vfmsq_lane_f32:
    438 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    439 ; CHECK-NEXT: ret
    440 entry:
    441   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v ; negate every lane of %v, written as -0.0 - %v
    442   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; splat lane 1 of the negated 2-lane vector into 4 result lanes
    443   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) ; fma with the negated splat; the pattern above expects one indexed fmls
    444   ret <4 x float> %0
    445 }
    446 
    447 define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; 64-bit fused multiply-subtract by an element of a 128-bit vector: fma(splat((-%v)[3]), %b, %a)
    448 ; CHECK-LABEL: test_vfms_laneq_f32:
    449 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    450 ; CHECK-NEXT: ret
    451 entry:
    452   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v ; negate every lane of %v, written as -0.0 - %v
    453   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the negated 4-lane vector into 2 result lanes
    454   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) ; fma with the negated splat; the pattern above expects one indexed fmls
    455   ret <2 x float> %0
    456 }
    457 
    458 define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; by-element fused multiply-subtract: fma(splat((-%v)[3]), %b, %a)
    459 ; CHECK-LABEL: test_vfmsq_laneq_f32:
    460 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    461 ; CHECK-NEXT: ret
    462 entry:
    463   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v ; negate every lane of %v, written as -0.0 - %v
    464   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of the negated vector
    465   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) ; fma with the negated splat; the pattern above expects one indexed fmls
    466   ret <4 x float> %0
    467 }
    468 
    469 define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; fused multiply-add by the only element of a 1-lane vector: fma(splat(%v[0]), %b, %a)
    470 ; CHECK-LABEL: test_vfmaq_lane_f64:
    471 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
    472 ; CHECK-NEXT: ret
    473 entry:
    474   %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer ; all-zero mask: duplicate lane 0 of %v into both result lanes
    475   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) ; %lane * %b + %a; the pattern above expects one indexed fmla on d[0]
    476   ret <2 x double> %0
    477 }
    478 
    479 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
    480 
    481 define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; by-element fused multiply-add: fma(splat(%v[1]), %b, %a)
    482 ; CHECK-LABEL: test_vfmaq_laneq_f64:
    483 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
    484 ; CHECK-NEXT: ret
    485 entry:
    486   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
    487   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) ; %lane * %b + %a; the pattern above expects one indexed fmla on d[1]
    488   ret <2 x double> %0
    489 }
    490 
    491 define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; fused multiply-subtract by the only element of a 1-lane vector: fma(splat((-%v)[0]), %b, %a)
    492 ; CHECK-LABEL: test_vfmsq_lane_f64:
    493 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
    494 ; CHECK-NEXT: ret
    495 entry:
    496   %sub = fsub <1 x double> <double -0.000000e+00>, %v ; negate %v, written as -0.0 - %v
    497   %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer ; all-zero mask: duplicate lane 0 of the negated vector into both result lanes
    498   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) ; fma with the negated splat; the pattern above expects one indexed fmls on d[0]
    499   ret <2 x double> %0
    500 }
    501 
    502 define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; by-element fused multiply-subtract: fma(splat((-%v)[1]), %b, %a)
    503 ; CHECK-LABEL: test_vfmsq_laneq_f64:
    504 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
    505 ; CHECK-NEXT: ret
    506 entry:
    507   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v ; negate every lane of %v, written as -0.0 - %v
    508   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of the negated vector
    509   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) ; fma with the negated splat; the pattern above expects one indexed fmls on d[1]
    510   ret <2 x double> %0
    511 }
    512 
    513 define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) { ; scalar fused multiply-add by a vector element: fma(%b, %v[3], %a)
    514 ; CHECK-LABEL: test_vfmas_laneq_f32:
    515 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
    516 ; CHECK-NEXT: ret
    517 entry:
    518   %extract = extractelement <4 x float> %v, i32 3 ; read lane 3 of %v as a scalar
    519   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) ; %b * %extract + %a; the pattern above expects a scalar fmla indexed on s[3]
    520   ret float %0
    521 }
    522 
    523 declare float @llvm.fma.f32(float, float, float)
    524 
    525 define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) { ; scalar fused multiply-subtract with a 1-lane vector operand: fma(%b, -%v[0], %a)
    526 ; CHECK-LABEL: test_vfmsd_lane_f64:
    527 ; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
    528 ; CHECK-NEXT: ret
    529 entry:
    530   %extract.rhs = extractelement <1 x double> %v, i32 0 ; read the only lane of %v as a scalar
    531   %extract = fsub double -0.000000e+00, %extract.rhs ; negate it, written as -0.0 - x
    532   %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a) ; the pattern above expects the plain scalar fmsub form here, not an indexed fmls
    533   ret double %0
    534 }
    535 
    536 declare double @llvm.fma.f64(double, double, double)
    537 
    538 define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) { ; scalar fused multiply-subtract by a vector element: fma(%b, -%v[3], %a)
    539 ; CHECK-LABEL: test_vfmss_laneq_f32:
    540 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
    541 ; CHECK-NEXT: ret
    542 entry:
    543   %extract.rhs = extractelement <4 x float> %v, i32 3 ; read lane 3 of %v as a scalar
    544   %extract = fsub float -0.000000e+00, %extract.rhs ; negate it, written as -0.0 - x
    545   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) ; the pattern above expects a scalar fmls indexed on s[3]
    546   ret float %0
    547 }
    548 
    549 define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) { ; scalar fused multiply-subtract by a vector element: fma(%b, -%v[1], %a)
    550 ; CHECK-LABEL: test_vfmsd_laneq_f64:
    551 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
    552 ; CHECK-NEXT: ret
    553 entry:
    554   %extract.rhs = extractelement <2 x double> %v, i32 1 ; read lane 1 of %v as a scalar
    555   %extract = fsub double -0.000000e+00, %extract.rhs ; negate it, written as -0.0 - x
    556   %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a) ; the pattern above expects a scalar fmls indexed on d[1]
    557   ret double %0
    558 }
    559 
    560 define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; widening by-element multiply-accumulate: %a + smull(%b, splat(%v[3]))
    561 ; CHECK-LABEL: test_vmlal_lane_s16:
    562 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    563 ; CHECK-NEXT: ret
    564 entry:
    565   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    566   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) ; i16 x i16 -> i32 product through the smull intrinsic
    567   %add = add <4 x i32> %vmull2.i, %a ; accumulate into %a; the pattern above expects one indexed mlal
    568   ret <4 x i32> %add
    569 }
    570 
    571 define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; widening by-element multiply-accumulate: %a + smull(%b, splat(%v[1]))
    572 ; CHECK-LABEL: test_vmlal_lane_s32:
    573 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    574 ; CHECK-NEXT: ret
    575 entry:
    576   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
    577   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) ; i32 x i32 -> i64 product through the smull intrinsic
    578   %add = add <2 x i64> %vmull2.i, %a ; accumulate into %a; the pattern above expects one indexed mlal
    579   ret <2 x i64> %add
    580 }
    581 
    582 define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; widening multiply-accumulate by an element of a 128-bit vector: %a + smull(%b, splat(%v[7]))
    583 ; CHECK-LABEL: test_vmlal_laneq_s16:
    584 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    585 ; CHECK-NEXT: ret
    586 entry:
    587   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v into 4 result lanes
    588   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) ; i16 x i16 -> i32 product through the smull intrinsic
    589   %add = add <4 x i32> %vmull2.i, %a ; accumulate into %a; the pattern above expects one indexed mlal
    590   ret <4 x i32> %add
    591 }
    592 
    593 define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; widening multiply-accumulate by an element of a 128-bit vector: %a + smull(%b, splat(%v[3]))
    594 ; CHECK-LABEL: test_vmlal_laneq_s32:
    595 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    596 ; CHECK-NEXT: ret
    597 entry:
    598   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 2 result lanes
    599   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) ; i32 x i32 -> i64 product through the smull intrinsic
    600   %add = add <2 x i64> %vmull2.i, %a ; accumulate into %a; the pattern above expects one indexed mlal
    601   ret <2 x i64> %add
    602 }
    603 
    604 define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; high-half widening multiply-accumulate: %a + smull(high(%b), splat(%v[3]))
    605 ; CHECK-LABEL: test_vmlal_high_lane_s16:
    606 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    607 ; CHECK-NEXT: ret
    608 entry:
    609   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; take the upper four lanes (4..7) of %b
    610   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    611   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ; i16 x i16 -> i32 product through the smull intrinsic
    612   %add = add <4 x i32> %vmull2.i, %a ; accumulate into %a; the pattern above expects one indexed mlal2 reading the full .8h register
    613   ret <4 x i32> %add
    614 }
    615 
    616 define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; high-half widening multiply-accumulate: %a + smull(high(%b), splat(%v[1]))
    617 ; CHECK-LABEL: test_vmlal_high_lane_s32:
    618 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    619 ; CHECK-NEXT: ret
    620 entry:
    621   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; take the upper two lanes (2..3) of %b
    622   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
    623   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ; i32 x i32 -> i64 product through the smull intrinsic
    624   %add = add <2 x i64> %vmull2.i, %a ; accumulate into %a; the pattern above expects one indexed mlal2 reading the full .4s register
    625   ret <2 x i64> %add
    626 }
    627 
    628 define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; high-half widening multiply-accumulate: %a + smull(high(%b), splat(%v[7]))
    629 ; CHECK-LABEL: test_vmlal_high_laneq_s16:
    630 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    631 ; CHECK-NEXT: ret
    632 entry:
    633   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; take the upper four lanes (4..7) of %b
    634   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v into 4 result lanes
    635   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) ; i16 x i16 -> i32 product through the smull intrinsic
    636   %add = add <4 x i32> %vmull2.i, %a ; accumulate into %a; the pattern above expects one indexed mlal2 reading the full .8h register
    637   ret <4 x i32> %add
    638 }
    639 
    640 define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; high-half widening multiply-accumulate: %a + smull(high(%b), splat(%v[3]))
    641 ; CHECK-LABEL: test_vmlal_high_laneq_s32:
    642 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    643 ; CHECK-NEXT: ret
    644 entry:
    645   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; take the upper two lanes (2..3) of %b
    646   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v into 2 result lanes
    647   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) ; i32 x i32 -> i64 product through the smull intrinsic
    648   %add = add <2 x i64> %vmull2.i, %a ; accumulate into %a; the pattern above expects one indexed mlal2 reading the full .4s register
    649   ret <2 x i64> %add
    650 }
    651 
    652 define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; widening by-element multiply-subtract: %a - smull(%b, splat(%v[3]))
    653 ; CHECK-LABEL: test_vmlsl_lane_s16:
    654 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    655 ; CHECK-NEXT: ret
    656 entry:
    657   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v into all 4 result lanes
    658   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) ; i16 x i16 -> i32 product through the smull intrinsic
    659   %sub = sub <4 x i32> %a, %vmull2.i ; subtract the product from %a; the pattern above expects one indexed mlsl
    660   ret <4 x i32> %sub
    661 }
    662 
    663 define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; widening by-element multiply-subtract: %a - smull(%b, splat(%v[1]))
    664 ; CHECK-LABEL: test_vmlsl_lane_s32:
    665 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    666 ; CHECK-NEXT: ret
    667 entry:
    668   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v into both result lanes
    669   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) ; i32 x i32 -> i64 product through the smull intrinsic
    670   %sub = sub <2 x i64> %a, %vmull2.i ; subtract the product from %a; the pattern above expects one indexed mlsl
    671   ret <2 x i64> %sub
    672 }
    673 
    674 define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; widening multiply-subtract by an element of a 128-bit vector: %a - smull(%b, splat(%v[7]))
    675 ; CHECK-LABEL: test_vmlsl_laneq_s16:
    676 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    677 ; CHECK-NEXT: ret
    678 entry:
    679   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v into 4 result lanes
    680   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) ; i16 x i16 -> i32 product through the smull intrinsic
    681   %sub = sub <4 x i32> %a, %vmull2.i ; subtract the product from %a; the pattern above expects one indexed mlsl
    682   ret <4 x i32> %sub
    683 }
    684 
    685 define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; %a - smull(%b, splat %v[3]) should select by-element smlsl
    686 ; CHECK-LABEL: test_vmlsl_laneq_s32:
    687 ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    688 ; CHECK-NEXT: ret
    689 entry:
    690   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
    691   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    692   %sub = sub <2 x i64> %a, %vmull2.i
    693   ret <2 x i64> %sub
    694 }
    695 
    696 define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; %a - smull(high half of %b, splat %v[3]) should select by-element smlsl2
    697 ; CHECK-LABEL: test_vmlsl_high_lane_s16:
    698 ; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    699 ; CHECK-NEXT: ret
    700 entry:
    701   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    702   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    703   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    704   %sub = sub <4 x i32> %a, %vmull2.i
    705   ret <4 x i32> %sub
    706 }
    707 
    708 define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; %a - smull(high half of %b, splat %v[1]) should select by-element smlsl2
    709 ; CHECK-LABEL: test_vmlsl_high_lane_s32:
    710 ; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    711 ; CHECK-NEXT: ret
    712 entry:
    713   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    714   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
    715   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    716   %sub = sub <2 x i64> %a, %vmull2.i
    717   ret <2 x i64> %sub
    718 }
    719 
    720 define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; %a - smull(high half of %b, splat %v[7]) should select by-element smlsl2
    721 ; CHECK-LABEL: test_vmlsl_high_laneq_s16:
    722 ; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    723 ; CHECK-NEXT: ret
    724 entry:
    725   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    726   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
    727   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    728   %sub = sub <4 x i32> %a, %vmull2.i
    729   ret <4 x i32> %sub
    730 }
    731 
    732 define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; %a - smull(high half of %b, splat %v[3]) should select by-element smlsl2
    733 ; CHECK-LABEL: test_vmlsl_high_laneq_s32:
    734 ; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    735 ; CHECK-NEXT: ret
    736 entry:
    737   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    738   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
    739   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    740   %sub = sub <2 x i64> %a, %vmull2.i
    741   ret <2 x i64> %sub
    742 }
    743 
    744 define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; umull(%b, splat %v[3]) + %a should select by-element umlal
    745 ; CHECK-LABEL: test_vmlal_lane_u16:
    746 ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    747 ; CHECK-NEXT: ret
    748 entry:
    749   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    750   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    751   %add = add <4 x i32> %vmull2.i, %a
    752   ret <4 x i32> %add
    753 }
    754 
    755 define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; umull(%b, splat %v[1]) + %a should select by-element umlal
    756 ; CHECK-LABEL: test_vmlal_lane_u32:
    757 ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    758 ; CHECK-NEXT: ret
    759 entry:
    760   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
    761   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    762   %add = add <2 x i64> %vmull2.i, %a
    763   ret <2 x i64> %add
    764 }
    765 
    766 define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; umull(%b, splat %v[7]) + %a should select by-element umlal
    767 ; CHECK-LABEL: test_vmlal_laneq_u16:
    768 ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    769 ; CHECK-NEXT: ret
    770 entry:
    771   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
    772   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    773   %add = add <4 x i32> %vmull2.i, %a
    774   ret <4 x i32> %add
    775 }
    776 
    777 define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; umull(%b, splat %v[3]) + %a should select by-element umlal
    778 ; CHECK-LABEL: test_vmlal_laneq_u32:
    779 ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    780 ; CHECK-NEXT: ret
    781 entry:
    782   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
    783   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    784   %add = add <2 x i64> %vmull2.i, %a
    785   ret <2 x i64> %add
    786 }
    787 
    788 define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; umull(high half of %b, splat %v[3]) + %a should select by-element umlal2
    789 ; CHECK-LABEL: test_vmlal_high_lane_u16:
    790 ; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    791 ; CHECK-NEXT: ret
    792 entry:
    793   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    794   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    795   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    796   %add = add <4 x i32> %vmull2.i, %a
    797   ret <4 x i32> %add
    798 }
    799 
    800 define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; umull(high half of %b, splat %v[1]) + %a should select by-element umlal2
    801 ; CHECK-LABEL: test_vmlal_high_lane_u32:
    802 ; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    803 ; CHECK-NEXT: ret
    804 entry:
    805   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    806   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
    807   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    808   %add = add <2 x i64> %vmull2.i, %a
    809   ret <2 x i64> %add
    810 }
    811 
    812 define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; umull(high half of %b, splat %v[7]) + %a should select by-element umlal2
    813 ; CHECK-LABEL: test_vmlal_high_laneq_u16:
    814 ; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    815 ; CHECK-NEXT: ret
    816 entry:
    817   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    818   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
    819   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    820   %add = add <4 x i32> %vmull2.i, %a
    821   ret <4 x i32> %add
    822 }
    823 
    824 define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; umull(high half of %b, splat %v[3]) + %a should select by-element umlal2
    825 ; CHECK-LABEL: test_vmlal_high_laneq_u32:
    826 ; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    827 ; CHECK-NEXT: ret
    828 entry:
    829   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    830   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
    831   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    832   %add = add <2 x i64> %vmull2.i, %a
    833   ret <2 x i64> %add
    834 }
    835 
    836 define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; %a - umull(%b, splat %v[3]) should select by-element umlsl
    837 ; CHECK-LABEL: test_vmlsl_lane_u16:
    838 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    839 ; CHECK-NEXT: ret
    840 entry:
    841   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    842   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    843   %sub = sub <4 x i32> %a, %vmull2.i
    844   ret <4 x i32> %sub
    845 }
    846 
    847 define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; %a - umull(%b, splat %v[1]) should select by-element umlsl
    848 ; CHECK-LABEL: test_vmlsl_lane_u32:
    849 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    850 ; CHECK-NEXT: ret
    851 entry:
    852   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
    853   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    854   %sub = sub <2 x i64> %a, %vmull2.i
    855   ret <2 x i64> %sub
    856 }
    857 
    858 define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; %a - umull(%b, splat %v[7]) should select by-element umlsl
    859 ; CHECK-LABEL: test_vmlsl_laneq_u16:
    860 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
    861 ; CHECK-NEXT: ret
    862 entry:
    863   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
    864   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
    865   %sub = sub <4 x i32> %a, %vmull2.i
    866   ret <4 x i32> %sub
    867 }
    868 
    869 define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; %a - umull(%b, splat %v[3]) should select by-element umlsl
    870 ; CHECK-LABEL: test_vmlsl_laneq_u32:
    871 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
    872 ; CHECK-NEXT: ret
    873 entry:
    874   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
    875   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
    876   %sub = sub <2 x i64> %a, %vmull2.i
    877   ret <2 x i64> %sub
    878 }
    879 
    880 define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; %a - umull(high half of %b, splat %v[3]) should select by-element umlsl2
    881 ; CHECK-LABEL: test_vmlsl_high_lane_u16:
    882 ; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    883 ; CHECK-NEXT: ret
    884 entry:
    885   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    886   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    887   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    888   %sub = sub <4 x i32> %a, %vmull2.i
    889   ret <4 x i32> %sub
    890 }
    891 
    892 define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; %a - umull(high half of %b, splat %v[1]) should select by-element umlsl2
    893 ; CHECK-LABEL: test_vmlsl_high_lane_u32:
    894 ; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    895 ; CHECK-NEXT: ret
    896 entry:
    897   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    898   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
    899   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    900   %sub = sub <2 x i64> %a, %vmull2.i
    901   ret <2 x i64> %sub
    902 }
    903 
    904 define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; %a - umull(high half of %b, splat %v[7]) should select by-element umlsl2
    905 ; CHECK-LABEL: test_vmlsl_high_laneq_u16:
    906 ; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
    907 ; CHECK-NEXT: ret
    908 entry:
    909   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
    910   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
    911   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    912   %sub = sub <4 x i32> %a, %vmull2.i
    913   ret <4 x i32> %sub
    914 }
    915 
    916 define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; %a - umull(high half of %b, splat %v[3]) should select by-element umlsl2
    917 ; CHECK-LABEL: test_vmlsl_high_laneq_u32:
    918 ; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
    919 ; CHECK-NEXT: ret
    920 entry:
    921   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
    922   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
    923   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    924   %sub = sub <2 x i64> %a, %vmull2.i
    925   ret <2 x i64> %sub
    926 }
    927 
    928 define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; smull(%a, splat %v[3]) should select by-element smull
    929 ; CHECK-LABEL: test_vmull_lane_s16:
    930 ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    931 ; CHECK-NEXT: ret
    932 entry:
    933   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    934   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
    935   ret <4 x i32> %vmull2.i
    936 }
    937 
    938 define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; smull(%a, splat %v[1]) should select by-element smull
    939 ; CHECK-LABEL: test_vmull_lane_s32:
    940 ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    941 ; CHECK-NEXT: ret
    942 entry:
    943   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
    944   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
    945   ret <2 x i64> %vmull2.i
    946 }
    947 
    948 define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; umull(%a, splat %v[3]) should select by-element umull
    949 ; CHECK-LABEL: test_vmull_lane_u16:
    950 ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
    951 ; CHECK-NEXT: ret
    952 entry:
    953   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    954   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
    955   ret <4 x i32> %vmull2.i
    956 }
    957 
    958 define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; umull(%a, splat %v[1]) should select by-element umull
    959 ; CHECK-LABEL: test_vmull_lane_u32:
    960 ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
    961 ; CHECK-NEXT: ret
    962 entry:
    963   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
    964   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
    965   ret <2 x i64> %vmull2.i
    966 }
    967 
    968 define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; smull(high half of %a, splat %v[3]) should select by-element smull2
    969 ; CHECK-LABEL: test_vmull_high_lane_s16:
    970 ; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    971 ; CHECK-NEXT: ret
    972 entry:
    973   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
    974   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    975   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    976   ret <4 x i32> %vmull2.i
    977 }
    978 
    979 define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; smull(high half of %a, splat %v[1]) should select by-element smull2
    980 ; CHECK-LABEL: test_vmull_high_lane_s32:
    981 ; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
    982 ; CHECK-NEXT: ret
    983 entry:
    984   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
    985   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
    986   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
    987   ret <2 x i64> %vmull2.i
    988 }
    989 
    990 define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; umull(high half of %a, splat %v[3]) should select by-element umull2
    991 ; CHECK-LABEL: test_vmull_high_lane_u16:
    992 ; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
    993 ; CHECK-NEXT: ret
    994 entry:
    995   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
    996   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
    997   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
    998   ret <4 x i32> %vmull2.i
    999 }
   1000 
   1001 define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; umull(high half of %a, splat %v[1]) should select by-element umull2
   1002 ; CHECK-LABEL: test_vmull_high_lane_u32:
   1003 ; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1004 ; CHECK-NEXT: ret
   1005 entry:
   1006   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1007   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1008   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1009   ret <2 x i64> %vmull2.i
   1010 }
   1011 
   1012 define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; smull(%a, splat %v[7]) should select by-element smull
   1013 ; CHECK-LABEL: test_vmull_laneq_s16:
   1014 ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
   1015 ; CHECK-NEXT: ret
   1016 entry:
   1017   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
   1018   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1019   ret <4 x i32> %vmull2.i
   1020 }
   1021 
   1022 define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; smull(%a, splat %v[3]) should select by-element smull
   1023 ; CHECK-LABEL: test_vmull_laneq_s32:
   1024 ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1025 ; CHECK-NEXT: ret
   1026 entry:
   1027   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
   1028   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1029   ret <2 x i64> %vmull2.i
   1030 }
   1031 
   1032 define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; umull(%a, splat %v[7]) should select by-element umull
   1033 ; CHECK-LABEL: test_vmull_laneq_u16:
   1034 ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
   1035 ; CHECK-NEXT: ret
   1036 entry:
   1037   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
   1038   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1039   ret <4 x i32> %vmull2.i
   1040 }
   1041 
   1042 define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; umull(%a, splat %v[3]) should select by-element umull
   1043 ; CHECK-LABEL: test_vmull_laneq_u32:
   1044 ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1045 ; CHECK-NEXT: ret
   1046 entry:
   1047   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
   1048   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1049   ret <2 x i64> %vmull2.i
   1050 }
   1051 
   1052 define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; smull(high half of %a, splat %v[7]) should select by-element smull2
   1053 ; CHECK-LABEL: test_vmull_high_laneq_s16:
   1054 ; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
   1055 ; CHECK-NEXT: ret
   1056 entry:
   1057   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1058   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
   1059   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1060   ret <4 x i32> %vmull2.i
   1061 }
   1062 
   1063 define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; smull(high half of %a, splat %v[3]) should select by-element smull2
   1064 ; CHECK-LABEL: test_vmull_high_laneq_s32:
   1065 ; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1066 ; CHECK-NEXT: ret
   1067 entry:
   1068   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1069   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
   1070   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1071   ret <2 x i64> %vmull2.i
   1072 }
   1073 
   1074 define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; umull(high half of %a, splat %v[7]) should select by-element umull2
   1075 ; CHECK-LABEL: test_vmull_high_laneq_u16:
   1076 ; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
   1077 ; CHECK-NEXT: ret
   1078 entry:
   1079   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1080   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
   1081   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1082   ret <4 x i32> %vmull2.i
   1083 }
   1084 
   1085 define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; umull(high half of %a, splat %v[3]) should select by-element umull2
   1086 ; CHECK-LABEL: test_vmull_high_laneq_u32:
   1087 ; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1088 ; CHECK-NEXT: ret
   1089 entry:
   1090   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1091   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
   1092   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1093   ret <2 x i64> %vmull2.i
   1094 }
   1095 
   1096 define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; sqadd(%a, sqdmull(%b, splat %v[3])) should select by-element sqdmlal
   1097 ; CHECK-LABEL: test_vqdmlal_lane_s16:
   1098 ; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1099 ; CHECK-NEXT: ret
   1100 entry:
   1101   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1102   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   1103   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   1104   ret <4 x i32> %vqdmlal4.i
   1105 }
   1106 
   1107 define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; sqadd(%a, sqdmull(%b, splat %v[1])) should select by-element sqdmlal
   1108 ; CHECK-LABEL: test_vqdmlal_lane_s32:
   1109 ; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1110 ; CHECK-NEXT: ret
   1111 entry:
   1112   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1113   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   1114   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   1115   ret <2 x i64> %vqdmlal4.i
   1116 }
   1117 
   1118 define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; sqadd(%a, sqdmull(high half of %b, splat %v[3])) should select by-element sqdmlal2
   1119 ; CHECK-LABEL: test_vqdmlal_high_lane_s16:
   1120 ; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1121 ; CHECK-NEXT: ret
   1122 entry:
   1123   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
   1124   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1125   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1126   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   1127   ret <4 x i32> %vqdmlal4.i
   1128 }
   1129 
   1130 define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; sqadd(%a, sqdmull(high half of %b, splat %v[1])) should select by-element sqdmlal2
   1131 ; CHECK-LABEL: test_vqdmlal_high_lane_s32:
   1132 ; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1133 ; CHECK-NEXT: ret
   1134 entry:
   1135   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
   1136   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1137   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1138   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   1139   ret <2 x i64> %vqdmlal4.i
   1140 }
   1141 
   1142 define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; sqsub(%a, sqdmull(%b, splat %v[3])) should select by-element sqdmlsl
   1143 ; CHECK-LABEL: test_vqdmlsl_lane_s16:
   1144 ; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1145 ; CHECK-NEXT: ret
   1146 entry:
   1147   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1148   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   1149   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   1150   ret <4 x i32> %vqdmlsl4.i
   1151 }
   1152 
   1153 define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; sqsub(%a, sqdmull(%b, splat %v[1])) should select by-element sqdmlsl
   1154 ; CHECK-LABEL: test_vqdmlsl_lane_s32:
   1155 ; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1156 ; CHECK-NEXT: ret
   1157 entry:
   1158   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1159   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   1160   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   1161   ret <2 x i64> %vqdmlsl4.i
   1162 }
   1163 
   1164 define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; sqsub(%a, sqdmull(high half of %b, splat %v[3])) should select by-element sqdmlsl2
   1165 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
   1166 ; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1167 ; CHECK-NEXT: ret
   1168 entry:
   1169   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %b
   1170   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1171   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1172   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   1173   ret <4 x i32> %vqdmlsl4.i
   1174 }
   1175 
   1176 define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; sqsub(%a, sqdmull(high half of %b, splat %v[1])) should select by-element sqdmlsl2
   1177 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
   1178 ; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1179 ; CHECK-NEXT: ret
   1180 entry:
   1181   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %b
   1182   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1183   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1184   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   1185   ret <2 x i64> %vqdmlsl4.i
   1186 }
   1187 
   1188 define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; sqdmull(%a, splat %v[3]) should select by-element sqdmull
   1189 ; CHECK-LABEL: test_vqdmull_lane_s16:
   1190 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1191 ; CHECK-NEXT: ret
   1192 entry:
   1193   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1194   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1195   ret <4 x i32> %vqdmull2.i
   1196 }
   1197 
   1198 define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; sqdmull(%a, splat %v[1]) should select by-element sqdmull
   1199 ; CHECK-LABEL: test_vqdmull_lane_s32:
   1200 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1201 ; CHECK-NEXT: ret
   1202 entry:
   1203   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1204   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1205   ret <2 x i64> %vqdmull2.i
   1206 }
   1207 
   1208 define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; sqdmull(%a, splat %v[3]) should select by-element sqdmull
   1209 ; CHECK-LABEL: test_vqdmull_laneq_s16:
   1210 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1211 ; CHECK-NEXT: ret
   1212 entry:
   1213   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of the 8-lane %v. NOTE(review): lane 3 is in the low half; the other laneq_s16 tests here use lane 7
   1214   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   1215   ret <4 x i32> %vqdmull2.i
   1216 }
   1217 
   1218 define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; sqdmull(%a, splat %v[3]) should select by-element sqdmull
   1219 ; CHECK-LABEL: test_vqdmull_laneq_s32:
   1220 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1221 ; CHECK-NEXT: ret
   1222 entry:
   1223   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
   1224   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   1225   ret <2 x i64> %vqdmull2.i
   1226 }
   1227 
   1228 define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; sqdmull(high half of %a, splat %v[3]) should select by-element sqdmull2
   1229 ; CHECK-LABEL: test_vqdmull_high_lane_s16:
   1230 ; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1231 ; CHECK-NEXT: ret
   1232 entry:
   1233   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1234   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1235   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1236   ret <4 x i32> %vqdmull2.i
   1237 }
   1238 
   1239 define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; sqdmull(high half of %a, splat %v[1]) should select by-element sqdmull2
   1240 ; CHECK-LABEL: test_vqdmull_high_lane_s32:
   1241 ; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1242 ; CHECK-NEXT: ret
   1243 entry:
   1244   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1245   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; splat lane 1 of %v
   1246   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1247   ret <2 x i64> %vqdmull2.i
   1248 }
   1249 
   1250 define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; sqdmull(high half of %a, splat %v[7]) should select by-element sqdmull2
   1251 ; CHECK-LABEL: test_vqdmull_high_laneq_s16:
   1252 ; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
   1253 ; CHECK-NEXT: ret
   1254 entry:
   1255   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7 of %a
   1256   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> ; splat lane 7 of the 8-lane %v
   1257   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1258   ret <4 x i32> %vqdmull2.i
   1259 }
   1260 
   1261 define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; sqdmull(high half of %a, splat %v[3]) should select by-element sqdmull2
   1262 ; CHECK-LABEL: test_vqdmull_high_laneq_s32:
   1263 ; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1264 ; CHECK-NEXT: ret
   1265 entry:
   1266   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3 of %a
   1267   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> ; splat lane 3 of the 4-lane %v
   1268   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   1269   ret <2 x i64> %vqdmull2.i
   1270 }
   1271 
   1272 define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; sqdmulh(%a, splat %v[3]) should select by-element sqdmulh
   1273 ; CHECK-LABEL: test_vqdmulh_lane_s16:
   1274 ; CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1275 ; CHECK-NEXT: ret
   1276 entry:
   1277   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> ; splat lane 3 of %v
   1278   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   1279   ret <4 x i16> %vqdmulh2.i
   1280 }
   1281 
   1282 define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
   1283 ; CHECK-LABEL: test_vqdmulhq_lane_s16:
   1284 ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1285 ; CHECK-NEXT: ret
   1286 entry:
   1287   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   1288   %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   1289   ret <8 x i16> %vqdmulh2.i
   1290 }
   1291 
   1292 define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
   1293 ; CHECK-LABEL: test_vqdmulh_lane_s32:
   1294 ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1295 ; CHECK-NEXT: ret
   1296 entry:
   1297   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   1298   %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   1299   ret <2 x i32> %vqdmulh2.i
   1300 }
   1301 
   1302 define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
   1303 ; CHECK-LABEL: test_vqdmulhq_lane_s32:
   1304 ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1305 ; CHECK-NEXT: ret
   1306 entry:
   1307   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1308   %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   1309   ret <4 x i32> %vqdmulh2.i
   1310 }
   1311 
   1312 define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
   1313 ; CHECK-LABEL: test_vqrdmulh_lane_s16:
   1314 ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
   1315 ; CHECK-NEXT: ret
   1316 entry:
   1317   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1318   %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   1319   ret <4 x i16> %vqrdmulh2.i
   1320 }
   1321 
   1322 define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
   1323 ; CHECK-LABEL: test_vqrdmulhq_lane_s16:
   1324 ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
   1325 ; CHECK-NEXT: ret
   1326 entry:
   1327   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   1328   %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   1329   ret <8 x i16> %vqrdmulh2.i
   1330 }
   1331 
   1332 define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
   1333 ; CHECK-LABEL: test_vqrdmulh_lane_s32:
   1334 ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1335 ; CHECK-NEXT: ret
   1336 entry:
   1337   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   1338   %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   1339   ret <2 x i32> %vqrdmulh2.i
   1340 }
   1341 
   1342 define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
   1343 ; CHECK-LABEL: test_vqrdmulhq_lane_s32:
   1344 ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1345 ; CHECK-NEXT: ret
   1346 entry:
   1347   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1348   %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   1349   ret <4 x i32> %vqrdmulh2.i
   1350 }
   1351 
   1352 define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
   1353 ; CHECK-LABEL: test_vmul_lane_f32:
   1354 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1355 ; CHECK-NEXT: ret
   1356 entry:
   1357   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   1358   %mul = fmul <2 x float> %shuffle, %a
   1359   ret <2 x float> %mul
   1360 }
   1361 
   1362 define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
   1363 ; CHECK-LABEL: test_vmul_lane_f64:
   1364 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
   1365 ; CHECK-NEXT: ret
   1366 entry:
   1367   %0 = bitcast <1 x double> %a to <8 x i8>
   1368   %1 = bitcast <8 x i8> %0 to double
   1369   %extract = extractelement <1 x double> %v, i32 0
   1370   %2 = fmul double %1, %extract
   1371   %3 = insertelement <1 x double> undef, double %2, i32 0
   1372   ret <1 x double> %3
   1373 }
   1374 
   1375 define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
   1376 ; CHECK-LABEL: test_vmulq_lane_f32:
   1377 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1378 ; CHECK-NEXT: ret
   1379 entry:
   1380   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1381   %mul = fmul <4 x float> %shuffle, %a
   1382   ret <4 x float> %mul
   1383 }
   1384 
   1385 define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
   1386 ; CHECK-LABEL: test_vmulq_lane_f64:
   1387 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   1388 ; CHECK-NEXT: ret
   1389 entry:
   1390   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   1391   %mul = fmul <2 x double> %shuffle, %a
   1392   ret <2 x double> %mul
   1393 }
   1394 
   1395 define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
   1396 ; CHECK-LABEL: test_vmul_laneq_f32:
   1397 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1398 ; CHECK-NEXT: ret
   1399 entry:
   1400   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   1401   %mul = fmul <2 x float> %shuffle, %a
   1402   ret <2 x float> %mul
   1403 }
   1404 
   1405 define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
   1406 ; CHECK-LABEL: test_vmul_laneq_f64:
   1407 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   1408 ; CHECK-NEXT: ret
   1409 entry:
   1410   %0 = bitcast <1 x double> %a to <8 x i8>
   1411   %1 = bitcast <8 x i8> %0 to double
   1412   %extract = extractelement <2 x double> %v, i32 1
   1413   %2 = fmul double %1, %extract
   1414   %3 = insertelement <1 x double> undef, double %2, i32 0
   1415   ret <1 x double> %3
   1416 }
   1417 
   1418 define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
   1419 ; CHECK-LABEL: test_vmulq_laneq_f32:
   1420 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1421 ; CHECK-NEXT: ret
   1422 entry:
   1423   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1424   %mul = fmul <4 x float> %shuffle, %a
   1425   ret <4 x float> %mul
   1426 }
   1427 
   1428 define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
   1429 ; CHECK-LABEL: test_vmulq_laneq_f64:
   1430 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
   1431 ; CHECK-NEXT: ret
   1432 entry:
   1433   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   1434   %mul = fmul <2 x double> %shuffle, %a
   1435   ret <2 x double> %mul
   1436 }
   1437 
   1438 define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
   1439 ; CHECK-LABEL: test_vmulx_lane_f32:
   1440 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
   1441 ; CHECK-NEXT: ret
   1442 entry:
   1443   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   1444   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   1445   ret <2 x float> %vmulx2.i
   1446 }
   1447 
   1448 define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
   1449 ; CHECK-LABEL: test_vmulxq_lane_f32:
   1450 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
   1451 ; CHECK-NEXT: ret
   1452 entry:
   1453   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   1454   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   1455   ret <4 x float> %vmulx2.i
   1456 }
   1457 
   1458 define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
   1459 ; CHECK-LABEL: test_vmulxq_lane_f64:
   1460 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   1461 ; CHECK-NEXT: ret
   1462 entry:
   1463   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   1464   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   1465   ret <2 x double> %vmulx2.i
   1466 }
   1467 
   1468 define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
   1469 ; CHECK-LABEL: test_vmulx_laneq_f32:
   1470 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
   1471 ; CHECK-NEXT: ret
   1472 entry:
   1473   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   1474   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   1475   ret <2 x float> %vmulx2.i
   1476 }
   1477 
   1478 define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
   1479 ; CHECK-LABEL: test_vmulxq_laneq_f32:
   1480 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
   1481 ; CHECK-NEXT: ret
   1482 entry:
   1483   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   1484   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   1485   ret <4 x float> %vmulx2.i
   1486 }
   1487 
   1488 define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
   1489 ; CHECK-LABEL: test_vmulxq_laneq_f64:
   1490 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
   1491 ; CHECK-NEXT: ret
   1492 entry:
   1493   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   1494   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   1495   ret <2 x double> %vmulx2.i
   1496 }
   1497 
   1498 define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
   1499 ; CHECK-LABEL: test_vmla_lane_s16_0:
   1500 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1501 ; CHECK-NEXT: ret
   1502 entry:
   1503   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   1504   %mul = mul <4 x i16> %shuffle, %b
   1505   %add = add <4 x i16> %mul, %a
   1506   ret <4 x i16> %add
   1507 }
   1508 
   1509 define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
   1510 ; CHECK-LABEL: test_vmlaq_lane_s16_0:
   1511 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1512 ; CHECK-NEXT: ret
   1513 entry:
   1514   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   1515   %mul = mul <8 x i16> %shuffle, %b
   1516   %add = add <8 x i16> %mul, %a
   1517   ret <8 x i16> %add
   1518 }
   1519 
   1520 define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
   1521 ; CHECK-LABEL: test_vmla_lane_s32_0:
   1522 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1523 ; CHECK-NEXT: ret
   1524 entry:
   1525   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   1526   %mul = mul <2 x i32> %shuffle, %b
   1527   %add = add <2 x i32> %mul, %a
   1528   ret <2 x i32> %add
   1529 }
   1530 
   1531 define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
   1532 ; CHECK-LABEL: test_vmlaq_lane_s32_0:
   1533 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1534 ; CHECK-NEXT: ret
   1535 entry:
   1536   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   1537   %mul = mul <4 x i32> %shuffle, %b
   1538   %add = add <4 x i32> %mul, %a
   1539   ret <4 x i32> %add
   1540 }
   1541 
   1542 define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
   1543 ; CHECK-LABEL: test_vmla_laneq_s16_0:
   1544 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1545 ; CHECK-NEXT: ret
   1546 entry:
   1547   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   1548   %mul = mul <4 x i16> %shuffle, %b
   1549   %add = add <4 x i16> %mul, %a
   1550   ret <4 x i16> %add
   1551 }
   1552 
   1553 define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
   1554 ; CHECK-LABEL: test_vmlaq_laneq_s16_0:
   1555 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1556 ; CHECK-NEXT: ret
   1557 entry:
   1558   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   1559   %mul = mul <8 x i16> %shuffle, %b
   1560   %add = add <8 x i16> %mul, %a
   1561   ret <8 x i16> %add
   1562 }
   1563 
   1564 define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
   1565 ; CHECK-LABEL: test_vmla_laneq_s32_0:
   1566 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1567 ; CHECK-NEXT: ret
   1568 entry:
   1569   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   1570   %mul = mul <2 x i32> %shuffle, %b
   1571   %add = add <2 x i32> %mul, %a
   1572   ret <2 x i32> %add
   1573 }
   1574 
   1575 define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
   1576 ; CHECK-LABEL: test_vmlaq_laneq_s32_0:
   1577 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1578 ; CHECK-NEXT: ret
   1579 entry:
   1580   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   1581   %mul = mul <4 x i32> %shuffle, %b
   1582   %add = add <4 x i32> %mul, %a
   1583   ret <4 x i32> %add
   1584 }
   1585 
   1586 define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
   1587 ; CHECK-LABEL: test_vmls_lane_s16_0:
   1588 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1589 ; CHECK-NEXT: ret
   1590 entry:
   1591   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   1592   %mul = mul <4 x i16> %shuffle, %b
   1593   %sub = sub <4 x i16> %a, %mul
   1594   ret <4 x i16> %sub
   1595 }
   1596 
   1597 define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
   1598 ; CHECK-LABEL: test_vmlsq_lane_s16_0:
   1599 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1600 ; CHECK-NEXT: ret
   1601 entry:
   1602   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   1603   %mul = mul <8 x i16> %shuffle, %b
   1604   %sub = sub <8 x i16> %a, %mul
   1605   ret <8 x i16> %sub
   1606 }
   1607 
   1608 define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
   1609 ; CHECK-LABEL: test_vmls_lane_s32_0:
   1610 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1611 ; CHECK-NEXT: ret
   1612 entry:
   1613   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   1614   %mul = mul <2 x i32> %shuffle, %b
   1615   %sub = sub <2 x i32> %a, %mul
   1616   ret <2 x i32> %sub
   1617 }
   1618 
   1619 define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
   1620 ; CHECK-LABEL: test_vmlsq_lane_s32_0:
   1621 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1622 ; CHECK-NEXT: ret
   1623 entry:
   1624   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   1625   %mul = mul <4 x i32> %shuffle, %b
   1626   %sub = sub <4 x i32> %a, %mul
   1627   ret <4 x i32> %sub
   1628 }
   1629 
   1630 define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
   1631 ; CHECK-LABEL: test_vmls_laneq_s16_0:
   1632 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1633 ; CHECK-NEXT: ret
   1634 entry:
   1635   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   1636   %mul = mul <4 x i16> %shuffle, %b
   1637   %sub = sub <4 x i16> %a, %mul
   1638   ret <4 x i16> %sub
   1639 }
   1640 
   1641 define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
   1642 ; CHECK-LABEL: test_vmlsq_laneq_s16_0:
   1643 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1644 ; CHECK-NEXT: ret
   1645 entry:
   1646   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   1647   %mul = mul <8 x i16> %shuffle, %b
   1648   %sub = sub <8 x i16> %a, %mul
   1649   ret <8 x i16> %sub
   1650 }
   1651 
   1652 define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
   1653 ; CHECK-LABEL: test_vmls_laneq_s32_0:
   1654 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1655 ; CHECK-NEXT: ret
   1656 entry:
   1657   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   1658   %mul = mul <2 x i32> %shuffle, %b
   1659   %sub = sub <2 x i32> %a, %mul
   1660   ret <2 x i32> %sub
   1661 }
   1662 
   1663 define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
   1664 ; CHECK-LABEL: test_vmlsq_laneq_s32_0:
   1665 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1666 ; CHECK-NEXT: ret
   1667 entry:
   1668   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   1669   %mul = mul <4 x i32> %shuffle, %b
   1670   %sub = sub <4 x i32> %a, %mul
   1671   ret <4 x i32> %sub
   1672 }
   1673 
   1674 define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
   1675 ; CHECK-LABEL: test_vmul_lane_s16_0:
   1676 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1677 ; CHECK-NEXT: ret
   1678 entry:
   1679   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   1680   %mul = mul <4 x i16> %shuffle, %a
   1681   ret <4 x i16> %mul
   1682 }
   1683 
   1684 define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
   1685 ; CHECK-LABEL: test_vmulq_lane_s16_0:
   1686 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1687 ; CHECK-NEXT: ret
   1688 entry:
   1689   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   1690   %mul = mul <8 x i16> %shuffle, %a
   1691   ret <8 x i16> %mul
   1692 }
   1693 
   1694 define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
   1695 ; CHECK-LABEL: test_vmul_lane_s32_0:
   1696 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1697 ; CHECK-NEXT: ret
   1698 entry:
   1699   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   1700   %mul = mul <2 x i32> %shuffle, %a
   1701   ret <2 x i32> %mul
   1702 }
   1703 
   1704 define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
   1705 ; CHECK-LABEL: test_vmulq_lane_s32_0:
   1706 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1707 ; CHECK-NEXT: ret
   1708 entry:
   1709   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   1710   %mul = mul <4 x i32> %shuffle, %a
   1711   ret <4 x i32> %mul
   1712 }
   1713 
   1714 define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
   1715 ; CHECK-LABEL: test_vmul_lane_u16_0:
   1716 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1717 ; CHECK-NEXT: ret
   1718 entry:
   1719   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   1720   %mul = mul <4 x i16> %shuffle, %a
   1721   ret <4 x i16> %mul
   1722 }
   1723 
   1724 define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
   1725 ; CHECK-LABEL: test_vmulq_lane_u16_0:
   1726 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1727 ; CHECK-NEXT: ret
   1728 entry:
   1729   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   1730   %mul = mul <8 x i16> %shuffle, %a
   1731   ret <8 x i16> %mul
   1732 }
   1733 
   1734 define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
   1735 ; CHECK-LABEL: test_vmul_lane_u32_0:
   1736 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1737 ; CHECK-NEXT: ret
   1738 entry:
   1739   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   1740   %mul = mul <2 x i32> %shuffle, %a
   1741   ret <2 x i32> %mul
   1742 }
   1743 
   1744 define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
   1745 ; CHECK-LABEL: test_vmulq_lane_u32_0:
   1746 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1747 ; CHECK-NEXT: ret
   1748 entry:
   1749   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   1750   %mul = mul <4 x i32> %shuffle, %a
   1751   ret <4 x i32> %mul
   1752 }
   1753 
   1754 define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
   1755 ; CHECK-LABEL: test_vmul_laneq_s16_0:
   1756 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1757 ; CHECK-NEXT: ret
   1758 entry:
   1759   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   1760   %mul = mul <4 x i16> %shuffle, %a
   1761   ret <4 x i16> %mul
   1762 }
   1763 
   1764 define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
   1765 ; CHECK-LABEL: test_vmulq_laneq_s16_0:
   1766 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1767 ; CHECK-NEXT: ret
   1768 entry:
   1769   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   1770   %mul = mul <8 x i16> %shuffle, %a
   1771   ret <8 x i16> %mul
   1772 }
   1773 
   1774 define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
   1775 ; CHECK-LABEL: test_vmul_laneq_s32_0:
   1776 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1777 ; CHECK-NEXT: ret
   1778 entry:
   1779   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   1780   %mul = mul <2 x i32> %shuffle, %a
   1781   ret <2 x i32> %mul
   1782 }
   1783 
   1784 define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
   1785 ; CHECK-LABEL: test_vmulq_laneq_s32_0:
   1786 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1787 ; CHECK-NEXT: ret
   1788 entry:
   1789   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   1790   %mul = mul <4 x i32> %shuffle, %a
   1791   ret <4 x i32> %mul
   1792 }
   1793 
   1794 define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
   1795 ; CHECK-LABEL: test_vmul_laneq_u16_0:
   1796 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1797 ; CHECK-NEXT: ret
   1798 entry:
   1799   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   1800   %mul = mul <4 x i16> %shuffle, %a
   1801   ret <4 x i16> %mul
   1802 }
   1803 
   1804 define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
   1805 ; CHECK-LABEL: test_vmulq_laneq_u16_0:
   1806 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1807 ; CHECK-NEXT: ret
   1808 entry:
   1809   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   1810   %mul = mul <8 x i16> %shuffle, %a
   1811   ret <8 x i16> %mul
   1812 }
   1813 
   1814 define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
   1815 ; CHECK-LABEL: test_vmul_laneq_u32_0:
   1816 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1817 ; CHECK-NEXT: ret
   1818 entry:
   1819   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   1820   %mul = mul <2 x i32> %shuffle, %a
   1821   ret <2 x i32> %mul
   1822 }
   1823 
   1824 define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
   1825 ; CHECK-LABEL: test_vmulq_laneq_u32_0:
   1826 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1827 ; CHECK-NEXT: ret
   1828 entry:
   1829   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   1830   %mul = mul <4 x i32> %shuffle, %a
   1831   ret <4 x i32> %mul
   1832 }
   1833 
   1834 define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
   1835 ; CHECK-LABEL: test_vfma_lane_f32_0:
   1836 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1837 ; CHECK-NEXT: ret
   1838 entry:
   1839   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   1840   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
   1841   ret <2 x float> %0
   1842 }
   1843 
   1844 define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
   1845 ; CHECK-LABEL: test_vfmaq_lane_f32_0:
   1846 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1847 ; CHECK-NEXT: ret
   1848 entry:
   1849   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   1850   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
   1851   ret <4 x float> %0
   1852 }
   1853 
   1854 define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
   1855 ; CHECK-LABEL: test_vfma_laneq_f32_0:
   1856 ; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1857 ; CHECK-NEXT: ret
   1858 entry:
   1859   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   1860   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
   1861   ret <2 x float> %0
   1862 }
   1863 
   1864 define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
   1865 ; CHECK-LABEL: test_vfmaq_laneq_f32_0:
   1866 ; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1867 ; CHECK-NEXT: ret
   1868 entry:
   1869   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   1870   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
   1871   ret <4 x float> %0
   1872 }
   1873 
   1874 define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
   1875 ; CHECK-LABEL: test_vfms_lane_f32_0:
   1876 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1877 ; CHECK-NEXT: ret
   1878 entry:
   1879   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   1880   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
   1881   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
   1882   ret <2 x float> %0
   1883 }
   1884 
   1885 define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
   1886 ; CHECK-LABEL: test_vfmsq_lane_f32_0:
   1887 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1888 ; CHECK-NEXT: ret
   1889 entry:
   1890   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   1891   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
   1892   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
   1893   ret <4 x float> %0
   1894 }
   1895 
   1896 define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
   1897 ; CHECK-LABEL: test_vfms_laneq_f32_0:
   1898 ; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1899 ; CHECK-NEXT: ret
   1900 entry:
   1901   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   1902   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
   1903   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
   1904   ret <2 x float> %0
   1905 }
   1906 
   1907 define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
   1908 ; CHECK-LABEL: test_vfmsq_laneq_f32_0:
   1909 ; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1910 ; CHECK-NEXT: ret
   1911 entry:
   1912   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   1913   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
   1914   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
   1915   ret <4 x float> %0
   1916 }
   1917 
   1918 define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
   1919 ; CHECK-LABEL: test_vfmaq_laneq_f64_0:
   1920 ; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   1921 ; CHECK-NEXT: ret
   1922 entry:
   1923   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   1924   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
   1925   ret <2 x double> %0
   1926 }
   1927 
   1928 define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
   1929 ; CHECK-LABEL: test_vfmsq_laneq_f64_0:
   1930 ; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   1931 ; CHECK-NEXT: ret
   1932 entry:
   1933   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   1934   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
   1935   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
   1936   ret <2 x double> %0
   1937 }
   1938 
   1939 define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
   1940 ; CHECK-LABEL: test_vmlal_lane_s16_0:
   1941 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1942 ; CHECK-NEXT: ret
   1943 entry:
   1944   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   1945   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   1946   %add = add <4 x i32> %vmull2.i, %a
   1947   ret <4 x i32> %add
   1948 }
   1949 
   1950 define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
   1951 ; CHECK-LABEL: test_vmlal_lane_s32_0:
   1952 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1953 ; CHECK-NEXT: ret
   1954 entry:
   1955   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   1956   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   1957   %add = add <2 x i64> %vmull2.i, %a
   1958   ret <2 x i64> %add
   1959 }
   1960 
   1961 define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
   1962 ; CHECK-LABEL: test_vmlal_laneq_s16_0:
   1963 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   1964 ; CHECK-NEXT: ret
   1965 entry:
   1966   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   1967   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   1968   %add = add <4 x i32> %vmull2.i, %a
   1969   ret <4 x i32> %add
   1970 }
   1971 
   1972 define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
   1973 ; CHECK-LABEL: test_vmlal_laneq_s32_0:
   1974 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   1975 ; CHECK-NEXT: ret
   1976 entry:
   1977   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   1978   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   1979   %add = add <2 x i64> %vmull2.i, %a
   1980   ret <2 x i64> %add
   1981 }
   1982 
   1983 define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
   1984 ; CHECK-LABEL: test_vmlal_high_lane_s16_0:
   1985 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   1986 ; CHECK-NEXT: ret
   1987 entry:
   1988   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1989   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   1990   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   1991   %add = add <4 x i32> %vmull2.i, %a
   1992   ret <4 x i32> %add
   1993 }
   1994 
   1995 define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
   1996 ; CHECK-LABEL: test_vmlal_high_lane_s32_0:
   1997 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   1998 ; CHECK-NEXT: ret
   1999 entry:
   2000   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2001   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2002   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2003   %add = add <2 x i64> %vmull2.i, %a
   2004   ret <2 x i64> %add
   2005 }
   2006 
   2007 define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
   2008 ; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
   2009 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2010 ; CHECK-NEXT: ret
   2011 entry:
   2012   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2013   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2014   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2015   %add = add <4 x i32> %vmull2.i, %a
   2016   ret <4 x i32> %add
   2017 }
   2018 
   2019 define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
   2020 ; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
   2021 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2022 ; CHECK-NEXT: ret
   2023 entry:
   2024   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2025   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2026   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2027   %add = add <2 x i64> %vmull2.i, %a
   2028   ret <2 x i64> %add
   2029 }
   2030 
   2031 define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
   2032 ; CHECK-LABEL: test_vmlsl_lane_s16_0:
   2033 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2034 ; CHECK-NEXT: ret
   2035 entry:
   2036   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2037   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2038   %sub = sub <4 x i32> %a, %vmull2.i
   2039   ret <4 x i32> %sub
   2040 }
   2041 
   2042 define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
   2043 ; CHECK-LABEL: test_vmlsl_lane_s32_0:
   2044 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2045 ; CHECK-NEXT: ret
   2046 entry:
   2047   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2048   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2049   %sub = sub <2 x i64> %a, %vmull2.i
   2050   ret <2 x i64> %sub
   2051 }
   2052 
   2053 define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
   2054 ; CHECK-LABEL: test_vmlsl_laneq_s16_0:
   2055 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2056 ; CHECK-NEXT: ret
   2057 entry:
   2058   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2059   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2060   %sub = sub <4 x i32> %a, %vmull2.i
   2061   ret <4 x i32> %sub
   2062 }
   2063 
   2064 define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
   2065 ; CHECK-LABEL: test_vmlsl_laneq_s32_0:
   2066 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2067 ; CHECK-NEXT: ret
   2068 entry:
   2069   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2070   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2071   %sub = sub <2 x i64> %a, %vmull2.i
   2072   ret <2 x i64> %sub
   2073 }
   2074 
   2075 define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
   2076 ; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
   2077 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2078 ; CHECK-NEXT: ret
   2079 entry:
   2080   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2081   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2082   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2083   %sub = sub <4 x i32> %a, %vmull2.i
   2084   ret <4 x i32> %sub
   2085 }
   2086 
   2087 define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
   2088 ; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
   2089 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2090 ; CHECK-NEXT: ret
   2091 entry:
   2092   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2093   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2094   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2095   %sub = sub <2 x i64> %a, %vmull2.i
   2096   ret <2 x i64> %sub
   2097 }
   2098 
   2099 define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
   2100 ; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
   2101 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2102 ; CHECK-NEXT: ret
   2103 entry:
   2104   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2105   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2106   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2107   %sub = sub <4 x i32> %a, %vmull2.i
   2108   ret <4 x i32> %sub
   2109 }
   2110 
   2111 define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
   2112 ; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
   2113 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2114 ; CHECK-NEXT: ret
   2115 entry:
   2116   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2117   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2118   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2119   %sub = sub <2 x i64> %a, %vmull2.i
   2120   ret <2 x i64> %sub
   2121 }
   2122 
   2123 define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
   2124 ; CHECK-LABEL: test_vmlal_lane_u16_0:
   2125 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2126 ; CHECK-NEXT: ret
   2127 entry:
   2128   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2129   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2130   %add = add <4 x i32> %vmull2.i, %a
   2131   ret <4 x i32> %add
   2132 }
   2133 
   2134 define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
   2135 ; CHECK-LABEL: test_vmlal_lane_u32_0:
   2136 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2137 ; CHECK-NEXT: ret
   2138 entry:
   2139   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2140   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2141   %add = add <2 x i64> %vmull2.i, %a
   2142   ret <2 x i64> %add
   2143 }
   2144 
   2145 define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
   2146 ; CHECK-LABEL: test_vmlal_laneq_u16_0:
   2147 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2148 ; CHECK-NEXT: ret
   2149 entry:
   2150   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2151   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2152   %add = add <4 x i32> %vmull2.i, %a
   2153   ret <4 x i32> %add
   2154 }
   2155 
   2156 define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
   2157 ; CHECK-LABEL: test_vmlal_laneq_u32_0:
   2158 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2159 ; CHECK-NEXT: ret
   2160 entry:
   2161   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2162   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2163   %add = add <2 x i64> %vmull2.i, %a
   2164   ret <2 x i64> %add
   2165 }
   2166 
   2167 define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
   2168 ; CHECK-LABEL: test_vmlal_high_lane_u16_0:
   2169 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2170 ; CHECK-NEXT: ret
   2171 entry:
   2172   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2173   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2174   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2175   %add = add <4 x i32> %vmull2.i, %a
   2176   ret <4 x i32> %add
   2177 }
   2178 
   2179 define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
   2180 ; CHECK-LABEL: test_vmlal_high_lane_u32_0:
   2181 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2182 ; CHECK-NEXT: ret
   2183 entry:
   2184   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2185   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2186   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2187   %add = add <2 x i64> %vmull2.i, %a
   2188   ret <2 x i64> %add
   2189 }
   2190 
   2191 define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
   2192 ; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
   2193 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2194 ; CHECK-NEXT: ret
   2195 entry:
   2196   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2197   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2198   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2199   %add = add <4 x i32> %vmull2.i, %a
   2200   ret <4 x i32> %add
   2201 }
   2202 
   2203 define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
   2204 ; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
   2205 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2206 ; CHECK-NEXT: ret
   2207 entry:
   2208   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2209   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2210   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2211   %add = add <2 x i64> %vmull2.i, %a
   2212   ret <2 x i64> %add
   2213 }
   2214 
   2215 define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
   2216 ; CHECK-LABEL: test_vmlsl_lane_u16_0:
   2217 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2218 ; CHECK-NEXT: ret
   2219 entry:
   2220   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2221   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2222   %sub = sub <4 x i32> %a, %vmull2.i
   2223   ret <4 x i32> %sub
   2224 }
   2225 
   2226 define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
   2227 ; CHECK-LABEL: test_vmlsl_lane_u32_0:
   2228 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2229 ; CHECK-NEXT: ret
   2230 entry:
   2231   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2232   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2233   %sub = sub <2 x i64> %a, %vmull2.i
   2234   ret <2 x i64> %sub
   2235 }
   2236 
   2237 define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
   2238 ; CHECK-LABEL: test_vmlsl_laneq_u16_0:
   2239 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2240 ; CHECK-NEXT: ret
   2241 entry:
   2242   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2243   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2244   %sub = sub <4 x i32> %a, %vmull2.i
   2245   ret <4 x i32> %sub
   2246 }
   2247 
   2248 define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
   2249 ; CHECK-LABEL: test_vmlsl_laneq_u32_0:
   2250 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2251 ; CHECK-NEXT: ret
   2252 entry:
   2253   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2254   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2255   %sub = sub <2 x i64> %a, %vmull2.i
   2256   ret <2 x i64> %sub
   2257 }
   2258 
   2259 define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
   2260 ; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
   2261 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2262 ; CHECK-NEXT: ret
   2263 entry:
   2264   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2265   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2266   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2267   %sub = sub <4 x i32> %a, %vmull2.i
   2268   ret <4 x i32> %sub
   2269 }
   2270 
   2271 define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
   2272 ; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
   2273 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2274 ; CHECK-NEXT: ret
   2275 entry:
   2276   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2277   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2278   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2279   %sub = sub <2 x i64> %a, %vmull2.i
   2280   ret <2 x i64> %sub
   2281 }
   2282 
   2283 define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
   2284 ; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
   2285 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2286 ; CHECK-NEXT: ret
   2287 entry:
   2288   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2289   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2290   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2291   %sub = sub <4 x i32> %a, %vmull2.i
   2292   ret <4 x i32> %sub
   2293 }
   2294 
   2295 define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
   2296 ; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
   2297 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2298 ; CHECK-NEXT: ret
   2299 entry:
   2300   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2301   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2302   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2303   %sub = sub <2 x i64> %a, %vmull2.i
   2304   ret <2 x i64> %sub
   2305 }
   2306 
   2307 define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
   2308 ; CHECK-LABEL: test_vmull_lane_s16_0:
   2309 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2310 ; CHECK-NEXT: ret
   2311 entry:
   2312   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2313   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2314   ret <4 x i32> %vmull2.i
   2315 }
   2316 
   2317 define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
   2318 ; CHECK-LABEL: test_vmull_lane_s32_0:
   2319 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2320 ; CHECK-NEXT: ret
   2321 entry:
   2322   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2323   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2324   ret <2 x i64> %vmull2.i
   2325 }
   2326 
   2327 define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
   2328 ; CHECK-LABEL: test_vmull_lane_u16_0:
   2329 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2330 ; CHECK-NEXT: ret
   2331 entry:
   2332   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2333   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2334   ret <4 x i32> %vmull2.i
   2335 }
   2336 
   2337 define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
   2338 ; CHECK-LABEL: test_vmull_lane_u32_0:
   2339 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2340 ; CHECK-NEXT: ret
   2341 entry:
   2342   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2343   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2344   ret <2 x i64> %vmull2.i
   2345 }
   2346 
   2347 define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
   2348 ; CHECK-LABEL: test_vmull_high_lane_s16_0:
   2349 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2350 ; CHECK-NEXT: ret
   2351 entry:
   2352   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2353   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2354   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2355   ret <4 x i32> %vmull2.i
   2356 }
   2357 
   2358 define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
   2359 ; CHECK-LABEL: test_vmull_high_lane_s32_0:
   2360 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2361 ; CHECK-NEXT: ret
   2362 entry:
   2363   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2364   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2365   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2366   ret <2 x i64> %vmull2.i
   2367 }
   2368 
   2369 define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
   2370 ; CHECK-LABEL: test_vmull_high_lane_u16_0:
   2371 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2372 ; CHECK-NEXT: ret
   2373 entry:
   2374   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2375   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2376   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2377   ret <4 x i32> %vmull2.i
   2378 }
   2379 
   2380 define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
   2381 ; CHECK-LABEL: test_vmull_high_lane_u32_0:
   2382 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2383 ; CHECK-NEXT: ret
   2384 entry:
   2385   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2386   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2387   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2388   ret <2 x i64> %vmull2.i
   2389 }
   2390 
   2391 define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
   2392 ; CHECK-LABEL: test_vmull_laneq_s16_0:
   2393 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2394 ; CHECK-NEXT: ret
   2395 entry:
   2396   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2397   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2398   ret <4 x i32> %vmull2.i
   2399 }
   2400 
   2401 define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
   2402 ; CHECK-LABEL: test_vmull_laneq_s32_0:
   2403 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2404 ; CHECK-NEXT: ret
   2405 entry:
   2406   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2407   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2408   ret <2 x i64> %vmull2.i
   2409 }
   2410 
   2411 define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
   2412 ; CHECK-LABEL: test_vmull_laneq_u16_0:
   2413 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2414 ; CHECK-NEXT: ret
   2415 entry:
   2416   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2417   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2418   ret <4 x i32> %vmull2.i
   2419 }
   2420 
   2421 define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
   2422 ; CHECK-LABEL: test_vmull_laneq_u32_0:
   2423 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2424 ; CHECK-NEXT: ret
   2425 entry:
   2426   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2427   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2428   ret <2 x i64> %vmull2.i
   2429 }
   2430 
   2431 define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
   2432 ; CHECK-LABEL: test_vmull_high_laneq_s16_0:
   2433 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2434 ; CHECK-NEXT: ret
   2435 entry:
   2436   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2437   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2438   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2439   ret <4 x i32> %vmull2.i
   2440 }
   2441 
   2442 define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
   2443 ; CHECK-LABEL: test_vmull_high_laneq_s32_0:
   2444 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2445 ; CHECK-NEXT: ret
   2446 entry:
   2447   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2448   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2449   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2450   ret <2 x i64> %vmull2.i
   2451 }
   2452 
   2453 define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
   2454 ; CHECK-LABEL: test_vmull_high_laneq_u16_0:
   2455 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2456 ; CHECK-NEXT: ret
   2457 entry:
   2458   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2459   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2460   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2461   ret <4 x i32> %vmull2.i
   2462 }
   2463 
   2464 define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
   2465 ; CHECK-LABEL: test_vmull_high_laneq_u32_0:
   2466 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2467 ; CHECK-NEXT: ret
   2468 entry:
   2469   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2470   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2471   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2472   ret <2 x i64> %vmull2.i
   2473 }
   2474 
   2475 define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
   2476 ; CHECK-LABEL: test_vqdmlal_lane_s16_0:
   2477 ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2478 ; CHECK-NEXT: ret
   2479 entry:
   2480   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2481   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2482   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   2483   ret <4 x i32> %vqdmlal4.i
   2484 }
   2485 
   2486 define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
   2487 ; CHECK-LABEL: test_vqdmlal_lane_s32_0:
   2488 ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2489 ; CHECK-NEXT: ret
   2490 entry:
   2491   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2492   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2493   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   2494   ret <2 x i64> %vqdmlal4.i
   2495 }
   2496 
   2497 define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
   2498 ; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
   2499 ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2500 ; CHECK-NEXT: ret
   2501 entry:
   2502   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2503   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2504   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2505   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   2506   ret <4 x i32> %vqdmlal4.i
   2507 }
   2508 
   2509 define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
   2510 ; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
   2511 ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2512 ; CHECK-NEXT: ret
   2513 entry:
   2514   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2515   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2516   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2517   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   2518   ret <2 x i64> %vqdmlal4.i
   2519 }
   2520 
   2521 define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
   2522 ; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
   2523 ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2524 ; CHECK-NEXT: ret
   2525 entry:
   2526   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2527   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   2528   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   2529   ret <4 x i32> %vqdmlsl4.i
   2530 }
   2531 
   2532 define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
   2533 ; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
   2534 ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2535 ; CHECK-NEXT: ret
   2536 entry:
   2537   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2538   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   2539   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   2540   ret <2 x i64> %vqdmlsl4.i
   2541 }
   2542 
   2543 define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
   2544 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
   2545 ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2546 ; CHECK-NEXT: ret
   2547 entry:
   2548   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2549   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2550   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2551   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   2552   ret <4 x i32> %vqdmlsl4.i
   2553 }
   2554 
   2555 define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
   2556 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
   2557 ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2558 ; CHECK-NEXT: ret
   2559 entry:
   2560   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2561   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2562   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2563   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   2564   ret <2 x i64> %vqdmlsl4.i
   2565 }
   2566 
   2567 define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of %v (all-zero mask)
        ; and is the second operand of sqdmull.v4i32 with %a; the call result is
        ; returned unchanged. Expected output: one line matching `qdmull` with
        ; .4s/.4h operands and an .h[0] lane, then `ret` on the following line.
        ; NOTE(review): the pattern text is `qdmull` with no leading `s`;
        ; presumably it relies on substring matching of the mnemonic - confirm.
   2568 ; CHECK-LABEL: test_vqdmull_lane_s16_0:
   2569 ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2570 ; CHECK-NEXT: ret
   2571 entry:
   2572   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2573   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2574   ret <4 x i32> %vqdmull2.i
   2575 }
   2576 
   2577 define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of %v (all-zero mask)
        ; and is the second operand of sqdmull.v2i64 with %a; the call result is
        ; returned unchanged. Expected output: one line matching `qdmull` with
        ; .2d/.2s operands and an .s[0] lane, then `ret` on the following line.
   2578 ; CHECK-LABEL: test_vqdmull_lane_s32_0:
   2579 ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2580 ; CHECK-NEXT: ret
   2581 entry:
   2582   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2583   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2584   ret <2 x i64> %vqdmull2.i
   2585 }
   2586 
   2587 define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
        ; Lane-0 codegen test with an 8-element %v. The all-zero shuffle mask
        ; produces a 4-element vector whose elements all come from element 0 of
        ; %v; it is multiplied with %a through sqdmull.v4i32. Expected output: one
        ; line matching `qdmull` with .4s/.4h operands and an .h[0] lane, then
        ; `ret` on the following line.
   2588 ; CHECK-LABEL: test_vqdmull_laneq_s16_0:
   2589 ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2590 ; CHECK-NEXT: ret
   2591 entry:
   2592   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2593   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   2594   ret <4 x i32> %vqdmull2.i
   2595 }
   2596 
   2597 define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
        ; Lane-0 codegen test with a 4-element %v. The all-zero shuffle mask
        ; produces a 2-element vector whose elements all come from element 0 of
        ; %v; it is multiplied with %a through sqdmull.v2i64. Expected output: one
        ; line matching `qdmull` with .2d/.2s operands and an .s[0] lane, then
        ; `ret` on the following line.
   2598 ; CHECK-LABEL: test_vqdmull_laneq_s32_0:
   2599 ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2600 ; CHECK-NEXT: ret
   2601 entry:
   2602   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2603   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   2604   ret <2 x i64> %vqdmull2.i
   2605 }
   2606 
   2607 define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
        ; Lane-0 codegen test. %shuffle.i takes elements 4-7 of %a and %shuffle
        ; repeats element 0 of %v (all-zero mask); both feed sqdmull.v4i32, whose
        ; result is returned. Expected output: one line matching `qdmull2` with
        ; .4s/.8h operands and an .h[0] lane, then `ret` on the following line.
        ; NOTE(review): the pattern text is `qdmull2` with no leading `s`;
        ; presumably it relies on substring matching of the mnemonic - confirm.
   2608 ; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
   2609 ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2610 ; CHECK-NEXT: ret
   2611 entry:
   2612   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2613   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2614   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2615   ret <4 x i32> %vqdmull2.i
   2616 }
   2617 
   2618 define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
        ; Lane-0 codegen test. %shuffle.i takes elements 2-3 of %a and %shuffle
        ; repeats element 0 of %v (all-zero mask); both feed sqdmull.v2i64, whose
        ; result is returned. Expected output: one line matching `qdmull2` with
        ; .2d/.4s operands and an .s[0] lane, then `ret` on the following line.
   2619 ; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
   2620 ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2621 ; CHECK-NEXT: ret
   2622 entry:
   2623   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2624   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2625   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2626   ret <2 x i64> %vqdmull2.i
   2627 }
   2628 
   2629 define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
        ; Lane-0 codegen test with an 8-element %v. %shuffle.i takes elements 4-7
        ; of %a; %shuffle is a 4-element vector built entirely from element 0 of
        ; %v (all-zero mask). Both feed sqdmull.v4i32. Expected output: one line
        ; matching `qdmull2` with .4s/.8h operands and an .h[0] lane, then `ret`
        ; on the following line.
   2630 ; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
   2631 ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2632 ; CHECK-NEXT: ret
   2633 entry:
   2634   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   2635   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   2636   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   2637   ret <4 x i32> %vqdmull2.i
   2638 }
   2639 
   2640 define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
        ; Lane-0 codegen test with a 4-element %v. %shuffle.i takes elements 2-3
        ; of %a; %shuffle is a 2-element vector built entirely from element 0 of
        ; %v (all-zero mask). Both feed sqdmull.v2i64. Expected output: one line
        ; matching `qdmull2` with .2d/.4s operands and an .s[0] lane, then `ret`
        ; on the following line.
   2641 ; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
   2642 ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2643 ; CHECK-NEXT: ret
   2644 entry:
   2645   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   2646   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   2647   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   2648   ret <2 x i64> %vqdmull2.i
   2649 }
   2650 
   2651 define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of %v (all-zero mask)
        ; and is the second operand of sqdmulh.v4i16 with %a; the call result is
        ; returned. Expected output: one line matching `qdmulh` with .4h operands
        ; and an .h[0] lane, then `ret` on the following line.
        ; NOTE(review): the pattern text is `qdmulh` with no leading `s`;
        ; presumably it relies on substring matching of the mnemonic - confirm.
   2652 ; CHECK-LABEL: test_vqdmulh_lane_s16_0:
   2653 ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2654 ; CHECK-NEXT: ret
   2655 entry:
   2656   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2657   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   2658   ret <4 x i16> %vqdmulh2.i
   2659 }
   2660 
   2661 define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
        ; Lane-0 codegen test. The all-zero shuffle mask widens the 4-element %v
        ; into an 8-element vector whose elements all come from element 0; it is
        ; the second operand of sqdmulh.v8i16 with %a. Expected output: one line
        ; matching `qdmulh` with .8h operands and an .h[0] lane, then `ret` on the
        ; following line.
   2662 ; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
   2663 ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2664 ; CHECK-NEXT: ret
   2665 entry:
   2666   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   2667   %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   2668   ret <8 x i16> %vqdmulh2.i
   2669 }
   2670 
   2671 define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of %v (all-zero mask)
        ; and is the second operand of sqdmulh.v2i32 with %a; the call result is
        ; returned. Expected output: one line matching `qdmulh` with .2s operands
        ; and an .s[0] lane, then `ret` on the following line.
   2672 ; CHECK-LABEL: test_vqdmulh_lane_s32_0:
   2673 ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2674 ; CHECK-NEXT: ret
   2675 entry:
   2676   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2677   %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   2678   ret <2 x i32> %vqdmulh2.i
   2679 }
   2680 
   2681 define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
        ; Lane-0 codegen test. The all-zero shuffle mask widens the 2-element %v
        ; into a 4-element vector whose elements all come from element 0; it is
        ; the second operand of sqdmulh.v4i32 with %a. Expected output: one line
        ; matching `qdmulh` with .4s operands and an .s[0] lane, then `ret` on the
        ; following line.
   2682 ; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
   2683 ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2684 ; CHECK-NEXT: ret
   2685 entry:
   2686   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   2687   %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   2688   ret <4 x i32> %vqdmulh2.i
   2689 }
   2690 
   2691 define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of %v (all-zero mask)
        ; and is the second operand of sqrdmulh.v4i16 with %a; the call result is
        ; returned. Expected output: one line matching `qrdmulh` with .4h operands
        ; and an .h[0] lane, then `ret` on the following line.
        ; NOTE(review): the pattern text is `qrdmulh` with no leading `s`;
        ; presumably it relies on substring matching of the mnemonic - confirm.
   2692 ; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
   2693 ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
   2694 ; CHECK-NEXT: ret
   2695 entry:
   2696   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   2697   %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   2698   ret <4 x i16> %vqrdmulh2.i
   2699 }
   2700 
   2701 define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
        ; Lane-0 codegen test. The all-zero shuffle mask widens the 4-element %v
        ; into an 8-element vector whose elements all come from element 0; it is
        ; the second operand of sqrdmulh.v8i16 with %a. Expected output: one line
        ; matching `qrdmulh` with .8h operands and an .h[0] lane, then `ret` on
        ; the following line.
   2702 ; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
   2703 ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
   2704 ; CHECK-NEXT: ret
   2705 entry:
   2706   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   2707   %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   2708   ret <8 x i16> %vqrdmulh2.i
   2709 }
   2710 
   2711 define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of %v (all-zero mask)
        ; and is the second operand of sqrdmulh.v2i32 with %a; the call result is
        ; returned. Expected output: one line matching `qrdmulh` with .2s operands
        ; and an .s[0] lane, then `ret` on the following line.
   2712 ; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
   2713 ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2714 ; CHECK-NEXT: ret
   2715 entry:
   2716   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   2717   %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   2718   ret <2 x i32> %vqrdmulh2.i
   2719 }
   2720 
   2721 define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
        ; Lane-0 codegen test. The all-zero shuffle mask widens the 2-element %v
        ; into a 4-element vector whose elements all come from element 0; it is
        ; the second operand of sqrdmulh.v4i32 with %a. Expected output: one line
        ; matching `qrdmulh` with .4s operands and an .s[0] lane, then `ret` on
        ; the following line.
   2722 ; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
   2723 ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2724 ; CHECK-NEXT: ret
   2725 entry:
   2726   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   2727   %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   2728   ret <4 x i32> %vqrdmulh2.i
   2729 }
   2730 
   2731 define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of %v (all-zero mask)
        ; and is multiplied with %a by a plain IR fmul (shuffle is the first
        ; operand). Expected output: one line matching `fmul` with .2s operands
        ; and an .s[0] lane, then `ret` on the following line.
   2732 ; CHECK-LABEL: test_vmul_lane_f32_0:
   2733 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2734 ; CHECK-NEXT: ret
   2735 entry:
   2736   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   2737   %mul = fmul <2 x float> %shuffle, %a
   2738   ret <2 x float> %mul
   2739 }
   2740 
   2741 define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
        ; Lane-0 codegen test. The all-zero shuffle mask widens the 2-element %v
        ; into a 4-element vector whose elements all come from element 0; a plain
        ; IR fmul multiplies it with %a. Expected output: one line matching `fmul`
        ; with .4s operands and an .s[0] lane, then `ret` on the following line.
   2742 ; CHECK-LABEL: test_vmulq_lane_f32_0:
   2743 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2744 ; CHECK-NEXT: ret
   2745 entry:
   2746   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   2747   %mul = fmul <4 x float> %shuffle, %a
   2748   ret <4 x float> %mul
   2749 }
   2750 
   2751 define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
        ; Lane-0 codegen test with a 4-element %v. The all-zero shuffle mask
        ; yields a 2-element vector whose elements all come from element 0 of %v;
        ; a plain IR fmul multiplies it with %a. Expected output: one line
        ; matching `fmul` with .2s operands and an .s[0] lane, then `ret` on the
        ; following line.
   2752 ; CHECK-LABEL: test_vmul_laneq_f32_0:
   2753 ; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2754 ; CHECK-NEXT: ret
   2755 entry:
   2756   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   2757   %mul = fmul <2 x float> %shuffle, %a
   2758   ret <2 x float> %mul
   2759 }
   2760 
   2761 define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
        ; Lane-0 codegen test on a scalar double. %a is bitcast through <8 x i8>
        ; to a scalar double, element 0 of %v is extracted, the two scalars are
        ; multiplied with fmul, and the product is inserted at index 0 of an
        ; undef <1 x double>. Expected output: one line matching `fmul` with two
        ; d-register operands and a .d[0] lane, then `ret` on the following line.
   2762 ; CHECK-LABEL: test_vmul_laneq_f64_0:
   2763 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
   2764 ; CHECK-NEXT: ret
   2765 entry:
   2766   %0 = bitcast <1 x double> %a to <8 x i8>
   2767   %1 = bitcast <8 x i8> %0 to double
   2768   %extract = extractelement <2 x double> %v, i32 0
   2769   %2 = fmul double %1, %extract
   2770   %3 = insertelement <1 x double> undef, double %2, i32 0
   2771   ret <1 x double> %3
   2772 }
   2773 
   2774 define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of the 4-element %v
        ; (all-zero mask); a plain IR fmul multiplies it with %a. Expected output:
        ; one line matching `fmul` with .4s operands and an .s[0] lane, then `ret`
        ; on the following line.
   2775 ; CHECK-LABEL: test_vmulq_laneq_f32_0:
   2776 ; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2777 ; CHECK-NEXT: ret
   2778 entry:
   2779   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   2780   %mul = fmul <4 x float> %shuffle, %a
   2781   ret <4 x float> %mul
   2782 }
   2783 
   2784 define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of the 2-element %v
        ; (all-zero mask); a plain IR fmul multiplies it with %a. Expected output:
        ; one line matching `fmul` with .2d operands and a .d[0] lane, then `ret`
        ; on the following line.
   2785 ; CHECK-LABEL: test_vmulq_laneq_f64_0:
   2786 ; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   2787 ; CHECK-NEXT: ret
   2788 entry:
   2789   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   2790   %mul = fmul <2 x double> %shuffle, %a
   2791   ret <2 x double> %mul
   2792 }
   2793 
   2794 define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of %v (all-zero mask)
        ; and is the second operand of fmulx.v2f32 with %a; the call result is
        ; returned. Expected output: one line matching `mulx` with .2s operands
        ; and an .s[0] lane, then `ret` on the following line.
        ; NOTE(review): the pattern text is `mulx` with no leading `f`;
        ; presumably it relies on substring matching of the mnemonic - confirm.
   2795 ; CHECK-LABEL: test_vmulx_lane_f32_0:
   2796 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2797 ; CHECK-NEXT: ret
   2798 entry:
   2799   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   2800   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   2801   ret <2 x float> %vmulx2.i
   2802 }
   2803 
   2804 define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
        ; Lane-0 codegen test. The all-zero shuffle mask widens the 2-element %v
        ; into a 4-element vector whose elements all come from element 0; it is
        ; the second operand of fmulx.v4f32 with %a. Expected output: one line
        ; matching `mulx` with .4s operands and an .s[0] lane, then `ret` on the
        ; following line.
   2805 ; CHECK-LABEL: test_vmulxq_lane_f32_0:
   2806 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2807 ; CHECK-NEXT: ret
   2808 entry:
   2809   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   2810   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   2811   ret <4 x float> %vmulx2.i
   2812 }
   2813 
   2814 define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
        ; Lane-0 codegen test with a 1-element %v. The all-zero shuffle mask
        ; widens %v into a 2-element vector whose elements both come from element
        ; 0; it is the second operand of fmulx.v2f64 with %a. Expected output: one
        ; line matching `mulx` with .2d operands and a .d[0] lane, then `ret` on
        ; the following line.
   2815 ; CHECK-LABEL: test_vmulxq_lane_f64_0:
   2816 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   2817 ; CHECK-NEXT: ret
   2818 entry:
   2819   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   2820   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   2821   ret <2 x double> %vmulx2.i
   2822 }
   2823 
   2824 define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
        ; Lane-0 codegen test with a 4-element %v. The all-zero shuffle mask
        ; yields a 2-element vector whose elements all come from element 0 of %v;
        ; it is the second operand of fmulx.v2f32 with %a. Expected output: one
        ; line matching `mulx` with .2s operands and an .s[0] lane, then `ret` on
        ; the following line.
   2825 ; CHECK-LABEL: test_vmulx_laneq_f32_0:
   2826 ; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
   2827 ; CHECK-NEXT: ret
   2828 entry:
   2829   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   2830   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   2831   ret <2 x float> %vmulx2.i
   2832 }
   2833 
   2834 define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of the 4-element %v
        ; (all-zero mask) and is the second operand of fmulx.v4f32 with %a.
        ; Expected output: one line matching `mulx` with .4s operands and an
        ; .s[0] lane, then `ret` on the following line.
   2835 ; CHECK-LABEL: test_vmulxq_laneq_f32_0:
   2836 ; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
   2837 ; CHECK-NEXT: ret
   2838 entry:
   2839   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   2840   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   2841   ret <4 x float> %vmulx2.i
   2842 }
   2843 
   2844 define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
        ; Lane-0 codegen test. %shuffle repeats element 0 of the 2-element %v
        ; (all-zero mask) and is the second operand of fmulx.v2f64 with %a.
        ; Expected output: one line matching `mulx` with .2d operands and a
        ; .d[0] lane, then `ret` on the following line.
   2845 ; CHECK-LABEL: test_vmulxq_laneq_f64_0:
   2846 ; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
   2847 ; CHECK-NEXT: ret
   2848 entry:
   2849   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   2850   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   2851   ret <2 x double> %vmulx2.i
   2852 }
   2853 
   2854