Home | History | Annotate | Download | only in AArch64
      1 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
      2 
      3 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
      4 
      5 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
      6 
      7 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
      8 
      9 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
     10 
     11 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
     12 
     13 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
     14 
     15 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
     16 
     17 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
     18 
     19 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
     20 
     21 declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
     22 
     23 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
     24 
     25 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
     26 
     27 declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
     28 
     29 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
     30 
     31 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
     32 
     33 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
     34 
     35 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
     36 
     37 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
     38 
     39 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
     40 
     41 declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>)
     42 
     43 declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>)
     44 
     45 declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>)
     46 
     47 declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>)
     48 
     49 declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>)
     50 
     51 declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>)
     52 
      53 define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
         ; Sign-extends both <8 x i8> inputs to <8 x i16> and adds them.
         ; The expected output contains a widening saddl (.8b sources, .8h result).
      54 ; CHECK-LABEL: test_vaddl_s8:
      55 ; CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
      56 entry:
      57   %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
      58   %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
      59   %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
      60   ret <8 x i16> %add.i
      61 }
     62 
      63 define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) {
         ; Sign-extends both <4 x i16> inputs to <4 x i32> and adds them.
         ; The expected output contains a widening saddl (.4h sources, .4s result).
      64 ; CHECK-LABEL: test_vaddl_s16:
      65 ; CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
      66 entry:
      67   %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
      68   %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
      69   %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
      70   ret <4 x i32> %add.i
      71 }
     72 
      73 define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) {
         ; Sign-extends both <2 x i32> inputs to <2 x i64> and adds them.
         ; The expected output contains a widening saddl (.2s sources, .2d result).
      74 ; CHECK-LABEL: test_vaddl_s32:
      75 ; CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
      76 entry:
      77   %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
      78   %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
      79   %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
      80   ret <2 x i64> %add.i
      81 }
     82 
      83 define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
         ; Zero-extends both <8 x i8> inputs to <8 x i16> and adds them.
         ; The expected output contains a widening uaddl (.8b sources, .8h result).
      84 ; CHECK-LABEL: test_vaddl_u8:
      85 ; CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
      86 entry:
      87   %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
      88   %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
      89   %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
      90   ret <8 x i16> %add.i
      91 }
     92 
      93 define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) {
         ; Zero-extends both <4 x i16> inputs to <4 x i32> and adds them.
         ; The expected output contains a widening uaddl (.4h sources, .4s result).
      94 ; CHECK-LABEL: test_vaddl_u16:
      95 ; CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
      96 entry:
      97   %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
      98   %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
      99   %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
     100   ret <4 x i32> %add.i
     101 }
    102 
     103 define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) {
         ; Zero-extends both <2 x i32> inputs to <2 x i64> and adds them.
         ; The expected output contains a widening uaddl (.2s sources, .2d result).
     104 ; CHECK-LABEL: test_vaddl_u32:
     105 ; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
     106 entry:
     107   %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
     108   %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
     109   %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
     110   ret <2 x i64> %add.i
     111 }
    112 
     113 define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) {
         ; Selects lanes 8-15 of each <16 x i8> input with shufflevector,
         ; sign-extends them to <8 x i16> and adds. The expected output uses
         ; saddl2 directly on the full .16b registers.
     114 ; CHECK-LABEL: test_vaddl_high_s8:
     115 ; CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
     116 entry:
     117   %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     118   %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
     119   %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     120   %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
     121   %add.i = add <8 x i16> %0, %1
     122   ret <8 x i16> %add.i
     123 }
    124 
     125 define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) {
         ; Selects lanes 4-7 of each <8 x i16> input with shufflevector,
         ; sign-extends them to <4 x i32> and adds. The expected output uses
         ; saddl2 directly on the full .8h registers.
     126 ; CHECK-LABEL: test_vaddl_high_s16:
     127 ; CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     128 entry:
     129   %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     130   %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
     131   %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     132   %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
     133   %add.i = add <4 x i32> %0, %1
     134   ret <4 x i32> %add.i
     135 }
    136 
     137 define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) {
         ; Selects lanes 2-3 of each <4 x i32> input with shufflevector,
         ; sign-extends them to <2 x i64> and adds. The expected output uses
         ; saddl2 directly on the full .4s registers.
     138 ; CHECK-LABEL: test_vaddl_high_s32:
     139 ; CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     140 entry:
     141   %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     142   %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
     143   %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     144   %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
     145   %add.i = add <2 x i64> %0, %1
     146   ret <2 x i64> %add.i
     147 }
    148 
     149 define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) {
         ; Selects lanes 8-15 of each <16 x i8> input with shufflevector,
         ; zero-extends them to <8 x i16> and adds. The expected output uses
         ; uaddl2 directly on the full .16b registers.
     150 ; CHECK-LABEL: test_vaddl_high_u8:
     151 ; CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
     152 entry:
     153   %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     154   %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
     155   %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     156   %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
     157   %add.i = add <8 x i16> %0, %1
     158   ret <8 x i16> %add.i
     159 }
    160 
     161 define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) {
         ; Selects lanes 4-7 of each <8 x i16> input with shufflevector,
         ; zero-extends them to <4 x i32> and adds. The expected output uses
         ; uaddl2 directly on the full .8h registers.
     162 ; CHECK-LABEL: test_vaddl_high_u16:
     163 ; CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     164 entry:
     165   %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     166   %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
     167   %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     168   %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
     169   %add.i = add <4 x i32> %0, %1
     170   ret <4 x i32> %add.i
     171 }
    172 
     173 define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) {
         ; Selects lanes 2-3 of each <4 x i32> input with shufflevector,
         ; zero-extends them to <2 x i64> and adds. The expected output uses
         ; uaddl2 directly on the full .4s registers.
     174 ; CHECK-LABEL: test_vaddl_high_u32:
     175 ; CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     176 entry:
     177   %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     178   %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
     179   %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     180   %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
     181   %add.i = add <2 x i64> %0, %1
     182   ret <2 x i64> %add.i
     183 }
    184 
     185 define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) {
         ; Sign-extends only the narrow %b to <8 x i16> and adds it to the
         ; already-wide %a. The expected output contains saddw (.8h + .8b).
     186 ; CHECK-LABEL: test_vaddw_s8:
     187 ; CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
     188 entry:
     189   %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
     190   %add.i = add <8 x i16> %vmovl.i.i, %a
     191   ret <8 x i16> %add.i
     192 }
    193 
     194 define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) {
         ; Sign-extends only the narrow %b to <4 x i32> and adds it to the
         ; already-wide %a. The expected output contains saddw (.4s + .4h).
     195 ; CHECK-LABEL: test_vaddw_s16:
     196 ; CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
     197 entry:
     198   %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
     199   %add.i = add <4 x i32> %vmovl.i.i, %a
     200   ret <4 x i32> %add.i
     201 }
    202 
     203 define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) {
         ; Sign-extends only the narrow %b to <2 x i64> and adds it to the
         ; already-wide %a. The expected output contains saddw (.2d + .2s).
     204 ; CHECK-LABEL: test_vaddw_s32:
     205 ; CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
     206 entry:
     207   %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
     208   %add.i = add <2 x i64> %vmovl.i.i, %a
     209   ret <2 x i64> %add.i
     210 }
    211 
     212 define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) {
         ; Zero-extends only the narrow %b to <8 x i16> and adds it to the
         ; already-wide %a. The expected output contains uaddw (.8h + .8b).
     213 ; CHECK-LABEL: test_vaddw_u8:
     214 ; CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
     215 entry:
     216   %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
     217   %add.i = add <8 x i16> %vmovl.i.i, %a
     218   ret <8 x i16> %add.i
     219 }
    220 
     221 define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) {
         ; Zero-extends only the narrow %b to <4 x i32> and adds it to the
         ; already-wide %a. The expected output contains uaddw (.4s + .4h).
     222 ; CHECK-LABEL: test_vaddw_u16:
     223 ; CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
     224 entry:
     225   %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
     226   %add.i = add <4 x i32> %vmovl.i.i, %a
     227   ret <4 x i32> %add.i
     228 }
    229 
     230 define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) {
         ; Zero-extends only the narrow %b to <2 x i64> and adds it to the
         ; already-wide %a. The expected output contains uaddw (.2d + .2s).
     231 ; CHECK-LABEL: test_vaddw_u32:
     232 ; CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
     233 entry:
     234   %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
     235   %add.i = add <2 x i64> %vmovl.i.i, %a
     236   ret <2 x i64> %add.i
     237 }
    238 
     239 define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) {
         ; Selects lanes 8-15 of %b, sign-extends them to <8 x i16> and adds the
         ; result to the wide %a. The expected output contains saddw2 taking
         ; the full .16b register.
     240 ; CHECK-LABEL: test_vaddw_high_s8:
     241 ; CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
     242 entry:
     243   %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     244   %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
     245   %add.i = add <8 x i16> %0, %a
     246   ret <8 x i16> %add.i
     247 }
    248 
     249 define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) {
         ; Selects lanes 4-7 of %b, sign-extends them to <4 x i32> and adds the
         ; result to the wide %a. The expected output contains saddw2 taking
         ; the full .8h register.
     250 ; CHECK-LABEL: test_vaddw_high_s16:
     251 ; CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
     252 entry:
     253   %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     254   %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
     255   %add.i = add <4 x i32> %0, %a
     256   ret <4 x i32> %add.i
     257 }
    258 
     259 define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) {
         ; Selects lanes 2-3 of %b, sign-extends them to <2 x i64> and adds the
         ; result to the wide %a. The expected output contains saddw2 taking
         ; the full .4s register.
     260 ; CHECK-LABEL: test_vaddw_high_s32:
     261 ; CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
     262 entry:
     263   %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     264   %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
     265   %add.i = add <2 x i64> %0, %a
     266   ret <2 x i64> %add.i
     267 }
    268 
     269 define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) {
         ; Selects lanes 8-15 of %b, zero-extends them to <8 x i16> and adds the
         ; result to the wide %a. The expected output contains uaddw2 taking
         ; the full .16b register.
     270 ; CHECK-LABEL: test_vaddw_high_u8:
     271 ; CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
     272 entry:
     273   %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     274   %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
     275   %add.i = add <8 x i16> %0, %a
     276   ret <8 x i16> %add.i
     277 }
    278 
     279 define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) {
         ; Selects lanes 4-7 of %b, zero-extends them to <4 x i32> and adds the
         ; result to the wide %a. The expected output contains uaddw2 taking
         ; the full .8h register.
     280 ; CHECK-LABEL: test_vaddw_high_u16:
     281 ; CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
     282 entry:
     283   %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     284   %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
     285   %add.i = add <4 x i32> %0, %a
     286   ret <4 x i32> %add.i
     287 }
    288 
     289 define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) {
         ; Selects lanes 2-3 of %b, zero-extends them to <2 x i64> and adds the
         ; result to the wide %a. The expected output contains uaddw2 taking
         ; the full .4s register.
     290 ; CHECK-LABEL: test_vaddw_high_u32:
     291 ; CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
     292 entry:
     293   %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     294   %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
     295   %add.i = add <2 x i64> %0, %a
     296   ret <2 x i64> %add.i
     297 }
    298 
     299 define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) {
         ; Sign-extends both <8 x i8> inputs to <8 x i16> and computes %a - %b.
         ; The expected output contains a widening ssubl (.8b sources, .8h result).
     300 ; CHECK-LABEL: test_vsubl_s8:
     301 ; CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
     302 entry:
     303   %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
     304   %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
     305   %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
     306   ret <8 x i16> %sub.i
     307 }
    308 
     309 define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) {
         ; Sign-extends both <4 x i16> inputs to <4 x i32> and computes %a - %b.
         ; The expected output contains a widening ssubl (.4h sources, .4s result).
     310 ; CHECK-LABEL: test_vsubl_s16:
     311 ; CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
     312 entry:
     313   %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
     314   %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
     315   %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
     316   ret <4 x i32> %sub.i
     317 }
    318 
     319 define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) {
         ; Sign-extends both <2 x i32> inputs to <2 x i64> and computes %a - %b.
         ; The expected output contains a widening ssubl (.2s sources, .2d result).
     320 ; CHECK-LABEL: test_vsubl_s32:
     321 ; CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
     322 entry:
     323   %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
     324   %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
     325   %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
     326   ret <2 x i64> %sub.i
     327 }
    328 
     329 define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) {
         ; Zero-extends both <8 x i8> inputs to <8 x i16> and computes %a - %b.
         ; The expected output contains a widening usubl (.8b sources, .8h result).
     330 ; CHECK-LABEL: test_vsubl_u8:
     331 ; CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
     332 entry:
     333   %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
     334   %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
     335   %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
     336   ret <8 x i16> %sub.i
     337 }
    338 
     339 define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) {
         ; Zero-extends both <4 x i16> inputs to <4 x i32> and computes %a - %b.
         ; The expected output contains a widening usubl (.4h sources, .4s result).
     340 ; CHECK-LABEL: test_vsubl_u16:
     341 ; CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
     342 entry:
     343   %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
     344   %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
     345   %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
     346   ret <4 x i32> %sub.i
     347 }
    348 
     349 define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) {
         ; Zero-extends both <2 x i32> inputs to <2 x i64> and computes %a - %b.
         ; The expected output contains a widening usubl (.2s sources, .2d result).
     350 ; CHECK-LABEL: test_vsubl_u32:
     351 ; CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
     352 entry:
     353   %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
     354   %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
     355   %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
     356   ret <2 x i64> %sub.i
     357 }
    358 
     359 define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) {
         ; Selects lanes 8-15 of each <16 x i8> input, sign-extends them to
         ; <8 x i16> and subtracts (%a half minus %b half). The expected output
         ; uses ssubl2 on the full .16b registers.
     360 ; CHECK-LABEL: test_vsubl_high_s8:
     361 ; CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
     362 entry:
     363   %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     364   %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
     365   %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     366   %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
     367   %sub.i = sub <8 x i16> %0, %1
     368   ret <8 x i16> %sub.i
     369 }
    370 
     371 define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) {
         ; Selects lanes 4-7 of each <8 x i16> input, sign-extends them to
         ; <4 x i32> and subtracts (%a half minus %b half). The expected output
         ; uses ssubl2 on the full .8h registers.
     372 ; CHECK-LABEL: test_vsubl_high_s16:
     373 ; CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     374 entry:
     375   %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     376   %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
     377   %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     378   %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
     379   %sub.i = sub <4 x i32> %0, %1
     380   ret <4 x i32> %sub.i
     381 }
    382 
     383 define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) {
         ; Selects lanes 2-3 of each <4 x i32> input, sign-extends them to
         ; <2 x i64> and subtracts (%a half minus %b half). The expected output
         ; uses ssubl2 on the full .4s registers.
     384 ; CHECK-LABEL: test_vsubl_high_s32:
     385 ; CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     386 entry:
     387   %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     388   %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
     389   %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     390   %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
     391   %sub.i = sub <2 x i64> %0, %1
     392   ret <2 x i64> %sub.i
     393 }
    394 
     395 define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) {
         ; Selects lanes 8-15 of each <16 x i8> input, zero-extends them to
         ; <8 x i16> and subtracts (%a half minus %b half). The expected output
         ; uses usubl2 on the full .16b registers.
     396 ; CHECK-LABEL: test_vsubl_high_u8:
     397 ; CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
     398 entry:
     399   %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     400   %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
     401   %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     402   %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
     403   %sub.i = sub <8 x i16> %0, %1
     404   ret <8 x i16> %sub.i
     405 }
    406 
     407 define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) {
         ; Selects lanes 4-7 of each <8 x i16> input, zero-extends them to
         ; <4 x i32> and subtracts (%a half minus %b half). The expected output
         ; uses usubl2 on the full .8h registers.
     408 ; CHECK-LABEL: test_vsubl_high_u16:
     409 ; CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     410 entry:
     411   %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     412   %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
     413   %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     414   %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
     415   %sub.i = sub <4 x i32> %0, %1
     416   ret <4 x i32> %sub.i
     417 }
    418 
     419 define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) {
         ; Selects lanes 2-3 of each <4 x i32> input, zero-extends them to
         ; <2 x i64> and subtracts (%a half minus %b half). The expected output
         ; uses usubl2 on the full .4s registers.
     420 ; CHECK-LABEL: test_vsubl_high_u32:
     421 ; CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     422 entry:
     423   %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     424   %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
     425   %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     426   %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
     427   %sub.i = sub <2 x i64> %0, %1
     428   ret <2 x i64> %sub.i
     429 }
    430 
     431 define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) {
         ; Sign-extends the narrow %b to <8 x i16> and subtracts it from the
         ; wide %a (operand order matters here, unlike the add variants).
         ; The expected output contains ssubw (.8h - .8b).
     432 ; CHECK-LABEL: test_vsubw_s8:
     433 ; CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
     434 entry:
     435   %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
     436   %sub.i = sub <8 x i16> %a, %vmovl.i.i
     437   ret <8 x i16> %sub.i
     438 }
    439 
     440 define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) {
         ; Sign-extends the narrow %b to <4 x i32> and subtracts it from the
         ; wide %a. The expected output contains ssubw (.4s - .4h).
     441 ; CHECK-LABEL: test_vsubw_s16:
     442 ; CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
     443 entry:
     444   %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
     445   %sub.i = sub <4 x i32> %a, %vmovl.i.i
     446   ret <4 x i32> %sub.i
     447 }
    448 
     449 define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) {
         ; Sign-extends the narrow %b to <2 x i64> and subtracts it from the
         ; wide %a. The expected output contains ssubw (.2d - .2s).
     450 ; CHECK-LABEL: test_vsubw_s32:
     451 ; CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
     452 entry:
     453   %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
     454   %sub.i = sub <2 x i64> %a, %vmovl.i.i
     455   ret <2 x i64> %sub.i
     456 }
    457 
     458 define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) {
         ; Zero-extends the narrow %b to <8 x i16> and subtracts it from the
         ; wide %a. The expected output contains usubw (.8h - .8b).
     459 ; CHECK-LABEL: test_vsubw_u8:
     460 ; CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
     461 entry:
     462   %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
     463   %sub.i = sub <8 x i16> %a, %vmovl.i.i
     464   ret <8 x i16> %sub.i
     465 }
    466 
     467 define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) {
         ; Zero-extends the narrow %b to <4 x i32> and subtracts it from the
         ; wide %a. The expected output contains usubw (.4s - .4h).
     468 ; CHECK-LABEL: test_vsubw_u16:
     469 ; CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
     470 entry:
     471   %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
     472   %sub.i = sub <4 x i32> %a, %vmovl.i.i
     473   ret <4 x i32> %sub.i
     474 }
    475 
     476 define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) {
         ; Zero-extends the narrow %b to <2 x i64> and subtracts it from the
         ; wide %a. The expected output contains usubw (.2d - .2s).
     477 ; CHECK-LABEL: test_vsubw_u32:
     478 ; CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
     479 entry:
     480   %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
     481   %sub.i = sub <2 x i64> %a, %vmovl.i.i
     482   ret <2 x i64> %sub.i
     483 }
    484 
     485 define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) {
         ; Selects lanes 8-15 of %b, sign-extends them to <8 x i16> and
         ; subtracts the result from the wide %a. The expected output contains
         ; ssubw2 taking the full .16b register.
     486 ; CHECK-LABEL: test_vsubw_high_s8:
     487 ; CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
     488 entry:
     489   %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     490   %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
     491   %sub.i = sub <8 x i16> %a, %0
     492   ret <8 x i16> %sub.i
     493 }
    494 
     495 define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) {
         ; Selects lanes 4-7 of %b, sign-extends them to <4 x i32> and
         ; subtracts the result from the wide %a. The expected output contains
         ; ssubw2 taking the full .8h register.
     496 ; CHECK-LABEL: test_vsubw_high_s16:
     497 ; CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
     498 entry:
     499   %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     500   %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
     501   %sub.i = sub <4 x i32> %a, %0
     502   ret <4 x i32> %sub.i
     503 }
    504 
     505 define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) {
         ; Selects lanes 2-3 of %b, sign-extends them to <2 x i64> and
         ; subtracts the result from the wide %a. The expected output contains
         ; ssubw2 taking the full .4s register.
     506 ; CHECK-LABEL: test_vsubw_high_s32:
     507 ; CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
     508 entry:
     509   %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     510   %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
     511   %sub.i = sub <2 x i64> %a, %0
     512   ret <2 x i64> %sub.i
     513 }
    514 
     515 define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) {
         ; Selects lanes 8-15 of %b, zero-extends them to <8 x i16> and
         ; subtracts the result from the wide %a. The expected output contains
         ; usubw2 taking the full .16b register.
     516 ; CHECK-LABEL: test_vsubw_high_u8:
     517 ; CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
     518 entry:
     519   %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     520   %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
     521   %sub.i = sub <8 x i16> %a, %0
     522   ret <8 x i16> %sub.i
     523 }
    524 
     525 define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) {
         ; Selects lanes 4-7 of %b, zero-extends them to <4 x i32> and
         ; subtracts the result from the wide %a. The expected output contains
         ; usubw2 taking the full .8h register.
     526 ; CHECK-LABEL: test_vsubw_high_u16:
     527 ; CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
     528 entry:
     529   %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     530   %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
     531   %sub.i = sub <4 x i32> %a, %0
     532   ret <4 x i32> %sub.i
     533 }
    534 
     535 define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) {
         ; Selects lanes 2-3 of %b, zero-extends them to <2 x i64> and
         ; subtracts the result from the wide %a. The expected output contains
         ; usubw2 taking the full .4s register.
     536 ; CHECK-LABEL: test_vsubw_high_u32:
     537 ; CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
     538 entry:
     539   %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     540   %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
     541   %sub.i = sub <2 x i64> %a, %0
     542   ret <2 x i64> %sub.i
     543 }
    544 
     545 define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) {
         ; Adds the <8 x i16> inputs, logically shifts each lane right by 8 and
         ; truncates to i8, i.e. keeps the high half of every sum. The expected
         ; output contains a single narrowing addhn (.8h sources, .8b result).
     546 ; CHECK-LABEL: test_vaddhn_s16:
     547 ; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     548 entry:
     549   %vaddhn.i = add <8 x i16> %a, %b
     550   %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
     551   %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
     552   ret <8 x i8> %vaddhn2.i
     553 }
    554 
     555 define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) {
         ; Adds the <4 x i32> inputs, logically shifts each lane right by 16 and
         ; truncates to i16, i.e. keeps the high half of every sum. The expected
         ; output contains a single narrowing addhn (.4s sources, .4h result).
     556 ; CHECK-LABEL: test_vaddhn_s32:
     557 ; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     558 entry:
     559   %vaddhn.i = add <4 x i32> %a, %b
     560   %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
     561   %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
     562   ret <4 x i16> %vaddhn2.i
     563 }
    564 
     565 define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) {
         ; Adds the <2 x i64> inputs, logically shifts each lane right by 32 and
         ; truncates to i32, i.e. keeps the high half of every sum. The expected
         ; output contains a single narrowing addhn (.2d sources, .2s result).
     566 ; CHECK-LABEL: test_vaddhn_s64:
     567 ; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     568 entry:
     569   %vaddhn.i = add <2 x i64> %a, %b
     570   %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
     571   %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
     572   ret <2 x i32> %vaddhn2.i
     573 }
    574 
     575 define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) {
         ; Adds the <8 x i16> inputs, logically shifts each lane right by 8 and
         ; truncates to i8 (high half of every sum). The IR body is the same as
         ; test_vaddhn_s16; only the function name differs. Expects addhn.
     576 ; CHECK-LABEL: test_vaddhn_u16:
     577 ; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     578 entry:
     579   %vaddhn.i = add <8 x i16> %a, %b
     580   %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
     581   %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
     582   ret <8 x i8> %vaddhn2.i
     583 }
    584 
     585 define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) {
         ; Adds the <4 x i32> inputs, logically shifts each lane right by 16 and
         ; truncates to i16 (high half of every sum). The IR body is the same as
         ; test_vaddhn_s32; only the function name differs. Expects addhn.
     586 ; CHECK-LABEL: test_vaddhn_u32:
     587 ; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     588 entry:
     589   %vaddhn.i = add <4 x i32> %a, %b
     590   %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
     591   %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
     592   ret <4 x i16> %vaddhn2.i
     593 }
    594 
     595 define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) {
         ; Adds the <2 x i64> inputs, logically shifts each lane right by 32 and
         ; truncates to i32 (high half of every sum). The IR body is the same as
         ; test_vaddhn_s64; only the function name differs. Expects addhn.
     596 ; CHECK-LABEL: test_vaddhn_u64:
     597 ; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     598 entry:
     599   %vaddhn.i = add <2 x i64> %a, %b
     600   %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
     601   %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
     602   ret <2 x i32> %vaddhn2.i
     603 }
    604 
     605 define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
         ; Computes the high half of each i16 sum (add, lshr 8, trunc to i8), then
         ; concatenates it after %r: both 64-bit values are bitcast to <1 x i64>
         ; and shuffled into a <2 x i64> with %r as lane 0 and the narrowed sum
         ; as lane 1. The expected output contains addhn2 writing a .16b register.
     606 ; CHECK-LABEL: test_vaddhn_high_s16:
     607 ; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     608 entry:
     609   %vaddhn.i.i = add <8 x i16> %a, %b
     610   %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
     611   %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
     612   %0 = bitcast <8 x i8> %r to <1 x i64>
     613   %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
     614   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     615   %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
     616   ret <16 x i8> %2
     617 }
    618 
     619 define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
         ; Computes the high half of each i32 sum (add, lshr 16, trunc to i16),
         ; then concatenates it after %r: both 64-bit values are bitcast to
         ; <1 x i64> and shuffled into a <2 x i64> with %r as lane 0 and the
         ; narrowed sum as lane 1. The expected output contains addhn2 (.8h dest).
     620 ; CHECK-LABEL: test_vaddhn_high_s32:
     621 ; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     622 entry:
     623   %vaddhn.i.i = add <4 x i32> %a, %b
     624   %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
     625   %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
     626   %0 = bitcast <4 x i16> %r to <1 x i64>
     627   %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
     628   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     629   %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
     630   ret <8 x i16> %2
     631 }
    632 
     633 define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
         ; Computes the high half of each i64 sum (add, lshr 32, trunc to i32),
         ; then concatenates it after %r: both 64-bit values are bitcast to
         ; <1 x i64> and shuffled into a <2 x i64> with %r as lane 0 and the
         ; narrowed sum as lane 1. The expected output contains addhn2 (.4s dest).
     634 ; CHECK-LABEL: test_vaddhn_high_s64:
     635 ; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     636 entry:
     637   %vaddhn.i.i = add <2 x i64> %a, %b
     638   %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
     639   %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
     640   %0 = bitcast <2 x i32> %r to <1 x i64>
     641   %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
     642   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     643   %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
     644   ret <4 x i32> %2
     645 }
    646 
     647 define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
         ; Computes the high half of each i16 sum (add, lshr 8, trunc to i8), then
         ; places it as lane 1 of a <2 x i64> whose lane 0 is %r. The IR body is
         ; the same as test_vaddhn_high_s16. The expected output contains addhn2.
     648 ; CHECK-LABEL: test_vaddhn_high_u16:
     649 ; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     650 entry:
     651   %vaddhn.i.i = add <8 x i16> %a, %b
     652   %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
     653   %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
     654   %0 = bitcast <8 x i8> %r to <1 x i64>
     655   %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
     656   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     657   %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
     658   ret <16 x i8> %2
     659 }
    660 
     661 define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
         ; Computes the high half of each i32 sum (add, lshr 16, trunc to i16),
         ; then places it as lane 1 of a <2 x i64> whose lane 0 is %r. The IR body
         ; is the same as test_vaddhn_high_s32. The expected output contains addhn2.
     662 ; CHECK-LABEL: test_vaddhn_high_u32:
     663 ; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     664 entry:
     665   %vaddhn.i.i = add <4 x i32> %a, %b
     666   %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
     667   %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
     668   %0 = bitcast <4 x i16> %r to <1 x i64>
     669   %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
     670   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     671   %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
     672   ret <8 x i16> %2
     673 }
    674 
     675 define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
         ; Computes the high half of each i64 sum (add, lshr 32, trunc to i32),
         ; then places it as lane 1 of a <2 x i64> whose lane 0 is %r. The IR body
         ; is the same as test_vaddhn_high_s64. The expected output contains addhn2.
     676 ; CHECK-LABEL: test_vaddhn_high_u64:
     677 ; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     678 entry:
     679   %vaddhn.i.i = add <2 x i64> %a, %b
     680   %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
     681   %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
     682   %0 = bitcast <2 x i32> %r to <1 x i64>
     683   %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
     684   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     685   %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
     686   ret <4 x i32> %2
     687 }
    688 
     689 define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) {
         ; Calls the @llvm.aarch64.neon.raddhn.v8i8 intrinsic on the two
         ; <8 x i16> inputs and returns its <8 x i8> result unchanged.
         ; The expected output contains raddhn (.8h sources, .8b result).
     690 ; CHECK-LABEL: test_vraddhn_s16:
     691 ; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     692 entry:
     693   %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
     694   ret <8 x i8> %vraddhn2.i
     695 }
    696 
     697 define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.raddhn.v4i16 intrinsic
         ; and returns its result. The expected output is raddhn (.4h <- .4s, .4s).
     698 ; CHECK-LABEL: test_vraddhn_s32:
     699 ; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     700 entry:
     701   %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
     702   ret <4 x i16> %vraddhn2.i
     703 }
    704 
     705 define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.raddhn.v2i32 intrinsic
         ; and returns its result. The expected output is raddhn (.2s <- .2d, .2d).
     706 ; CHECK-LABEL: test_vraddhn_s64:
     707 ; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     708 entry:
     709   %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
     710   ret <2 x i32> %vraddhn2.i
     711 }
    712 
     713 define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.raddhn.v8i8 intrinsic
         ; and returns its result. The expected output is raddhn (.8b <- .8h, .8h).
         ; The IR body is the same as in test_vraddhn_s16; only the name differs.
     714 ; CHECK-LABEL: test_vraddhn_u16:
     715 ; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     716 entry:
     717   %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
     718   ret <8 x i8> %vraddhn2.i
     719 }
    720 
     721 define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.raddhn.v4i16 intrinsic
         ; and returns its result. The expected output is raddhn (.4h <- .4s, .4s).
         ; The IR body is the same as in test_vraddhn_s32; only the name differs.
     722 ; CHECK-LABEL: test_vraddhn_u32:
     723 ; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     724 entry:
     725   %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
     726   ret <4 x i16> %vraddhn2.i
     727 }
    728 
     729 define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.raddhn.v2i32 intrinsic
         ; and returns its result. The expected output is raddhn (.2s <- .2d, .2d).
         ; The IR body is the same as in test_vraddhn_s64; only the name differs.
     730 ; CHECK-LABEL: test_vraddhn_u64:
     731 ; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     732 entry:
     733   %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
     734   ret <2 x i32> %vraddhn2.i
     735 }
    736 
     737 define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
         ; Calls llvm.aarch64.neon.raddhn.v8i8 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is raddhn2 (.16b <- .8h, .8h).
     738 ; CHECK-LABEL: test_vraddhn_high_s16:
     739 ; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     740 entry:
     741   %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
     742   %0 = bitcast <8 x i8> %r to <1 x i64>
     743   %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
     744   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     745   %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
     746   ret <16 x i8> %2
     747 }
    748 
     749 define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
         ; Calls llvm.aarch64.neon.raddhn.v4i16 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is raddhn2 (.8h <- .4s, .4s).
     750 ; CHECK-LABEL: test_vraddhn_high_s32:
     751 ; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     752 entry:
     753   %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
     754   %0 = bitcast <4 x i16> %r to <1 x i64>
     755   %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
     756   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     757   %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
     758   ret <8 x i16> %2
     759 }
    760 
     761 define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
         ; Calls llvm.aarch64.neon.raddhn.v2i32 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is raddhn2 (.4s <- .2d, .2d).
     762 ; CHECK-LABEL: test_vraddhn_high_s64:
     763 ; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     764 entry:
     765   %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
     766   %0 = bitcast <2 x i32> %r to <1 x i64>
     767   %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
     768   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     769   %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
     770   ret <4 x i32> %2
     771 }
    772 
     773 define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
         ; Calls llvm.aarch64.neon.raddhn.v8i8 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is raddhn2 (.16b <- .8h, .8h). The IR body is the
         ; same as in test_vraddhn_high_s16; only the name differs.
     774 ; CHECK-LABEL: test_vraddhn_high_u16:
     775 ; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     776 entry:
     777   %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
     778   %0 = bitcast <8 x i8> %r to <1 x i64>
     779   %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
     780   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     781   %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
     782   ret <16 x i8> %2
     783 }
    784 
     785 define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
         ; Calls llvm.aarch64.neon.raddhn.v4i16 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is raddhn2 (.8h <- .4s, .4s). The IR body is the
         ; same as in test_vraddhn_high_s32; only the name differs.
     786 ; CHECK-LABEL: test_vraddhn_high_u32:
     787 ; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     788 entry:
     789   %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
     790   %0 = bitcast <4 x i16> %r to <1 x i64>
     791   %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
     792   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     793   %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
     794   ret <8 x i16> %2
     795 }
    796 
     797 define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
         ; Calls llvm.aarch64.neon.raddhn.v2i32 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is raddhn2 (.4s <- .2d, .2d). The IR body is the
         ; same as in test_vraddhn_high_s64; only the name differs.
     798 ; CHECK-LABEL: test_vraddhn_high_u64:
     799 ; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     800 entry:
     801   %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
     802   %0 = bitcast <2 x i32> %r to <1 x i64>
     803   %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
     804   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     805   %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
     806   ret <4 x i32> %2
     807 }
    808 
     809 define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
         ; Computes %a - %b lane-wise, shifts every 16-bit difference right by 8
         ; (logical) and truncates to i8, i.e. returns the high half of each lane.
         ; The expected output is subhn (.8b <- .8h, .8h).
     810 ; CHECK-LABEL: test_vsubhn_s16:
     811 ; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     812 entry:
     813   %vsubhn.i = sub <8 x i16> %a, %b
     814   %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
     815   %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
     816   ret <8 x i8> %vsubhn2.i
     817 }
    818 
     819 define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
         ; Computes %a - %b lane-wise, shifts every 32-bit difference right by 16
         ; (logical) and truncates to i16, i.e. returns the high half of each lane.
         ; The expected output is subhn (.4h <- .4s, .4s).
     820 ; CHECK-LABEL: test_vsubhn_s32:
     821 ; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     822 entry:
     823   %vsubhn.i = sub <4 x i32> %a, %b
     824   %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
     825   %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
     826   ret <4 x i16> %vsubhn2.i
     827 }
    828 
     829 define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
         ; Computes %a - %b lane-wise, shifts every 64-bit difference right by 32
         ; (logical) and truncates to i32, i.e. returns the high half of each lane.
         ; The expected output is subhn (.2s <- .2d, .2d).
     830 ; CHECK-LABEL: test_vsubhn_s64:
     831 ; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     832 entry:
     833   %vsubhn.i = sub <2 x i64> %a, %b
     834   %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
     835   %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
     836   ret <2 x i32> %vsubhn2.i
     837 }
    838 
     839 define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
         ; Computes %a - %b lane-wise, shifts every 16-bit difference right by 8
         ; (logical) and truncates to i8, i.e. returns the high half of each lane.
         ; The expected output is subhn (.8b <- .8h, .8h). The IR body is the same
         ; as in test_vsubhn_s16; only the name differs.
     840 ; CHECK-LABEL: test_vsubhn_u16:
     841 ; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     842 entry:
     843   %vsubhn.i = sub <8 x i16> %a, %b
     844   %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
     845   %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
     846   ret <8 x i8> %vsubhn2.i
     847 }
    848 
     849 define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
         ; Computes %a - %b lane-wise, shifts every 32-bit difference right by 16
         ; (logical) and truncates to i16, i.e. returns the high half of each lane.
         ; The expected output is subhn (.4h <- .4s, .4s). The IR body is the same
         ; as in test_vsubhn_s32; only the name differs.
     850 ; CHECK-LABEL: test_vsubhn_u32:
     851 ; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     852 entry:
     853   %vsubhn.i = sub <4 x i32> %a, %b
     854   %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
     855   %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
     856   ret <4 x i16> %vsubhn2.i
     857 }
    858 
     859 define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
         ; Computes %a - %b lane-wise, shifts every 64-bit difference right by 32
         ; (logical) and truncates to i32, i.e. returns the high half of each lane.
         ; The expected output is subhn (.2s <- .2d, .2d). The IR body is the same
         ; as in test_vsubhn_s64; only the name differs.
     860 ; CHECK-LABEL: test_vsubhn_u64:
     861 ; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     862 entry:
     863   %vsubhn.i = sub <2 x i64> %a, %b
     864   %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
     865   %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
     866   ret <2 x i32> %vsubhn2.i
     867 }
    868 
     869 define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
         ; Keeps the high 8 bits of every lane of %a - %b (sub, lshr by 8, trunc)
         ; and places that narrowed vector after %r: both 64-bit values are bitcast
         ; to <1 x i64> and joined by a shufflevector with mask <0, 1>, so %r is the
         ; low half of the result. The expected output is subhn2 (.16b <- .8h, .8h).
     870 ; CHECK-LABEL: test_vsubhn_high_s16:
     871 ; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     872 entry:
     873   %vsubhn.i.i = sub <8 x i16> %a, %b
     874   %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
     875   %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
     876   %0 = bitcast <8 x i8> %r to <1 x i64>
     877   %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
     878   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     879   %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
     880   ret <16 x i8> %2
     881 }
    882 
     883 define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
         ; Keeps the high 16 bits of every lane of %a - %b (sub, lshr by 16, trunc)
         ; and places that narrowed vector after %r: both 64-bit values are bitcast
         ; to <1 x i64> and joined by a shufflevector with mask <0, 1>, so %r is the
         ; low half of the result. The expected output is subhn2 (.8h <- .4s, .4s).
     884 ; CHECK-LABEL: test_vsubhn_high_s32:
     885 ; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     886 entry:
     887   %vsubhn.i.i = sub <4 x i32> %a, %b
     888   %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
     889   %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
     890   %0 = bitcast <4 x i16> %r to <1 x i64>
     891   %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
     892   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     893   %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
     894   ret <8 x i16> %2
     895 }
    896 
     897 define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
         ; Keeps the high 32 bits of every lane of %a - %b (sub, lshr by 32, trunc)
         ; and places that narrowed vector after %r: both 64-bit values are bitcast
         ; to <1 x i64> and joined by a shufflevector with mask <0, 1>, so %r is the
         ; low half of the result. The expected output is subhn2 (.4s <- .2d, .2d).
     898 ; CHECK-LABEL: test_vsubhn_high_s64:
     899 ; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     900 entry:
     901   %vsubhn.i.i = sub <2 x i64> %a, %b
     902   %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
     903   %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
     904   %0 = bitcast <2 x i32> %r to <1 x i64>
     905   %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
     906   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     907   %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
     908   ret <4 x i32> %2
     909 }
    910 
     911 define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
         ; Keeps the high 8 bits of every lane of %a - %b (sub, lshr by 8, trunc)
         ; and places that narrowed vector after %r: both 64-bit values are bitcast
         ; to <1 x i64> and joined by a shufflevector with mask <0, 1>, so %r is the
         ; low half of the result. The expected output is subhn2 (.16b <- .8h, .8h).
         ; The IR body is the same as in test_vsubhn_high_s16; only the name differs.
     912 ; CHECK-LABEL: test_vsubhn_high_u16:
     913 ; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     914 entry:
     915   %vsubhn.i.i = sub <8 x i16> %a, %b
     916   %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
     917   %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
     918   %0 = bitcast <8 x i8> %r to <1 x i64>
     919   %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
     920   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     921   %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
     922   ret <16 x i8> %2
     923 }
    924 
     925 define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
         ; Keeps the high 16 bits of every lane of %a - %b (sub, lshr by 16, trunc)
         ; and places that narrowed vector after %r: both 64-bit values are bitcast
         ; to <1 x i64> and joined by a shufflevector with mask <0, 1>, so %r is the
         ; low half of the result. The expected output is subhn2 (.8h <- .4s, .4s).
         ; The IR body is the same as in test_vsubhn_high_s32; only the name differs.
     926 ; CHECK-LABEL: test_vsubhn_high_u32:
     927 ; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     928 entry:
     929   %vsubhn.i.i = sub <4 x i32> %a, %b
     930   %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
     931   %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
     932   %0 = bitcast <4 x i16> %r to <1 x i64>
     933   %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
     934   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     935   %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
     936   ret <8 x i16> %2
     937 }
    938 
     939 define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
         ; Keeps the high 32 bits of every lane of %a - %b (sub, lshr by 32, trunc)
         ; and places that narrowed vector after %r: both 64-bit values are bitcast
         ; to <1 x i64> and joined by a shufflevector with mask <0, 1>, so %r is the
         ; low half of the result. The expected output is subhn2 (.4s <- .2d, .2d).
         ; The IR body is the same as in test_vsubhn_high_s64; only the name differs.
     940 ; CHECK-LABEL: test_vsubhn_high_u64:
     941 ; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     942 entry:
     943   %vsubhn.i.i = sub <2 x i64> %a, %b
     944   %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
     945   %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
     946   %0 = bitcast <2 x i32> %r to <1 x i64>
     947   %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
     948   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
     949   %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
     950   ret <4 x i32> %2
     951 }
    952 
     953 define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.rsubhn.v8i8 intrinsic
         ; and returns its result. The expected output is rsubhn (.8b <- .8h, .8h).
     954 ; CHECK-LABEL: test_vrsubhn_s16:
     955 ; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     956 entry:
     957   %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
     958   ret <8 x i8> %vrsubhn2.i
     959 }
    960 
     961 define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.rsubhn.v4i16 intrinsic
         ; and returns its result. The expected output is rsubhn (.4h <- .4s, .4s).
     962 ; CHECK-LABEL: test_vrsubhn_s32:
     963 ; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     964 entry:
     965   %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
     966   ret <4 x i16> %vrsubhn2.i
     967 }
    968 
     969 define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.rsubhn.v2i32 intrinsic
         ; and returns its result. The expected output is rsubhn (.2s <- .2d, .2d).
     970 ; CHECK-LABEL: test_vrsubhn_s64:
     971 ; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     972 entry:
     973   %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
     974   ret <2 x i32> %vrsubhn2.i
     975 }
    976 
     977 define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.rsubhn.v8i8 intrinsic
         ; and returns its result. The expected output is rsubhn (.8b <- .8h, .8h).
         ; The IR body is the same as in test_vrsubhn_s16; only the name differs.
     978 ; CHECK-LABEL: test_vrsubhn_u16:
     979 ; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
     980 entry:
     981   %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
     982   ret <8 x i8> %vrsubhn2.i
     983 }
    984 
     985 define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.rsubhn.v4i16 intrinsic
         ; and returns its result. The expected output is rsubhn (.4h <- .4s, .4s).
         ; The IR body is the same as in test_vrsubhn_s32; only the name differs.
     986 ; CHECK-LABEL: test_vrsubhn_u32:
     987 ; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
     988 entry:
     989   %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
     990   ret <4 x i16> %vrsubhn2.i
     991 }
    992 
     993 define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
         ; Passes %a and %b straight to the llvm.aarch64.neon.rsubhn.v2i32 intrinsic
         ; and returns its result. The expected output is rsubhn (.2s <- .2d, .2d).
         ; The IR body is the same as in test_vrsubhn_s64; only the name differs.
     994 ; CHECK-LABEL: test_vrsubhn_u64:
     995 ; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
     996 entry:
     997   %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
     998   ret <2 x i32> %vrsubhn2.i
     999 }
   1000 
    1001 define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
         ; Calls llvm.aarch64.neon.rsubhn.v8i8 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is rsubhn2 (.16b <- .8h, .8h).
    1002 ; CHECK-LABEL: test_vrsubhn_high_s16:
    1003 ; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
    1004 entry:
    1005   %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
    1006   %0 = bitcast <8 x i8> %r to <1 x i64>
    1007   %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
    1008   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
    1009   %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
    1010   ret <16 x i8> %2
    1011 }
   1012 
    1013 define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
         ; Calls llvm.aarch64.neon.rsubhn.v4i16 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is rsubhn2 (.8h <- .4s, .4s).
    1014 ; CHECK-LABEL: test_vrsubhn_high_s32:
    1015 ; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    1016 entry:
    1017   %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
    1018   %0 = bitcast <4 x i16> %r to <1 x i64>
    1019   %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
    1020   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
    1021   %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
    1022   ret <8 x i16> %2
    1023 }
   1024 
    1025 define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
         ; Calls llvm.aarch64.neon.rsubhn.v2i32 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is rsubhn2 (.4s <- .2d, .2d).
    1026 ; CHECK-LABEL: test_vrsubhn_high_s64:
    1027 ; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
    1028 entry:
    1029   %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
    1030   %0 = bitcast <2 x i32> %r to <1 x i64>
    1031   %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
    1032   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
    1033   %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
    1034   ret <4 x i32> %2
    1035 }
   1036 
    1037 define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
         ; Calls llvm.aarch64.neon.rsubhn.v8i8 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is rsubhn2 (.16b <- .8h, .8h). The IR body is the
         ; same as in test_vrsubhn_high_s16; only the name differs.
    1038 ; CHECK-LABEL: test_vrsubhn_high_u16:
    1039 ; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
    1040 entry:
    1041   %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
    1042   %0 = bitcast <8 x i8> %r to <1 x i64>
    1043   %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
    1044   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
    1045   %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
    1046   ret <16 x i8> %2
    1047 }
   1048 
    1049 define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
         ; Calls llvm.aarch64.neon.rsubhn.v4i16 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is rsubhn2 (.8h <- .4s, .4s). The IR body is the
         ; same as in test_vrsubhn_high_s32; only the name differs.
    1050 ; CHECK-LABEL: test_vrsubhn_high_u32:
    1051 ; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    1052 entry:
    1053   %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
    1054   %0 = bitcast <4 x i16> %r to <1 x i64>
    1055   %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
    1056   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
    1057   %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
    1058   ret <8 x i16> %2
    1059 }
   1060 
    1061 define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
         ; Calls llvm.aarch64.neon.rsubhn.v2i32 on %a and %b, then appends the
         ; narrowed result after %r: both 64-bit values are bitcast to <1 x i64> and
         ; joined by a shufflevector with mask <0, 1>, so %r is the low half.
         ; The expected output is rsubhn2 (.4s <- .2d, .2d). The IR body is the
         ; same as in test_vrsubhn_high_s64; only the name differs.
    1062 ; CHECK-LABEL: test_vrsubhn_high_u64:
    1063 ; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
    1064 entry:
    1065   %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
    1066   %0 = bitcast <2 x i32> %r to <1 x i64>
    1067   %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
    1068   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
    1069   %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
    1070   ret <4 x i32> %2
    1071 }
   1072 
    1073 define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) {
         ; Calls llvm.aarch64.neon.sabd.v8i8 on %a and %b and zero-extends the 8-bit
         ; results to 16-bit lanes (zext is used even though the intrinsic is the
         ; signed one). The expected output is the widening sabdl (.8h <- .8b, .8b).
    1074 ; CHECK-LABEL: test_vabdl_s8:
    1075 ; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    1076 entry:
    1077   %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
    1078   %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
    1079   ret <8 x i16> %vmovl.i.i
    1080 }
   1081 
    1082 define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) {
         ; Calls llvm.aarch64.neon.sabd.v4i16 on %a and %b and zero-extends the
         ; 16-bit results to 32-bit lanes (zext is used even though the intrinsic is
         ; the signed one). The expected output is sabdl (.4s <- .4h, .4h).
    1083 ; CHECK-LABEL: test_vabdl_s16:
    1084 ; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    1085 entry:
    1086   %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
    1087   %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
    1088   ret <4 x i32> %vmovl.i.i
    1089 }
   1090 
    1091 define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) {
         ; Calls llvm.aarch64.neon.sabd.v2i32 on %a and %b and zero-extends the
         ; 32-bit results to 64-bit lanes (zext is used even though the intrinsic is
         ; the signed one). The expected output is sabdl (.2d <- .2s, .2s).
    1092 ; CHECK-LABEL: test_vabdl_s32:
    1093 ; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
    1094 entry:
    1095   %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
    1096   %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
    1097   ret <2 x i64> %vmovl.i.i
    1098 }
   1099 
    1100 define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) {
         ; Calls llvm.aarch64.neon.uabd.v8i8 on %a and %b and zero-extends the 8-bit
         ; results to 16-bit lanes. The expected output is the widening uabdl
         ; (.8h <- .8b, .8b).
    1101 ; CHECK-LABEL: test_vabdl_u8:
    1102 ; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    1103 entry:
    1104   %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
    1105   %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
    1106   ret <8 x i16> %vmovl.i.i
    1107 }
   1108 
    1109 define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) {
         ; Calls llvm.aarch64.neon.uabd.v4i16 on %a and %b and zero-extends the
         ; 16-bit results to 32-bit lanes. The expected output is uabdl
         ; (.4s <- .4h, .4h).
    1110 ; CHECK-LABEL: test_vabdl_u16:
    1111 ; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    1112 entry:
    1113   %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
    1114   %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
    1115   ret <4 x i32> %vmovl.i.i
    1116 }
   1117 
    1118 define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) {
         ; Calls llvm.aarch64.neon.uabd.v2i32 on %a and %b and zero-extends the
         ; 32-bit results to 64-bit lanes. The expected output is uabdl
         ; (.2d <- .2s, .2s).
    1119 ; CHECK-LABEL: test_vabdl_u32:
    1120 ; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
    1121 entry:
    1122   %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
    1123   %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
    1124   ret <2 x i64> %vmovl.i.i
    1125 }
   1126 
    1127 define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
         ; Calls llvm.aarch64.neon.sabd.v8i8 on %b and %c, zero-extends the result to
         ; 16-bit lanes and adds it to the accumulator %a. The expected output is
         ; sabal with a .8h destination and .8b sources.
    1128 ; CHECK-LABEL: test_vabal_s8:
    1129 ; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    1130 entry:
    1131   %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
    1132   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
    1133   %add.i = add <8 x i16> %vmovl.i.i.i, %a
    1134   ret <8 x i16> %add.i
    1135 }
   1136 
    1137 define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
         ; Calls llvm.aarch64.neon.sabd.v4i16 on %b and %c, zero-extends the result
         ; to 32-bit lanes and adds it to the accumulator %a. The expected output is
         ; sabal with a .4s destination and .4h sources.
    1138 ; CHECK-LABEL: test_vabal_s16:
    1139 ; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    1140 entry:
    1141   %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
    1142   %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
    1143   %add.i = add <4 x i32> %vmovl.i.i.i, %a
    1144   ret <4 x i32> %add.i
    1145 }
   1146 
    1147 define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
         ; Calls llvm.aarch64.neon.sabd.v2i32 on %b and %c, zero-extends the result
         ; to 64-bit lanes and adds it to the accumulator %a. The expected output is
         ; sabal with a .2d destination and .2s sources.
    1148 ; CHECK-LABEL: test_vabal_s32:
    1149 ; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
    1150 entry:
    1151   %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
    1152   %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
    1153   %add.i = add <2 x i64> %vmovl.i.i.i, %a
    1154   ret <2 x i64> %add.i
    1155 }
   1156 
    1157 define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
         ; Calls llvm.aarch64.neon.uabd.v8i8 on %b and %c, zero-extends the result to
         ; 16-bit lanes and adds it to the accumulator %a. The expected output is
         ; uabal with a .8h destination and .8b sources.
    1158 ; CHECK-LABEL: test_vabal_u8:
    1159 ; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    1160 entry:
    1161   %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
    1162   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
    1163   %add.i = add <8 x i16> %vmovl.i.i.i, %a
    1164   ret <8 x i16> %add.i
    1165 }
   1166 
    1167 define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
         ; Calls llvm.aarch64.neon.uabd.v4i16 on %b and %c, zero-extends the result
         ; to 32-bit lanes and adds it to the accumulator %a. The expected output is
         ; uabal with a .4s destination and .4h sources.
    1168 ; CHECK-LABEL: test_vabal_u16:
    1169 ; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
    1170 entry:
    1171   %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
    1172   %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
    1173   %add.i = add <4 x i32> %vmovl.i.i.i, %a
    1174   ret <4 x i32> %add.i
    1175 }
   1176 
    1177 define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
         ; Calls llvm.aarch64.neon.uabd.v2i32 on %b and %c, zero-extends the result
         ; to 64-bit lanes and adds it to the accumulator %a. The expected output is
         ; uabal with a .2d destination and .2s sources.
    1178 ; CHECK-LABEL: test_vabal_u32:
    1179 ; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
    1180 entry:
    1181   %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
    1182   %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
    1183   %add.i = add <2 x i64> %vmovl.i.i.i, %a
    1184   ret <2 x i64> %add.i
    1185 }
   1186 
    1187 define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) {
         ; Extracts lanes 8-15 (the upper half) of %a and of %b with shufflevector,
         ; calls llvm.aarch64.neon.sabd.v8i8 on those halves and zero-extends the
         ; result to 16-bit lanes. The expected output is sabdl2, which names the
         ; full .16b source registers and a .8h destination.
    1188 ; CHECK-LABEL: test_vabdl_high_s8:
    1189 ; CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    1190 entry:
    1191   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    1192   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    1193   %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
    1194   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
    1195   ret <8 x i16> %vmovl.i.i.i
    1196 }
   1197 
    1198 define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) {
         ; Extracts lanes 4-7 (the upper half) of %a and of %b with shufflevector,
         ; calls llvm.aarch64.neon.sabd.v4i16 on those halves and zero-extends the
         ; result to 32-bit lanes. The expected output is sabdl2, which names the
         ; full .8h source registers and a .4s destination.
    1199 ; CHECK-LABEL: test_vabdl_high_s16:
    1200 ; CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
    1201 entry:
    1202   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    1203   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    1204   %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
    1205   %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
    1206   ret <4 x i32> %vmovl.i.i.i
    1207 }
   1208 
    1209 define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) {
         ; Extracts lanes 2-3 (the upper half) of %a and of %b with shufflevector,
         ; calls llvm.aarch64.neon.sabd.v2i32 on those halves and zero-extends the
         ; result to 64-bit lanes. The expected output is sabdl2, which names the
         ; full .4s source registers and a .2d destination.
    1210 ; CHECK-LABEL: test_vabdl_high_s32:
    1211 ; CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    1212 entry:
    1213   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    1214   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    1215   %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
    1216   %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
    1217   ret <2 x i64> %vmovl.i.i.i
    1218 }
   1219 
    1220 define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) {
         ; Extracts lanes 8-15 (the upper half) of %a and of %b with shufflevector,
         ; calls llvm.aarch64.neon.uabd.v8i8 on those halves and zero-extends the
         ; result to 16-bit lanes. The expected output is uabdl2, which names the
         ; full .16b source registers and a .8h destination.
    1221 ; CHECK-LABEL: test_vabdl_high_u8:
    1222 ; CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    1223 entry:
    1224   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    1225   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    1226   %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
    1227   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
    1228   ret <8 x i16> %vmovl.i.i.i
    1229 }
   1230 
    1231 define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) {
         ; Extracts lanes 4-7 (the upper half) of %a and of %b with shufflevector,
         ; calls llvm.aarch64.neon.uabd.v4i16 on those halves and zero-extends the
         ; result to 32-bit lanes. The expected output is uabdl2, which names the
         ; full .8h source registers and a .4s destination.
    1232 ; CHECK-LABEL: test_vabdl_high_u16:
    1233 ; CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
    1234 entry:
    1235   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    1236   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    1237   %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
    1238   %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
    1239   ret <4 x i32> %vmovl.i.i.i
    1240 }
   1241 
    1242 define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) {
         ; Extracts lanes 2-3 (the upper half) of %a and of %b with shufflevector,
         ; calls llvm.aarch64.neon.uabd.v2i32 on those halves and zero-extends the
         ; result to 64-bit lanes. The expected output is uabdl2, which names the
         ; full .4s source registers and a .2d destination.
    1243 ; CHECK-LABEL: test_vabdl_high_u32:
    1244 ; CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    1245 entry:
    1246   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    1247   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    1248   %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
    1249   %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
    1250   ret <2 x i64> %vmovl.i.i.i
    1251 }
   1252 
    1253 define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
         ; Extracts lanes 8-15 (the upper half) of %b and of %c with shufflevector,
         ; calls llvm.aarch64.neon.sabd.v8i8 on those halves, zero-extends the result
         ; to 16-bit lanes and adds it to the accumulator %a. The expected output is
         ; sabal2 with a .8h destination and full .16b sources.
    1254 ; CHECK-LABEL: test_vabal_high_s8:
    1255 ; CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    1256 entry:
    1257   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    1258   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    1259   %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
    1260   %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
    1261   %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
    1262   ret <8 x i16> %add.i.i
    1263 }
   1264 
    1265 define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
         ; Extracts lanes 4-7 (the upper half) of %b and of %c with shufflevector,
         ; calls llvm.aarch64.neon.sabd.v4i16 on those halves, zero-extends the
         ; result to 32-bit lanes and adds it to the accumulator %a. The expected
         ; output is sabal2 with a .4s destination and full .8h sources.
    1266 ; CHECK-LABEL: test_vabal_high_s16:
    1267 ; CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
    1268 entry:
    1269   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    1270   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    1271   %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
    1272   %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
    1273   %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
    1274   ret <4 x i32> %add.i.i
    1275 }
   1276 
    1277 define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
         ; Extracts lanes 2-3 (the upper half) of %b and of %c with shufflevector,
         ; calls llvm.aarch64.neon.sabd.v2i32 on those halves, zero-extends the
         ; result to 64-bit lanes and adds it to the accumulator %a. The expected
         ; output is sabal2 with a .2d destination and full .4s sources.
    1278 ; CHECK-LABEL: test_vabal_high_s32:
    1279 ; CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    1280 entry:
    1281   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    1282   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    1283   %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
    1284   %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
    1285   %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
    1286   ret <2 x i64> %add.i.i
    1287 }
   1288 
    1289 define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
         ; Extracts lanes 8-15 (the upper half) of %b and of %c with shufflevector,
         ; calls llvm.aarch64.neon.uabd.v8i8 on those halves, zero-extends the result
         ; to 16-bit lanes and adds it to the accumulator %a. The expected output is
         ; uabal2 with a .8h destination and full .16b sources.
    1290 ; CHECK-LABEL: test_vabal_high_u8:
    1291 ; CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    1292 entry:
    1293   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    1294   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    1295   %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
    1296   %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
    1297   %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
    1298   ret <8 x i16> %add.i.i
    1299 }
   1300 
    1301 define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
         ; Extracts lanes 4-7 (the upper half) of %b and of %c with shufflevector,
         ; calls llvm.aarch64.neon.uabd.v4i16 on those halves, zero-extends the
         ; result to 32-bit lanes and adds it to the accumulator %a. The expected
         ; output is uabal2 with a .4s destination and full .8h sources.
    1302 ; CHECK-LABEL: test_vabal_high_u16:
    1303 ; CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
    1304 entry:
    1305   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    1306   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    1307   %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
    1308   %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
    1309   %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
    1310   ret <4 x i32> %add.i.i
    1311 }
   1312 
    1313 define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
         ; Extracts lanes 2-3 (the upper half) of %b and of %c with shufflevector,
         ; calls llvm.aarch64.neon.uabd.v2i32 on those halves, zero-extends the
         ; result to 64-bit lanes and adds it to the accumulator %a. The expected
         ; output is uabal2 with a .2d destination and full .4s sources.
    1314 ; CHECK-LABEL: test_vabal_high_u32:
    1315 ; CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
    1316 entry:
    1317   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    1318   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    1319   %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
    1320   %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
    1321   %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
    1322   ret <2 x i64> %add.i.i
    1323 }
   1324 
   1325 define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) {
   1326 ; CHECK-LABEL: test_vmull_s8:
   1327 ; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
   1328 entry:
   1329   %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
   1330   ret <8 x i16> %vmull.i
   1331 }
   1332 
   1333 define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) {
   1334 ; CHECK-LABEL: test_vmull_s16:
   1335 ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1336 entry:
   1337   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
   1338   ret <4 x i32> %vmull2.i
   1339 }
   1340 
   1341 define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) {
   1342 ; CHECK-LABEL: test_vmull_s32:
   1343 ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1344 entry:
   1345   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
   1346   ret <2 x i64> %vmull2.i
   1347 }
   1348 
   1349 define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) {
   1350 ; CHECK-LABEL: test_vmull_u8:
   1351 ; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
   1352 entry:
   1353   %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
   1354   ret <8 x i16> %vmull.i
   1355 }
   1356 
   1357 define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) {
   1358 ; CHECK-LABEL: test_vmull_u16:
   1359 ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1360 entry:
   1361   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
   1362   ret <4 x i32> %vmull2.i
   1363 }
   1364 
   1365 define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) {
   1366 ; CHECK-LABEL: test_vmull_u32:
   1367 ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1368 entry:
   1369   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
   1370   ret <2 x i64> %vmull2.i
   1371 }
   1372 
   1373 define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) {
   1374 ; CHECK-LABEL: test_vmull_high_s8:
   1375 ; CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
   1376 entry:
   1377   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1378   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1379   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   1380   ret <8 x i16> %vmull.i.i
   1381 }
   1382 
   1383 define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
   1384 ; CHECK-LABEL: test_vmull_high_s16:
   1385 ; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1386 entry:
   1387   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1388   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1389   %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1390   ret <4 x i32> %vmull2.i.i
   1391 }
   1392 
   1393 define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
   1394 ; CHECK-LABEL: test_vmull_high_s32:
   1395 ; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1396 entry:
   1397   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1398   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1399   %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1400   ret <2 x i64> %vmull2.i.i
   1401 }
   1402 
   1403 define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) {
   1404 ; CHECK-LABEL: test_vmull_high_u8:
   1405 ; CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
   1406 entry:
   1407   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1408   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1409   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   1410   ret <8 x i16> %vmull.i.i
   1411 }
   1412 
   1413 define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) {
   1414 ; CHECK-LABEL: test_vmull_high_u16:
   1415 ; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1416 entry:
   1417   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1418   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1419   %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1420   ret <4 x i32> %vmull2.i.i
   1421 }
   1422 
   1423 define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) {
   1424 ; CHECK-LABEL: test_vmull_high_u32:
   1425 ; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1426 entry:
   1427   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1428   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1429   %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1430   ret <2 x i64> %vmull2.i.i
   1431 }
   1432 
   1433 define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
   1434 ; CHECK-LABEL: test_vmlal_s8:
   1435 ; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
   1436 entry:
   1437   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
   1438   %add.i = add <8 x i16> %vmull.i.i, %a
   1439   ret <8 x i16> %add.i
   1440 }
   1441 
   1442 define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
   1443 ; CHECK-LABEL: test_vmlal_s16:
   1444 ; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1445 entry:
   1446   %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
   1447   %add.i = add <4 x i32> %vmull2.i.i, %a
   1448   ret <4 x i32> %add.i
   1449 }
   1450 
   1451 define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
   1452 ; CHECK-LABEL: test_vmlal_s32:
   1453 ; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1454 entry:
   1455   %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
   1456   %add.i = add <2 x i64> %vmull2.i.i, %a
   1457   ret <2 x i64> %add.i
   1458 }
   1459 
   1460 define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
   1461 ; CHECK-LABEL: test_vmlal_u8:
   1462 ; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
   1463 entry:
   1464   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
   1465   %add.i = add <8 x i16> %vmull.i.i, %a
   1466   ret <8 x i16> %add.i
   1467 }
   1468 
   1469 define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
   1470 ; CHECK-LABEL: test_vmlal_u16:
   1471 ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1472 entry:
   1473   %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
   1474   %add.i = add <4 x i32> %vmull2.i.i, %a
   1475   ret <4 x i32> %add.i
   1476 }
   1477 
   1478 define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
   1479 ; CHECK-LABEL: test_vmlal_u32:
   1480 ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1481 entry:
   1482   %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
   1483   %add.i = add <2 x i64> %vmull2.i.i, %a
   1484   ret <2 x i64> %add.i
   1485 }
   1486 
   1487 define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
   1488 ; CHECK-LABEL: test_vmlal_high_s8:
   1489 ; CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
   1490 entry:
   1491   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1492   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1493   %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   1494   %add.i.i = add <8 x i16> %vmull.i.i.i, %a
   1495   ret <8 x i16> %add.i.i
   1496 }
   1497 
   1498 define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
   1499 ; CHECK-LABEL: test_vmlal_high_s16:
   1500 ; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1501 entry:
   1502   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1503   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1504   %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1505   %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
   1506   ret <4 x i32> %add.i.i
   1507 }
   1508 
   1509 define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
   1510 ; CHECK-LABEL: test_vmlal_high_s32:
   1511 ; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1512 entry:
   1513   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1514   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1515   %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1516   %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
   1517   ret <2 x i64> %add.i.i
   1518 }
   1519 
   1520 define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
   1521 ; CHECK-LABEL: test_vmlal_high_u8:
   1522 ; CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
   1523 entry:
   1524   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1525   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1526   %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   1527   %add.i.i = add <8 x i16> %vmull.i.i.i, %a
   1528   ret <8 x i16> %add.i.i
   1529 }
   1530 
   1531 define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
   1532 ; CHECK-LABEL: test_vmlal_high_u16:
   1533 ; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1534 entry:
   1535   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1536   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1537   %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1538   %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
   1539   ret <4 x i32> %add.i.i
   1540 }
   1541 
   1542 define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
   1543 ; CHECK-LABEL: test_vmlal_high_u32:
   1544 ; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1545 entry:
   1546   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1547   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1548   %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1549   %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
   1550   ret <2 x i64> %add.i.i
   1551 }
   1552 
   1553 define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
   1554 ; CHECK-LABEL: test_vmlsl_s8:
   1555 ; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
   1556 entry:
   1557   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
   1558   %sub.i = sub <8 x i16> %a, %vmull.i.i
   1559   ret <8 x i16> %sub.i
   1560 }
   1561 
   1562 define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
   1563 ; CHECK-LABEL: test_vmlsl_s16:
   1564 ; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1565 entry:
   1566   %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
   1567   %sub.i = sub <4 x i32> %a, %vmull2.i.i
   1568   ret <4 x i32> %sub.i
   1569 }
   1570 
   1571 define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
   1572 ; CHECK-LABEL: test_vmlsl_s32:
   1573 ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1574 entry:
   1575   %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
   1576   %sub.i = sub <2 x i64> %a, %vmull2.i.i
   1577   ret <2 x i64> %sub.i
   1578 }
   1579 
   1580 define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
   1581 ; CHECK-LABEL: test_vmlsl_u8:
   1582 ; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
   1583 entry:
   1584   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
   1585   %sub.i = sub <8 x i16> %a, %vmull.i.i
   1586   ret <8 x i16> %sub.i
   1587 }
   1588 
   1589 define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
   1590 ; CHECK-LABEL: test_vmlsl_u16:
   1591 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1592 entry:
   1593   %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
   1594   %sub.i = sub <4 x i32> %a, %vmull2.i.i
   1595   ret <4 x i32> %sub.i
   1596 }
   1597 
   1598 define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
   1599 ; CHECK-LABEL: test_vmlsl_u32:
   1600 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1601 entry:
   1602   %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
   1603   %sub.i = sub <2 x i64> %a, %vmull2.i.i
   1604   ret <2 x i64> %sub.i
   1605 }
   1606 
   1607 define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
   1608 ; CHECK-LABEL: test_vmlsl_high_s8:
   1609 ; CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
   1610 entry:
   1611   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1612   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1613   %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   1614   %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
   1615   ret <8 x i16> %sub.i.i
   1616 }
   1617 
   1618 define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
   1619 ; CHECK-LABEL: test_vmlsl_high_s16:
   1620 ; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1621 entry:
   1622   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1623   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1624   %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1625   %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
   1626   ret <4 x i32> %sub.i.i
   1627 }
   1628 
   1629 define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
   1630 ; CHECK-LABEL: test_vmlsl_high_s32:
   1631 ; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1632 entry:
   1633   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1634   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1635   %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1636   %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
   1637   ret <2 x i64> %sub.i.i
   1638 }
   1639 
   1640 define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
   1641 ; CHECK-LABEL: test_vmlsl_high_u8:
   1642 ; CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
   1643 entry:
   1644   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1645   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1646   %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   1647   %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
   1648   ret <8 x i16> %sub.i.i
   1649 }
   1650 
   1651 define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
   1652 ; CHECK-LABEL: test_vmlsl_high_u16:
   1653 ; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1654 entry:
   1655   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1656   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1657   %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1658   %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
   1659   ret <4 x i32> %sub.i.i
   1660 }
   1661 
   1662 define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
   1663 ; CHECK-LABEL: test_vmlsl_high_u32:
   1664 ; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1665 entry:
   1666   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1667   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1668   %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1669   %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
   1670   ret <2 x i64> %sub.i.i
   1671 }
   1672 
   1673 define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) {
   1674 ; CHECK-LABEL: test_vqdmull_s16:
   1675 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1676 entry:
   1677   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
   1678   ret <4 x i32> %vqdmull2.i
   1679 }
   1680 
   1681 define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) {
   1682 ; CHECK-LABEL: test_vqdmull_s32:
   1683 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1684 entry:
   1685   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
   1686   ret <2 x i64> %vqdmull2.i
   1687 }
   1688 
   1689 define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
   1690 ; CHECK-LABEL: test_vqdmlal_s16:
   1691 ; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1692 entry:
   1693   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
   1694   %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   1695   ret <4 x i32> %vqdmlal4.i
   1696 }
   1697 
   1698 define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
   1699 ; CHECK-LABEL: test_vqdmlal_s32:
   1700 ; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1701 entry:
   1702   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
   1703   %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   1704   ret <2 x i64> %vqdmlal4.i
   1705 }
   1706 
   1707 define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
   1708 ; CHECK-LABEL: test_vqdmlsl_s16:
   1709 ; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
   1710 entry:
   1711   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
   1712   %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   1713   ret <4 x i32> %vqdmlsl4.i
   1714 }
   1715 
   1716 define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
   1717 ; CHECK-LABEL: test_vqdmlsl_s32:
   1718 ; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   1719 entry:
   1720   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
   1721   %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   1722   ret <2 x i64> %vqdmlsl4.i
   1723 }
   1724 
   1725 define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
   1726 ; CHECK-LABEL: test_vqdmull_high_s16:
   1727 ; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1728 entry:
   1729   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1730   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1731   %vqdmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1732   ret <4 x i32> %vqdmull2.i.i
   1733 }
   1734 
   1735 define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
   1736 ; CHECK-LABEL: test_vqdmull_high_s32:
   1737 ; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1738 entry:
   1739   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1740   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1741   %vqdmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1742   ret <2 x i64> %vqdmull2.i.i
   1743 }
   1744 
   1745 define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
   1746 ; CHECK-LABEL: test_vqdmlal_high_s16:
   1747 ; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1748 entry:
   1749   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1750   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1751   %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1752   %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
   1753   ret <4 x i32> %vqdmlal4.i.i
   1754 }
   1755 
   1756 define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
   1757 ; CHECK-LABEL: test_vqdmlal_high_s32:
   1758 ; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1759 entry:
   1760   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1761   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1762   %vqdmlal2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1763   %vqdmlal4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
   1764   ret <2 x i64> %vqdmlal4.i.i
   1765 }
   1766 
   1767 define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
   1768 ; CHECK-LABEL: test_vqdmlsl_high_s16:
   1769 ; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
   1770 entry:
   1771   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1772   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   1773   %vqdmlsl2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   1774   %vqdmlsl4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
   1775   ret <4 x i32> %vqdmlsl4.i.i
   1776 }
   1777 
   1778 define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
   1779 ; CHECK-LABEL: test_vqdmlsl_high_s32:
   1780 ; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   1781 entry:
   1782   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1783   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   1784   %vqdmlsl2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   1785   %vqdmlsl4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
   1786   ret <2 x i64> %vqdmlsl4.i.i
   1787 }
   1788 
   1789 define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) {
   1790 ; CHECK-LABEL: test_vmull_p8:
   1791 ; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
   1792 entry:
   1793   %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
   1794   ret <8 x i16> %vmull.i
   1795 }
   1796 
   1797 define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) {
   1798 ; CHECK-LABEL: test_vmull_high_p8:
   1799 ; CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
   1800 entry:
   1801   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1802   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1803   %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   1804   ret <8 x i16> %vmull.i.i
   1805 }
   1806 
   1807 define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
   1808 ; CHECK-LABEL: test_vmull_p64
   1809 ; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
   1810 entry:
   1811   %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
   1812   %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
   1813   ret i128 %vmull3.i
   1814 }
   1815 
   1816 define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
   1817 ; CHECK-LABEL: test_vmull_high_p64
   1818 ; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
   1819 entry:
   1820   %0 = extractelement <2 x i64> %a, i32 1
   1821   %1 = extractelement <2 x i64> %b, i32 1
   1822   %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %0, i64 %1) #1
   1823   %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
   1824   ret i128 %vmull3.i.i
   1825 }
   1826 
   1827 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
   1828 
   1829 
   1830