; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
; AArch64 NEON codegen tests for add-family instructions (addhn, raddhn,
; saddl/uaddl, saddw/uaddw, saddlp/uaddlp, sadalp/uadalp, addp, faddp).

      3 define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
      4 ;CHECK-LABEL: addhn8b:
      5 ;CHECK: addhn.8b
      6         %tmp1 = load <8 x i16>, <8 x i16>* %A
      7         %tmp2 = load <8 x i16>, <8 x i16>* %B
      8         %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
      9         ret <8 x i8> %tmp3
     10 }
     11 
     12 define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
     13 ;CHECK-LABEL: addhn4h:
     14 ;CHECK: addhn.4h
     15         %tmp1 = load <4 x i32>, <4 x i32>* %A
     16         %tmp2 = load <4 x i32>, <4 x i32>* %B
     17         %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
     18         ret <4 x i16> %tmp3
     19 }
     20 
     21 define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
     22 ;CHECK-LABEL: addhn2s:
     23 ;CHECK: addhn.2s
     24         %tmp1 = load <2 x i64>, <2 x i64>* %A
     25         %tmp2 = load <2 x i64>, <2 x i64>* %B
     26         %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
     27         ret <2 x i32> %tmp3
     28 }
     29 
     30 define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
     31 ;CHECK-LABEL: addhn2_16b:
     32 ;CHECK: addhn.8b
     33 ;CHECK-NEXT: addhn2.16b
     34   %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
     35   %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
     36   %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     37   ret <16 x i8> %res
     38 }
     39 
     40 define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
     41 ;CHECK-LABEL: addhn2_8h:
     42 ;CHECK: addhn.4h
     43 ;CHECK-NEXT: addhn2.8h
     44   %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
     45   %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
     46   %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
     47   ret <8 x i16> %res
     48 }
     49 
     50 define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
     51 ;CHECK-LABEL: addhn2_4s:
     52 ;CHECK: addhn.2s
     53 ;CHECK-NEXT: addhn2.4s
     54   %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
     55   %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
     56   %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
     57   ret <4 x i32> %res
     58 }
     59 
     60 declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
     61 declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
     62 declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
     63 
     64 
     65 define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
     66 ;CHECK-LABEL: raddhn8b:
     67 ;CHECK: raddhn.8b
     68         %tmp1 = load <8 x i16>, <8 x i16>* %A
     69         %tmp2 = load <8 x i16>, <8 x i16>* %B
     70         %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
     71         ret <8 x i8> %tmp3
     72 }
     73 
     74 define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
     75 ;CHECK-LABEL: raddhn4h:
     76 ;CHECK: raddhn.4h
     77         %tmp1 = load <4 x i32>, <4 x i32>* %A
     78         %tmp2 = load <4 x i32>, <4 x i32>* %B
     79         %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
     80         ret <4 x i16> %tmp3
     81 }
     82 
     83 define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
     84 ;CHECK-LABEL: raddhn2s:
     85 ;CHECK: raddhn.2s
     86         %tmp1 = load <2 x i64>, <2 x i64>* %A
     87         %tmp2 = load <2 x i64>, <2 x i64>* %B
     88         %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
     89         ret <2 x i32> %tmp3
     90 }
     91 
     92 define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
     93 ;CHECK-LABEL: raddhn2_16b:
     94 ;CHECK: raddhn.8b
     95 ;CHECK-NEXT: raddhn2.16b
     96   %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
     97   %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
     98   %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     99   ret <16 x i8> %res
    100 }
    101 
    102 define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
    103 ;CHECK-LABEL: raddhn2_8h:
    104 ;CHECK: raddhn.4h
    105 ;CHECK-NEXT: raddhn2.8h
    106   %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
    107   %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
    108   %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    109   ret <8 x i16> %res
    110 }
    111 
    112 define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
    113 ;CHECK-LABEL: raddhn2_4s:
    114 ;CHECK: raddhn.2s
    115 ;CHECK-NEXT: raddhn2.4s
    116   %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
    117   %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
    118   %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    119   ret <4 x i32> %res
    120 }
    121 
    122 declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
    123 declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
    124 declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
    125 
    126 define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
    127 ;CHECK-LABEL: saddl8h:
    128 ;CHECK: saddl.8h
    129         %tmp1 = load <8 x i8>, <8 x i8>* %A
    130         %tmp2 = load <8 x i8>, <8 x i8>* %B
    131   %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
    132   %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
    133   %tmp5 = add <8 x i16> %tmp3, %tmp4
    134         ret <8 x i16> %tmp5
    135 }
    136 
    137 define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
    138 ;CHECK-LABEL: saddl4s:
    139 ;CHECK: saddl.4s
    140         %tmp1 = load <4 x i16>, <4 x i16>* %A
    141         %tmp2 = load <4 x i16>, <4 x i16>* %B
    142   %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
    143   %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
    144   %tmp5 = add <4 x i32> %tmp3, %tmp4
    145         ret <4 x i32> %tmp5
    146 }
    147 
    148 define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
    149 ;CHECK-LABEL: saddl2d:
    150 ;CHECK: saddl.2d
    151         %tmp1 = load <2 x i32>, <2 x i32>* %A
    152         %tmp2 = load <2 x i32>, <2 x i32>* %B
    153   %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
    154   %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
    155   %tmp5 = add <2 x i64> %tmp3, %tmp4
    156         ret <2 x i64> %tmp5
    157 }
    158 
    159 define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
    160 ; CHECK-LABEL: saddl2_8h:
    161 ; CHECK-NEXT: saddl2.8h v0, v0, v1
    162 ; CHECK-NEXT: ret
    163   %tmp = bitcast <16 x i8> %a to <2 x i64>
    164   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
    165   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
    166   %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
    167   %tmp2 = bitcast <16 x i8> %b to <2 x i64>
    168   %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
    169   %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
    170   %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
    171   %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
    172   ret <8 x i16> %add.i
    173 }
    174 
    175 define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
    176 ; CHECK-LABEL: saddl2_4s:
    177 ; CHECK-NEXT: saddl2.4s v0, v0, v1
    178 ; CHECK-NEXT: ret
    179   %tmp = bitcast <8 x i16> %a to <2 x i64>
    180   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
    181   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
    182   %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
    183   %tmp2 = bitcast <8 x i16> %b to <2 x i64>
    184   %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
    185   %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
    186   %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
    187   %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
    188   ret <4 x i32> %add.i
    189 }
    190 
    191 define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
    192 ; CHECK-LABEL: saddl2_2d:
    193 ; CHECK-NEXT: saddl2.2d v0, v0, v1
    194 ; CHECK-NEXT: ret
    195   %tmp = bitcast <4 x i32> %a to <2 x i64>
    196   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
    197   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
    198   %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
    199   %tmp2 = bitcast <4 x i32> %b to <2 x i64>
    200   %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
    201   %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
    202   %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
    203   %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
    204   ret <2 x i64> %add.i
    205 }
    206 
    207 define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
    208 ;CHECK-LABEL: uaddl8h:
    209 ;CHECK: uaddl.8h
    210   %tmp1 = load <8 x i8>, <8 x i8>* %A
    211   %tmp2 = load <8 x i8>, <8 x i8>* %B
    212   %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
    213   %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
    214   %tmp5 = add <8 x i16> %tmp3, %tmp4
    215   ret <8 x i16> %tmp5
    216 }
    217 
    218 define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
    219 ;CHECK-LABEL: uaddl4s:
    220 ;CHECK: uaddl.4s
    221   %tmp1 = load <4 x i16>, <4 x i16>* %A
    222   %tmp2 = load <4 x i16>, <4 x i16>* %B
    223   %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
    224   %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
    225   %tmp5 = add <4 x i32> %tmp3, %tmp4
    226   ret <4 x i32> %tmp5
    227 }
    228 
    229 define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
    230 ;CHECK-LABEL: uaddl2d:
    231 ;CHECK: uaddl.2d
    232   %tmp1 = load <2 x i32>, <2 x i32>* %A
    233   %tmp2 = load <2 x i32>, <2 x i32>* %B
    234   %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
    235   %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
    236   %tmp5 = add <2 x i64> %tmp3, %tmp4
    237   ret <2 x i64> %tmp5
    238 }
    239 
    240 
    241 define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind  {
    242 ; CHECK-LABEL: uaddl2_8h:
    243 ; CHECK-NEXT: uaddl2.8h v0, v0, v1
    244 ; CHECK-NEXT: ret
    245   %tmp = bitcast <16 x i8> %a to <2 x i64>
    246   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
    247   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
    248   %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
    249   %tmp2 = bitcast <16 x i8> %b to <2 x i64>
    250   %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
    251   %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
    252   %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
    253   %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
    254   ret <8 x i16> %add.i
    255 }
    256 
    257 define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind  {
    258 ; CHECK-LABEL: uaddl2_4s:
    259 ; CHECK-NEXT: uaddl2.4s v0, v0, v1
    260 ; CHECK-NEXT: ret
    261   %tmp = bitcast <8 x i16> %a to <2 x i64>
    262   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
    263   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
    264   %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
    265   %tmp2 = bitcast <8 x i16> %b to <2 x i64>
    266   %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
    267   %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
    268   %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
    269   %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
    270   ret <4 x i32> %add.i
    271 }
    272 
    273 define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind  {
    274 ; CHECK-LABEL: uaddl2_2d:
    275 ; CHECK-NEXT: uaddl2.2d v0, v0, v1
    276 ; CHECK-NEXT: ret
    277   %tmp = bitcast <4 x i32> %a to <2 x i64>
    278   %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
    279   %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
    280   %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
    281   %tmp2 = bitcast <4 x i32> %b to <2 x i64>
    282   %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
    283   %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
    284   %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
    285   %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
    286   ret <2 x i64> %add.i
    287 }
    288 
    289 define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
    290 ;CHECK-LABEL: uaddw8h:
    291 ;CHECK: uaddw.8h
    292         %tmp1 = load <8 x i16>, <8 x i16>* %A
    293         %tmp2 = load <8 x i8>, <8 x i8>* %B
    294   %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
    295   %tmp4 = add <8 x i16> %tmp1, %tmp3
    296         ret <8 x i16> %tmp4
    297 }
    298 
    299 define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
    300 ;CHECK-LABEL: uaddw4s:
    301 ;CHECK: uaddw.4s
    302         %tmp1 = load <4 x i32>, <4 x i32>* %A
    303         %tmp2 = load <4 x i16>, <4 x i16>* %B
    304   %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
    305   %tmp4 = add <4 x i32> %tmp1, %tmp3
    306         ret <4 x i32> %tmp4
    307 }
    308 
    309 define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
    310 ;CHECK-LABEL: uaddw2d:
    311 ;CHECK: uaddw.2d
    312         %tmp1 = load <2 x i64>, <2 x i64>* %A
    313         %tmp2 = load <2 x i32>, <2 x i32>* %B
    314   %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
    315   %tmp4 = add <2 x i64> %tmp1, %tmp3
    316         ret <2 x i64> %tmp4
    317 }
    318 
    319 define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
    320 ;CHECK-LABEL: uaddw2_8h:
    321 ;CHECK: uaddw2.8h
    322         %tmp1 = load <8 x i16>, <8 x i16>* %A
    323 
    324         %tmp2 = load <16 x i8>, <16 x i8>* %B
    325         %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    326         %ext2 = zext <8 x i8> %high2 to <8 x i16>
    327 
    328         %res = add <8 x i16> %tmp1, %ext2
    329         ret <8 x i16> %res
    330 }
    331 
    332 define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
    333 ;CHECK-LABEL: uaddw2_4s:
    334 ;CHECK: uaddw2.4s
    335         %tmp1 = load <4 x i32>, <4 x i32>* %A
    336 
    337         %tmp2 = load <8 x i16>, <8 x i16>* %B
    338         %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    339         %ext2 = zext <4 x i16> %high2 to <4 x i32>
    340 
    341         %res = add <4 x i32> %tmp1, %ext2
    342         ret <4 x i32> %res
    343 }
    344 
    345 define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
    346 ;CHECK-LABEL: uaddw2_2d:
    347 ;CHECK: uaddw2.2d
    348         %tmp1 = load <2 x i64>, <2 x i64>* %A
    349 
    350         %tmp2 = load <4 x i32>, <4 x i32>* %B
    351         %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    352         %ext2 = zext <2 x i32> %high2 to <2 x i64>
    353 
    354         %res = add <2 x i64> %tmp1, %ext2
    355         ret <2 x i64> %res
    356 }
    357 
    358 define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
    359 ;CHECK-LABEL: saddw8h:
    360 ;CHECK: saddw.8h
    361         %tmp1 = load <8 x i16>, <8 x i16>* %A
    362         %tmp2 = load <8 x i8>, <8 x i8>* %B
    363         %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
    364         %tmp4 = add <8 x i16> %tmp1, %tmp3
    365         ret <8 x i16> %tmp4
    366 }
    367 
    368 define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
    369 ;CHECK-LABEL: saddw4s:
    370 ;CHECK: saddw.4s
    371         %tmp1 = load <4 x i32>, <4 x i32>* %A
    372         %tmp2 = load <4 x i16>, <4 x i16>* %B
    373         %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
    374         %tmp4 = add <4 x i32> %tmp1, %tmp3
    375         ret <4 x i32> %tmp4
    376 }
    377 
    378 define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
    379 ;CHECK-LABEL: saddw2d:
    380 ;CHECK: saddw.2d
    381         %tmp1 = load <2 x i64>, <2 x i64>* %A
    382         %tmp2 = load <2 x i32>, <2 x i32>* %B
    383         %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
    384         %tmp4 = add <2 x i64> %tmp1, %tmp3
    385         ret <2 x i64> %tmp4
    386 }
    387 
    388 define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
    389 ;CHECK-LABEL: saddw2_8h:
    390 ;CHECK: saddw2.8h
    391         %tmp1 = load <8 x i16>, <8 x i16>* %A
    392 
    393         %tmp2 = load <16 x i8>, <16 x i8>* %B
    394         %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    395         %ext2 = sext <8 x i8> %high2 to <8 x i16>
    396 
    397         %res = add <8 x i16> %tmp1, %ext2
    398         ret <8 x i16> %res
    399 }
    400 
    401 define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
    402 ;CHECK-LABEL: saddw2_4s:
    403 ;CHECK: saddw2.4s
    404         %tmp1 = load <4 x i32>, <4 x i32>* %A
    405 
    406         %tmp2 = load <8 x i16>, <8 x i16>* %B
    407         %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    408         %ext2 = sext <4 x i16> %high2 to <4 x i32>
    409 
    410         %res = add <4 x i32> %tmp1, %ext2
    411         ret <4 x i32> %res
    412 }
    413 
    414 define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
    415 ;CHECK-LABEL: saddw2_2d:
    416 ;CHECK: saddw2.2d
    417         %tmp1 = load <2 x i64>, <2 x i64>* %A
    418 
    419         %tmp2 = load <4 x i32>, <4 x i32>* %B
    420         %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    421         %ext2 = sext <2 x i32> %high2 to <2 x i64>
    422 
    423         %res = add <2 x i64> %tmp1, %ext2
    424         ret <2 x i64> %res
    425 }
    426 
    427 define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
    428 ;CHECK-LABEL: saddlp4h:
    429 ;CHECK: saddlp.4h
    430         %tmp1 = load <8 x i8>, <8 x i8>* %A
    431         %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
    432         ret <4 x i16> %tmp3
    433 }
    434 
    435 define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
    436 ;CHECK-LABEL: saddlp2s:
    437 ;CHECK: saddlp.2s
    438         %tmp1 = load <4 x i16>, <4 x i16>* %A
    439         %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
    440         ret <2 x i32> %tmp3
    441 }
    442 
    443 define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
    444 ;CHECK-LABEL: saddlp1d:
    445 ;CHECK: saddlp.1d
    446         %tmp1 = load <2 x i32>, <2 x i32>* %A
    447         %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
    448         ret <1 x i64> %tmp3
    449 }
    450 
    451 define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
    452 ;CHECK-LABEL: saddlp8h:
    453 ;CHECK: saddlp.8h
    454         %tmp1 = load <16 x i8>, <16 x i8>* %A
    455         %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
    456         ret <8 x i16> %tmp3
    457 }
    458 
    459 define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
    460 ;CHECK-LABEL: saddlp4s:
    461 ;CHECK: saddlp.4s
    462         %tmp1 = load <8 x i16>, <8 x i16>* %A
    463         %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
    464         ret <4 x i32> %tmp3
    465 }
    466 
    467 define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
    468 ;CHECK-LABEL: saddlp2d:
    469 ;CHECK: saddlp.2d
    470         %tmp1 = load <4 x i32>, <4 x i32>* %A
    471         %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
    472         ret <2 x i64> %tmp3
    473 }
    474 
    475 declare <4 x i16>  @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
    476 declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
    477 declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
    478 
    479 declare <8 x i16>  @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
    480 declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
    481 declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
    482 
    483 define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
    484 ;CHECK-LABEL: uaddlp4h:
    485 ;CHECK: uaddlp.4h
    486         %tmp1 = load <8 x i8>, <8 x i8>* %A
    487         %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
    488         ret <4 x i16> %tmp3
    489 }
    490 
    491 define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
    492 ;CHECK-LABEL: uaddlp2s:
    493 ;CHECK: uaddlp.2s
    494         %tmp1 = load <4 x i16>, <4 x i16>* %A
    495         %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
    496         ret <2 x i32> %tmp3
    497 }
    498 
    499 define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
    500 ;CHECK-LABEL: uaddlp1d:
    501 ;CHECK: uaddlp.1d
    502         %tmp1 = load <2 x i32>, <2 x i32>* %A
    503         %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
    504         ret <1 x i64> %tmp3
    505 }
    506 
    507 define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
    508 ;CHECK-LABEL: uaddlp8h:
    509 ;CHECK: uaddlp.8h
    510         %tmp1 = load <16 x i8>, <16 x i8>* %A
    511         %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
    512         ret <8 x i16> %tmp3
    513 }
    514 
    515 define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
    516 ;CHECK-LABEL: uaddlp4s:
    517 ;CHECK: uaddlp.4s
    518         %tmp1 = load <8 x i16>, <8 x i16>* %A
    519         %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
    520         ret <4 x i32> %tmp3
    521 }
    522 
    523 define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
    524 ;CHECK-LABEL: uaddlp2d:
    525 ;CHECK: uaddlp.2d
    526         %tmp1 = load <4 x i32>, <4 x i32>* %A
    527         %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
    528         ret <2 x i64> %tmp3
    529 }
    530 
    531 declare <4 x i16>  @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
    532 declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
    533 declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
    534 
    535 declare <8 x i16>  @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
    536 declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
    537 declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
    538 
    539 define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
    540 ;CHECK-LABEL: sadalp4h:
    541 ;CHECK: sadalp.4h
    542         %tmp1 = load <8 x i8>, <8 x i8>* %A
    543         %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
    544         %tmp4 = load <4 x i16>, <4 x i16>* %B
    545         %tmp5 = add <4 x i16> %tmp3, %tmp4
    546         ret <4 x i16> %tmp5
    547 }
    548 
    549 define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
    550 ;CHECK-LABEL: sadalp2s:
    551 ;CHECK: sadalp.2s
    552         %tmp1 = load <4 x i16>, <4 x i16>* %A
    553         %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
    554         %tmp4 = load <2 x i32>, <2 x i32>* %B
    555         %tmp5 = add <2 x i32> %tmp3, %tmp4
    556         ret <2 x i32> %tmp5
    557 }
    558 
    559 define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
    560 ;CHECK-LABEL: sadalp8h:
    561 ;CHECK: sadalp.8h
    562         %tmp1 = load <16 x i8>, <16 x i8>* %A
    563         %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
    564         %tmp4 = load <8 x i16>, <8 x i16>* %B
    565         %tmp5 = add <8 x i16> %tmp3, %tmp4
    566         ret <8 x i16> %tmp5
    567 }
    568 
    569 define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
    570 ;CHECK-LABEL: sadalp4s:
    571 ;CHECK: sadalp.4s
    572         %tmp1 = load <8 x i16>, <8 x i16>* %A
    573         %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
    574         %tmp4 = load <4 x i32>, <4 x i32>* %B
    575         %tmp5 = add <4 x i32> %tmp3, %tmp4
    576         ret <4 x i32> %tmp5
    577 }
    578 
    579 define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
    580 ;CHECK-LABEL: sadalp2d:
    581 ;CHECK: sadalp.2d
    582         %tmp1 = load <4 x i32>, <4 x i32>* %A
    583         %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
    584         %tmp4 = load <2 x i64>, <2 x i64>* %B
    585         %tmp5 = add <2 x i64> %tmp3, %tmp4
    586         ret <2 x i64> %tmp5
    587 }
    588 
    589 define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
    590 ;CHECK-LABEL: uadalp4h:
    591 ;CHECK: uadalp.4h
    592         %tmp1 = load <8 x i8>, <8 x i8>* %A
    593         %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
    594         %tmp4 = load <4 x i16>, <4 x i16>* %B
    595         %tmp5 = add <4 x i16> %tmp3, %tmp4
    596         ret <4 x i16> %tmp5
    597 }
    598 
    599 define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
    600 ;CHECK-LABEL: uadalp2s:
    601 ;CHECK: uadalp.2s
    602         %tmp1 = load <4 x i16>, <4 x i16>* %A
    603         %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
    604         %tmp4 = load <2 x i32>, <2 x i32>* %B
    605         %tmp5 = add <2 x i32> %tmp3, %tmp4
    606         ret <2 x i32> %tmp5
    607 }
    608 
    609 define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
    610 ;CHECK-LABEL: uadalp8h:
    611 ;CHECK: uadalp.8h
    612         %tmp1 = load <16 x i8>, <16 x i8>* %A
    613         %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
    614         %tmp4 = load <8 x i16>, <8 x i16>* %B
    615         %tmp5 = add <8 x i16> %tmp3, %tmp4
    616         ret <8 x i16> %tmp5
    617 }
    618 
    619 define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
    620 ;CHECK-LABEL: uadalp4s:
    621 ;CHECK: uadalp.4s
    622         %tmp1 = load <8 x i16>, <8 x i16>* %A
    623         %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
    624         %tmp4 = load <4 x i32>, <4 x i32>* %B
    625         %tmp5 = add <4 x i32> %tmp3, %tmp4
    626         ret <4 x i32> %tmp5
    627 }
    628 
    629 define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
    630 ;CHECK-LABEL: uadalp2d:
    631 ;CHECK: uadalp.2d
    632         %tmp1 = load <4 x i32>, <4 x i32>* %A
    633         %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
    634         %tmp4 = load <2 x i64>, <2 x i64>* %B
    635         %tmp5 = add <2 x i64> %tmp3, %tmp4
    636         ret <2 x i64> %tmp5
    637 }
    638 
    639 define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
    640 ;CHECK-LABEL: addp_8b:
    641 ;CHECK: addp.8b
    642         %tmp1 = load <8 x i8>, <8 x i8>* %A
    643         %tmp2 = load <8 x i8>, <8 x i8>* %B
    644         %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    645         ret <8 x i8> %tmp3
    646 }
    647 
    648 define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
    649 ;CHECK-LABEL: addp_16b:
    650 ;CHECK: addp.16b
    651         %tmp1 = load <16 x i8>, <16 x i8>* %A
    652         %tmp2 = load <16 x i8>, <16 x i8>* %B
    653         %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
    654         ret <16 x i8> %tmp3
    655 }
    656 
    657 define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
    658 ;CHECK-LABEL: addp_4h:
    659 ;CHECK: addp.4h
    660         %tmp1 = load <4 x i16>, <4 x i16>* %A
    661         %tmp2 = load <4 x i16>, <4 x i16>* %B
    662         %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    663         ret <4 x i16> %tmp3
    664 }
    665 
    666 define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
    667 ;CHECK-LABEL: addp_8h:
    668 ;CHECK: addp.8h
    669         %tmp1 = load <8 x i16>, <8 x i16>* %A
    670         %tmp2 = load <8 x i16>, <8 x i16>* %B
    671         %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
    672         ret <8 x i16> %tmp3
    673 }
    674 
    675 define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
    676 ;CHECK-LABEL: addp_2s:
    677 ;CHECK: addp.2s
    678         %tmp1 = load <2 x i32>, <2 x i32>* %A
    679         %tmp2 = load <2 x i32>, <2 x i32>* %B
    680         %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    681         ret <2 x i32> %tmp3
    682 }
    683 
    684 define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
    685 ;CHECK-LABEL: addp_4s:
    686 ;CHECK: addp.4s
    687         %tmp1 = load <4 x i32>, <4 x i32>* %A
    688         %tmp2 = load <4 x i32>, <4 x i32>* %B
    689         %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
    690         ret <4 x i32> %tmp3
    691 }
    692 
    693 define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
        ; Loads the <2 x i64> operands behind %A and %B and passes them to the
        ; integer addp intrinsic; the output must contain an addp.2d instruction.
    694 ;CHECK-LABEL: addp_2d:
    695 ;CHECK: addp.2d
    696   %lhs = load <2 x i64>, <2 x i64>* %A
    697   %rhs = load <2 x i64>, <2 x i64>* %B
    698   %paired = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
    699   ret <2 x i64> %paired
    700 }
    701 
    702 declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
        ; Integer overloads of the addp intrinsic, one per vector type exercised
        ; by the addp_* tests in this file (v8i8 through v2i64).
    703 declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
    704 declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    705 declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    706 declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    707 declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    708 declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
    709 
    710 define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
        ; Floating-point variant: the addp intrinsic is called on <2 x float>
        ; operands and the output must contain a faddp.2s instruction.
    711 ;CHECK-LABEL: faddp_2s:
    712 ;CHECK: faddp.2s
    713   %lhs = load <2 x float>, <2 x float>* %A
    714   %rhs = load <2 x float>, <2 x float>* %B
    715   %paired = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
    716   ret <2 x float> %paired
    717 }
    718 
    719 define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
        ; Floating-point variant: the addp intrinsic is called on <4 x float>
        ; operands and the output must contain a faddp.4s instruction.
    720 ;CHECK-LABEL: faddp_4s:
    721 ;CHECK: faddp.4s
    722   %lhs = load <4 x float>, <4 x float>* %A
    723   %rhs = load <4 x float>, <4 x float>* %B
    724   %paired = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
    725   ret <4 x float> %paired
    726 }
    727 
    728 define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
        ; Floating-point variant: the addp intrinsic is called on <2 x double>
        ; operands and the output must contain a faddp.2d instruction.
    729 ;CHECK-LABEL: faddp_2d:
    730 ;CHECK: faddp.2d
    731   %lhs = load <2 x double>, <2 x double>* %A
    732   %rhs = load <2 x double>, <2 x double>* %B
    733   %paired = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
    734   ret <2 x double> %paired
    735 }
    736 
    737 declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
        ; Floating-point overloads of the addp intrinsic called by the faddp_*
        ; tests (v2f32, v4f32, v2f64).
    738 declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
    739 declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
    740 
    741 define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
        ; Zero-extending add of the high two lanes of %lhs with a two-lane splat
        ; of the scalar %rhs. The output must contain uaddl2.2d, and no ext.16b
        ; may appear between the function label and that instruction.
        ; The label directive ends in ':' so it matches the label definition
        ; itself, consistent with the other tests in this file.
    742 ; CHECK-LABEL: uaddl2_duprhs:
    743 ; CHECK-NOT: ext.16b
    744 ; CHECK: uaddl2.2d
        ; Build the splat by inserting %rhs into lane 0 and lane 1.
    745   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
    746   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
    747
        ; Lanes 2 and 3 are the high half of the 4-lane input.
    748   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    749
    750   %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
    751   %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
    752
    753   %res = add <2 x i64> %lhs.ext, %rhs.ext
    754   ret <2 x i64> %res
    755 }
    756 
    757 define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
        ; Sign-extending add of a two-lane splat of the scalar %lhs with the
        ; high two lanes of %rhs. The output must contain saddl2.2d, and no
        ; ext.16b may appear between the function label and that instruction.
        ; The label directive ends in ':' so it matches the label definition
        ; itself, consistent with the other tests in this file.
    758 ; CHECK-LABEL: saddl2_duplhs:
    759 ; CHECK-NOT: ext.16b
    760 ; CHECK: saddl2.2d
        ; Build the splat by inserting %lhs into lane 0 and lane 1.
    761   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
    762   %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
    763
        ; Lanes 2 and 3 are the high half of the 4-lane input.
    764   %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    765
    766   %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
    767   %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
    768
    769   %res = add <2 x i64> %lhs.ext, %rhs.ext
    770   ret <2 x i64> %res
    771 }
    772 
    773 define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
        ; Zero-extending subtract: high two lanes of %lhs minus a two-lane splat
        ; of the scalar %rhs. The output must contain usubl2.2d, and no ext.16b
        ; may appear between the function label and that instruction.
        ; The label directive ends in ':' so it matches the label definition
        ; itself, consistent with the other tests in this file.
    774 ; CHECK-LABEL: usubl2_duprhs:
    775 ; CHECK-NOT: ext.16b
    776 ; CHECK: usubl2.2d
        ; Build the splat by inserting %rhs into lane 0 and lane 1.
    777   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
    778   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
    779
        ; Lanes 2 and 3 are the high half of the 4-lane input.
    780   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    781
    782   %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
    783   %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
    784
    785   %res = sub <2 x i64> %lhs.ext, %rhs.ext
    786   ret <2 x i64> %res
    787 }
    788 
    789 define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
        ; Sign-extending subtract: a two-lane splat of the scalar %lhs minus the
        ; high two lanes of %rhs. The output must contain ssubl2.2d, and no
        ; ext.16b may appear between the function label and that instruction.
        ; The label directive ends in ':' so it matches the label definition
        ; itself, consistent with the other tests in this file.
    790 ; CHECK-LABEL: ssubl2_duplhs:
    791 ; CHECK-NOT: ext.16b
    792 ; CHECK: ssubl2.2d
        ; Build the splat by inserting %lhs into lane 0 and lane 1.
    793   %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
    794   %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
    795
        ; Lanes 2 and 3 are the high half of the 4-lane input.
    796   %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    797
    798   %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
    799   %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
    800
    801   %res = sub <2 x i64> %lhs.ext, %rhs.ext
    802   ret <2 x i64> %res
    803 }
    804 
    805 define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Plain-IR form with no intrinsic: add two <8 x i16> vectors, shift each
        ; lane right by 8 and truncate to i8. The output must contain addhn.8b.
    806 ;CHECK-LABEL: addhn8b_natural:
    807 ;CHECK: addhn.8b
    808   %a.val = load <8 x i16>, <8 x i16>* %A
    809   %b.val = load <8 x i16>, <8 x i16>* %B
    810   %wide.sum = add <8 x i16> %a.val, %b.val
    811   %upper = lshr <8 x i16> %wide.sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    812   %hi.half = trunc <8 x i16> %upper to <8 x i8>
    813   ret <8 x i8> %hi.half
    814 }
    815 
    816 define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; Plain-IR form with no intrinsic: add two <4 x i32> vectors, shift each
        ; lane right by 16 and truncate to i16. The output must contain addhn.4h.
    817 ;CHECK-LABEL: addhn4h_natural:
    818 ;CHECK: addhn.4h
    819   %a.val = load <4 x i32>, <4 x i32>* %A
    820   %b.val = load <4 x i32>, <4 x i32>* %B
    821   %wide.sum = add <4 x i32> %a.val, %b.val
    822   %upper = lshr <4 x i32> %wide.sum, <i32 16, i32 16, i32 16, i32 16>
    823   %hi.half = trunc <4 x i32> %upper to <4 x i16>
    824   ret <4 x i16> %hi.half
    825 }
    826 
    827 define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
        ; Plain-IR form with no intrinsic: add two <2 x i64> vectors, shift each
        ; lane right by 32 and truncate to i32. The output must contain addhn.2s.
    828 ;CHECK-LABEL: addhn2s_natural:
    829 ;CHECK: addhn.2s
    830   %a.val = load <2 x i64>, <2 x i64>* %A
    831   %b.val = load <2 x i64>, <2 x i64>* %B
    832   %wide.sum = add <2 x i64> %a.val, %b.val
    833   %upper = lshr <2 x i64> %wide.sum, <i64 32, i64 32>
    834   %hi.half = trunc <2 x i64> %upper to <2 x i32>
    835   ret <2 x i32> %hi.half
    836 }
    837 
    838 define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Add, shift right by 8 and truncate as in the addhn pattern, then
        ; concatenate %low (lanes 0-7) with the narrowed result (lanes 8-15).
        ; The output must contain addhn2.16b.
    839 ;CHECK-LABEL: addhn2_16b_natural:
    840 ;CHECK: addhn2.16b
    841   %a.val = load <8 x i16>, <8 x i16>* %A
    842   %b.val = load <8 x i16>, <8 x i16>* %B
    843   %wide.sum = add <8 x i16> %a.val, %b.val
    844   %upper = lshr <8 x i16> %wide.sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    845   %hi.half = trunc <8 x i16> %upper to <8 x i8>
    846   %joined = shufflevector <8 x i8> %low, <8 x i8> %hi.half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    847   ret <16 x i8> %joined
    848 }
    849 
    850 define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; Add, shift right by 16 and truncate as in the addhn pattern, then
        ; concatenate %low (lanes 0-3) with the narrowed result (lanes 4-7).
        ; The output must contain addhn2.8h.
    851 ;CHECK-LABEL: addhn2_8h_natural:
    852 ;CHECK: addhn2.8h
    853   %a.val = load <4 x i32>, <4 x i32>* %A
    854   %b.val = load <4 x i32>, <4 x i32>* %B
    855   %wide.sum = add <4 x i32> %a.val, %b.val
    856   %upper = lshr <4 x i32> %wide.sum, <i32 16, i32 16, i32 16, i32 16>
    857   %hi.half = trunc <4 x i32> %upper to <4 x i16>
    858   %joined = shufflevector <4 x i16> %low, <4 x i16> %hi.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    859   ret <8 x i16> %joined
    860 }
    861 
    862 define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
        ; Add, shift right by 32 and truncate as in the addhn pattern, then
        ; concatenate %low (lanes 0-1) with the narrowed result (lanes 2-3).
        ; The output must contain addhn2.4s.
    863 ;CHECK-LABEL: addhn2_4s_natural:
    864 ;CHECK: addhn2.4s
    865   %a.val = load <2 x i64>, <2 x i64>* %A
    866   %b.val = load <2 x i64>, <2 x i64>* %B
    867   %wide.sum = add <2 x i64> %a.val, %b.val
    868   %upper = lshr <2 x i64> %wide.sum, <i64 32, i64 32>
    869   %hi.half = trunc <2 x i64> %upper to <2 x i32>
    870   %joined = shufflevector <2 x i32> %low, <2 x i32> %hi.half, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    871   ret <4 x i32> %joined
    872 }
    873 
    874 define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Plain-IR form with no intrinsic: subtract two <8 x i16> vectors, shift
        ; each lane right by 8 and truncate to i8. The output must contain
        ; subhn.8b.
    875 ;CHECK-LABEL: subhn8b_natural:
    876 ;CHECK: subhn.8b
    877   %a.val = load <8 x i16>, <8 x i16>* %A
    878   %b.val = load <8 x i16>, <8 x i16>* %B
    879   %wide.diff = sub <8 x i16> %a.val, %b.val
    880   %upper = lshr <8 x i16> %wide.diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    881   %hi.half = trunc <8 x i16> %upper to <8 x i8>
    882   ret <8 x i8> %hi.half
    883 }
    884 
    885 define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; Plain-IR form with no intrinsic: subtract two <4 x i32> vectors, shift
        ; each lane right by 16 and truncate to i16. The output must contain
        ; subhn.4h.
    886 ;CHECK-LABEL: subhn4h_natural:
    887 ;CHECK: subhn.4h
    888   %a.val = load <4 x i32>, <4 x i32>* %A
    889   %b.val = load <4 x i32>, <4 x i32>* %B
    890   %wide.diff = sub <4 x i32> %a.val, %b.val
    891   %upper = lshr <4 x i32> %wide.diff, <i32 16, i32 16, i32 16, i32 16>
    892   %hi.half = trunc <4 x i32> %upper to <4 x i16>
    893   ret <4 x i16> %hi.half
    894 }
    895 
    896 define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
        ; Plain-IR form with no intrinsic: subtract two <2 x i64> vectors, shift
        ; each lane right by 32 and truncate to i32. The output must contain
        ; subhn.2s.
    897 ;CHECK-LABEL: subhn2s_natural:
    898 ;CHECK: subhn.2s
    899   %a.val = load <2 x i64>, <2 x i64>* %A
    900   %b.val = load <2 x i64>, <2 x i64>* %B
    901   %wide.diff = sub <2 x i64> %a.val, %b.val
    902   %upper = lshr <2 x i64> %wide.diff, <i64 32, i64 32>
    903   %hi.half = trunc <2 x i64> %upper to <2 x i32>
    904   ret <2 x i32> %hi.half
    905 }
    906 
    907 define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
        ; Subtract, shift right by 8 and truncate as in the subhn pattern, then
        ; concatenate %low (lanes 0-7) with the narrowed result (lanes 8-15).
        ; The output must contain subhn2.16b.
    908 ;CHECK-LABEL: subhn2_16b_natural:
    909 ;CHECK: subhn2.16b
    910   %a.val = load <8 x i16>, <8 x i16>* %A
    911   %b.val = load <8 x i16>, <8 x i16>* %B
    912   %wide.diff = sub <8 x i16> %a.val, %b.val
    913   %upper = lshr <8 x i16> %wide.diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    914   %hi.half = trunc <8 x i16> %upper to <8 x i8>
    915   %joined = shufflevector <8 x i8> %low, <8 x i8> %hi.half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    916   ret <16 x i8> %joined
    917 }
    918 
    919 define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
        ; Subtract, shift right by 16 and truncate as in the subhn pattern, then
        ; concatenate %low (lanes 0-3) with the narrowed result (lanes 4-7).
        ; The output must contain subhn2.8h.
    920 ;CHECK-LABEL: subhn2_8h_natural:
    921 ;CHECK: subhn2.8h
    922   %a.val = load <4 x i32>, <4 x i32>* %A
    923   %b.val = load <4 x i32>, <4 x i32>* %B
    924   %wide.diff = sub <4 x i32> %a.val, %b.val
    925   %upper = lshr <4 x i32> %wide.diff, <i32 16, i32 16, i32 16, i32 16>
    926   %hi.half = trunc <4 x i32> %upper to <4 x i16>
    927   %joined = shufflevector <4 x i16> %low, <4 x i16> %hi.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    928   ret <8 x i16> %joined
    929 }
    930 
    931 define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
        ; Subtract, shift right by 32 and truncate as in the subhn pattern, then
        ; concatenate %low (lanes 0-1) with the narrowed result (lanes 2-3).
        ; The output must contain subhn2.4s.
    932 ;CHECK-LABEL: subhn2_4s_natural:
    933 ;CHECK: subhn2.4s
    934   %a.val = load <2 x i64>, <2 x i64>* %A
    935   %b.val = load <2 x i64>, <2 x i64>* %B
    936   %wide.diff = sub <2 x i64> %a.val, %b.val
    937   %upper = lshr <2 x i64> %wide.diff, <i64 32, i64 32>
    938   %hi.half = trunc <2 x i64> %upper to <2 x i32>
    939   %joined = shufflevector <2 x i32> %low, <2 x i32> %hi.half, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    940   ret <4 x i32> %joined
    941 }
    942