; Codegen tests for AArch64 NEON absolute-difference (sabd/uabd/fabd, long and
; accumulating forms) and absolute-value (abs/sqabs/sqneg) instructions.
      1 ; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
      2 
      3 
; sabd.v8i8 followed by zext of the result must select a single sabdl.8h.
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        ; Widening the abs-difference picks the long-form instruction.
        %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
        ret <8 x i16> %tmp4
}
     13 
; sabd.v4i16 followed by zext of the result must select a single sabdl.4s.
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
        ret <4 x i32> %tmp4
}
     23 
; sabd.v2i32 followed by zext of the result must select a single sabdl.2d.
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
        ret <2 x i64> %tmp4
}
     33 
; sabd on the high halves (extracted via shufflevector) + zext must select
; the second-half form sabdl2.8h, with no separate ext instruction.
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl2.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        ; Indices 8..15 select the high 64-bit half of each 128-bit vector.
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
        ret <8 x i16> %tmp4
}
     45 
; High-half sabd.v4i16 + zext must select sabdl2.4s.
define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl2.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        ; Indices 4..7 select the high half.
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
        ret <4 x i32> %tmp4
}
     57 
; High-half sabd.v2i32 + zext must select sabdl2.2d.
define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl2.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        ; Indices 2..3 select the high half.
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
        ret <2 x i64> %tmp4
}
     69 
; uabd.v8i8 followed by zext must select a single uabdl.8h.
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
     79 
; uabd.v4i16 followed by zext must select a single uabdl.4s.
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
     89 
; uabd.v2i32 followed by zext must select a single uabdl.2d.
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
     99 
; High-half uabd.v8i8 + zext must select uabdl2.8h.
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl2.8h
  %load1 = load <16 x i8>, <16 x i8>* %A
  %load2 = load <16 x i8>, <16 x i8>* %B
  ; Indices 8..15 select the high half of each input.
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
    112 
; High-half uabd.v4i16 + zext must select uabdl2.4s.
define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl2.4s
  %load1 = load <8 x i16>, <8 x i16>* %A
  %load2 = load <8 x i16>, <8 x i16>* %B
  ; Indices 4..7 select the high half.
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
    124 
; High-half uabd.v2i32 + zext must select uabdl2.2d.
define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl2.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  ; Indices 2..3 select the high half.
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
    136 
    137 define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
    138 ; CHECK-LABEL: uabdl8h_log2_shuffle
    139 ; CHECK: uabdl2.8h
    140 ; CHECK: uabdl.8h
    141   %aload = load <16 x i8>, <16 x i8>* %a, align 1
    142   %bload = load <16 x i8>, <16 x i8>* %b, align 1
    143   %aext = zext <16 x i8> %aload to <16 x i16>
    144   %bext = zext <16 x i8> %bload to <16 x i16>
    145   %abdiff = sub nsw <16 x i16> %aext, %bext
    146   %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
    147   %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
    148   %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
    149   %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    150   %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
    151   %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    152   %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
    153   %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    154   %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
    155   %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    156   %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
    157   %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
    158   ret i16 %reduced_v
    159 }
    160 
    161 define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
    162 ; CHECK-LABEL: uabdl4s_log2_shuffle
    163 ; CHECK: uabdl2.4s
    164 ; CHECK: uabdl.4s
    165   %aload = load <8 x i16>, <8 x i16>* %a, align 1
    166   %bload = load <8 x i16>, <8 x i16>* %b, align 1
    167   %aext = zext <8 x i16> %aload to <8 x i32>
    168   %bext = zext <8 x i16> %bload to <8 x i32>
    169   %abdiff = sub nsw <8 x i32> %aext, %bext
    170   %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
    171   %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
    172   %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
    173   %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
    174   %bin.rdx = add <8 x i32> %absel, %rdx.shuf
    175   %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    176   %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
    177   %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
    178   %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
    179   %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
    180   ret i32 %reduced_v
    181 }
    182 
    183 define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
    184 ; CHECK: uabdl2d_log2_shuffle
    185 ; CHECK: uabdl2.2d
    186 ; CHECK: uabdl.2d
    187   %aload = load <4 x i32>, <4 x i32>* %a, align 1
    188   %bload = load <4 x i32>, <4 x i32>* %b, align 1
    189   %aext = zext <4 x i32> %aload to <4 x i64>
    190   %bext = zext <4 x i32> %bload to <4 x i64>
    191   %abdiff = sub nsw <4 x i64> %aext, %bext
    192   %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
    193   %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
    194   %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
    195   %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    196   %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
    197   %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    198   %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
    199   %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
    200   ret i64 %reduced_v
    201 }
    202 
; fabd intrinsic must select fabd.2s.
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
        %tmp1 = load <2 x float>, <2 x float>* %A
        %tmp2 = load <2 x float>, <2 x float>* %B
        %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
        ret <2 x float> %tmp3
}
    211 
; fabd intrinsic must select fabd.4s.
define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
        %tmp1 = load <4 x float>, <4 x float>* %A
        %tmp2 = load <4 x float>, <4 x float>* %B
        %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
        ret <4 x float> %tmp3
}
    220 
; fabd intrinsic must select fabd.2d.
define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
        %tmp1 = load <2 x double>, <2 x double>* %A
        %tmp2 = load <2 x double>, <2 x double>* %B
        %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
        ret <2 x double> %tmp3
}
    229 
    230 declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
    231 declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
    232 declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
    233 
; sabd intrinsic must select sabd.8b.
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        ret <8 x i8> %tmp3
}
    242 
; sabd intrinsic must select sabd.16b.
define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        ret <16 x i8> %tmp3
}
    251 
; sabd intrinsic must select sabd.4h.
define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        ret <4 x i16> %tmp3
}
    260 
; sabd intrinsic must select sabd.8h.
define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i16> %tmp3
}
    269 
; sabd intrinsic must select sabd.2s.
define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        ret <2 x i32> %tmp3
}
    278 
; sabd intrinsic must select sabd.4s.
define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i32> %tmp3
}
    287 
    288 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
    289 declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
    290 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    291 declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    292 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    293 declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    294 
; uabd intrinsic must select uabd.8b.
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        ret <8 x i8> %tmp3
}
    303 
; uabd intrinsic must select uabd.16b.
define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        ret <16 x i8> %tmp3
}
    312 
; uabd intrinsic must select uabd.4h.
define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        ret <4 x i16> %tmp3
}
    321 
; uabd intrinsic must select uabd.8h.
define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        ret <8 x i16> %tmp3
}
    330 
; uabd intrinsic must select uabd.2s.
define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        ret <2 x i32> %tmp3
}
    339 
; uabd intrinsic must select uabd.4s.
define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        ret <4 x i32> %tmp3
}
    348 
    349 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
    350 declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
    351 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    352 declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    353 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    354 declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    355 
; sqabs intrinsic must select sqabs.8b.
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}
    363 
; sqabs intrinsic must select sqabs.16b.
define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}
    371 
; sqabs intrinsic must select sqabs.4h.
define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}
    379 
; sqabs intrinsic must select sqabs.8h.
define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}
    387 
; sqabs intrinsic must select sqabs.2s.
define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}
    395 
; sqabs intrinsic must select sqabs.4s.
define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}
    403 
    404 declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
    405 declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
    406 declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
    407 declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
    408 declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
    409 declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
    410 
; sqneg intrinsic must select sqneg.8b.
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}
    418 
; sqneg intrinsic must select sqneg.16b.
define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}
    426 
; sqneg intrinsic must select sqneg.4h.
define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}
    434 
; sqneg intrinsic must select sqneg.8h.
define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}
    442 
; sqneg intrinsic must select sqneg.2s.
define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}
    450 
; sqneg intrinsic must select sqneg.4s.
define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}
    458 
    459 declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
    460 declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
    461 declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
    462 declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
    463 declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
    464 declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
    465 
; abs intrinsic must select abs.8b.
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
        ret <8 x i8> %tmp3
}
    473 
; abs intrinsic must select abs.16b.
define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
        ret <16 x i8> %tmp3
}
    481 
; abs intrinsic must select abs.4h.
define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
        ret <4 x i16> %tmp3
}
    489 
; abs intrinsic must select abs.8h.
define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
        ret <8 x i16> %tmp3
}
    497 
; abs intrinsic must select abs.2s.
define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
        ret <2 x i32> %tmp3
}
    505 
; abs intrinsic must select abs.4s.
define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
        ret <4 x i32> %tmp3
}
    513 
; Scalar (<1 x i64>) abs must select the d-register form.
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}
    520 
; Plain-i64 abs intrinsic must also select the d-register form.
define i64 @abs_1d_honestly(i64 %A) nounwind {
; CHECK-LABEL: abs_1d_honestly:
; CHECK: abs d0, d0
  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
  ret i64 %abs
}
    527 
    528 declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
    529 declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
    530 declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
    531 declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
    532 declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
    533 declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
    534 declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
    535 declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
    536 
; sabd + zext + accumulate must fold into a single sabal.8h.
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        ; The add of the widened difference forms the accumulating variant.
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}
    548 
; sabd + zext + accumulate must fold into a single sabal.4s.
define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}
    560 
    561 define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
    562 ;CHECK-LABEL: sabal2d:
    563 ;CHECK: sabal.2d
    564         %tmp1 = load <2 x i32>, <2 x i32>* %A
    565         %tmp2 = load <2 x i32>, <2 x i32>* %B
    566         %tmp3 = load <2 x i64>, <2 x i64>* %C
    567         %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    568         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
    569         %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
    570         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
    571         ret <2 x i64> %tmp5
    572 }
    573 
; High-half sabd + zext + accumulate must fold into sabal2.8h.
define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal2.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        ; Indices 8..15 select the high half of each input.
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}
    587 
; High-half sabd + zext + accumulate must fold into sabal2.4s.
define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal2.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        ; Indices 4..7 select the high half.
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}
    601 
; High-half sabd + zext + accumulate must fold into sabal2.2d.
define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal2.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        ; Indices 2..3 select the high half.
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
    615 
; uabd + zext + accumulate must fold into a single uabal.8h.
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}
    627 
; uabd + zext + accumulate must fold into a single uabal.4s.
define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}
    639 
; Same UABAL pattern at the <2 x i32> -> <2 x i64> width; expects uabal.2d.
define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
    651 
; UABAL2: unsigned absolute-difference of the high <8 x i8> halves, widened and
; accumulated; the high-half shuffles must fold into a single uabal2.8h.
define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal2.8h
        %load1 = load <16 x i8>, <16 x i8>* %A
        %load2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = load <8 x i16>, <8 x i16>* %C
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
        %tmp5 = add <8 x i16> %tmp3, %tmp4.1
        ret <8 x i16> %tmp5
}
    665 
; UABAL2 at the <4 x i16> -> <4 x i32> width (high halves); expects uabal2.4s.
define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal2.4s
        %load1 = load <8 x i16>, <8 x i16>* %A
        %load2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = load <4 x i32>, <4 x i32>* %C
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
        %tmp5 = add <4 x i32> %tmp3, %tmp4.1
        ret <4 x i32> %tmp5
}
    679 
; UABAL2 at the <2 x i32> -> <2 x i64> width (high halves); expects uabal2.2d.
define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal2.2d
        %load1 = load <4 x i32>, <4 x i32>* %A
        %load2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = load <2 x i64>, <2 x i64>* %C
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
        %tmp5 = add <2 x i64> %tmp3, %tmp4.1
        ret <2 x i64> %tmp5
}
    693 
; SABA: non-widening signed absolute-difference-and-accumulate. sabd.v8i8
; followed by an add of the accumulator must select saba.8b.
define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = load <8 x i8>, <8 x i8>* %C
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}
    704 
; SABA on the full 128-bit <16 x i8> vector; expects saba.16b.
define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        %tmp4 = load <16 x i8>, <16 x i8>* %C
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}
    715 
; SABA at the <4 x i16> width; expects saba.4h.
define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = load <4 x i16>, <4 x i16>* %C
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}
    726 
; SABA at the <8 x i16> width; expects saba.8h.
define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        %tmp4 = load <8 x i16>, <8 x i16>* %C
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}
    737 
; SABA at the <2 x i32> width; expects saba.2s.
define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = load <2 x i32>, <2 x i32>* %C
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}
    748 
; SABA at the <4 x i32> width; expects saba.4s.
define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        %tmp4 = load <4 x i32>, <4 x i32>* %C
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}
    759 
; UABA: non-widening unsigned absolute-difference-and-accumulate; uabd.v8i8
; plus the accumulator add must select uaba.8b.
define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
        %tmp4 = load <8 x i8>, <8 x i8>* %C
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}
    770 
; UABA on the full 128-bit <16 x i8> vector; expects uaba.16b.
define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
        %tmp4 = load <16 x i8>, <16 x i8>* %C
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}
    781 
; UABA at the <4 x i16> width; expects uaba.4h.
define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
        %tmp4 = load <4 x i16>, <4 x i16>* %C
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}
    792 
; UABA at the <8 x i16> width; expects uaba.8h.
define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
        %tmp4 = load <8 x i16>, <8 x i16>* %C
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}
    803 
; UABA at the <2 x i32> width; expects uaba.2s.
define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
        %tmp4 = load <2 x i32>, <2 x i32>* %C
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}
    814 
; UABA at the <4 x i32> width; expects uaba.4s.
define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
        %tmp4 = load <4 x i32>, <4 x i32>* %C
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}
    825 
; Scalar FABD
; The scalar float fabd intrinsic must select fabd on s-registers, with the
; result staying in s0 (no extra moves checked for here beyond operand regs).
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %vabd.i
}
    833 
; Scalar FABD at double width: must select fabd on d-registers.
define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %vabd.i
}
    840 
    841 declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
    842 declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
    843 
; uabd of (high half of %lhs) against a dup'd scalar, widened with zext:
; should select uabdl2.2d directly with a lane-duplicated operand, with no
; ext.16b emitted to materialize the high half (hence the CHECK-NOT).
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
    857 
; Signed counterpart of uabdl_from_extract_dup: sabd on the high half against
; a dup'd scalar must select sabdl2.2d with no ext.16b.
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1

  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>

  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
    871 
; Integer abs idiom: select(a >= 0, a, -a) written with "sge" must match the
; abs.2s pattern and emit nothing else (CHECK-NEXT: ret pins that down).
define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
; CHECK-LABEL: abspattern1:
; CHECK: abs.2s
; CHECK-NEXT: ret
        %tmp1neg = sub <2 x i32> zeroinitializer, %a
        %b = icmp sge <2 x i32> %a, zeroinitializer
        %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
        ret <2 x i32> %abs
}
    881 
; Abs idiom variant using "sgt" (a > 0 ? a : -a, equivalent since -0 == 0);
; must still match abs.4h with no extra instructions.
define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
; CHECK-LABEL: abspattern2:
; CHECK: abs.4h
; CHECK-NEXT: ret
        %tmp1neg = sub <4 x i16> zeroinitializer, %a
        %b = icmp sgt <4 x i16> %a, zeroinitializer
        %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
        ret <4 x i16> %abs
}
    891 
; Abs idiom with inverted select arms using "slt" (a < 0 ? -a : a);
; must match abs.8b with no extra instructions.
define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
; CHECK-LABEL: abspattern3:
; CHECK: abs.8b
; CHECK-NEXT: ret
        %tmp1neg = sub <8 x i8> zeroinitializer, %a
        %b = icmp slt <8 x i8> %a, zeroinitializer
        %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
        ret <8 x i8> %abs
}
    901 
; "sge" abs idiom at the 128-bit <4 x i32> width; expects a lone abs.4s.
define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
; CHECK-LABEL: abspattern4:
; CHECK: abs.4s
; CHECK-NEXT: ret
        %tmp1neg = sub <4 x i32> zeroinitializer, %a
        %b = icmp sge <4 x i32> %a, zeroinitializer
        %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
        ret <4 x i32> %abs
}
    911 
; "sgt" abs idiom at the <8 x i16> width; expects a lone abs.8h.
define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
; CHECK-LABEL: abspattern5:
; CHECK: abs.8h
; CHECK-NEXT: ret
        %tmp1neg = sub <8 x i16> zeroinitializer, %a
        %b = icmp sgt <8 x i16> %a, zeroinitializer
        %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
        ret <8 x i16> %abs
}
    921 
; "slt" abs idiom (inverted arms) at the <16 x i8> width; expects a lone abs.16b.
define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
; CHECK-LABEL: abspattern6:
; CHECK: abs.16b
; CHECK-NEXT: ret
        %tmp1neg = sub <16 x i8> zeroinitializer, %a
        %b = icmp slt <16 x i8> %a, zeroinitializer
        %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
        ret <16 x i8> %abs
}
    931 
; Abs idiom with "sle" and inverted arms (a <= 0 ? -a : a, equivalent since
; -0 == 0) at the <2 x i64> width; expects a lone abs.2d.
define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
; CHECK-LABEL: abspattern7:
; CHECK: abs.2d
; CHECK-NEXT: ret
        %tmp1neg = sub <2 x i64> zeroinitializer, %a
        %b = icmp sle <2 x i64> %a, zeroinitializer
        %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
        ret <2 x i64> %abs
}
    941