; (code-viewer navigation header removed: "Home | History | Annotate | Download | only in AArch64")
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

      4 define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
      5 ;CHECK-LABEL: sabdl8h:
      6 ;CHECK: sabdl.8h
      7         %tmp1 = load <8 x i8>, <8 x i8>* %A
      8         %tmp2 = load <8 x i8>, <8 x i8>* %B
      9         %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
     10         %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
     11         ret <8 x i16> %tmp4
     12 }
     13 
     14 define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
     15 ;CHECK-LABEL: sabdl4s:
     16 ;CHECK: sabdl.4s
     17         %tmp1 = load <4 x i16>, <4 x i16>* %A
     18         %tmp2 = load <4 x i16>, <4 x i16>* %B
     19         %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
     20         %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
     21         ret <4 x i32> %tmp4
     22 }
     23 
     24 define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
     25 ;CHECK-LABEL: sabdl2d:
     26 ;CHECK: sabdl.2d
     27         %tmp1 = load <2 x i32>, <2 x i32>* %A
     28         %tmp2 = load <2 x i32>, <2 x i32>* %B
     29         %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
     30         %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
     31         ret <2 x i64> %tmp4
     32 }
     33 
     34 define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
     35 ;CHECK-LABEL: sabdl2_8h:
     36 ;CHECK: sabdl.8h
     37         %load1 = load <16 x i8>, <16 x i8>* %A
     38         %load2 = load <16 x i8>, <16 x i8>* %B
     39         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     40         %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
     41         %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
     42         %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
     43         ret <8 x i16> %tmp4
     44 }
     45 
     46 define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
     47 ;CHECK-LABEL: sabdl2_4s:
     48 ;CHECK: sabdl.4s
     49         %load1 = load <8 x i16>, <8 x i16>* %A
     50         %load2 = load <8 x i16>, <8 x i16>* %B
     51         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     52         %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
     53         %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
     54         %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
     55         ret <4 x i32> %tmp4
     56 }
     57 
     58 define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
     59 ;CHECK-LABEL: sabdl2_2d:
     60 ;CHECK: sabdl.2d
     61         %load1 = load <4 x i32>, <4 x i32>* %A
     62         %load2 = load <4 x i32>, <4 x i32>* %B
     63         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     64         %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
     65         %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
     66         %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
     67         ret <2 x i64> %tmp4
     68 }
     69 
     70 define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
     71 ;CHECK-LABEL: uabdl8h:
     72 ;CHECK: uabdl.8h
     73   %tmp1 = load <8 x i8>, <8 x i8>* %A
     74   %tmp2 = load <8 x i8>, <8 x i8>* %B
     75   %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
     76   %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
     77   ret <8 x i16> %tmp4
     78 }
     79 
     80 define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
     81 ;CHECK-LABEL: uabdl4s:
     82 ;CHECK: uabdl.4s
     83   %tmp1 = load <4 x i16>, <4 x i16>* %A
     84   %tmp2 = load <4 x i16>, <4 x i16>* %B
     85   %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
     86   %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
     87   ret <4 x i32> %tmp4
     88 }
     89 
     90 define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
     91 ;CHECK-LABEL: uabdl2d:
     92 ;CHECK: uabdl.2d
     93   %tmp1 = load <2 x i32>, <2 x i32>* %A
     94   %tmp2 = load <2 x i32>, <2 x i32>* %B
     95   %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
     96   %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
     97   ret <2 x i64> %tmp4
     98 }
     99 
    100 define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
    101 ;CHECK-LABEL: uabdl2_8h:
    102 ;CHECK: uabdl.8h
    103   %load1 = load <16 x i8>, <16 x i8>* %A
    104   %load2 = load <16 x i8>, <16 x i8>* %B
    105   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    106   %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    107 
    108   %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    109   %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
    110   ret <8 x i16> %tmp4
    111 }
    112 
    113 define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
    114 ;CHECK-LABEL: uabdl2_4s:
    115 ;CHECK: uabdl.4s
    116   %load1 = load <8 x i16>, <8 x i16>* %A
    117   %load2 = load <8 x i16>, <8 x i16>* %B
    118   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    119   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    120   %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    121   %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
    122   ret <4 x i32> %tmp4
    123 }
    124 
    125 define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
    126 ;CHECK-LABEL: uabdl2_2d:
    127 ;CHECK: uabdl.2d
    128   %load1 = load <4 x i32>, <4 x i32>* %A
    129   %load2 = load <4 x i32>, <4 x i32>* %B
    130   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    131   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    132   %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    133   %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
    134   ret <2 x i64> %tmp4
    135 }
    136 
    137 declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
    138 
    139 define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
    140 ; CHECK-LABEL: uabdl8h_rdx
    141 ; CHECK: uabdl2.8h
    142 ; CHECK: uabdl.8h
    143   %aload = load <16 x i8>, <16 x i8>* %a, align 1
    144   %bload = load <16 x i8>, <16 x i8>* %b, align 1
    145   %aext = zext <16 x i8> %aload to <16 x i16>
    146   %bext = zext <16 x i8> %bload to <16 x i16>
    147   %abdiff = sub nsw <16 x i16> %aext, %bext
    148   %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
    149   %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
    150   %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
    151   %reduced_v = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %absel)
    152   ret i16 %reduced_v
    153 }
    154 
    155 declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
    156 
    157 define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
    158 ; CHECK-LABEL: uabdl4s_rdx
    159 ; CHECK: uabdl2.4s
    160 ; CHECK: uabdl.4s
    161   %aload = load <8 x i16>, <8 x i16>* %a, align 1
    162   %bload = load <8 x i16>, <8 x i16>* %b, align 1
    163   %aext = zext <8 x i16> %aload to <8 x i32>
    164   %bext = zext <8 x i16> %bload to <8 x i32>
    165   %abdiff = sub nsw <8 x i32> %aext, %bext
    166   %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
    167   %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
    168   %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
    169   %reduced_v = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %absel)
    170   ret i32 %reduced_v
    171 }
    172 
    173 declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
    174 
    175 define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
    176 ; CHECK: uabdl2d_rdx
    177 ; CHECK: uabdl2.2d
    178 ; CHECK: uabdl.2d
    179   %aload = load <4 x i32>, <4 x i32>* %a, align 1
    180   %bload = load <4 x i32>, <4 x i32>* %b, align 1
    181   %aext = zext <4 x i32> %aload to <4 x i64>
    182   %bext = zext <4 x i32> %bload to <4 x i64>
    183   %abdiff = sub nsw <4 x i64> %aext, %bext
    184   %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
    185   %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
    186   %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
    187   %reduced_v = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %absel)
    188   ret i64 %reduced_v
    189 }
    190 
    191 define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
    192 ;CHECK-LABEL: fabd_2s:
    193 ;CHECK: fabd.2s
    194         %tmp1 = load <2 x float>, <2 x float>* %A
    195         %tmp2 = load <2 x float>, <2 x float>* %B
    196         %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
    197         ret <2 x float> %tmp3
    198 }
    199 
    200 define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
    201 ;CHECK-LABEL: fabd_4s:
    202 ;CHECK: fabd.4s
    203         %tmp1 = load <4 x float>, <4 x float>* %A
    204         %tmp2 = load <4 x float>, <4 x float>* %B
    205         %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
    206         ret <4 x float> %tmp3
    207 }
    208 
    209 define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
    210 ;CHECK-LABEL: fabd_2d:
    211 ;CHECK: fabd.2d
    212         %tmp1 = load <2 x double>, <2 x double>* %A
    213         %tmp2 = load <2 x double>, <2 x double>* %B
    214         %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
    215         ret <2 x double> %tmp3
    216 }
    217 
    218 declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
    219 declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
    220 declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
    221 
    222 define <2 x float> @fabd_2s_from_fsub_fabs(<2 x float>* %A, <2 x float>* %B) nounwind {
    223 ;CHECK-LABEL: fabd_2s_from_fsub_fabs:
    224 ;CHECK: fabd.2s
    225         %tmp1 = load <2 x float>, <2 x float>* %A
    226         %tmp2 = load <2 x float>, <2 x float>* %B
    227         %sub = fsub <2 x float> %tmp1, %tmp2
    228         %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %sub)
    229         ret <2 x float> %abs
    230 }
    231 
    232 define <4 x float> @fabd_4s_from_fsub_fabs(<4 x float>* %A, <4 x float>* %B) nounwind {
    233 ;CHECK-LABEL: fabd_4s_from_fsub_fabs:
    234 ;CHECK: fabd.4s
    235         %tmp1 = load <4 x float>, <4 x float>* %A
    236         %tmp2 = load <4 x float>, <4 x float>* %B
    237         %sub = fsub <4 x float> %tmp1, %tmp2
    238         %abs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %sub)
    239         ret <4 x float> %abs
    240 }
    241 
    242 define <2 x double> @fabd_2d_from_fsub_fabs(<2 x double>* %A, <2 x double>* %B) nounwind {
    243 ;CHECK-LABEL: fabd_2d_from_fsub_fabs:
    244 ;CHECK: fabd.2d
    245         %tmp1 = load <2 x double>, <2 x double>* %A
    246         %tmp2 = load <2 x double>, <2 x double>* %B
    247         %sub = fsub <2 x double> %tmp1, %tmp2
    248         %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %sub)
    249         ret <2 x double> %abs
    250 }
    251 
    252 declare <2 x float> @llvm.fabs.v2f32(<2 x float>) nounwind readnone
    253 declare <4 x float> @llvm.fabs.v4f32(<4 x float>) nounwind readnone
    254 declare <2 x double> @llvm.fabs.v2f64(<2 x double>) nounwind readnone
    255 
    256 define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
    257 ;CHECK-LABEL: sabd_8b:
    258 ;CHECK: sabd.8b
    259         %tmp1 = load <8 x i8>, <8 x i8>* %A
    260         %tmp2 = load <8 x i8>, <8 x i8>* %B
    261         %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    262         ret <8 x i8> %tmp3
    263 }
    264 
    265 define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
    266 ;CHECK-LABEL: sabd_16b:
    267 ;CHECK: sabd.16b
    268         %tmp1 = load <16 x i8>, <16 x i8>* %A
    269         %tmp2 = load <16 x i8>, <16 x i8>* %B
    270         %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
    271         ret <16 x i8> %tmp3
    272 }
    273 
    274 define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
    275 ;CHECK-LABEL: sabd_4h:
    276 ;CHECK: sabd.4h
    277         %tmp1 = load <4 x i16>, <4 x i16>* %A
    278         %tmp2 = load <4 x i16>, <4 x i16>* %B
    279         %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    280         ret <4 x i16> %tmp3
    281 }
    282 
    283 define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
    284 ;CHECK-LABEL: sabd_8h:
    285 ;CHECK: sabd.8h
    286         %tmp1 = load <8 x i16>, <8 x i16>* %A
    287         %tmp2 = load <8 x i16>, <8 x i16>* %B
    288         %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
    289         ret <8 x i16> %tmp3
    290 }
    291 
    292 define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
    293 ;CHECK-LABEL: sabd_2s:
    294 ;CHECK: sabd.2s
    295         %tmp1 = load <2 x i32>, <2 x i32>* %A
    296         %tmp2 = load <2 x i32>, <2 x i32>* %B
    297         %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    298         ret <2 x i32> %tmp3
    299 }
    300 
    301 define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
    302 ;CHECK-LABEL: sabd_4s:
    303 ;CHECK: sabd.4s
    304         %tmp1 = load <4 x i32>, <4 x i32>* %A
    305         %tmp2 = load <4 x i32>, <4 x i32>* %B
    306         %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
    307         ret <4 x i32> %tmp3
    308 }
    309 
    310 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
    311 declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
    312 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    313 declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    314 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    315 declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    316 
    317 define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
    318 ;CHECK-LABEL: uabd_8b:
    319 ;CHECK: uabd.8b
    320         %tmp1 = load <8 x i8>, <8 x i8>* %A
    321         %tmp2 = load <8 x i8>, <8 x i8>* %B
    322         %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    323         ret <8 x i8> %tmp3
    324 }
    325 
    326 define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
    327 ;CHECK-LABEL: uabd_16b:
    328 ;CHECK: uabd.16b
    329         %tmp1 = load <16 x i8>, <16 x i8>* %A
    330         %tmp2 = load <16 x i8>, <16 x i8>* %B
    331         %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
    332         ret <16 x i8> %tmp3
    333 }
    334 
    335 define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
    336 ;CHECK-LABEL: uabd_4h:
    337 ;CHECK: uabd.4h
    338         %tmp1 = load <4 x i16>, <4 x i16>* %A
    339         %tmp2 = load <4 x i16>, <4 x i16>* %B
    340         %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    341         ret <4 x i16> %tmp3
    342 }
    343 
    344 define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
    345 ;CHECK-LABEL: uabd_8h:
    346 ;CHECK: uabd.8h
    347         %tmp1 = load <8 x i16>, <8 x i16>* %A
    348         %tmp2 = load <8 x i16>, <8 x i16>* %B
    349         %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
    350         ret <8 x i16> %tmp3
    351 }
    352 
    353 define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
    354 ;CHECK-LABEL: uabd_2s:
    355 ;CHECK: uabd.2s
    356         %tmp1 = load <2 x i32>, <2 x i32>* %A
    357         %tmp2 = load <2 x i32>, <2 x i32>* %B
    358         %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    359         ret <2 x i32> %tmp3
    360 }
    361 
    362 define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
    363 ;CHECK-LABEL: uabd_4s:
    364 ;CHECK: uabd.4s
    365         %tmp1 = load <4 x i32>, <4 x i32>* %A
    366         %tmp2 = load <4 x i32>, <4 x i32>* %B
    367         %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
    368         ret <4 x i32> %tmp3
    369 }
    370 
    371 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
    372 declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
    373 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    374 declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    375 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    376 declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    377 
    378 define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
    379 ;CHECK-LABEL: sqabs_8b:
    380 ;CHECK: sqabs.8b
    381         %tmp1 = load <8 x i8>, <8 x i8>* %A
    382         %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
    383         ret <8 x i8> %tmp3
    384 }
    385 
    386 define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
    387 ;CHECK-LABEL: sqabs_16b:
    388 ;CHECK: sqabs.16b
    389         %tmp1 = load <16 x i8>, <16 x i8>* %A
    390         %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
    391         ret <16 x i8> %tmp3
    392 }
    393 
    394 define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
    395 ;CHECK-LABEL: sqabs_4h:
    396 ;CHECK: sqabs.4h
    397         %tmp1 = load <4 x i16>, <4 x i16>* %A
    398         %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
    399         ret <4 x i16> %tmp3
    400 }
    401 
    402 define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
    403 ;CHECK-LABEL: sqabs_8h:
    404 ;CHECK: sqabs.8h
    405         %tmp1 = load <8 x i16>, <8 x i16>* %A
    406         %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
    407         ret <8 x i16> %tmp3
    408 }
    409 
    410 define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
    411 ;CHECK-LABEL: sqabs_2s:
    412 ;CHECK: sqabs.2s
    413         %tmp1 = load <2 x i32>, <2 x i32>* %A
    414         %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
    415         ret <2 x i32> %tmp3
    416 }
    417 
    418 define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
    419 ;CHECK-LABEL: sqabs_4s:
    420 ;CHECK: sqabs.4s
    421         %tmp1 = load <4 x i32>, <4 x i32>* %A
    422         %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
    423         ret <4 x i32> %tmp3
    424 }
    425 
    426 declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
    427 declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
    428 declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
    429 declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
    430 declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
    431 declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
    432 
    433 define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
    434 ;CHECK-LABEL: sqneg_8b:
    435 ;CHECK: sqneg.8b
    436         %tmp1 = load <8 x i8>, <8 x i8>* %A
    437         %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
    438         ret <8 x i8> %tmp3
    439 }
    440 
    441 define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
    442 ;CHECK-LABEL: sqneg_16b:
    443 ;CHECK: sqneg.16b
    444         %tmp1 = load <16 x i8>, <16 x i8>* %A
    445         %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
    446         ret <16 x i8> %tmp3
    447 }
    448 
    449 define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
    450 ;CHECK-LABEL: sqneg_4h:
    451 ;CHECK: sqneg.4h
    452         %tmp1 = load <4 x i16>, <4 x i16>* %A
    453         %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
    454         ret <4 x i16> %tmp3
    455 }
    456 
    457 define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
    458 ;CHECK-LABEL: sqneg_8h:
    459 ;CHECK: sqneg.8h
    460         %tmp1 = load <8 x i16>, <8 x i16>* %A
    461         %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
    462         ret <8 x i16> %tmp3
    463 }
    464 
    465 define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
    466 ;CHECK-LABEL: sqneg_2s:
    467 ;CHECK: sqneg.2s
    468         %tmp1 = load <2 x i32>, <2 x i32>* %A
    469         %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
    470         ret <2 x i32> %tmp3
    471 }
    472 
    473 define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
    474 ;CHECK-LABEL: sqneg_4s:
    475 ;CHECK: sqneg.4s
    476         %tmp1 = load <4 x i32>, <4 x i32>* %A
    477         %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
    478         ret <4 x i32> %tmp3
    479 }
    480 
    481 declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
    482 declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
    483 declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
    484 declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
    485 declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
    486 declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
    487 
    488 define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
    489 ;CHECK-LABEL: abs_8b:
    490 ;CHECK: abs.8b
    491         %tmp1 = load <8 x i8>, <8 x i8>* %A
    492         %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
    493         ret <8 x i8> %tmp3
    494 }
    495 
    496 define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
    497 ;CHECK-LABEL: abs_16b:
    498 ;CHECK: abs.16b
    499         %tmp1 = load <16 x i8>, <16 x i8>* %A
    500         %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
    501         ret <16 x i8> %tmp3
    502 }
    503 
    504 define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
    505 ;CHECK-LABEL: abs_4h:
    506 ;CHECK: abs.4h
    507         %tmp1 = load <4 x i16>, <4 x i16>* %A
    508         %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
    509         ret <4 x i16> %tmp3
    510 }
    511 
    512 define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
    513 ;CHECK-LABEL: abs_8h:
    514 ;CHECK: abs.8h
    515         %tmp1 = load <8 x i16>, <8 x i16>* %A
    516         %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
    517         ret <8 x i16> %tmp3
    518 }
    519 
    520 define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
    521 ;CHECK-LABEL: abs_2s:
    522 ;CHECK: abs.2s
    523         %tmp1 = load <2 x i32>, <2 x i32>* %A
    524         %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
    525         ret <2 x i32> %tmp3
    526 }
    527 
    528 define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
    529 ;CHECK-LABEL: abs_4s:
    530 ;CHECK: abs.4s
    531         %tmp1 = load <4 x i32>, <4 x i32>* %A
    532         %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
    533         ret <4 x i32> %tmp3
    534 }
    535 
    536 define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
    537 ; CHECK-LABEL: abs_1d:
    538 ; CHECK: abs d0, d0
    539   %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
    540   ret <1 x i64> %abs
    541 }
    542 
    543 define i64 @abs_1d_honestly(i64 %A) nounwind {
    544 ; CHECK-LABEL: abs_1d_honestly:
    545 ; CHECK: abs d0, d0
    546   %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
    547   ret i64 %abs
    548 }
    549 
    550 declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
    551 declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
    552 declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
    553 declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
    554 declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
    555 declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
    556 declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
    557 declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
    558 
    559 define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
    560 ;CHECK-LABEL: sabal8h:
    561 ;CHECK: sabal.8h
    562         %tmp1 = load <8 x i8>, <8 x i8>* %A
    563         %tmp2 = load <8 x i8>, <8 x i8>* %B
    564         %tmp3 = load <8 x i16>, <8 x i16>* %C
    565         %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    566         %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
    567         %tmp5 = add <8 x i16> %tmp3, %tmp4.1
    568         ret <8 x i16> %tmp5
    569 }
    570 
    571 define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
    572 ;CHECK-LABEL: sabal4s:
    573 ;CHECK: sabal.4s
    574         %tmp1 = load <4 x i16>, <4 x i16>* %A
    575         %tmp2 = load <4 x i16>, <4 x i16>* %B
    576         %tmp3 = load <4 x i32>, <4 x i32>* %C
    577         %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    578         %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
    579         %tmp5 = add <4 x i32> %tmp3, %tmp4.1
    580         ret <4 x i32> %tmp5
    581 }
    582 
    583 define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
    584 ;CHECK-LABEL: sabal2d:
    585 ;CHECK: sabal.2d
    586         %tmp1 = load <2 x i32>, <2 x i32>* %A
    587         %tmp2 = load <2 x i32>, <2 x i32>* %B
    588         %tmp3 = load <2 x i64>, <2 x i64>* %C
    589         %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    590         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
    591         %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
    592         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
    593         ret <2 x i64> %tmp5
    594 }
    595 
    596 define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
    597 ;CHECK-LABEL: sabal2_8h:
    598 ;CHECK: sabal.8h
    599         %load1 = load <16 x i8>, <16 x i8>* %A
    600         %load2 = load <16 x i8>, <16 x i8>* %B
    601         %tmp3 = load <8 x i16>, <8 x i16>* %C
    602         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    603         %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    604         %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    605         %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
    606         %tmp5 = add <8 x i16> %tmp3, %tmp4.1
    607         ret <8 x i16> %tmp5
    608 }
    609 
    610 define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
    611 ;CHECK-LABEL: sabal2_4s:
    612 ;CHECK: sabal.4s
    613         %load1 = load <8 x i16>, <8 x i16>* %A
    614         %load2 = load <8 x i16>, <8 x i16>* %B
    615         %tmp3 = load <4 x i32>, <4 x i32>* %C
    616         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    617         %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    618         %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    619         %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
    620         %tmp5 = add <4 x i32> %tmp3, %tmp4.1
    621         ret <4 x i32> %tmp5
    622 }
    623 
    624 define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
    625 ;CHECK-LABEL: sabal2_2d:
    626 ;CHECK: sabal.2d
    627         %load1 = load <4 x i32>, <4 x i32>* %A
    628         %load2 = load <4 x i32>, <4 x i32>* %B
    629         %tmp3 = load <2 x i64>, <2 x i64>* %C
    630         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    631         %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    632         %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    633         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
    634         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
    635         ret <2 x i64> %tmp5
    636 }
    637 
    638 define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
    639 ;CHECK-LABEL: uabal8h:
    640 ;CHECK: uabal.8h
    641         %tmp1 = load <8 x i8>, <8 x i8>* %A
    642         %tmp2 = load <8 x i8>, <8 x i8>* %B
    643         %tmp3 = load <8 x i16>, <8 x i16>* %C
    644         %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    645         %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
    646         %tmp5 = add <8 x i16> %tmp3, %tmp4.1
    647         ret <8 x i16> %tmp5
    648 }
    649 
    650 define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
    651 ;CHECK-LABEL: uabal4s:
    652 ;CHECK: uabal.4s
    653         %tmp1 = load <4 x i16>, <4 x i16>* %A
    654         %tmp2 = load <4 x i16>, <4 x i16>* %B
    655         %tmp3 = load <4 x i32>, <4 x i32>* %C
    656         %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    657         %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
    658         %tmp5 = add <4 x i32> %tmp3, %tmp4.1
    659         ret <4 x i32> %tmp5
    660 }
    661 
    662 define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
    663 ;CHECK-LABEL: uabal2d:
    664 ;CHECK: uabal.2d
; uabd.v2i32 zero-extended to i64 plus accumulator from %C -> uabal.2d.
    665         %tmp1 = load <2 x i32>, <2 x i32>* %A
    666         %tmp2 = load <2 x i32>, <2 x i32>* %B
    667         %tmp3 = load <2 x i64>, <2 x i64>* %C
    668         %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    669         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
    670         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
    671         ret <2 x i64> %tmp5
    672 }
    673 
    674 define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
    675 ;CHECK-LABEL: uabal2_8h:
    676 ;CHECK: uabal.8h
; Same accumulate pattern as uabal8h, but the i8 operands are the high halves
; of 128-bit vectors (shuffle indices 8..15), exercising the second-half form.
    677         %load1 = load <16 x i8>, <16 x i8>* %A
    678         %load2 = load <16 x i8>, <16 x i8>* %B
    679         %tmp3 = load <8 x i16>, <8 x i16>* %C
    680         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    681         %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    682         %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    683         %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
    684         %tmp5 = add <8 x i16> %tmp3, %tmp4.1
    685         ret <8 x i16> %tmp5
    686 }
    687 
    688 define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
    689 ;CHECK-LABEL: uabal2_4s:
    690 ;CHECK: uabal.4s
; High halves (shuffle indices 4..7) of <8 x i16> inputs fed to uabd.v4i16,
; widened and accumulated: second-half accumulate variant.
    691         %load1 = load <8 x i16>, <8 x i16>* %A
    692         %load2 = load <8 x i16>, <8 x i16>* %B
    693         %tmp3 = load <4 x i32>, <4 x i32>* %C
    694         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    695         %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    696         %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    697         %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
    698         %tmp5 = add <4 x i32> %tmp3, %tmp4.1
    699         ret <4 x i32> %tmp5
    700 }
    701 
    702 define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
    703 ;CHECK-LABEL: uabal2_2d:
    704 ;CHECK: uabal.2d
; High halves (shuffle indices 2..3) of <4 x i32> inputs fed to uabd.v2i32,
; widened and accumulated: second-half accumulate variant.
    705         %load1 = load <4 x i32>, <4 x i32>* %A
    706         %load2 = load <4 x i32>, <4 x i32>* %B
    707         %tmp3 = load <2 x i64>, <2 x i64>* %C
    708         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    709         %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    710         %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    711         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
    712         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
    713         ret <2 x i64> %tmp5
    714 }
    715 
    716 define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
    717 ;CHECK-LABEL: saba_8b:
    718 ;CHECK: saba.8b
; sabd.v8i8 followed by an add of a same-width accumulator (no widening):
; should select the signed absolute-difference-and-accumulate saba.8b.
    719         %tmp1 = load <8 x i8>, <8 x i8>* %A
    720         %tmp2 = load <8 x i8>, <8 x i8>* %B
    721         %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    722         %tmp4 = load <8 x i8>, <8 x i8>* %C
    723         %tmp5 = add <8 x i8> %tmp3, %tmp4
    724         ret <8 x i8> %tmp5
    725 }
    726 
    727 define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
    728 ;CHECK-LABEL: saba_16b:
    729 ;CHECK: saba.16b
; sabd.v16i8 + same-width accumulate -> saba.16b.
    730         %tmp1 = load <16 x i8>, <16 x i8>* %A
    731         %tmp2 = load <16 x i8>, <16 x i8>* %B
    732         %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
    733         %tmp4 = load <16 x i8>, <16 x i8>* %C
    734         %tmp5 = add <16 x i8> %tmp3, %tmp4
    735         ret <16 x i8> %tmp5
    736 }
    737 
    738 define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
    739 ;CHECK-LABEL: saba_4h:
    740 ;CHECK: saba.4h
; sabd.v4i16 + same-width accumulate -> saba.4h.
    741         %tmp1 = load <4 x i16>, <4 x i16>* %A
    742         %tmp2 = load <4 x i16>, <4 x i16>* %B
    743         %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    744         %tmp4 = load <4 x i16>, <4 x i16>* %C
    745         %tmp5 = add <4 x i16> %tmp3, %tmp4
    746         ret <4 x i16> %tmp5
    747 }
    748 
    749 define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
    750 ;CHECK-LABEL: saba_8h:
    751 ;CHECK: saba.8h
; sabd.v8i16 + same-width accumulate -> saba.8h.
    752         %tmp1 = load <8 x i16>, <8 x i16>* %A
    753         %tmp2 = load <8 x i16>, <8 x i16>* %B
    754         %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
    755         %tmp4 = load <8 x i16>, <8 x i16>* %C
    756         %tmp5 = add <8 x i16> %tmp3, %tmp4
    757         ret <8 x i16> %tmp5
    758 }
    759 
    760 define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
    761 ;CHECK-LABEL: saba_2s:
    762 ;CHECK: saba.2s
; sabd.v2i32 + same-width accumulate -> saba.2s.
    763         %tmp1 = load <2 x i32>, <2 x i32>* %A
    764         %tmp2 = load <2 x i32>, <2 x i32>* %B
    765         %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    766         %tmp4 = load <2 x i32>, <2 x i32>* %C
    767         %tmp5 = add <2 x i32> %tmp3, %tmp4
    768         ret <2 x i32> %tmp5
    769 }
    770 
    771 define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
    772 ;CHECK-LABEL: saba_4s:
    773 ;CHECK: saba.4s
; sabd.v4i32 + same-width accumulate -> saba.4s.
    774         %tmp1 = load <4 x i32>, <4 x i32>* %A
    775         %tmp2 = load <4 x i32>, <4 x i32>* %B
    776         %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
    777         %tmp4 = load <4 x i32>, <4 x i32>* %C
    778         %tmp5 = add <4 x i32> %tmp3, %tmp4
    779         ret <4 x i32> %tmp5
    780 }
    781 
    782 define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
    783 ;CHECK-LABEL: uaba_8b:
    784 ;CHECK: uaba.8b
; Unsigned counterpart of saba_8b: uabd.v8i8 + same-width accumulate -> uaba.8b.
    785         %tmp1 = load <8 x i8>, <8 x i8>* %A
    786         %tmp2 = load <8 x i8>, <8 x i8>* %B
    787         %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
    788         %tmp4 = load <8 x i8>, <8 x i8>* %C
    789         %tmp5 = add <8 x i8> %tmp3, %tmp4
    790         ret <8 x i8> %tmp5
    791 }
    792 
    793 define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
    794 ;CHECK-LABEL: uaba_16b:
    795 ;CHECK: uaba.16b
; uabd.v16i8 + same-width accumulate -> uaba.16b.
    796         %tmp1 = load <16 x i8>, <16 x i8>* %A
    797         %tmp2 = load <16 x i8>, <16 x i8>* %B
    798         %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
    799         %tmp4 = load <16 x i8>, <16 x i8>* %C
    800         %tmp5 = add <16 x i8> %tmp3, %tmp4
    801         ret <16 x i8> %tmp5
    802 }
    803 
    804 define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
    805 ;CHECK-LABEL: uaba_4h:
    806 ;CHECK: uaba.4h
; uabd.v4i16 + same-width accumulate -> uaba.4h.
    807         %tmp1 = load <4 x i16>, <4 x i16>* %A
    808         %tmp2 = load <4 x i16>, <4 x i16>* %B
    809         %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
    810         %tmp4 = load <4 x i16>, <4 x i16>* %C
    811         %tmp5 = add <4 x i16> %tmp3, %tmp4
    812         ret <4 x i16> %tmp5
    813 }
    814 
    815 define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
    816 ;CHECK-LABEL: uaba_8h:
    817 ;CHECK: uaba.8h
; uabd.v8i16 + same-width accumulate -> uaba.8h.
    818         %tmp1 = load <8 x i16>, <8 x i16>* %A
    819         %tmp2 = load <8 x i16>, <8 x i16>* %B
    820         %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
    821         %tmp4 = load <8 x i16>, <8 x i16>* %C
    822         %tmp5 = add <8 x i16> %tmp3, %tmp4
    823         ret <8 x i16> %tmp5
    824 }
    825 
    826 define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
    827 ;CHECK-LABEL: uaba_2s:
    828 ;CHECK: uaba.2s
; uabd.v2i32 + same-width accumulate -> uaba.2s.
    829         %tmp1 = load <2 x i32>, <2 x i32>* %A
    830         %tmp2 = load <2 x i32>, <2 x i32>* %B
    831         %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
    832         %tmp4 = load <2 x i32>, <2 x i32>* %C
    833         %tmp5 = add <2 x i32> %tmp3, %tmp4
    834         ret <2 x i32> %tmp5
    835 }
    836 
    837 define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
    838 ;CHECK-LABEL: uaba_4s:
    839 ;CHECK: uaba.4s
; uabd.v4i32 + same-width accumulate -> uaba.4s.
    840         %tmp1 = load <4 x i32>, <4 x i32>* %A
    841         %tmp2 = load <4 x i32>, <4 x i32>* %B
    842         %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
    843         %tmp4 = load <4 x i32>, <4 x i32>* %C
    844         %tmp5 = add <4 x i32> %tmp3, %tmp4
    845         ret <4 x i32> %tmp5
    846 }
    847 
    848 ; Scalar FABD
    849 define float @fabds(float %a, float %b) nounwind {
    850 ; CHECK-LABEL: fabds:
    851 ; CHECK: fabd s0, s0, s1
; Scalar f32 absolute-difference intrinsic maps directly to fabd on s-registers.
    852   %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
    853   ret float %vabd.i
    854 }
    855 
    856 define double @fabdd(double %a, double %b) nounwind {
    857 ; CHECK-LABEL: fabdd:
    858 ; CHECK: fabd d0, d0, d1
; Scalar f64 absolute-difference intrinsic maps directly to fabd on d-registers.
    859   %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
    860   ret double %vabd.i
    861 }
    862 
    863 declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
    864 declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
    865 
    866 define float @fabds_from_fsub_fabs(float %a, float %b) nounwind {
    867 ; CHECK-LABEL: fabds_from_fsub_fabs:
    868 ; CHECK: fabd s0, s0, s1
; fabs(fsub %a, %b) should be combined into a single scalar fabd, with no
; separate fsub/fabs pair.
    869   %sub = fsub float %a, %b
    870   %abs = tail call float @llvm.fabs.f32(float %sub)
    871   ret float %abs
    872 }
    873 
    874 define double @fabdd_from_fsub_fabs(double %a, double %b) nounwind {
    875 ; CHECK-LABEL: fabdd_from_fsub_fabs:
    876 ; CHECK: fabd d0, d0, d1
; Double-precision variant of the fsub+fabs -> fabd combine.
    877   %sub = fsub double %a, %b
    878   %abs = tail call double @llvm.fabs.f64(double %sub)
    879   ret double %abs
    880 }
    881 
    882 declare float @llvm.fabs.f32(float) nounwind readnone
    883 declare double @llvm.fabs.f64(double) nounwind readnone
    884 
    885 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
    886 ; CHECK-LABEL: uabdl_from_extract_dup:
    887 ; CHECK-NOT: ext.16b
    888 ; CHECK: uabdl2.2d
; The rhs is a splat of a scalar and the lhs is the extracted high half of a
; 128-bit vector; the extract should fold into the second-half widening
; instruction instead of being materialized with a separate ext.
    889   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
    890   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
    891 
    892   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    893 
    894   %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
    895   %res1 = zext <2 x i32> %res to <2 x i64>
    896   ret <2 x i64> %res1
    897 }
    898 
    899 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
    900 ; CHECK-LABEL: sabdl_from_extract_dup:
    901 ; CHECK-NOT: ext.16b
    902 ; CHECK: sabdl2.2d
; Signed counterpart of uabdl_from_extract_dup: the high-half extract should
; fold into the second-half widening instruction, with no separate ext.
    903   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
    904   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
    905 
    906   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
    907 
    908   %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
    909   %res1 = zext <2 x i32> %res to <2 x i64>
    910   ret <2 x i64> %res1
    911 }
    912 
    913 define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
    914 ; CHECK-LABEL: abspattern1:
    915 ; CHECK: abs.2s
    916 ; CHECK-NEXT: ret
; Integer abs idiom: select(a >= 0, a, 0-a) should select a single abs.2s.
    917         %tmp1neg = sub <2 x i32> zeroinitializer, %a
    918         %b = icmp sge <2 x i32> %a, zeroinitializer
    919         %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
    920         ret <2 x i32> %abs
    921 }
    922 
    923 define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
    924 ; CHECK-LABEL: abspattern2:
    925 ; CHECK: abs.4h
    926 ; CHECK-NEXT: ret
; Same abs idiom with the sgt predicate (a > 0 ? a : -a) -> abs.4h.
    927         %tmp1neg = sub <4 x i16> zeroinitializer, %a
    928         %b = icmp sgt <4 x i16> %a, zeroinitializer
    929         %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
    930         ret <4 x i16> %abs
    931 }
    932 
    933 define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
    934 ; CHECK-LABEL: abspattern3:
    935 ; CHECK: abs.8b
    936 ; CHECK-NEXT: ret
; Abs idiom with inverted predicate and swapped select arms
; (a < 0 ? -a : a) -> abs.8b.
    937         %tmp1neg = sub <8 x i8> zeroinitializer, %a
    938         %b = icmp slt <8 x i8> %a, zeroinitializer
    939         %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
    940         ret <8 x i8> %abs
    941 }
    942 
    943 define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
    944 ; CHECK-LABEL: abspattern4:
    945 ; CHECK: abs.4s
    946 ; CHECK-NEXT: ret
; 128-bit version of abspattern1 (sge predicate) -> abs.4s.
    947         %tmp1neg = sub <4 x i32> zeroinitializer, %a
    948         %b = icmp sge <4 x i32> %a, zeroinitializer
    949         %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
    950         ret <4 x i32> %abs
    951 }
    952 
    953 define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
    954 ; CHECK-LABEL: abspattern5:
    955 ; CHECK: abs.8h
    956 ; CHECK-NEXT: ret
; 128-bit version of abspattern2 (sgt predicate) -> abs.8h.
    957         %tmp1neg = sub <8 x i16> zeroinitializer, %a
    958         %b = icmp sgt <8 x i16> %a, zeroinitializer
    959         %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
    960         ret <8 x i16> %abs
    961 }
    962 
    963 define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
    964 ; CHECK-LABEL: abspattern6:
    965 ; CHECK: abs.16b
    966 ; CHECK-NEXT: ret
; 128-bit version of abspattern3 (slt with swapped select arms) -> abs.16b.
    967         %tmp1neg = sub <16 x i8> zeroinitializer, %a
    968         %b = icmp slt <16 x i8> %a, zeroinitializer
    969         %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
    970         ret <16 x i8> %abs
    971 }
    972 
    973 define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
    974 ; CHECK-LABEL: abspattern7:
    975 ; CHECK: abs.2d
    976 ; CHECK-NEXT: ret
; Abs idiom with the sle predicate and swapped select arms
; (a <= 0 ? -a : a) -> abs.2d.
    977         %tmp1neg = sub <2 x i64> zeroinitializer, %a
    978         %b = icmp sle <2 x i64> %a, zeroinitializer
    979         %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
    980         ret <2 x i64> %abs
    981 }
    982