Home | History | Annotate | Download | only in AArch64
      1 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false -mcpu=cyclone | FileCheck %s
      2 
      3 define signext i8 @test_vaddv_s8(<8 x i8> %a1) { ; saddv reduction of <8 x i8>, returned as a sign-extended i8
      4 ; CHECK-LABEL: test_vaddv_s8:
      5 ; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
      6 ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
      7 ; CHECK-NEXT: ret
      8 entry:
      9   %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1) ; the intrinsic hands the sum back as i32
     10   %0 = trunc i32 %vaddv.i to i8 ; only the low byte is returned
     11   ret i8 %0 ; signext return: the expected sequence moves lane 0 into w0 with smov.b
     12 }
     13 
     14 define <8 x i8> @test_vaddv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) { ; saddv of %a2 written into lane 3 of %a1
     15 ; CHECK-LABEL: test_vaddv_s8_used_by_laneop:
     16 ; CHECK: addv.8b b[[REGNUM:[0-9]+]], v1
     17 ; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
     18 ; CHECK-NEXT: ret
     19 entry:
     20   %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a2) ; reduce the second argument
     21   %1 = trunc i32 %0 to i8 ; narrow to the element type
     22   %2 = insertelement <8 x i8> %a1, i8 %1, i32 3 ; expected as a single lane-to-lane mov.b, no general-purpose register in between
     23   ret <8 x i8> %2
     24 }
     25 
     26 define signext i16 @test_vaddv_s16(<4 x i16> %a1) { ; saddv reduction of <4 x i16>, returned as a sign-extended i16
     27 ; CHECK-LABEL: test_vaddv_s16:
     28 ; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
     29 ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
     30 ; CHECK-NEXT: ret
     31 entry:
     32   %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1) ; the intrinsic hands the sum back as i32
     33   %0 = trunc i32 %vaddv.i to i16 ; only the low halfword is returned
     34   ret i16 %0 ; signext return: the expected sequence moves lane 0 into w0 with smov.h
     35 }
     36 
     37 define <4 x i16> @test_vaddv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) { ; saddv of %a2 written into lane 3 of %a1
     38 ; CHECK-LABEL: test_vaddv_s16_used_by_laneop:
     39 ; CHECK: addv.4h h[[REGNUM:[0-9]+]], v1
     40 ; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
     41 ; CHECK-NEXT: ret
     42 entry:
     43   %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a2) ; reduce the second argument
     44   %1 = trunc i32 %0 to i16 ; narrow to the element type
     45   %2 = insertelement <4 x i16> %a1, i16 %1, i32 3 ; expected as a single lane-to-lane mov.h, no general-purpose register in between
     46   ret <4 x i16> %2
     47 }
     48 
     49 define i32 @test_vaddv_s32(<2 x i32> %a1) { ; saddv reduction of <2 x i32>, returned as i32
     50 ; CHECK-LABEL: test_vaddv_s32:
     51 ; The ISA has no across-lanes add for 2 x i32, so this special case expects a pairwise add (addp.2s) instead
     52 ; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
     53 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
     54 ; CHECK-NEXT: ret
     55 entry:
     56   %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1) ; result is already i32, so no truncation follows
     57   ret i32 %vaddv.i
     58 }
     59 
     60 define <2 x i32> @test_vaddv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) { ; saddv of %a2 written into lane 1 of %a1
     61 ; CHECK-LABEL: test_vaddv_s32_used_by_laneop:
     62 ; CHECK: addp.2s v[[REGNUM:[0-9]+]], v1, v1
     63 ; CHECK-NEXT: mov.s v0[1], v[[REGNUM]][0]
     64 ; CHECK-NEXT: ret
     65 entry:
     66   %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a2) ; reduce the second argument; expected as the pairwise addp.2s form
     67   %1 = insertelement <2 x i32> %a1, i32 %0, i32 1 ; expected as a single lane-to-lane mov.s, no general-purpose register in between
     68   ret <2 x i32> %1
     69 }
     70 
     71 define i64 @test_vaddv_s64(<2 x i64> %a1) { ; saddv reduction of <2 x i64>, returned as i64
     72 ; CHECK-LABEL: test_vaddv_s64:
     73 ; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
     74 ; CHECK-NEXT: fmov x0, [[REGNUM]]
     75 ; CHECK-NEXT: ret
     76 entry:
     77   %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1) ; expected as the scalar pairwise add addp.2d writing a d register
     78   ret i64 %vaddv.i ; the d register is expected to be moved to x0 with fmov
     79 }
     80 
     81 define <2 x i64> @test_vaddv_s64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) { ; saddv of %a2 written into lane 1 of %a1
     82 ; CHECK-LABEL: test_vaddv_s64_used_by_laneop:
     83 ; CHECK: addp.2d d[[REGNUM:[0-9]+]], v1
     84 ; CHECK-NEXT: mov.d v0[1], v[[REGNUM]][0]
     85 ; CHECK-NEXT: ret
     86 entry:
     87   %0 = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a2) ; reduce the second argument
     88   %1 = insertelement <2 x i64> %a1, i64 %0, i64 1 ; expected as a single lane-to-lane mov.d, no general-purpose register in between
     89   ret <2 x i64> %1
     90 }
     91 
     92 define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) { ; uaddv reduction of <8 x i8>, returned as a zero-extended i8
     93 ; CHECK-LABEL: test_vaddv_u8:
     94 ; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
     95 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
     96 ; CHECK-NEXT: ret
     97 entry:
     98   %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1) ; the intrinsic hands the sum back as i32
     99   %0 = trunc i32 %vaddv.i to i8 ; only the low byte is returned
    100   ret i8 %0 ; zeroext return: expected output is a plain fmov of the s register, with no separate extension instruction
    101 }
    102 
    103 define <8 x i8> @test_vaddv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) { ; uaddv of %a2 written into lane 3 of %a1
    104 ; CHECK-LABEL: test_vaddv_u8_used_by_laneop:
    105 ; CHECK: addv.8b b[[REGNUM:[0-9]+]], v1
    106 ; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
    107 ; CHECK-NEXT: ret
    108 entry:
    109   %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a2) ; reduce the second argument
    110   %1 = trunc i32 %0 to i8 ; narrow to the element type
    111   %2 = insertelement <8 x i8> %a1, i8 %1, i32 3 ; expected as a single lane-to-lane mov.b, no general-purpose register in between
    112   ret <8 x i8> %2
    113 }
    114 
    115 define i32 @test_vaddv_u8_masked(<8 x i8> %a1) { ; uaddv of <8 x i8> followed by a mask that keeps the low 9 bits
    116 ; CHECK-LABEL: test_vaddv_u8_masked:
    117 ; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
    118 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
    119 ; CHECK-NEXT: ret
    120 entry:
    121   %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1) ; the intrinsic hands the sum back as i32
    122   %0 = and i32 %vaddv.i, 511 ; 0x1ff -- the expected output has no 'and', so the mask is expected to fold away (presumably because the b-register sum fits in 8 bits; confirm against the backend)
    123   ret i32 %0
    124 }
    125 
    126 define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) { ; uaddv reduction of <4 x i16>, returned as a zero-extended i16
    127 ; CHECK-LABEL: test_vaddv_u16:
    128 ; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
    129 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
    130 ; CHECK-NEXT: ret
    131 entry:
    132   %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1) ; the intrinsic hands the sum back as i32
    133   %0 = trunc i32 %vaddv.i to i16 ; only the low halfword is returned
    134   ret i16 %0 ; zeroext return: expected output is a plain fmov of the s register, with no separate extension instruction
    135 }
    136 
    137 define <4 x i16> @test_vaddv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) { ; uaddv of %a2 written into lane 3 of %a1
    138 ; CHECK-LABEL: test_vaddv_u16_used_by_laneop:
    139 ; CHECK: addv.4h h[[REGNUM:[0-9]+]], v1
    140 ; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
    141 ; CHECK-NEXT: ret
    142 entry:
    143   %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a2) ; reduce the second argument
    144   %1 = trunc i32 %0 to i16 ; narrow to the element type
    145   %2 = insertelement <4 x i16> %a1, i16 %1, i32 3 ; expected as a single lane-to-lane mov.h, no general-purpose register in between
    146   ret <4 x i16> %2
    147 }
    148 
    149 define i32 @test_vaddv_u16_masked(<4 x i16> %a1) { ; uaddv of <4 x i16> followed by a mask whose low 16 bits are all set
    150 ; CHECK-LABEL: test_vaddv_u16_masked:
    151 ; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
    152 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
    153 ; CHECK-NEXT: ret
    154 entry:
    155   %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1) ; the intrinsic hands the sum back as i32
    156   %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff -- the expected output has no 'and', so the mask is expected to fold away (presumably because the h-register sum fits in 16 bits; confirm against the backend)
    157   ret i32 %0
    158 }
    159 
    160 define i32 @test_vaddv_u32(<2 x i32> %a1) { ; uaddv reduction of <2 x i32>, returned as i32
    161 ; CHECK-LABEL: test_vaddv_u32:
    162 ; The ISA has no across-lanes add for 2 x i32, so this special case expects a pairwise add (addp.2s) instead
    163 ; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
    164 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
    165 ; CHECK-NEXT: ret
    166 entry:
    167   %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1) ; result is already i32, so no truncation follows
    168   ret i32 %vaddv.i
    169 }
    170 
    171 define <2 x i32> @test_vaddv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) { ; uaddv of %a2 written into lane 1 of %a1
    172 ; CHECK-LABEL: test_vaddv_u32_used_by_laneop:
    173 ; CHECK: addp.2s v[[REGNUM:[0-9]+]], v1, v1
    174 ; CHECK-NEXT: mov.s v0[1], v[[REGNUM]][0]
    175 ; CHECK-NEXT: ret
    176 entry:
    177   %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a2) ; reduce the second argument; expected as the pairwise addp.2s form
    178   %1 = insertelement <2 x i32> %a1, i32 %0, i32 1 ; expected as a single lane-to-lane mov.s, no general-purpose register in between
    179   ret <2 x i32> %1
    180 }
    181 
    182 define float @test_vaddv_f32(<2 x float> %a1) { ; faddv reduction of <2 x float>, returned as float
    183 ; CHECK-LABEL: test_vaddv_f32:
    184 ; CHECK: faddp.2s s0, v0
    185 ; CHECK-NEXT: ret
    186 entry:
    187   %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1) ; expected as one scalar pairwise faddp.2s writing s0 directly
    188   ret float %vaddv.i
    189 }
    190 
    191 define float @test_vaddv_v4f32(<4 x float> %a1) { ; faddv reduction of <4 x float>, returned as float
    192 ; CHECK-LABEL: test_vaddv_v4f32:
    193 ; CHECK: faddp.4s [[REGNUM:v[0-9]+]], v0, v0
    194 ; CHECK-NEXT: faddp.2s s0, [[REGNUM]]
    195 ; CHECK-NEXT: ret
    196 entry:
    197   %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1) ; expected as two back-to-back pairwise adds: a vector faddp.4s, then a scalar faddp.2s into s0
    198   ret float %vaddv.i ; the directives above are chained so that no instruction may sit between the two pairwise adds
    199 }
    200 
    201 define double @test_vaddv_f64(<2 x double> %a1) { ; faddv reduction of <2 x double>, returned as double
    202 ; CHECK-LABEL: test_vaddv_f64:
    203 ; CHECK: faddp.2d d0, v0
    204 ; CHECK-NEXT: ret
    205 entry:
    206   %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1) ; expected as one scalar pairwise faddp.2d writing d0 directly
    207   ret double %vaddv.i
    208 }
    209 
    210 define i64 @test_vaddv_u64(<2 x i64> %a1) { ; uaddv reduction of <2 x i64>, returned as i64
    211 ; CHECK-LABEL: test_vaddv_u64:
    212 ; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
    213 ; CHECK-NEXT: fmov x0, [[REGNUM]]
    214 ; CHECK-NEXT: ret
    215 entry:
    216   %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1) ; expected as the scalar pairwise add addp.2d writing a d register
    217   ret i64 %vaddv.i ; the d register is expected to be moved to x0 with fmov
    218 }
    219 
    220 define <2 x i64> @test_vaddv_u64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) { ; uaddv of %a2 written into lane 1 of %a1
    221 ; CHECK-LABEL: test_vaddv_u64_used_by_laneop:
    222 ; CHECK: addp.2d d[[REGNUM:[0-9]+]], v1
    223 ; CHECK-NEXT: mov.d v0[1], v[[REGNUM]][0]
    224 ; CHECK-NEXT: ret
    225 entry:
    226   %0 = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a2) ; reduce the second argument
    227   %1 = insertelement <2 x i64> %a1, i64 %0, i64 1 ; expected as a single lane-to-lane mov.d, no general-purpose register in between
    228   ret <2 x i64> %1
    229 }
    230 
    231 define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) { ; uaddv of <2 x i64> wrapped into a <1 x i64>; the sum is expected to stay in d0
    232 ; CHECK-LABEL: test_vaddv_u64_to_vec:
    233 ; CHECK: addp.2d d0, v0
    234 ; CHECK-NOT: fmov
    235 ; CHECK-NOT: {{ins|mov}}.d
    236 ; CHECK: ret
    237 entry:
    238   %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1) ; expected to write d0 directly, which is also the return register
    239   %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0 ; must cost nothing: lane inserts print as mov.d under this syntax (see the *_used_by_laneop expectations), so both the ins and mov spellings are rejected
    240   ret <1 x i64> %vec
    241 }
    242 
    243 define signext i8 @test_vaddvq_s8(<16 x i8> %a1) { ; saddv reduction of <16 x i8>, returned as a sign-extended i8
    244 ; CHECK-LABEL: test_vaddvq_s8:
    245 ; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
    246 ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
    247 ; CHECK-NEXT: ret
    248 entry:
    249   %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1) ; the intrinsic hands the sum back as i32
    250   %0 = trunc i32 %vaddv.i to i8 ; only the low byte is returned
    251   ret i8 %0 ; signext return: the expected sequence moves lane 0 into w0 with smov.b
    252 }
    253 
    254 define <16 x i8> @test_vaddvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) { ; saddv of %a2 written into lane 3 of %a1
    255 ; CHECK-LABEL: test_vaddvq_s8_used_by_laneop:
    256 ; CHECK: addv.16b b[[REGNUM:[0-9]+]], v1
    257 ; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
    258 ; CHECK-NEXT: ret
    259 entry:
    260   %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a2) ; reduce the second argument
    261   %1 = trunc i32 %0 to i8 ; narrow to the element type
    262   %2 = insertelement <16 x i8> %a1, i8 %1, i32 3 ; expected as a single lane-to-lane mov.b, no general-purpose register in between
    263   ret <16 x i8> %2
    264 }
    265 
    266 define signext i16 @test_vaddvq_s16(<8 x i16> %a1) { ; saddv reduction of <8 x i16>, returned as a sign-extended i16
    267 ; CHECK-LABEL: test_vaddvq_s16:
    268 ; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
    269 ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
    270 ; CHECK-NEXT: ret
    271 entry:
    272   %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1) ; the intrinsic hands the sum back as i32
    273   %0 = trunc i32 %vaddv.i to i16 ; only the low halfword is returned
    274   ret i16 %0 ; signext return: the expected sequence moves lane 0 into w0 with smov.h
    275 }
    276 
    277 define <8 x i16> @test_vaddvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) { ; saddv of %a2 written into lane 3 of %a1
    278 ; CHECK-LABEL: test_vaddvq_s16_used_by_laneop:
    279 ; CHECK: addv.8h h[[REGNUM:[0-9]+]], v1
    280 ; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
    281 ; CHECK-NEXT: ret
    282 entry:
    283   %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a2) ; reduce the second argument
    284   %1 = trunc i32 %0 to i16 ; narrow to the element type
    285   %2 = insertelement <8 x i16> %a1, i16 %1, i32 3 ; expected as a single lane-to-lane mov.h, no general-purpose register in between
    286   ret <8 x i16> %2
    287 }
    288 
    289 define i32 @test_vaddvq_s32(<4 x i32> %a1) { ; saddv reduction of <4 x i32>, returned as i32
    290 ; CHECK-LABEL: test_vaddvq_s32:
    291 ; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
    292 ; CHECK-NEXT: fmov w0, [[REGNUM]]
    293 ; CHECK-NEXT: ret
    294 entry:
    295   %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1) ; result is already i32, so no truncation follows
    296   ret i32 %vaddv.i ; the s register is expected to be moved to w0 with fmov
    297 }
    298 
    299 define <4 x i32> @test_vaddvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) { ; saddv of %a2 written into lane 3 of %a1
    300 ; CHECK-LABEL: test_vaddvq_s32_used_by_laneop:
    301 ; CHECK: addv.4s s[[REGNUM:[0-9]+]], v1
    302 ; CHECK-NEXT: mov.s v0[3], v[[REGNUM]][0]
    303 ; CHECK-NEXT: ret
    304 entry:
    305   %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a2) ; reduce the second argument
    306   %1 = insertelement <4 x i32> %a1, i32 %0, i32 3 ; expected as a single lane-to-lane mov.s, no general-purpose register in between
    307   ret <4 x i32> %1
    308 }
    309 
    310 define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) { ; uaddv reduction of <16 x i8>, returned as a zero-extended i8
    311 ; CHECK-LABEL: test_vaddvq_u8:
    312 ; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
    313 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
    314 ; CHECK-NEXT: ret
    315 entry:
    316   %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1) ; the intrinsic hands the sum back as i32
    317   %0 = trunc i32 %vaddv.i to i8 ; only the low byte is returned
    318   ret i8 %0 ; zeroext return: expected output is a plain fmov of the s register, with no separate extension instruction
    319 }
    320 
    321 define <16 x i8> @test_vaddvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) { ; uaddv of %a2 written into lane 3 of %a1
    322 ; CHECK-LABEL: test_vaddvq_u8_used_by_laneop:
    323 ; CHECK: addv.16b b[[REGNUM:[0-9]+]], v1
    324 ; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
    325 ; CHECK-NEXT: ret
    326 entry:
    327   %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a2) ; reduce the second argument
    328   %1 = trunc i32 %0 to i8 ; narrow to the element type
    329   %2 = insertelement <16 x i8> %a1, i8 %1, i32 3 ; expected as a single lane-to-lane mov.b, no general-purpose register in between
    330   ret <16 x i8> %2
    331 }
    332 
    333 define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) { ; uaddv reduction of <8 x i16>, returned as a zero-extended i16
    334 ; CHECK-LABEL: test_vaddvq_u16:
    335 ; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
    336 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
    337 ; CHECK-NEXT: ret
    338 entry:
    339   %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1) ; the intrinsic hands the sum back as i32
    340   %0 = trunc i32 %vaddv.i to i16 ; only the low halfword is returned
    341   ret i16 %0 ; zeroext return: expected output is a plain fmov of the s register, with no separate extension instruction
    342 }
    343 
    344 define <8 x i16> @test_vaddvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) { ; uaddv of %a2 written into lane 3 of %a1
    345 ; CHECK-LABEL: test_vaddvq_u16_used_by_laneop:
    346 ; CHECK: addv.8h h[[REGNUM:[0-9]+]], v1
    347 ; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
    348 ; CHECK-NEXT: ret
    349 entry:
    350   %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a2) ; reduce the second argument
    351   %1 = trunc i32 %0 to i16 ; narrow to the element type
    352   %2 = insertelement <8 x i16> %a1, i16 %1, i32 3 ; expected as a single lane-to-lane mov.h, no general-purpose register in between
    353   ret <8 x i16> %2
    354 }
    355 
    356 define i32 @test_vaddvq_u32(<4 x i32> %a1) { ; uaddv reduction of <4 x i32>, returned as i32
    357 ; CHECK-LABEL: test_vaddvq_u32:
    358 ; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
    359 ; CHECK-NEXT: fmov w0, [[REGNUM]]
    360 ; CHECK-NEXT: ret
    361 entry:
    362   %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1) ; result is already i32, so no truncation follows
    363   ret i32 %vaddv.i ; the i32 result is returned in w0, so the fmov destination is pinned to w0 (as in test_vaddvq_s32) rather than captured in an unused variable
    364 }
    365 
    366 define <4 x i32> @test_vaddvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
    367 ; CHECK-LABEL: test_vaddvq_u32_used_by_laneop:
    368 ; CHECK: addv.4s s[[REGNUM:[0-9]+]], v1
    369 ; CHECK-NEXT: mov.s v0[3], v[[REGNUM]][0]
    370 ; CHECK-NEXT: ret
    371 entry:
    372   %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a2)
    373   %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
    374   ret <4 x i32> %1
    375 }
    376 
    377 declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>) ; called by test_vaddvq_u32 and its _used_by_laneop variant
    378 
    379 declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>) ; called by test_vaddvq_u16 and its _used_by_laneop variant
    380 
    381 declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>) ; called by test_vaddvq_u8 and its _used_by_laneop variant
    382 
    383 declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>) ; called by test_vaddvq_s32 and its _used_by_laneop variant
    384 
    385 declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>) ; called by test_vaddvq_s16 and its _used_by_laneop variant
    386 
    387 declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>) ; called by test_vaddvq_s8 and its _used_by_laneop variant
    388 
    389 declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>) ; called by test_vaddv_u64 and its _used_by_laneop / _to_vec variants
    390 
    391 declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>) ; called by test_vaddv_u32 and its _used_by_laneop variant
    392 
    393 declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>) ; called by test_vaddv_u16 and its _used_by_laneop / _masked variants
    394 
    395 declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>) ; called by test_vaddv_u8 and its _used_by_laneop / _masked variants
    396 
    397 declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>) ; called by test_vaddv_s32 and its _used_by_laneop variant
    398 
    399 declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>) ; called by test_vaddv_s64 and its _used_by_laneop variant
    400 
    401 declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>) ; called by test_vaddv_s16 and its _used_by_laneop variant
    402 
    403 declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>) ; called by test_vaddv_s8 and its _used_by_laneop variant
    404 
    405 declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float>) ; called by test_vaddv_f32
    406 declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float>) ; called by test_vaddv_v4f32
    407 declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double>) ; called by test_vaddv_f64
    408