Home | History | Annotate | Download | only in ARM
      1 ; RUN: llc -mattr=+neon < %s | FileCheck %s
      2 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" ; "e" = little-endian, "p:32:32:32" = 32-bit pointers; v64/v128 give the alignment of the NEON vector types used below
      3 target triple = "thumbv7-elf" ; Thumb ARMv7, ELF object format; NEON itself is enabled by -mattr=+neon on the llc command line
      4 
      5 define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; 64-bit vector, i16 lanes: both operands come from memory
      6 ;CHECK-LABEL: vqdmulhs16:
      7 ;CHECK: vqdmulh.s16
      8   %lhs = load <4 x i16>, <4 x i16>* %A ; first multiplicand
      9   %rhs = load <4 x i16>, <4 x i16>* %B ; second multiplicand
     10   %res = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) ; intrinsic must be selected as vqdmulh.s16
     11   ret <4 x i16> %res
     12 }
     13 
     14 define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; 64-bit vector, i32 lanes: both operands come from memory
     15 ;CHECK-LABEL: vqdmulhs32:
     16 ;CHECK: vqdmulh.s32
     17   %lhs = load <2 x i32>, <2 x i32>* %A ; first multiplicand
     18   %rhs = load <2 x i32>, <2 x i32>* %B ; second multiplicand
     19   %res = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ; intrinsic must be selected as vqdmulh.s32
     20   ret <2 x i32> %res
     21 }
     22 
     23 define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; 128-bit vector, i16 lanes: both operands come from memory
     24 ;CHECK-LABEL: vqdmulhQs16:
     25 ;CHECK: vqdmulh.s16
     26   %lhs = load <8 x i16>, <8 x i16>* %A ; first multiplicand
     27   %rhs = load <8 x i16>, <8 x i16>* %B ; second multiplicand
     28   %res = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) ; intrinsic must be selected as vqdmulh.s16
     29   ret <8 x i16> %res
     30 }
     31 
     32 define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; 128-bit vector, i32 lanes: both operands come from memory
     33 ;CHECK-LABEL: vqdmulhQs32:
     34 ;CHECK: vqdmulh.s32
     35   %lhs = load <4 x i32>, <4 x i32>* %A ; first multiplicand
     36   %rhs = load <4 x i32>, <4 x i32>* %B ; second multiplicand
     37   %res = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ; intrinsic must be selected as vqdmulh.s32
     38   ret <4 x i32> %res
     39 }
     40 
     41 define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; by-lane form: 128-bit vqdmulh.s16 with a splatted lane of a 64-bit operand
     42 entry:
     43 ; CHECK-LABEL: test_vqdmulhQ_lanes16:
     44 ; CHECK: vqdmulh.s16 q0, q0, d2[1]
     45   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the second argument to all 8 lanes
     46   %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; the splat is expected to fold into the d2[1] scalar operand
     47   ret <8 x i16> %1
     48 }
     49 
     50 define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; by-lane form: 128-bit vqdmulh.s32 with a splatted lane of a 64-bit operand
     51 entry:
     52 ; CHECK-LABEL: test_vqdmulhQ_lanes32:
     53 ; CHECK: vqdmulh.s32 q0, q0, d2[1]
     54   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the second argument to all 4 lanes
     55   %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; the splat is expected to fold into the d2[1] scalar operand
     56   ret <4 x i32> %1
     57 }
     58 
     59 define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; by-lane form: 64-bit vqdmulh.s16 with a splatted lane of the second operand
     60 entry:
     61 ; CHECK-LABEL: test_vqdmulh_lanes16:
     62 ; CHECK: vqdmulh.s16 d0, d0, d1[1]
     63   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the second argument to all 4 lanes
     64   %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; the splat is expected to fold into the d1[1] scalar operand
     65   ret <4 x i16> %1
     66 }
     67 
     68 define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; by-lane form: 64-bit vqdmulh.s32 with a splatted lane of the second operand
     69 entry:
     70 ; CHECK-LABEL: test_vqdmulh_lanes32:
     71 ; CHECK: vqdmulh.s32 d0, d0, d1[1]
     72   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of the second argument to both lanes
     73   %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; the splat is expected to fold into the d1[1] scalar operand
     74   ret <2 x i32> %1
     75 }
     76 
     77 declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone ; vqdmulh intrinsic, 64-bit vector of i16 lanes
     78 declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone ; vqdmulh intrinsic, 64-bit vector of i32 lanes
     79 
     80 declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone ; vqdmulh intrinsic, 128-bit vector of i16 lanes
     81 declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone ; vqdmulh intrinsic, 128-bit vector of i32 lanes
     82 
     83 define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; 64-bit vector, i16 lanes: both operands come from memory
     84 ;CHECK-LABEL: vqrdmulhs16:
     85 ;CHECK: vqrdmulh.s16
     86   %lhs = load <4 x i16>, <4 x i16>* %A ; first multiplicand
     87   %rhs = load <4 x i16>, <4 x i16>* %B ; second multiplicand
     88   %res = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) ; intrinsic must be selected as vqrdmulh.s16
     89   ret <4 x i16> %res
     90 }
     91 
     92 define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; 64-bit vector, i32 lanes: both operands come from memory
     93 ;CHECK-LABEL: vqrdmulhs32:
     94 ;CHECK: vqrdmulh.s32
     95   %lhs = load <2 x i32>, <2 x i32>* %A ; first multiplicand
     96   %rhs = load <2 x i32>, <2 x i32>* %B ; second multiplicand
     97   %res = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ; intrinsic must be selected as vqrdmulh.s32
     98   ret <2 x i32> %res
     99 }
    100 
    101 define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; 128-bit vector, i16 lanes: both operands come from memory
    102 ;CHECK-LABEL: vqrdmulhQs16:
    103 ;CHECK: vqrdmulh.s16
    104   %lhs = load <8 x i16>, <8 x i16>* %A ; first multiplicand
    105   %rhs = load <8 x i16>, <8 x i16>* %B ; second multiplicand
    106   %res = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) ; intrinsic must be selected as vqrdmulh.s16
    107   ret <8 x i16> %res
    108 }
    109 
    110 define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ; 128-bit vector, i32 lanes: both operands come from memory
    111 ;CHECK-LABEL: vqrdmulhQs32:
    112 ;CHECK: vqrdmulh.s32
    113   %lhs = load <4 x i32>, <4 x i32>* %A ; first multiplicand
    114   %rhs = load <4 x i32>, <4 x i32>* %B ; second multiplicand
    115   %res = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ; intrinsic must be selected as vqrdmulh.s32
    116   ret <4 x i32> %res
    117 }
    118 
    119 define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; by-lane form: 128-bit vqrdmulh.s16 with a splatted lane of a 64-bit operand
    120 entry:
    121 ; CHECK-LABEL: test_vqRdmulhQ_lanes16:
    122 ; CHECK: vqrdmulh.s16 q0, q0, d2[1]
    123   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the second argument to all 8 lanes
    124   %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; the splat is expected to fold into the d2[1] scalar operand
    125   ret <8 x i16> %1
    126 }
    127 
    128 define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; by-lane form: 128-bit vqrdmulh.s32 with a splatted lane of a 64-bit operand
    129 entry:
    130 ; CHECK-LABEL: test_vqRdmulhQ_lanes32:
    131 ; CHECK: vqrdmulh.s32 q0, q0, d2[1]
    132   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the second argument to all 4 lanes
    133   %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; the splat is expected to fold into the d2[1] scalar operand
    134   ret <4 x i32> %1
    135 }
    136 
    137 define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; by-lane form: 64-bit vqrdmulh.s16 with a splatted lane of the second operand
    138 entry:
    139 ; CHECK-LABEL: test_vqRdmulh_lanes16:
    140 ; CHECK: vqrdmulh.s16 d0, d0, d1[1]
    141   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the second argument to all 4 lanes
    142   %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; the splat is expected to fold into the d1[1] scalar operand
    143   ret <4 x i16> %1
    144 }
    145 
    146 define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; by-lane form: 64-bit vqrdmulh.s32 with a splatted lane of the second operand
    147 entry:
    148 ; CHECK-LABEL: test_vqRdmulh_lanes32:
    149 ; CHECK: vqrdmulh.s32 d0, d0, d1[1]
    150   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of the second argument to both lanes
    151   %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; the splat is expected to fold into the d1[1] scalar operand
    152   ret <2 x i32> %1
    153 }
    154 
    155 declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone ; vqrdmulh intrinsic, 64-bit vector of i16 lanes
    156 declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone ; vqrdmulh intrinsic, 64-bit vector of i32 lanes
    157 
    158 declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone ; vqrdmulh intrinsic, 128-bit vector of i16 lanes
    159 declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone ; vqrdmulh intrinsic, 128-bit vector of i32 lanes
    160 
    161 define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ; widening multiply: <4 x i16> inputs from memory, <4 x i32> result
    162 ;CHECK-LABEL: vqdmulls16:
    163 ;CHECK: vqdmull.s16
    164   %lhs = load <4 x i16>, <4 x i16>* %A ; first multiplicand
    165   %rhs = load <4 x i16>, <4 x i16>* %B ; second multiplicand
    166   %wide = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) ; intrinsic must be selected as vqdmull.s16
    167   ret <4 x i32> %wide
    168 }
    169 
    170 define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ; widening multiply: <2 x i32> inputs from memory, <2 x i64> result
    171 ;CHECK-LABEL: vqdmulls32:
    172 ;CHECK: vqdmull.s32
    173   %lhs = load <2 x i32>, <2 x i32>* %A ; first multiplicand
    174   %rhs = load <2 x i32>, <2 x i32>* %B ; second multiplicand
    175   %wide = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs) ; intrinsic must be selected as vqdmull.s32
    176   ret <2 x i64> %wide
    177 }
    178 
    179 define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { ; by-lane form: widening vqdmull.s16 with a splatted lane of the second operand
    180 entry:
    181 ; CHECK-LABEL: test_vqdmull_lanes16:
    182 ; CHECK: vqdmull.s16 q0, d0, d1[1]
    183   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the second argument to all 4 lanes
    184   %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; the splat is expected to fold into the d1[1] scalar operand
    185   ret <4 x i32> %1
    186 }
    187 
    188 define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { ; by-lane form: widening vqdmull.s32 with a splatted lane of the second operand
    189 entry:
    190 ; CHECK-LABEL: test_vqdmull_lanes32:
    191 ; CHECK: vqdmull.s32 q0, d0, d1[1]
    192   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of the second argument to both lanes
    193   %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; the splat is expected to fold into the d1[1] scalar operand
    194   ret <2 x i64> %1
    195 }
    196 
    197 declare <4 x i32>  @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone ; vqdmull intrinsic: <4 x i16> operands, <4 x i32> result
    198 declare <2 x i64>  @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone ; vqdmull intrinsic: <2 x i32> operands, <2 x i64> result
    199 
    200 define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ; vqdmull followed by vqadds must combine into one vqdmlal.s16
    201 ;CHECK-LABEL: vqdmlals16_natural:
    202 ;CHECK: vqdmlal.s16
    203   %acc = load <4 x i32>, <4 x i32>* %A ; accumulator input
    204   %lhs = load <4 x i16>, <4 x i16>* %B ; first multiplicand
    205   %rhs = load <4 x i16>, <4 x i16>* %C ; second multiplicand
    206   %prod = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) ; widening multiply step
    207   %sum = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; accumulate step, product is the second addend
    208   ret <4 x i32> %sum
    209 }
    210 
    211 define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { ; vqdmull followed by vqadds must combine into one vqdmlal.s32
    212 ;CHECK-LABEL: vqdmlals32_natural:
    213 ;CHECK: vqdmlal.s32
    214   %acc = load <2 x i64>, <2 x i64>* %A ; accumulator input
    215   %lhs = load <2 x i32>, <2 x i32>* %B ; first multiplicand
    216   %rhs = load <2 x i32>, <2 x i32>* %C ; second multiplicand
    217   %prod = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs) ; widening multiply step
    218   %sum = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %acc, <2 x i64> %prod) ; accumulate step, product is the second addend
    219   ret <2 x i64> %sum
    220 }
    221 
    222 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { ; by-lane vqdmlal.s16 built from splat + vqdmull + vqadds
    223 entry:
    224 ; CHECK-LABEL: test_vqdmlal_lanes16_natural:
    225 ; CHECK: vqdmlal.s16 q0, d2, d3[1]
    226   %splat = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the third argument
    227   %prod = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %splat) ; widening multiply by the splatted lane
    228   %sum = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %prod) ; accumulate into the first argument
    229   ret <4 x i32> %sum
    230 }
    231 
    232 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { ; by-lane vqdmlal.s32 built from splat + vqdmull + vqadds
    233 entry:
    234 ; CHECK-LABEL: test_vqdmlal_lanes32_natural:
    235 ; CHECK: vqdmlal.s32 q0, d2, d3[1]
    236   %splat = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of the third argument
    237   %prod = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %splat) ; widening multiply by the splatted lane
    238   %sum = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %prod) ; accumulate into the first argument
    239   ret <2 x i64> %sum
    240 }
    241 
    242 declare <4 x i32>  @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone ; vqadds intrinsic on <4 x i32>; paired with vqdmull in the vqdmlal tests
    243 declare <2 x i64>  @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone ; vqadds intrinsic on <2 x i64>; paired with vqdmull in the vqdmlal tests
    244 
    245 define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ; vqdmull followed by vqsubs must combine into one vqdmlsl.s16
    246 ;CHECK-LABEL: vqdmlsls16_natural:
    247 ;CHECK: vqdmlsl.s16
    248   %acc = load <4 x i32>, <4 x i32>* %A ; minuend / accumulator input
    249   %lhs = load <4 x i16>, <4 x i16>* %B ; first multiplicand
    250   %rhs = load <4 x i16>, <4 x i16>* %C ; second multiplicand
    251   %prod = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) ; widening multiply step
    252   %diff = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; subtract step, product is the subtrahend
    253   ret <4 x i32> %diff
    254 }
    255 
    256 define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { ; vqdmull followed by vqsubs must combine into one vqdmlsl.s32
    257 ;CHECK-LABEL: vqdmlsls32_natural:
    258 ;CHECK: vqdmlsl.s32
    259   %acc = load <2 x i64>, <2 x i64>* %A ; minuend / accumulator input
    260   %lhs = load <2 x i32>, <2 x i32>* %B ; first multiplicand
    261   %rhs = load <2 x i32>, <2 x i32>* %C ; second multiplicand
    262   %prod = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %rhs) ; widening multiply step
    263   %diff = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %acc, <2 x i64> %prod) ; subtract step, product is the subtrahend
    264   ret <2 x i64> %diff
    265 }
    266 
    267 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { ; by-lane vqdmlsl.s16 built from splat + vqdmull + vqsubs
    268 entry:
    269 ; CHECK-LABEL: test_vqdmlsl_lanes16_natural:
    270 ; CHECK: vqdmlsl.s16 q0, d2, d3[1]
    271   %splat = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; broadcast element 1 of the third argument
    272   %prod = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %splat) ; widening multiply by the splatted lane
    273   %diff = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %prod) ; subtract the product from the first argument
    274   ret <4 x i32> %diff
    275 }
    276 
    277 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { ; by-lane vqdmlsl.s32 built from splat + vqdmull + vqsubs
    278 entry:
    279 ; CHECK-LABEL: test_vqdmlsl_lanes32_natural:
    280 ; CHECK: vqdmlsl.s32 q0, d2, d3[1]
    281   %splat = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; broadcast element 1 of the third argument
    282   %prod = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %splat) ; widening multiply by the splatted lane
    283   %diff = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %prod) ; subtract the product from the first argument
    284   ret <2 x i64> %diff
    285 }
    286 
    287 declare <4 x i32>  @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone ; vqsubs intrinsic on <4 x i32>; paired with vqdmull in the vqdmlsl tests
    288 declare <2 x i64>  @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone ; vqsubs intrinsic on <2 x i64>; paired with vqdmull in the vqdmlsl tests
    289