Home | History | Annotate | Download | only in ARM
      1 ; RUN: llc -mattr=+neon < %s | FileCheck %s
      2 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
      3 target triple = "thumbv7-elf"
      4 
      ; vqdmulhs16: loads one <4 x i16> through each of %A and %B, passes both to
      ; @llvm.arm.neon.vqdmulh.v4i16 and returns the intrinsic's result.
      ; The FileCheck directives expect the function label followed by a
      ; "vqdmulh.s16" instruction in llc's output.
      5 define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
      6 ;CHECK: vqdmulhs16:
      7 ;CHECK: vqdmulh.s16
      8 	%tmp1 = load <4 x i16>* %A
      9 	%tmp2 = load <4 x i16>* %B
     10 	%tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
     11 	ret <4 x i16> %tmp3
     12 }
     13 
     ; vqdmulhs32: loads one <2 x i32> through each of %A and %B, passes both to
     ; @llvm.arm.neon.vqdmulh.v2i32 and returns the intrinsic's result.
     ; Expected output: the function label followed by "vqdmulh.s32".
     14 define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
     15 ;CHECK: vqdmulhs32:
     16 ;CHECK: vqdmulh.s32
     17 	%tmp1 = load <2 x i32>* %A
     18 	%tmp2 = load <2 x i32>* %B
     19 	%tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
     20 	ret <2 x i32> %tmp3
     21 }
     22 
     ; vqdmulhQs16: 128-bit variant of the 16-bit test. Loads one <8 x i16> through
     ; each of %A and %B, calls @llvm.arm.neon.vqdmulh.v8i16 and returns the result.
     ; Expected output: the function label followed by "vqdmulh.s16".
     23 define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
     24 ;CHECK: vqdmulhQs16:
     25 ;CHECK: vqdmulh.s16
     26 	%tmp1 = load <8 x i16>* %A
     27 	%tmp2 = load <8 x i16>* %B
     28 	%tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
     29 	ret <8 x i16> %tmp3
     30 }
     31 
     ; vqdmulhQs32: 128-bit variant of the 32-bit test. Loads one <4 x i32> through
     ; each of %A and %B, calls @llvm.arm.neon.vqdmulh.v4i32 and returns the result.
     ; Expected output: the function label followed by "vqdmulh.s32".
     32 define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
     33 ;CHECK: vqdmulhQs32:
     34 ;CHECK: vqdmulh.s32
     35 	%tmp1 = load <4 x i32>* %A
     36 	%tmp2 = load <4 x i32>* %B
     37 	%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
     38 	ret <4 x i32> %tmp3
     39 }
     40 
     ; test_vqdmulhQ_lanes16: by-lane form of vqdmulh on <8 x i16>.
     ; The shufflevector mask is all 1s, so every element of %0 is element 1 of
     ; %arg1_int16x4_t; that splat is the second operand of the intrinsic call.
     ; The expected instruction names the scalar as the lane operand d2[1].
     ; NOTE(review): q0/d2 presumably follow from arm_aapcs_vfpcc argument
     ; assignment (first arg in q0, second in d2) - confirm against the AAPCS.
     41 define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
     42 entry:
     43 ; CHECK: test_vqdmulhQ_lanes16
     44 ; CHECK: vqdmulh.s16 q0, q0, d2[1]
     45   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
     46   %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
     47   ret <8 x i16> %1
     48 }
     49 
     ; test_vqdmulhQ_lanes32: by-lane form of vqdmulh on <4 x i32>.
     ; The all-1s shufflevector mask replicates element 1 of %arg1_int32x2_t into
     ; all four elements of %0, which is the second operand of the intrinsic call.
     ; The expected instruction names the scalar as the lane operand d2[1].
     50 define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
     51 entry:
     52 ; CHECK: test_vqdmulhQ_lanes32
     53 ; CHECK: vqdmulh.s32 q0, q0, d2[1]
     54   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
     55   %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
     56   ret <4 x i32> %1
     57 }
     58 
     ; test_vqdmulh_lanes16: by-lane form of vqdmulh on <4 x i16>.
     ; The all-1s shufflevector mask replicates element 1 of %arg1_int16x4_t into
     ; all four elements of %0, which is the second operand of the intrinsic call.
     ; The expected instruction names the scalar as the lane operand d1[1].
     59 define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
     60 entry:
     61 ; CHECK: test_vqdmulh_lanes16
     62 ; CHECK: vqdmulh.s16 d0, d0, d1[1]
     63   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
     64   %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
     65   ret <4 x i16> %1
     66 }
     67 
     ; test_vqdmulh_lanes32: by-lane form of vqdmulh on <2 x i32>.
     ; The all-1s shufflevector mask replicates element 1 of %arg1_int32x2_t into
     ; both elements of %0, which is the second operand of the intrinsic call.
     ; The expected instruction names the scalar as the lane operand d1[1].
     68 define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
     69 entry:
     70 ; CHECK: test_vqdmulh_lanes32
     71 ; CHECK: vqdmulh.s32 d0, d0, d1[1]
     72   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
     73   %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
     74   ret <2 x i32> %1
     75 }
     76 
     ; 64-bit vqdmulh intrinsic declarations (<4 x i16> and <2 x i32>); both take
     ; two operands of the result type.
     77 declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
     78 declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
     79 
     ; 128-bit vqdmulh intrinsic declarations (<8 x i16> and <4 x i32>); both take
     ; two operands of the result type.
     80 declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
     81 declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
     82 
     ; vqrdmulhs16: loads one <4 x i16> through each of %A and %B, passes both to
     ; @llvm.arm.neon.vqrdmulh.v4i16 and returns the intrinsic's result.
     ; Expected output: the function label followed by "vqrdmulh.s16".
     83 define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
     84 ;CHECK: vqrdmulhs16:
     85 ;CHECK: vqrdmulh.s16
     86 	%tmp1 = load <4 x i16>* %A
     87 	%tmp2 = load <4 x i16>* %B
     88 	%tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
     89 	ret <4 x i16> %tmp3
     90 }
     91 
     ; vqrdmulhs32: loads one <2 x i32> through each of %A and %B, passes both to
     ; @llvm.arm.neon.vqrdmulh.v2i32 and returns the intrinsic's result.
     ; Expected output: the function label followed by "vqrdmulh.s32".
     92 define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
     93 ;CHECK: vqrdmulhs32:
     94 ;CHECK: vqrdmulh.s32
     95 	%tmp1 = load <2 x i32>* %A
     96 	%tmp2 = load <2 x i32>* %B
     97 	%tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
     98 	ret <2 x i32> %tmp3
     99 }
    100 
    ; vqrdmulhQs16: 128-bit variant of the 16-bit test. Loads one <8 x i16> through
    ; each of %A and %B, calls @llvm.arm.neon.vqrdmulh.v8i16 and returns the result.
    ; Expected output: the function label followed by "vqrdmulh.s16".
    101 define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
    102 ;CHECK: vqrdmulhQs16:
    103 ;CHECK: vqrdmulh.s16
    104 	%tmp1 = load <8 x i16>* %A
    105 	%tmp2 = load <8 x i16>* %B
    106 	%tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
    107 	ret <8 x i16> %tmp3
    108 }
    109 
    ; vqrdmulhQs32: 128-bit variant of the 32-bit test. Loads one <4 x i32> through
    ; each of %A and %B, calls @llvm.arm.neon.vqrdmulh.v4i32 and returns the result.
    ; Expected output: the function label followed by "vqrdmulh.s32".
    110 define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
    111 ;CHECK: vqrdmulhQs32:
    112 ;CHECK: vqrdmulh.s32
    113 	%tmp1 = load <4 x i32>* %A
    114 	%tmp2 = load <4 x i32>* %B
    115 	%tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
    116 	ret <4 x i32> %tmp3
    117 }
    118 
    ; test_vqRdmulhQ_lanes16: by-lane form of vqrdmulh on <8 x i16>.
    ; The all-1s shufflevector mask replicates element 1 of %arg1_int16x4_t into
    ; all eight elements of %0, which is the second operand of the intrinsic call.
    ; The expected instruction names the scalar as the lane operand d2[1].
    119 define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
    120 entry:
    121 ; CHECK: test_vqRdmulhQ_lanes16
    122 ; CHECK: vqrdmulh.s16 q0, q0, d2[1]
    123   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
    124   %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
    125   ret <8 x i16> %1
    126 }
    127 
    ; test_vqRdmulhQ_lanes32: by-lane form of vqrdmulh on <4 x i32>.
    ; The all-1s shufflevector mask replicates element 1 of %arg1_int32x2_t into
    ; all four elements of %0, which is the second operand of the intrinsic call.
    ; The expected instruction names the scalar as the lane operand d2[1].
    128 define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
    129 entry:
    130 ; CHECK: test_vqRdmulhQ_lanes32
    131 ; CHECK: vqrdmulh.s32 q0, q0, d2[1]
    132   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
    133   %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
    134   ret <4 x i32> %1
    135 }
    136 
    ; test_vqRdmulh_lanes16: by-lane form of vqrdmulh on <4 x i16>.
    ; The all-1s shufflevector mask replicates element 1 of %arg1_int16x4_t into
    ; all four elements of %0, which is the second operand of the intrinsic call.
    ; The expected instruction names the scalar as the lane operand d1[1].
    137 define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
    138 entry:
    139 ; CHECK: test_vqRdmulh_lanes16
    140 ; CHECK: vqrdmulh.s16 d0, d0, d1[1]
    141   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
    142   %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
    143   ret <4 x i16> %1
    144 }
    145 
    ; test_vqRdmulh_lanes32: by-lane form of vqrdmulh on <2 x i32>.
    ; The all-1s shufflevector mask replicates element 1 of %arg1_int32x2_t into
    ; both elements of %0, which is the second operand of the intrinsic call.
    ; The expected instruction names the scalar as the lane operand d1[1].
    146 define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
    147 entry:
    148 ; CHECK: test_vqRdmulh_lanes32
    149 ; CHECK: vqrdmulh.s32 d0, d0, d1[1]
    150   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
    151   %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
    152   ret <2 x i32> %1
    153 }
    154 
    ; 64-bit vqrdmulh intrinsic declarations (<4 x i16> and <2 x i32>); both take
    ; two operands of the result type.
    155 declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
    156 declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
    157 
    ; 128-bit vqrdmulh intrinsic declarations (<8 x i16> and <4 x i32>); both take
    ; two operands of the result type.
    158 declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
    159 declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
    160 
    ; vqdmulls16: widening test. Loads one <4 x i16> through each of %A and %B and
    ; calls @llvm.arm.neon.vqdmull.v4i32, whose result type is <4 x i32> (elements
    ; twice as wide as the inputs). Returns that result.
    ; Expected output: the function label followed by "vqdmull.s16".
    161 define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
    162 ;CHECK: vqdmulls16:
    163 ;CHECK: vqdmull.s16
    164 	%tmp1 = load <4 x i16>* %A
    165 	%tmp2 = load <4 x i16>* %B
    166 	%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
    167 	ret <4 x i32> %tmp3
    168 }
    169 
    ; vqdmulls32: widening test. Loads one <2 x i32> through each of %A and %B and
    ; calls @llvm.arm.neon.vqdmull.v2i64, whose result type is <2 x i64> (elements
    ; twice as wide as the inputs). Returns that result.
    ; Expected output: the function label followed by "vqdmull.s32".
    170 define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
    171 ;CHECK: vqdmulls32:
    172 ;CHECK: vqdmull.s32
    173 	%tmp1 = load <2 x i32>* %A
    174 	%tmp2 = load <2 x i32>* %B
    175 	%tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
    176 	ret <2 x i64> %tmp3
    177 }
    178 
    ; test_vqdmull_lanes16: by-lane form of the widening vqdmull on <4 x i16>.
    ; The all-1s shufflevector mask replicates element 1 of %arg1_int16x4_t into
    ; all four elements of %0, which is the second operand of the intrinsic call.
    ; The expected instruction writes q0 from d0 and the lane operand d1[1].
    179 define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
    180 entry:
    181 ; CHECK: test_vqdmull_lanes16
    182 ; CHECK: vqdmull.s16 q0, d0, d1[1]
    183   %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
    184   %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
    185   ret <4 x i32> %1
    186 }
    187 
    ; test_vqdmull_lanes32: by-lane form of the widening vqdmull on <2 x i32>.
    ; The all-1s shufflevector mask replicates element 1 of %arg1_int32x2_t into
    ; both elements of %0, which is the second operand of the intrinsic call.
    ; The expected instruction writes q0 from d0 and the lane operand d1[1].
    188 define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
    189 entry:
    190 ; CHECK: test_vqdmull_lanes32
    191 ; CHECK: vqdmull.s32 q0, d0, d1[1]
    192   %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
    193   %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
    194   ret <2 x i64> %1
    195 }
    196 
    ; vqdmull intrinsic declarations: two 64-bit vector operands, result vector
    ; with the same element count and elements twice as wide.
    197 declare <4 x i32>  @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
    198 declare <2 x i64>  @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
    199 
    ; vqdmlals16: loads a <4 x i32> through %A and a <4 x i16> through each of %B
    ; and %C, passes the three values in that order to
    ; @llvm.arm.neon.vqdmlal.v4i32 and returns the <4 x i32> result.
    ; NOTE(review): the wide first operand is presumably the accumulator - confirm
    ; against the NEON reference.
    ; Expected output: the function label followed by "vqdmlal.s16".
    200 define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
    201 ;CHECK: vqdmlals16:
    202 ;CHECK: vqdmlal.s16
    203 	%tmp1 = load <4 x i32>* %A
    204 	%tmp2 = load <4 x i16>* %B
    205 	%tmp3 = load <4 x i16>* %C
    206 	%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
    207 	ret <4 x i32> %tmp4
    208 }
    209 
    ; vqdmlals32: loads a <2 x i64> through %A and a <2 x i32> through each of %B
    ; and %C, passes the three values in that order to
    ; @llvm.arm.neon.vqdmlal.v2i64 and returns the <2 x i64> result.
    ; Expected output: the function label followed by "vqdmlal.s32".
    210 define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
    211 ;CHECK: vqdmlals32:
    212 ;CHECK: vqdmlal.s32
    213 	%tmp1 = load <2 x i64>* %A
    214 	%tmp2 = load <2 x i32>* %B
    215 	%tmp3 = load <2 x i32>* %C
    216 	%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
    217 	ret <2 x i64> %tmp4
    218 }
    219 
    ; test_vqdmlal_lanes16: by-lane form of vqdmlal with 16-bit inputs.
    ; The all-1s shufflevector mask replicates element 1 of %arg2_int16x4_t into
    ; all four elements of %0, which is the third operand of the intrinsic call.
    ; The expected instruction uses q0, d2 and the lane operand d3[1].
    220 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
    221 entry:
    222 ; CHECK: test_vqdmlal_lanes16
    223 ; CHECK: vqdmlal.s16 q0, d2, d3[1]
    224   %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
    225   %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
    226   ret <4 x i32> %1
    227 }
    228 
    ; test_vqdmlal_lanes32: by-lane form of vqdmlal with 32-bit inputs.
    ; The all-1s shufflevector mask replicates element 1 of %arg2_int32x2_t into
    ; both elements of %0, which is the third operand of the intrinsic call.
    ; The expected instruction uses q0, d2 and the lane operand d3[1].
    229 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
    230 entry:
    231 ; CHECK: test_vqdmlal_lanes32
    232 ; CHECK: vqdmlal.s32 q0, d2, d3[1]
    233   %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
    234   %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
    235   ret <2 x i64> %1
    236 }
    237 
    ; vqdmlal intrinsic declarations: one wide operand of the result type followed
    ; by two narrow operands with half-width elements.
    238 declare <4 x i32>  @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
    239 declare <2 x i64>  @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
    240 
    ; vqdmlsls16: loads a <4 x i32> through %A and a <4 x i16> through each of %B
    ; and %C, passes the three values in that order to
    ; @llvm.arm.neon.vqdmlsl.v4i32 and returns the <4 x i32> result.
    ; Expected output: the function label followed by "vqdmlsl.s16".
    241 define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
    242 ;CHECK: vqdmlsls16:
    243 ;CHECK: vqdmlsl.s16
    244 	%tmp1 = load <4 x i32>* %A
    245 	%tmp2 = load <4 x i16>* %B
    246 	%tmp3 = load <4 x i16>* %C
    247 	%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
    248 	ret <4 x i32> %tmp4
    249 }
    250 
    ; vqdmlsls32: loads a <2 x i64> through %A and a <2 x i32> through each of %B
    ; and %C, passes the three values in that order to
    ; @llvm.arm.neon.vqdmlsl.v2i64 and returns the <2 x i64> result.
    ; Expected output: the function label followed by "vqdmlsl.s32".
    251 define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
    252 ;CHECK: vqdmlsls32:
    253 ;CHECK: vqdmlsl.s32
    254 	%tmp1 = load <2 x i64>* %A
    255 	%tmp2 = load <2 x i32>* %B
    256 	%tmp3 = load <2 x i32>* %C
    257 	%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
    258 	ret <2 x i64> %tmp4
    259 }
    260 
    ; test_vqdmlsl_lanes16: by-lane form of vqdmlsl with 16-bit inputs.
    ; The all-1s shufflevector mask replicates element 1 of %arg2_int16x4_t into
    ; all four elements of %0, which is the third operand of the intrinsic call.
    ; The expected instruction uses q0, d2 and the lane operand d3[1].
    261 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
    262 entry:
    263 ; CHECK: test_vqdmlsl_lanes16
    264 ; CHECK: vqdmlsl.s16 q0, d2, d3[1]
    265   %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
    266   %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
    267   ret <4 x i32> %1
    268 }
    269 
    ; test_vqdmlsl_lanes32: by-lane form of vqdmlsl with 32-bit inputs.
    ; The all-1s shufflevector mask replicates element 1 of %arg2_int32x2_t into
    ; both elements of %0, which is the third operand of the intrinsic call.
    ; The expected instruction uses q0, d2 and the lane operand d3[1].
    270 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
    271 entry:
    272 ; CHECK: test_vqdmlsl_lanes32
    273 ; CHECK: vqdmlsl.s32 q0, d2, d3[1]
    274   %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
    275   %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
    276   ret <2 x i64> %1
    277 }
    278 
    ; vqdmlsl intrinsic declarations: one wide operand of the result type followed
    ; by two narrow operands with half-width elements.
    279 declare <4 x i32>  @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
    280 declare <2 x i64>  @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
    281