1 ; RUN: llc -mattr=+neon < %s | FileCheck %s 2 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" 3 target triple = "thumbv7-elf" 4 5 define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { 6 ;CHECK: vqdmulhs16: 7 ;CHECK: vqdmulh.s16 8 %tmp1 = load <4 x i16>* %A 9 %tmp2 = load <4 x i16>* %B 10 %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 11 ret <4 x i16> %tmp3 12 } 13 14 define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { 15 ;CHECK: vqdmulhs32: 16 ;CHECK: vqdmulh.s32 17 %tmp1 = load <2 x i32>* %A 18 %tmp2 = load <2 x i32>* %B 19 %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 20 ret <2 x i32> %tmp3 21 } 22 23 define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { 24 ;CHECK: vqdmulhQs16: 25 ;CHECK: vqdmulh.s16 26 %tmp1 = load <8 x i16>* %A 27 %tmp2 = load <8 x i16>* %B 28 %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 29 ret <8 x i16> %tmp3 30 } 31 32 define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { 33 ;CHECK: vqdmulhQs32: 34 ;CHECK: vqdmulh.s32 35 %tmp1 = load <4 x i32>* %A 36 %tmp2 = load <4 x i32>* %B 37 %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 38 ret <4 x i32> %tmp3 39 } 40 41 define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 42 entry: 43 ; CHECK: test_vqdmulhQ_lanes16 44 ; CHECK: vqdmulh.s16 q0, q0, d2[1] 45 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1] 46 %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1] 47 ret <8 x i16> %1 48 } 49 50 define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 51 entry: 52 ; CHECK: test_vqdmulhQ_lanes32 53 ; CHECK: vqdmulh.s32 q0, q0, d2[1] 54 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] 55 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1] 56 ret <4 x i32> %1 57 } 58 59 define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 60 entry: 61 ; CHECK: test_vqdmulh_lanes16 62 ; CHECK: vqdmulh.s16 d0, d0, d1[1] 63 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 64 %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1] 65 ret <4 x i16> %1 66 } 67 68 define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 69 entry: 70 ; CHECK: test_vqdmulh_lanes32 71 ; CHECK: vqdmulh.s32 d0, d0, d1[1] 72 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 73 %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1] 74 ret <2 x i32> %1 75 } 76 77 declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 78 declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 79 80 declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 81 declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 82 83 define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { 84 ;CHECK: vqrdmulhs16: 85 ;CHECK: vqrdmulh.s16 86 %tmp1 = load <4 x i16>* %A 87 %tmp2 = load <4 x i16>* %B 88 %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 89 ret <4 x i16> %tmp3 90 } 91 92 define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { 93 ;CHECK: vqrdmulhs32: 94 ;CHECK: vqrdmulh.s32 95 %tmp1 = load <2 x i32>* %A 96 %tmp2 = load <2 x i32>* %B 97 %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 98 ret <2 x i32> %tmp3 99 } 100 101 define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { 102 ;CHECK: vqrdmulhQs16: 103 ;CHECK: vqrdmulh.s16 104 %tmp1 = load <8 x i16>* %A 105 %tmp2 = load <8 x i16>* %B 106 %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 107 ret <8 x i16> %tmp3 108 } 109 110 define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { 111 ;CHECK: vqrdmulhQs32: 112 ;CHECK: vqrdmulh.s32 113 %tmp1 = load <4 x i32>* %A 114 %tmp2 = load <4 x i32>* %B 115 %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 116 ret <4 x i32> %tmp3 117 } 118 119 define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 120 entry: 121 ; CHECK: test_vqRdmulhQ_lanes16 122 ; CHECK: vqrdmulh.s16 q0, q0, d2[1] 123 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1] 124 %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1] 125 ret <8 x i16> %1 126 } 127 128 define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 129 entry: 130 ; CHECK: test_vqRdmulhQ_lanes32 131 ; CHECK: vqrdmulh.s32 q0, q0, d2[1] 132 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] 133 %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1] 134 ret <4 x i32> %1 135 } 136 137 define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 138 entry: 139 ; CHECK: test_vqRdmulh_lanes16 140 ; CHECK: vqrdmulh.s16 d0, d0, d1[1] 141 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 142 %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1] 143 ret <4 x i16> %1 144 } 145 146 define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 147 entry: 148 ; CHECK: test_vqRdmulh_lanes32 149 ; CHECK: vqrdmulh.s32 d0, d0, d1[1] 150 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 151 %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1] 152 ret <2 x i32> %1 153 } 154 155 declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 156 declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 157 158 declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 159 declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 160 161 define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { 162 ;CHECK: vqdmulls16: 163 ;CHECK: vqdmull.s16 164 %tmp1 = load <4 x i16>* %A 165 %tmp2 = load <4 x i16>* %B 166 %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 167 ret <4 x i32> %tmp3 168 } 169 170 define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { 171 ;CHECK: vqdmulls32: 172 ;CHECK: vqdmull.s32 173 %tmp1 = load <2 x i32>* %A 174 %tmp2 = load <2 x i32>* %B 175 %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 176 ret <2 x i64> %tmp3 177 } 178 179 define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 180 entry: 181 ; CHECK: test_vqdmull_lanes16 182 ; CHECK: vqdmull.s16 q0, d0, d1[1] 183 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 184 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] 185 ret <4 x i32> %1 186 } 187 188 define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 189 entry: 190 ; CHECK: test_vqdmull_lanes32 191 ; CHECK: vqdmull.s32 q0, d0, d1[1] 192 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 193 %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] 194 ret <2 x i64> %1 195 } 196 197 declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone 198 declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone 199 200 define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { 201 ;CHECK: vqdmlals16: 202 ;CHECK: vqdmlal.s16 203 %tmp1 = load <4 x i32>* %A 204 %tmp2 = load <4 x i16>* %B 205 %tmp3 = load <4 x i16>* %C 206 %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) 207 ret <4 x i32> %tmp4 208 } 209 210 define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { 211 ;CHECK: vqdmlals32: 212 ;CHECK: vqdmlal.s32 213 %tmp1 = load <2 x i64>* %A 214 %tmp2 = load <2 x i32>* %B 215 %tmp3 = load <2 x i32>* %C 216 %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) 217 ret <2 x i64> %tmp4 218 } 219 220 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { 221 entry: 222 ; CHECK: test_vqdmlal_lanes16 223 ; CHECK: vqdmlal.s16 q0, d2, d3[1] 224 %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 225 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] 226 ret <4 x i32> %1 227 } 228 229 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { 230 entry: 231 ; CHECK: test_vqdmlal_lanes32 232 ; CHECK: vqdmlal.s32 q0, d2, d3[1] 233 %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 234 %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] 235 ret <2 x i64> %1 236 } 237 238 declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone 239 declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone 240 241 define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { 242 ;CHECK: vqdmlsls16: 243 ;CHECK: vqdmlsl.s16 244 %tmp1 = load <4 x i32>* %A 245 %tmp2 = load <4 x i16>* %B 246 %tmp3 = load <4 x i16>* %C 247 %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) 248 ret <4 x i32> %tmp4 249 } 250 251 define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { 252 ;CHECK: vqdmlsls32: 253 ;CHECK: vqdmlsl.s32 254 %tmp1 = load <2 x i64>* %A 255 %tmp2 = load <2 x i32>* %B 256 %tmp3 = load <2 x i32>* %C 257 %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) 258 ret <2 x i64> %tmp4 259 } 260 261 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { 262 entry: 263 ; CHECK: test_vqdmlsl_lanes16 264 ; CHECK: vqdmlsl.s16 q0, d2, d3[1] 265 %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 266 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] 267 ret <4 x i32> %1 268 } 269 270 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { 271 entry: 272 ; CHECK: test_vqdmlsl_lanes32 273 ; CHECK: vqdmlsl.s32 q0, d2, d3[1] 274 %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 275 %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] 276 ret <2 x i64> %1 277 } 278 279 declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone 280 declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone 281