1 ; RUN: llc -mattr=+neon < %s | FileCheck %s 2 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" 3 target triple = "thumbv7-elf" 4 5 define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { 6 ;CHECK-LABEL: vqdmulhs16: 7 ;CHECK: vqdmulh.s16 8 %tmp1 = load <4 x i16>, <4 x i16>* %A 9 %tmp2 = load <4 x i16>, <4 x i16>* %B 10 %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 11 ret <4 x i16> %tmp3 12 } 13 14 define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { 15 ;CHECK-LABEL: vqdmulhs32: 16 ;CHECK: vqdmulh.s32 17 %tmp1 = load <2 x i32>, <2 x i32>* %A 18 %tmp2 = load <2 x i32>, <2 x i32>* %B 19 %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 20 ret <2 x i32> %tmp3 21 } 22 23 define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { 24 ;CHECK-LABEL: vqdmulhQs16: 25 ;CHECK: vqdmulh.s16 26 %tmp1 = load <8 x i16>, <8 x i16>* %A 27 %tmp2 = load <8 x i16>, <8 x i16>* %B 28 %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 29 ret <8 x i16> %tmp3 30 } 31 32 define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { 33 ;CHECK-LABEL: vqdmulhQs32: 34 ;CHECK: vqdmulh.s32 35 %tmp1 = load <4 x i32>, <4 x i32>* %A 36 %tmp2 = load <4 x i32>, <4 x i32>* %B 37 %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 38 ret <4 x i32> %tmp3 39 } 40 41 define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 42 entry: 43 ; CHECK: test_vqdmulhQ_lanes16 44 ; CHECK: vqdmulh.s16 q0, q0, d2[1] 45 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1] 46 %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1] 47 ret <8 x i16> %1 48 } 49 50 define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 51 entry: 52 ; CHECK: test_vqdmulhQ_lanes32 53 ; CHECK: vqdmulh.s32 q0, q0, d2[1] 54 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] 55 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1] 56 ret <4 x i32> %1 57 } 58 59 define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 60 entry: 61 ; CHECK: test_vqdmulh_lanes16 62 ; CHECK: vqdmulh.s16 d0, d0, d1[1] 63 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 64 %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1] 65 ret <4 x i16> %1 66 } 67 68 define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 69 entry: 70 ; CHECK: test_vqdmulh_lanes32 71 ; CHECK: vqdmulh.s32 d0, d0, d1[1] 72 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 73 %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1] 74 ret <2 x i32> %1 75 } 76 77 declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 78 declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 79 80 declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 81 declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 82 83 define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { 84 ;CHECK-LABEL: vqrdmulhs16: 85 ;CHECK: vqrdmulh.s16 86 %tmp1 = load <4 x i16>, <4 x i16>* %A 87 %tmp2 = load <4 x i16>, <4 x i16>* %B 88 %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 89 ret <4 x i16> %tmp3 90 } 91 92 define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { 93 ;CHECK-LABEL: vqrdmulhs32: 94 ;CHECK: vqrdmulh.s32 95 %tmp1 = load <2 x i32>, <2 x i32>* %A 96 %tmp2 = load <2 x i32>, <2 x i32>* %B 97 %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 98 ret <2 x i32> %tmp3 99 } 100 101 define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { 102 ;CHECK-LABEL: vqrdmulhQs16: 103 ;CHECK: vqrdmulh.s16 104 %tmp1 = load <8 x i16>, <8 x i16>* %A 105 %tmp2 = load <8 x i16>, <8 x i16>* %B 106 %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 107 ret <8 x i16> %tmp3 108 } 109 110 define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { 111 ;CHECK-LABEL: vqrdmulhQs32: 112 ;CHECK: vqrdmulh.s32 113 %tmp1 = load <4 x i32>, <4 x i32>* %A 114 %tmp2 = load <4 x i32>, <4 x i32>* %B 115 %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 116 ret <4 x i32> %tmp3 117 } 118 119 define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 120 entry: 121 ; CHECK: test_vqRdmulhQ_lanes16 122 ; CHECK: vqrdmulh.s16 q0, q0, d2[1] 123 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1] 124 %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1] 125 ret <8 x i16> %1 126 } 127 128 define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 129 entry: 130 ; CHECK: test_vqRdmulhQ_lanes32 131 ; CHECK: vqrdmulh.s32 q0, q0, d2[1] 132 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] 133 %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1] 134 ret <4 x i32> %1 135 } 136 137 define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 138 entry: 139 ; CHECK: test_vqRdmulh_lanes16 140 ; CHECK: vqrdmulh.s16 d0, d0, d1[1] 141 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 142 %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1] 143 ret <4 x i16> %1 144 } 145 146 define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 147 entry: 148 ; CHECK: test_vqRdmulh_lanes32 149 ; CHECK: vqrdmulh.s32 d0, d0, d1[1] 150 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 151 %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1] 152 ret <2 x i32> %1 153 } 154 155 declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 156 declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 157 158 declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 159 declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 160 161 define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { 162 ;CHECK-LABEL: vqdmulls16: 163 ;CHECK: vqdmull.s16 164 %tmp1 = load <4 x i16>, <4 x i16>* %A 165 %tmp2 = load <4 x i16>, <4 x i16>* %B 166 %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 167 ret <4 x i32> %tmp3 168 } 169 170 define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { 171 ;CHECK-LABEL: vqdmulls32: 172 ;CHECK: vqdmull.s32 173 %tmp1 = load <2 x i32>, <2 x i32>* %A 174 %tmp2 = load <2 x i32>, <2 x i32>* %B 175 %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 176 ret <2 x i64> %tmp3 177 } 178 179 define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { 180 entry: 181 ; CHECK: test_vqdmull_lanes16 182 ; CHECK: vqdmull.s16 q0, d0, d1[1] 183 %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 184 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] 185 ret <4 x i32> %1 186 } 187 188 define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { 189 entry: 190 ; CHECK: test_vqdmull_lanes32 191 ; CHECK: vqdmull.s32 q0, d0, d1[1] 192 %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 193 %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] 194 ret <2 x i64> %1 195 } 196 197 declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone 198 declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone 199 200 define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { 201 ;CHECK-LABEL: vqdmlals16_natural: 202 ;CHECK: vqdmlal.s16 203 %tmp1 = load <4 x i32>, <4 x i32>* %A 204 %tmp2 = load <4 x i16>, <4 x i16>* %B 205 %tmp3 = load <4 x i16>, <4 x i16>* %C 206 %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3) 207 %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) 208 ret <4 x i32> %tmp5 209 } 210 211 define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { 212 ;CHECK-LABEL: vqdmlals32_natural: 213 ;CHECK: vqdmlal.s32 214 %tmp1 = load <2 x i64>, <2 x i64>* %A 215 %tmp2 = load <2 x i32>, <2 x i32>* %B 216 %tmp3 = load <2 x i32>, <2 x i32>* %C 217 %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3) 218 %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) 219 ret <2 x i64> %tmp5 220 } 221 222 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { 223 entry: 224 ; CHECK-LABEL: test_vqdmlal_lanes16_natural: 225 ; CHECK: vqdmlal.s16 q0, d2, d3[1] 226 %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 227 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0) 228 %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) 229 ret <4 x i32> %2 230 } 231 232 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { 233 entry: 234 ; CHECK-LABEL: test_vqdmlal_lanes32_natural: 235 ; CHECK: vqdmlal.s32 q0, d2, d3[1] 236 %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 237 %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0) 238 %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) 239 ret <2 x i64> %2 240 } 241 242 declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 243 declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone 244 245 define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { 246 ;CHECK-LABEL: vqdmlsls16_natural: 247 ;CHECK: vqdmlsl.s16 248 %tmp1 = load <4 x i32>, <4 x i32>* %A 249 %tmp2 = load <4 x i16>, <4 x i16>* %B 250 %tmp3 = load <4 x i16>, <4 x i16>* %C 251 %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3) 252 %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) 253 ret <4 x i32> %tmp5 254 } 255 256 define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { 257 ;CHECK-LABEL: vqdmlsls32_natural: 258 ;CHECK: vqdmlsl.s32 259 %tmp1 = load <2 x i64>, <2 x i64>* %A 260 %tmp2 = load <2 x i32>, <2 x i32>* %B 261 %tmp3 = load <2 x i32>, <2 x i32>* %C 262 %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3) 263 %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) 264 ret <2 x i64> %tmp5 265 } 266 267 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { 268 entry: 269 ; CHECK-LABEL: test_vqdmlsl_lanes16_natural: 270 ; CHECK: vqdmlsl.s16 q0, d2, d3[1] 271 %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] 272 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0) 273 %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) 274 ret <4 x i32> %2 275 } 276 277 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { 278 entry: 279 ; CHECK-LABEL: test_vqdmlsl_lanes32_natural: 280 ; CHECK: vqdmlsl.s32 q0, d2, d3[1] 281 %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] 282 %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0) 283 %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) 284 ret <2 x i64> %2 285 } 286 287 declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 288 declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone 289