1 ; RUN: llc < %s -asm-verbose=false -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s 2 3 4 define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { 5 ;CHECK-LABEL: smull8h: 6 ;CHECK: smull.8h 7 %tmp1 = load <8 x i8>, <8 x i8>* %A 8 %tmp2 = load <8 x i8>, <8 x i8>* %B 9 %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) 10 ret <8 x i16> %tmp3 11 } 12 13 define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { 14 ;CHECK-LABEL: smull4s: 15 ;CHECK: smull.4s 16 %tmp1 = load <4 x i16>, <4 x i16>* %A 17 %tmp2 = load <4 x i16>, <4 x i16>* %B 18 %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 19 ret <4 x i32> %tmp3 20 } 21 22 define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { 23 ;CHECK-LABEL: smull2d: 24 ;CHECK: smull.2d 25 %tmp1 = load <2 x i32>, <2 x i32>* %A 26 %tmp2 = load <2 x i32>, <2 x i32>* %B 27 %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 28 ret <2 x i64> %tmp3 29 } 30 31 declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone 32 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone 33 declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone 34 35 define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { 36 ;CHECK-LABEL: umull8h: 37 ;CHECK: umull.8h 38 %tmp1 = load <8 x i8>, <8 x i8>* %A 39 %tmp2 = load <8 x i8>, <8 x i8>* %B 40 %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) 41 ret <8 x i16> %tmp3 42 } 43 44 define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { 45 ;CHECK-LABEL: umull4s: 46 ;CHECK: umull.4s 47 %tmp1 = load <4 x i16>, <4 x i16>* %A 48 %tmp2 = load <4 x i16>, <4 x i16>* %B 49 %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 50 ret <4 x i32> %tmp3 51 } 52 53 define <2 x i64> @umull2d(<2 x i32>* 
%A, <2 x i32>* %B) nounwind { 54 ;CHECK-LABEL: umull2d: 55 ;CHECK: umull.2d 56 %tmp1 = load <2 x i32>, <2 x i32>* %A 57 %tmp2 = load <2 x i32>, <2 x i32>* %B 58 %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 59 ret <2 x i64> %tmp3 60 } 61 62 declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone 63 declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone 64 declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone 65 66 define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { 67 ;CHECK-LABEL: sqdmull4s: 68 ;CHECK: sqdmull.4s 69 %tmp1 = load <4 x i16>, <4 x i16>* %A 70 %tmp2 = load <4 x i16>, <4 x i16>* %B 71 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 72 ret <4 x i32> %tmp3 73 } 74 75 define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { 76 ;CHECK-LABEL: sqdmull2d: 77 ;CHECK: sqdmull.2d 78 %tmp1 = load <2 x i32>, <2 x i32>* %A 79 %tmp2 = load <2 x i32>, <2 x i32>* %B 80 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 81 ret <2 x i64> %tmp3 82 } 83 84 define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { 85 ;CHECK-LABEL: sqdmull2_4s: 86 ;CHECK: sqdmull.4s 87 %load1 = load <8 x i16>, <8 x i16>* %A 88 %load2 = load <8 x i16>, <8 x i16>* %B 89 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 90 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 91 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 92 ret <4 x i32> %tmp3 93 } 94 95 define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { 96 ;CHECK-LABEL: sqdmull2_2d: 97 ;CHECK: sqdmull.2d 98 %load1 = load <4 x i32>, <4 x i32>* %A 99 %load2 = load <4 x i32>, <4 x i32>* %B 100 %tmp1 = shufflevector <4 x i32> %load1, 
<4 x i32> undef, <2 x i32> <i32 2, i32 3> 101 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 102 %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 103 ret <2 x i64> %tmp3 104 } 105 106 107 declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone 108 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone 109 110 define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { 111 ;CHECK-LABEL: pmull8h: 112 ;CHECK: pmull.8h 113 %tmp1 = load <8 x i8>, <8 x i8>* %A 114 %tmp2 = load <8 x i8>, <8 x i8>* %B 115 %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) 116 ret <8 x i16> %tmp3 117 } 118 119 declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone 120 121 define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { 122 ;CHECK-LABEL: sqdmulh_4h: 123 ;CHECK: sqdmulh.4h 124 %tmp1 = load <4 x i16>, <4 x i16>* %A 125 %tmp2 = load <4 x i16>, <4 x i16>* %B 126 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 127 ret <4 x i16> %tmp3 128 } 129 130 define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { 131 ;CHECK-LABEL: sqdmulh_8h: 132 ;CHECK: sqdmulh.8h 133 %tmp1 = load <8 x i16>, <8 x i16>* %A 134 %tmp2 = load <8 x i16>, <8 x i16>* %B 135 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 136 ret <8 x i16> %tmp3 137 } 138 139 define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { 140 ;CHECK-LABEL: sqdmulh_2s: 141 ;CHECK: sqdmulh.2s 142 %tmp1 = load <2 x i32>, <2 x i32>* %A 143 %tmp2 = load <2 x i32>, <2 x i32>* %B 144 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 145 ret <2 x i32> %tmp3 146 } 147 148 define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { 149 ;CHECK-LABEL: sqdmulh_4s: 150 
;CHECK: sqdmulh.4s 151 %tmp1 = load <4 x i32>, <4 x i32>* %A 152 %tmp2 = load <4 x i32>, <4 x i32>* %B 153 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 154 ret <4 x i32> %tmp3 155 } 156 157 define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind { 158 ;CHECK-LABEL: sqdmulh_1s: 159 ;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}} 160 %tmp1 = load i32, i32* %A 161 %tmp2 = load i32, i32* %B 162 %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2) 163 ret i32 %tmp3 164 } 165 166 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 167 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 168 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 169 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 170 declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone 171 172 define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { 173 ;CHECK-LABEL: sqrdmulh_4h: 174 ;CHECK: sqrdmulh.4h 175 %tmp1 = load <4 x i16>, <4 x i16>* %A 176 %tmp2 = load <4 x i16>, <4 x i16>* %B 177 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) 178 ret <4 x i16> %tmp3 179 } 180 181 define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { 182 ;CHECK-LABEL: sqrdmulh_8h: 183 ;CHECK: sqrdmulh.8h 184 %tmp1 = load <8 x i16>, <8 x i16>* %A 185 %tmp2 = load <8 x i16>, <8 x i16>* %B 186 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) 187 ret <8 x i16> %tmp3 188 } 189 190 define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { 191 ;CHECK-LABEL: sqrdmulh_2s: 192 ;CHECK: sqrdmulh.2s 193 %tmp1 = load <2 x i32>, <2 x i32>* %A 194 %tmp2 = load <2 x i32>, <2 x i32>* %B 195 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) 196 ret <2 x i32> 
%tmp3 197 } 198 199 define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { 200 ;CHECK-LABEL: sqrdmulh_4s: 201 ;CHECK: sqrdmulh.4s 202 %tmp1 = load <4 x i32>, <4 x i32>* %A 203 %tmp2 = load <4 x i32>, <4 x i32>* %B 204 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) 205 ret <4 x i32> %tmp3 206 } 207 208 define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind { 209 ;CHECK-LABEL: sqrdmulh_1s: 210 ;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}} 211 %tmp1 = load i32, i32* %A 212 %tmp2 = load i32, i32* %B 213 %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2) 214 ret i32 %tmp3 215 } 216 217 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 218 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone 219 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 220 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 221 declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone 222 223 define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind { 224 ;CHECK-LABEL: fmulx_2s: 225 ;CHECK: fmulx.2s 226 %tmp1 = load <2 x float>, <2 x float>* %A 227 %tmp2 = load <2 x float>, <2 x float>* %B 228 %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) 229 ret <2 x float> %tmp3 230 } 231 232 define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind { 233 ;CHECK-LABEL: fmulx_4s: 234 ;CHECK: fmulx.4s 235 %tmp1 = load <4 x float>, <4 x float>* %A 236 %tmp2 = load <4 x float>, <4 x float>* %B 237 %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) 238 ret <4 x float> %tmp3 239 } 240 241 define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind { 242 ;CHECK-LABEL: fmulx_2d: 243 ;CHECK: fmulx.2d 244 %tmp1 = load <2 x double>, <2 x 
double>* %A 245 %tmp2 = load <2 x double>, <2 x double>* %B 246 %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) 247 ret <2 x double> %tmp3 248 } 249 250 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone 251 declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone 252 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone 253 254 define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 255 ;CHECK-LABEL: smlal4s: 256 ;CHECK: smlal.4s 257 %tmp1 = load <4 x i16>, <4 x i16>* %A 258 %tmp2 = load <4 x i16>, <4 x i16>* %B 259 %tmp3 = load <4 x i32>, <4 x i32>* %C 260 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 261 %tmp5 = add <4 x i32> %tmp3, %tmp4 262 ret <4 x i32> %tmp5 263 } 264 265 define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 266 ;CHECK-LABEL: smlal2d: 267 ;CHECK: smlal.2d 268 %tmp1 = load <2 x i32>, <2 x i32>* %A 269 %tmp2 = load <2 x i32>, <2 x i32>* %B 270 %tmp3 = load <2 x i64>, <2 x i64>* %C 271 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 272 %tmp5 = add <2 x i64> %tmp3, %tmp4 273 ret <2 x i64> %tmp5 274 } 275 276 define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 277 ;CHECK-LABEL: smlsl4s: 278 ;CHECK: smlsl.4s 279 %tmp1 = load <4 x i16>, <4 x i16>* %A 280 %tmp2 = load <4 x i16>, <4 x i16>* %B 281 %tmp3 = load <4 x i32>, <4 x i32>* %C 282 %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 283 %tmp5 = sub <4 x i32> %tmp3, %tmp4 284 ret <4 x i32> %tmp5 285 } 286 287 define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 288 ;CHECK-LABEL: smlsl2d: 289 ;CHECK: smlsl.2d 290 %tmp1 = load <2 x i32>, <2 x i32>* %A 291 %tmp2 = load <2 x i32>, <2 x i32>* %B 292 
%tmp3 = load <2 x i64>, <2 x i64>* %C 293 %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 294 %tmp5 = sub <2 x i64> %tmp3, %tmp4 295 ret <2 x i64> %tmp5 296 } 297 298 declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) 299 declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) 300 declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) 301 declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) 302 303 define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 304 ;CHECK-LABEL: sqdmlal4s: 305 ;CHECK: sqdmlal.4s 306 %tmp1 = load <4 x i16>, <4 x i16>* %A 307 %tmp2 = load <4 x i16>, <4 x i16>* %B 308 %tmp3 = load <4 x i32>, <4 x i32>* %C 309 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 310 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) 311 ret <4 x i32> %tmp5 312 } 313 314 define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 315 ;CHECK-LABEL: sqdmlal2d: 316 ;CHECK: sqdmlal.2d 317 %tmp1 = load <2 x i32>, <2 x i32>* %A 318 %tmp2 = load <2 x i32>, <2 x i32>* %B 319 %tmp3 = load <2 x i64>, <2 x i64>* %C 320 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 321 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) 322 ret <2 x i64> %tmp5 323 } 324 325 define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { 326 ;CHECK-LABEL: sqdmlal2_4s: 327 ;CHECK: sqdmlal.4s 328 %load1 = load <8 x i16>, <8 x i16>* %A 329 %load2 = load <8 x i16>, <8 x i16>* %B 330 %tmp3 = load <4 x i32>, <4 x i32>* %C 331 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 332 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 333 %tmp4 = call <4 x i32> 
@llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 334 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) 335 ret <4 x i32> %tmp5 336 } 337 338 define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { 339 ;CHECK-LABEL: sqdmlal2_2d: 340 ;CHECK: sqdmlal.2d 341 %load1 = load <4 x i32>, <4 x i32>* %A 342 %load2 = load <4 x i32>, <4 x i32>* %B 343 %tmp3 = load <2 x i64>, <2 x i64>* %C 344 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 345 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 346 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 347 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) 348 ret <2 x i64> %tmp5 349 } 350 351 define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 352 ;CHECK-LABEL: sqdmlsl4s: 353 ;CHECK: sqdmlsl.4s 354 %tmp1 = load <4 x i16>, <4 x i16>* %A 355 %tmp2 = load <4 x i16>, <4 x i16>* %B 356 %tmp3 = load <4 x i32>, <4 x i32>* %C 357 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 358 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) 359 ret <4 x i32> %tmp5 360 } 361 362 define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 363 ;CHECK-LABEL: sqdmlsl2d: 364 ;CHECK: sqdmlsl.2d 365 %tmp1 = load <2 x i32>, <2 x i32>* %A 366 %tmp2 = load <2 x i32>, <2 x i32>* %B 367 %tmp3 = load <2 x i64>, <2 x i64>* %C 368 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 369 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) 370 ret <2 x i64> %tmp5 371 } 372 373 define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { 374 ;CHECK-LABEL: sqdmlsl2_4s: 375 ;CHECK: sqdmlsl.4s 376 %load1 = load <8 x 
i16>, <8 x i16>* %A 377 %load2 = load <8 x i16>, <8 x i16>* %B 378 %tmp3 = load <4 x i32>, <4 x i32>* %C 379 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 380 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 381 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 382 %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4) 383 ret <4 x i32> %tmp5 384 } 385 386 define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { 387 ;CHECK-LABEL: sqdmlsl2_2d: 388 ;CHECK: sqdmlsl.2d 389 %load1 = load <4 x i32>, <4 x i32>* %A 390 %load2 = load <4 x i32>, <4 x i32>* %B 391 %tmp3 = load <2 x i64>, <2 x i64>* %C 392 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 393 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 394 %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 395 %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4) 396 ret <2 x i64> %tmp5 397 } 398 399 define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 400 ;CHECK-LABEL: umlal4s: 401 ;CHECK: umlal.4s 402 %tmp1 = load <4 x i16>, <4 x i16>* %A 403 %tmp2 = load <4 x i16>, <4 x i16>* %B 404 %tmp3 = load <4 x i32>, <4 x i32>* %C 405 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 406 %tmp5 = add <4 x i32> %tmp3, %tmp4 407 ret <4 x i32> %tmp5 408 } 409 410 define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 411 ;CHECK-LABEL: umlal2d: 412 ;CHECK: umlal.2d 413 %tmp1 = load <2 x i32>, <2 x i32>* %A 414 %tmp2 = load <2 x i32>, <2 x i32>* %B 415 %tmp3 = load <2 x i64>, <2 x i64>* %C 416 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 417 %tmp5 = add <2 x i64> %tmp3, 
%tmp4 418 ret <2 x i64> %tmp5 419 } 420 421 define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { 422 ;CHECK-LABEL: umlsl4s: 423 ;CHECK: umlsl.4s 424 %tmp1 = load <4 x i16>, <4 x i16>* %A 425 %tmp2 = load <4 x i16>, <4 x i16>* %B 426 %tmp3 = load <4 x i32>, <4 x i32>* %C 427 %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) 428 %tmp5 = sub <4 x i32> %tmp3, %tmp4 429 ret <4 x i32> %tmp5 430 } 431 432 define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { 433 ;CHECK-LABEL: umlsl2d: 434 ;CHECK: umlsl.2d 435 %tmp1 = load <2 x i32>, <2 x i32>* %A 436 %tmp2 = load <2 x i32>, <2 x i32>* %B 437 %tmp3 = load <2 x i64>, <2 x i64>* %C 438 %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) 439 %tmp5 = sub <2 x i64> %tmp3, %tmp4 440 ret <2 x i64> %tmp5 441 } 442 443 define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { 444 ;CHECK-LABEL: fmla_2s: 445 ;CHECK: fmla.2s 446 %tmp1 = load <2 x float>, <2 x float>* %A 447 %tmp2 = load <2 x float>, <2 x float>* %B 448 %tmp3 = load <2 x float>, <2 x float>* %C 449 %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3) 450 ret <2 x float> %tmp4 451 } 452 453 define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { 454 ;CHECK-LABEL: fmla_4s: 455 ;CHECK: fmla.4s 456 %tmp1 = load <4 x float>, <4 x float>* %A 457 %tmp2 = load <4 x float>, <4 x float>* %B 458 %tmp3 = load <4 x float>, <4 x float>* %C 459 %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3) 460 ret <4 x float> %tmp4 461 } 462 463 define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { 464 ;CHECK-LABEL: fmla_2d: 465 ;CHECK: fmla.2d 466 %tmp1 = load <2 x double>, <2 x double>* %A 467 %tmp2 = load <2 x double>, <2 x double>* %B 468 %tmp3 = load <2 x double>, <2 
x double>* %C 469 %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3) 470 ret <2 x double> %tmp4 471 } 472 473 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone 474 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone 475 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone 476 477 define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { 478 ;CHECK-LABEL: fmls_2s: 479 ;CHECK: fmls.2s 480 %tmp1 = load <2 x float>, <2 x float>* %A 481 %tmp2 = load <2 x float>, <2 x float>* %B 482 %tmp3 = load <2 x float>, <2 x float>* %C 483 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2 484 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3) 485 ret <2 x float> %tmp5 486 } 487 488 define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { 489 ;CHECK-LABEL: fmls_4s: 490 ;CHECK: fmls.4s 491 %tmp1 = load <4 x float>, <4 x float>* %A 492 %tmp2 = load <4 x float>, <4 x float>* %B 493 %tmp3 = load <4 x float>, <4 x float>* %C 494 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2 495 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3) 496 ret <4 x float> %tmp5 497 } 498 499 define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { 500 ;CHECK-LABEL: fmls_2d: 501 ;CHECK: fmls.2d 502 %tmp1 = load <2 x double>, <2 x double>* %A 503 %tmp2 = load <2 x double>, <2 x double>* %B 504 %tmp3 = load <2 x double>, <2 x double>* %C 505 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2 506 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3) 507 ret <2 x double> %tmp5 508 } 509 510 define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x 
float>* %B, <2 x float>* %C) nounwind { 511 ;CHECK-LABEL: fmls_commuted_neg_2s: 512 ;CHECK: fmls.2s 513 %tmp1 = load <2 x float>, <2 x float>* %A 514 %tmp2 = load <2 x float>, <2 x float>* %B 515 %tmp3 = load <2 x float>, <2 x float>* %C 516 %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2 517 %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3) 518 ret <2 x float> %tmp5 519 } 520 521 define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { 522 ;CHECK-LABEL: fmls_commuted_neg_4s: 523 ;CHECK: fmls.4s 524 %tmp1 = load <4 x float>, <4 x float>* %A 525 %tmp2 = load <4 x float>, <4 x float>* %B 526 %tmp3 = load <4 x float>, <4 x float>* %C 527 %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2 528 %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3) 529 ret <4 x float> %tmp5 530 } 531 532 define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { 533 ;CHECK-LABEL: fmls_commuted_neg_2d: 534 ;CHECK: fmls.2d 535 %tmp1 = load <2 x double>, <2 x double>* %A 536 %tmp2 = load <2 x double>, <2 x double>* %B 537 %tmp3 = load <2 x double>, <2 x double>* %C 538 %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2 539 %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3) 540 ret <2 x double> %tmp5 541 } 542 543 define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp { 544 ;CHECK-LABEL: fmls_indexed_2s: 545 ;CHECK: fmls.2s 546 entry: 547 %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c 548 %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer 549 %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a) 550 ret <2 x float> %fmls1 551 } 552 553 define <4 x float> 
@fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
;CHECK-LABEL: fmls_indexed_4s:
;CHECK: fmls.4s
entry:
  ; Negate %c (written as -0.0 - %c), splat lane 0 of %b, then fma with %a as
  ; the addend; a single fmls.4s is expected in the output.
  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
  %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
  %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
  ret <4 x float> %fmls1
}

; fmls_indexed_2d: <2 x double> variant of the test above. %c is negated,
; lane 0 of %b is splatted, and the fma with addend %a is expected to produce
; a single fmls.2d.
define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
;CHECK-LABEL: fmls_indexed_2d:
;CHECK: fmls.2d
entry:
  %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
  %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
  %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
  ret <2 x double> %fmls1
}

; fmla_indexed_scalar_2s: the scalar %c is inserted into both lanes of a
; <2 x float> and that splat is multiplied with %b and accumulated onto %a.
; The FileCheck lines below require the function body to be exactly one
; fmla.2s immediately followed by ret.
define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fmla_indexed_scalar_2s:
; CHECK-NEXT: fmla.2s
; CHECK-NEXT: ret
  %v1 = insertelement <2 x float> undef, float %c, i32 0
  %v2 = insertelement <2 x float> %v1, float %c, i32 1
  ; Use the completed splat %v2. %v1 has only lane 0 populated (lane 1 is
  ; still undef), and passing it here would leave %v2 dead, unlike the 4s and
  ; 2d variants of this test, which pass their fully built vector.
  %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %b, <2 x float> %a) nounwind
  ret <2 x float> %fmla1
}

; fmla_indexed_scalar_4s: same pattern for <4 x float>. %c is inserted into
; all four lanes (%v1 through %v4) and the full splat %v4 is the fma
; multiplicand; exactly one fmla.4s followed by ret is expected.
define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
entry:
; CHECK-LABEL: fmla_indexed_scalar_4s:
; CHECK-NEXT: fmla.4s
; CHECK-NEXT: ret
  %v1 = insertelement <4 x float> undef, float %c, i32 0
  %v2 = insertelement <4 x float> %v1, float %c, i32 1
  %v3 = insertelement <4 x float> %v2, float %c, i32 2
  %v4 = insertelement <4 x float> %v3, float %c, i32 3
  %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x
float> %a) nounwind 594 ret <4 x float> %fmla1 595 } 596 597 define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp { 598 ; CHECK-LABEL: fmla_indexed_scalar_2d: 599 ; CHECK-NEXT: fmla.2d 600 ; CHECK-NEXT: ret 601 entry: 602 %v1 = insertelement <2 x double> undef, double %c, i32 0 603 %v2 = insertelement <2 x double> %v1, double %c, i32 1 604 %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind 605 ret <2 x double> %fmla1 606 } 607 608 define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { 609 ;CHECK-LABEL: mul_4h: 610 ;CHECK-NOT: dup 611 ;CHECK: mul.4h 612 %tmp1 = load <4 x i16>, <4 x i16>* %A 613 %tmp2 = load <4 x i16>, <4 x i16>* %B 614 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 615 %tmp4 = mul <4 x i16> %tmp1, %tmp3 616 ret <4 x i16> %tmp4 617 } 618 619 define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { 620 ;CHECK-LABEL: mul_8h: 621 ;CHECK-NOT: dup 622 ;CHECK: mul.8h 623 %tmp1 = load <8 x i16>, <8 x i16>* %A 624 %tmp2 = load <8 x i16>, <8 x i16>* %B 625 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 626 %tmp4 = mul <8 x i16> %tmp1, %tmp3 627 ret <8 x i16> %tmp4 628 } 629 630 define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { 631 ;CHECK-LABEL: mul_2s: 632 ;CHECK-NOT: dup 633 ;CHECK: mul.2s 634 %tmp1 = load <2 x i32>, <2 x i32>* %A 635 %tmp2 = load <2 x i32>, <2 x i32>* %B 636 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 637 %tmp4 = mul <2 x i32> %tmp1, %tmp3 638 ret <2 x i32> %tmp4 639 } 640 641 define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { 642 ;CHECK-LABEL: mul_4s: 643 ;CHECK-NOT: dup 644 ;CHECK: mul.4s 645 %tmp1 = load <4 x i32>, <4 x i32>* %A 646 %tmp2 = load <4 x i32>, <4 x i32>* %B 647 %tmp3 = shufflevector <4 x i32> %tmp2, <4 
x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 648 %tmp4 = mul <4 x i32> %tmp1, %tmp3 649 ret <4 x i32> %tmp4 650 } 651 652 define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind { 653 ; CHECK-LABEL: mul_2d: 654 ; CHECK: mul 655 ; CHECK: mul 656 %tmp1 = mul <2 x i64> %A, %B 657 ret <2 x i64> %tmp1 658 } 659 660 define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind { 661 ;CHECK-LABEL: fmul_lane_2s: 662 ;CHECK-NOT: dup 663 ;CHECK: fmul.2s 664 %tmp1 = load <2 x float>, <2 x float>* %A 665 %tmp2 = load <2 x float>, <2 x float>* %B 666 %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1> 667 %tmp4 = fmul <2 x float> %tmp1, %tmp3 668 ret <2 x float> %tmp4 669 } 670 671 define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind { 672 ;CHECK-LABEL: fmul_lane_4s: 673 ;CHECK-NOT: dup 674 ;CHECK: fmul.4s 675 %tmp1 = load <4 x float>, <4 x float>* %A 676 %tmp2 = load <4 x float>, <4 x float>* %B 677 %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 678 %tmp4 = fmul <4 x float> %tmp1, %tmp3 679 ret <4 x float> %tmp4 680 } 681 682 define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind { 683 ;CHECK-LABEL: fmul_lane_2d: 684 ;CHECK-NOT: dup 685 ;CHECK: fmul.2d 686 %tmp1 = load <2 x double>, <2 x double>* %A 687 %tmp2 = load <2 x double>, <2 x double>* %B 688 %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1> 689 %tmp4 = fmul <2 x double> %tmp1, %tmp3 690 ret <2 x double> %tmp4 691 } 692 693 define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind { 694 ;CHECK-LABEL: fmul_lane_s: 695 ;CHECK-NOT: dup 696 ;CHECK: fmul.s s0, s0, v1[3] 697 %B = extractelement <4 x float> %vec, i32 3 698 %res = fmul float %A, %B 699 ret float %res 700 } 701 702 define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind { 703 ;CHECK-LABEL: fmul_lane_d: 704 ;CHECK-NOT: dup 705 ;CHECK: fmul.d d0, d0, 
v1[1] 706 %B = extractelement <2 x double> %vec, i32 1 707 %res = fmul double %A, %B 708 ret double %res 709 } 710 711 712 713 define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind { 714 ;CHECK-LABEL: fmulx_lane_2s: 715 ;CHECK-NOT: dup 716 ;CHECK: fmulx.2s 717 %tmp1 = load <2 x float>, <2 x float>* %A 718 %tmp2 = load <2 x float>, <2 x float>* %B 719 %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1> 720 %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3) 721 ret <2 x float> %tmp4 722 } 723 724 define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind { 725 ;CHECK-LABEL: fmulx_lane_4s: 726 ;CHECK-NOT: dup 727 ;CHECK: fmulx.4s 728 %tmp1 = load <4 x float>, <4 x float>* %A 729 %tmp2 = load <4 x float>, <4 x float>* %B 730 %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 731 %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3) 732 ret <4 x float> %tmp4 733 } 734 735 define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind { 736 ;CHECK-LABEL: fmulx_lane_2d: 737 ;CHECK-NOT: dup 738 ;CHECK: fmulx.2d 739 %tmp1 = load <2 x double>, <2 x double>* %A 740 %tmp2 = load <2 x double>, <2 x double>* %B 741 %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1> 742 %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3) 743 ret <2 x double> %tmp4 744 } 745 746 define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { 747 ;CHECK-LABEL: sqdmulh_lane_4h: 748 ;CHECK-NOT: dup 749 ;CHECK: sqdmulh.4h 750 %tmp1 = load <4 x i16>, <4 x i16>* %A 751 %tmp2 = load <4 x i16>, <4 x i16>* %B 752 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 753 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3) 
754 ret <4 x i16> %tmp4 755 } 756 757 define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { 758 ;CHECK-LABEL: sqdmulh_lane_8h: 759 ;CHECK-NOT: dup 760 ;CHECK: sqdmulh.8h 761 %tmp1 = load <8 x i16>, <8 x i16>* %A 762 %tmp2 = load <8 x i16>, <8 x i16>* %B 763 %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 764 %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3) 765 ret <8 x i16> %tmp4 766 } 767 768 define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { 769 ;CHECK-LABEL: sqdmulh_lane_2s: 770 ;CHECK-NOT: dup 771 ;CHECK: sqdmulh.2s 772 %tmp1 = load <2 x i32>, <2 x i32>* %A 773 %tmp2 = load <2 x i32>, <2 x i32>* %B 774 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> 775 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3) 776 ret <2 x i32> %tmp4 777 } 778 779 define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { 780 ;CHECK-LABEL: sqdmulh_lane_4s: 781 ;CHECK-NOT: dup 782 ;CHECK: sqdmulh.4s 783 %tmp1 = load <4 x i32>, <4 x i32>* %A 784 %tmp2 = load <4 x i32>, <4 x i32>* %B 785 %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 786 %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3) 787 ret <4 x i32> %tmp4 788 } 789 790 define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { 791 ;CHECK-LABEL: sqdmulh_lane_1s: 792 ;CHECK-NOT: dup 793 ;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1] 794 %tmp1 = extractelement <4 x i32> %B, i32 1 795 %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1) 796 ret i32 %tmp2 797 } 798 799 define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { 800 ;CHECK-LABEL: sqrdmulh_lane_4h: 801 ;CHECK-NOT: dup 802 ;CHECK: sqrdmulh.4h 803 %tmp1 = load <4 x i16>, <4 x i16>* %A 804 %tmp2 = load <4 x 
i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
  ret <4 x i16> %tmp4
}

; sqrdmulh on <8 x i16> with the second operand splatted from lane 1 of the
; vector loaded from %B. No dup is expected ahead of sqrdmulh.8h.
define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_8h:
;CHECK-NOT: dup
;CHECK: sqrdmulh.8h
  %lhs = load <8 x i16>, <8 x i16>* %A
  %vec = load <8 x i16>, <8 x i16>* %B
  %splat = shufflevector <8 x i16> %vec, <8 x i16> %vec, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %splat)
  ret <8 x i16> %res
}

; Same pattern on <2 x i32>: lane-1 splat feeding sqrdmulh.2s without a dup.
define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_2s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.2s
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i32> %res
}

; Same pattern on <4 x i32>: lane-1 splat feeding sqrdmulh.4s without a dup.
define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_4s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.4s
  %lhs = load <4 x i32>, <4 x i32>* %A
  %vec = load <4 x i32>, <4 x i32>* %B
  %splat = shufflevector <4 x i32> %vec, <4 x i32> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %splat)
  ret <4 x i32> %res
}

; Scalar sqrdmulh of %A with element 1 extracted from %B. The expected
; instruction addresses lane [1] of a vector register directly.
define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
;CHECK-LABEL: sqrdmulh_lane_1s:
;CHECK-NOT: dup
;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
  %lane = extractelement <4 x i32> %B, i32 1
  %res = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %lane)
  ret i32 %res
}
; Widening sqdmull of <4 x i16> by lane 1 of the vector loaded from %B.
; No dup is expected ahead of sqdmull.4s.
define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  ret <4 x i32> %res
}

; Same pattern on <2 x i32>: lane-1 splat feeding sqdmull.2d without a dup.
define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i64> %res
}

; sqdmull of the upper four elements of the <8 x i16> loaded from %A by
; lane 1 of the <8 x i16> loaded from %B. No dup is expected.
define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sqdmull2_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmull.4s
  %wide = load <8 x i16>, <8 x i16>* %A
  %vec = load <8 x i16>, <8 x i16>* %B
  %hi = shufflevector <8 x i16> %wide, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> %splat)
  ret <4 x i32> %res
}

define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sqdmull2_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmull.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x
i32> %tmp2)
  ret <2 x i64> %tmp4
}

; Widening umull of <4 x i16> by lane 1 of the vector loaded from %B.
; No dup is expected ahead of umull.4s.
define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: umull_lane_4s:
;CHECK-NOT: dup
;CHECK: umull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  ret <4 x i32> %res
}

; Same pattern on <2 x i32>: lane-1 splat feeding umull.2d without a dup.
define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: umull_lane_2d:
;CHECK-NOT: dup
;CHECK: umull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i64> %res
}

; Signed counterpart: lane-1 splat feeding smull.4s without a dup.
define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: smull_lane_4s:
;CHECK-NOT: dup
;CHECK: smull.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  ret <4 x i32> %res
}

; Signed counterpart: lane-1 splat feeding smull.2d without a dup.
define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: smull_lane_2d:
;CHECK-NOT: dup
;CHECK: smull.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %res = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  ret <2 x i64> %res
}

; smull by a lane-1 splat, added to the accumulator loaded from %C.
; Expected to become smlal.4s with no dup.
define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlal_lane_4s:
;CHECK-NOT: dup
;CHECK: smlal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %res = add <4 x i32> %acc, %prod
  ret <4 x i32> %res
}

; Same multiply-accumulate pattern on <2 x i32>; expects smlal.2d, no dup.
define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: smlal_lane_2d:
;CHECK-NOT: dup
;CHECK: smlal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %res = add <2 x i64> %acc, %prod
  ret <2 x i64> %res
}

; sqdmull by a lane-1 splat combined with the accumulator through sqadd.
; Expected to become sqdmlal.4s with no dup.
define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmlal.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}

define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmlal.2d
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64>
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
  ret <2 x i64> %tmp6
}

; sqdmull of the upper four elements of the vector loaded from %A by lane 1
; of the vector loaded from %B, then sqadd with the accumulator from %C.
; Expects sqdmlal.4s with no dup.
define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlal2_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmlal.4s
  %wide = load <8 x i16>, <8 x i16>* %A
  %vec = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %hi = shufflevector <8 x i16> %wide, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}

; Same pattern on <4 x i32> inputs: upper two elements times lane 1, then
; sqadd with the accumulator. Expects sqdmlal.2d with no dup.
define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlal2_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmlal.2d
  %wide = load <4 x i32>, <4 x i32>* %A
  %vec = load <4 x i32>, <4 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %hi = shufflevector <4 x i32> %wide, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %hi, <2 x i32> %splat)
  %res = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %acc, <2 x i64> %prod)
  ret <2 x i64> %res
}

define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_1s:
;CHECK: sqdmlal.4s
  %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
  %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x
i16> %lhs, <4 x i16> %rhs)
  %prod = extractelement <4 x i32> %prod.vec, i32 0
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
  ret i32 %res
}
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)

; Scalar i32 form: %B is placed in element 0 of a vector, multiplied via
; sqdmull with a shuffle whose element 0 is lane 1 of %C, and element 0 of
; the product is combined with %A through sqsub. Expects sqdmlsl.4s.
define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_1s:
;CHECK: sqdmlsl.4s
  %b.vec = insertelement <4 x i16> undef, i16 %B, i32 0
  %c.lane1 = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %mul.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b.vec, <4 x i16> %c.lane1)
  %mul = extractelement <4 x i32> %mul.vec, i32 0
  %diff = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %mul)
  ret i32 %diff
}
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)

; Scalar i64 form: sqdmulls.scalar of %B with element 1 of %C, then sqadd
; with %A. Expects sqdmlal.s.
define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
;CHECK-LABEL: sqdmlal_lane_1d:
;CHECK: sqdmlal.s
  %lane = extractelement <2 x i32> %C, i32 1
  %mul = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %lane)
  %sum = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %mul)
  ret i64 %sum
}
declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)

; Subtracting counterpart of the i64 form: sqdmulls.scalar then sqsub.
; Expects sqdmlsl.s.
define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_1d:
;CHECK: sqdmlsl.s
  %lane = extractelement <2 x i32> %C, i32 1
  %mul = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %lane)
  %diff = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %mul)
  ret i64 %diff
}
declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)


define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlal_lane_4s:
;CHECK-NOT: dup
;CHECK: umlal.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 =
load <4 x i32>, <4 x i32>* %C
  %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
  %tmp6 = add <4 x i32> %tmp3, %tmp5
  ret <4 x i32> %tmp6
}

; umull by a lane-1 splat, added to the accumulator loaded from %C.
; Expected to become umlal.2d with no dup.
define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlal_lane_2d:
;CHECK-NOT: dup
;CHECK: umlal.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %res = add <2 x i64> %acc, %prod
  ret <2 x i64> %res
}


; smull by a lane-1 splat, subtracted from the accumulator loaded from %C.
; Expected to become smlsl.4s with no dup.
define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: smlsl_lane_4s:
;CHECK-NOT: dup
;CHECK: smlsl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %res = sub <4 x i32> %acc, %prod
  ret <4 x i32> %res
}

; Same multiply-subtract pattern on <2 x i32>; expects smlsl.2d, no dup.
define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: smlsl_lane_2d:
;CHECK-NOT: dup
;CHECK: smlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %res = sub <2 x i64> %acc, %prod
  ret <2 x i64> %res
}

define <4
x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmlsl.4s
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
  ret <4 x i32> %tmp6
}

; sqdmull by a lane-1 splat combined with the accumulator through sqsub.
; Expected to become sqdmlsl.2d with no dup.
define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlsl_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %res = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %acc, <2 x i64> %prod)
  ret <2 x i64> %res
}

; sqdmull of the upper four elements of the vector loaded from %A by lane 1
; of the vector loaded from %B, then sqsub from the accumulator from %C.
; Expects sqdmlsl.4s with no dup.
define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2_lane_4s:
;CHECK-NOT: dup
;CHECK: sqdmlsl.4s
  %wide = load <8 x i16>, <8 x i16>* %A
  %vec = load <8 x i16>, <8 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %hi = shufflevector <8 x i16> %wide, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %splat = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %hi, <4 x i16> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
  ret <4 x i32> %res
}

define <2 x i64>
@sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sqdmlsl2_lane_2d:
;CHECK-NOT: dup
;CHECK: sqdmlsl.2d
  %load1 = load <4 x i32>, <4 x i32>* %A
  %load2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
  ret <2 x i64> %tmp6
}

; umull by a lane-1 splat, subtracted from the accumulator loaded from %C.
; Expected to become umlsl.4s with no dup.
define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: umlsl_lane_4s:
;CHECK-NOT: dup
;CHECK: umlsl.4s
  %lhs = load <4 x i16>, <4 x i16>* %A
  %vec = load <4 x i16>, <4 x i16>* %B
  %acc = load <4 x i32>, <4 x i32>* %C
  %splat = shufflevector <4 x i16> %vec, <4 x i16> %vec, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %lhs, <4 x i16> %splat)
  %res = sub <4 x i32> %acc, %prod
  ret <4 x i32> %res
}

; Same multiply-subtract pattern on <2 x i32>; expects umlsl.2d, no dup.
define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: umlsl_lane_2d:
;CHECK-NOT: dup
;CHECK: umlsl.2d
  %lhs = load <2 x i32>, <2 x i32>* %A
  %vec = load <2 x i32>, <2 x i32>* %B
  %acc = load <2 x i64>, <2 x i64>* %C
  %splat = shufflevector <2 x i32> %vec, <2 x i32> %vec, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs, <2 x i32> %splat)
  %res = sub <2 x i64> %acc, %prod
  ret <2 x i64> %res
}

; Scalar FMULX
define float @fmulxs(float %a, float %b) nounwind {
; CHECK-LABEL: fmulxs:
; CHECK-NEXT: fmulx s0, s0, s1
  %fmulx.i = tail call float
@llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
; CHECK-NEXT: ret
  ret float %fmulx.i
}

; Scalar double fmulx of the two arguments; the function body is expected to
; be exactly one fmulx followed by ret.
define double @fmulxd(double %a, double %b) nounwind {
; CHECK-LABEL: fmulxd:
; CHECK-NEXT: fmulx d0, d0, d1
; CHECK-NEXT: ret
  %res = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
  ret double %res
}

; Scalar float fmulx of %a with element 3 of %vec; expected to use the
; by-element form reading v1[3] with no separate extract.
define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
; CHECK-LABEL: fmulxs_lane:
; CHECK-NEXT: fmulx.s s0, s0, v1[3]
; CHECK-NEXT: ret
  %lane = extractelement <4 x float> %vec, i32 3
  %res = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %lane) nounwind
  ret float %res
}

; Scalar double fmulx of %a with element 1 of %vec; expected to use the
; by-element form reading v1[1].
define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
; CHECK-LABEL: fmulxd_lane:
; CHECK-NEXT: fmulx.d d0, d0, v1[1]
; CHECK-NEXT: ret
  %lane = extractelement <2 x double> %vec, i32 1
  %res = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %lane) nounwind
  ret double %res
}

declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone


; smull of elements 8..15 of both <16 x i8> arguments; expected to select a
; single smull2.8h.
define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: smull2_8h_simple:
; CHECK-NEXT: smull2.8h v0, v0, v1
; CHECK-NEXT: ret
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi) #2
  ret <8 x i16> %res
}

define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: foo0:
; CHECK: smull2.8h v0, v0, v1
  %tmp = bitcast <16 x i8> %a to <2 x i64>
%shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
  ret <8 x i16> %vmull.i.i
}

; The upper 64 bits of %a and %b are taken as element 1 of a <2 x i64> view,
; recast to <4 x i16> and fed to smull; expected to select smull2.4s.
define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: foo1:
; CHECK: smull2.4s v0, v0, v1
  %a.i64 = bitcast <8 x i16> %a to <2 x i64>
  %a.hi = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi.vec = bitcast <1 x i64> %a.hi to <4 x i16>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <4 x i16>
  %res = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a.hi.vec, <4 x i16> %b.hi.vec) nounwind
  ret <4 x i32> %res
}

; Same high-half extraction on <4 x i32> inputs; expected to select
; smull2.2d.
define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: foo2:
; CHECK: smull2.2d v0, v0, v1
  %a.i64 = bitcast <4 x i32> %a to <2 x i64>
  %a.hi = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi.vec = bitcast <1 x i64> %a.hi to <2 x i32>
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <2 x i32>
  %res = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a.hi.vec, <2 x i32> %b.hi.vec) nounwind
  ret <2 x i64> %res
}

define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: foo3:
; CHECK: umull2.8h v0, v0, v1
  %tmp = bitcast <16 x i8> %a to <2 x i64>
  %shuffle.i.i
= shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
  %tmp2 = bitcast <16 x i8> %b to <2 x i64>
  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
  ret <8 x i16> %vmull.i.i
}

; The upper 64 bits of %a and %b are taken as element 1 of a <2 x i64> view,
; recast to <4 x i16> and fed to umull; expected to select umull2.4s.
define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: foo4:
; CHECK: umull2.4s v0, v0, v1
  %a.i64 = bitcast <8 x i16> %a to <2 x i64>
  %a.hi = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi.vec = bitcast <1 x i64> %a.hi to <4 x i16>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <4 x i16>
  %res = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a.hi.vec, <4 x i16> %b.hi.vec) nounwind
  ret <4 x i32> %res
}

; Same high-half extraction on <4 x i32> inputs; expected to select
; umull2.2d.
define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: foo5:
; CHECK: umull2.2d v0, v0, v1
  %a.i64 = bitcast <4 x i32> %a to <2 x i64>
  %a.hi = shufflevector <2 x i64> %a.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %a.hi.vec = bitcast <1 x i64> %a.hi to <2 x i32>
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <2 x i32>
  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a.hi.vec, <2 x i32> %b.hi.vec) nounwind
  ret <2 x i64> %res
}

define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo6:
; CHECK-NEXT: smull2.4s v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
  %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
  ret <4 x i32> %vmull2.i
}

; smull of the upper 64 bits of %b (viewed as <2 x i32>) by lane 1 of %c,
; splatted. Expected to be a single by-element smull2.2d reading v2[1].
; %a is not used.
define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo7:
; CHECK-NEXT: smull2.2d v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <2 x i32>
  %splat = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %res = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi.vec, <2 x i32> %splat) nounwind
  ret <2 x i64> %res
}

; umull of the upper 64 bits of %b (viewed as <4 x i16>) by lane 1 of %c,
; splatted. Expected to be a single by-element umull2.4s reading v2[1].
; %a is not used.
define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo8:
; CHECK-NEXT: umull2.4s v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <4 x i16>
  %splat = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %res = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi.vec, <4 x i16> %splat) nounwind
  ret <4 x i32> %res
}

define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: foo9:
; CHECK-NEXT: umull2.2d v0, v1, v2[1]
; CHECK-NEXT: ret
entry:
  %0 = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef,
<1 x i32> <i32 1>
  %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
  ret <2 x i64> %vmull2.i
}

; smull of the upper 64 bits of %b and %c (viewed as <8 x i8>), added to %a.
; Expected to fold into smlal2.8h immediately followed by ret.
define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar0:
; CHECK: smlal2.8h v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <16 x i8> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <8 x i8>
  %c.i64 = bitcast <16 x i8> %c to <2 x i64>
  %c.hi = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi.vec = bitcast <1 x i64> %c.hi to <8 x i8>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b.hi.vec, <8 x i8> %c.hi.vec) nounwind
  %res = add <8 x i16> %prod, %a
  ret <8 x i16> %res
}

; Same multiply-accumulate of high halves on <8 x i16> inputs; expected to
; fold into smlal2.4s immediately followed by ret.
define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
; CHECK-LABEL: bar1:
; CHECK: smlal2.4s v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <4 x i16>
  %c.i64 = bitcast <8 x i16> %c to <2 x i64>
  %c.hi = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi.vec = bitcast <1 x i64> %c.hi to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi.vec, <4 x i16> %c.hi.vec) nounwind
  %res = add <4 x i32> %prod, %a
  ret <4 x i32> %res
}

define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
; CHECK-LABEL: bar2:
; CHECK: smlal2.2d v0, v1, v2
; CHECK-NEXT: ret

  %tmp =
bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
  %tmp2 = bitcast <4 x i32> %c to <2 x i64>
  %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
  %add.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i
}

; umull of the upper 64 bits of %b and %c (viewed as <8 x i8>), added to %a.
; Expected to fold into umlal2.8h immediately followed by ret.
define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
; CHECK-LABEL: bar3:
; CHECK: umlal2.8h v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <16 x i8> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <8 x i8>
  %c.i64 = bitcast <16 x i8> %c to <2 x i64>
  %c.hi = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi.vec = bitcast <1 x i64> %c.hi to <8 x i8>
  %prod = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b.hi.vec, <8 x i8> %c.hi.vec) nounwind
  %res = add <8 x i16> %prod, %a
  ret <8 x i16> %res
}

define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
; CHECK-LABEL: bar4:
; CHECK: umlal2.4s v0, v1, v2
; CHECK-NEXT: ret

  %tmp = bitcast <8 x i16> %b to <2 x i64>
  %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
  %tmp2 = bitcast <8 x i16> %c to <2 x i64>
  %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
%add.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i
}

; umull of the upper 64 bits of %b and %c (viewed as <2 x i32>), added to
; %a. Expected to fold into umlal2.2d immediately followed by ret.
define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
; CHECK-LABEL: bar5:
; CHECK: umlal2.2d v0, v1, v2
; CHECK-NEXT: ret

  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <2 x i32>
  %c.i64 = bitcast <4 x i32> %c to <2 x i64>
  %c.hi = shufflevector <2 x i64> %c.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %c.hi.vec = bitcast <1 x i64> %c.hi to <2 x i32>
  %prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi.vec, <2 x i32> %c.hi.vec) nounwind
  %res = add <2 x i64> %prod, %a
  ret <2 x i64> %res
}

; Lane 3 of %c is splatted across eight lanes; the upper half of that splat
; and the upper half of %b go through smull and the product is added to %a.
; Expected to become by-element smlal2.4s reading v2[3].
define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
; CHECK-LABEL: mlal2_1:
; CHECK: smlal2.4s v0, v1, v2[3]
; CHECK-NEXT: ret
  %splat = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <4 x i16>
  %splat.i64 = bitcast <8 x i16> %splat to <2 x i64>
  %splat.hi = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %splat.hi.vec = bitcast <1 x i64> %splat.hi to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi.vec, <4 x i16> %splat.hi.vec) nounwind
  %res = add <4 x i32> %prod, %a
  ret <4 x i32> %res
}

define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
; CHECK-LABEL: mlal2_2:
; CHECK: smlal2.2d v0, v1, v2[1]
; CHECK-NEXT: ret
  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
  %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
  %add = add <2 x i64> %vmull2.i.i, %a
  ret <2 x i64> %add
}

; Lane 2 of %c is splatted across eight lanes; the upper half of that splat
; and the upper half of %b go through umull and the product is added to %a.
; Expected to become by-element umlal2.4s reading v2[2].
define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
; CHECK-LABEL: mlal2_4:
; CHECK: umlal2.4s v0, v1, v2[2]
; CHECK-NEXT: ret

  %splat = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi = shufflevector <2 x i64> %b.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %b.hi.vec = bitcast <1 x i64> %b.hi to <4 x i16>
  %splat.i64 = bitcast <8 x i16> %splat to <2 x i64>
  %splat.hi = shufflevector <2 x i64> %splat.i64, <2 x i64> undef, <1 x i32> <i32 1>
  %splat.hi.vec = bitcast <1 x i64> %splat.hi to <4 x i16>
  %prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi.vec, <4 x i16> %splat.hi.vec) nounwind
  %res = add <4 x i32> %prod, %a
  ret <4 x i32> %res
}

define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
; CHECK-LABEL: mlal2_5:
; CHECK: umlal2.2d v0, v1, v2[0]
; CHECK-NEXT: ret
  %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
  %tmp = bitcast <4 x i32> %b to <2 x i64>
  %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
  %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
  %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
  %shuffle.i3.i = shufflevector <2 x
i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1> 1535 %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32> 1536 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind 1537 %add = add <2 x i64> %vmull2.i.i, %a 1538 ret <2 x i64> %add 1539 } 1540 1541 ; rdar://12328502 1542 define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp { 1543 entry: 1544 ; CHECK-LABEL: vmulq_n_f64: 1545 ; CHECK-NOT: dup.2d 1546 ; CHECK: fmul.2d v0, v0, v1[0] 1547 %vecinit.i = insertelement <2 x double> undef, double %y, i32 0 1548 %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1 1549 %mul.i = fmul <2 x double> %vecinit1.i, %x 1550 ret <2 x double> %mul.i 1551 } 1552 1553 define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp { 1554 entry: 1555 ; CHECK-LABEL: vmulq_n_f32: 1556 ; CHECK-NOT: dup.4s 1557 ; CHECK: fmul.4s v0, v0, v1[0] 1558 %vecinit.i = insertelement <4 x float> undef, float %y, i32 0 1559 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1 1560 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2 1561 %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3 1562 %mul.i = fmul <4 x float> %vecinit3.i, %x 1563 ret <4 x float> %mul.i 1564 } 1565 1566 define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp { 1567 entry: 1568 ; CHECK-LABEL: vmul_n_f32: 1569 ; CHECK-NOT: dup.2s 1570 ; CHECK: fmul.2s v0, v0, v1[0] 1571 %vecinit.i = insertelement <2 x float> undef, float %y, i32 0 1572 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1 1573 %mul.i = fmul <2 x float> %vecinit1.i, %x 1574 ret <2 x float> %mul.i 1575 } 1576 1577 define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp { 1578 entry: 1579 ; CHECK: vmla_laneq_s16_test 1580 ; CHECK-NOT: ext 1581 ; CHECK: mla.4h v0, v1, v2[6] 1582 ; CHECK-NEXT: ret 1583 %shuffle = shufflevector 
<8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> 1584 %mul = mul <4 x i16> %shuffle, %b 1585 %add = add <4 x i16> %mul, %a 1586 ret <4 x i16> %add 1587 } 1588 1589 define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp { 1590 entry: 1591 ; CHECK: vmla_laneq_s32_test 1592 ; CHECK-NOT: ext 1593 ; CHECK: mla.2s v0, v1, v2[3] 1594 ; CHECK-NEXT: ret 1595 %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3> 1596 %mul = mul <2 x i32> %shuffle, %b 1597 %add = add <2 x i32> %mul, %a 1598 ret <2 x i32> %add 1599 } 1600 1601 define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp { 1602 entry: 1603 ; CHECK: not_really_vmlaq_laneq_s16_test 1604 ; CHECK-NOT: ext 1605 ; CHECK: mla.8h v0, v1, v2[5] 1606 ; CHECK-NEXT: ret 1607 %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 1608 %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1609 %mul = mul <8 x i16> %shuffle2, %b 1610 %add = add <8 x i16> %mul, %a 1611 ret <8 x i16> %add 1612 } 1613 1614 define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp { 1615 entry: 1616 ; CHECK: not_really_vmlaq_laneq_s32_test 1617 ; CHECK-NOT: ext 1618 ; CHECK: mla.4s v0, v1, v2[3] 1619 ; CHECK-NEXT: ret 1620 %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1621 %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1622 %mul = mul <4 x i32> %shuffle2, %b 1623 %add = add <4 x i32> %mul, %a 1624 ret <4 x i32> %add 1625 } 1626 1627 define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { 1628 entry: 1629 ; CHECK: vmull_laneq_s16_test 1630 ; CHECK-NOT: ext 1631 ; CHECK: smull.4s v0, v0, v1[6] 1632 ; 
CHECK-NEXT: ret 1633 %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> 1634 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 1635 ret <4 x i32> %vmull2.i 1636 } 1637 1638 define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp { 1639 entry: 1640 ; CHECK: vmull_laneq_s32_test 1641 ; CHECK-NOT: ext 1642 ; CHECK: smull.2d v0, v0, v1[2] 1643 ; CHECK-NEXT: ret 1644 %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2> 1645 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 1646 ret <2 x i64> %vmull2.i 1647 } 1648 define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { 1649 entry: 1650 ; CHECK: vmull_laneq_u16_test 1651 ; CHECK-NOT: ext 1652 ; CHECK: umull.4s v0, v0, v1[6] 1653 ; CHECK-NEXT: ret 1654 %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6> 1655 %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 1656 ret <4 x i32> %vmull2.i 1657 } 1658 1659 define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp { 1660 entry: 1661 ; CHECK: vmull_laneq_u32_test 1662 ; CHECK-NOT: ext 1663 ; CHECK: umull.2d v0, v0, v1[2] 1664 ; CHECK-NEXT: ret 1665 %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2> 1666 %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 1667 ret <2 x i64> %vmull2.i 1668 } 1669 1670 define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { 1671 entry: 1672 ; CHECK: vmull_high_n_s16_test 1673 ; CHECK-NOT: ext 1674 ; CHECK: smull2.4s 1675 ; CHECK-NEXT: ret 1676 %conv = trunc i32 %d to i16 1677 %0 = bitcast <8 x i16> %b to <2 x i64> 1678 %shuffle.i.i = shufflevector <2 
x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1679 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1680 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0 1681 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 1682 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 1683 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3 1684 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind 1685 ret <4 x i32> %vmull2.i.i 1686 } 1687 1688 define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp { 1689 entry: 1690 ; CHECK: vmull_high_n_s32_test 1691 ; CHECK-NOT: ext 1692 ; CHECK: smull2.2d 1693 ; CHECK-NEXT: ret 1694 %0 = bitcast <4 x i32> %b to <2 x i64> 1695 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1696 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1697 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0 1698 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1 1699 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind 1700 ret <2 x i64> %vmull2.i.i 1701 } 1702 1703 define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { 1704 entry: 1705 ; CHECK: vmull_high_n_u16_test 1706 ; CHECK-NOT: ext 1707 ; CHECK: umull2.4s 1708 ; CHECK-NEXT: ret 1709 %conv = trunc i32 %d to i16 1710 %0 = bitcast <8 x i16> %b to <2 x i64> 1711 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1712 %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> 1713 %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0 1714 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1 1715 %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2 1716 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 
3 1717 %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind 1718 ret <4 x i32> %vmull2.i.i 1719 } 1720 1721 define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp { 1722 entry: 1723 ; CHECK: vmull_high_n_u32_test 1724 ; CHECK-NOT: ext 1725 ; CHECK: umull2.2d 1726 ; CHECK-NEXT: ret 1727 %0 = bitcast <4 x i32> %b to <2 x i64> 1728 %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1> 1729 %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> 1730 %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0 1731 %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1 1732 %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind 1733 ret <2 x i64> %vmull2.i.i 1734 } 1735 1736 define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) { 1737 ; CHECK-LABEL: vmul_built_dup_test: 1738 ; CHECK-NOT: ins 1739 ; CHECK-NOT: dup 1740 ; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1] 1741 %vget_lane = extractelement <4 x i32> %b, i32 1 1742 %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0 1743 %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1 1744 %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2 1745 %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3 1746 %prod = mul <4 x i32> %a, %vecinit3.i 1747 ret <4 x i32> %prod 1748 } 1749 1750 define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) { 1751 ; CHECK-LABEL: vmul_built_dup_fromsmall_test: 1752 ; CHECK-NOT: ins 1753 ; CHECK-NOT: dup 1754 ; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3] 1755 %vget_lane = extractelement <4 x i16> %b, i32 3 1756 %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 1757 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 1758 %vecinit2.i = 
insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2 1759 %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3 1760 %prod = mul <4 x i16> %a, %vecinit3.i 1761 ret <4 x i16> %prod 1762 } 1763 1764 define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) { 1765 ; CHECK-LABEL: vmulq_built_dup_fromsmall_test: 1766 ; CHECK-NOT: ins 1767 ; CHECK-NOT: dup 1768 ; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] 1769 %vget_lane = extractelement <4 x i16> %b, i32 0 1770 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 1771 %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1 1772 %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2 1773 %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3 1774 %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4 1775 %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5 1776 %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6 1777 %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7 1778 %prod = mul <8 x i16> %a, %vecinit7.i 1779 ret <8 x i16> %prod 1780 } 1781 1782 define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) { 1783 ; CHECK-LABEL: mull_from_two_extracts: 1784 ; CHECK-NOT: ext 1785 ; CHECK: sqdmull2.2d 1786 1787 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1788 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1789 1790 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1791 ret <2 x i64> %res 1792 } 1793 1794 define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { 1795 ; CHECK-LABEL: mlal_from_two_extracts: 1796 ; CHECK-NOT: ext 1797 ; CHECK: sqdmlal2.2d 1798 1799 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, 
i32 3> 1800 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1801 1802 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1803 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) 1804 ret <2 x i64> %sum 1805 } 1806 1807 define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { 1808 ; CHECK-LABEL: mull_from_extract_dup: 1809 ; CHECK-NOT: ext 1810 ; CHECK: sqdmull2.2d 1811 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 1812 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 1813 1814 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1815 1816 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind 1817 ret <2 x i64> %res 1818 } 1819 1820 define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) { 1821 ; CHECK-LABEL: pmull_from_extract_dup: 1822 ; CHECK-NOT: ext 1823 ; CHECK: pmull2.8h 1824 %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 1825 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 1826 1827 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1828 1829 %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind 1830 ret <8 x i16> %res 1831 } 1832 1833 define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) { 1834 ; CHECK-LABEL: pmull_from_extract_duplane: 1835 ; CHECK-NOT: ext 1836 ; CHECK: pmull2.8h 1837 1838 %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1839 %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 1840 1841 
%res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind 1842 ret <8 x i16> %res 1843 } 1844 1845 define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) { 1846 ; CHECK-LABEL: sqdmull_from_extract_duplane: 1847 ; CHECK-NOT: ext 1848 ; CHECK: sqdmull2.2d 1849 1850 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1851 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> 1852 1853 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1854 ret <2 x i64> %res 1855 } 1856 1857 define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { 1858 ; CHECK-LABEL: sqdmlal_from_extract_duplane: 1859 ; CHECK-NOT: ext 1860 ; CHECK: sqdmlal2.2d 1861 1862 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1863 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> 1864 1865 %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1866 %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res) 1867 ret <2 x i64> %sum 1868 } 1869 1870 define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { 1871 ; CHECK-LABEL: umlal_from_extract_duplane: 1872 ; CHECK-NOT: ext 1873 ; CHECK: umlal2.2d 1874 1875 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3> 1876 %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0> 1877 1878 %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind 1879 %sum = add <2 x i64> %accum, %res 1880 ret <2 x i64> %sum 1881 } 1882 1883 define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) { 1884 ; CHECK-LABEL: 
scalar_fmla_from_extract_v4f32: 1885 ; CHECK: fmla.s s0, s1, v2[3] 1886 %rhs = extractelement <4 x float> %rvec, i32 3 1887 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) 1888 ret float %res 1889 } 1890 1891 define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) { 1892 ; CHECK-LABEL: scalar_fmla_from_extract_v2f32: 1893 ; CHECK: fmla.s s0, s1, v2[1] 1894 %rhs = extractelement <2 x float> %rvec, i32 1 1895 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) 1896 ret float %res 1897 } 1898 1899 define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) { 1900 ; CHECK-LABEL: scalar_fmls_from_extract_v4f32: 1901 ; CHECK: fmls.s s0, s1, v2[3] 1902 %rhs.scal = extractelement <4 x float> %rvec, i32 3 1903 %rhs = fsub float -0.0, %rhs.scal 1904 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) 1905 ret float %res 1906 } 1907 1908 define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) { 1909 ; CHECK-LABEL: scalar_fmls_from_extract_v2f32: 1910 ; CHECK: fmls.s s0, s1, v2[1] 1911 %rhs.scal = extractelement <2 x float> %rvec, i32 1 1912 %rhs = fsub float -0.0, %rhs.scal 1913 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) 1914 ret float %res 1915 } 1916 1917 declare float @llvm.fma.f32(float, float, float) 1918 1919 define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) { 1920 ; CHECK-LABEL: scalar_fmla_from_extract_v2f64: 1921 ; CHECK: fmla.d d0, d1, v2[1] 1922 %rhs = extractelement <2 x double> %rvec, i32 1 1923 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum) 1924 ret double %res 1925 } 1926 1927 define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) { 1928 ; CHECK-LABEL: scalar_fmls_from_extract_v2f64: 1929 ; CHECK: fmls.d d0, d1, v2[1] 1930 %rhs.scal = extractelement <2 x double> %rvec, i32 
1 1931 %rhs = fsub double -0.0, %rhs.scal 1932 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum) 1933 ret double %res 1934 } 1935 1936 declare double @llvm.fma.f64(double, double, double) 1937 1938 define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) { 1939 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32: 1940 ; CHECK: fmls.2s v0, v1, v2[3] 1941 %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs 1942 %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3> 1943 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum) 1944 ret <2 x float> %res 1945 } 1946 1947 define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) { 1948 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1: 1949 ; CHECK: fmls.2s v0, v1, v2[1] 1950 %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs 1951 %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1> 1952 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum) 1953 ret <2 x float> %res 1954 } 1955 1956 define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) { 1957 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32: 1958 ; CHECK: fmls.4s v0, v1, v2[3] 1959 %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs 1960 %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> 1961 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum) 1962 ret <4 x float> %res 1963 } 1964 1965 define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) { 1966 ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1: 
1967 ; CHECK: fmls.4s v0, v1, v2[1] 1968 %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs 1969 %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 1970 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum) 1971 ret <4 x float> %res 1972 } 1973 1974 define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) { 1975 ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64: 1976 ; CHECK: fmls.2d v0, v1, v2[1] 1977 %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs 1978 %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1> 1979 %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum) 1980 ret <2 x double> %res 1981 } 1982 1983 define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind { 1984 ; CHECK-LABEL: test_fmul_v1f64: 1985 ; CHECK: fmul 1986 %prod = fmul <1 x double> %L, %R 1987 ret <1 x double> %prod 1988 } 1989 1990 define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind { 1991 ; CHECK-LABEL: test_fdiv_v1f64: 1992 ; CHECK-LABEL: fdiv 1993 %prod = fdiv <1 x double> %L, %R 1994 ret <1 x double> %prod 1995 } 1996 1997 define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind { 1998 ;CHECK-LABEL: sqdmlal_d: 1999 ;CHECK: sqdmlal 2000 %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B) 2001 %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4) 2002 ret i64 %tmp5 2003 } 2004 2005 define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind { 2006 ;CHECK-LABEL: sqdmlsl_d: 2007 ;CHECK: sqdmlsl 2008 %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B) 2009 %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4) 2010 ret i64 %tmp5 2011 } 2012 2013 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind { 2014 ; CHECK-LABEL: test_pmull_64: 2015 
; CHECK: pmull.1q 2016 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) 2017 ret <16 x i8> %val 2018 } 2019 2020 define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { 2021 ; CHECK-LABEL: test_pmull_high_64: 2022 ; CHECK: pmull2.1q 2023 %l_hi = extractelement <2 x i64> %l, i32 1 2024 %r_hi = extractelement <2 x i64> %r, i32 1 2025 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi) 2026 ret <16 x i8> %val 2027 } 2028 2029 declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) 2030 2031 define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind { 2032 ; CHECK-LABEL: test_mul_v1i64: 2033 ; CHECK: mul 2034 %prod = mul <1 x i64> %lhs, %rhs 2035 ret <1 x i64> %prod 2036 } 2037