; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
; RUN: llc < %s -disable-peephole -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512

; Scalar FMA intrinsic tests in which one input vector is used for two of the
; three intrinsic operands and the other input vector is built from a load.
;
; Naming: in the "aab" tests the vector built from %a is operands 1 and 2 and
; the vector built from %b is operand 3; in the "aba" tests %a is operands 1
; and 3 and %b is operand 2.
;
; Every function expects exactly one scalar load into xmm0, one FMA instruction
; whose remaining input is a memory operand, and one scalar store to (%rdi).
; Both llc invocations pass -disable-peephole, presumably so that the memory
; operand has to be folded during instruction selection (NOTE(review): confirm).

target triple = "x86_64-unknown-unknown"

declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)

declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)

; Single precision, lane 0: (a * a) + b, stored back to %a.
define void @fmadd_aab_ss(float* %a, float* %b) {
; CHECK-LABEL: fmadd_aab_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)

  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}

; Single precision, lane 0: (a * b) + a, stored back to %a.
define void @fmadd_aba_ss(float* %a, float* %b) {
; CHECK-LABEL: fmadd_aba_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfmadd231ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)

  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}

; Single precision, lane 0: (a * a) - b, stored back to %a.
define void @fmsub_aab_ss(float* %a, float* %b) {
; CHECK-LABEL: fmsub_aab_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)

  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}

; Single precision, lane 0: (a * b) - a, stored back to %a.
define void @fmsub_aba_ss(float* %a, float* %b) {
; CHECK-LABEL: fmsub_aba_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)

  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}

; Single precision, lane 0: -(a * a) + b, stored back to %a.
define void @fnmadd_aab_ss(float* %a, float* %b) {
; CHECK-LABEL: fnmadd_aab_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)

  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}

; Single precision, lane 0: -(a * b) + a, stored back to %a.
define void @fnmadd_aba_ss(float* %a, float* %b) {
; CHECK-LABEL: fnmadd_aba_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfnmadd231ss {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)

  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}

; Single precision, lane 0: -(a * a) - b, stored back to %a.
define void @fnmsub_aab_ss(float* %a, float* %b) {
; CHECK-LABEL: fnmsub_aab_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)

  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}

; Single precision, lane 0: -(a * b) - a, stored back to %a.
define void @fnmsub_aba_ss(float* %a, float* %b) {
; CHECK-LABEL: fnmsub_aba_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    vfnmsub231ss {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; CHECK-NEXT:    vmovss %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load float, float* %a
  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3

  %b.val = load float, float* %b
  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3

  %vr = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)

  %sr = extractelement <4 x float> %vr, i32 0
  store float %sr, float* %a
  ret void
}

; Double precision, lane 0: (a * a) + b, stored back to %a.
define void @fmadd_aab_sd(double* %a, double* %b) {
; CHECK-LABEL: fmadd_aab_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + mem
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av  = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv  = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)

  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}

; Double precision, lane 0: (a * b) + a, stored back to %a.
define void @fmadd_aba_sd(double* %a, double* %b) {
; CHECK-LABEL: fmadd_aba_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfmadd231sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm0
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av  = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv  = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)

  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}

; Double precision, lane 0: (a * a) - b, stored back to %a.
define void @fmsub_aab_sd(double* %a, double* %b) {
; CHECK-LABEL: fmsub_aab_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm0 * xmm0) - mem
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av  = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv  = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)

  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}

; Double precision, lane 0: (a * b) - a, stored back to %a.
define void @fmsub_aba_sd(double* %a, double* %b) {
; CHECK-LABEL: fmsub_aba_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfmsub231sd {{.*#+}} xmm0 = (xmm0 * mem) - xmm0
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av  = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv  = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)

  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}

; Double precision, lane 0: -(a * a) + b, stored back to %a.
define void @fnmadd_aab_sd(double* %a, double* %b) {
; CHECK-LABEL: fnmadd_aab_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm0 * xmm0) + mem
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av  = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv  = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)

  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}

; Double precision, lane 0: -(a * b) + a, stored back to %a.
define void @fnmadd_aba_sd(double* %a, double* %b) {
; CHECK-LABEL: fnmadd_aba_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfnmadd231sd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm0
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av  = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv  = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)

  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}

; Double precision, lane 0: -(a * a) - b, stored back to %a.
define void @fnmsub_aab_sd(double* %a, double* %b) {
; CHECK-LABEL: fnmsub_aab_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm0 * xmm0) - mem
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  %a.val = load double, double* %a
  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
  %av  = insertelement <2 x double> %av0, double 0.000000e+00, i32 1

  %b.val = load double, double* %b
  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
  %bv  = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1

  %vr = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)

  %sr = extractelement <2 x double> %vr, i32 0
  store double %sr, double* %a
  ret void
}

; Double precision, lane 0: -(a * b) - a, with the scalar result written back
; to %a. The expected code loads %a once into xmm0 and takes the multiplier
; from a memory operand of the FMA instruction.
define void @fnmsub_aba_sd(double* %a, double* %b) {
; CHECK-LABEL: fnmsub_aba_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vfnmsub231sd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm0
; CHECK-NEXT:    vmovsd %xmm0, (%rdi)
; CHECK-NEXT:    retq
  ; Widen the scalar at %a to <2 x double> with lane 1 set to zero.
  %lhs.scalar = load double, double* %a
  %lhs.lane0 = insertelement <2 x double> undef, double %lhs.scalar, i32 0
  %lhs.vec = insertelement <2 x double> %lhs.lane0, double 0.000000e+00, i32 1

  ; Same widening for the scalar at %b.
  %rhs.scalar = load double, double* %b
  %rhs.lane0 = insertelement <2 x double> undef, double %rhs.scalar, i32 0
  %rhs.vec = insertelement <2 x double> %rhs.lane0, double 0.000000e+00, i32 1

  ; The vector built from %a is passed as both operand 1 and operand 3.
  %fma.vec = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %lhs.vec, <2 x double> %rhs.vec, <2 x double> %lhs.vec)

  ; Only lane 0 of the result is stored.
  %fma.lane0 = extractelement <2 x double> %fma.vec, i32 0
  store double %fma.lane0, double* %a
  ret void
}
