Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL
      4 ; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
      5 
      6 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/fma-builtins.c
      7 
      ; Packed single-precision fused multiply-add: forwards %a, %b and %c
      ; unchanged to @llvm.fma.v4f32. The assertions expect one vfmadd213ps on
      ; the two x86_64-unknown-unknown configurations; for x86_64-pc-windows the
      ; operands are read through %rcx, %rdx and %r8 instead of xmm registers.
      8 define <4 x float> @test_mm_fmadd_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
      9 ; CHECK-FMA-LABEL: test_mm_fmadd_ps:
     10 ; CHECK-FMA:       # %bb.0: # %entry
     11 ; CHECK-FMA-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
     12 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     13 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
     14 ;
     15 ; CHECK-AVX512VL-LABEL: test_mm_fmadd_ps:
     16 ; CHECK-AVX512VL:       # %bb.0: # %entry
     17 ; CHECK-AVX512VL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
     18 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     19 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
     20 ;
     21 ; CHECK-FMA-WIN-LABEL: test_mm_fmadd_ps:
     22 ; CHECK-FMA-WIN:       # %bb.0: # %entry
     23 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
     24 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
     25 ; CHECK-FMA-WIN-NEXT:    vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
     26 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
     27 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
     28 entry:
     29   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
     30   ret <4 x float> %0
     31 }
     32 
     ; Packed double-precision fused multiply-add: forwards %a, %b and %c
     ; unchanged to @llvm.fma.v2f64. The assertions expect one vfmadd213pd; for
     ; x86_64-pc-windows the multiplicands are loaded with vmovapd from (%rcx)
     ; and (%rdx) and the addend is folded as the (%r8) memory operand.
     33 define <2 x double> @test_mm_fmadd_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
     34 ; CHECK-FMA-LABEL: test_mm_fmadd_pd:
     35 ; CHECK-FMA:       # %bb.0: # %entry
     36 ; CHECK-FMA-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
     37 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     38 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
     39 ;
     40 ; CHECK-AVX512VL-LABEL: test_mm_fmadd_pd:
     41 ; CHECK-AVX512VL:       # %bb.0: # %entry
     42 ; CHECK-AVX512VL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
     43 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     44 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
     45 ;
     46 ; CHECK-FMA-WIN-LABEL: test_mm_fmadd_pd:
     47 ; CHECK-FMA-WIN:       # %bb.0: # %entry
     48 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
     49 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
     50 ; CHECK-FMA-WIN-NEXT:    vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
     51 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
     52 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
     53 entry:
     54   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #2
     55   ret <2 x double> %0
     56 }
     57 
     ; Scalar single-precision fused multiply-add: element 0 of %a, %b and %c is
     ; fed to @llvm.fma.f32 and the result is written back into element 0 of %a.
     ; The assertions expect one vfmadd213ss on registers; for x86_64-pc-windows
     ; they expect vmovaps from (%rcx), vmovss from (%r8), and the 132 form with
     ; (%rdx) as its memory operand.
     58 define <4 x float> @test_mm_fmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
     59 ; CHECK-FMA-LABEL: test_mm_fmadd_ss:
     60 ; CHECK-FMA:       # %bb.0: # %entry
     61 ; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
     62 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     63 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
     64 ;
     65 ; CHECK-AVX512VL-LABEL: test_mm_fmadd_ss:
     66 ; CHECK-AVX512VL:       # %bb.0: # %entry
     67 ; CHECK-AVX512VL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
     68 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     69 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
     70 ;
     71 ; CHECK-FMA-WIN-LABEL: test_mm_fmadd_ss:
     72 ; CHECK-FMA-WIN:       # %bb.0: # %entry
     73 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
     74 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
     75 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
     76 ; CHECK-FMA-WIN-NEXT:    vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02]
     77 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
     78 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
     79 entry:
     80   %0 = extractelement <4 x float> %a, i64 0
     81   %1 = extractelement <4 x float> %b, i64 0
     82   %2 = extractelement <4 x float> %c, i64 0
     83   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #2
          ; Only lane 0 is replaced; lanes 1-3 of the result come from %a.
     84   %4 = insertelement <4 x float> %a, float %3, i64 0
     85   ret <4 x float> %4
     86 }
     87 
     ; Scalar double-precision fused multiply-add: element 0 of %a, %b and %c is
     ; fed to @llvm.fma.f64 and the result is written back into element 0 of %a.
     ; The assertions expect one vfmadd213sd on registers; for x86_64-pc-windows
     ; they expect vmovapd from (%rcx), vmovsd from (%r8), and the 132 form with
     ; (%rdx) as its memory operand.
     88 define <2 x double> @test_mm_fmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
     89 ; CHECK-FMA-LABEL: test_mm_fmadd_sd:
     90 ; CHECK-FMA:       # %bb.0: # %entry
     91 ; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
     92 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     93 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
     94 ;
     95 ; CHECK-AVX512VL-LABEL: test_mm_fmadd_sd:
     96 ; CHECK-AVX512VL:       # %bb.0: # %entry
     97 ; CHECK-AVX512VL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
     98 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     99 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    100 ;
    101 ; CHECK-FMA-WIN-LABEL: test_mm_fmadd_sd:
    102 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    103 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
    104 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    105 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    106 ; CHECK-FMA-WIN-NEXT:    vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02]
    107 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
    108 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    109 entry:
    110   %0 = extractelement <2 x double> %a, i64 0
    111   %1 = extractelement <2 x double> %b, i64 0
    112   %2 = extractelement <2 x double> %c, i64 0
    113   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #2
          ; Only lane 0 is replaced; lane 1 of the result comes from %a.
    114   %4 = insertelement <2 x double> %a, double %3, i64 0
    115   ret <2 x double> %4
    116 }
    117 
    ; Packed single-precision fused multiply-subtract: %c is negated in IR and
    ; passed as the addend of @llvm.fma.v4f32. The assertions expect the negate
    ; to be absorbed into a single vfmsub213ps, with no separate sign flip.
    118 define <4 x float> @test_mm_fmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
    119 ; CHECK-FMA-LABEL: test_mm_fmsub_ps:
    120 ; CHECK-FMA:       # %bb.0: # %entry
    121 ; CHECK-FMA-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
    122 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    123 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    124 ;
    125 ; CHECK-AVX512VL-LABEL: test_mm_fmsub_ps:
    126 ; CHECK-AVX512VL:       # %bb.0: # %entry
    127 ; CHECK-AVX512VL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
    128 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    129 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    130 ;
    131 ; CHECK-FMA-WIN-LABEL: test_mm_fmsub_ps:
    132 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    133 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    134 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    135 ; CHECK-FMA-WIN-NEXT:    vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
    136 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) - mem
    137 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    138 entry:
          ; Negation of %c written as a subtraction from -0.0 in every lane.
    139   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    140   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i) #2
    141   ret <4 x float> %0
    142 }
    143 
    ; Packed double-precision fused multiply-subtract: %c is negated in IR and
    ; passed as the addend of @llvm.fma.v2f64. The assertions expect the negate
    ; to be absorbed into a single vfmsub213pd.
    144 define <2 x double> @test_mm_fmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
    145 ; CHECK-FMA-LABEL: test_mm_fmsub_pd:
    146 ; CHECK-FMA:       # %bb.0: # %entry
    147 ; CHECK-FMA-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
    148 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    149 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    150 ;
    151 ; CHECK-AVX512VL-LABEL: test_mm_fmsub_pd:
    152 ; CHECK-AVX512VL:       # %bb.0: # %entry
    153 ; CHECK-AVX512VL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
    154 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    155 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    156 ;
    157 ; CHECK-FMA-WIN-LABEL: test_mm_fmsub_pd:
    158 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    159 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    160 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    161 ; CHECK-FMA-WIN-NEXT:    vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
    162 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) - mem
    163 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    164 entry:
          ; Negation of %c written as a subtraction from -0.0 in both lanes.
    165   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
    166   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %sub.i) #2
    167   ret <2 x double> %0
    168 }
    169 
    ; Scalar single-precision fused multiply-subtract: element 0 of %c is
    ; negated, combined with element 0 of %a and %b through @llvm.fma.f32, and
    ; the result replaces element 0 of %a. The assertions expect one
    ; vfmsub213ss (vfmsub132ss with a memory operand for x86_64-pc-windows).
    170 define <4 x float> @test_mm_fmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
    171 ; CHECK-FMA-LABEL: test_mm_fmsub_ss:
    172 ; CHECK-FMA:       # %bb.0: # %entry
    173 ; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
    174 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    175 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    176 ;
    177 ; CHECK-AVX512VL-LABEL: test_mm_fmsub_ss:
    178 ; CHECK-AVX512VL:       # %bb.0: # %entry
    179 ; CHECK-AVX512VL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
    180 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    181 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    182 ;
    183 ; CHECK-FMA-WIN-LABEL: test_mm_fmsub_ss:
    184 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    185 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
    186 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    187 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    188 ; CHECK-FMA-WIN-NEXT:    vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02]
    189 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
    190 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    191 entry:
    192   %0 = extractelement <4 x float> %a, i64 0
    193   %1 = extractelement <4 x float> %b, i64 0
    194   %.rhs.i = extractelement <4 x float> %c, i64 0
          ; Scalar negation of the addend, written as -0.0 minus the value.
    195   %2 = fsub float -0.000000e+00, %.rhs.i
    196   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #2
    197   %4 = insertelement <4 x float> %a, float %3, i64 0
    198   ret <4 x float> %4
    199 }
    200 
    ; Scalar double-precision fused multiply-subtract: element 0 of %c is
    ; negated, combined with element 0 of %a and %b through @llvm.fma.f64, and
    ; the result replaces element 0 of %a. The assertions expect one
    ; vfmsub213sd (vfmsub132sd with a memory operand for x86_64-pc-windows).
    201 define <2 x double> @test_mm_fmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
    202 ; CHECK-FMA-LABEL: test_mm_fmsub_sd:
    203 ; CHECK-FMA:       # %bb.0: # %entry
    204 ; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
    205 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    206 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    207 ;
    208 ; CHECK-AVX512VL-LABEL: test_mm_fmsub_sd:
    209 ; CHECK-AVX512VL:       # %bb.0: # %entry
    210 ; CHECK-AVX512VL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
    211 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    212 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    213 ;
    214 ; CHECK-FMA-WIN-LABEL: test_mm_fmsub_sd:
    215 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    216 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
    217 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    218 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    219 ; CHECK-FMA-WIN-NEXT:    vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02]
    220 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
    221 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    222 entry:
    223   %0 = extractelement <2 x double> %a, i64 0
    224   %1 = extractelement <2 x double> %b, i64 0
    225   %.rhs.i = extractelement <2 x double> %c, i64 0
          ; Scalar negation of the addend, written as -0.0 minus the value.
    226   %2 = fsub double -0.000000e+00, %.rhs.i
    227   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #2
    228   %4 = insertelement <2 x double> %a, double %3, i64 0
    229   ret <2 x double> %4
    230 }
    231 
    ; Packed single-precision negated multiply-add: the first multiplicand %a is
    ; negated in IR before the @llvm.fma.v4f32 call. The assertions expect the
    ; negate to be absorbed into a single vfnmadd213ps.
    232 define <4 x float> @test_mm_fnmadd_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
    233 ; CHECK-FMA-LABEL: test_mm_fnmadd_ps:
    234 ; CHECK-FMA:       # %bb.0: # %entry
    235 ; CHECK-FMA-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
    236 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    237 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    238 ;
    239 ; CHECK-AVX512VL-LABEL: test_mm_fnmadd_ps:
    240 ; CHECK-AVX512VL:       # %bb.0: # %entry
    241 ; CHECK-AVX512VL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
    242 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    243 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    244 ;
    245 ; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_ps:
    246 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    247 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    248 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    249 ; CHECK-FMA-WIN-NEXT:    vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
    250 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) + mem
    251 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    252 entry:
          ; Negation of the multiplicand %a, written as -0.0 minus %a per lane.
    253   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
    254   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %b, <4 x float> %c) #2
    255   ret <4 x float> %0
    256 }
    257 
    ; Packed double-precision negated multiply-add: the first multiplicand %a is
    ; negated in IR before the @llvm.fma.v2f64 call. The assertions expect the
    ; negate to be absorbed into a single vfnmadd213pd.
    258 define <2 x double> @test_mm_fnmadd_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
    259 ; CHECK-FMA-LABEL: test_mm_fnmadd_pd:
    260 ; CHECK-FMA:       # %bb.0: # %entry
    261 ; CHECK-FMA-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
    262 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    263 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    264 ;
    265 ; CHECK-AVX512VL-LABEL: test_mm_fnmadd_pd:
    266 ; CHECK-AVX512VL:       # %bb.0: # %entry
    267 ; CHECK-AVX512VL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
    268 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    269 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    270 ;
    271 ; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_pd:
    272 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    273 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    274 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    275 ; CHECK-FMA-WIN-NEXT:    vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
    276 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) + mem
    277 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    278 entry:
          ; Negation of the multiplicand %a, written as -0.0 minus %a per lane.
    279   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
    280   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c) #2
    281   ret <2 x double> %0
    282 }
    283 
    ; Scalar single-precision negated multiply-add: element 0 of %b (the second
    ; multiplicand, unlike the packed variants which negate %a) is negated, then
    ; combined with element 0 of %a and %c through @llvm.fma.f32; the result
    ; replaces element 0 of %a. The assertions expect one vfnmadd213ss
    ; (vfnmadd132ss with a memory operand for x86_64-pc-windows).
    284 define <4 x float> @test_mm_fnmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
    285 ; CHECK-FMA-LABEL: test_mm_fnmadd_ss:
    286 ; CHECK-FMA:       # %bb.0: # %entry
    287 ; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
    288 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    289 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    290 ;
    291 ; CHECK-AVX512VL-LABEL: test_mm_fnmadd_ss:
    292 ; CHECK-AVX512VL:       # %bb.0: # %entry
    293 ; CHECK-AVX512VL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
    294 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    295 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    296 ;
    297 ; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_ss:
    298 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    299 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
    300 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    301 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    302 ; CHECK-FMA-WIN-NEXT:    vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02]
    303 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
    304 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    305 entry:
    306   %0 = extractelement <4 x float> %a, i64 0
    307   %.rhs.i = extractelement <4 x float> %b, i64 0
          ; Scalar negation of the second multiplicand.
    308   %1 = fsub float -0.000000e+00, %.rhs.i
    309   %2 = extractelement <4 x float> %c, i64 0
    310   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #2
    311   %4 = insertelement <4 x float> %a, float %3, i64 0
    312   ret <4 x float> %4
    313 }
    314 
    ; Scalar double-precision negated multiply-add: element 0 of %b (the second
    ; multiplicand, unlike the packed variants which negate %a) is negated, then
    ; combined with element 0 of %a and %c through @llvm.fma.f64; the result
    ; replaces element 0 of %a. The assertions expect one vfnmadd213sd
    ; (vfnmadd132sd with a memory operand for x86_64-pc-windows).
    315 define <2 x double> @test_mm_fnmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
    316 ; CHECK-FMA-LABEL: test_mm_fnmadd_sd:
    317 ; CHECK-FMA:       # %bb.0: # %entry
    318 ; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
    319 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    320 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    321 ;
    322 ; CHECK-AVX512VL-LABEL: test_mm_fnmadd_sd:
    323 ; CHECK-AVX512VL:       # %bb.0: # %entry
    324 ; CHECK-AVX512VL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
    325 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    326 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    327 ;
    328 ; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_sd:
    329 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    330 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
    331 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    332 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    333 ; CHECK-FMA-WIN-NEXT:    vfnmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x02]
    334 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
    335 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    336 entry:
    337   %0 = extractelement <2 x double> %a, i64 0
    338   %.rhs.i = extractelement <2 x double> %b, i64 0
          ; Scalar negation of the second multiplicand.
    339   %1 = fsub double -0.000000e+00, %.rhs.i
    340   %2 = extractelement <2 x double> %c, i64 0
    341   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #2
    342   %4 = insertelement <2 x double> %a, double %3, i64 0
    343   ret <2 x double> %4
    344 }
    345 
    ; Packed single-precision negated multiply-subtract: both the multiplicand
    ; %a and the addend %c are negated in IR before the @llvm.fma.v4f32 call.
    ; The assertions expect both negates to be absorbed into one vfnmsub213ps.
    346 define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
    347 ; CHECK-FMA-LABEL: test_mm_fnmsub_ps:
    348 ; CHECK-FMA:       # %bb.0: # %entry
    349 ; CHECK-FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
    350 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    351 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    352 ;
    353 ; CHECK-AVX512VL-LABEL: test_mm_fnmsub_ps:
    354 ; CHECK-AVX512VL:       # %bb.0: # %entry
    355 ; CHECK-AVX512VL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
    356 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    357 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    358 ;
    359 ; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_ps:
    360 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    361 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    362 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    363 ; CHECK-FMA-WIN-NEXT:    vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
    364 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) - mem
    365 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    366 entry:
          ; %sub.i is the negated multiplicand, %sub1.i the negated addend.
    367   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
    368   %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    369   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %b, <4 x float> %sub1.i) #2
    370   ret <4 x float> %0
    371 }
    372 
    ; Packed double-precision negated multiply-subtract: both the multiplicand
    ; %a and the addend %c are negated in IR before the @llvm.fma.v2f64 call.
    ; The assertions expect both negates to be absorbed into one vfnmsub213pd.
    373 define <2 x double> @test_mm_fnmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
    374 ; CHECK-FMA-LABEL: test_mm_fnmsub_pd:
    375 ; CHECK-FMA:       # %bb.0: # %entry
    376 ; CHECK-FMA-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
    377 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    378 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    379 ;
    380 ; CHECK-AVX512VL-LABEL: test_mm_fnmsub_pd:
    381 ; CHECK-AVX512VL:       # %bb.0: # %entry
    382 ; CHECK-AVX512VL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
    383 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    384 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    385 ;
    386 ; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_pd:
    387 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    388 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    389 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    390 ; CHECK-FMA-WIN-NEXT:    vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
    391 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) - mem
    392 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    393 entry:
          ; %sub.i is the negated multiplicand, %sub1.i the negated addend.
    394   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
    395   %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
    396   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %b, <2 x double> %sub1.i) #2
    397   ret <2 x double> %0
    398 }
    399 
    ; Scalar single-precision negated multiply-subtract: element 0 of %b and
    ; element 0 of %c are both negated, then combined with element 0 of %a
    ; through @llvm.fma.f32; the result replaces element 0 of %a. The assertions
    ; expect one vfnmsub213ss (vfnmsub132ss with a memory operand for
    ; x86_64-pc-windows).
    400 define <4 x float> @test_mm_fnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
    401 ; CHECK-FMA-LABEL: test_mm_fnmsub_ss:
    402 ; CHECK-FMA:       # %bb.0: # %entry
    403 ; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
    404 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    405 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    406 ;
    407 ; CHECK-AVX512VL-LABEL: test_mm_fnmsub_ss:
    408 ; CHECK-AVX512VL:       # %bb.0: # %entry
    409 ; CHECK-AVX512VL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
    410 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    411 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    412 ;
    413 ; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_ss:
    414 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    415 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
    416 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    417 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    418 ; CHECK-FMA-WIN-NEXT:    vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02]
    419 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
    420 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    421 entry:
    422   %0 = extractelement <4 x float> %a, i64 0
    423   %.rhs.i = extractelement <4 x float> %b, i64 0
          ; Scalar negation of the second multiplicand.
    424   %1 = fsub float -0.000000e+00, %.rhs.i
    425   %.rhs2.i = extractelement <4 x float> %c, i64 0
          ; Scalar negation of the addend.
    426   %2 = fsub float -0.000000e+00, %.rhs2.i
    427   %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #2
    428   %4 = insertelement <4 x float> %a, float %3, i64 0
    429   ret <4 x float> %4
    430 }
    431 
    ; Scalar double-precision negated multiply-subtract: element 0 of %b and
    ; element 0 of %c are both negated, then combined with element 0 of %a
    ; through @llvm.fma.f64; the result replaces element 0 of %a. The assertions
    ; expect one vfnmsub213sd (vfnmsub132sd with a memory operand for
    ; x86_64-pc-windows).
    432 define <2 x double> @test_mm_fnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
    433 ; CHECK-FMA-LABEL: test_mm_fnmsub_sd:
    434 ; CHECK-FMA:       # %bb.0: # %entry
    435 ; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
    436 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    437 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    438 ;
    439 ; CHECK-AVX512VL-LABEL: test_mm_fnmsub_sd:
    440 ; CHECK-AVX512VL:       # %bb.0: # %entry
    441 ; CHECK-AVX512VL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
    442 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    443 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    444 ;
    445 ; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_sd:
    446 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    447 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
    448 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    449 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    450 ; CHECK-FMA-WIN-NEXT:    vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02]
    451 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
    452 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    453 entry:
    454   %0 = extractelement <2 x double> %a, i64 0
    455   %.rhs.i = extractelement <2 x double> %b, i64 0
          ; Scalar negation of the second multiplicand.
    456   %1 = fsub double -0.000000e+00, %.rhs.i
    457   %.rhs2.i = extractelement <2 x double> %c, i64 0
          ; Scalar negation of the addend.
    458   %2 = fsub double -0.000000e+00, %.rhs2.i
    459   %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #2
    460   %4 = insertelement <2 x double> %a, double %3, i64 0
    461   ret <2 x double> %4
    462 }
    463 
    ; Packed single-precision alternating add/subtract: two @llvm.fma.v4f32
    ; calls (one with %c, one with negated %c) are interleaved lane by lane with
    ; a shufflevector. The assertions expect the whole sequence to collapse into
    ; a single vfmaddsub213ps.
    464 define <4 x float> @test_mm_fmaddsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
    465 ; CHECK-FMA-LABEL: test_mm_fmaddsub_ps:
    466 ; CHECK-FMA:       # %bb.0: # %entry
    467 ; CHECK-FMA-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
    468 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
    469 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    470 ;
    471 ; CHECK-AVX512VL-LABEL: test_mm_fmaddsub_ps:
    472 ; CHECK-AVX512VL:       # %bb.0: # %entry
    473 ; CHECK-AVX512VL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
    474 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
    475 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    476 ;
    477 ; CHECK-FMA-WIN-LABEL: test_mm_fmaddsub_ps:
    478 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    479 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    480 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    481 ; CHECK-FMA-WIN-NEXT:    vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
    482 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) +/- mem
    483 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    484 entry:
          ; %0 = a*b + c in every lane.
    485   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
          ; %2 = a*b - c in every lane (addend negated via -0.0 minus %c).
    486   %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    487   %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %1) #2
          ; Mask <0,5,2,7>: lanes 0 and 2 from %2 (subtract), lanes 1 and 3 from
          ; %0 (add).
    488   %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    489   ret <4 x float> %3
    490 }
    491 
    ; Packed double-precision alternating add/subtract: two @llvm.fma.v2f64
    ; calls (one with %c, one with negated %c) are interleaved with a
    ; shufflevector. The assertions expect the whole sequence to collapse into
    ; a single vfmaddsub213pd.
    492 define <2 x double> @test_mm_fmaddsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
    493 ; CHECK-FMA-LABEL: test_mm_fmaddsub_pd:
    494 ; CHECK-FMA:       # %bb.0: # %entry
    495 ; CHECK-FMA-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
    496 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
    497 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    498 ;
    499 ; CHECK-AVX512VL-LABEL: test_mm_fmaddsub_pd:
    500 ; CHECK-AVX512VL:       # %bb.0: # %entry
    501 ; CHECK-AVX512VL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
    502 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
    503 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    504 ;
    505 ; CHECK-FMA-WIN-LABEL: test_mm_fmaddsub_pd:
    506 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    507 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    508 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    509 ; CHECK-FMA-WIN-NEXT:    vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
    510 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) +/- mem
    511 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    512 entry:
          ; %0 = a*b + c in both lanes.
    513   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #2
          ; %2 = a*b - c in both lanes (addend negated via -0.0 minus %c).
    514   %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
    515   %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %1) #2
          ; Mask <0,3>: lane 0 from %2 (subtract), lane 1 from %0 (add).
    516   %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
    517   ret <2 x double> %3
    518 }
    519 
    ; Packed single-precision alternating subtract/add: two @llvm.fma.v4f32
    ; calls (one with negated %c, one with %c) are interleaved lane by lane with
    ; a shufflevector. The assertions expect the whole sequence to collapse into
    ; a single vfmsubadd213ps.
    520 define <4 x float> @test_mm_fmsubadd_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
    521 ; CHECK-FMA-LABEL: test_mm_fmsubadd_ps:
    522 ; CHECK-FMA:       # %bb.0: # %entry
    523 ; CHECK-FMA-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
    524 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
    525 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    526 ;
    527 ; CHECK-AVX512VL-LABEL: test_mm_fmsubadd_ps:
    528 ; CHECK-AVX512VL:       # %bb.0: # %entry
    529 ; CHECK-AVX512VL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
    530 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
    531 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    532 ;
    533 ; CHECK-FMA-WIN-LABEL: test_mm_fmsubadd_ps:
    534 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    535 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    536 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    537 ; CHECK-FMA-WIN-NEXT:    vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
    538 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ mem
    539 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    540 entry:
          ; %0 = a*b - c in every lane (addend negated via -0.0 minus %c).
    541   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    542   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i) #2
          ; %1 = a*b + c in every lane.
    543   %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #2
          ; Mask <0,5,2,7>: lanes 0 and 2 from %1 (add), lanes 1 and 3 from %0
          ; (subtract).
    544   %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    545   ret <4 x float> %2
    546 }
    547 
    ; test_mm_fmsubadd_pd: <2 x double> variant of the fmsubadd pattern. It builds
    ; fma(a, b, c) and fma(a, b, -c) and keeps lane 0 of the add form and lane 1 of
    ; the subtract form. All three run configurations are expected to select a single
    ; vfmsubadd213pd; the Windows one reads its operands through rcx/rdx/r8.
    548 define <2 x double> @test_mm_fmsubadd_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
    549 ; CHECK-FMA-LABEL: test_mm_fmsubadd_pd:
    550 ; CHECK-FMA:       # %bb.0: # %entry
    551 ; CHECK-FMA-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
    552 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
    553 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    554 ;
    555 ; CHECK-AVX512VL-LABEL: test_mm_fmsubadd_pd:
    556 ; CHECK-AVX512VL:       # %bb.0: # %entry
    557 ; CHECK-AVX512VL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
    558 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
    559 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    560 ;
    561 ; CHECK-FMA-WIN-LABEL: test_mm_fmsubadd_pd:
    562 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    563 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    564 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    565 ; CHECK-FMA-WIN-NEXT:    vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
    566 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ mem
    567 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    568 entry:
    ; Subtracting %c from -0.0 flips the sign of both lanes of %c.
    569   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
    570   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %sub.i) #2
    571   %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #2
    ; Mask <0,3>: lane 0 comes from %1 (a*b + c), lane 1 from %0 (a*b - c).
    572   %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
    573   ret <2 x double> %2
    574 }
    575 
    ; test_mm256_fmadd_ps: calls llvm.fma.v8f32 directly on the three <8 x float>
    ; arguments. All three run configurations are expected to select a single
    ; vfmadd213ps on ymm registers; the Windows one reads its operands through
    ; rcx/rdx/r8.
    576 define <8 x float> @test_mm256_fmadd_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
    577 ; CHECK-FMA-LABEL: test_mm256_fmadd_ps:
    578 ; CHECK-FMA:       # %bb.0: # %entry
    579 ; CHECK-FMA-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
    580 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
    581 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    582 ;
    583 ; CHECK-AVX512VL-LABEL: test_mm256_fmadd_ps:
    584 ; CHECK-AVX512VL:       # %bb.0: # %entry
    585 ; CHECK-AVX512VL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
    586 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
    587 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    588 ;
    589 ; CHECK-FMA-WIN-LABEL: test_mm256_fmadd_ps:
    590 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    591 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    592 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    593 ; CHECK-FMA-WIN-NEXT:    vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
    594 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) + mem
    595 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    596 entry:
    597   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
    598   ret <8 x float> %0
    599 }
    600 
    ; test_mm256_fmadd_pd: calls llvm.fma.v4f64 directly on the three <4 x double>
    ; arguments. All three run configurations are expected to select a single
    ; vfmadd213pd on ymm registers; the Windows one reads its operands through
    ; rcx/rdx/r8.
    601 define <4 x double> @test_mm256_fmadd_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
    602 ; CHECK-FMA-LABEL: test_mm256_fmadd_pd:
    603 ; CHECK-FMA:       # %bb.0: # %entry
    604 ; CHECK-FMA-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
    605 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
    606 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    607 ;
    608 ; CHECK-AVX512VL-LABEL: test_mm256_fmadd_pd:
    609 ; CHECK-AVX512VL:       # %bb.0: # %entry
    610 ; CHECK-AVX512VL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
    611 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
    612 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    613 ;
    614 ; CHECK-FMA-WIN-LABEL: test_mm256_fmadd_pd:
    615 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    616 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    617 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    618 ; CHECK-FMA-WIN-NEXT:    vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
    619 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) + mem
    620 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    621 entry:
    622   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #2
    623   ret <4 x double> %0
    624 }
    625 
    ; test_mm256_fmsub_ps: negates %c and feeds it as the addend of llvm.fma.v8f32,
    ; i.e. a*b - c on <8 x float>. All three run configurations are expected to fold
    ; the negation into a single vfmsub213ps; the Windows one reads its operands
    ; through rcx/rdx/r8.
    626 define <8 x float> @test_mm256_fmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
    627 ; CHECK-FMA-LABEL: test_mm256_fmsub_ps:
    628 ; CHECK-FMA:       # %bb.0: # %entry
    629 ; CHECK-FMA-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
    630 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
    631 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    632 ;
    633 ; CHECK-AVX512VL-LABEL: test_mm256_fmsub_ps:
    634 ; CHECK-AVX512VL:       # %bb.0: # %entry
    635 ; CHECK-AVX512VL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
    636 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
    637 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    638 ;
    639 ; CHECK-FMA-WIN-LABEL: test_mm256_fmsub_ps:
    640 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    641 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    642 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    643 ; CHECK-FMA-WIN-NEXT:    vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
    644 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) - mem
    645 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    646 entry:
    ; Subtracting %c from -0.0 flips the sign of every lane of %c.
    647   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    648   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %sub.i) #2
    649   ret <8 x float> %0
    650 }
    651 
    ; test_mm256_fmsub_pd: negates %c and feeds it as the addend of llvm.fma.v4f64,
    ; i.e. a*b - c on <4 x double>. All three run configurations are expected to fold
    ; the negation into a single vfmsub213pd; the Windows one reads its operands
    ; through rcx/rdx/r8.
    652 define <4 x double> @test_mm256_fmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
    653 ; CHECK-FMA-LABEL: test_mm256_fmsub_pd:
    654 ; CHECK-FMA:       # %bb.0: # %entry
    655 ; CHECK-FMA-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
    656 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
    657 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    658 ;
    659 ; CHECK-AVX512VL-LABEL: test_mm256_fmsub_pd:
    660 ; CHECK-AVX512VL:       # %bb.0: # %entry
    661 ; CHECK-AVX512VL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
    662 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
    663 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    664 ;
    665 ; CHECK-FMA-WIN-LABEL: test_mm256_fmsub_pd:
    666 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    667 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    668 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    669 ; CHECK-FMA-WIN-NEXT:    vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
    670 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) - mem
    671 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    672 entry:
    ; Subtracting %c from -0.0 flips the sign of every lane of %c.
    673   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
    674   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %sub.i) #2
    675   ret <4 x double> %0
    676 }
    677 
    ; test_mm256_fnmadd_ps: negates %a and uses it as the first multiplicand of
    ; llvm.fma.v8f32, i.e. -(a*b) + c on <8 x float>. All three run configurations
    ; are expected to fold the negation into a single vfnmadd213ps; the Windows one
    ; reads its operands through rcx/rdx/r8.
    678 define <8 x float> @test_mm256_fnmadd_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
    679 ; CHECK-FMA-LABEL: test_mm256_fnmadd_ps:
    680 ; CHECK-FMA:       # %bb.0: # %entry
    681 ; CHECK-FMA-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
    682 ; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
    683 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    684 ;
    685 ; CHECK-AVX512VL-LABEL: test_mm256_fnmadd_ps:
    686 ; CHECK-AVX512VL:       # %bb.0: # %entry
    687 ; CHECK-AVX512VL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
    688 ; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
    689 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    690 ;
    691 ; CHECK-FMA-WIN-LABEL: test_mm256_fnmadd_ps:
    692 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    693 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    694 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    695 ; CHECK-FMA-WIN-NEXT:    vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
    696 ; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) + mem
    697 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    698 entry:
    ; Subtracting %a from -0.0 flips the sign of every lane of %a.
    699   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
    700   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %b, <8 x float> %c) #2
    701   ret <8 x float> %0
    702 }
    703 
    ; test_mm256_fnmadd_pd: negates %a and uses it as the first multiplicand of
    ; llvm.fma.v4f64, i.e. -(a*b) + c on <4 x double>. All three run configurations
    ; are expected to fold the negation into a single vfnmadd213pd; the Windows one
    ; reads its operands through rcx/rdx/r8.
    704 define <4 x double> @test_mm256_fnmadd_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
    705 ; CHECK-FMA-LABEL: test_mm256_fnmadd_pd:
    706 ; CHECK-FMA:       # %bb.0: # %entry
    707 ; CHECK-FMA-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
    708 ; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
    709 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    710 ;
    711 ; CHECK-AVX512VL-LABEL: test_mm256_fnmadd_pd:
    712 ; CHECK-AVX512VL:       # %bb.0: # %entry
    713 ; CHECK-AVX512VL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
    714 ; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
    715 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    716 ;
    717 ; CHECK-FMA-WIN-LABEL: test_mm256_fnmadd_pd:
    718 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    719 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    720 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    721 ; CHECK-FMA-WIN-NEXT:    vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
    722 ; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) + mem
    723 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    724 entry:
    ; Subtracting %a from -0.0 flips the sign of every lane of %a.
    725   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a
    726   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %b, <4 x double> %c) #2
    727   ret <4 x double> %0
    728 }
    729 
    ; test_mm256_fnmsub_ps: negates both %a and %c before calling llvm.fma.v8f32,
    ; i.e. -(a*b) - c on <8 x float>. All three run configurations are expected to
    ; fold both negations into a single vfnmsub213ps; the Windows one reads its
    ; operands through rcx/rdx/r8.
    730 define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
    731 ; CHECK-FMA-LABEL: test_mm256_fnmsub_ps:
    732 ; CHECK-FMA:       # %bb.0: # %entry
    733 ; CHECK-FMA-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
    734 ; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
    735 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    736 ;
    737 ; CHECK-AVX512VL-LABEL: test_mm256_fnmsub_ps:
    738 ; CHECK-AVX512VL:       # %bb.0: # %entry
    739 ; CHECK-AVX512VL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
    740 ; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
    741 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    742 ;
    743 ; CHECK-FMA-WIN-LABEL: test_mm256_fnmsub_ps:
    744 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    745 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    746 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    747 ; CHECK-FMA-WIN-NEXT:    vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
    748 ; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) - mem
    749 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    750 entry:
    ; The two fsub-from--0.0 instructions flip the sign of %a and of %c respectively.
    751   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
    752   %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    753   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %b, <8 x float> %sub1.i) #2
    754   ret <8 x float> %0
    755 }
    756 
    ; test_mm256_fnmsub_pd: negates both %a and %c before calling llvm.fma.v4f64,
    ; i.e. -(a*b) - c on <4 x double>. All three run configurations are expected to
    ; fold both negations into a single vfnmsub213pd; the Windows one reads its
    ; operands through rcx/rdx/r8.
    757 define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
    758 ; CHECK-FMA-LABEL: test_mm256_fnmsub_pd:
    759 ; CHECK-FMA:       # %bb.0: # %entry
    760 ; CHECK-FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
    761 ; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
    762 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    763 ;
    764 ; CHECK-AVX512VL-LABEL: test_mm256_fnmsub_pd:
    765 ; CHECK-AVX512VL:       # %bb.0: # %entry
    766 ; CHECK-AVX512VL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
    767 ; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
    768 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    769 ;
    770 ; CHECK-FMA-WIN-LABEL: test_mm256_fnmsub_pd:
    771 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    772 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    773 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    774 ; CHECK-FMA-WIN-NEXT:    vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
    775 ; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) - mem
    776 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    777 entry:
    ; The two fsub-from--0.0 instructions flip the sign of %a and of %c respectively.
    778   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a
    779   %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
    780   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %b, <4 x double> %sub1.i) #2
    781   ret <4 x double> %0
    782 }
    783 
    ; test_mm256_fmaddsub_ps: builds both fma(a, b, c) and fma(a, b, -c) on
    ; <8 x float> and interleaves them so even lanes take the subtract form and odd
    ; lanes take the add form. All three run configurations are expected to select a
    ; single vfmaddsub213ps; the Windows one reads its operands through rcx/rdx/r8.
    784 define <8 x float> @test_mm256_fmaddsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
    785 ; CHECK-FMA-LABEL: test_mm256_fmaddsub_ps:
    786 ; CHECK-FMA:       # %bb.0: # %entry
    787 ; CHECK-FMA-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
    788 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
    789 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    790 ;
    791 ; CHECK-AVX512VL-LABEL: test_mm256_fmaddsub_ps:
    792 ; CHECK-AVX512VL:       # %bb.0: # %entry
    793 ; CHECK-AVX512VL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
    794 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
    795 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    796 ;
    797 ; CHECK-FMA-WIN-LABEL: test_mm256_fmaddsub_ps:
    798 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    799 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    800 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    801 ; CHECK-FMA-WIN-NEXT:    vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
    802 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) +/- mem
    803 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    804 entry:
    805   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
    ; Subtracting %c from -0.0 flips the sign of every lane of %c.
    806   %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    807   %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %1) #2
    ; Mask <0,9,2,11,4,13,6,15>: even lanes come from %2 (a*b - c), odd lanes from %0 (a*b + c).
    808   %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
    809   ret <8 x float> %3
    810 }
    811 
    ; test_mm256_fmaddsub_pd: builds both fma(a, b, c) and fma(a, b, -c) on
    ; <4 x double> and interleaves them so even lanes take the subtract form and odd
    ; lanes take the add form. All three run configurations are expected to select a
    ; single vfmaddsub213pd; the Windows one reads its operands through rcx/rdx/r8.
    812 define <4 x double> @test_mm256_fmaddsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
    813 ; CHECK-FMA-LABEL: test_mm256_fmaddsub_pd:
    814 ; CHECK-FMA:       # %bb.0: # %entry
    815 ; CHECK-FMA-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
    816 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
    817 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    818 ;
    819 ; CHECK-AVX512VL-LABEL: test_mm256_fmaddsub_pd:
    820 ; CHECK-AVX512VL:       # %bb.0: # %entry
    821 ; CHECK-AVX512VL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
    822 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
    823 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    824 ;
    825 ; CHECK-FMA-WIN-LABEL: test_mm256_fmaddsub_pd:
    826 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    827 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    828 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    829 ; CHECK-FMA-WIN-NEXT:    vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
    830 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) +/- mem
    831 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    832 entry:
    833   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #2
    ; Subtracting %c from -0.0 flips the sign of every lane of %c.
    834   %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
    835   %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %1) #2
    ; Mask <0,5,2,7>: lanes 0 and 2 come from %2 (a*b - c), lanes 1 and 3 from %0 (a*b + c).
    836   %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    837   ret <4 x double> %3
    838 }
    839 
    ; test_mm256_fmsubadd_ps: builds both fma(a, b, -c) and fma(a, b, c) on
    ; <8 x float> and interleaves them so even lanes take the add form and odd lanes
    ; take the subtract form. All three run configurations are expected to select a
    ; single vfmsubadd213ps; the Windows one reads its operands through rcx/rdx/r8.
    840 define <8 x float> @test_mm256_fmsubadd_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
    841 ; CHECK-FMA-LABEL: test_mm256_fmsubadd_ps:
    842 ; CHECK-FMA:       # %bb.0: # %entry
    843 ; CHECK-FMA-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
    844 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
    845 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    846 ;
    847 ; CHECK-AVX512VL-LABEL: test_mm256_fmsubadd_ps:
    848 ; CHECK-AVX512VL:       # %bb.0: # %entry
    849 ; CHECK-AVX512VL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
    850 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
    851 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    852 ;
    853 ; CHECK-FMA-WIN-LABEL: test_mm256_fmsubadd_ps:
    854 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    855 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    856 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    857 ; CHECK-FMA-WIN-NEXT:    vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
    858 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ mem
    859 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    860 entry:
    ; Subtracting %c from -0.0 flips the sign of every lane of %c.
    861   %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
    862   %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %sub.i) #2
    863   %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #2
    ; Mask <0,9,2,11,4,13,6,15>: even lanes come from %1 (a*b + c), odd lanes from %0 (a*b - c).
    864   %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
    865   ret <8 x float> %2
    866 }
    867 
    ; test_mm256_fmsubadd_pd: builds both fma(a, b, -c) and fma(a, b, c) on
    ; <4 x double> and interleaves them so even lanes take the add form and odd lanes
    ; take the subtract form. All three run configurations are expected to select a
    ; single vfmsubadd213pd; the Windows one reads its operands through rcx/rdx/r8.
    868 define <4 x double> @test_mm256_fmsubadd_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
    869 ; CHECK-FMA-LABEL: test_mm256_fmsubadd_pd:
    870 ; CHECK-FMA:       # %bb.0: # %entry
    871 ; CHECK-FMA-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
    872 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
    873 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    874 ;
    875 ; CHECK-AVX512VL-LABEL: test_mm256_fmsubadd_pd:
    876 ; CHECK-AVX512VL:       # %bb.0: # %entry
    877 ; CHECK-AVX512VL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
    878 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
    879 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    880 ;
    881 ; CHECK-FMA-WIN-LABEL: test_mm256_fmsubadd_pd:
    882 ; CHECK-FMA-WIN:       # %bb.0: # %entry
    883 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    884 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    885 ; CHECK-FMA-WIN-NEXT:    vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
    886 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ mem
    887 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    888 entry:
    ; Subtracting %c from -0.0 flips the sign of every lane of %c.
    889   %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
    890   %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %sub.i) #2
    891   %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #2
    ; Mask <0,5,2,7>: lanes 0 and 2 come from %1 (a*b + c), lanes 1 and 3 from %0 (a*b - c).
    892   %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    893   ret <4 x double> %2
    894 }
    895 
    ; Declarations of the llvm.fma intrinsic overloads. The four vector overloads
    ; (v4f32, v2f64, v8f32, v4f64) are called by the test functions above; the scalar
    ; f32/f64 overloads are presumably used by tests elsewhere in the file.
    896 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
    897 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #1
    898 declare float @llvm.fma.f32(float, float, float) #1
    899 declare double @llvm.fma.f64(double, double, double) #1
    900 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #1
    901 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #1
    902