Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL
      4 ; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
      5 
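      6 ; VFMADD -- multiply-add tests: result = (a0 * a1) + a2, in scalar (ss/sd) and packed (ps/pd, 128- and 256-bit) forms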
      7 define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; scalar f32 multiply-add on lane 0; the other result lanes are taken from %a0
      8 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
      9 ; CHECK-FMA:       # %bb.0:
     10 ; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
     11 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     12 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
     13 ;
     14 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss:
     15 ; CHECK-AVX512VL:       # %bb.0:
     16 ; CHECK-AVX512VL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
     17 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
     18 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
     19 ;
     20 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
     21 ; CHECK-FMA-WIN:       # %bb.0:
     22 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
     23 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
     24 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
     25 ; CHECK-FMA-WIN-NEXT:    vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02]
     26 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
     27 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
     28   %1 = extractelement <4 x float> %a0, i64 0 ; first multiplicand: lane 0 of %a0
     29   %2 = extractelement <4 x float> %a1, i64 0 ; second multiplicand: lane 0 of %a1
     30   %3 = extractelement <4 x float> %a2, i64 0 ; addend: lane 0 of %a2
     31   %4 = call float @llvm.fma.f32(float %1, float %2, float %3) ; (%1 * %2) + %3
     32   %5 = insertelement <4 x float> %a0, float %4, i64 0 ; put the scalar result into lane 0 of %a0
     33   ret <4 x float> %5
     34 }
     35 
     36 define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; scalar f32 multiply-add with %a1 as the first multiplicand; the other result lanes are taken from %a1
     37 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss:
     38 ; CHECK-FMA:       # %bb.0:
     39 ; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca]
     40 ; CHECK-FMA-NEXT:    # xmm1 = (xmm0 * xmm1) + xmm2
     41 ; CHECK-FMA-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
     42 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
     43 ;
     44 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss:
     45 ; CHECK-AVX512VL:       # %bb.0:
     46 ; CHECK-AVX512VL-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
     47 ; CHECK-AVX512VL-NEXT:    # xmm1 = (xmm0 * xmm1) + xmm2
     48 ; CHECK-AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
     49 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
     50 ;
     51 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss:
     52 ; CHECK-FMA-WIN:       # %bb.0:
     53 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
     54 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
     55 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
     56 ; CHECK-FMA-WIN-NEXT:    vfmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x01]
     57 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
     58 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
     59   %1 = extractelement <4 x float> %a1, i64 0 ; first multiplicand: lane 0 of %a1
     60   %2 = extractelement <4 x float> %a0, i64 0 ; second multiplicand: lane 0 of %a0
     61   %3 = extractelement <4 x float> %a2, i64 0 ; addend: lane 0 of %a2
     62   %4 = call float @llvm.fma.f32(float %1, float %2, float %3) ; (%1 * %2) + %3
     63   %5 = insertelement <4 x float> %a1, float %4, i64 0 ; put the scalar result into lane 0 of %a1
     64   ret <4 x float> %5
     65 }
     66 
     67 define <4 x float> @test_x86_fma_vfmadd_ss_231(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; scalar f32 multiply-add whose result is merged into the addend vector %a2; the CHECK lines expect the 231 form
     68 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss_231:
     69 ; CHECK-FMA:       # %bb.0:
     70 ; CHECK-FMA-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 # encoding: [0xc4,0xe2,0x79,0xb9,0xd1]
     71 ; CHECK-FMA-NEXT:    # xmm2 = (xmm0 * xmm1) + xmm2
     72 ; CHECK-FMA-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
     73 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
     74 ;
     75 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss_231:
     76 ; CHECK-AVX512VL:       # %bb.0:
     77 ; CHECK-AVX512VL-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0xd1]
     78 ; CHECK-AVX512VL-NEXT:    # xmm2 = (xmm0 * xmm1) + xmm2
     79 ; CHECK-AVX512VL-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
     80 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
     81 ;
     82 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss_231:
     83 ; CHECK-FMA-WIN:       # %bb.0:
     84 ; CHECK-FMA-WIN-NEXT:    vmovaps (%r8), %xmm0 # encoding: [0xc4,0xc1,0x78,0x28,0x00]
     85 ; CHECK-FMA-WIN-NEXT:    vmovss (%rcx), %xmm1 # encoding: [0xc5,0xfa,0x10,0x09]
     86 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
     87 ; CHECK-FMA-WIN-NEXT:    vfmadd231ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xb9,0x02]
     88 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * mem) + xmm0
     89 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
     90   %1 = extractelement <4 x float> %a0, i64 0 ; first multiplicand: lane 0 of %a0
     91   %2 = extractelement <4 x float> %a1, i64 0 ; second multiplicand: lane 0 of %a1
     92   %3 = extractelement <4 x float> %a2, i64 0 ; addend: lane 0 of %a2
     93   %4 = call float @llvm.fma.f32(float %1, float %2, float %3) ; (%1 * %2) + %3
     94   %5 = insertelement <4 x float> %a2, float %4, i64 0 ; put the scalar result into lane 0 of %a2 (the addend vector)
     95   ret <4 x float> %5
     96 }
     97 
     98 define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; scalar f64 multiply-add on lane 0; the other result lane is taken from %a0
     99 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
    100 ; CHECK-FMA:       # %bb.0:
    101 ; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
    102 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
    103 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    104 ;
    105 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd:
    106 ; CHECK-AVX512VL:       # %bb.0:
    107 ; CHECK-AVX512VL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
    108 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
    109 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    110 ;
    111 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
    112 ; CHECK-FMA-WIN:       # %bb.0:
    113 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
    114 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    115 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    116 ; CHECK-FMA-WIN-NEXT:    vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02]
    117 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
    118 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    119   %1 = extractelement <2 x double> %a0, i64 0 ; first multiplicand: lane 0 of %a0
    120   %2 = extractelement <2 x double> %a1, i64 0 ; second multiplicand: lane 0 of %a1
    121   %3 = extractelement <2 x double> %a2, i64 0 ; addend: lane 0 of %a2
    122   %4 = call double @llvm.fma.f64(double %1, double %2, double %3) ; (%1 * %2) + %3
    123   %5 = insertelement <2 x double> %a0, double %4, i64 0 ; put the scalar result into lane 0 of %a0
    124   ret <2 x double> %5
    125 }
    126 
    127 define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; scalar f64 multiply-add with %a1 as the first multiplicand; the other result lane is taken from %a1
    128 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd:
    129 ; CHECK-FMA:       # %bb.0:
    130 ; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
    131 ; CHECK-FMA-NEXT:    # xmm1 = (xmm0 * xmm1) + xmm2
    132 ; CHECK-FMA-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
    133 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    134 ;
    135 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd:
    136 ; CHECK-AVX512VL:       # %bb.0:
    137 ; CHECK-AVX512VL-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
    138 ; CHECK-AVX512VL-NEXT:    # xmm1 = (xmm0 * xmm1) + xmm2
    139 ; CHECK-AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
    140 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    141 ;
    142 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd:
    143 ; CHECK-FMA-WIN:       # %bb.0:
    144 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    145 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    146 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    147 ; CHECK-FMA-WIN-NEXT:    vfmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x01]
    148 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) + xmm1
    149 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    150   %1 = extractelement <2 x double> %a1, i64 0 ; first multiplicand: lane 0 of %a1
    151   %2 = extractelement <2 x double> %a0, i64 0 ; second multiplicand: lane 0 of %a0
    152   %3 = extractelement <2 x double> %a2, i64 0 ; addend: lane 0 of %a2
    153   %4 = call double @llvm.fma.f64(double %1, double %2, double %3) ; (%1 * %2) + %3
    154   %5 = insertelement <2 x double> %a1, double %4, i64 0 ; put the scalar result into lane 0 of %a1
    155   ret <2 x double> %5
    156 }
    157 
    158 define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; packed 4 x f32 multiply-add: (%a0 * %a1) + %a2 on every lane
    159 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
    160 ; CHECK-FMA:       # %bb.0:
    161 ; CHECK-FMA-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
    162 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
    163 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    164 ;
    165 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps:
    166 ; CHECK-AVX512VL:       # %bb.0:
    167 ; CHECK-AVX512VL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
    168 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
    169 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    170 ;
    171 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps:
    172 ; CHECK-FMA-WIN:       # %bb.0:
    173 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    174 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    175 ; CHECK-FMA-WIN-NEXT:    vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
    176 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
    177 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    178   %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; whole-vector fma, no lane extraction
    179   ret <4 x float> %1
    180 }
    181 
    182 define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; packed 2 x f64 multiply-add: (%a0 * %a1) + %a2 on every lane
    183 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
    184 ; CHECK-FMA:       # %bb.0:
    185 ; CHECK-FMA-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
    186 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
    187 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    188 ;
    189 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd:
    190 ; CHECK-AVX512VL:       # %bb.0:
    191 ; CHECK-AVX512VL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
    192 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) + xmm2
    193 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    194 ;
    195 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd:
    196 ; CHECK-FMA-WIN:       # %bb.0:
    197 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    198 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    199 ; CHECK-FMA-WIN-NEXT:    vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
    200 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
    201 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    202   %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; whole-vector fma, no lane extraction
    203   ret <2 x double> %1
    204 }
    205 
    206 define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; packed 8 x f32 multiply-add: (%a0 * %a1) + %a2 on every lane; the CHECK lines expect ymm registers
    207 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
    208 ; CHECK-FMA:       # %bb.0:
    209 ; CHECK-FMA-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
    210 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
    211 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    212 ;
    213 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256:
    214 ; CHECK-AVX512VL:       # %bb.0:
    215 ; CHECK-AVX512VL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
    216 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
    217 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    218 ;
    219 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256:
    220 ; CHECK-FMA-WIN:       # %bb.0:
    221 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    222 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    223 ; CHECK-FMA-WIN-NEXT:    vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
    224 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) + mem
    225 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    226   %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ; whole-vector fma, no lane extraction
    227   ret <8 x float> %1
    228 }
    229 
    230 define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; packed 4 x f64 multiply-add: (%a0 * %a1) + %a2 on every lane; the CHECK lines expect ymm registers
    231 ; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
    232 ; CHECK-FMA:       # %bb.0:
    233 ; CHECK-FMA-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
    234 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
    235 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    236 ;
    237 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256:
    238 ; CHECK-AVX512VL:       # %bb.0:
    239 ; CHECK-AVX512VL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
    240 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) + ymm2
    241 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    242 ;
    243 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256:
    244 ; CHECK-FMA-WIN:       # %bb.0:
    245 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    246 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    247 ; CHECK-FMA-WIN-NEXT:    vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
    248 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) + mem
    249 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    250   %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ; whole-vector fma, no lane extraction
    251   ret <4 x double> %1
    252 }
    253 
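    254 ; VFMSUB -- multiply-subtract tests: result = (a0 * a1) - a2, written in IR as an fma whose addend is negated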
    255 define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; scalar f32 multiply-subtract on lane 0; the other result lanes are taken from %a0
    256 ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
    257 ; CHECK-FMA:       # %bb.0:
    258 ; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
    259 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    260 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    261 ;
    262 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss:
    263 ; CHECK-AVX512VL:       # %bb.0:
    264 ; CHECK-AVX512VL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
    265 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    266 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    267 ;
    268 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
    269 ; CHECK-FMA-WIN:       # %bb.0:
    270 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
    271 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    272 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    273 ; CHECK-FMA-WIN-NEXT:    vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02]
    274 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
    275 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    276   %1 = extractelement <4 x float> %a0, i64 0 ; first multiplicand: lane 0 of %a0
    277   %2 = extractelement <4 x float> %a1, i64 0 ; second multiplicand: lane 0 of %a1
    278   %3 = extractelement <4 x float> %a2, i64 0 ; value to subtract: lane 0 of %a2
    279   %4 = fsub float -0.000000e+00, %3 ; negate the addend (-0.0 - %3)
    280   %5 = call float @llvm.fma.f32(float %1, float %2, float %4) ; (%1 * %2) + (-%3)
    281   %6 = insertelement <4 x float> %a0, float %5, i64 0 ; put the scalar result into lane 0 of %a0
    282   ret <4 x float> %6
    283 }
    284 
    285 define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; scalar f32 multiply-subtract with %a1 as the first multiplicand; the other result lanes are taken from %a1
    286 ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss:
    287 ; CHECK-FMA:       # %bb.0:
    288 ; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca]
    289 ; CHECK-FMA-NEXT:    # xmm1 = (xmm0 * xmm1) - xmm2
    290 ; CHECK-FMA-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
    291 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    292 ;
    293 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss:
    294 ; CHECK-AVX512VL:       # %bb.0:
    295 ; CHECK-AVX512VL-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
    296 ; CHECK-AVX512VL-NEXT:    # xmm1 = (xmm0 * xmm1) - xmm2
    297 ; CHECK-AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
    298 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    299 ;
    300 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss:
    301 ; CHECK-FMA-WIN:       # %bb.0:
    302 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    303 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    304 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    305 ; CHECK-FMA-WIN-NEXT:    vfmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x01]
    306 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
    307 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    308   %1 = extractelement <4 x float> %a1, i64 0 ; first multiplicand: lane 0 of %a1
    309   %2 = extractelement <4 x float> %a0, i64 0 ; second multiplicand: lane 0 of %a0
    310   %3 = extractelement <4 x float> %a2, i64 0 ; value to subtract: lane 0 of %a2
    311   %4 = fsub float -0.000000e+00, %3 ; negate the addend (-0.0 - %3)
    312   %5 = call float @llvm.fma.f32(float %1, float %2, float %4) ; (%1 * %2) + (-%3)
    313   %6 = insertelement <4 x float> %a1, float %5, i64 0 ; put the scalar result into lane 0 of %a1
    314   ret <4 x float> %6
    315 }
    316 
    317 define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; scalar f64 multiply-subtract on lane 0; the other result lane is taken from %a0
    318 ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
    319 ; CHECK-FMA:       # %bb.0:
    320 ; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
    321 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    322 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    323 ;
    324 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd:
    325 ; CHECK-AVX512VL:       # %bb.0:
    326 ; CHECK-AVX512VL-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
    327 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    328 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    329 ;
    330 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
    331 ; CHECK-FMA-WIN:       # %bb.0:
    332 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
    333 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    334 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    335 ; CHECK-FMA-WIN-NEXT:    vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02]
    336 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
    337 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    338   %1 = extractelement <2 x double> %a0, i64 0 ; first multiplicand: lane 0 of %a0
    339   %2 = extractelement <2 x double> %a1, i64 0 ; second multiplicand: lane 0 of %a1
    340   %3 = extractelement <2 x double> %a2, i64 0 ; value to subtract: lane 0 of %a2
    341   %4 = fsub double -0.000000e+00, %3 ; negate the addend (-0.0 - %3)
    342   %5 = call double @llvm.fma.f64(double %1, double %2, double %4) ; (%1 * %2) + (-%3)
    343   %6 = insertelement <2 x double> %a0, double %5, i64 0 ; put the scalar result into lane 0 of %a0
    344   ret <2 x double> %6
    345 }
    346 
    347 define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; scalar f64 multiply-subtract with %a1 as the first multiplicand; the other result lane is taken from %a1
    348 ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd:
    349 ; CHECK-FMA:       # %bb.0:
    350 ; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca]
    351 ; CHECK-FMA-NEXT:    # xmm1 = (xmm0 * xmm1) - xmm2
    352 ; CHECK-FMA-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
    353 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    354 ;
    355 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd:
    356 ; CHECK-AVX512VL:       # %bb.0:
    357 ; CHECK-AVX512VL-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
    358 ; CHECK-AVX512VL-NEXT:    # xmm1 = (xmm0 * xmm1) - xmm2
    359 ; CHECK-AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
    360 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    361 ;
    362 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd:
    363 ; CHECK-FMA-WIN:       # %bb.0:
    364 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    365 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    366 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    367 ; CHECK-FMA-WIN-NEXT:    vfmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x01]
    368 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm0 * mem) - xmm1
    369 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    370   %1 = extractelement <2 x double> %a1, i64 0 ; first multiplicand: lane 0 of %a1
    371   %2 = extractelement <2 x double> %a0, i64 0 ; second multiplicand: lane 0 of %a0
    372   %3 = extractelement <2 x double> %a2, i64 0 ; value to subtract: lane 0 of %a2
    373   %4 = fsub double -0.000000e+00, %3 ; negate the addend (-0.0 - %3)
    374   %5 = call double @llvm.fma.f64(double %1, double %2, double %4) ; (%1 * %2) + (-%3)
    375   %6 = insertelement <2 x double> %a1, double %5, i64 0 ; put the scalar result into lane 0 of %a1
    376   ret <2 x double> %6
    377 }
    378 
    379 define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; packed 4 x f32 multiply-subtract: (%a0 * %a1) - %a2 on every lane
    380 ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
    381 ; CHECK-FMA:       # %bb.0:
    382 ; CHECK-FMA-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
    383 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    384 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    385 ;
    386 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps:
    387 ; CHECK-AVX512VL:       # %bb.0:
    388 ; CHECK-AVX512VL-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
    389 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    390 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    391 ;
    392 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps:
    393 ; CHECK-FMA-WIN:       # %bb.0:
    394 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    395 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    396 ; CHECK-FMA-WIN-NEXT:    vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
    397 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) - mem
    398 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    399   %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2 ; negate every lane of the addend (-0.0 - %a2)
    400   %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %1) ; (%a0 * %a1) + (-%a2)
    401   ret <4 x float> %2
    402 }
    403 
    404 define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; packed 2 x f64 multiply-subtract: (%a0 * %a1) - %a2 on every lane
    405 ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
    406 ; CHECK-FMA:       # %bb.0:
    407 ; CHECK-FMA-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
    408 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    409 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    410 ;
    411 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd:
    412 ; CHECK-AVX512VL:       # %bb.0:
    413 ; CHECK-AVX512VL-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
    414 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) - xmm2
    415 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    416 ;
    417 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd:
    418 ; CHECK-FMA-WIN:       # %bb.0:
    419 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    420 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    421 ; CHECK-FMA-WIN-NEXT:    vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
    422 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) - mem
    423 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    424   %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2 ; negate every lane of the addend (-0.0 - %a2)
    425   %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %1) ; (%a0 * %a1) + (-%a2)
    426   ret <2 x double> %2
    427 }
    428 
    429 define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; packed 8 x f32 multiply-subtract: (%a0 * %a1) - %a2 on every lane; the CHECK lines expect ymm registers
    430 ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
    431 ; CHECK-FMA:       # %bb.0:
    432 ; CHECK-FMA-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
    433 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
    434 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    435 ;
    436 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256:
    437 ; CHECK-AVX512VL:       # %bb.0:
    438 ; CHECK-AVX512VL-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
    439 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
    440 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    441 ;
    442 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256:
    443 ; CHECK-FMA-WIN:       # %bb.0:
    444 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    445 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    446 ; CHECK-FMA-WIN-NEXT:    vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
    447 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) - mem
    448 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    449   %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2 ; negate every lane of the addend (-0.0 - %a2)
    450   %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %1) ; (%a0 * %a1) + (-%a2)
    451   ret <8 x float> %2
    452 }
    453 
    454 define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; packed 4 x f64 multiply-subtract: (%a0 * %a1) - %a2 on every lane; the CHECK lines expect ymm registers
    455 ; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
    456 ; CHECK-FMA:       # %bb.0:
    457 ; CHECK-FMA-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
    458 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
    459 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    460 ;
    461 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256:
    462 ; CHECK-AVX512VL:       # %bb.0:
    463 ; CHECK-AVX512VL-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
    464 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) - ymm2
    465 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    466 ;
    467 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256:
    468 ; CHECK-FMA-WIN:       # %bb.0:
    469 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    470 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    471 ; CHECK-FMA-WIN-NEXT:    vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
    472 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) - mem
    473 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    474   %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2 ; negate every lane of the addend (-0.0 - %a2)
    475   %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %1) ; (%a0 * %a1) + (-%a2)
    476   ret <4 x double> %2
    477 }
    478 
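    479 ; VFNMADD -- negated multiply-add tests: result = -(a0 * a1) + a2, written in IR as an fma with one multiplicand negated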
    480 define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; scalar f32 negated multiply-add on lane 0; the other result lanes are taken from %a0
    481 ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
    482 ; CHECK-FMA:       # %bb.0:
    483 ; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
    484 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    485 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    486 ;
    487 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss:
    488 ; CHECK-AVX512VL:       # %bb.0:
    489 ; CHECK-AVX512VL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
    490 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    491 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    492 ;
    493 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
    494 ; CHECK-FMA-WIN:       # %bb.0:
    495 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
    496 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    497 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    498 ; CHECK-FMA-WIN-NEXT:    vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02]
    499 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
    500 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    501   %1 = extractelement <4 x float> %a0, i64 0 ; first multiplicand: lane 0 of %a0
    502   %2 = extractelement <4 x float> %a1, i64 0 ; second multiplicand before negation: lane 0 of %a1
    503   %3 = extractelement <4 x float> %a2, i64 0 ; addend: lane 0 of %a2
    504   %4 = fsub float -0.000000e+00, %2 ; negate the second multiplicand (-0.0 - %2)
    505   %5 = call float @llvm.fma.f32(float %1, float %4, float %3) ; (%1 * -%2) + %3
    506   %6 = insertelement <4 x float> %a0, float %5, i64 0 ; put the scalar result into lane 0 of %a0
    507   ret <4 x float> %6
    508 }
    509 
    510 define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; scalar f32 negated multiply-add with %a1 as the first multiplicand; the other result lanes are taken from %a1
    511 ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss:
    512 ; CHECK-FMA:       # %bb.0:
    513 ; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca]
    514 ; CHECK-FMA-NEXT:    # xmm1 = -(xmm0 * xmm1) + xmm2
    515 ; CHECK-FMA-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
    516 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    517 ;
    518 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss:
    519 ; CHECK-AVX512VL:       # %bb.0:
    520 ; CHECK-AVX512VL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
    521 ; CHECK-AVX512VL-NEXT:    # xmm1 = -(xmm0 * xmm1) + xmm2
    522 ; CHECK-AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
    523 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    524 ;
    525 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss:
    526 ; CHECK-FMA-WIN:       # %bb.0:
    527 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    528 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    529 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    530 ; CHECK-FMA-WIN-NEXT:    vfnmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x01]
    531 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
    532 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
    533   %1 = extractelement <4 x float> %a1, i64 0 ; first multiplicand: lane 0 of %a1
    534   %2 = extractelement <4 x float> %a0, i64 0 ; second multiplicand before negation: lane 0 of %a0
    535   %3 = extractelement <4 x float> %a2, i64 0 ; addend: lane 0 of %a2
    536   %4 = fsub float -0.000000e+00, %2 ; negate the second multiplicand (-0.0 - %2)
    537   %5 = call float @llvm.fma.f32(float %1, float %4, float %3) ; (%1 * -%2) + %3
    538   %6 = insertelement <4 x float> %a1, float %5, i64 0 ; put the scalar result into lane 0 of %a1
    539   ret <4 x float> %6
    540 }
    541 
    542 define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
    543 ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
    544 ; CHECK-FMA:       # %bb.0:
    545 ; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
    546 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    547 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    548 ;
    549 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd:
    550 ; CHECK-AVX512VL:       # %bb.0:
    551 ; CHECK-AVX512VL-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
    552 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    553 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    554 ;
    555 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
    556 ; CHECK-FMA-WIN:       # %bb.0:
    557 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
    558 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    559 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    560 ; CHECK-FMA-WIN-NEXT:    vfnmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x02]
    561 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
    562 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Scalar f64 negated multiply-add: lane 0 of the result is
          ; fma(%a0[0], -%a1[0], %a2[0]) and lane 1 is passed through from %a0.
          ; The assertions above expect the negation to be folded into a single
          ; vfnmadd instruction with no extra register copy.
    563   %1 = extractelement <2 x double> %a0, i64 0
    564   %2 = extractelement <2 x double> %a1, i64 0
    565   %3 = extractelement <2 x double> %a2, i64 0
    566   %4 = fsub double -0.000000e+00, %2 ; -0.0 - x, i.e. -%a1[0]
    567   %5 = call double @llvm.fma.f64(double %1, double %4, double %3)
    568   %6 = insertelement <2 x double> %a0, double %5, i64 0 ; upper lane comes from %a0
    569   ret <2 x double> %6
    570 }
    571 
    572 define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
    573 ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd:
    574 ; CHECK-FMA:       # %bb.0:
    575 ; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca]
    576 ; CHECK-FMA-NEXT:    # xmm1 = -(xmm0 * xmm1) + xmm2
    577 ; CHECK-FMA-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
    578 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    579 ;
    580 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd:
    581 ; CHECK-AVX512VL:       # %bb.0:
    582 ; CHECK-AVX512VL-NEXT:    vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
    583 ; CHECK-AVX512VL-NEXT:    # xmm1 = -(xmm0 * xmm1) + xmm2
    584 ; CHECK-AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
    585 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    586 ;
    587 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd:
    588 ; CHECK-FMA-WIN:       # %bb.0:
    589 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    590 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    591 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    592 ; CHECK-FMA-WIN-NEXT:    vfnmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x01]
    593 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) + xmm1
    594 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Scalar f64 negated multiply-add with the roles of %a0 and %a1 swapped:
          ; lane 0 of the result is fma(%a1[0], -%a0[0], %a2[0]) and lane 1 is
          ; passed through from %a1. The assertions above expect a single vfnmadd
          ; instruction; the non-Windows runs additionally expect the result to be
          ; copied from xmm1 into xmm0.
    595   %1 = extractelement <2 x double> %a1, i64 0
    596   %2 = extractelement <2 x double> %a0, i64 0
    597   %3 = extractelement <2 x double> %a2, i64 0
    598   %4 = fsub double -0.000000e+00, %2 ; -0.0 - x, i.e. -%a0[0]
    599   %5 = call double @llvm.fma.f64(double %1, double %4, double %3)
    600   %6 = insertelement <2 x double> %a1, double %5, i64 0 ; upper lane comes from %a1
    601   ret <2 x double> %6
    602 }
    603 
    604 define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
    605 ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
    606 ; CHECK-FMA:       # %bb.0:
    607 ; CHECK-FMA-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
    608 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    609 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    610 ;
    611 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps:
    612 ; CHECK-AVX512VL:       # %bb.0:
    613 ; CHECK-AVX512VL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
    614 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    615 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    616 ;
    617 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps:
    618 ; CHECK-FMA-WIN:       # %bb.0:
    619 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    620 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    621 ; CHECK-FMA-WIN-NEXT:    vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
    622 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) + mem
    623 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 4 x f32 negated multiply-add: every lane computes
          ; fma(-%a0, %a1, %a2), with the negation written as (-0.0 - %a0).
          ; The assertions above expect the negation to be folded into a single
          ; vfnmadd213ps; the Windows run takes the addend directly from memory.
    624   %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a0
    625   %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %a1, <4 x float> %a2)
    626   ret <4 x float> %2
    627 }
    628 
    629 define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
    630 ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
    631 ; CHECK-FMA:       # %bb.0:
    632 ; CHECK-FMA-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
    633 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    634 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    635 ;
    636 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd:
    637 ; CHECK-AVX512VL:       # %bb.0:
    638 ; CHECK-AVX512VL-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
    639 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) + xmm2
    640 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    641 ;
    642 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd:
    643 ; CHECK-FMA-WIN:       # %bb.0:
    644 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    645 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    646 ; CHECK-FMA-WIN-NEXT:    vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
    647 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) + mem
    648 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 2 x f64 negated multiply-add: every lane computes
          ; fma(-%a0, %a1, %a2), with the negation written as (-0.0 - %a0).
          ; The assertions above expect the negation to be folded into a single
          ; vfnmadd213pd; the Windows run takes the addend directly from memory.
    649   %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a0
    650   %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %a1, <2 x double> %a2)
    651   ret <2 x double> %2
    652 }
    653 
    654 define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
    655 ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
    656 ; CHECK-FMA:       # %bb.0:
    657 ; CHECK-FMA-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
    658 ; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
    659 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    660 ;
    661 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256:
    662 ; CHECK-AVX512VL:       # %bb.0:
    663 ; CHECK-AVX512VL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
    664 ; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
    665 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    666 ;
    667 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256:
    668 ; CHECK-FMA-WIN:       # %bb.0:
    669 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    670 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    671 ; CHECK-FMA-WIN-NEXT:    vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
    672 ; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) + mem
    673 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 8 x f32 (256-bit) negated multiply-add: every lane computes
          ; fma(-%a0, %a1, %a2), with the negation written as (-0.0 - %a0).
          ; The assertions above expect a single ymm vfnmadd213ps for this pattern.
    674   %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a0
    675   %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %a1, <8 x float> %a2)
    676   ret <8 x float> %2
    677 }
    678 
    679 define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
    680 ; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
    681 ; CHECK-FMA:       # %bb.0:
    682 ; CHECK-FMA-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
    683 ; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
    684 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    685 ;
    686 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256:
    687 ; CHECK-AVX512VL:       # %bb.0:
    688 ; CHECK-AVX512VL-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
    689 ; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) + ymm2
    690 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    691 ;
    692 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256:
    693 ; CHECK-FMA-WIN:       # %bb.0:
    694 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    695 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    696 ; CHECK-FMA-WIN-NEXT:    vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
    697 ; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) + mem
    698 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 4 x f64 (256-bit) negated multiply-add: every lane computes
          ; fma(-%a0, %a1, %a2), with the negation written as (-0.0 - %a0).
          ; The assertions above expect a single ymm vfnmadd213pd for this pattern.
    699   %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a0
    700   %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %a1, <4 x double> %a2)
    701   ret <4 x double> %2
    702 }
    703 
    704 ; VFNMSUB
    705 define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
    706 ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
    707 ; CHECK-FMA:       # %bb.0:
    708 ; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
    709 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    710 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    711 ;
    712 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss:
    713 ; CHECK-AVX512VL:       # %bb.0:
    714 ; CHECK-AVX512VL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
    715 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    716 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    717 ;
    718 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
    719 ; CHECK-FMA-WIN:       # %bb.0:
    720 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
    721 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    722 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    723 ; CHECK-FMA-WIN-NEXT:    vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02]
    724 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
    725 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Scalar f32 negated multiply-subtract: lane 0 of the result is
          ; fma(%a0[0], -%a1[0], -%a2[0]) and the remaining lanes are passed
          ; through from %a0. The assertions above expect both negations to be
          ; folded into a single vfnmsub instruction.
    726   %1 = extractelement <4 x float> %a0, i64 0
    727   %2 = extractelement <4 x float> %a1, i64 0
    728   %3 = extractelement <4 x float> %a2, i64 0
    729   %4 = fsub float -0.000000e+00, %2 ; -%a1[0] (negated multiplicand)
    730   %5 = fsub float -0.000000e+00, %3 ; -%a2[0] (negated addend)
    731   %6 = call float @llvm.fma.f32(float %1, float %4, float %5)
    732   %7 = insertelement <4 x float> %a0, float %6, i64 0 ; upper lanes come from %a0
    733   ret <4 x float> %7
    734 }
    735 
    736 define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
    737 ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss:
    738 ; CHECK-FMA:       # %bb.0:
    739 ; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca]
    740 ; CHECK-FMA-NEXT:    # xmm1 = -(xmm0 * xmm1) - xmm2
    741 ; CHECK-FMA-NEXT:    vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
    742 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    743 ;
    744 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss:
    745 ; CHECK-AVX512VL:       # %bb.0:
    746 ; CHECK-AVX512VL-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
    747 ; CHECK-AVX512VL-NEXT:    # xmm1 = -(xmm0 * xmm1) - xmm2
    748 ; CHECK-AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
    749 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    750 ;
    751 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss:
    752 ; CHECK-FMA-WIN:       # %bb.0:
    753 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    754 ; CHECK-FMA-WIN-NEXT:    vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
    755 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero,zero,zero
    756 ; CHECK-FMA-WIN-NEXT:    vfnmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x01]
    757 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
    758 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Scalar f32 negated multiply-subtract with the roles of %a0 and %a1
          ; swapped: lane 0 of the result is fma(%a1[0], -%a0[0], -%a2[0]) and the
          ; remaining lanes are passed through from %a1. The assertions above
          ; expect a single vfnmsub instruction; the non-Windows runs additionally
          ; expect the result to be copied from xmm1 into xmm0.
    759   %1 = extractelement <4 x float> %a1, i64 0
    760   %2 = extractelement <4 x float> %a0, i64 0
    761   %3 = extractelement <4 x float> %a2, i64 0
    762   %4 = fsub float -0.000000e+00, %2 ; -%a0[0] (negated multiplicand)
    763   %5 = fsub float -0.000000e+00, %3 ; -%a2[0] (negated addend)
    764   %6 = call float @llvm.fma.f32(float %1, float %4, float %5)
    765   %7 = insertelement <4 x float> %a1, float %6, i64 0 ; upper lanes come from %a1
    766   ret <4 x float> %7
    767 }
    768 
    769 define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
    770 ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
    771 ; CHECK-FMA:       # %bb.0:
    772 ; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
    773 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    774 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    775 ;
    776 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd:
    777 ; CHECK-AVX512VL:       # %bb.0:
    778 ; CHECK-AVX512VL-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
    779 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    780 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    781 ;
    782 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
    783 ; CHECK-FMA-WIN:       # %bb.0:
    784 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
    785 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    786 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    787 ; CHECK-FMA-WIN-NEXT:    vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02]
    788 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
    789 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Scalar f64 negated multiply-subtract: lane 0 of the result is
          ; fma(%a0[0], -%a1[0], -%a2[0]) and lane 1 is passed through from %a0.
          ; The assertions above expect both negations to be folded into a single
          ; vfnmsub instruction.
    790   %1 = extractelement <2 x double> %a0, i64 0
    791   %2 = extractelement <2 x double> %a1, i64 0
    792   %3 = extractelement <2 x double> %a2, i64 0
    793   %4 = fsub double -0.000000e+00, %2 ; -%a1[0] (negated multiplicand)
    794   %5 = fsub double -0.000000e+00, %3 ; -%a2[0] (negated addend)
    795   %6 = call double @llvm.fma.f64(double %1, double %4, double %5)
    796   %7 = insertelement <2 x double> %a0, double %6, i64 0 ; upper lane comes from %a0
    797   ret <2 x double> %7
    798 }
    799 
    800 define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
    801 ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd:
    802 ; CHECK-FMA:       # %bb.0:
    803 ; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
    804 ; CHECK-FMA-NEXT:    # xmm1 = -(xmm0 * xmm1) - xmm2
    805 ; CHECK-FMA-NEXT:    vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
    806 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    807 ;
    808 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd:
    809 ; CHECK-AVX512VL:       # %bb.0:
    810 ; CHECK-AVX512VL-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
    811 ; CHECK-AVX512VL-NEXT:    # xmm1 = -(xmm0 * xmm1) - xmm2
    812 ; CHECK-AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
    813 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    814 ;
    815 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd:
    816 ; CHECK-FMA-WIN:       # %bb.0:
    817 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    818 ; CHECK-FMA-WIN-NEXT:    vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
    819 ; CHECK-FMA-WIN-NEXT:    # xmm1 = mem[0],zero
    820 ; CHECK-FMA-WIN-NEXT:    vfnmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x01]
    821 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm0 * mem) - xmm1
    822 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Scalar f64 negated multiply-subtract with the roles of %a0 and %a1
          ; swapped: lane 0 of the result is fma(%a1[0], -%a0[0], -%a2[0]) and
          ; lane 1 is passed through from %a1. The assertions above expect a
          ; single vfnmsub instruction; the non-Windows runs additionally expect
          ; the result to be copied from xmm1 into xmm0.
    823   %1 = extractelement <2 x double> %a1, i64 0
    824   %2 = extractelement <2 x double> %a0, i64 0
    825   %3 = extractelement <2 x double> %a2, i64 0
    826   %4 = fsub double -0.000000e+00, %2 ; -%a0[0] (negated multiplicand)
    827   %5 = fsub double -0.000000e+00, %3 ; -%a2[0] (negated addend)
    828   %6 = call double @llvm.fma.f64(double %1, double %4, double %5)
    829   %7 = insertelement <2 x double> %a1, double %6, i64 0 ; upper lane comes from %a1
    830   ret <2 x double> %7
    831 }
    832 
    833 define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
    834 ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
    835 ; CHECK-FMA:       # %bb.0:
    836 ; CHECK-FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
    837 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    838 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    839 ;
    840 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps:
    841 ; CHECK-AVX512VL:       # %bb.0:
    842 ; CHECK-AVX512VL-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
    843 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    844 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    845 ;
    846 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps:
    847 ; CHECK-FMA-WIN:       # %bb.0:
    848 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    849 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    850 ; CHECK-FMA-WIN-NEXT:    vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
    851 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) - mem
    852 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 4 x f32 negated multiply-subtract: every lane computes
          ; fma(-%a0, %a1, -%a2); %1 is the negated multiplicand and %2 the
          ; negated addend, each written as (-0.0 - x). The assertions above
          ; expect both negations to be folded into a single vfnmsub213ps.
    853   %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a0
    854   %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
    855   %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %a1, <4 x float> %2)
    856   ret <4 x float> %3
    857 }
    858 
    859 define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
    860 ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
    861 ; CHECK-FMA:       # %bb.0:
    862 ; CHECK-FMA-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
    863 ; CHECK-FMA-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    864 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    865 ;
    866 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd:
    867 ; CHECK-AVX512VL:       # %bb.0:
    868 ; CHECK-AVX512VL-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
    869 ; CHECK-AVX512VL-NEXT:    # xmm0 = -(xmm1 * xmm0) - xmm2
    870 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    871 ;
    872 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd:
    873 ; CHECK-FMA-WIN:       # %bb.0:
    874 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    875 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    876 ; CHECK-FMA-WIN-NEXT:    vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
    877 ; CHECK-FMA-WIN-NEXT:    # xmm0 = -(xmm1 * xmm0) - mem
    878 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 2 x f64 negated multiply-subtract: every lane computes
          ; fma(-%a0, %a1, -%a2); %1 is the negated multiplicand and %2 the
          ; negated addend, each written as (-0.0 - x). The assertions above
          ; expect both negations to be folded into a single vfnmsub213pd.
    879   %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a0
    880   %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
    881   %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %a1, <2 x double> %2)
    882   ret <2 x double> %3
    883 }
    884 
    885 define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
    886 ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
    887 ; CHECK-FMA:       # %bb.0:
    888 ; CHECK-FMA-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
    889 ; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
    890 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    891 ;
    892 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256:
    893 ; CHECK-AVX512VL:       # %bb.0:
    894 ; CHECK-AVX512VL-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
    895 ; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
    896 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    897 ;
    898 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256:
    899 ; CHECK-FMA-WIN:       # %bb.0:
    900 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
    901 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
    902 ; CHECK-FMA-WIN-NEXT:    vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
    903 ; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) - mem
    904 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 8 x f32 (256-bit) negated multiply-subtract: every lane computes
          ; fma(-%a0, %a1, -%a2); %1 is the negated multiplicand and %2 the
          ; negated addend, each written as (-0.0 - x). The assertions above
          ; expect a single ymm vfnmsub213ps for this pattern.
    905   %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a0
    906   %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
    907   %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %a1, <8 x float> %2)
    908   ret <8 x float> %3
    909 }
    910 
    911 define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
    912 ; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
    913 ; CHECK-FMA:       # %bb.0:
    914 ; CHECK-FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
    915 ; CHECK-FMA-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
    916 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    917 ;
    918 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256:
    919 ; CHECK-AVX512VL:       # %bb.0:
    920 ; CHECK-AVX512VL-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
    921 ; CHECK-AVX512VL-NEXT:    # ymm0 = -(ymm1 * ymm0) - ymm2
    922 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    923 ;
    924 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256:
    925 ; CHECK-FMA-WIN:       # %bb.0:
    926 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
    927 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
    928 ; CHECK-FMA-WIN-NEXT:    vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
    929 ; CHECK-FMA-WIN-NEXT:    # ymm0 = -(ymm1 * ymm0) - mem
    930 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 4 x f64 (256-bit) negated multiply-subtract: every lane computes
          ; fma(-%a0, %a1, -%a2); %1 is the negated multiplicand and %2 the
          ; negated addend, each written as (-0.0 - x). The assertions above
          ; expect a single ymm vfnmsub213pd for this pattern.
    931   %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a0
    932   %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
    933   %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %a1, <4 x double> %2)
    934   ret <4 x double> %3
    935 }
    936 
    937 ; VFMADDSUB
    938 define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
    939 ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
    940 ; CHECK-FMA:       # %bb.0:
    941 ; CHECK-FMA-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
    942 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
    943 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    944 ;
    945 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps:
    946 ; CHECK-AVX512VL:       # %bb.0:
    947 ; CHECK-AVX512VL-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
    948 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
    949 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    950 ;
    951 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps:
    952 ; CHECK-FMA-WIN:       # %bb.0:
    953 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
    954 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
    955 ; CHECK-FMA-WIN-NEXT:    vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
    956 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) +/- mem
    957 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 4 x f32 alternating multiply-add/subtract. Two full-width FMAs
          ; are computed, %1 = fma(%a0, %a1, %a2) and %3 = fma(%a0, %a1, -%a2), and
          ; the shuffle interleaves them: even lanes (mask 0, 2) come from %3
          ; (subtract) and odd lanes (mask 5, 7) from %1 (add). The assertions
          ; above expect the whole pattern to collapse into one vfmaddsub213ps.
    958   %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    959   %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
    960   %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %2)
    961   %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
    962   ret <4 x float> %4
    963 }
    964 
    965 define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
    966 ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
    967 ; CHECK-FMA:       # %bb.0:
    968 ; CHECK-FMA-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
    969 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
    970 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    971 ;
    972 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd:
    973 ; CHECK-AVX512VL:       # %bb.0:
    974 ; CHECK-AVX512VL-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
    975 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) +/- xmm2
    976 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
    977 ;
    978 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd:
    979 ; CHECK-FMA-WIN:       # %bb.0:
    980 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
    981 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
    982 ; CHECK-FMA-WIN-NEXT:    vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
    983 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) +/- mem
    984 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 2 x f64 alternating multiply-add/subtract. Two full-width FMAs
          ; are computed, %1 = fma(%a0, %a1, %a2) and %3 = fma(%a0, %a1, -%a2), and
          ; the shuffle takes lane 0 from %3 (subtract) and lane 1 from %1 (add,
          ; mask index 3). The assertions above expect the whole pattern to
          ; collapse into one vfmaddsub213pd.
    985   %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    986   %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
    987   %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
    988   %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
    989   ret <2 x double> %4
    990 }
    991 
    992 define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
    993 ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
    994 ; CHECK-FMA:       # %bb.0:
    995 ; CHECK-FMA-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
    996 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
    997 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
    998 ;
    999 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256:
   1000 ; CHECK-AVX512VL:       # %bb.0:
   1001 ; CHECK-AVX512VL-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
   1002 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
   1003 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
   1004 ;
   1005 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256:
   1006 ; CHECK-FMA-WIN:       # %bb.0:
   1007 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
   1008 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
   1009 ; CHECK-FMA-WIN-NEXT:    vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
   1010 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) +/- mem
   1011 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 8 x f32 (256-bit) alternating multiply-add/subtract. Two
          ; full-width FMAs are computed, %1 = fma(%a0, %a1, %a2) and
          ; %3 = fma(%a0, %a1, -%a2), and the shuffle interleaves them: even lanes
          ; come from %3 (subtract) and odd lanes (mask 9, 11, 13, 15) from %1
          ; (add). The assertions above expect one ymm vfmaddsub213ps.
   1012   %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
   1013   %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
   1014   %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %2)
   1015   %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   1016   ret <8 x float> %4
   1017 }
   1018 
   1019 define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
   1020 ; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
   1021 ; CHECK-FMA:       # %bb.0:
   1022 ; CHECK-FMA-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
   1023 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
   1024 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
   1025 ;
   1026 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256:
   1027 ; CHECK-AVX512VL:       # %bb.0:
   1028 ; CHECK-AVX512VL-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
   1029 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) +/- ymm2
   1030 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
   1031 ;
   1032 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256:
   1033 ; CHECK-FMA-WIN:       # %bb.0:
   1034 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
   1035 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
   1036 ; CHECK-FMA-WIN-NEXT:    vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
   1037 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) +/- mem
   1038 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; Packed 4 x f64 (256-bit) alternating multiply-add/subtract. Two
          ; full-width FMAs are computed, %1 = fma(%a0, %a1, %a2) and
          ; %3 = fma(%a0, %a1, -%a2), and the shuffle interleaves them: even lanes
          ; (mask 0, 2) come from %3 (subtract) and odd lanes (mask 5, 7) from %1
          ; (add). The assertions above expect one ymm vfmaddsub213pd.
   1039   %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
   1040   %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
   1041   %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
   1042   %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1043   ret <4 x double> %4
   1044 }
   1045 
   1046 ; VFMSUBADD
   1047 define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
   1048 ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
   1049 ; CHECK-FMA:       # %bb.0:
   1050 ; CHECK-FMA-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
   1051 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
   1052 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
   1053 ;
   1054 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps:
   1055 ; CHECK-AVX512VL:       # %bb.0:
   1056 ; CHECK-AVX512VL-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
   1057 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
   1058 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
   1059 ;
   1060 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps:
   1061 ; CHECK-FMA-WIN:       # %bb.0:
   1062 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
   1063 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
   1064 ; CHECK-FMA-WIN-NEXT:    vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
   1065 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ mem
   1066 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; 128-bit single-precision FMSUBADD expressed in generic IR: an fma that adds
          ; %a2 and an fma that adds its negation are interleaved by a shuffle. The
          ; assertions above expect all three RUN configurations to select a single
          ; vfmsubadd213ps; in the Windows run the operands are read from memory
          ; through rcx/rdx/r8 and the third one is folded as a memory operand.
   1067   %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; %1 = a0 * a1 + a2 in every lane
   1068   %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2 ; %2 = -a2, written as (-0.0 - a2)
   1069   %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %2) ; %3 = a0 * a1 - a2 in every lane
          ; Operand order is the reverse of the fmaddsub tests: mask indices 0-3 pick
          ; from %1 and 4-7 from %3, so result lanes 0 and 2 are the add form and
          ; lanes 1 and 3 are the subtract form.
   1070   %4 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1071   ret <4 x float> %4
   1072 }
   1073 
   1074 define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
   1075 ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
   1076 ; CHECK-FMA:       # %bb.0:
   1077 ; CHECK-FMA-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
   1078 ; CHECK-FMA-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
   1079 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
   1080 ;
   1081 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd:
   1082 ; CHECK-AVX512VL:       # %bb.0:
   1083 ; CHECK-AVX512VL-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
   1084 ; CHECK-AVX512VL-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ xmm2
   1085 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
   1086 ;
   1087 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd:
   1088 ; CHECK-FMA-WIN:       # %bb.0:
   1089 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
   1090 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
   1091 ; CHECK-FMA-WIN-NEXT:    vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
   1092 ; CHECK-FMA-WIN-NEXT:    # xmm0 = (xmm1 * xmm0) -/+ mem
   1093 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; 128-bit double-precision FMSUBADD expressed in generic IR: an fma that adds
          ; %a2 and an fma that adds its negation are combined lane-wise by a shuffle.
          ; The assertions above expect all three RUN configurations to select a
          ; single vfmsubadd213pd; in the Windows run the operands are read from
          ; memory through rcx/rdx/r8 and the third one is folded as a memory operand.
   1094   %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; %1 = a0 * a1 + a2 in both lanes
   1095   %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2 ; %2 = -a2, written as (-0.0 - a2)
   1096   %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2) ; %3 = a0 * a1 - a2 in both lanes
          ; Mask index 0 picks lane 0 of %1 (add form); index 3 picks lane 1 of %3
          ; (subtract form).
   1097   %4 = shufflevector <2 x double> %1, <2 x double> %3, <2 x i32> <i32 0, i32 3>
   1098   ret <2 x double> %4
   1099 }
   1100 
   1101 define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
   1102 ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
   1103 ; CHECK-FMA:       # %bb.0:
   1104 ; CHECK-FMA-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
   1105 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
   1106 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
   1107 ;
   1108 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256:
   1109 ; CHECK-AVX512VL:       # %bb.0:
   1110 ; CHECK-AVX512VL-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
   1111 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
   1112 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
   1113 ;
   1114 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256:
   1115 ; CHECK-FMA-WIN:       # %bb.0:
   1116 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
   1117 ; CHECK-FMA-WIN-NEXT:    vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
   1118 ; CHECK-FMA-WIN-NEXT:    vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
   1119 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ mem
   1120 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; 256-bit single-precision FMSUBADD expressed in generic IR: an fma that adds
          ; %a2 and an fma that adds its negation are interleaved by a shuffle. The
          ; assertions above expect all three RUN configurations to select a single
          ; vfmsubadd213ps on ymm registers; in the Windows run the operands are read
          ; from memory through rcx/rdx/r8 and the third one is folded as a memory
          ; operand.
   1121   %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ; %1 = a0 * a1 + a2 in every lane
   1122   %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2 ; %2 = -a2, written as (-0.0 - a2)
   1123   %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %2) ; %3 = a0 * a1 - a2 in every lane
          ; Mask indices 0-7 pick from %1 and 8-15 from %3, so even result lanes
          ; (0, 2, 4, 6) are the add form and odd lanes (1, 3, 5, 7) are the subtract
          ; form.
   1124   %4 = shufflevector <8 x float> %1, <8 x float> %3, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
   1125   ret <8 x float> %4
   1126 }
   1127 
   1128 define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
   1129 ; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
   1130 ; CHECK-FMA:       # %bb.0:
   1131 ; CHECK-FMA-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
   1132 ; CHECK-FMA-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
   1133 ; CHECK-FMA-NEXT:    retq # encoding: [0xc3]
   1134 ;
   1135 ; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256:
   1136 ; CHECK-AVX512VL:       # %bb.0:
   1137 ; CHECK-AVX512VL-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
   1138 ; CHECK-AVX512VL-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ ymm2
   1139 ; CHECK-AVX512VL-NEXT:    retq # encoding: [0xc3]
   1140 ;
   1141 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256:
   1142 ; CHECK-FMA-WIN:       # %bb.0:
   1143 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
   1144 ; CHECK-FMA-WIN-NEXT:    vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
   1145 ; CHECK-FMA-WIN-NEXT:    vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
   1146 ; CHECK-FMA-WIN-NEXT:    # ymm0 = (ymm1 * ymm0) -/+ mem
   1147 ; CHECK-FMA-WIN-NEXT:    retq # encoding: [0xc3]
          ; 256-bit double-precision FMSUBADD expressed in generic IR: an fma that adds
          ; %a2 and an fma that adds its negation are interleaved by a shuffle. The
          ; assertions above expect all three RUN configurations to select a single
          ; vfmsubadd213pd on ymm registers; in the Windows run the operands are read
          ; from memory through rcx/rdx/r8 and the third one is folded as a memory
          ; operand.
   1148   %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ; %1 = a0 * a1 + a2 in every lane
   1149   %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2 ; %2 = -a2, written as (-0.0 - a2)
   1150   %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2) ; %3 = a0 * a1 - a2 in every lane
          ; Mask indices 0-3 pick from %1 and 4-7 from %3, so result lanes 0 and 2 are
          ; the add form and lanes 1 and 3 are the subtract form.
   1151   %4 = shufflevector <4 x double> %1, <4 x double> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
   1152   ret <4 x double> %4
   1153 }
   1154 
   1155 declare float @llvm.fma.f32(float, float, float) ; scalar float overload of the generic fma intrinsic
   1156 declare double @llvm.fma.f64(double, double, double) ; scalar double overload
   1157 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) ; 4 x float vector overload
   1158 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) ; 2 x double vector overload
   1159 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) ; 8 x float vector overload
   1160 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) ; 4 x double vector overload
   1161 
   1162 attributes #0 = { nounwind } ; attribute group referenced as #0 on the test functions' define lines; marks them nounwind
   1163