Home | History | Annotate | Download | only in X86
      1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
      3 ; RUN: llc < %s -mtriple=x86_64-pc-windows -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
      7 
      8 ; VFMADD
      9 define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmadd.ss and returns the result.
        ; Expected: vfmadd213ss in the FMA runs, four-operand vfmaddss in the FMA4 runs,
        ; and on Windows two vmovaps loads followed by vfmadd132ss reading (%rdx).
     10 ; CHECK-LABEL: test_x86_fma_vfmadd_ss:
     11 ; CHECK-NEXT:  # BB#0:
     12 ;
     13 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
     14 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
     15 ; CHECK-FMA-WIN-NEXT: vfmadd132ss     (%rdx), %xmm1, %xmm0
     16 ;
     17 ; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0
     18 ;
     19 ; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
     20 ;
     21 ; CHECK-NEXT: retq
     22   %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
     23   ret <4 x float> %res
     24 }
     25 
     26 define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Operand-order variant: llvm.x86.fma.vfmadd.ss is called as (%a1, %a0, %a2).
        ; Expected: vfmadd213ss targeting %xmm1 plus a vmovaps copy into %xmm0 in the FMA runs,
        ; a single vfmaddss in the FMA4 runs, and vfmadd132ss reading (%rcx) on Windows.
     27 ; CHECK-LABEL: test_x86_fma_vfmadd_bac_ss:
     28 ; CHECK-NEXT:  # BB#0:
     29 ;
     30 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
     31 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
     32 ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0
     33 ;
     34 ; CHECK-FMA-NEXT:    vfmadd213ss %xmm2, %xmm0, %xmm1
     35 ; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
     36 ;
     37 ; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0
     38 ; CHECK-NEXT: retq
     39   %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
     40   ret <4 x float> %res
     41 }
     42 declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
     43 
     44 define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmadd.sd and returns the result.
        ; Expected: vfmadd213sd in the FMA runs, four-operand vfmaddsd in the FMA4 runs,
        ; and on Windows two vmovapd loads followed by vfmadd132sd reading (%rdx).
     45 ; CHECK-LABEL: test_x86_fma_vfmadd_sd:
     46 ; CHECK-NEXT:  # BB#0:
     47 ;
     48 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
     49 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
     50 ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rdx), %xmm1, %xmm0
     51 ;
     52 ; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
     53 ;
     54 ; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
     55 ;
     56 ; CHECK-NEXT: retq
     57   %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
     58   ret <2 x double> %res
     59 }
     60 
     61 define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Operand-order variant: llvm.x86.fma.vfmadd.sd is called as (%a1, %a0, %a2).
        ; Expected: vfmadd213sd targeting %xmm1 plus a vmovapd copy into %xmm0 in the FMA runs,
        ; a single vfmaddsd in the FMA4 runs, and vfmadd132sd reading (%rcx) on Windows.
     62 ; CHECK-LABEL: test_x86_fma_vfmadd_bac_sd:
     63 ; CHECK-NEXT:  # BB#0:
     64 ;
     65 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
     66 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
     67 ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rcx), %xmm1, %xmm0
     68 ;
     69 ; CHECK-FMA-NEXT:    vfmadd213sd %xmm2, %xmm0, %xmm1
     70 ; CHECK-FMA-NEXT:    vmovapd	%xmm1, %xmm0
     71 ;
     72 ; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0
     73 ;
     74 ; CHECK-NEXT: retq
     75   %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
     76   ret <2 x double> %res
     77 }
     78 declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
     79 
     80 define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmadd.ps (<4 x float>) and returns the result.
        ; Expected: vfmadd213ps on %xmm registers in the FMA runs, four-operand vfmaddps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfmadd213ps reading (%r8).
     81 ; CHECK-LABEL: test_x86_fma_vfmadd_ps:
     82 ; CHECK-NEXT:  # BB#0:
     83 ;
     84 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
     85 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
     86 ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0
     87 ;
     88 ; CHECK-FMA-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
     89 ;
     90 ; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
     91 ;
     92 ; CHECK-NEXT: retq
     93   %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
     94   ret <4 x float> %res
     95 }
     96 declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
     97 
     98 define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmadd.pd (<2 x double>) and returns the result.
        ; Expected: vfmadd213pd on %xmm registers in the FMA runs, four-operand vfmaddpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfmadd213pd reading (%r8).
     99 ; CHECK-LABEL: test_x86_fma_vfmadd_pd:
    100 ; CHECK-NEXT:  # BB#0:
    101 ;
    102 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    103 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    104 ; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0
    105 ;
    106 ; CHECK-FMA-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
    107 ;
    108 ; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
    109 ;
    110 ; CHECK-NEXT: retq
    111   %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    112   ret <2 x double> %res
    113 }
    114 declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
    115 
    116 define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmadd.ps.256 (<8 x float>) and returns the result.
        ; Expected: vfmadd213ps on %ymm registers in the FMA runs, four-operand vfmaddps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfmadd213ps reading (%r8).
    117 ; CHECK-LABEL: test_x86_fma_vfmadd_ps_256:
    118 ; CHECK-NEXT:  # BB#0:
    119 ;
    120 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    121 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    122 ; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0
    123 ;
    124 ; CHECK-FMA-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
    125 ;
    126 ; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
    127 ;
    128 ; CHECK-NEXT: retq
    129   %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
    130   ret <8 x float> %res
    131 }
    132 declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
    133 
    134 define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmadd.pd.256 (<4 x double>) and returns the result.
        ; Expected: vfmadd213pd on %ymm registers in the FMA runs, four-operand vfmaddpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfmadd213pd reading (%r8).
    135 ; CHECK-LABEL: test_x86_fma_vfmadd_pd_256:
    136 ; CHECK-NEXT:  # BB#0:
    137 ;
    138 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    139 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    140 ; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0
    141 ;
    142 ; CHECK-FMA-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
    143 ;
    144 ; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
    145 ;
    146 ; CHECK-NEXT: retq
    147   %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
    148   ret <4 x double> %res
    149 }
    150 declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
    151 
    152 ; VFMSUB
    153 define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsub.ss and returns the result.
        ; Expected: vfmsub213ss in the FMA runs, four-operand vfmsubss in the FMA4 runs,
        ; and on Windows two vmovaps loads followed by vfmsub132ss reading (%rdx).
    154 ; CHECK-LABEL: test_x86_fma_vfmsub_ss:
    155 ; CHECK-NEXT:  # BB#0:
    156 ;
    157 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    158 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    159 ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0
    160 ;
    161 ; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0
    162 ;
    163 ; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
    164 ;
    165 ; CHECK-NEXT: retq
    166   %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    167   ret <4 x float> %res
    168 }
    169 
    170 define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Operand-order variant: llvm.x86.fma.vfmsub.ss is called as (%a1, %a0, %a2).
        ; Expected: vfmsub213ss targeting %xmm1 plus a vmovaps copy into %xmm0 in the FMA runs,
        ; a single vfmsubss in the FMA4 runs, and vfmsub132ss reading (%rcx) on Windows.
    171 ; CHECK-LABEL: test_x86_fma_vfmsub_bac_ss:
    172 ; CHECK-NEXT:  # BB#0:
    173 ;
    174 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    175 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    176 ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0
    177 ;
    178 ; CHECK-FMA-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm1
    179 ; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
    180 ;
    181 ; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0
    182 ;
    183 ; CHECK-NEXT: retq
    184   %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
    185   ret <4 x float> %res
    186 }
    187 declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
    188 
    189 define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsub.sd and returns the result.
        ; Expected: vfmsub213sd in the FMA runs, four-operand vfmsubsd in the FMA4 runs,
        ; and on Windows two vmovapd loads followed by vfmsub132sd reading (%rdx).
    190 ; CHECK-LABEL: test_x86_fma_vfmsub_sd:
    191 ; CHECK-NEXT:  # BB#0:
    192 ;
    193 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    194 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    195 ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rdx), %xmm1, %xmm0
    196 ;
    197 ; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0
    198 ;
    199 ; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
    200 ;
    201 ; CHECK-NEXT: retq
    202   %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    203   ret <2 x double> %res
    204 }
    205 
    206 define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Operand-order variant: llvm.x86.fma.vfmsub.sd is called as (%a1, %a0, %a2).
        ; Expected: vfmsub213sd targeting %xmm1 plus a vmovapd copy into %xmm0 in the FMA runs,
        ; a single vfmsubsd in the FMA4 runs, and vfmsub132sd reading (%rcx) on Windows.
    207 ; CHECK-LABEL: test_x86_fma_vfmsub_bac_sd:
    208 ; CHECK-NEXT:  # BB#0:
    209 ;
    210 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    211 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    212 ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rcx), %xmm1, %xmm0
    213 ;
    214 ; CHECK-FMA-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm1
    215 ; CHECK-FMA-NEXT:    vmovapd	%xmm1, %xmm0
    216 ;
    217 ; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0
    218 ;
    219 ; CHECK-NEXT: retq
    220   %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
    221   ret <2 x double> %res
    222 }
    223 declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
    224 
    225 define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsub.ps (<4 x float>) and returns the result.
        ; Expected: vfmsub213ps on %xmm registers in the FMA runs, four-operand vfmsubps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfmsub213ps reading (%r8).
    226 ; CHECK-LABEL: test_x86_fma_vfmsub_ps:
    227 ; CHECK-NEXT:  # BB#0:
    228 ;
    229 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    230 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    231 ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0
    232 ;
    233 ; CHECK-FMA-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
    234 ;
    235 ; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
    236 ;
    237 ; CHECK-NEXT: retq
    238   %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    239   ret <4 x float> %res
    240 }
    241 declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
    242 
    243 define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsub.pd (<2 x double>) and returns the result.
        ; Expected: vfmsub213pd on %xmm registers in the FMA runs, four-operand vfmsubpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfmsub213pd reading (%r8).
    244 ; CHECK-LABEL: test_x86_fma_vfmsub_pd:
    245 ; CHECK-NEXT:  # BB#0:
    246 ;
    247 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    248 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    249 ; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0
    250 ;
    251 ; CHECK-FMA-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
    252 ;
    253 ; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
    254 ;
    255 ; CHECK-NEXT: retq
    256   %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    257   ret <2 x double> %res
    258 }
    259 declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
    260 
    261 define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsub.ps.256 (<8 x float>) and returns the result.
        ; Expected: vfmsub213ps on %ymm registers in the FMA runs, four-operand vfmsubps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfmsub213ps reading (%r8).
    262 ; CHECK-LABEL: test_x86_fma_vfmsub_ps_256:
    263 ; CHECK-NEXT:  # BB#0:
    264 ;
    265 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    266 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    267 ; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0
    268 ;
    269 ; CHECK-FMA-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
    270 ;
    271 ; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
    272 ;
    273 ; CHECK-NEXT: retq
    274   %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
    275   ret <8 x float> %res
    276 }
    277 declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
    278 
    279 define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsub.pd.256 (<4 x double>) and returns the result.
        ; Expected: vfmsub213pd on %ymm registers in the FMA runs, four-operand vfmsubpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfmsub213pd reading (%r8).
    280 ; CHECK-LABEL: test_x86_fma_vfmsub_pd_256:
    281 ; CHECK-NEXT:  # BB#0:
    282 ;
    283 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    284 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    285 ; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0
    286 ;
    287 ; CHECK-FMA-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
    288 ;
    289 ; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
    290 ;
    291 ; CHECK-NEXT: retq
    292   %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
    293   ret <4 x double> %res
    294 }
    295 declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
    296 
    297 ; VFNMADD
    298 define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmadd.ss and returns the result.
        ; Expected: vfnmadd213ss in the FMA runs, four-operand vfnmaddss in the FMA4 runs,
        ; and on Windows two vmovaps loads followed by vfnmadd132ss reading (%rdx).
    299 ; CHECK-LABEL: test_x86_fma_vfnmadd_ss:
    300 ; CHECK-NEXT:  # BB#0:
    301 ;
    302 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    303 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    304 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0
    305 ;
    306 ; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0
    307 ;
    308 ; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
    309 ;
    310 ; CHECK-NEXT: retq
    311   %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    312   ret <4 x float> %res
    313 }
    314 
    315 define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Operand-order variant: llvm.x86.fma.vfnmadd.ss is called as (%a1, %a0, %a2).
        ; Expected: vfnmadd213ss targeting %xmm1 plus a vmovaps copy into %xmm0 in the FMA runs,
        ; a single vfnmaddss in the FMA4 runs, and vfnmadd132ss reading (%rcx) on Windows.
    316 ; CHECK-LABEL: test_x86_fma_vfnmadd_bac_ss:
    317 ; CHECK-NEXT:  # BB#0:
    318 ;
    319 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    320 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    321 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0
    322 ;
    323 ; CHECK-FMA-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm1
    324 ; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
    325 ;
    326 ; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
    327 ;
    328 ; CHECK-NEXT: retq
    329   %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
    330   ret <4 x float> %res
    331 }
    332 declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
    333 
    334 define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmadd.sd and returns the result.
        ; Expected: vfnmadd213sd in the FMA runs, four-operand vfnmaddsd in the FMA4 runs,
        ; and on Windows two vmovapd loads followed by vfnmadd132sd reading (%rdx).
    335 ; CHECK-LABEL: test_x86_fma_vfnmadd_sd:
    336 ; CHECK-NEXT:  # BB#0:
    337 ;
    338 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    339 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    340 ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rdx), %xmm1, %xmm0
    341 ;
    342 ; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0
    343 ;
    344 ; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
    345 ;
    346 ; CHECK-NEXT: retq
    347   %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    348   ret <2 x double> %res
    349 }
    350 
    351 define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Operand-order variant: llvm.x86.fma.vfnmadd.sd is called as (%a1, %a0, %a2).
        ; Expected: vfnmadd213sd targeting %xmm1 plus a vmovapd copy into %xmm0 in the FMA runs,
        ; a single vfnmaddsd in the FMA4 runs, and vfnmadd132sd reading (%rcx) on Windows.
    352 ; CHECK-LABEL: test_x86_fma_vfnmadd_bac_sd:
    353 ; CHECK-NEXT:  # BB#0:
    354 ;
    355 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    356 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    357 ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rcx), %xmm1, %xmm0
    358 ;
    359 ; CHECK-FMA-NEXT:    vfnmadd213sd %xmm2, %xmm0, %xmm1
    360 ; CHECK-FMA-NEXT:    vmovapd	%xmm1, %xmm0
    361 ;
    362 ; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0
    363 ;
    364 ; CHECK-NEXT: retq
    365   %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
    366   ret <2 x double> %res
    367 }
    368 declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
    369 
    370 define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmadd.ps (<4 x float>) and returns the result.
        ; Expected: vfnmadd213ps on %xmm registers in the FMA runs, four-operand vfnmaddps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfnmadd213ps reading (%r8).
    371 ; CHECK-LABEL: test_x86_fma_vfnmadd_ps:
    372 ; CHECK-NEXT:  # BB#0:
    373 ;
    374 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    375 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    376 ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0
    377 ;
    378 ; CHECK-FMA-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
    379 ;
    380 ; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
    381 ;
    382 ; CHECK-NEXT: retq
    383   %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    384   ret <4 x float> %res
    385 }
    386 declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
    387 
    388 define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmadd.pd (<2 x double>) and returns the result.
        ; Expected: vfnmadd213pd on %xmm registers in the FMA runs, four-operand vfnmaddpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfnmadd213pd reading (%r8).
    389 ; CHECK-LABEL: test_x86_fma_vfnmadd_pd:
    390 ; CHECK-NEXT:  # BB#0:
    391 ;
    392 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    393 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    394 ; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0
    395 ;
    396 ; CHECK-FMA-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
    397 ;
    398 ; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
    399 ;
    400 ; CHECK-NEXT: retq
    401   %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    402   ret <2 x double> %res
    403 }
    404 declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
    405 
    406 define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmadd.ps.256 (<8 x float>) and returns the result.
        ; Expected: vfnmadd213ps on %ymm registers in the FMA runs, four-operand vfnmaddps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfnmadd213ps reading (%r8).
    407 ; CHECK-LABEL: test_x86_fma_vfnmadd_ps_256:
    408 ; CHECK-NEXT:  # BB#0:
    409 ;
    410 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    411 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    412 ; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0
    413 ;
    414 ; CHECK-FMA-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
    415 ;
    416 ; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
    417 ;
    418 ; CHECK-NEXT: retq
    419   %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
    420   ret <8 x float> %res
    421 }
    422 declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
    423 
    424 define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmadd.pd.256 (<4 x double>) and returns the result.
        ; Expected: vfnmadd213pd on %ymm registers in the FMA runs, four-operand vfnmaddpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfnmadd213pd reading (%r8).
    425 ; CHECK-LABEL: test_x86_fma_vfnmadd_pd_256:
    426 ; CHECK-NEXT:  # BB#0:
    427 ;
    428 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    429 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    430 ; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0
    431 ;
    432 ; CHECK-FMA-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
    433 ;
    434 ; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
    435 ;
    436 ; CHECK-NEXT: retq
    437   %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
    438   ret <4 x double> %res
    439 }
    440 declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
    441 
    442 ; VFNMSUB
    443 define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmsub.ss and returns the result.
        ; Expected: vfnmsub213ss in the FMA runs, four-operand vfnmsubss in the FMA4 runs,
        ; and on Windows two vmovaps loads followed by vfnmsub132ss reading (%rdx).
    444 ; CHECK-LABEL: test_x86_fma_vfnmsub_ss:
    445 ; CHECK-NEXT:  # BB#0:
    446 ;
    447 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    448 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    449 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0
    450 ;
    451 ; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0
    452 ;
    453 ; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
    454 ;
    455 ; CHECK-NEXT: retq
    456   %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    457   ret <4 x float> %res
    458 }
    459 
    460 define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Operand-order variant: llvm.x86.fma.vfnmsub.ss is called as (%a1, %a0, %a2).
        ; Expected: vfnmsub213ss targeting %xmm1 plus a vmovaps copy into %xmm0 in the FMA runs,
        ; a single vfnmsubss in the FMA4 runs, and vfnmsub132ss reading (%rcx) on Windows.
    461 ; CHECK-LABEL: test_x86_fma_vfnmsub_bac_ss:
    462 ; CHECK-NEXT:  # BB#0:
    463 ;
    464 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    465 ; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    466 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0
    467 ;
    468 ; CHECK-FMA-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm1
    469 ; CHECK-FMA-NEXT:    vmovaps	%xmm1, %xmm0
    470 ;
    471 ; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0
    472 ;
    473 ; CHECK-NEXT: retq
    474   %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
    475   ret <4 x float> %res
    476 }
    477 declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
    478 
    479 define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmsub.sd and returns the result.
        ; Expected: vfnmsub213sd in the FMA runs, four-operand vfnmsubsd in the FMA4 runs,
        ; and on Windows two vmovapd loads followed by vfnmsub132sd reading (%rdx).
    480 ; CHECK-LABEL: test_x86_fma_vfnmsub_sd:
    481 ; CHECK-NEXT:  # BB#0:
    482 ;
    483 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    484 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
    485 ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rdx), %xmm1, %xmm0
    486 ;
    487 ; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0
    488 ;
    489 ; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
    490 ;
    491 ; CHECK-NEXT: retq
    492   %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    493   ret <2 x double> %res
    494 }
    495 
    496 define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Operand-order variant: llvm.x86.fma.vfnmsub.sd is called as (%a1, %a0, %a2).
        ; Expected: vfnmsub213sd targeting %xmm1 plus a vmovapd copy into %xmm0 in the FMA runs,
        ; a single vfnmsubsd in the FMA4 runs, and vfnmsub132sd reading (%rcx) on Windows.
    497 ; CHECK-LABEL: test_x86_fma_vfnmsub_bac_sd:
    498 ; CHECK-NEXT:  # BB#0:
    499 ;
    500 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    501 ; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
    502 ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rcx), %xmm1, %xmm0
    503 ;
    504 ; CHECK-FMA-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm1
    505 ; CHECK-FMA-NEXT:    vmovapd	%xmm1, %xmm0
    506 ;
    507 ; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0
    508 ;
    509 ; CHECK-NEXT: retq
    510   %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
    511   ret <2 x double> %res
    512 }
    513 declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
    514 
    515 define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmsub.ps (<4 x float>) and returns the result.
        ; Expected: vfnmsub213ps on %xmm registers in the FMA runs, four-operand vfnmsubps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfnmsub213ps reading (%r8).
    516 ; CHECK-LABEL: test_x86_fma_vfnmsub_ps:
    517 ; CHECK-NEXT:  # BB#0:
    518 ;
    519 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    520 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    521 ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0
    522 ;
    523 ; CHECK-FMA-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
    524 ;
    525 ; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
    526 ;
    527 ; CHECK-NEXT: retq
    528   %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    529   ret <4 x float> %res
    530 }
    531 declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
    532 
    533 define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmsub.pd (<2 x double>) and returns the result.
        ; Expected: vfnmsub213pd on %xmm registers in the FMA runs, four-operand vfnmsubpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfnmsub213pd reading (%r8).
    534 ; CHECK-LABEL: test_x86_fma_vfnmsub_pd:
    535 ; CHECK-NEXT:  # BB#0:
    536 ;
    537 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    538 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    539 ; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0
    540 ;
    541 ; CHECK-FMA-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
    542 ;
    543 ; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
    544 ;
    545 ; CHECK-NEXT: retq
    546   %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    547   ret <2 x double> %res
    548 }
    549 declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
    550 
    551 define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmsub.ps.256 (<8 x float>) and returns the result.
        ; Expected: vfnmsub213ps on %ymm registers in the FMA runs, four-operand vfnmsubps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfnmsub213ps reading (%r8).
    552 ; CHECK-LABEL: test_x86_fma_vfnmsub_ps_256:
    553 ; CHECK-NEXT:  # BB#0:
    554 ;
    555 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    556 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    557 ; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0
    558 ;
    559 ; CHECK-FMA-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
    560 ;
    561 ; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
    562 ;
    563 ; CHECK-NEXT: retq
    564   %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
    565   ret <8 x float> %res
    566 }
    567 declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
    568 
    569 define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfnmsub.pd.256 (<4 x double>) and returns the result.
        ; Expected: vfnmsub213pd on %ymm registers in the FMA runs, four-operand vfnmsubpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfnmsub213pd reading (%r8).
    570 ; CHECK-LABEL: test_x86_fma_vfnmsub_pd_256:
    571 ; CHECK-NEXT:  # BB#0:
    572 ;
    573 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    574 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    575 ; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0
    576 ;
    577 ; CHECK-FMA-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
    578 ;
    579 ; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
    580 ;
    581 ; CHECK-NEXT: retq
    582   %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
    583   ret <4 x double> %res
    584 }
    585 declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
    586 
    587 ; VFMADDSUB
    588 define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmaddsub.ps (<4 x float>) and returns the result.
        ; Expected: vfmaddsub213ps on %xmm registers in the FMA runs, four-operand vfmaddsubps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfmaddsub213ps reading (%r8).
    589 ; CHECK-LABEL: test_x86_fma_vfmaddsub_ps:
    590 ; CHECK-NEXT:  # BB#0:
    591 ;
    592 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    593 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    594 ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0
    595 ;
    596 ; CHECK-FMA-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
    597 ;
    598 ; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
    599 ;
    600 ; CHECK-NEXT: retq
    601   %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    602   ret <4 x float> %res
    603 }
    604 declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
    605 
    606 define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmaddsub.pd (<2 x double>) and returns the result.
        ; Expected: vfmaddsub213pd on %xmm registers in the FMA runs, four-operand vfmaddsubpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfmaddsub213pd reading (%r8).
    607 ; CHECK-LABEL: test_x86_fma_vfmaddsub_pd:
    608 ; CHECK-NEXT:  # BB#0:
    609 ;
    610 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    611 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    612 ; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0
    613 ;
    614 ; CHECK-FMA-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
    615 ;
    616 ; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
    617 ;
    618 ; CHECK-NEXT: retq
    619   %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    620   ret <2 x double> %res
    621 }
    622 declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
    623 
    624 define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmaddsub.ps.256 (<8 x float>) and returns the result.
        ; Expected: vfmaddsub213ps on %ymm registers in the FMA runs, four-operand vfmaddsubps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfmaddsub213ps reading (%r8).
    625 ; CHECK-LABEL: test_x86_fma_vfmaddsub_ps_256:
    626 ; CHECK-NEXT:  # BB#0:
    627 ;
    628 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    629 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    630 ; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0
    631 ;
    632 ; CHECK-FMA-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
    633 ;
    634 ; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
    635 ;
    636 ; CHECK-NEXT: retq
    637   %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
    638   ret <8 x float> %res
    639 }
    640 declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
    641 
    642 define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmaddsub.pd.256 (<4 x double>) and returns the result.
        ; Expected: vfmaddsub213pd on %ymm registers in the FMA runs, four-operand vfmaddsubpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfmaddsub213pd reading (%r8).
    643 ; CHECK-LABEL: test_x86_fma_vfmaddsub_pd_256:
    644 ; CHECK-NEXT:  # BB#0:
    645 ;
    646 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    647 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    648 ; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0
    649 ;
    650 ; CHECK-FMA-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
    651 ;
    652 ; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
    653 ;
    654 ; CHECK-NEXT: retq
    655   %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
    656   ret <4 x double> %res
    657 }
    658 declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
    659 
    660 ; VFMSUBADD
    661 define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsubadd.ps (<4 x float>) and returns the result.
        ; Expected: vfmsubadd213ps on %xmm registers in the FMA runs, four-operand vfmsubaddps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfmsubadd213ps reading (%r8).
    662 ; CHECK-LABEL: test_x86_fma_vfmsubadd_ps:
    663 ; CHECK-NEXT:  # BB#0:
    664 ;
    665 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    666 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
    667 ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0
    668 ;
    669 ; CHECK-FMA-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0
    670 ;
    671 ; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
    672 ;
    673 ; CHECK-NEXT: retq
    674   %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
    675   ret <4 x float> %res
    676 }
    677 declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
    678 
    679 define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsubadd.pd (<2 x double>) and returns the result.
        ; Expected: vfmsubadd213pd on %xmm registers in the FMA runs, four-operand vfmsubaddpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfmsubadd213pd reading (%r8).
    680 ; CHECK-LABEL: test_x86_fma_vfmsubadd_pd:
    681 ; CHECK-NEXT:  # BB#0:
    682 ;
    683 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    684 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
    685 ; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0
    686 ;
    687 ; CHECK-FMA-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
    688 ;
    689 ; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
    690 ;
    691 ; CHECK-NEXT: retq
    692   %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
    693   ret <2 x double> %res
    694 }
    695 declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
    696 
    697 define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsubadd.ps.256 (<8 x float>) and returns the result.
        ; Expected: vfmsubadd213ps on %ymm registers in the FMA runs, four-operand vfmsubaddps in the
        ; FMA4 runs, and on Windows two vmovaps loads followed by vfmsubadd213ps reading (%r8).
    698 ; CHECK-LABEL: test_x86_fma_vfmsubadd_ps_256:
    699 ; CHECK-NEXT:  # BB#0:
    700 ;
    701 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    702 ; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
    703 ; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0
    704 ;
    705 ; CHECK-FMA-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0
    706 ;
    707 ; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
    708 ;
    709 ; CHECK-NEXT: retq
    710   %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
    711   ret <8 x float> %res
    712 }
    713 declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
    714 
    715 define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
        ; Forwards %a0, %a1, %a2 unchanged to llvm.x86.fma.vfmsubadd.pd.256 (<4 x double>) and returns the result.
        ; Expected: vfmsubadd213pd on %ymm registers in the FMA runs, four-operand vfmsubaddpd in the
        ; FMA4 runs, and on Windows two vmovapd loads followed by vfmsubadd213pd reading (%r8).
    716 ; CHECK-LABEL: test_x86_fma_vfmsubadd_pd_256:
    717 ; CHECK-NEXT:  # BB#0:
    718 ;
    719 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    720 ; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
    721 ; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0
    722 ;
    723 ; CHECK-FMA-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0
    724 ;
    725 ; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
    726 ;
    727 ; CHECK-NEXT: retq
    728   %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
    729   ret <4 x double> %res
    730 }
    731 declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
    732 
    733 attributes #0 = { nounwind } ; attribute group referenced as #0 by the test functions above
    734