Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s
      5 
      6 ; This test checks the fusing of MUL + ADDSUB to FMADDSUB, and of MUL + SUBADD to FMSUBADD.
      7 
      ; Shuffle form, <2 x double>: %Sub = A*B - C and %Add = A*B + C are blended
      ; so that lane 0 comes from %Sub and lane 1 from %Add.
      ; Every run configuration is expected to emit one fused xmm instruction.
      8 define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B,  <2 x double> %C) #0 {
      9 ; FMA3-LABEL: mul_addsub_pd128:
     10 ; FMA3:       # %bb.0: # %entry
     11 ; FMA3-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
     12 ; FMA3-NEXT:    retq
     13 ;
     14 ; FMA4-LABEL: mul_addsub_pd128:
     15 ; FMA4:       # %bb.0: # %entry
     16 ; FMA4-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
     17 ; FMA4-NEXT:    retq
     18 entry:
     19   %AB = fmul <2 x double> %A, %B
     20   %Sub = fsub <2 x double> %AB, %C
     21   %Add = fadd <2 x double> %AB, %C
          ; Mask index 3 selects element 1 of the second operand (%Add).
     22   %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
     23   ret <2 x double> %Addsub
     24 }
     25 
     ; Shuffle form, <4 x float>: even lanes are taken from %Sub (A*B - C) and
     ; odd lanes from %Add (A*B + C).
     ; Every run configuration is expected to emit one fused xmm instruction.
     26 define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
     27 ; FMA3-LABEL: mul_addsub_ps128:
     28 ; FMA3:       # %bb.0: # %entry
     29 ; FMA3-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
     30 ; FMA3-NEXT:    retq
     31 ;
     32 ; FMA4-LABEL: mul_addsub_ps128:
     33 ; FMA4:       # %bb.0: # %entry
     34 ; FMA4-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
     35 ; FMA4-NEXT:    retq
     36 entry:
     37   %AB = fmul <4 x float> %A, %B
     38   %Sub = fsub <4 x float> %AB, %C
     39   %Add = fadd <4 x float> %AB, %C
          ; Mask indices 5 and 7 select elements 1 and 3 of the second operand (%Add).
     40   %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
     41   ret <4 x float> %Addsub
     42 }
     43 
     ; Shuffle form, <4 x double>: even lanes are taken from %Sub (A*B - C) and
     ; odd lanes from %Add (A*B + C).
     ; Every run configuration is expected to emit one fused ymm instruction.
     44 define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
     45 ; FMA3-LABEL: mul_addsub_pd256:
     46 ; FMA3:       # %bb.0: # %entry
     47 ; FMA3-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
     48 ; FMA3-NEXT:    retq
     49 ;
     50 ; FMA4-LABEL: mul_addsub_pd256:
     51 ; FMA4:       # %bb.0: # %entry
     52 ; FMA4-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
     53 ; FMA4-NEXT:    retq
     54 entry:
     55   %AB = fmul <4 x double> %A, %B
     56   %Sub = fsub <4 x double> %AB, %C
     57   %Add = fadd <4 x double> %AB, %C
          ; Mask indices 5 and 7 select elements 1 and 3 of the second operand (%Add).
     58   %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
     59   ret <4 x double> %Addsub
     60 }
     61 
     ; Shuffle form, <8 x float>: even lanes are taken from %Sub (A*B - C) and
     ; odd lanes from %Add (A*B + C).
     ; Every run configuration is expected to emit one fused ymm instruction.
     62 define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
     63 ; FMA3-LABEL: mul_addsub_ps256:
     64 ; FMA3:       # %bb.0: # %entry
     65 ; FMA3-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
     66 ; FMA3-NEXT:    retq
     67 ;
     68 ; FMA4-LABEL: mul_addsub_ps256:
     69 ; FMA4:       # %bb.0: # %entry
     70 ; FMA4-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
     71 ; FMA4-NEXT:    retq
     72 entry:
     73   %AB = fmul <8 x float> %A, %B
     74   %Sub = fsub <8 x float> %AB, %C
     75   %Add = fadd <8 x float> %AB, %C
          ; Mask indices 8 and above select from the second operand (%Add): odd lanes only.
     76   %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
     77   ret <8 x float> %Addsub
     78 }
     79 
     ; Shuffle form, <8 x double>: even lanes are taken from %Sub (A*B - C) and
     ; odd lanes from %Add (A*B + C).
     ; The -mattr=+fma and -mattr=+fma4 runs expect two fused ymm instructions;
     ; the -mattr=+fma,+avx512f run expects a single fused zmm instruction.
     80 define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
     81 ; FMA3_256-LABEL: mul_addsub_pd512:
     82 ; FMA3_256:       # %bb.0: # %entry
     83 ; FMA3_256-NEXT:    vfmaddsub213pd %ymm4, %ymm2, %ymm0
     84 ; FMA3_256-NEXT:    vfmaddsub213pd %ymm5, %ymm3, %ymm1
     85 ; FMA3_256-NEXT:    retq
     86 ;
     87 ; FMA3_512-LABEL: mul_addsub_pd512:
     88 ; FMA3_512:       # %bb.0: # %entry
     89 ; FMA3_512-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
     90 ; FMA3_512-NEXT:    retq
     91 ;
     92 ; FMA4-LABEL: mul_addsub_pd512:
     93 ; FMA4:       # %bb.0: # %entry
     94 ; FMA4-NEXT:    vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
     95 ; FMA4-NEXT:    vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
     96 ; FMA4-NEXT:    retq
     97 entry:
     98   %AB = fmul <8 x double> %A, %B
     99   %Sub = fsub <8 x double> %AB, %C
    100   %Add = fadd <8 x double> %AB, %C
          ; Mask indices 8 and above select from the second operand (%Add): odd lanes only.
    101   %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
    102   ret <8 x double> %Addsub
    103 }
    104 
    ; Shuffle form, <16 x float>: even lanes are taken from %Sub (A*B - C) and
    ; odd lanes from %Add (A*B + C).
    ; The -mattr=+fma and -mattr=+fma4 runs expect two fused ymm instructions;
    ; the -mattr=+fma,+avx512f run expects a single fused zmm instruction.
    105 define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
    106 ; FMA3_256-LABEL: mul_addsub_ps512:
    107 ; FMA3_256:       # %bb.0: # %entry
    108 ; FMA3_256-NEXT:    vfmaddsub213ps %ymm4, %ymm2, %ymm0
    109 ; FMA3_256-NEXT:    vfmaddsub213ps %ymm5, %ymm3, %ymm1
    110 ; FMA3_256-NEXT:    retq
    111 ;
    112 ; FMA3_512-LABEL: mul_addsub_ps512:
    113 ; FMA3_512:       # %bb.0: # %entry
    114 ; FMA3_512-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
    115 ; FMA3_512-NEXT:    retq
    116 ;
    117 ; FMA4-LABEL: mul_addsub_ps512:
    118 ; FMA4:       # %bb.0: # %entry
    119 ; FMA4-NEXT:    vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
    120 ; FMA4-NEXT:    vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
    121 ; FMA4-NEXT:    retq
    122 entry:
    123   %AB = fmul <16 x float> %A, %B
    124   %Sub = fsub <16 x float> %AB, %C
    125   %Add = fadd <16 x float> %AB, %C
          ; Mask indices 16 and above select from the second operand (%Add): odd lanes only.
    126   %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
    127   ret <16 x float> %Addsub
    128 }
    129 
    ; Build-vector form, <4 x float>: the product %C * %D is split into scalars,
    ; %B is subtracted in even lanes and added in odd lanes, and the result is
    ; reassembled with insertelement.
    ; Every run configuration is expected to emit one fused xmm instruction.
    130 define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
    131 ; FMA3-LABEL: buildvector_mul_addsub_ps128:
    132 ; FMA3:       # %bb.0: # %bb
    133 ; FMA3-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
    134 ; FMA3-NEXT:    retq
    135 ;
    136 ; FMA4-LABEL: buildvector_mul_addsub_ps128:
    137 ; FMA4:       # %bb.0: # %bb
    138 ; FMA4-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
    139 ; FMA4-NEXT:    retq
    140 bb:
    141   %A = fmul <4 x float> %C, %D
          ; Even lanes (0, 2): product minus %B.
    142   %A0 = extractelement <4 x float> %A, i32 0
    143   %B0 = extractelement <4 x float> %B, i32 0
    144   %sub0 = fsub float %A0, %B0
    145   %A2 = extractelement <4 x float> %A, i32 2
    146   %B2 = extractelement <4 x float> %B, i32 2
    147   %sub2 = fsub float %A2, %B2
          ; Odd lanes (1, 3): product plus %B.
    148   %A1 = extractelement <4 x float> %A, i32 1
    149   %B1 = extractelement <4 x float> %B, i32 1
    150   %add1 = fadd float %A1, %B1
    151   %A3 = extractelement <4 x float> %A, i32 3
    152   %B3 = extractelement <4 x float> %B, i32 3
    153   %add3 = fadd float %A3, %B3
          ; Reassemble the result vector lane by lane.
    154   %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
    155   %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
    156   %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
    157   %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
    158   ret <4 x float> %vecinsert4
    159 }
    160 
    ; Build-vector form, <2 x double>: the product %C * %D is split into scalars,
    ; %B is subtracted in lane 0 and added in lane 1, and the result is
    ; reassembled with insertelement.
    ; Every run configuration is expected to emit one fused xmm instruction.
    161 define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
    162 ; FMA3-LABEL: buildvector_mul_addsub_pd128:
    163 ; FMA3:       # %bb.0: # %bb
    164 ; FMA3-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
    165 ; FMA3-NEXT:    retq
    166 ;
    167 ; FMA4-LABEL: buildvector_mul_addsub_pd128:
    168 ; FMA4:       # %bb.0: # %bb
    169 ; FMA4-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
    170 ; FMA4-NEXT:    retq
    171 bb:
    172   %A = fmul <2 x double> %C, %D
          ; Lane 0: product minus %B.
    173   %A0 = extractelement <2 x double> %A, i32 0
    174   %B0 = extractelement <2 x double> %B, i32 0
    175   %sub0 = fsub double %A0, %B0
          ; Lane 1: product plus %B.
    176   %A1 = extractelement <2 x double> %A, i32 1
    177   %B1 = extractelement <2 x double> %B, i32 1
    178   %add1 = fadd double %A1, %B1
    179   %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
    180   %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
    181   ret <2 x double> %vecinsert2
    182 }
    183 
    ; Build-vector form, <8 x float>: the product %C * %D is split into scalars,
    ; %B is subtracted in even lanes and added in odd lanes, and the result is
    ; reassembled with insertelement.
    ; Every run configuration is expected to emit one fused ymm instruction.
    184 define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
    185 ; FMA3-LABEL: buildvector_mul_addsub_ps256:
    186 ; FMA3:       # %bb.0: # %bb
    187 ; FMA3-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
    188 ; FMA3-NEXT:    retq
    189 ;
    190 ; FMA4-LABEL: buildvector_mul_addsub_ps256:
    191 ; FMA4:       # %bb.0: # %bb
    192 ; FMA4-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
    193 ; FMA4-NEXT:    retq
    194 bb:
    195   %A = fmul <8 x float> %C, %D
          ; Even lanes (0, 2, 4, 6): product minus %B.
    196   %A0 = extractelement <8 x float> %A, i32 0
    197   %B0 = extractelement <8 x float> %B, i32 0
    198   %sub0 = fsub float %A0, %B0
    199   %A2 = extractelement <8 x float> %A, i32 2
    200   %B2 = extractelement <8 x float> %B, i32 2
    201   %sub2 = fsub float %A2, %B2
    202   %A4 = extractelement <8 x float> %A, i32 4
    203   %B4 = extractelement <8 x float> %B, i32 4
    204   %sub4 = fsub float %A4, %B4
    205   %A6 = extractelement <8 x float> %A, i32 6
    206   %B6 = extractelement <8 x float> %B, i32 6
    207   %sub6 = fsub float %A6, %B6
          ; Odd lanes (1, 3, 5, 7): product plus %B.
    208   %A1 = extractelement <8 x float> %A, i32 1
    209   %B1 = extractelement <8 x float> %B, i32 1
    210   %add1 = fadd float %A1, %B1
    211   %A3 = extractelement <8 x float> %A, i32 3
    212   %B3 = extractelement <8 x float> %B, i32 3
    213   %add3 = fadd float %A3, %B3
    214   %A5 = extractelement <8 x float> %A, i32 5
    215   %B5 = extractelement <8 x float> %B, i32 5
    216   %add5 = fadd float %A5, %B5
    217   %A7 = extractelement <8 x float> %A, i32 7
    218   %B7 = extractelement <8 x float> %B, i32 7
    219   %add7 = fadd float %A7, %B7
          ; Reassemble the result vector lane by lane.
    220   %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
    221   %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
    222   %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
    223   %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
    224   %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
    225   %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
    226   %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
    227   %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
    228   ret <8 x float> %vecinsert8
    229 }
    230 
    ; Build-vector form, <4 x double>: the product %C * %D is split into scalars,
    ; %B is subtracted in even lanes and added in odd lanes, and the result is
    ; reassembled with insertelement.
    ; Every run configuration is expected to emit one fused ymm instruction.
    231 define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
    232 ; FMA3-LABEL: buildvector_mul_addsub_pd256:
    233 ; FMA3:       # %bb.0: # %bb
    234 ; FMA3-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
    235 ; FMA3-NEXT:    retq
    236 ;
    237 ; FMA4-LABEL: buildvector_mul_addsub_pd256:
    238 ; FMA4:       # %bb.0: # %bb
    239 ; FMA4-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
    240 ; FMA4-NEXT:    retq
    241 bb:
    242   %A = fmul <4 x double> %C, %D
          ; Even lanes (0, 2): product minus %B.
    243   %A0 = extractelement <4 x double> %A, i32 0
    244   %B0 = extractelement <4 x double> %B, i32 0
    245   %sub0 = fsub double %A0, %B0
    246   %A2 = extractelement <4 x double> %A, i32 2
    247   %B2 = extractelement <4 x double> %B, i32 2
    248   %sub2 = fsub double %A2, %B2
          ; Odd lanes (1, 3): product plus %B.
    249   %A1 = extractelement <4 x double> %A, i32 1
    250   %B1 = extractelement <4 x double> %B, i32 1
    251   %add1 = fadd double %A1, %B1
    252   %A3 = extractelement <4 x double> %A, i32 3
    253   %B3 = extractelement <4 x double> %B, i32 3
    254   %add3 = fadd double %A3, %B3
          ; Reassemble the result vector lane by lane.
    255   %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
    256   %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
    257   %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
    258   %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
    259   ret <4 x double> %vecinsert4
    260 }
    261 
    ; Build-vector form, <16 x float>, with two result lanes left undef: the
    ; product %C * %D is split into scalars, %B is subtracted in even lanes and
    ; added in odd lanes, and the result is reassembled with insertelement.
    ; Lanes 5 and 12 are never inserted, so they stay undef in the returned vector.
    ; The -mattr=+fma and -mattr=+fma4 runs expect two fused ymm instructions;
    ; the -mattr=+fma,+avx512f run expects a single fused zmm instruction.
    262 define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
    263 ; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
    264 ; FMA3_256:       # %bb.0: # %bb
    265 ; FMA3_256-NEXT:    vfmaddsub213ps %ymm4, %ymm2, %ymm0
    266 ; FMA3_256-NEXT:    vfmaddsub213ps %ymm5, %ymm3, %ymm1
    267 ; FMA3_256-NEXT:    retq
    268 ;
    269 ; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
    270 ; FMA3_512:       # %bb.0: # %bb
    271 ; FMA3_512-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
    272 ; FMA3_512-NEXT:    retq
    273 ;
    274 ; FMA4-LABEL: buildvector_mul_addsub_ps512:
    275 ; FMA4:       # %bb.0: # %bb
    276 ; FMA4-NEXT:    vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
    277 ; FMA4-NEXT:    vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
    278 ; FMA4-NEXT:    retq
    279 bb:
    280   %A = fmul <16 x float> %C, %D
          ; Even lanes: product minus %B.
    281   %A0 = extractelement <16 x float> %A, i32 0
    282   %B0 = extractelement <16 x float> %B, i32 0
    283   %sub0 = fsub float %A0, %B0
    284   %A2 = extractelement <16 x float> %A, i32 2
    285   %B2 = extractelement <16 x float> %B, i32 2
    286   %sub2 = fsub float %A2, %B2
    287   %A4 = extractelement <16 x float> %A, i32 4
    288   %B4 = extractelement <16 x float> %B, i32 4
    289   %sub4 = fsub float %A4, %B4
    290   %A6 = extractelement <16 x float> %A, i32 6
    291   %B6 = extractelement <16 x float> %B, i32 6
    292   %sub6 = fsub float %A6, %B6
    293   %A8 = extractelement <16 x float> %A, i32 8
    294   %B8 = extractelement <16 x float> %B, i32 8
    295   %sub8 = fsub float %A8, %B8
    296   %A10 = extractelement <16 x float> %A, i32 10
    297   %B10 = extractelement <16 x float> %B, i32 10
    298   %sub10 = fsub float %A10, %B10
          ; %sub12 is computed here but never inserted into the result.
    299   %A12 = extractelement <16 x float> %A, i32 12
    300   %B12 = extractelement <16 x float> %B, i32 12
    301   %sub12 = fsub float %A12, %B12
    302   %A14 = extractelement <16 x float> %A, i32 14
    303   %B14 = extractelement <16 x float> %B, i32 14
    304   %sub14 = fsub float %A14, %B14
          ; Odd lanes: product plus %B.
    305   %A1 = extractelement <16 x float> %A, i32 1
    306   %B1 = extractelement <16 x float> %B, i32 1
    307   %add1 = fadd float %A1, %B1
    308   %A3 = extractelement <16 x float> %A, i32 3
    309   %B3 = extractelement <16 x float> %B, i32 3
    310   %add3 = fadd float %A3, %B3
          ; %add5 is computed here but never inserted into the result.
    311   %A5 = extractelement <16 x float> %A, i32 5
    312   %B5 = extractelement <16 x float> %B, i32 5
    313   %add5 = fadd float %A5, %B5
    314   %A7 = extractelement <16 x float> %A, i32 7
    315   %B7 = extractelement <16 x float> %B, i32 7
    316   %add7 = fadd float %A7, %B7
    317   %A9 = extractelement <16 x float> %A, i32 9
    318   %B9 = extractelement <16 x float> %B, i32 9
    319   %add9 = fadd float %A9, %B9
    320   %A11 = extractelement <16 x float> %A, i32 11
    321   %B11 = extractelement <16 x float> %B, i32 11
    322   %add11 = fadd float %A11, %B11
    323   %A13 = extractelement <16 x float> %A, i32 13
    324   %B13 = extractelement <16 x float> %B, i32 13
    325   %add13 = fadd float %A13, %B13
    326   %A15 = extractelement <16 x float> %A, i32 15
    327   %B15 = extractelement <16 x float> %B, i32 15
    328   %add15 = fadd float %A15, %B15
          ; Reassemble the result, skipping lanes 5 and 12.
    329   %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
    330   %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
    331   %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
    332   %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
    333   %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
    334   ; element 5 is undef
    335   %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
    336   %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
    337   %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
    338   %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
    339   %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
    340   %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
    341   ; element 12 is undef
    342   %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
    343   %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
    344   %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
    345   ret <16 x float> %vecinsert16
    346 }
    347 
    ; Build-vector form, <8 x double>, with one result lane left undef: the
    ; product %C * %D is split into scalars, %B is subtracted in even lanes and
    ; added in odd lanes, and the result is reassembled with insertelement.
    ; Lane 5 is neither computed nor inserted, so it stays undef in the result.
    ; The -mattr=+fma and -mattr=+fma4 runs expect two fused ymm instructions;
    ; the -mattr=+fma,+avx512f run expects a single fused zmm instruction.
    348 define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
    349 ; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
    350 ; FMA3_256:       # %bb.0: # %bb
    351 ; FMA3_256-NEXT:    vfmaddsub213pd %ymm4, %ymm2, %ymm0
    352 ; FMA3_256-NEXT:    vfmaddsub213pd %ymm5, %ymm3, %ymm1
    353 ; FMA3_256-NEXT:    retq
    354 ;
    355 ; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
    356 ; FMA3_512:       # %bb.0: # %bb
    357 ; FMA3_512-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
    358 ; FMA3_512-NEXT:    retq
    359 ;
    360 ; FMA4-LABEL: buildvector_mul_addsub_pd512:
    361 ; FMA4:       # %bb.0: # %bb
    362 ; FMA4-NEXT:    vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
    363 ; FMA4-NEXT:    vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
    364 ; FMA4-NEXT:    retq
    365 bb:
    366   %A = fmul <8 x double> %C, %D
          ; Even lanes (0, 2, 4, 6): product minus %B.
    367   %A0 = extractelement <8 x double> %A, i32 0
    368   %B0 = extractelement <8 x double> %B, i32 0
    369   %sub0 = fsub double %A0, %B0
    370   %A2 = extractelement <8 x double> %A, i32 2
    371   %B2 = extractelement <8 x double> %B, i32 2
    372   %sub2 = fsub double %A2, %B2
    373   %A4 = extractelement <8 x double> %A, i32 4
    374   %B4 = extractelement <8 x double> %B, i32 4
    375   %sub4 = fsub double %A4, %B4
    376   %A6 = extractelement <8 x double> %A, i32 6
    377   %B6 = extractelement <8 x double> %B, i32 6
    378   %sub6 = fsub double %A6, %B6
          ; Odd lanes (1, 3, 7): product plus %B. Lane 5 is intentionally absent.
    379   %A1 = extractelement <8 x double> %A, i32 1
    380   %B1 = extractelement <8 x double> %B, i32 1
    381   %add1 = fadd double %A1, %B1
    382   %A3 = extractelement <8 x double> %A, i32 3
    383   %B3 = extractelement <8 x double> %B, i32 3
    384   %add3 = fadd double %A3, %B3
    385   %A7 = extractelement <8 x double> %A, i32 7
    386   %B7 = extractelement <8 x double> %B, i32 7
    387   %add7 = fadd double %A7, %B7
          ; Reassemble the result, skipping lane 5.
    388   %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
    389   %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
    390   %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
    391   %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
    392   %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
    393   ; element 5 is undef
    394   %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
    395   %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
    396   ret <8 x double> %vecinsert8
    397 }
    398 
    ; Build-vector form of the opposite pattern, <4 x float>: the product
    ; %C * %D has %B added in even lanes and subtracted in odd lanes.
    ; Every run configuration is expected to emit one fused vfmsubadd xmm instruction.
    ; NOTE: the value names are the reverse of the operations here: each %subN
    ; is an fadd result and each %addN is an fsub result.
    399 define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
    400 ; FMA3-LABEL: buildvector_mul_subadd_ps128:
    401 ; FMA3:       # %bb.0: # %bb
    402 ; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
    403 ; FMA3-NEXT:    retq
    404 ;
    405 ; FMA4-LABEL: buildvector_mul_subadd_ps128:
    406 ; FMA4:       # %bb.0: # %bb
    407 ; FMA4-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
    408 ; FMA4-NEXT:    retq
    409 bb:
    410   %A = fmul <4 x float> %C, %D
          ; Even lanes (0, 2): product plus %B.
    411   %A0 = extractelement <4 x float> %A, i32 0
    412   %B0 = extractelement <4 x float> %B, i32 0
    413   %sub0 = fadd float %A0, %B0
    414   %A2 = extractelement <4 x float> %A, i32 2
    415   %B2 = extractelement <4 x float> %B, i32 2
    416   %sub2 = fadd float %A2, %B2
          ; Odd lanes (1, 3): product minus %B.
    417   %A1 = extractelement <4 x float> %A, i32 1
    418   %B1 = extractelement <4 x float> %B, i32 1
    419   %add1 = fsub float %A1, %B1
    420   %A3 = extractelement <4 x float> %A, i32 3
    421   %B3 = extractelement <4 x float> %B, i32 3
    422   %add3 = fsub float %A3, %B3
          ; Reassemble the result vector lane by lane.
    423   %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
    424   %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
    425   %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
    426   %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
    427   ret <4 x float> %vecinsert4
    428 }
    429 
    ; Build-vector form of the opposite pattern, <2 x double>: the product
    ; %C * %D has %B added in lane 0 and subtracted in lane 1.
    ; Every run configuration is expected to emit one fused vfmsubadd xmm instruction.
    ; NOTE: the value names are the reverse of the operations here: %sub0 is an
    ; fadd result and %add1 is an fsub result.
    430 define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
    431 ; FMA3-LABEL: buildvector_mul_subadd_pd128:
    432 ; FMA3:       # %bb.0: # %bb
    433 ; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2
    434 ; FMA3-NEXT:    retq
    435 ;
    436 ; FMA4-LABEL: buildvector_mul_subadd_pd128:
    437 ; FMA4:       # %bb.0: # %bb
    438 ; FMA4-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
    439 ; FMA4-NEXT:    retq
    440 bb:
    441   %A = fmul <2 x double> %C, %D
          ; Lane 0: product plus %B.
    442   %A0 = extractelement <2 x double> %A, i32 0
    443   %B0 = extractelement <2 x double> %B, i32 0
    444   %sub0 = fadd double %A0, %B0
          ; Lane 1: product minus %B.
    445   %A1 = extractelement <2 x double> %A, i32 1
    446   %B1 = extractelement <2 x double> %B, i32 1
    447   %add1 = fsub double %A1, %B1
    448   %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
    449   %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
    450   ret <2 x double> %vecinsert2
    451 }
    452 
    ; Build-vector form of the opposite pattern, <8 x float>: the product
    ; %C * %D has %B added in even lanes and subtracted in odd lanes.
    ; Every run configuration is expected to emit one fused vfmsubadd ymm instruction.
    ; NOTE: the value names are the reverse of the operations here: each %subN
    ; is an fadd result and each %addN is an fsub result.
    453 define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
    454 ; FMA3-LABEL: buildvector_mul_subadd_ps256:
    455 ; FMA3:       # %bb.0: # %bb
    456 ; FMA3-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
    457 ; FMA3-NEXT:    retq
    458 ;
    459 ; FMA4-LABEL: buildvector_mul_subadd_ps256:
    460 ; FMA4:       # %bb.0: # %bb
    461 ; FMA4-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
    462 ; FMA4-NEXT:    retq
    463 bb:
    464   %A = fmul <8 x float> %C, %D
          ; Even lanes (0, 2, 4, 6): product plus %B.
    465   %A0 = extractelement <8 x float> %A, i32 0
    466   %B0 = extractelement <8 x float> %B, i32 0
    467   %sub0 = fadd float %A0, %B0
    468   %A2 = extractelement <8 x float> %A, i32 2
    469   %B2 = extractelement <8 x float> %B, i32 2
    470   %sub2 = fadd float %A2, %B2
    471   %A4 = extractelement <8 x float> %A, i32 4
    472   %B4 = extractelement <8 x float> %B, i32 4
    473   %sub4 = fadd float %A4, %B4
    474   %A6 = extractelement <8 x float> %A, i32 6
    475   %B6 = extractelement <8 x float> %B, i32 6
    476   %sub6 = fadd float %A6, %B6
          ; Odd lanes (1, 3, 5, 7): product minus %B.
    477   %A1 = extractelement <8 x float> %A, i32 1
    478   %B1 = extractelement <8 x float> %B, i32 1
    479   %add1 = fsub float %A1, %B1
    480   %A3 = extractelement <8 x float> %A, i32 3
    481   %B3 = extractelement <8 x float> %B, i32 3
    482   %add3 = fsub float %A3, %B3
    483   %A5 = extractelement <8 x float> %A, i32 5
    484   %B5 = extractelement <8 x float> %B, i32 5
    485   %add5 = fsub float %A5, %B5
    486   %A7 = extractelement <8 x float> %A, i32 7
    487   %B7 = extractelement <8 x float> %B, i32 7
    488   %add7 = fsub float %A7, %B7
          ; Reassemble the result vector lane by lane.
    489   %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
    490   %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
    491   %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
    492   %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
    493   %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
    494   %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
    495   %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
    496   %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
    497   ret <8 x float> %vecinsert8
    498 }
    499 
    ; Build-vector form of the opposite pattern, <4 x double>: the product
    ; %C * %D has %B added in even lanes and subtracted in odd lanes.
    ; Every run configuration is expected to emit one fused vfmsubadd ymm instruction.
    ; NOTE: the value names are the reverse of the operations here: each %subN
    ; is an fadd result and each %addN is an fsub result.
    500 define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
    501 ; FMA3-LABEL: buildvector_mul_subadd_pd256:
    502 ; FMA3:       # %bb.0: # %bb
    503 ; FMA3-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2
    504 ; FMA3-NEXT:    retq
    505 ;
    506 ; FMA4-LABEL: buildvector_mul_subadd_pd256:
    507 ; FMA4:       # %bb.0: # %bb
    508 ; FMA4-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
    509 ; FMA4-NEXT:    retq
    510 bb:
    511   %A = fmul <4 x double> %C, %D
          ; Even lanes (0, 2): product plus %B.
    512   %A0 = extractelement <4 x double> %A, i32 0
    513   %B0 = extractelement <4 x double> %B, i32 0
    514   %sub0 = fadd double %A0, %B0
    515   %A2 = extractelement <4 x double> %A, i32 2
    516   %B2 = extractelement <4 x double> %B, i32 2
    517   %sub2 = fadd double %A2, %B2
          ; Odd lanes (1, 3): product minus %B.
    518   %A1 = extractelement <4 x double> %A, i32 1
    519   %B1 = extractelement <4 x double> %B, i32 1
    520   %add1 = fsub double %A1, %B1
    521   %A3 = extractelement <4 x double> %A, i32 3
    522   %B3 = extractelement <4 x double> %B, i32 3
    523   %add3 = fsub double %A3, %B3
          ; Reassemble the result vector lane by lane.
    524   %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
    525   %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
    526   %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
    527   %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
    528   ret <4 x double> %vecinsert4
    529 }
    530 
    ; Build-vector form of the opposite pattern, <16 x float>, with two result
    ; lanes left undef: the product %C * %D has %B added in even lanes and
    ; subtracted in odd lanes. Lanes 5 and 12 are never inserted, so they stay
    ; undef in the returned vector.
    ; The -mattr=+fma and -mattr=+fma4 runs expect two fused ymm instructions;
    ; the -mattr=+fma,+avx512f run expects a single fused zmm instruction.
    ; NOTE: the value names are the reverse of the operations here: each %subN
    ; is an fadd result and each %addN is an fsub result.
    531 define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
    532 ; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
    533 ; FMA3_256:       # %bb.0: # %bb
    534 ; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
    535 ; FMA3_256-NEXT:    vfmsubadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
    536 ; FMA3_256-NEXT:    retq
    537 ;
    538 ; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
    539 ; FMA3_512:       # %bb.0: # %bb
    540 ; FMA3_512-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
    541 ; FMA3_512-NEXT:    retq
    542 ;
    543 ; FMA4-LABEL: buildvector_mul_subadd_ps512:
    544 ; FMA4:       # %bb.0: # %bb
    545 ; FMA4-NEXT:    vfmsubaddps %ymm4, %ymm2, %ymm0, %ymm0
    546 ; FMA4-NEXT:    vfmsubaddps %ymm5, %ymm3, %ymm1, %ymm1
    547 ; FMA4-NEXT:    retq
    548 bb:
    549   %A = fmul <16 x float> %C, %D
          ; Even lanes: product plus %B.
    550   %A0 = extractelement <16 x float> %A, i32 0
    551   %B0 = extractelement <16 x float> %B, i32 0
    552   %sub0 = fadd float %A0, %B0
    553   %A2 = extractelement <16 x float> %A, i32 2
    554   %B2 = extractelement <16 x float> %B, i32 2
    555   %sub2 = fadd float %A2, %B2
    556   %A4 = extractelement <16 x float> %A, i32 4
    557   %B4 = extractelement <16 x float> %B, i32 4
    558   %sub4 = fadd float %A4, %B4
    559   %A6 = extractelement <16 x float> %A, i32 6
    560   %B6 = extractelement <16 x float> %B, i32 6
    561   %sub6 = fadd float %A6, %B6
    562   %A8 = extractelement <16 x float> %A, i32 8
    563   %B8 = extractelement <16 x float> %B, i32 8
    564   %sub8 = fadd float %A8, %B8
    565   %A10 = extractelement <16 x float> %A, i32 10
    566   %B10 = extractelement <16 x float> %B, i32 10
    567   %sub10 = fadd float %A10, %B10
          ; %sub12 is computed here but never inserted into the result.
    568   %A12 = extractelement <16 x float> %A, i32 12
    569   %B12 = extractelement <16 x float> %B, i32 12
    570   %sub12 = fadd float %A12, %B12
    571   %A14 = extractelement <16 x float> %A, i32 14
    572   %B14 = extractelement <16 x float> %B, i32 14
    573   %sub14 = fadd float %A14, %B14
          ; Odd lanes: product minus %B.
    574   %A1 = extractelement <16 x float> %A, i32 1
    575   %B1 = extractelement <16 x float> %B, i32 1
    576   %add1 = fsub float %A1, %B1
    577   %A3 = extractelement <16 x float> %A, i32 3
    578   %B3 = extractelement <16 x float> %B, i32 3
    579   %add3 = fsub float %A3, %B3
          ; %add5 is computed here but never inserted into the result.
    580   %A5 = extractelement <16 x float> %A, i32 5
    581   %B5 = extractelement <16 x float> %B, i32 5
    582   %add5 = fsub float %A5, %B5
    583   %A7 = extractelement <16 x float> %A, i32 7
    584   %B7 = extractelement <16 x float> %B, i32 7
    585   %add7 = fsub float %A7, %B7
    586   %A9 = extractelement <16 x float> %A, i32 9
    587   %B9 = extractelement <16 x float> %B, i32 9
    588   %add9 = fsub float %A9, %B9
    589   %A11 = extractelement <16 x float> %A, i32 11
    590   %B11 = extractelement <16 x float> %B, i32 11
    591   %add11 = fsub float %A11, %B11
    592   %A13 = extractelement <16 x float> %A, i32 13
    593   %B13 = extractelement <16 x float> %B, i32 13
    594   %add13 = fsub float %A13, %B13
    595   %A15 = extractelement <16 x float> %A, i32 15
    596   %B15 = extractelement <16 x float> %B, i32 15
    597   %add15 = fsub float %A15, %B15
          ; Reassemble the result, skipping lanes 5 and 12.
    598   %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
    599   %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
    600   %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
    601   %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
    602   %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
    603   ; element 5 is undef
    604   %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
    605   %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
    606   %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
    607   %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
    608   %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
    609   %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
    610   ; element 12 is undef
    611   %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
    612   %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
    613   %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
    614   ret <16 x float> %vecinsert16
    615 }
    616 
    ; Build-vector form of the opposite pattern, <8 x double>, with one result
    ; lane left undef: the product %C * %D has %B added in even lanes and
    ; subtracted in odd lanes. Lane 5 is neither computed nor inserted, so it
    ; stays undef in the returned vector.
    ; The -mattr=+fma and -mattr=+fma4 runs expect two fused ymm instructions;
    ; the -mattr=+fma,+avx512f run expects a single fused zmm instruction.
    ; NOTE: the value names are the reverse of the operations here: each %subN
    ; is an fadd result and each %addN is an fsub result.
    617 define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
    618 ; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
    619 ; FMA3_256:       # %bb.0: # %bb
    620 ; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) -/+ ymm4
    621 ; FMA3_256-NEXT:    vfmsubadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) -/+ ymm5
    622 ; FMA3_256-NEXT:    retq
    623 ;
    624 ; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
    625 ; FMA3_512:       # %bb.0: # %bb
    626 ; FMA3_512-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
    627 ; FMA3_512-NEXT:    retq
    628 ;
    629 ; FMA4-LABEL: buildvector_mul_subadd_pd512:
    630 ; FMA4:       # %bb.0: # %bb
    631 ; FMA4-NEXT:    vfmsubaddpd %ymm4, %ymm2, %ymm0, %ymm0
    632 ; FMA4-NEXT:    vfmsubaddpd %ymm5, %ymm3, %ymm1, %ymm1
    633 ; FMA4-NEXT:    retq
    634 bb:
    635   %A = fmul <8 x double> %C, %D
          ; Even lanes (0, 2, 4, 6): product plus %B.
    636   %A0 = extractelement <8 x double> %A, i32 0
    637   %B0 = extractelement <8 x double> %B, i32 0
    638   %sub0 = fadd double %A0, %B0
    639   %A2 = extractelement <8 x double> %A, i32 2
    640   %B2 = extractelement <8 x double> %B, i32 2
    641   %sub2 = fadd double %A2, %B2
    642   %A4 = extractelement <8 x double> %A, i32 4
    643   %B4 = extractelement <8 x double> %B, i32 4
    644   %sub4 = fadd double %A4, %B4
    645   %A6 = extractelement <8 x double> %A, i32 6
    646   %B6 = extractelement <8 x double> %B, i32 6
    647   %sub6 = fadd double %A6, %B6
          ; Odd lanes (1, 3, 7): product minus %B. Lane 5 is intentionally absent.
    648   %A1 = extractelement <8 x double> %A, i32 1
    649   %B1 = extractelement <8 x double> %B, i32 1
    650   %add1 = fsub double %A1, %B1
    651   %A3 = extractelement <8 x double> %A, i32 3
    652   %B3 = extractelement <8 x double> %B, i32 3
    653   %add3 = fsub double %A3, %B3
    654   %A7 = extractelement <8 x double> %A, i32 7
    655   %B7 = extractelement <8 x double> %B, i32 7
    656   %add7 = fsub double %A7, %B7
          ; Reassemble the result, skipping lane 5.
    657   %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
    658   %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
    659   %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
    660   %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
    661   %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
    662   ; element 5 is undef
    663   %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
    664   %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
    665   ret <8 x double> %vecinsert8
    666 }
    667 
    668 attributes #0 = { nounwind "unsafe-fp-math"="true" }
    669