; Home | History | Annotate | Download | only in X86  (code-browser header artifact, kept as a comment so the file remains parseable)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule       | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule         | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
     10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule         | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
     11 
      12 ; Extra test coverage for reciprocal-estimate codegen, as discussed in D26855.
     13 
      14 define float @f32_no_step_2(float %x) #3 {
; Computes 1234.0 / x under 'fast' math.  All targets are expected to lower
; this to just a hardware reciprocal estimate (rcpss / vrcpss) followed by a
; single multiply by the constant -- i.e. zero Newton-Raphson refinement
; iterations.  The refinement policy presumably comes from attribute group
; #3, which is defined outside this chunk -- TODO confirm.
      15 ; SSE-LABEL: f32_no_step_2:
      16 ; SSE:       # %bb.0:
      17 ; SSE-NEXT:    rcpss %xmm0, %xmm0
      18 ; SSE-NEXT:    mulss {{.*}}(%rip), %xmm0
      19 ; SSE-NEXT:    retq
      20 ;
      21 ; AVX-RECIP-LABEL: f32_no_step_2:
      22 ; AVX-RECIP:       # %bb.0:
      23 ; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
      24 ; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
      25 ; AVX-RECIP-NEXT:    retq
      26 ;
      27 ; FMA-RECIP-LABEL: f32_no_step_2:
      28 ; FMA-RECIP:       # %bb.0:
      29 ; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
      30 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
      31 ; FMA-RECIP-NEXT:    retq
      32 ;
      33 ; BTVER2-LABEL: f32_no_step_2:
      34 ; BTVER2:       # %bb.0:
      35 ; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
      36 ; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
      37 ; BTVER2-NEXT:    retq # sched: [4:1.00]
      38 ;
      39 ; SANDY-LABEL: f32_no_step_2:
      40 ; SANDY:       # %bb.0:
      41 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
      42 ; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
      43 ; SANDY-NEXT:    retq # sched: [1:1.00]
      44 ;
      45 ; HASWELL-LABEL: f32_no_step_2:
      46 ; HASWELL:       # %bb.0:
      47 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
      48 ; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
      49 ; HASWELL-NEXT:    retq # sched: [7:1.00]
      50 ;
      51 ; HASWELL-NO-FMA-LABEL: f32_no_step_2:
      52 ; HASWELL-NO-FMA:       # %bb.0:
      53 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
      54 ; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
      55 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
      56 ;
      57 ; KNL-LABEL: f32_no_step_2:
      58 ; KNL:       # %bb.0:
      59 ; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
      60 ; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
      61 ; KNL-NEXT:    retq # sched: [7:1.00]
      62 ;
      63 ; SKX-LABEL: f32_no_step_2:
      64 ; SKX:       # %bb.0:
      65 ; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
      66 ; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
      67 ; SKX-NEXT:    retq # sched: [7:1.00]
; The IR itself: a single fast-math divide of a constant by the argument.
      68   %div = fdiv fast float 1234.0, %x
      69   ret float %div
      70 }
     71 
      72 define float @f32_one_step_2(float %x) #1 {
; Computes 3456.0 / x under 'fast' math.  Expected lowering: reciprocal
; estimate plus ONE Newton-Raphson refinement iteration -- visible below as
; the mul/sub/mul/add sequence on plain SSE/AVX targets, folded into an
; vfnmadd/vfmadd pair on FMA-capable targets -- followed by a multiply by
; the constant.  Attribute group #1 presumably requests one refinement step;
; it is defined outside this chunk -- TODO confirm.
      73 ; SSE-LABEL: f32_one_step_2:
      74 ; SSE:       # %bb.0:
      75 ; SSE-NEXT:    rcpss %xmm0, %xmm2
      76 ; SSE-NEXT:    mulss %xmm2, %xmm0
      77 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
      78 ; SSE-NEXT:    subss %xmm0, %xmm1
      79 ; SSE-NEXT:    mulss %xmm2, %xmm1
      80 ; SSE-NEXT:    addss %xmm2, %xmm1
      81 ; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
      82 ; SSE-NEXT:    movaps %xmm1, %xmm0
      83 ; SSE-NEXT:    retq
      84 ;
      85 ; AVX-RECIP-LABEL: f32_one_step_2:
      86 ; AVX-RECIP:       # %bb.0:
      87 ; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
      88 ; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
      89 ; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
      90 ; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
      91 ; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
      92 ; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
      93 ; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
      94 ; AVX-RECIP-NEXT:    retq
      95 ;
      96 ; FMA-RECIP-LABEL: f32_one_step_2:
      97 ; FMA-RECIP:       # %bb.0:
      98 ; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
      99 ; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
     100 ; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
     101 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
     102 ; FMA-RECIP-NEXT:    retq
     103 ;
     104 ; BTVER2-LABEL: f32_one_step_2:
     105 ; BTVER2:       # %bb.0:
     106 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
     107 ; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
     108 ; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
     109 ; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     110 ; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
     111 ; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     112 ; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
     113 ; BTVER2-NEXT:    retq # sched: [4:1.00]
     114 ;
     115 ; SANDY-LABEL: f32_one_step_2:
     116 ; SANDY:       # %bb.0:
     117 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     118 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
     119 ; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
     120 ; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     121 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
     122 ; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     123 ; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
     124 ; SANDY-NEXT:    retq # sched: [1:1.00]
     125 ;
     126 ; HASWELL-LABEL: f32_one_step_2:
     127 ; HASWELL:       # %bb.0:
     128 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     129 ; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
     130 ; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
     131 ; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
     132 ; HASWELL-NEXT:    retq # sched: [7:1.00]
     133 ;
     134 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
     135 ; HASWELL-NO-FMA:       # %bb.0:
     136 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     137 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
     138 ; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
     139 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     140 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     141 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     142 ; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
     143 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
     144 ;
     145 ; KNL-LABEL: f32_one_step_2:
     146 ; KNL:       # %bb.0:
     147 ; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     148 ; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
     149 ; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
     150 ; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
     151 ; KNL-NEXT:    retq # sched: [7:1.00]
     152 ;
     153 ; SKX-LABEL: f32_one_step_2:
     154 ; SKX:       # %bb.0:
     155 ; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
     156 ; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
     157 ; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
     158 ; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
     159 ; SKX-NEXT:    retq # sched: [7:1.00]
; The IR itself: one fast-math divide of a constant by the argument.
     160   %div = fdiv fast float 3456.0, %x
     161   ret float %div
     162 }
    163 
     164 define float @f32_one_step_2_divs(float %x) #1 {
; Computes (3456.0 / x) / x under 'fast' math.  The checks expect the
; refined reciprocal of x (estimate + one Newton-Raphson iteration) to be
; computed once and then REUSED for both divides: multiply by the constant
; first, then multiply by the refined reciprocal again -- note the two
; back-to-back mulss at the end of each sequence.
     165 ; SSE-LABEL: f32_one_step_2_divs:
     166 ; SSE:       # %bb.0:
     167 ; SSE-NEXT:    rcpss %xmm0, %xmm1
     168 ; SSE-NEXT:    mulss %xmm1, %xmm0
     169 ; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
     170 ; SSE-NEXT:    subss %xmm0, %xmm2
     171 ; SSE-NEXT:    mulss %xmm1, %xmm2
     172 ; SSE-NEXT:    addss %xmm1, %xmm2
     173 ; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
     174 ; SSE-NEXT:    mulss %xmm2, %xmm0
     175 ; SSE-NEXT:    mulss %xmm2, %xmm0
     176 ; SSE-NEXT:    retq
     177 ;
     178 ; AVX-RECIP-LABEL: f32_one_step_2_divs:
     179 ; AVX-RECIP:       # %bb.0:
     180 ; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
     181 ; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     182 ; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
     183 ; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
     184 ; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
     185 ; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
     186 ; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
     187 ; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
     188 ; AVX-RECIP-NEXT:    retq
     189 ;
     190 ; FMA-RECIP-LABEL: f32_one_step_2_divs:
     191 ; FMA-RECIP:       # %bb.0:
     192 ; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
     193 ; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
     194 ; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
     195 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
     196 ; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
     197 ; FMA-RECIP-NEXT:    retq
     198 ;
     199 ; BTVER2-LABEL: f32_one_step_2_divs:
     200 ; BTVER2:       # %bb.0:
     201 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
     202 ; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
     203 ; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
     204 ; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     205 ; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
     206 ; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     207 ; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
     208 ; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
     209 ; BTVER2-NEXT:    retq # sched: [4:1.00]
     210 ;
     211 ; SANDY-LABEL: f32_one_step_2_divs:
     212 ; SANDY:       # %bb.0:
     213 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     214 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
     215 ; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
     216 ; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     217 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
     218 ; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     219 ; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
     220 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
     221 ; SANDY-NEXT:    retq # sched: [1:1.00]
     222 ;
     223 ; HASWELL-LABEL: f32_one_step_2_divs:
     224 ; HASWELL:       # %bb.0:
     225 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     226 ; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
     227 ; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
     228 ; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
     229 ; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     230 ; HASWELL-NEXT:    retq # sched: [7:1.00]
     231 ;
     232 ; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
     233 ; HASWELL-NO-FMA:       # %bb.0:
     234 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     235 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
     236 ; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
     237 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     238 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     239 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     240 ; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
     241 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     242 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
     243 ;
     244 ; KNL-LABEL: f32_one_step_2_divs:
     245 ; KNL:       # %bb.0:
     246 ; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     247 ; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
     248 ; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
     249 ; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
     250 ; KNL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     251 ; KNL-NEXT:    retq # sched: [7:1.00]
     252 ;
     253 ; SKX-LABEL: f32_one_step_2_divs:
     254 ; SKX:       # %bb.0:
     255 ; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
     256 ; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
     257 ; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
     258 ; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
     259 ; SKX-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
     260 ; SKX-NEXT:    retq # sched: [7:1.00]
; The IR itself: two dependent fast-math divides sharing the divisor %x.
     261   %div = fdiv fast float 3456.0, %x
     262   %div2 = fdiv fast float %div, %x
     263   ret float %div2
     264 }
    265 
     266 define float @f32_two_step_2(float %x) #2 {
; Computes 6789.0 / x under 'fast' math.  Expected lowering: reciprocal
; estimate plus TWO Newton-Raphson refinement iterations (two mul/sub/
; mul/add rounds on non-FMA targets, two vfnmadd/vfmadd pairs on FMA
; targets), then a multiply by the constant.  Attribute group #2 presumably
; requests two refinement steps; it is defined outside this chunk --
; TODO confirm.
     267 ; SSE-LABEL: f32_two_step_2:
     268 ; SSE:       # %bb.0:
     269 ; SSE-NEXT:    rcpss %xmm0, %xmm2
     270 ; SSE-NEXT:    movaps %xmm0, %xmm3
     271 ; SSE-NEXT:    mulss %xmm2, %xmm3
     272 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
     273 ; SSE-NEXT:    movaps %xmm1, %xmm4
     274 ; SSE-NEXT:    subss %xmm3, %xmm4
     275 ; SSE-NEXT:    mulss %xmm2, %xmm4
     276 ; SSE-NEXT:    addss %xmm2, %xmm4
     277 ; SSE-NEXT:    mulss %xmm4, %xmm0
     278 ; SSE-NEXT:    subss %xmm0, %xmm1
     279 ; SSE-NEXT:    mulss %xmm4, %xmm1
     280 ; SSE-NEXT:    addss %xmm4, %xmm1
     281 ; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
     282 ; SSE-NEXT:    movaps %xmm1, %xmm0
     283 ; SSE-NEXT:    retq
     284 ;
     285 ; AVX-RECIP-LABEL: f32_two_step_2:
     286 ; AVX-RECIP:       # %bb.0:
     287 ; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
     288 ; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
     289 ; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
     290 ; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
     291 ; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
     292 ; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
     293 ; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     294 ; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
     295 ; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
     296 ; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
     297 ; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
     298 ; AVX-RECIP-NEXT:    retq
     299 ;
     300 ; FMA-RECIP-LABEL: f32_two_step_2:
     301 ; FMA-RECIP:       # %bb.0:
     302 ; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
     303 ; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
     304 ; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
     305 ; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
     306 ; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
     307 ; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
     308 ; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
     309 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
     310 ; FMA-RECIP-NEXT:    retq
     311 ;
     312 ; BTVER2-LABEL: f32_two_step_2:
     313 ; BTVER2:       # %bb.0:
     314 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
     315 ; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
     316 ; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
     317 ; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
     318 ; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
     319 ; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
     320 ; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
     321 ; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
     322 ; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
     323 ; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     324 ; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
     325 ; BTVER2-NEXT:    retq # sched: [4:1.00]
     326 ;
     327 ; SANDY-LABEL: f32_two_step_2:
     328 ; SANDY:       # %bb.0:
     329 ; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     330 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
     331 ; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
     332 ; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
     333 ; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
     334 ; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
     335 ; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
     336 ; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
     337 ; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
     338 ; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     339 ; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
     340 ; SANDY-NEXT:    retq # sched: [1:1.00]
     341 ;
     342 ; HASWELL-LABEL: f32_two_step_2:
     343 ; HASWELL:       # %bb.0:
     344 ; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     345 ; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
     346 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
     347 ; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
     348 ; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
     349 ; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
     350 ; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
     351 ; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
     352 ; HASWELL-NEXT:    retq # sched: [7:1.00]
     353 ;
     354 ; HASWELL-NO-FMA-LABEL: f32_two_step_2:
     355 ; HASWELL-NO-FMA:       # %bb.0:
     356 ; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     357 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
     358 ; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:0.50]
     359 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
     360 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
     361 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
     362 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
     363 ; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
     364 ; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     365 ; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     366 ; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
     367 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
     368 ;
     369 ; KNL-LABEL: f32_two_step_2:
     370 ; KNL:       # %bb.0:
     371 ; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
     372 ; KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
     373 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
     374 ; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
     375 ; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
     376 ; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
     377 ; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
     378 ; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
     379 ; KNL-NEXT:    retq # sched: [7:1.00]
     380 ;
     381 ; SKX-LABEL: f32_two_step_2:
     382 ; SKX:       # %bb.0:
     383 ; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
     384 ; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
     385 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
     386 ; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
     387 ; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
     388 ; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
     389 ; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
     390 ; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
     391 ; SKX-NEXT:    retq # sched: [7:1.00]
; The IR itself: a single fast-math divide of a constant by the argument.
     392   %div = fdiv fast float 6789.0, %x
     393   ret float %div
     394 }
    395 
     396 define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; Elementwise <1.0, 2.0, 3.0, 4.0> / x on <4 x float> under 'fast' math.
; Expected lowering: packed reciprocal estimate (rcpps / vrcpps), one
; Newton-Raphson refinement against a splat of 1.0 (loaded as a movaps
; constant or a vbroadcastss depending on target), then a multiply by the
; constant <1,2,3,4> vector.
     397 ; SSE-LABEL: v4f32_one_step2:
     398 ; SSE:       # %bb.0:
     399 ; SSE-NEXT:    rcpps %xmm0, %xmm2
     400 ; SSE-NEXT:    mulps %xmm2, %xmm0
     401 ; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
     402 ; SSE-NEXT:    subps %xmm0, %xmm1
     403 ; SSE-NEXT:    mulps %xmm2, %xmm1
     404 ; SSE-NEXT:    addps %xmm2, %xmm1
     405 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
     406 ; SSE-NEXT:    movaps %xmm1, %xmm0
     407 ; SSE-NEXT:    retq
     408 ;
     409 ; AVX-RECIP-LABEL: v4f32_one_step2:
     410 ; AVX-RECIP:       # %bb.0:
     411 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
     412 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
     413 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
     414 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
     415 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
     416 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
     417 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
     418 ; AVX-RECIP-NEXT:    retq
     419 ;
     420 ; FMA-RECIP-LABEL: v4f32_one_step2:
     421 ; FMA-RECIP:       # %bb.0:
     422 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
     423 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
     424 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
     425 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
     426 ; FMA-RECIP-NEXT:    retq
     427 ;
     428 ; BTVER2-LABEL: v4f32_one_step2:
     429 ; BTVER2:       # %bb.0:
     430 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
     431 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
     432 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
     433 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     434 ; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
     435 ; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     436 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
     437 ; BTVER2-NEXT:    retq # sched: [4:1.00]
     438 ;
     439 ; SANDY-LABEL: v4f32_one_step2:
     440 ; SANDY:       # %bb.0:
     441 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
     442 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
     443 ; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
     444 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     445 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
     446 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     447 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
     448 ; SANDY-NEXT:    retq # sched: [1:1.00]
     449 ;
     450 ; HASWELL-LABEL: v4f32_one_step2:
     451 ; HASWELL:       # %bb.0:
     452 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
     453 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
     454 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
     455 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
     456 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
     457 ; HASWELL-NEXT:    retq # sched: [7:1.00]
     458 ;
     459 ; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
     460 ; HASWELL-NO-FMA:       # %bb.0:
     461 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
     462 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
     463 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
     464 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     465 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     466 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     467 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
     468 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
     469 ;
     470 ; KNL-LABEL: v4f32_one_step2:
     471 ; KNL:       # %bb.0:
     472 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
     473 ; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
     474 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
     475 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
     476 ; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
     477 ; KNL-NEXT:    retq # sched: [7:1.00]
     478 ;
     479 ; SKX-LABEL: v4f32_one_step2:
     480 ; SKX:       # %bb.0:
     481 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
     482 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
     483 ; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
     484 ; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
     485 ; SKX-NEXT:    retq # sched: [7:1.00]
; The IR itself: one fast-math vector divide of a constant by the argument.
     486   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
     487   ret <4 x float> %div
     488 }
    489 
     490 define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; Elementwise (<1.0, 2.0, 3.0, 4.0> / x) / x on <4 x float> under 'fast'
; math.  As in the scalar twin above, the refined reciprocal of x
; (rcpps/vrcpps estimate + one Newton-Raphson iteration) is expected to be
; computed once and reused for both divides: a multiply by the constant
; vector followed by a second multiply by the refined reciprocal.
     491 ; SSE-LABEL: v4f32_one_step_2_divs:
     492 ; SSE:       # %bb.0:
     493 ; SSE-NEXT:    rcpps %xmm0, %xmm1
     494 ; SSE-NEXT:    mulps %xmm1, %xmm0
     495 ; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
     496 ; SSE-NEXT:    subps %xmm0, %xmm2
     497 ; SSE-NEXT:    mulps %xmm1, %xmm2
     498 ; SSE-NEXT:    addps %xmm1, %xmm2
     499 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
     500 ; SSE-NEXT:    mulps %xmm2, %xmm0
     501 ; SSE-NEXT:    mulps %xmm2, %xmm0
     502 ; SSE-NEXT:    retq
     503 ;
     504 ; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
     505 ; AVX-RECIP:       # %bb.0:
     506 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
     507 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
     508 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
     509 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
     510 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
     511 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
     512 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
     513 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
     514 ; AVX-RECIP-NEXT:    retq
     515 ;
     516 ; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
     517 ; FMA-RECIP:       # %bb.0:
     518 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
     519 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
     520 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
     521 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
     522 ; FMA-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
     523 ; FMA-RECIP-NEXT:    retq
     524 ;
     525 ; BTVER2-LABEL: v4f32_one_step_2_divs:
     526 ; BTVER2:       # %bb.0:
     527 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
     528 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
     529 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
     530 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     531 ; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
     532 ; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     533 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
     534 ; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
     535 ; BTVER2-NEXT:    retq # sched: [4:1.00]
     536 ;
     537 ; SANDY-LABEL: v4f32_one_step_2_divs:
     538 ; SANDY:       # %bb.0:
     539 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
     540 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
     541 ; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
     542 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     543 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
     544 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     545 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
     546 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
     547 ; SANDY-NEXT:    retq # sched: [1:1.00]
     548 ;
     549 ; HASWELL-LABEL: v4f32_one_step_2_divs:
     550 ; HASWELL:       # %bb.0:
     551 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
     552 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
     553 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
     554 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
     555 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
     556 ; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     557 ; HASWELL-NEXT:    retq # sched: [7:1.00]
     558 ;
     559 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
     560 ; HASWELL-NO-FMA:       # %bb.0:
     561 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
     562 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
     563 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
     564 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
     565 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     566 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
     567 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
     568 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     569 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
     570 ;
     571 ; KNL-LABEL: v4f32_one_step_2_divs:
     572 ; KNL:       # %bb.0:
     573 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
     574 ; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
     575 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
     576 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
     577 ; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
     578 ; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
     579 ; KNL-NEXT:    retq # sched: [7:1.00]
     580 ;
     581 ; SKX-LABEL: v4f32_one_step_2_divs:
     582 ; SKX:       # %bb.0:
     583 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
     584 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
     585 ; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
     586 ; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
     587 ; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
     588 ; SKX-NEXT:    retq # sched: [7:1.00]
; The IR itself: two dependent fast-math vector divides sharing divisor %x.
     589   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
     590   %div2 = fdiv fast <4 x float> %div, %x
     591   ret <4 x float> %div2
     592 }
    593 
    594 define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
    595 ; SSE-LABEL: v4f32_two_step2:
    596 ; SSE:       # %bb.0:
    597 ; SSE-NEXT:    rcpps %xmm0, %xmm2
    598 ; SSE-NEXT:    movaps %xmm0, %xmm3
    599 ; SSE-NEXT:    mulps %xmm2, %xmm3
    600 ; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    601 ; SSE-NEXT:    movaps %xmm1, %xmm4
    602 ; SSE-NEXT:    subps %xmm3, %xmm4
    603 ; SSE-NEXT:    mulps %xmm2, %xmm4
    604 ; SSE-NEXT:    addps %xmm2, %xmm4
    605 ; SSE-NEXT:    mulps %xmm4, %xmm0
    606 ; SSE-NEXT:    subps %xmm0, %xmm1
    607 ; SSE-NEXT:    mulps %xmm4, %xmm1
    608 ; SSE-NEXT:    addps %xmm4, %xmm1
    609 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
    610 ; SSE-NEXT:    movaps %xmm1, %xmm0
    611 ; SSE-NEXT:    retq
    612 ;
    613 ; AVX-RECIP-LABEL: v4f32_two_step2:
    614 ; AVX-RECIP:       # %bb.0:
    615 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
    616 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
    617 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    618 ; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
    619 ; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
    620 ; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
    621 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    622 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
    623 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
    624 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
    625 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
    626 ; AVX-RECIP-NEXT:    retq
    627 ;
    628 ; FMA-RECIP-LABEL: v4f32_two_step2:
    629 ; FMA-RECIP:       # %bb.0:
    630 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
    631 ; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    632 ; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
    633 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
    634 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
    635 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
    636 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
    637 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
    638 ; FMA-RECIP-NEXT:    retq
    639 ;
    640 ; BTVER2-LABEL: v4f32_two_step2:
    641 ; BTVER2:       # %bb.0:
    642 ; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
    643 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
    644 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
    645 ; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
    646 ; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
    647 ; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
    648 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
    649 ; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
    650 ; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
    651 ; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
    652 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
    653 ; BTVER2-NEXT:    retq # sched: [4:1.00]
    654 ;
    655 ; SANDY-LABEL: v4f32_two_step2:
    656 ; SANDY:       # %bb.0:
    657 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
    658 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
    659 ; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
    660 ; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
    661 ; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
    662 ; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
    663 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
    664 ; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
    665 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
    666 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
    667 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
    668 ; SANDY-NEXT:    retq # sched: [1:1.00]
    669 ;
    670 ; HASWELL-LABEL: v4f32_two_step2:
    671 ; HASWELL:       # %bb.0:
    672 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
    673 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
    674 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
    675 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
    676 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
    677 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
    678 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
    679 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
    680 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    681 ;
    682 ; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
    683 ; HASWELL-NO-FMA:       # %bb.0:
    684 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
    685 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
    686 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
    687 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
    688 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
    689 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
    690 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
    691 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
    692 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
    693 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
    694 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
    695 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
    696 ;
    697 ; KNL-LABEL: v4f32_two_step2:
    698 ; KNL:       # %bb.0:
    699 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
    700 ; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
    701 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
    702 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
    703 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
    704 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
    705 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
    706 ; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
    707 ; KNL-NEXT:    retq # sched: [7:1.00]
    708 ;
    709 ; SKX-LABEL: v4f32_two_step2:
    710 ; SKX:       # %bb.0:
    711 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
    712 ; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
    713 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
    714 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
    715 ; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
    716 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
    717 ; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
    718 ; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
    719 ; SKX-NEXT:    retq # sched: [7:1.00]
    720   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
    721   ret <4 x float> %div
    722 }
    723 
    724 define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
    725 ; SSE-LABEL: v8f32_one_step2:
    726 ; SSE:       # %bb.0:
    727 ; SSE-NEXT:    rcpps %xmm1, %xmm4
    728 ; SSE-NEXT:    mulps %xmm4, %xmm1
    729 ; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    730 ; SSE-NEXT:    movaps %xmm2, %xmm3
    731 ; SSE-NEXT:    subps %xmm1, %xmm3
    732 ; SSE-NEXT:    mulps %xmm4, %xmm3
    733 ; SSE-NEXT:    addps %xmm4, %xmm3
    734 ; SSE-NEXT:    rcpps %xmm0, %xmm1
    735 ; SSE-NEXT:    mulps %xmm1, %xmm0
    736 ; SSE-NEXT:    subps %xmm0, %xmm2
    737 ; SSE-NEXT:    mulps %xmm1, %xmm2
    738 ; SSE-NEXT:    addps %xmm1, %xmm2
    739 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
    740 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
    741 ; SSE-NEXT:    movaps %xmm2, %xmm0
    742 ; SSE-NEXT:    movaps %xmm3, %xmm1
    743 ; SSE-NEXT:    retq
    744 ;
    745 ; AVX-RECIP-LABEL: v8f32_one_step2:
    746 ; AVX-RECIP:       # %bb.0:
    747 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
    748 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    749 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    750 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
    751 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
    752 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
    753 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
    754 ; AVX-RECIP-NEXT:    retq
    755 ;
    756 ; FMA-RECIP-LABEL: v8f32_one_step2:
    757 ; FMA-RECIP:       # %bb.0:
    758 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
    759 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
    760 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
    761 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
    762 ; FMA-RECIP-NEXT:    retq
    763 ;
    764 ; BTVER2-LABEL: v8f32_one_step2:
    765 ; BTVER2:       # %bb.0:
    766 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
    767 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
    768 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
    769 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
    770 ; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
    771 ; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
    772 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
    773 ; BTVER2-NEXT:    retq # sched: [4:1.00]
    774 ;
    775 ; SANDY-LABEL: v8f32_one_step2:
    776 ; SANDY:       # %bb.0:
    777 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
    778 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
    779 ; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
    780 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
    781 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
    782 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
    783 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
    784 ; SANDY-NEXT:    retq # sched: [1:1.00]
    785 ;
    786 ; HASWELL-LABEL: v8f32_one_step2:
    787 ; HASWELL:       # %bb.0:
    788 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
    789 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
    790 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
    791 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
    792 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
    793 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    794 ;
    795 ; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
    796 ; HASWELL-NO-FMA:       # %bb.0:
    797 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
    798 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
    799 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
    800 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
    801 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
    802 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
    803 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
    804 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
    805 ;
    806 ; KNL-LABEL: v8f32_one_step2:
    807 ; KNL:       # %bb.0:
    808 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
    809 ; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
    810 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
    811 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
    812 ; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
    813 ; KNL-NEXT:    retq # sched: [7:1.00]
    814 ;
    815 ; SKX-LABEL: v8f32_one_step2:
    816 ; SKX:       # %bb.0:
    817 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
    818 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
    819 ; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
    820 ; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
    821 ; SKX-NEXT:    retq # sched: [7:1.00]
    822   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
    823   ret <8 x float> %div
    824 }
    825 
    826 define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
    827 ; SSE-LABEL: v8f32_one_step_2_divs:
    828 ; SSE:       # %bb.0:
    829 ; SSE-NEXT:    rcpps %xmm0, %xmm2
    830 ; SSE-NEXT:    mulps %xmm2, %xmm0
    831 ; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    832 ; SSE-NEXT:    movaps %xmm3, %xmm4
    833 ; SSE-NEXT:    subps %xmm0, %xmm4
    834 ; SSE-NEXT:    mulps %xmm2, %xmm4
    835 ; SSE-NEXT:    addps %xmm2, %xmm4
    836 ; SSE-NEXT:    rcpps %xmm1, %xmm0
    837 ; SSE-NEXT:    mulps %xmm0, %xmm1
    838 ; SSE-NEXT:    subps %xmm1, %xmm3
    839 ; SSE-NEXT:    mulps %xmm0, %xmm3
    840 ; SSE-NEXT:    addps %xmm0, %xmm3
    841 ; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
    842 ; SSE-NEXT:    mulps %xmm3, %xmm1
    843 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
    844 ; SSE-NEXT:    mulps %xmm4, %xmm0
    845 ; SSE-NEXT:    mulps %xmm4, %xmm0
    846 ; SSE-NEXT:    mulps %xmm3, %xmm1
    847 ; SSE-NEXT:    retq
    848 ;
    849 ; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
    850 ; AVX-RECIP:       # %bb.0:
    851 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
    852 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    853 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    854 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
    855 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
    856 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
    857 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
    858 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
    859 ; AVX-RECIP-NEXT:    retq
    860 ;
    861 ; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
    862 ; FMA-RECIP:       # %bb.0:
    863 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
    864 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
    865 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
    866 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
    867 ; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
    868 ; FMA-RECIP-NEXT:    retq
    869 ;
    870 ; BTVER2-LABEL: v8f32_one_step_2_divs:
    871 ; BTVER2:       # %bb.0:
    872 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
    873 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
    874 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
    875 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
    876 ; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
    877 ; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
    878 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00]
    879 ; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
    880 ; BTVER2-NEXT:    retq # sched: [4:1.00]
    881 ;
    882 ; SANDY-LABEL: v8f32_one_step_2_divs:
    883 ; SANDY:       # %bb.0:
    884 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
    885 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
    886 ; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
    887 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
    888 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
    889 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
    890 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
    891 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
    892 ; SANDY-NEXT:    retq # sched: [1:1.00]
    893 ;
    894 ; HASWELL-LABEL: v8f32_one_step_2_divs:
    895 ; HASWELL:       # %bb.0:
    896 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
    897 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
    898 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
    899 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
    900 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
    901 ; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
    902 ; HASWELL-NEXT:    retq # sched: [7:1.00]
    903 ;
    904 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
    905 ; HASWELL-NO-FMA:       # %bb.0:
    906 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
    907 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
    908 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
    909 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
    910 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
    911 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
    912 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
    913 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
    914 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
    915 ;
    916 ; KNL-LABEL: v8f32_one_step_2_divs:
    917 ; KNL:       # %bb.0:
    918 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
    919 ; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
    920 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
    921 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
    922 ; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
    923 ; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
    924 ; KNL-NEXT:    retq # sched: [7:1.00]
    925 ;
    926 ; SKX-LABEL: v8f32_one_step_2_divs:
    927 ; SKX:       # %bb.0:
    928 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
    929 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
    930 ; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
    931 ; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
    932 ; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
    933 ; SKX-NEXT:    retq # sched: [7:1.00]
    934   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
    935   %div2 = fdiv fast <8 x float> %div, %x
    936   ret <8 x float> %div2
    937 }
    938 
    939 define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
    940 ; SSE-LABEL: v8f32_two_step2:
    941 ; SSE:       # %bb.0:
    942 ; SSE-NEXT:    movaps %xmm0, %xmm2
    943 ; SSE-NEXT:    rcpps %xmm1, %xmm3
    944 ; SSE-NEXT:    movaps %xmm1, %xmm4
    945 ; SSE-NEXT:    mulps %xmm3, %xmm4
    946 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    947 ; SSE-NEXT:    movaps %xmm0, %xmm5
    948 ; SSE-NEXT:    subps %xmm4, %xmm5
    949 ; SSE-NEXT:    mulps %xmm3, %xmm5
    950 ; SSE-NEXT:    addps %xmm3, %xmm5
    951 ; SSE-NEXT:    mulps %xmm5, %xmm1
    952 ; SSE-NEXT:    movaps %xmm0, %xmm3
    953 ; SSE-NEXT:    subps %xmm1, %xmm3
    954 ; SSE-NEXT:    mulps %xmm5, %xmm3
    955 ; SSE-NEXT:    addps %xmm5, %xmm3
    956 ; SSE-NEXT:    rcpps %xmm2, %xmm1
    957 ; SSE-NEXT:    movaps %xmm2, %xmm4
    958 ; SSE-NEXT:    mulps %xmm1, %xmm4
    959 ; SSE-NEXT:    movaps %xmm0, %xmm5
    960 ; SSE-NEXT:    subps %xmm4, %xmm5
    961 ; SSE-NEXT:    mulps %xmm1, %xmm5
    962 ; SSE-NEXT:    addps %xmm1, %xmm5
    963 ; SSE-NEXT:    mulps %xmm5, %xmm2
    964 ; SSE-NEXT:    subps %xmm2, %xmm0
    965 ; SSE-NEXT:    mulps %xmm5, %xmm0
    966 ; SSE-NEXT:    addps %xmm5, %xmm0
    967 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
    968 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
    969 ; SSE-NEXT:    movaps %xmm3, %xmm1
    970 ; SSE-NEXT:    retq
    971 ;
    972 ; AVX-RECIP-LABEL: v8f32_two_step2:
    973 ; AVX-RECIP:       # %bb.0:
    974 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
    975 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
    976 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    977 ; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
    978 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
    979 ; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
    980 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    981 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
    982 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
    983 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
    984 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
    985 ; AVX-RECIP-NEXT:    retq
    986 ;
    987 ; FMA-RECIP-LABEL: v8f32_two_step2:
    988 ; FMA-RECIP:       # %bb.0:
    989 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
    990 ; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    991 ; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
    992 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
    993 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
    994 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
    995 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
    996 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
    997 ; FMA-RECIP-NEXT:    retq
    998 ;
    999 ; BTVER2-LABEL: v8f32_two_step2:
   1000 ; BTVER2:       # %bb.0:
   1001 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
   1002 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
   1003 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
   1004 ; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
   1005 ; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
   1006 ; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
   1007 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
   1008 ; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
   1009 ; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
   1010 ; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
   1011 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
   1012 ; BTVER2-NEXT:    retq # sched: [4:1.00]
   1013 ;
   1014 ; SANDY-LABEL: v8f32_two_step2:
   1015 ; SANDY:       # %bb.0:
   1016 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
   1017 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
   1018 ; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
   1019 ; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
   1020 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
   1021 ; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
   1022 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
   1023 ; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
   1024 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
   1025 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
   1026 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
   1027 ; SANDY-NEXT:    retq # sched: [1:1.00]
   1028 ;
   1029 ; HASWELL-LABEL: v8f32_two_step2:
   1030 ; HASWELL:       # %bb.0:
   1031 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
   1032 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
   1033 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
   1034 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
   1035 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
   1036 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
   1037 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
   1038 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
   1039 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1040 ;
   1041 ; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
   1042 ; HASWELL-NO-FMA:       # %bb.0:
   1043 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
   1044 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
   1045 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
   1046 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
   1047 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
   1048 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
   1049 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
   1050 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
   1051 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
   1052 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
   1053 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
   1054 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
   1055 ;
   1056 ; KNL-LABEL: v8f32_two_step2:
   1057 ; KNL:       # %bb.0:
   1058 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
   1059 ; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
   1060 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
   1061 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
   1062 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
   1063 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
   1064 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
   1065 ; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
   1066 ; KNL-NEXT:    retq # sched: [7:1.00]
   1067 ;
   1068 ; SKX-LABEL: v8f32_two_step2:
   1069 ; SKX:       # %bb.0:
   1070 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
   1071 ; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
   1072 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
   1073 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
   1074 ; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
   1075 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50]
   1076 ; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50]
   1077 ; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
   1078 ; SKX-NEXT:    retq # sched: [7:1.00]
   1079   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   1080   ret <8 x float> %div
   1081 }
   1082 
; 8 x f32 reciprocal, numerators all exactly 1.0, attribute group #3 (defined
; elsewhere in this file; presumably reciprocal-estimates requesting zero
; refinement steps -- confirm against the #3 attribute list). With a 1.0
; numerator and no Newton-Raphson step the whole division lowers to a bare
; rcpps (SSE, split over two xmm halves) / vrcpps (AVX+), with no final
; multiply.
define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
; SSE-LABEL: v8f32_no_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    rcpps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_no_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_no_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v8f32_no_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
;
; KNL-LABEL: v8f32_no_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_no_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  ; fdiv fast permits the reciprocal-estimate expansion in the first place.
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}
   1132 
; Same zero-refinement configuration as @v8f32_no_step (attribute group #3),
; but with non-unit numerators <1.0 .. 8.0>. Expected lowering is the raw
; reciprocal estimate followed by one multiply against the constant-pool
; numerator vector ({{.*}}(%rip) operand); SSE splits the <8 x float> across
; two xmm register halves.
define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
; SSE-LABEL: v8f32_no_step2:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm1, %xmm1
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_no_step2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_no_step2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v8f32_no_step2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_step2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
;
; KNL-LABEL: v8f32_no_step2:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_no_step2:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
  ret <8 x float> %div
}
   1192 
; 16 x f32 reciprocal with non-unit numerators and attribute group #1
; (defined elsewhere in this file; presumably reciprocal-estimates with one
; refinement step -- confirm against the #1 attribute list). Per vector
; chunk the expected shape is one Newton-Raphson iteration on the estimate r:
;   e = 1.0 - x*r ; r' = r + r*e   (mul/sub/mul/add without FMA,
;   vfnmadd213ps + vfmadd132ps with FMA/AVX512)
; followed by a multiply with the constant-pool numerators <1.0 .. 16.0>.
; SSE works in four xmm chunks, AVX in two ymm chunks, AVX512 in one zmm.
define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_one_step2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    movaps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm0, %xmm6
; SSE-NEXT:    rcpps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm4
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    subps %xmm4, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    addps %xmm2, %xmm3
; SSE-NEXT:    rcpps %xmm5, %xmm4
; SSE-NEXT:    mulps %xmm4, %xmm5
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm2
; SSE-NEXT:    addps %xmm4, %xmm2
; SSE-NEXT:    rcpps %xmm1, %xmm5
; SSE-NEXT:    mulps %xmm5, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    subps %xmm1, %xmm4
; SSE-NEXT:    mulps %xmm5, %xmm4
; SSE-NEXT:    addps %xmm5, %xmm4
; SSE-NEXT:    rcpps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm6
; SSE-NEXT:    subps %xmm6, %xmm0
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm4
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_one_step2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_one_step2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v16f32_one_step2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm4, %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm4, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vaddps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_one_step2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_one_step2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vrcpps %ymm0, %ymm4 # sched: [11:2.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
;
; KNL-LABEL: v16f32_one_step2:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_one_step2:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}
   1342 
; Two chained divides by the same denominator: %div = C/x and %div2 = %div/x,
; under attribute group #1 (see note on @v16f32_one_step2 about its presumed
; one-refinement-step meaning). The lowering computes one refined reciprocal
; r per chunk and reuses it for both divides -- only one rcpps/vrcp14ps per
; chunk, then the numerator multiply followed by a second multiply with r
; (hence the back-to-back mulps on the same register pair in the SSE checks:
; result = (C * r) * r).
define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_one_step_2_divs:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    subps %xmm0, %xmm5
; SSE-NEXT:    mulps %xmm6, %xmm5
; SSE-NEXT:    addps %xmm6, %xmm5
; SSE-NEXT:    rcpps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    subps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm0, %xmm6
; SSE-NEXT:    addps %xmm0, %xmm6
; SSE-NEXT:    rcpps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    subps %xmm2, %xmm7
; SSE-NEXT:    mulps %xmm0, %xmm7
; SSE-NEXT:    addps %xmm0, %xmm7
; SSE-NEXT:    rcpps %xmm3, %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    subps %xmm3, %xmm4
; SSE-NEXT:    mulps %xmm0, %xmm4
; SSE-NEXT:    addps %xmm0, %xmm4
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01]
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01]
; SSE-NEXT:    mulps %xmm7, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm7, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_one_step_2_divs:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm3, %ymm0
; FMA-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v16f32_one_step_2_divs:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [7:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [7:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_one_step_2_divs:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vrcpps %ymm1, %ymm4 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:1.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_one_step_2_divs:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
; HASWELL-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
;
; KNL-LABEL: v16f32_one_step_2_divs:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
; KNL-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_one_step_2_divs:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
; SKX-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ; Second divide by the same %x: reuses the refined reciprocal, so the
  ; lowering needs only an extra multiply per chunk.
  %div2 = fdiv fast <16 x float> %div, %x
  ret <16 x float> %div2
}
   1511 
   1512 define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
   1513 ; SSE-LABEL: v16f32_two_step2:
   1514 ; SSE:       # %bb.0:
   1515 ; SSE-NEXT:    movaps %xmm3, %xmm6
   1516 ; SSE-NEXT:    movaps %xmm2, %xmm5
   1517 ; SSE-NEXT:    movaps %xmm0, %xmm4
   1518 ; SSE-NEXT:    rcpps %xmm3, %xmm2
   1519 ; SSE-NEXT:    mulps %xmm2, %xmm3
   1520 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
   1521 ; SSE-NEXT:    movaps %xmm0, %xmm7
   1522 ; SSE-NEXT:    subps %xmm3, %xmm7
   1523 ; SSE-NEXT:    mulps %xmm2, %xmm7
   1524 ; SSE-NEXT:    addps %xmm2, %xmm7
   1525 ; SSE-NEXT:    mulps %xmm7, %xmm6
   1526 ; SSE-NEXT:    movaps %xmm0, %xmm3
   1527 ; SSE-NEXT:    subps %xmm6, %xmm3
   1528 ; SSE-NEXT:    mulps %xmm7, %xmm3
   1529 ; SSE-NEXT:    addps %xmm7, %xmm3
   1530 ; SSE-NEXT:    rcpps %xmm5, %xmm2
   1531 ; SSE-NEXT:    movaps %xmm5, %xmm6
   1532 ; SSE-NEXT:    mulps %xmm2, %xmm6
   1533 ; SSE-NEXT:    movaps %xmm0, %xmm7
   1534 ; SSE-NEXT:    subps %xmm6, %xmm7
   1535 ; SSE-NEXT:    mulps %xmm2, %xmm7
   1536 ; SSE-NEXT:    addps %xmm2, %xmm7
   1537 ; SSE-NEXT:    mulps %xmm7, %xmm5
   1538 ; SSE-NEXT:    movaps %xmm0, %xmm2
   1539 ; SSE-NEXT:    subps %xmm5, %xmm2
   1540 ; SSE-NEXT:    mulps %xmm7, %xmm2
   1541 ; SSE-NEXT:    addps %xmm7, %xmm2
   1542 ; SSE-NEXT:    rcpps %xmm1, %xmm5
   1543 ; SSE-NEXT:    movaps %xmm1, %xmm6
   1544 ; SSE-NEXT:    mulps %xmm5, %xmm6
   1545 ; SSE-NEXT:    movaps %xmm0, %xmm7
   1546 ; SSE-NEXT:    subps %xmm6, %xmm7
   1547 ; SSE-NEXT:    mulps %xmm5, %xmm7
   1548 ; SSE-NEXT:    addps %xmm5, %xmm7
   1549 ; SSE-NEXT:    mulps %xmm7, %xmm1
   1550 ; SSE-NEXT:    movaps %xmm0, %xmm5
   1551 ; SSE-NEXT:    subps %xmm1, %xmm5
   1552 ; SSE-NEXT:    mulps %xmm7, %xmm5
   1553 ; SSE-NEXT:    addps %xmm7, %xmm5
   1554 ; SSE-NEXT:    rcpps %xmm4, %xmm1
   1555 ; SSE-NEXT:    movaps %xmm4, %xmm6
   1556 ; SSE-NEXT:    mulps %xmm1, %xmm6
   1557 ; SSE-NEXT:    movaps %xmm0, %xmm7
   1558 ; SSE-NEXT:    subps %xmm6, %xmm7
   1559 ; SSE-NEXT:    mulps %xmm1, %xmm7
   1560 ; SSE-NEXT:    addps %xmm1, %xmm7
   1561 ; SSE-NEXT:    mulps %xmm7, %xmm4
   1562 ; SSE-NEXT:    subps %xmm4, %xmm0
   1563 ; SSE-NEXT:    mulps %xmm7, %xmm0
   1564 ; SSE-NEXT:    addps %xmm7, %xmm0
   1565 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
   1566 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm5
   1567 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
   1568 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
   1569 ; SSE-NEXT:    movaps %xmm5, %xmm1
   1570 ; SSE-NEXT:    retq
   1571 ;
   1572 ; AVX-RECIP-LABEL: v16f32_two_step2:
   1573 ; AVX-RECIP:       # %bb.0:
   1574 ; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
   1575 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
   1576 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
   1577 ; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
   1578 ; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
   1579 ; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
   1580 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
   1581 ; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm4, %ymm1
   1582 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
   1583 ; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
   1584 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
   1585 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
   1586 ; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
   1587 ; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
   1588 ; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
   1589 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
   1590 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm4, %ymm0
   1591 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
   1592 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
   1593 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
   1594 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
   1595 ; AVX-RECIP-NEXT:    retq
   1596 ;
   1597 ; FMA-RECIP-LABEL: v16f32_two_step2:
   1598 ; FMA-RECIP:       # %bb.0:
   1599 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
   1600 ; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
   1601 ; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
   1602 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
   1603 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
   1604 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
   1605 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
   1606 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
   1607 ; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
   1608 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
   1609 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
   1610 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
   1611 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
   1612 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
   1613 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
   1614 ; FMA-RECIP-NEXT:    retq
   1615 ;
   1616 ; BTVER2-LABEL: v16f32_two_step2:
   1617 ; BTVER2:       # %bb.0:
   1618 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
   1619 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
   1620 ; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
   1621 ; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
   1622 ; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
   1623 ; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
   1624 ; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
   1625 ; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
   1626 ; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
   1627 ; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
   1628 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
   1629 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
   1630 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
   1631 ; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
   1632 ; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
   1633 ; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
   1634 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
   1635 ; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
   1636 ; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
   1637 ; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
   1638 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
   1639 ; BTVER2-NEXT:    retq # sched: [4:1.00]
   1640 ;
   1641 ; SANDY-LABEL: v16f32_two_step2:
   1642 ; SANDY:       # %bb.0:
   1643 ; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
   1644 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
   1645 ; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
   1646 ; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
   1647 ; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
   1648 ; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
   1649 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
   1650 ; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
   1651 ; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
   1652 ; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
   1653 ; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
   1654 ; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
   1655 ; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
   1656 ; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
   1657 ; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
   1658 ; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
   1659 ; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
   1660 ; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
   1661 ; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
   1662 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
   1663 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
   1664 ; SANDY-NEXT:    retq # sched: [1:1.00]
   1665 ;
   1666 ; HASWELL-LABEL: v16f32_two_step2:
   1667 ; HASWELL:       # %bb.0:
   1668 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
   1669 ; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
   1670 ; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
   1671 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
   1672 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
   1673 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
   1674 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
   1675 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
   1676 ; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
   1677 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
   1678 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
   1679 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
   1680 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
   1681 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
   1682 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
   1683 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1684 ;
   1685 ; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
   1686 ; HASWELL-NO-FMA:       # %bb.0:
   1687 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
   1688 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:0.50]
   1689 ; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
   1690 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
   1691 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
   1692 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
   1693 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
   1694 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
   1695 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
   1696 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
   1697 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
   1698 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
   1699 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
   1700 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
   1701 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
   1702 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
   1703 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
   1704 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
   1705 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
   1706 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
   1707 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
   1708 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
   1709 ;
   1710 ; KNL-LABEL: v16f32_two_step2:
   1711 ; KNL:       # %bb.0:
   1712 ; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
   1713 ; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
   1714 ; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
   1715 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
   1716 ; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
   1717 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
   1718 ; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
   1719 ; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
   1720 ; KNL-NEXT:    retq # sched: [7:1.00]
   1721 ;
   1722 ; SKX-LABEL: v16f32_two_step2:
   1723 ; SKX:       # %bb.0:
   1724 ; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
   1725 ; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
   1726 ; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
   1727 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
   1728 ; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
   1729 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50]
   1730 ; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50]
   1731 ; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
   1732 ; SKX-NEXT:    retq # sched: [7:1.00]
   1733   %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
   1734   ret <16 x float> %div
   1735 }
   1736 
; Attribute #3 requests reciprocal estimates with zero Newton-Raphson
; refinement steps ("divf:0,vec-divf:0"), so the raw rcpps / rcp14ps
; estimate is used directly. Because the numerator vector is all 1.0,
; no final multiply by a constant is emitted either — each target's
; expected output is just the reciprocal-estimate instruction(s).
; NOTE: the CHECK lines below are autogenerated; do not edit by hand —
; rerun utils/update_llc_test_checks.py instead.
   1737 define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
   1738 ; SSE-LABEL: v16f32_no_step:
   1739 ; SSE:       # %bb.0:
   1740 ; SSE-NEXT:    rcpps %xmm0, %xmm0
   1741 ; SSE-NEXT:    rcpps %xmm1, %xmm1
   1742 ; SSE-NEXT:    rcpps %xmm2, %xmm2
   1743 ; SSE-NEXT:    rcpps %xmm3, %xmm3
   1744 ; SSE-NEXT:    retq
   1745 ;
   1746 ; AVX-RECIP-LABEL: v16f32_no_step:
   1747 ; AVX-RECIP:       # %bb.0:
   1748 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
   1749 ; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm1
   1750 ; AVX-RECIP-NEXT:    retq
   1751 ;
   1752 ; FMA-RECIP-LABEL: v16f32_no_step:
   1753 ; FMA-RECIP:       # %bb.0:
   1754 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
   1755 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
   1756 ; FMA-RECIP-NEXT:    retq
   1757 ;
   1758 ; BTVER2-LABEL: v16f32_no_step:
   1759 ; BTVER2:       # %bb.0:
   1760 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
   1761 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
   1762 ; BTVER2-NEXT:    retq # sched: [4:1.00]
   1763 ;
   1764 ; SANDY-LABEL: v16f32_no_step:
   1765 ; SANDY:       # %bb.0:
   1766 ; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
   1767 ; SANDY-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
   1768 ; SANDY-NEXT:    retq # sched: [1:1.00]
   1769 ;
   1770 ; HASWELL-LABEL: v16f32_no_step:
   1771 ; HASWELL:       # %bb.0:
   1772 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
   1773 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
   1774 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1775 ;
   1776 ; HASWELL-NO-FMA-LABEL: v16f32_no_step:
   1777 ; HASWELL-NO-FMA:       # %bb.0:
   1778 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
   1779 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
   1780 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
   1781 ;
   1782 ; KNL-LABEL: v16f32_no_step:
   1783 ; KNL:       # %bb.0:
   1784 ; KNL-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
   1785 ; KNL-NEXT:    retq # sched: [7:1.00]
   1786 ;
   1787 ; SKX-LABEL: v16f32_no_step:
   1788 ; SKX:       # %bb.0:
   1789 ; SKX-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
   1790 ; SKX-NEXT:    retq # sched: [7:1.00]
   1791   %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   1792   ret <16 x float> %div
   1793 }
   1794 
; Same as v16f32_no_step (attribute #3 => reciprocal estimate, zero
; refinement steps) but with a non-uniform numerator <1.0 .. 16.0>, so the
; expected codegen is the reciprocal estimate followed by one multiply with
; the constant-pool vector ({{.*}}(%rip)) per register.
; NOTE: the CHECK lines below are autogenerated; do not edit by hand —
; rerun utils/update_llc_test_checks.py instead.
   1795 define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
   1796 ; SSE-LABEL: v16f32_no_step2:
   1797 ; SSE:       # %bb.0:
   1798 ; SSE-NEXT:    rcpps %xmm3, %xmm3
   1799 ; SSE-NEXT:    rcpps %xmm2, %xmm2
   1800 ; SSE-NEXT:    rcpps %xmm1, %xmm1
   1801 ; SSE-NEXT:    rcpps %xmm0, %xmm0
   1802 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
   1803 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
   1804 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
   1805 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
   1806 ; SSE-NEXT:    retq
   1807 ;
   1808 ; AVX-RECIP-LABEL: v16f32_no_step2:
   1809 ; AVX-RECIP:       # %bb.0:
   1810 ; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm1
   1811 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
   1812 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
   1813 ; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
   1814 ; AVX-RECIP-NEXT:    retq
   1815 ;
   1816 ; FMA-RECIP-LABEL: v16f32_no_step2:
   1817 ; FMA-RECIP:       # %bb.0:
   1818 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
   1819 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
   1820 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
   1821 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
   1822 ; FMA-RECIP-NEXT:    retq
   1823 ;
   1824 ; BTVER2-LABEL: v16f32_no_step2:
   1825 ; BTVER2:       # %bb.0:
   1826 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
   1827 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
   1828 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
   1829 ; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
   1830 ; BTVER2-NEXT:    retq # sched: [4:1.00]
   1831 ;
   1832 ; SANDY-LABEL: v16f32_no_step2:
   1833 ; SANDY:       # %bb.0:
   1834 ; SANDY-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
   1835 ; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
   1836 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
   1837 ; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
   1838 ; SANDY-NEXT:    retq # sched: [1:1.00]
   1839 ;
   1840 ; HASWELL-LABEL: v16f32_no_step2:
   1841 ; HASWELL:       # %bb.0:
   1842 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
   1843 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
   1844 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
   1845 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
   1846 ; HASWELL-NEXT:    retq # sched: [7:1.00]
   1847 ;
   1848 ; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
   1849 ; HASWELL-NO-FMA:       # %bb.0:
   1850 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
   1851 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
   1852 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
   1853 ; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
   1854 ; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
   1855 ;
   1856 ; KNL-LABEL: v16f32_no_step2:
   1857 ; KNL:       # %bb.0:
   1858 ; KNL-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
   1859 ; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
   1860 ; KNL-NEXT:    retq # sched: [7:1.00]
   1861 ;
   1862 ; SKX-LABEL: v16f32_no_step2:
   1863 ; SKX:       # %bb.0:
   1864 ; SKX-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
   1865 ; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
   1866 ; SKX-NEXT:    retq # sched: [7:1.00]
   1867   %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
   1868   ret <16 x float> %div
   1869 }
   1870 
; Per-function reciprocal-estimate settings exercised by the tests above:
;   #0: "!divf,!vec-divf"      — estimates explicitly disabled (real divide).
;   #1: "divf,vec-divf"        — estimates enabled, default refinement.
;   #2: "divf:2,vec-divf:2"    — estimates with 2 Newton-Raphson steps
;                                (used by the *_two_step* tests).
;   #3: "divf:0,vec-divf:0"    — estimates with 0 refinement steps
;                                (used by the *_no_step* tests).
   1871 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
   1872 attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
   1873 attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
   1874 attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }
   1875 
   1876