; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX

; If the target's divss/divps instructions are substantially
; slower than rcpss/rcpps with a Newton-Raphson refinement,
; we should generate the estimate sequence.

; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
; for details about the accuracy, speed, and implementation
; differences of x86 reciprocal estimates.

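; A Newton-Raphson step refines a hardware reciprocal estimate e0 = rcp(x) as
;   e1 = e0 + e0 * (1.0 - x * e0)
; (a sketch of the identity behind the sequences checked below, not generated
; code). The '*_one_step' functions below expect one such round, expressed as
; mul/sub/mul/add or as an FNMADD/FMADD pair on FMA targets; the '*_two_step'
; functions expect two rounds; the '*_no_estimate' functions expect a plain
; divide.

; Scalar 1.0/x with estimates disabled: every target should emit (v)divss.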
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: f32_no_estimate:
; KNL:       # %bb.0:
; KNL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
; KNL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: f32_no_estimate:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [11:3.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast float 1.0, %x
  ret float %div
}

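; Scalar 1.0/x with one refinement step: rcpss/vrcpss followed by a single
; Newton-Raphson round (an FNMADD/FMADD pair on FMA targets).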
define float @f32_one_step(float %x) #1 {
; SSE-LABEL: f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: f32_one_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: f32_one_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast float 1.0, %x
  ret float %div
}

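; Scalar 1.0/x with two refinement steps: vrcpss followed by two
; Newton-Raphson rounds.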
define float @f32_two_step(float %x) #2 {
; SSE-LABEL: f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    subss %xmm3, %xmm4
; SSE-NEXT:    mulss %xmm2, %xmm4
; SSE-NEXT:    addss %xmm2, %xmm4
; SSE-NEXT:    mulss %xmm4, %xmm0
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm4, %xmm1
; SSE-NEXT:    addss %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: f32_two_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: f32_two_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast float 1.0, %x
  ret float %div
}

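; Packed 128-bit 1.0/x with estimates disabled: expect (v)divps.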
define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    divps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v4f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v4f32_no_estimate:
; KNL:       # %bb.0:
; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_no_estimate:
; SKX:       # %bb.0:
; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [11:3.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

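; Packed 128-bit 1.0/x: rcpps/vrcpps plus one refinement step.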
define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    subps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v4f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v4f32_one_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

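; Packed 128-bit 1.0/x: vrcpps plus two refinement steps.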
define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    subps %xmm3, %xmm4
; SSE-NEXT:    mulps %xmm2, %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    subps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v4f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v4f32_two_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_two_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

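; 256-bit 1.0/x with estimates disabled: AVX targets use one ymm divide,
; while SSE legalizes to two xmm divides.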
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    divps %xmm0, %xmm3
; SSE-NEXT:    divps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v8f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [29:28.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v8f32_no_estimate:
; KNL:       # %bb.0:
; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_no_estimate:
; SKX:       # %bb.0:
; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [11:5.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}

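; 256-bit 1.0/x: vrcpps on ymm plus one refinement step (SSE runs the xmm
; sequence twice).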
define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm4
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    subps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    addps %xmm4, %xmm3
; SSE-NEXT:    rcpps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm1
; SSE-NEXT:    subps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v8f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v8f32_one_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_one_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}

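; 256-bit 1.0/x: vrcpps on ymm plus two refinement steps.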
define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SSE-LABEL: v8f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    rcpps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    mulps %xmm3, %xmm4
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    subps %xmm4, %xmm5
; SSE-NEXT:    mulps %xmm3, %xmm5
; SSE-NEXT:    addps %xmm3, %xmm5
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    subps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm5, %xmm3
; SSE-NEXT:    addps %xmm5, %xmm3
; SSE-NEXT:    rcpps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    mulps %xmm0, %xmm4
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    subps %xmm4, %xmm5
; SSE-NEXT:    mulps %xmm0, %xmm5
; SSE-NEXT:    addps %xmm0, %xmm5
; SSE-NEXT:    mulps %xmm5, %xmm2
; SSE-NEXT:    subps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm5, %xmm1
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v8f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v8f32_two_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_two_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}

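; 512-bit 1.0/x with estimates disabled: AVX-512 targets (KNL/SKX) use a
; single zmm divide; narrower targets split into ymm or xmm divides.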
define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    divps %xmm0, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    divps %xmm1, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    divps %xmm2, %xmm7
; SSE-NEXT:    divps %xmm3, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; FMA-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v16f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [38:38.00]
; BTVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [38:38.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [29:28.00]
; SANDY-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [29:28.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [21:14.00]
; HASWELL-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [21:14.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v16f32_no_estimate:
; KNL:       # %bb.0:
; KNL-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
; KNL-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [21:14.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_no_estimate:
; SKX:       # %bb.0:
; SKX-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
; SKX-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

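; 512-bit 1.0/x with one refinement step: AVX-512 targets use vrcp14ps on zmm.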
    910 define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
    911 ; SSE-LABEL: v16f32_one_step:
    912 ; SSE:       # %bb.0:
    913 ; SSE-NEXT:    movaps %xmm3, %xmm4
    914 ; SSE-NEXT:    movaps %xmm0, %xmm5
    915 ; SSE-NEXT:    rcpps %xmm0, %xmm6
    916 ; SSE-NEXT:    mulps %xmm6, %xmm5
    917 ; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    918 ; SSE-NEXT:    movaps %xmm3, %xmm0
    919 ; SSE-NEXT:    subps %xmm5, %xmm0
    920 ; SSE-NEXT:    mulps %xmm6, %xmm0
    921 ; SSE-NEXT:    addps %xmm6, %xmm0
    922 ; SSE-NEXT:    rcpps %xmm1, %xmm6
    923 ; SSE-NEXT:    mulps %xmm6, %xmm1
    924 ; SSE-NEXT:    movaps %xmm3, %xmm5
    925 ; SSE-NEXT:    subps %xmm1, %xmm5
    926 ; SSE-NEXT:    mulps %xmm6, %xmm5
    927 ; SSE-NEXT:    addps %xmm6, %xmm5
    928 ; SSE-NEXT:    rcpps %xmm2, %xmm1
    929 ; SSE-NEXT:    mulps %xmm1, %xmm2
    930 ; SSE-NEXT:    movaps %xmm3, %xmm6
    931 ; SSE-NEXT:    subps %xmm2, %xmm6
    932 ; SSE-NEXT:    mulps %xmm1, %xmm6
    933 ; SSE-NEXT:    addps %xmm1, %xmm6
    934 ; SSE-NEXT:    rcpps %xmm4, %xmm1
    935 ; SSE-NEXT:    mulps %xmm1, %xmm4
    936 ; SSE-NEXT:    subps %xmm4, %xmm3
    937 ; SSE-NEXT:    mulps %xmm1, %xmm3
    938 ; SSE-NEXT:    addps %xmm1, %xmm3
    939 ; SSE-NEXT:    movaps %xmm5, %xmm1
    940 ; SSE-NEXT:    movaps %xmm6, %xmm2
    941 ; SSE-NEXT:    retq
    942 ;
    943 ; AVX-RECIP-LABEL: v16f32_one_step:
    944 ; AVX-RECIP:       # %bb.0:
    945 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
    946 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
    947 ; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    948 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
    949 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
    950 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
    951 ; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
    952 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
    953 ; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
    954 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
    955 ; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
    956 ; AVX-RECIP-NEXT:    retq
    957 ;
    958 ; FMA-RECIP-LABEL: v16f32_one_step:
    959 ; FMA-RECIP:       # %bb.0:
    960 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
    961 ; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
    962 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
    963 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
    964 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
    965 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
    966 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
    967 ; FMA-RECIP-NEXT:    retq
    968 ;
    969 ; BTVER2-LABEL: v16f32_one_step:
    970 ; BTVER2:       # %bb.0:
    971 ; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
    972 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
    973 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [2:2.00]
    974 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
    975 ; BTVER2-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [2:2.00]
    976 ; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
    977 ; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
    978 ; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
    979 ; BTVER2-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [2:2.00]
    980 ; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v16f32_one_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_one_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

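; Attribute #2 below ("divf:2,vec-divf:2") requests two refinement steps
; instead of one. As a rough sketch of the math the checks encode, where
; e0 is the hardware estimate from rcpps (or the more accurate vrcp14ps
; on AVX-512) and d is the divisor:
;   e0 = rcp(d)
;   e1 = e0 + e0 * (1.0 - d * e0)   ; one Newton-Raphson step (one_step tests)
;   e2 = e1 + e1 * (1.0 - d * e1)   ; second step (two_step tests)
; Without FMA each step lowers to the mulps/subps/mulps/addps chains
; checked in this file; with FMA the sub+mul and mul+add pairs contract
; into vfnmadd213ps and vfmadd132ps.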
define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
; SSE-LABEL: v16f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm3, %xmm4
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm0, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm3, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm0, %xmm7
; SSE-NEXT:    addps %xmm0, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    subps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm7, %xmm0
; SSE-NEXT:    addps %xmm7, %xmm0
; SSE-NEXT:    rcpps %xmm5, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm6
; SSE-NEXT:    mulps %xmm1, %xmm6
; SSE-NEXT:    movaps %xmm3, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm1, %xmm7
; SSE-NEXT:    addps %xmm1, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm5
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    subps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm7, %xmm1
; SSE-NEXT:    addps %xmm7, %xmm1
; SSE-NEXT:    rcpps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm5, %xmm6
; SSE-NEXT:    movaps %xmm3, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm5, %xmm7
; SSE-NEXT:    addps %xmm5, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm5
; SSE-NEXT:    subps %xmm2, %xmm5
; SSE-NEXT:    mulps %xmm7, %xmm5
; SSE-NEXT:    addps %xmm7, %xmm5
; SSE-NEXT:    rcpps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    mulps %xmm2, %xmm6
; SSE-NEXT:    movaps %xmm3, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm2, %xmm7
; SSE-NEXT:    addps %xmm2, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm4
; SSE-NEXT:    subps %xmm4, %xmm3
; SSE-NEXT:    mulps %xmm7, %xmm3
; SSE-NEXT:    addps %xmm7, %xmm3
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm4, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v16f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v16f32_two_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_two_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

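; The "reciprocal-estimates" attribute selects the behavior per function:
; a leading '!' disables the named estimate, a bare name enables it with
; the target's default refinement count, and a ':N' suffix requests N
; Newton-Raphson steps. So #0 forbids the estimate for scalar (divf) and
; vector (vec-divf) fdiv, #1 enables it with the default step count, and
; #2 (used by the two_step tests above) asks for two steps.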
attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }