; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX

; If the target's divss/divps instructions are substantially
; slower than rcpss/rcpps with a Newton-Raphson refinement,
; we should generate the estimate sequence.

; See PR21385 ( https://llvm.org/PR21385 )
; for details about the accuracy, speed, and implementation
; differences of x86 reciprocal estimates.
19 20 define float @f32_no_estimate(float %x) #0 { 21 ; SSE-LABEL: f32_no_estimate: 22 ; SSE: # %bb.0: 23 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 24 ; SSE-NEXT: divss %xmm0, %xmm1 25 ; SSE-NEXT: movaps %xmm1, %xmm0 26 ; SSE-NEXT: retq 27 ; 28 ; AVX-RECIP-LABEL: f32_no_estimate: 29 ; AVX-RECIP: # %bb.0: 30 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 31 ; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 32 ; AVX-RECIP-NEXT: retq 33 ; 34 ; FMA-RECIP-LABEL: f32_no_estimate: 35 ; FMA-RECIP: # %bb.0: 36 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 37 ; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0 38 ; FMA-RECIP-NEXT: retq 39 ; 40 ; BTVER2-LABEL: f32_no_estimate: 41 ; BTVER2: # %bb.0: 42 ; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00] 43 ; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [19:19.00] 44 ; BTVER2-NEXT: retq # sched: [4:1.00] 45 ; 46 ; SANDY-LABEL: f32_no_estimate: 47 ; SANDY: # %bb.0: 48 ; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50] 49 ; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [14:14.00] 50 ; SANDY-NEXT: retq # sched: [1:1.00] 51 ; 52 ; HASWELL-LABEL: f32_no_estimate: 53 ; HASWELL: # %bb.0: 54 ; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50] 55 ; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:7.00] 56 ; HASWELL-NEXT: retq # sched: [7:1.00] 57 ; 58 ; HASWELL-NO-FMA-LABEL: f32_no_estimate: 59 ; HASWELL-NO-FMA: # %bb.0: 60 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 61 ; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 62 ; HASWELL-NO-FMA-NEXT: retq 63 ; 64 ; KNL-LABEL: f32_no_estimate: 65 ; KNL: # %bb.0: 66 ; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50] 67 ; KNL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:7.00] 68 ; KNL-NEXT: retq # sched: [7:1.00] 69 ; 70 ; SKX-LABEL: f32_no_estimate: 71 ; SKX: # %bb.0: 72 ; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 
sched: [5:0.50] 73 ; SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [11:3.00] 74 ; SKX-NEXT: retq # sched: [7:1.00] 75 %div = fdiv fast float 1.0, %x 76 ret float %div 77 } 78 79 define float @f32_one_step(float %x) #1 { 80 ; SSE-LABEL: f32_one_step: 81 ; SSE: # %bb.0: 82 ; SSE-NEXT: rcpss %xmm0, %xmm2 83 ; SSE-NEXT: mulss %xmm2, %xmm0 84 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 85 ; SSE-NEXT: subss %xmm0, %xmm1 86 ; SSE-NEXT: mulss %xmm2, %xmm1 87 ; SSE-NEXT: addss %xmm2, %xmm1 88 ; SSE-NEXT: movaps %xmm1, %xmm0 89 ; SSE-NEXT: retq 90 ; 91 ; AVX-RECIP-LABEL: f32_one_step: 92 ; AVX-RECIP: # %bb.0: 93 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 94 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 95 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 96 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0 97 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 98 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 99 ; AVX-RECIP-NEXT: retq 100 ; 101 ; FMA-RECIP-LABEL: f32_one_step: 102 ; FMA-RECIP: # %bb.0: 103 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 104 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem 105 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 106 ; FMA-RECIP-NEXT: retq 107 ; 108 ; BTVER2-LABEL: f32_one_step: 109 ; BTVER2: # %bb.0: 110 ; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00] 111 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00] 112 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00] 113 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] 114 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00] 115 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 116 ; BTVER2-NEXT: retq # sched: [4:1.00] 117 ; 118 ; SANDY-LABEL: f32_one_step: 119 ; SANDY: # %bb.0: 120 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] 121 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] 122 ; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 
sched: [6:0.50] 123 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00] 124 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] 125 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 126 ; SANDY-NEXT: retq # sched: [1:1.00] 127 ; 128 ; HASWELL-LABEL: f32_one_step: 129 ; HASWELL: # %bb.0: 130 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] 131 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50] 132 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50] 133 ; HASWELL-NEXT: retq # sched: [7:1.00] 134 ; 135 ; HASWELL-NO-FMA-LABEL: f32_one_step: 136 ; HASWELL-NO-FMA: # %bb.0: 137 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 138 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 139 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 140 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 141 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 142 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 143 ; HASWELL-NO-FMA-NEXT: retq 144 ; 145 ; KNL-LABEL: f32_one_step: 146 ; KNL: # %bb.0: 147 ; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] 148 ; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50] 149 ; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50] 150 ; KNL-NEXT: retq # sched: [7:1.00] 151 ; 152 ; SKX-LABEL: f32_one_step: 153 ; SKX: # %bb.0: 154 ; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] 155 ; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50] 156 ; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50] 157 ; SKX-NEXT: retq # sched: [7:1.00] 158 %div = fdiv fast float 1.0, %x 159 ret float %div 160 } 161 162 define float @f32_two_step(float %x) #2 { 163 ; SSE-LABEL: f32_two_step: 164 ; SSE: # %bb.0: 165 ; SSE-NEXT: rcpss %xmm0, %xmm2 166 ; SSE-NEXT: movaps %xmm0, %xmm3 167 ; SSE-NEXT: mulss %xmm2, %xmm3 168 ; SSE-NEXT: movss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero 169 ; SSE-NEXT: movaps %xmm1, %xmm4 170 ; SSE-NEXT: subss %xmm3, %xmm4 171 ; SSE-NEXT: mulss %xmm2, %xmm4 172 ; SSE-NEXT: addss %xmm2, %xmm4 173 ; SSE-NEXT: mulss %xmm4, %xmm0 174 ; SSE-NEXT: subss %xmm0, %xmm1 175 ; SSE-NEXT: mulss %xmm4, %xmm1 176 ; SSE-NEXT: addss %xmm4, %xmm1 177 ; SSE-NEXT: movaps %xmm1, %xmm0 178 ; SSE-NEXT: retq 179 ; 180 ; AVX-RECIP-LABEL: f32_two_step: 181 ; AVX-RECIP: # %bb.0: 182 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 183 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2 184 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 185 ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2 186 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2 187 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1 188 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0 189 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0 190 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 191 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0 192 ; AVX-RECIP-NEXT: retq 193 ; 194 ; FMA-RECIP-LABEL: f32_two_step: 195 ; FMA-RECIP: # %bb.0: 196 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 197 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 198 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 199 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 200 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 201 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 202 ; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 203 ; FMA-RECIP-NEXT: retq 204 ; 205 ; BTVER2-LABEL: f32_two_step: 206 ; BTVER2: # %bb.0: 207 ; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00] 208 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00] 209 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00] 210 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] 211 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00] 212 ; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] 213 ; BTVER2-NEXT: vmulss 
%xmm1, %xmm0, %xmm0 # sched: [2:1.00] 214 ; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] 215 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00] 216 ; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 217 ; BTVER2-NEXT: retq # sched: [4:1.00] 218 ; 219 ; SANDY-LABEL: f32_two_step: 220 ; SANDY: # %bb.0: 221 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] 222 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00] 223 ; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50] 224 ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00] 225 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00] 226 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00] 227 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00] 228 ; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00] 229 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00] 230 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 231 ; SANDY-NEXT: retq # sched: [1:1.00] 232 ; 233 ; HASWELL-LABEL: f32_two_step: 234 ; HASWELL: # %bb.0: 235 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] 236 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50] 237 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] 238 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50] 239 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50] 240 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50] 241 ; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50] 242 ; HASWELL-NEXT: retq # sched: [7:1.00] 243 ; 244 ; HASWELL-NO-FMA-LABEL: f32_two_step: 245 ; HASWELL-NO-FMA: # %bb.0: 246 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 247 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 248 ; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 249 ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 250 ; 
HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 251 ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 252 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 253 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 254 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 255 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 256 ; HASWELL-NO-FMA-NEXT: retq 257 ; 258 ; KNL-LABEL: f32_two_step: 259 ; KNL: # %bb.0: 260 ; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00] 261 ; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50] 262 ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] 263 ; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50] 264 ; KNL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50] 265 ; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50] 266 ; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50] 267 ; KNL-NEXT: retq # sched: [7:1.00] 268 ; 269 ; SKX-LABEL: f32_two_step: 270 ; SKX: # %bb.0: 271 ; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00] 272 ; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50] 273 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33] 274 ; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50] 275 ; SKX-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50] 276 ; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50] 277 ; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50] 278 ; SKX-NEXT: retq # sched: [7:1.00] 279 %div = fdiv fast float 1.0, %x 280 ret float %div 281 } 282 283 define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 { 284 ; SSE-LABEL: v4f32_no_estimate: 285 ; SSE: # %bb.0: 286 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 287 ; SSE-NEXT: divps %xmm0, %xmm1 288 ; SSE-NEXT: movaps %xmm1, %xmm0 289 ; SSE-NEXT: retq 290 ; 291 ; AVX-RECIP-LABEL: v4f32_no_estimate: 
292 ; AVX-RECIP: # %bb.0: 293 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 294 ; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 295 ; AVX-RECIP-NEXT: retq 296 ; 297 ; FMA-RECIP-LABEL: v4f32_no_estimate: 298 ; FMA-RECIP: # %bb.0: 299 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 300 ; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 301 ; FMA-RECIP-NEXT: retq 302 ; 303 ; BTVER2-LABEL: v4f32_no_estimate: 304 ; BTVER2: # %bb.0: 305 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 306 ; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00] 307 ; BTVER2-NEXT: retq # sched: [4:1.00] 308 ; 309 ; SANDY-LABEL: v4f32_no_estimate: 310 ; SANDY: # %bb.0: 311 ; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] 312 ; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [14:14.00] 313 ; SANDY-NEXT: retq # sched: [1:1.00] 314 ; 315 ; HASWELL-LABEL: v4f32_no_estimate: 316 ; HASWELL: # %bb.0: 317 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50] 318 ; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00] 319 ; HASWELL-NEXT: retq # sched: [7:1.00] 320 ; 321 ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate: 322 ; HASWELL-NO-FMA: # %bb.0: 323 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] 324 ; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 325 ; HASWELL-NO-FMA-NEXT: retq 326 ; 327 ; KNL-LABEL: v4f32_no_estimate: 328 ; KNL: # %bb.0: 329 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50] 330 ; KNL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00] 331 ; KNL-NEXT: retq # sched: [7:1.00] 332 ; 333 ; SKX-LABEL: v4f32_no_estimate: 334 ; SKX: # %bb.0: 335 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50] 336 ; SKX-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [11:3.00] 337 ; SKX-NEXT: retq # sched: [7:1.00] 338 %div = fdiv 
fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x 339 ret <4 x float> %div 340 } 341 342 define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { 343 ; SSE-LABEL: v4f32_one_step: 344 ; SSE: # %bb.0: 345 ; SSE-NEXT: rcpps %xmm0, %xmm2 346 ; SSE-NEXT: mulps %xmm2, %xmm0 347 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 348 ; SSE-NEXT: subps %xmm0, %xmm1 349 ; SSE-NEXT: mulps %xmm2, %xmm1 350 ; SSE-NEXT: addps %xmm2, %xmm1 351 ; SSE-NEXT: movaps %xmm1, %xmm0 352 ; SSE-NEXT: retq 353 ; 354 ; AVX-RECIP-LABEL: v4f32_one_step: 355 ; AVX-RECIP: # %bb.0: 356 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 357 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 358 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 359 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 360 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 361 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 362 ; AVX-RECIP-NEXT: retq 363 ; 364 ; FMA-RECIP-LABEL: v4f32_one_step: 365 ; FMA-RECIP: # %bb.0: 366 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 367 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem 368 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 369 ; FMA-RECIP-NEXT: retq 370 ; 371 ; BTVER2-LABEL: v4f32_one_step: 372 ; BTVER2: # %bb.0: 373 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 374 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00] 375 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] 376 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] 377 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] 378 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 379 ; BTVER2-NEXT: retq # sched: [4:1.00] 380 ; 381 ; SANDY-LABEL: v4f32_one_step: 382 ; SANDY: # %bb.0: 383 ; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 384 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] 385 ; SANDY-NEXT: vmovaps 
{{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] 386 ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00] 387 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] 388 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 389 ; SANDY-NEXT: retq # sched: [1:1.00] 390 ; 391 ; HASWELL-LABEL: v4f32_one_step: 392 ; HASWELL: # %bb.0: 393 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 394 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] 395 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50] 396 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50] 397 ; HASWELL-NEXT: retq # sched: [7:1.00] 398 ; 399 ; HASWELL-NO-FMA-LABEL: v4f32_one_step: 400 ; HASWELL-NO-FMA: # %bb.0: 401 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 402 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 403 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] 404 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 405 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 406 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 407 ; HASWELL-NO-FMA-NEXT: retq 408 ; 409 ; KNL-LABEL: v4f32_one_step: 410 ; KNL: # %bb.0: 411 ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 412 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] 413 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50] 414 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50] 415 ; KNL-NEXT: retq # sched: [7:1.00] 416 ; 417 ; SKX-LABEL: v4f32_one_step: 418 ; SKX: # %bb.0: 419 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00] 420 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50] 421 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50] 422 ; SKX-NEXT: retq # sched: [7:1.00] 423 %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x 424 ret <4 x float> %div 425 } 426 427 
define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { 428 ; SSE-LABEL: v4f32_two_step: 429 ; SSE: # %bb.0: 430 ; SSE-NEXT: rcpps %xmm0, %xmm2 431 ; SSE-NEXT: movaps %xmm0, %xmm3 432 ; SSE-NEXT: mulps %xmm2, %xmm3 433 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 434 ; SSE-NEXT: movaps %xmm1, %xmm4 435 ; SSE-NEXT: subps %xmm3, %xmm4 436 ; SSE-NEXT: mulps %xmm2, %xmm4 437 ; SSE-NEXT: addps %xmm2, %xmm4 438 ; SSE-NEXT: mulps %xmm4, %xmm0 439 ; SSE-NEXT: subps %xmm0, %xmm1 440 ; SSE-NEXT: mulps %xmm4, %xmm1 441 ; SSE-NEXT: addps %xmm4, %xmm1 442 ; SSE-NEXT: movaps %xmm1, %xmm0 443 ; SSE-NEXT: retq 444 ; 445 ; AVX-RECIP-LABEL: v4f32_two_step: 446 ; AVX-RECIP: # %bb.0: 447 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 448 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 449 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 450 ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 451 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 452 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 453 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 454 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 455 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 456 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 457 ; AVX-RECIP-NEXT: retq 458 ; 459 ; FMA-RECIP-LABEL: v4f32_two_step: 460 ; FMA-RECIP: # %bb.0: 461 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 462 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 463 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 464 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 465 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 466 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 467 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 468 ; FMA-RECIP-NEXT: retq 469 ; 470 ; BTVER2-LABEL: v4f32_two_step: 471 ; BTVER2: # %bb.0: 472 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 473 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00] 474 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00] 475 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] 476 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00] 477 ; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] 478 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] 479 ; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] 480 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] 481 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 482 ; BTVER2-NEXT: retq # sched: [4:1.00] 483 ; 484 ; SANDY-LABEL: v4f32_two_step: 485 ; SANDY: # %bb.0: 486 ; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 487 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00] 488 ; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] 489 ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] 490 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00] 491 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] 492 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] 493 ; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] 494 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] 495 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 496 ; SANDY-NEXT: retq # sched: [1:1.00] 497 ; 498 ; HASWELL-LABEL: v4f32_two_step: 499 ; HASWELL: # %bb.0: 500 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 501 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] 502 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] 503 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50] 504 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50] 505 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50] 506 ; HASWELL-NEXT: 
vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50] 507 ; HASWELL-NEXT: retq # sched: [7:1.00] 508 ; 509 ; HASWELL-NO-FMA-LABEL: v4f32_two_step: 510 ; HASWELL-NO-FMA: # %bb.0: 511 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 512 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 513 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] 514 ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 515 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 516 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 517 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 518 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 519 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 520 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 521 ; HASWELL-NO-FMA-NEXT: retq 522 ; 523 ; KNL-LABEL: v4f32_two_step: 524 ; KNL: # %bb.0: 525 ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 526 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] 527 ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] 528 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50] 529 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50] 530 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50] 531 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50] 532 ; KNL-NEXT: retq # sched: [7:1.00] 533 ; 534 ; SKX-LABEL: v4f32_two_step: 535 ; SKX: # %bb.0: 536 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00] 537 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] 538 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33] 539 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50] 540 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50] 541 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50] 542 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50] 543 ; SKX-NEXT: retq # sched: [7:1.00] 544 %div = fdiv fast 
<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x 545 ret <4 x float> %div 546 } 547 548 define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { 549 ; SSE-LABEL: v8f32_no_estimate: 550 ; SSE: # %bb.0: 551 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 552 ; SSE-NEXT: movaps %xmm2, %xmm3 553 ; SSE-NEXT: divps %xmm0, %xmm3 554 ; SSE-NEXT: divps %xmm1, %xmm2 555 ; SSE-NEXT: movaps %xmm3, %xmm0 556 ; SSE-NEXT: movaps %xmm2, %xmm1 557 ; SSE-NEXT: retq 558 ; 559 ; AVX-RECIP-LABEL: v8f32_no_estimate: 560 ; AVX-RECIP: # %bb.0: 561 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 562 ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 563 ; AVX-RECIP-NEXT: retq 564 ; 565 ; FMA-RECIP-LABEL: v8f32_no_estimate: 566 ; FMA-RECIP: # %bb.0: 567 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 568 ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 569 ; FMA-RECIP-NEXT: retq 570 ; 571 ; BTVER2-LABEL: v8f32_no_estimate: 572 ; BTVER2: # %bb.0: 573 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 574 ; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00] 575 ; BTVER2-NEXT: retq # sched: [4:1.00] 576 ; 577 ; SANDY-LABEL: v8f32_no_estimate: 578 ; SANDY: # %bb.0: 579 ; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 580 ; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [29:28.00] 581 ; SANDY-NEXT: retq # sched: [1:1.00] 582 ; 583 ; HASWELL-LABEL: v8f32_no_estimate: 584 ; HASWELL: # %bb.0: 585 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 586 ; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: 
[21:14.00] 587 ; HASWELL-NEXT: retq # sched: [7:1.00] 588 ; 589 ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate: 590 ; HASWELL-NO-FMA: # %bb.0: 591 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] 592 ; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 593 ; HASWELL-NO-FMA-NEXT: retq 594 ; 595 ; KNL-LABEL: v8f32_no_estimate: 596 ; KNL: # %bb.0: 597 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 598 ; KNL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00] 599 ; KNL-NEXT: retq # sched: [7:1.00] 600 ; 601 ; SKX-LABEL: v8f32_no_estimate: 602 ; SKX: # %bb.0: 603 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 604 ; SKX-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [11:5.00] 605 ; SKX-NEXT: retq # sched: [7:1.00] 606 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 607 ret <8 x float> %div 608 } 609 610 define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { 611 ; SSE-LABEL: v8f32_one_step: 612 ; SSE: # %bb.0: 613 ; SSE-NEXT: rcpps %xmm0, %xmm4 614 ; SSE-NEXT: mulps %xmm4, %xmm0 615 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 616 ; SSE-NEXT: movaps %xmm2, %xmm3 617 ; SSE-NEXT: subps %xmm0, %xmm3 618 ; SSE-NEXT: mulps %xmm4, %xmm3 619 ; SSE-NEXT: addps %xmm4, %xmm3 620 ; SSE-NEXT: rcpps %xmm1, %xmm0 621 ; SSE-NEXT: mulps %xmm0, %xmm1 622 ; SSE-NEXT: subps %xmm1, %xmm2 623 ; SSE-NEXT: mulps %xmm0, %xmm2 624 ; SSE-NEXT: addps %xmm0, %xmm2 625 ; SSE-NEXT: movaps %xmm3, %xmm0 626 ; SSE-NEXT: movaps %xmm2, %xmm1 627 ; SSE-NEXT: retq 628 ; 629 ; AVX-RECIP-LABEL: v8f32_one_step: 630 ; AVX-RECIP: # %bb.0: 631 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 632 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 633 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 634 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 635 ; 
AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 636 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 637 ; AVX-RECIP-NEXT: retq 638 ; 639 ; FMA-RECIP-LABEL: v8f32_one_step: 640 ; FMA-RECIP: # %bb.0: 641 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 642 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem 643 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 644 ; FMA-RECIP-NEXT: retq 645 ; 646 ; BTVER2-LABEL: v8f32_one_step: 647 ; BTVER2: # %bb.0: 648 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 649 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] 650 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] 651 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 652 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 653 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] 654 ; BTVER2-NEXT: retq # sched: [4:1.00] 655 ; 656 ; SANDY-LABEL: v8f32_one_step: 657 ; SANDY: # %bb.0: 658 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] 659 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] 660 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 661 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 662 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 663 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 664 ; SANDY-NEXT: retq # sched: [1:1.00] 665 ; 666 ; HASWELL-LABEL: v8f32_one_step: 667 ; HASWELL: # %bb.0: 668 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 669 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 670 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 671 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 672 ; HASWELL-NEXT: retq # sched: [7:1.00] 673 ; 674 ; 
HASWELL-NO-FMA-LABEL: v8f32_one_step: 675 ; HASWELL-NO-FMA: # %bb.0: 676 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 677 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 678 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] 679 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 680 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 681 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 682 ; HASWELL-NO-FMA-NEXT: retq 683 ; 684 ; KNL-LABEL: v8f32_one_step: 685 ; KNL: # %bb.0: 686 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 687 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 688 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 689 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 690 ; KNL-NEXT: retq # sched: [7:1.00] 691 ; 692 ; SKX-LABEL: v8f32_one_step: 693 ; SKX: # %bb.0: 694 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] 695 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50] 696 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50] 697 ; SKX-NEXT: retq # sched: [7:1.00] 698 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 699 ret <8 x float> %div 700 } 701 702 define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { 703 ; SSE-LABEL: v8f32_two_step: 704 ; SSE: # %bb.0: 705 ; SSE-NEXT: movaps %xmm1, %xmm2 706 ; SSE-NEXT: rcpps %xmm0, %xmm3 707 ; SSE-NEXT: movaps %xmm0, %xmm4 708 ; SSE-NEXT: mulps %xmm3, %xmm4 709 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 710 ; SSE-NEXT: movaps %xmm1, %xmm5 711 ; SSE-NEXT: subps %xmm4, %xmm5 712 ; SSE-NEXT: mulps %xmm3, %xmm5 713 ; SSE-NEXT: addps %xmm3, %xmm5 714 ; SSE-NEXT: mulps %xmm5, %xmm0 715 ; SSE-NEXT: movaps %xmm1, %xmm3 716 ; SSE-NEXT: subps %xmm0, %xmm3 717 ; SSE-NEXT: mulps %xmm5, %xmm3 718 ; SSE-NEXT: addps %xmm5, %xmm3 719 ; SSE-NEXT: rcpps 
%xmm2, %xmm0 720 ; SSE-NEXT: movaps %xmm2, %xmm4 721 ; SSE-NEXT: mulps %xmm0, %xmm4 722 ; SSE-NEXT: movaps %xmm1, %xmm5 723 ; SSE-NEXT: subps %xmm4, %xmm5 724 ; SSE-NEXT: mulps %xmm0, %xmm5 725 ; SSE-NEXT: addps %xmm0, %xmm5 726 ; SSE-NEXT: mulps %xmm5, %xmm2 727 ; SSE-NEXT: subps %xmm2, %xmm1 728 ; SSE-NEXT: mulps %xmm5, %xmm1 729 ; SSE-NEXT: addps %xmm5, %xmm1 730 ; SSE-NEXT: movaps %xmm3, %xmm0 731 ; SSE-NEXT: retq 732 ; 733 ; AVX-RECIP-LABEL: v8f32_two_step: 734 ; AVX-RECIP: # %bb.0: 735 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 736 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 737 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 738 ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 739 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 740 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 741 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 742 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 743 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 744 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 745 ; AVX-RECIP-NEXT: retq 746 ; 747 ; FMA-RECIP-LABEL: v8f32_two_step: 748 ; FMA-RECIP: # %bb.0: 749 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 750 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 751 ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 752 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 753 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 754 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 755 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 756 ; FMA-RECIP-NEXT: retq 757 ; 758 ; BTVER2-LABEL: v8f32_two_step: 759 ; BTVER2: # %bb.0: 760 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 761 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # 
sched: [2:2.00] 762 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00] 763 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00] 764 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00] 765 ; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00] 766 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] 767 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] 768 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 769 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] 770 ; BTVER2-NEXT: retq # sched: [4:1.00] 771 ; 772 ; SANDY-LABEL: v8f32_two_step: 773 ; SANDY: # %bb.0: 774 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] 775 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] 776 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 777 ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] 778 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] 779 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] 780 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] 781 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 782 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 783 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 784 ; SANDY-NEXT: retq # sched: [1:1.00] 785 ; 786 ; HASWELL-LABEL: v8f32_two_step: 787 ; HASWELL: # %bb.0: 788 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 789 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 790 ; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] 791 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50] 792 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50] 793 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50] 794 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 
sched: [5:0.50] 795 ; HASWELL-NEXT: retq # sched: [7:1.00] 796 ; 797 ; HASWELL-NO-FMA-LABEL: v8f32_two_step: 798 ; HASWELL-NO-FMA: # %bb.0: 799 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 800 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 801 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] 802 ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 803 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 804 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 805 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 806 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 807 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 808 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 809 ; HASWELL-NO-FMA-NEXT: retq 810 ; 811 ; KNL-LABEL: v8f32_two_step: 812 ; KNL: # %bb.0: 813 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 814 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 815 ; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] 816 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50] 817 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50] 818 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50] 819 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50] 820 ; KNL-NEXT: retq # sched: [7:1.00] 821 ; 822 ; SKX-LABEL: v8f32_two_step: 823 ; SKX: # %bb.0: 824 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] 825 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 826 ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:0.33] 827 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50] 828 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50] 829 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50] 830 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50] 831 ; SKX-NEXT: retq # sched: [7:1.00] 832 %div = fdiv fast <8 x float> <float 1.0, 
float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 833 ret <8 x float> %div 834 } 835 836 define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { 837 ; SSE-LABEL: v16f32_no_estimate: 838 ; SSE: # %bb.0: 839 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 840 ; SSE-NEXT: movaps %xmm4, %xmm5 841 ; SSE-NEXT: divps %xmm0, %xmm5 842 ; SSE-NEXT: movaps %xmm4, %xmm6 843 ; SSE-NEXT: divps %xmm1, %xmm6 844 ; SSE-NEXT: movaps %xmm4, %xmm7 845 ; SSE-NEXT: divps %xmm2, %xmm7 846 ; SSE-NEXT: divps %xmm3, %xmm4 847 ; SSE-NEXT: movaps %xmm5, %xmm0 848 ; SSE-NEXT: movaps %xmm6, %xmm1 849 ; SSE-NEXT: movaps %xmm7, %xmm2 850 ; SSE-NEXT: movaps %xmm4, %xmm3 851 ; SSE-NEXT: retq 852 ; 853 ; AVX-RECIP-LABEL: v16f32_no_estimate: 854 ; AVX-RECIP: # %bb.0: 855 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 856 ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 857 ; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 858 ; AVX-RECIP-NEXT: retq 859 ; 860 ; FMA-RECIP-LABEL: v16f32_no_estimate: 861 ; FMA-RECIP: # %bb.0: 862 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 863 ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 864 ; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 865 ; FMA-RECIP-NEXT: retq 866 ; 867 ; BTVER2-LABEL: v16f32_no_estimate: 868 ; BTVER2: # %bb.0: 869 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 870 ; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 # sched: [38:38.00] 871 ; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1 # sched: [38:38.00] 872 ; BTVER2-NEXT: retq # sched: [4:1.00] 873 ; 874 ; SANDY-LABEL: v16f32_no_estimate: 875 ; SANDY: # %bb.0: 876 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 877 ; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0 # sched: [29:28.00] 878 ; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1 # sched: [29:28.00] 879 ; SANDY-NEXT: retq # sched: [1:1.00] 880 ; 881 ; HASWELL-LABEL: v16f32_no_estimate: 882 ; HASWELL: # %bb.0: 883 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 884 ; HASWELL-NEXT: vdivps %ymm0, %ymm2, %ymm0 # sched: [21:14.00] 885 ; HASWELL-NEXT: vdivps %ymm1, %ymm2, %ymm1 # sched: [21:14.00] 886 ; HASWELL-NEXT: retq # sched: [7:1.00] 887 ; 888 ; HASWELL-NO-FMA-LABEL: v16f32_no_estimate: 889 ; HASWELL-NO-FMA: # %bb.0: 890 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] 891 ; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm2, %ymm0 892 ; HASWELL-NO-FMA-NEXT: vdivps %ymm1, %ymm2, %ymm1 893 ; HASWELL-NO-FMA-NEXT: retq 894 ; 895 ; KNL-LABEL: v16f32_no_estimate: 896 ; KNL: # %bb.0: 897 ; KNL-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00] 898 ; KNL-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [21:14.00] 899 ; KNL-NEXT: retq # sched: [7:1.00] 900 ; 901 ; SKX-LABEL: v16f32_no_estimate: 902 ; SKX: # %bb.0: 903 ; SKX-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50] 904 ; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00] 905 ; SKX-NEXT: retq # sched: [7:1.00] 906 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 907 ret <16 x float> %div 908 } 909 910 define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { 911 ; SSE-LABEL: v16f32_one_step: 912 ; SSE: # %bb.0: 913 ; SSE-NEXT: movaps %xmm3, %xmm4 914 ; SSE-NEXT: movaps %xmm0, %xmm5 915 ; SSE-NEXT: rcpps %xmm0, %xmm6 916 ; SSE-NEXT: mulps %xmm6, %xmm5 917 ; SSE-NEXT: movaps {{.*#+}} xmm3 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 918 ; SSE-NEXT: movaps %xmm3, %xmm0 919 ; SSE-NEXT: subps %xmm5, %xmm0 920 ; SSE-NEXT: mulps %xmm6, %xmm0 921 ; SSE-NEXT: addps %xmm6, %xmm0 922 ; SSE-NEXT: rcpps %xmm1, %xmm6 923 ; SSE-NEXT: mulps %xmm6, %xmm1 924 ; SSE-NEXT: movaps %xmm3, %xmm5 925 ; SSE-NEXT: subps %xmm1, %xmm5 926 ; SSE-NEXT: mulps %xmm6, %xmm5 927 ; SSE-NEXT: addps %xmm6, %xmm5 928 ; SSE-NEXT: rcpps %xmm2, %xmm1 929 ; SSE-NEXT: mulps %xmm1, %xmm2 930 ; SSE-NEXT: movaps %xmm3, %xmm6 931 ; SSE-NEXT: subps %xmm2, %xmm6 932 ; SSE-NEXT: mulps %xmm1, %xmm6 933 ; SSE-NEXT: addps %xmm1, %xmm6 934 ; SSE-NEXT: rcpps %xmm4, %xmm1 935 ; SSE-NEXT: mulps %xmm1, %xmm4 936 ; SSE-NEXT: subps %xmm4, %xmm3 937 ; SSE-NEXT: mulps %xmm1, %xmm3 938 ; SSE-NEXT: addps %xmm1, %xmm3 939 ; SSE-NEXT: movaps %xmm5, %xmm1 940 ; SSE-NEXT: movaps %xmm6, %xmm2 941 ; SSE-NEXT: retq 942 ; 943 ; AVX-RECIP-LABEL: v16f32_one_step: 944 ; AVX-RECIP: # %bb.0: 945 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 946 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 947 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 948 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 949 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 950 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 951 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 952 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 953 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 954 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 955 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 956 ; AVX-RECIP-NEXT: retq 957 ; 958 ; FMA-RECIP-LABEL: v16f32_one_step: 959 ; FMA-RECIP: # %bb.0: 960 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 961 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 962 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 963 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 
= (ymm0 * ymm2) + ymm2 964 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 965 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 966 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 967 ; FMA-RECIP-NEXT: retq 968 ; 969 ; BTVER2-LABEL: v16f32_one_step: 970 ; BTVER2: # %bb.0: 971 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 972 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 # sched: [2:2.00] 973 ; BTVER2-NEXT: vrcpps %ymm1, %ymm4 # sched: [2:2.00] 974 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00] 975 ; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1 # sched: [2:2.00] 976 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] 977 ; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00] 978 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00] 979 ; BTVER2-NEXT: vmulps %ymm1, %ymm4, %ymm1 # sched: [2:2.00] 980 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 981 ; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1 # sched: [3:2.00] 982 ; BTVER2-NEXT: retq # sched: [4:1.00] 983 ; 984 ; SANDY-LABEL: v16f32_one_step: 985 ; SANDY: # %bb.0: 986 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00] 987 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00] 988 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 989 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 990 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00] 991 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 992 ; SANDY-NEXT: vrcpps %ymm1, %ymm2 # sched: [7:2.00] 993 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00] 994 ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00] 995 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00] 996 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00] 997 ; SANDY-NEXT: 
retq # sched: [1:1.00] 998 ; 999 ; HASWELL-LABEL: v16f32_one_step: 1000 ; HASWELL: # %bb.0: 1001 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00] 1002 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1003 ; HASWELL-NEXT: vrcpps %ymm1, %ymm4 # sched: [11:2.00] 1004 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50] 1005 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50] 1006 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50] 1007 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50] 1008 ; HASWELL-NEXT: retq # sched: [7:1.00] 1009 ; 1010 ; HASWELL-NO-FMA-LABEL: v16f32_one_step: 1011 ; HASWELL-NO-FMA: # %bb.0: 1012 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 1013 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 1014 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] 1015 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 1016 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 1017 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 1018 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 1019 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 1020 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 1021 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 1022 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 1023 ; HASWELL-NO-FMA-NEXT: retq 1024 ; 1025 ; KNL-LABEL: v16f32_one_step: 1026 ; KNL: # %bb.0: 1027 ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00] 1028 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50] 1029 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50] 1030 ; KNL-NEXT: retq # sched: [7:1.00] 1031 ; 1032 ; SKX-LABEL: v16f32_one_step: 1033 ; SKX: # %bb.0: 1034 ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00] 1035 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50] 1036 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 
* zmm1) + zmm1 sched: [4:0.50] 1037 ; SKX-NEXT: retq # sched: [7:1.00] 1038 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1039 ret <16 x float> %div 1040 } 1041 1042 define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { 1043 ; SSE-LABEL: v16f32_two_step: 1044 ; SSE: # %bb.0: 1045 ; SSE-NEXT: movaps %xmm3, %xmm4 1046 ; SSE-NEXT: movaps %xmm1, %xmm5 1047 ; SSE-NEXT: movaps %xmm0, %xmm1 1048 ; SSE-NEXT: rcpps %xmm0, %xmm0 1049 ; SSE-NEXT: movaps %xmm1, %xmm6 1050 ; SSE-NEXT: mulps %xmm0, %xmm6 1051 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1052 ; SSE-NEXT: movaps %xmm3, %xmm7 1053 ; SSE-NEXT: subps %xmm6, %xmm7 1054 ; SSE-NEXT: mulps %xmm0, %xmm7 1055 ; SSE-NEXT: addps %xmm0, %xmm7 1056 ; SSE-NEXT: mulps %xmm7, %xmm1 1057 ; SSE-NEXT: movaps %xmm3, %xmm0 1058 ; SSE-NEXT: subps %xmm1, %xmm0 1059 ; SSE-NEXT: mulps %xmm7, %xmm0 1060 ; SSE-NEXT: addps %xmm7, %xmm0 1061 ; SSE-NEXT: rcpps %xmm5, %xmm1 1062 ; SSE-NEXT: movaps %xmm5, %xmm6 1063 ; SSE-NEXT: mulps %xmm1, %xmm6 1064 ; SSE-NEXT: movaps %xmm3, %xmm7 1065 ; SSE-NEXT: subps %xmm6, %xmm7 1066 ; SSE-NEXT: mulps %xmm1, %xmm7 1067 ; SSE-NEXT: addps %xmm1, %xmm7 1068 ; SSE-NEXT: mulps %xmm7, %xmm5 1069 ; SSE-NEXT: movaps %xmm3, %xmm1 1070 ; SSE-NEXT: subps %xmm5, %xmm1 1071 ; SSE-NEXT: mulps %xmm7, %xmm1 1072 ; SSE-NEXT: addps %xmm7, %xmm1 1073 ; SSE-NEXT: rcpps %xmm2, %xmm5 1074 ; SSE-NEXT: movaps %xmm2, %xmm6 1075 ; SSE-NEXT: mulps %xmm5, %xmm6 1076 ; SSE-NEXT: movaps %xmm3, %xmm7 1077 ; SSE-NEXT: subps %xmm6, %xmm7 1078 ; SSE-NEXT: mulps %xmm5, %xmm7 1079 ; SSE-NEXT: addps %xmm5, %xmm7 1080 ; SSE-NEXT: mulps %xmm7, %xmm2 1081 ; SSE-NEXT: movaps %xmm3, %xmm5 1082 ; SSE-NEXT: subps %xmm2, %xmm5 1083 ; SSE-NEXT: mulps %xmm7, %xmm5 1084 ; SSE-NEXT: addps %xmm7, %xmm5 1085 ; SSE-NEXT: rcpps %xmm4, %xmm2 1086 ; 
SSE-NEXT: movaps %xmm4, %xmm6 1087 ; SSE-NEXT: mulps %xmm2, %xmm6 1088 ; SSE-NEXT: movaps %xmm3, %xmm7 1089 ; SSE-NEXT: subps %xmm6, %xmm7 1090 ; SSE-NEXT: mulps %xmm2, %xmm7 1091 ; SSE-NEXT: addps %xmm2, %xmm7 1092 ; SSE-NEXT: mulps %xmm7, %xmm4 1093 ; SSE-NEXT: subps %xmm4, %xmm3 1094 ; SSE-NEXT: mulps %xmm7, %xmm3 1095 ; SSE-NEXT: addps %xmm7, %xmm3 1096 ; SSE-NEXT: movaps %xmm5, %xmm2 1097 ; SSE-NEXT: retq 1098 ; 1099 ; AVX-RECIP-LABEL: v16f32_two_step: 1100 ; AVX-RECIP: # %bb.0: 1101 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1102 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 1103 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1104 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 1105 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 1106 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 1107 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 1108 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0 1109 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1110 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 1111 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1112 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 1113 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 1114 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 1115 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 1116 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 1117 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1 1118 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1119 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 1120 ; AVX-RECIP-NEXT: retq 1121 ; 1122 ; FMA-RECIP-LABEL: v16f32_two_step: 1123 ; FMA-RECIP: # %bb.0: 1124 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1125 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1126 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 1127 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 1128 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = 
(ymm4 * ymm2) + ymm2 1129 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 1130 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 1131 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 1132 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 1133 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 1134 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 1135 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 1136 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 1137 ; FMA-RECIP-NEXT: retq 1138 ; 1139 ; BTVER2-LABEL: v16f32_two_step: 1140 ; BTVER2: # %bb.0: 1141 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 1142 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 # sched: [2:2.00] 1143 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00] 1144 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00] 1145 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00] 1146 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00] 1147 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00] 1148 ; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00] 1149 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00] 1150 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 1151 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [2:2.00] 1152 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00] 1153 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00] 1154 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00] 1155 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00] 1156 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00] 1157 ; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00] 1158 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00] 1159 ; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00] 1160 ; BTVER2-NEXT: retq # sched: 
[4:1.00] 1161 ; 1162 ; SANDY-LABEL: v16f32_two_step: 1163 ; SANDY: # %bb.0: 1164 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00] 1165 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00] 1166 ; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 1167 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00] 1168 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00] 1169 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00] 1170 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00] 1171 ; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00] 1172 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00] 1173 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1174 ; SANDY-NEXT: vrcpps %ymm1, %ymm2 # sched: [7:2.00] 1175 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00] 1176 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00] 1177 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00] 1178 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00] 1179 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00] 1180 ; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00] 1181 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00] 1182 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00] 1183 ; SANDY-NEXT: retq # sched: [1:1.00] 1184 ; 1185 ; HASWELL-LABEL: v16f32_two_step: 1186 ; HASWELL: # %bb.0: 1187 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00] 1188 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1189 ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 # sched: [1:1.00] 1190 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50] 1191 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50] 1192 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50] 1193 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} 
ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50] 1194 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00] 1195 ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 # sched: [1:1.00] 1196 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50] 1197 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50] 1198 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50] 1199 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50] 1200 ; HASWELL-NEXT: retq # sched: [7:1.00] 1201 ; 1202 ; HASWELL-NO-FMA-LABEL: v16f32_two_step: 1203 ; HASWELL-NO-FMA: # %bb.0: 1204 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 1205 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3 1206 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] 1207 ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 1208 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 1209 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 1210 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 1211 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 1212 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 1213 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 1214 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 1215 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 1216 ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 1217 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 1218 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 1219 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 1220 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 1221 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 1222 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 1223 ; HASWELL-NO-FMA-NEXT: retq 1224 ; 1225 ; KNL-LABEL: v16f32_two_step: 1226 ; KNL: # %bb.0: 1227 ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00] 1228 ; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00] 1229 ; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00] 
1230 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50] 1231 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50] 1232 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50] 1233 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50] 1234 ; KNL-NEXT: retq # sched: [7:1.00] 1235 ; 1236 ; SKX-LABEL: v16f32_two_step: 1237 ; SKX: # %bb.0: 1238 ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00] 1239 ; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50] 1240 ; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33] 1241 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50] 1242 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50] 1243 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50] 1244 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50] 1245 ; SKX-NEXT: retq # sched: [7:1.00] 1246 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1247 ret <16 x float> %div 1248 } 1249 1250 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" } 1251 attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } 1252 attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } 1253 1254