; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX

; These tests provide extra coverage for reciprocal-estimate codegen, as discussed on D26855.
; NOTE(review): the #1/#2/#3 function attributes used below are defined later in
; the file (past this excerpt); presumably they select the number of reciprocal
; refinement (Newton-Raphson) steps via "reciprocal-estimates" — confirm there.

; 1234.0/x with no refinement steps requested: expect a lone reciprocal
; estimate (rcpss) feeding the multiply by the constant.
define float @f32_no_step_2(float %x) #3 {
; SSE-LABEL: f32_no_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm0
; SSE-NEXT: mulss {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_no_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_no_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_no_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_no_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: f32_no_step_2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_no_step_2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast float 1234.0, %x
  ret float %div
}

; 3456.0/x with one refinement step: estimate, one Newton-Raphson iteration
; (sub from 1.0, mul, add — fused into FMAs on FMA-capable targets), then the
; multiply by the constant numerator.
define float @f32_one_step_2(float %x) #1 {
; SSE-LABEL: f32_one_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: addss %xmm2, %xmm1
; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: f32_one_step_2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_one_step_2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast float 3456.0, %x
  ret float %div
}

; Two divisions by the same %x (3456.0/x, then /x again): the refined
; reciprocal is computed once and reused for both multiplies.
define float @f32_one_step_2_divs(float %x) #1 {
; SSE-LABEL: f32_one_step_2_divs:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: subss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss %xmm1, %xmm2
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2_divs:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2_divs:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step_2_divs:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; KNL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
; SKX-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast float 3456.0, %x
  %div2 = fdiv fast float %div, %x
  ret float %div2
}

; 6789.0/x with two refinement steps: two Newton-Raphson iterations on the
; estimate before the final multiply by the constant.
define float @f32_two_step_2(float %x) #2 {
; SSE-LABEL: f32_two_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulss %xmm2, %xmm3
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subss %xmm3, %xmm4
; SSE-NEXT: mulss %xmm2, %xmm4
; SSE-NEXT: addss %xmm2, %xmm4
; SSE-NEXT: mulss %xmm4, %xmm0
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm4, %xmm1
; SSE-NEXT: addss %xmm4, %xmm1
; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_two_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_two_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_two_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_two_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: f32_two_step_2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_two_step_2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast float 6789.0, %x
  ret float %div
}

; Vector form, one refinement step: rcpps plus one iteration against the
; all-ones constant, then multiply by <1.0, 2.0, 3.0, 4.0>.
define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v4f32_one_step2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  ret <4 x float> %div
}

; Vector form of the shared-reciprocal case: <1,2,3,4>/x then /x again
; reuses one refined rcpps result for both multiplies.
define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step_2_divs:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT: subps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: addps %xmm1, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step_2_divs:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v4f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  %div2 = fdiv fast <4 x float> %div, %x
  ret <4 x float> %div2
}

define <4 x float>
@v4f32_two_step2(<4 x float> %x) #2 { 595 ; SSE-LABEL: v4f32_two_step2: 596 ; SSE: # %bb.0: 597 ; SSE-NEXT: rcpps %xmm0, %xmm2 598 ; SSE-NEXT: movaps %xmm0, %xmm3 599 ; SSE-NEXT: mulps %xmm2, %xmm3 600 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 601 ; SSE-NEXT: movaps %xmm1, %xmm4 602 ; SSE-NEXT: subps %xmm3, %xmm4 603 ; SSE-NEXT: mulps %xmm2, %xmm4 604 ; SSE-NEXT: addps %xmm2, %xmm4 605 ; SSE-NEXT: mulps %xmm4, %xmm0 606 ; SSE-NEXT: subps %xmm0, %xmm1 607 ; SSE-NEXT: mulps %xmm4, %xmm1 608 ; SSE-NEXT: addps %xmm4, %xmm1 609 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 610 ; SSE-NEXT: movaps %xmm1, %xmm0 611 ; SSE-NEXT: retq 612 ; 613 ; AVX-RECIP-LABEL: v4f32_two_step2: 614 ; AVX-RECIP: # %bb.0: 615 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 616 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 617 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 618 ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 619 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 620 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 621 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 622 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0 623 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 624 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 625 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 626 ; AVX-RECIP-NEXT: retq 627 ; 628 ; FMA-RECIP-LABEL: v4f32_two_step2: 629 ; FMA-RECIP: # %bb.0: 630 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 631 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 632 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 633 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 634 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 635 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 636 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 637 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 638 ; FMA-RECIP-NEXT: retq 639 ; 640 ; 
BTVER2-LABEL: v4f32_two_step2: 641 ; BTVER2: # %bb.0: 642 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 643 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00] 644 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00] 645 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] 646 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00] 647 ; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] 648 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00] 649 ; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] 650 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00] 651 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 652 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00] 653 ; BTVER2-NEXT: retq # sched: [4:1.00] 654 ; 655 ; SANDY-LABEL: v4f32_two_step2: 656 ; SANDY: # %bb.0: 657 ; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 658 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00] 659 ; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50] 660 ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] 661 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00] 662 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] 663 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00] 664 ; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] 665 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00] 666 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 667 ; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00] 668 ; SANDY-NEXT: retq # sched: [1:1.00] 669 ; 670 ; HASWELL-LABEL: v4f32_two_step2: 671 ; HASWELL: # %bb.0: 672 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 673 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] 674 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] 675 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} 
xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50] 676 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50] 677 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50] 678 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50] 679 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50] 680 ; HASWELL-NEXT: retq # sched: [7:1.00] 681 ; 682 ; HASWELL-NO-FMA-LABEL: v4f32_two_step2: 683 ; HASWELL-NO-FMA: # %bb.0: 684 ; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 685 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50] 686 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50] 687 ; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00] 688 ; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50] 689 ; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00] 690 ; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50] 691 ; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00] 692 ; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50] 693 ; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00] 694 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50] 695 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 696 ; 697 ; KNL-LABEL: v4f32_two_step2: 698 ; KNL: # %bb.0: 699 ; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00] 700 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] 701 ; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00] 702 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50] 703 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50] 704 ; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50] 705 ; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50] 706 ; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50] 707 ; 
KNL-NEXT: retq # sched: [7:1.00] 708 ; 709 ; SKX-LABEL: v4f32_two_step2: 710 ; SKX: # %bb.0: 711 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00] 712 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50] 713 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33] 714 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50] 715 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50] 716 ; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50] 717 ; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50] 718 ; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50] 719 ; SKX-NEXT: retq # sched: [7:1.00] 720 %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x 721 ret <4 x float> %div 722 } 723 724 define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { 725 ; SSE-LABEL: v8f32_one_step2: 726 ; SSE: # %bb.0: 727 ; SSE-NEXT: rcpps %xmm1, %xmm4 728 ; SSE-NEXT: mulps %xmm4, %xmm1 729 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 730 ; SSE-NEXT: movaps %xmm2, %xmm3 731 ; SSE-NEXT: subps %xmm1, %xmm3 732 ; SSE-NEXT: mulps %xmm4, %xmm3 733 ; SSE-NEXT: addps %xmm4, %xmm3 734 ; SSE-NEXT: rcpps %xmm0, %xmm1 735 ; SSE-NEXT: mulps %xmm1, %xmm0 736 ; SSE-NEXT: subps %xmm0, %xmm2 737 ; SSE-NEXT: mulps %xmm1, %xmm2 738 ; SSE-NEXT: addps %xmm1, %xmm2 739 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 740 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 741 ; SSE-NEXT: movaps %xmm2, %xmm0 742 ; SSE-NEXT: movaps %xmm3, %xmm1 743 ; SSE-NEXT: retq 744 ; 745 ; AVX-RECIP-LABEL: v8f32_one_step2: 746 ; AVX-RECIP: # %bb.0: 747 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 748 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 749 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 750 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 751 ; AVX-RECIP-NEXT: vmulps %ymm0, 
%ymm1, %ymm0 752 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 753 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 754 ; AVX-RECIP-NEXT: retq 755 ; 756 ; FMA-RECIP-LABEL: v8f32_one_step2: 757 ; FMA-RECIP: # %bb.0: 758 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 759 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem 760 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 761 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 762 ; FMA-RECIP-NEXT: retq 763 ; 764 ; BTVER2-LABEL: v8f32_one_step2: 765 ; BTVER2: # %bb.0: 766 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 767 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] 768 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] 769 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 770 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 771 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] 772 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 773 ; BTVER2-NEXT: retq # sched: [4:1.00] 774 ; 775 ; SANDY-LABEL: v8f32_one_step2: 776 ; SANDY: # %bb.0: 777 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] 778 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] 779 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 780 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 781 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 782 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 783 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 784 ; SANDY-NEXT: retq # sched: [1:1.00] 785 ; 786 ; HASWELL-LABEL: v8f32_one_step2: 787 ; HASWELL: # %bb.0: 788 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 789 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 
790 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 791 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 792 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 793 ; HASWELL-NEXT: retq # sched: [7:1.00] 794 ; 795 ; HASWELL-NO-FMA-LABEL: v8f32_one_step2: 796 ; HASWELL-NO-FMA: # %bb.0: 797 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 798 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] 799 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 800 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 801 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 802 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 803 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 804 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 805 ; 806 ; KNL-LABEL: v8f32_one_step2: 807 ; KNL: # %bb.0: 808 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 809 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 810 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 811 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 812 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 813 ; KNL-NEXT: retq # sched: [7:1.00] 814 ; 815 ; SKX-LABEL: v8f32_one_step2: 816 ; SKX: # %bb.0: 817 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] 818 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50] 819 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50] 820 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50] 821 ; SKX-NEXT: retq # sched: [7:1.00] 822 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 823 ret <8 x float> %div 824 } 825 826 define <8 x float> 
@v8f32_one_step_2_divs(<8 x float> %x) #1 { 827 ; SSE-LABEL: v8f32_one_step_2_divs: 828 ; SSE: # %bb.0: 829 ; SSE-NEXT: rcpps %xmm0, %xmm2 830 ; SSE-NEXT: mulps %xmm2, %xmm0 831 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 832 ; SSE-NEXT: movaps %xmm3, %xmm4 833 ; SSE-NEXT: subps %xmm0, %xmm4 834 ; SSE-NEXT: mulps %xmm2, %xmm4 835 ; SSE-NEXT: addps %xmm2, %xmm4 836 ; SSE-NEXT: rcpps %xmm1, %xmm0 837 ; SSE-NEXT: mulps %xmm0, %xmm1 838 ; SSE-NEXT: subps %xmm1, %xmm3 839 ; SSE-NEXT: mulps %xmm0, %xmm3 840 ; SSE-NEXT: addps %xmm0, %xmm3 841 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] 842 ; SSE-NEXT: mulps %xmm3, %xmm1 843 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] 844 ; SSE-NEXT: mulps %xmm4, %xmm0 845 ; SSE-NEXT: mulps %xmm4, %xmm0 846 ; SSE-NEXT: mulps %xmm3, %xmm1 847 ; SSE-NEXT: retq 848 ; 849 ; AVX-RECIP-LABEL: v8f32_one_step_2_divs: 850 ; AVX-RECIP: # %bb.0: 851 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 852 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 853 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 854 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 855 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 856 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 857 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 858 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 859 ; AVX-RECIP-NEXT: retq 860 ; 861 ; FMA-RECIP-LABEL: v8f32_one_step_2_divs: 862 ; FMA-RECIP: # %bb.0: 863 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 864 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem 865 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 866 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 867 ; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 868 ; FMA-RECIP-NEXT: retq 869 ; 870 ; BTVER2-LABEL: v8f32_one_step_2_divs: 871 ; BTVER2: # %bb.0: 
872 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 873 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] 874 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] 875 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 876 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 877 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] 878 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00] 879 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 880 ; BTVER2-NEXT: retq # sched: [4:1.00] 881 ; 882 ; SANDY-LABEL: v8f32_one_step_2_divs: 883 ; SANDY: # %bb.0: 884 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] 885 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] 886 ; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 887 ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 888 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 889 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 890 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00] 891 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 892 ; SANDY-NEXT: retq # sched: [1:1.00] 893 ; 894 ; HASWELL-LABEL: v8f32_one_step_2_divs: 895 ; HASWELL: # %bb.0: 896 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 897 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 898 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 899 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 900 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50] 901 ; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 902 ; HASWELL-NEXT: retq # sched: [7:1.00] 903 ; 904 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: 
905 ; HASWELL-NO-FMA: # %bb.0: 906 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 907 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] 908 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 909 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 910 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 911 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 912 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50] 913 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 914 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 915 ; 916 ; KNL-LABEL: v8f32_one_step_2_divs: 917 ; KNL: # %bb.0: 918 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 919 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 920 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 921 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 922 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50] 923 ; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 924 ; KNL-NEXT: retq # sched: [7:1.00] 925 ; 926 ; SKX-LABEL: v8f32_one_step_2_divs: 927 ; SKX: # %bb.0: 928 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] 929 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50] 930 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50] 931 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50] 932 ; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] 933 ; SKX-NEXT: retq # sched: [7:1.00] 934 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 935 %div2 = fdiv fast <8 x float> %div, %x 936 ret <8 x float> %div2 937 } 938 939 define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { 940 ; SSE-LABEL: v8f32_two_step2: 941 ; SSE: # %bb.0: 942 ; SSE-NEXT: 
movaps %xmm0, %xmm2 943 ; SSE-NEXT: rcpps %xmm1, %xmm3 944 ; SSE-NEXT: movaps %xmm1, %xmm4 945 ; SSE-NEXT: mulps %xmm3, %xmm4 946 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 947 ; SSE-NEXT: movaps %xmm0, %xmm5 948 ; SSE-NEXT: subps %xmm4, %xmm5 949 ; SSE-NEXT: mulps %xmm3, %xmm5 950 ; SSE-NEXT: addps %xmm3, %xmm5 951 ; SSE-NEXT: mulps %xmm5, %xmm1 952 ; SSE-NEXT: movaps %xmm0, %xmm3 953 ; SSE-NEXT: subps %xmm1, %xmm3 954 ; SSE-NEXT: mulps %xmm5, %xmm3 955 ; SSE-NEXT: addps %xmm5, %xmm3 956 ; SSE-NEXT: rcpps %xmm2, %xmm1 957 ; SSE-NEXT: movaps %xmm2, %xmm4 958 ; SSE-NEXT: mulps %xmm1, %xmm4 959 ; SSE-NEXT: movaps %xmm0, %xmm5 960 ; SSE-NEXT: subps %xmm4, %xmm5 961 ; SSE-NEXT: mulps %xmm1, %xmm5 962 ; SSE-NEXT: addps %xmm1, %xmm5 963 ; SSE-NEXT: mulps %xmm5, %xmm2 964 ; SSE-NEXT: subps %xmm2, %xmm0 965 ; SSE-NEXT: mulps %xmm5, %xmm0 966 ; SSE-NEXT: addps %xmm5, %xmm0 967 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 968 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 969 ; SSE-NEXT: movaps %xmm3, %xmm1 970 ; SSE-NEXT: retq 971 ; 972 ; AVX-RECIP-LABEL: v8f32_two_step2: 973 ; AVX-RECIP: # %bb.0: 974 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 975 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 976 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 977 ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 978 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 979 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 980 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 981 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 982 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 983 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 984 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 985 ; AVX-RECIP-NEXT: retq 986 ; 987 ; FMA-RECIP-LABEL: v8f32_two_step2: 988 ; FMA-RECIP: # %bb.0: 989 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 990 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 991 ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 992 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 993 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 994 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 995 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 996 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 997 ; FMA-RECIP-NEXT: retq 998 ; 999 ; BTVER2-LABEL: v8f32_two_step2: 1000 ; BTVER2: # %bb.0: 1001 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 1002 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] 1003 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00] 1004 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00] 1005 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00] 1006 ; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00] 1007 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] 1008 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] 1009 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 1010 ; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] 1011 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 1012 ; BTVER2-NEXT: retq # sched: [4:1.00] 1013 ; 1014 ; SANDY-LABEL: v8f32_two_step2: 1015 ; SANDY: # %bb.0: 1016 ; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] 1017 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] 1018 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 1019 ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] 1020 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] 1021 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] 1022 ; 
SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] 1023 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1024 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 1025 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 1026 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 1027 ; SANDY-NEXT: retq # sched: [1:1.00] 1028 ; 1029 ; HASWELL-LABEL: v8f32_two_step2: 1030 ; HASWELL: # %bb.0: 1031 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 1032 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1033 ; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] 1034 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50] 1035 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50] 1036 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50] 1037 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50] 1038 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1039 ; HASWELL-NEXT: retq # sched: [7:1.00] 1040 ; 1041 ; HASWELL-NO-FMA-LABEL: v8f32_two_step2: 1042 ; HASWELL-NO-FMA: # %bb.0: 1043 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 1044 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50] 1045 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1046 ; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] 1047 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50] 1048 ; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] 1049 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] 1050 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1051 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 1052 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 1053 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # 
sched: [12:0.50] 1054 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1055 ; 1056 ; KNL-LABEL: v8f32_two_step2: 1057 ; KNL: # %bb.0: 1058 ; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 1059 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1060 ; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] 1061 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50] 1062 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50] 1063 ; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50] 1064 ; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50] 1065 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1066 ; KNL-NEXT: retq # sched: [7:1.00] 1067 ; 1068 ; SKX-LABEL: v8f32_two_step2: 1069 ; SKX: # %bb.0: 1070 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] 1071 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1072 ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:0.33] 1073 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50] 1074 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50] 1075 ; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50] 1076 ; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50] 1077 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50] 1078 ; SKX-NEXT: retq # sched: [7:1.00] 1079 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 1080 ret <8 x float> %div 1081 } 1082 1083 define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { 1084 ; SSE-LABEL: v8f32_no_step: 1085 ; SSE: # %bb.0: 1086 ; SSE-NEXT: rcpps %xmm0, %xmm0 1087 ; SSE-NEXT: rcpps %xmm1, %xmm1 1088 ; SSE-NEXT: retq 1089 ; 1090 ; AVX-RECIP-LABEL: v8f32_no_step: 1091 ; AVX-RECIP: # %bb.0: 1092 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 1093 ; AVX-RECIP-NEXT: retq 1094 ; 1095 ; 
FMA-RECIP-LABEL: v8f32_no_step: 1096 ; FMA-RECIP: # %bb.0: 1097 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 1098 ; FMA-RECIP-NEXT: retq 1099 ; 1100 ; BTVER2-LABEL: v8f32_no_step: 1101 ; BTVER2: # %bb.0: 1102 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] 1103 ; BTVER2-NEXT: retq # sched: [4:1.00] 1104 ; 1105 ; SANDY-LABEL: v8f32_no_step: 1106 ; SANDY: # %bb.0: 1107 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] 1108 ; SANDY-NEXT: retq # sched: [1:1.00] 1109 ; 1110 ; HASWELL-LABEL: v8f32_no_step: 1111 ; HASWELL: # %bb.0: 1112 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1113 ; HASWELL-NEXT: retq # sched: [7:1.00] 1114 ; 1115 ; HASWELL-NO-FMA-LABEL: v8f32_no_step: 1116 ; HASWELL-NO-FMA: # %bb.0: 1117 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1118 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1119 ; 1120 ; KNL-LABEL: v8f32_no_step: 1121 ; KNL: # %bb.0: 1122 ; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1123 ; KNL-NEXT: retq # sched: [7:1.00] 1124 ; 1125 ; SKX-LABEL: v8f32_no_step: 1126 ; SKX: # %bb.0: 1127 ; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00] 1128 ; SKX-NEXT: retq # sched: [7:1.00] 1129 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1130 ret <8 x float> %div 1131 } 1132 1133 define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { 1134 ; SSE-LABEL: v8f32_no_step2: 1135 ; SSE: # %bb.0: 1136 ; SSE-NEXT: rcpps %xmm1, %xmm1 1137 ; SSE-NEXT: rcpps %xmm0, %xmm0 1138 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 1139 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 1140 ; SSE-NEXT: retq 1141 ; 1142 ; AVX-RECIP-LABEL: v8f32_no_step2: 1143 ; AVX-RECIP: # %bb.0: 1144 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 1145 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1146 ; AVX-RECIP-NEXT: retq 1147 ; 1148 ; FMA-RECIP-LABEL: v8f32_no_step2: 1149 ; FMA-RECIP: # %bb.0: 1150 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 1151 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1152 ; 
FMA-RECIP-NEXT: retq 1153 ; 1154 ; BTVER2-LABEL: v8f32_no_step2: 1155 ; BTVER2: # %bb.0: 1156 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] 1157 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 1158 ; BTVER2-NEXT: retq # sched: [4:1.00] 1159 ; 1160 ; SANDY-LABEL: v8f32_no_step2: 1161 ; SANDY: # %bb.0: 1162 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] 1163 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 1164 ; SANDY-NEXT: retq # sched: [1:1.00] 1165 ; 1166 ; HASWELL-LABEL: v8f32_no_step2: 1167 ; HASWELL: # %bb.0: 1168 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1169 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1170 ; HASWELL-NEXT: retq # sched: [7:1.00] 1171 ; 1172 ; HASWELL-NO-FMA-LABEL: v8f32_no_step2: 1173 ; HASWELL-NO-FMA: # %bb.0: 1174 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1175 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1176 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1177 ; 1178 ; KNL-LABEL: v8f32_no_step2: 1179 ; KNL: # %bb.0: 1180 ; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1181 ; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1182 ; KNL-NEXT: retq # sched: [7:1.00] 1183 ; 1184 ; SKX-LABEL: v8f32_no_step2: 1185 ; SKX: # %bb.0: 1186 ; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00] 1187 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50] 1188 ; SKX-NEXT: retq # sched: [7:1.00] 1189 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 1190 ret <8 x float> %div 1191 } 1192 1193 define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 { 1194 ; SSE-LABEL: v16f32_one_step2: 1195 ; SSE: # %bb.0: 1196 ; SSE-NEXT: movaps %xmm3, %xmm4 1197 ; SSE-NEXT: movaps %xmm2, %xmm5 1198 ; SSE-NEXT: movaps %xmm0, %xmm6 1199 ; SSE-NEXT: rcpps %xmm3, %xmm2 1200 ; SSE-NEXT: mulps %xmm2, %xmm4 1201 ; SSE-NEXT: movaps {{.*#+}} xmm0 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1202 ; SSE-NEXT: movaps %xmm0, %xmm3 1203 ; SSE-NEXT: subps %xmm4, %xmm3 1204 ; SSE-NEXT: mulps %xmm2, %xmm3 1205 ; SSE-NEXT: addps %xmm2, %xmm3 1206 ; SSE-NEXT: rcpps %xmm5, %xmm4 1207 ; SSE-NEXT: mulps %xmm4, %xmm5 1208 ; SSE-NEXT: movaps %xmm0, %xmm2 1209 ; SSE-NEXT: subps %xmm5, %xmm2 1210 ; SSE-NEXT: mulps %xmm4, %xmm2 1211 ; SSE-NEXT: addps %xmm4, %xmm2 1212 ; SSE-NEXT: rcpps %xmm1, %xmm5 1213 ; SSE-NEXT: mulps %xmm5, %xmm1 1214 ; SSE-NEXT: movaps %xmm0, %xmm4 1215 ; SSE-NEXT: subps %xmm1, %xmm4 1216 ; SSE-NEXT: mulps %xmm5, %xmm4 1217 ; SSE-NEXT: addps %xmm5, %xmm4 1218 ; SSE-NEXT: rcpps %xmm6, %xmm1 1219 ; SSE-NEXT: mulps %xmm1, %xmm6 1220 ; SSE-NEXT: subps %xmm6, %xmm0 1221 ; SSE-NEXT: mulps %xmm1, %xmm0 1222 ; SSE-NEXT: addps %xmm1, %xmm0 1223 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 1224 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm4 1225 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 1226 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 1227 ; SSE-NEXT: movaps %xmm4, %xmm1 1228 ; SSE-NEXT: retq 1229 ; 1230 ; AVX-RECIP-LABEL: v16f32_one_step2: 1231 ; AVX-RECIP: # %bb.0: 1232 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1233 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 1234 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1235 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 1236 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1237 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 1238 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1239 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 1240 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 1241 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1242 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 1243 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1244 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 1245 ; AVX-RECIP-NEXT: retq 1246 ; 1247 ; FMA-RECIP-LABEL: v16f32_one_step2: 1248 ; FMA-RECIP: # %bb.0: 1249 ; FMA-RECIP-NEXT: 
vrcpps %ymm1, %ymm2 1250 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1251 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 1252 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 1253 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1254 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 1255 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 1256 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1257 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 1258 ; FMA-RECIP-NEXT: retq 1259 ; 1260 ; BTVER2-LABEL: v16f32_one_step2: 1261 ; BTVER2: # %bb.0: 1262 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 1263 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [2:2.00] 1264 ; BTVER2-NEXT: vrcpps %ymm0, %ymm4 # sched: [2:2.00] 1265 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00] 1266 ; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0 # sched: [2:2.00] 1267 ; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00] 1268 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] 1269 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00] 1270 ; BTVER2-NEXT: vmulps %ymm0, %ymm4, %ymm0 # sched: [2:2.00] 1271 ; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00] 1272 ; BTVER2-NEXT: vaddps %ymm0, %ymm4, %ymm0 # sched: [3:2.00] 1273 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 1274 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00] 1275 ; BTVER2-NEXT: retq # sched: [4:1.00] 1276 ; 1277 ; SANDY-LABEL: v16f32_one_step2: 1278 ; SANDY: # %bb.0: 1279 ; SANDY-NEXT: vrcpps %ymm1, %ymm2 # sched: [7:2.00] 1280 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00] 1281 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 1282 ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00] 1283 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00] 1284 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00] 1285 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00] 1286 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00] 1287 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1288 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00] 1289 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1290 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 1291 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00] 1292 ; SANDY-NEXT: retq # sched: [1:1.00] 1293 ; 1294 ; HASWELL-LABEL: v16f32_one_step2: 1295 ; HASWELL: # %bb.0: 1296 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00] 1297 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1298 ; HASWELL-NEXT: vrcpps %ymm0, %ymm4 # sched: [11:2.00] 1299 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50] 1300 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50] 1301 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50] 1302 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50] 1303 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1304 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50] 1305 ; HASWELL-NEXT: retq # sched: [7:1.00] 1306 ; 1307 ; HASWELL-NO-FMA-LABEL: v16f32_one_step2: 1308 ; HASWELL-NO-FMA: # %bb.0: 1309 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00] 1310 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50] 1311 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1312 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, 
%ymm1 # sched: [3:1.00] 1313 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50] 1314 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00] 1315 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00] 1316 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50] 1317 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1318 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50] 1319 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1320 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1321 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50] 1322 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1323 ; 1324 ; KNL-LABEL: v16f32_one_step2: 1325 ; KNL: # %bb.0: 1326 ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00] 1327 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50] 1328 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50] 1329 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50] 1330 ; KNL-NEXT: retq # sched: [7:1.00] 1331 ; 1332 ; SKX-LABEL: v16f32_one_step2: 1333 ; SKX: # %bb.0: 1334 ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00] 1335 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50] 1336 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50] 1337 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] 1338 ; SKX-NEXT: retq # sched: [7:1.00] 1339 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x 1340 ret <16 x float> %div 1341 } 1342 1343 define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { 1344 ; SSE-LABEL: v16f32_one_step_2_divs: 1345 ; SSE: # %bb.0: 1346 ; SSE-NEXT: rcpps %xmm0, %xmm6 1347 ; SSE-NEXT: mulps %xmm6, 
%xmm0 1348 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1349 ; SSE-NEXT: movaps %xmm4, %xmm5 1350 ; SSE-NEXT: subps %xmm0, %xmm5 1351 ; SSE-NEXT: mulps %xmm6, %xmm5 1352 ; SSE-NEXT: addps %xmm6, %xmm5 1353 ; SSE-NEXT: rcpps %xmm1, %xmm0 1354 ; SSE-NEXT: mulps %xmm0, %xmm1 1355 ; SSE-NEXT: movaps %xmm4, %xmm6 1356 ; SSE-NEXT: subps %xmm1, %xmm6 1357 ; SSE-NEXT: mulps %xmm0, %xmm6 1358 ; SSE-NEXT: addps %xmm0, %xmm6 1359 ; SSE-NEXT: rcpps %xmm2, %xmm0 1360 ; SSE-NEXT: mulps %xmm0, %xmm2 1361 ; SSE-NEXT: movaps %xmm4, %xmm7 1362 ; SSE-NEXT: subps %xmm2, %xmm7 1363 ; SSE-NEXT: mulps %xmm0, %xmm7 1364 ; SSE-NEXT: addps %xmm0, %xmm7 1365 ; SSE-NEXT: rcpps %xmm3, %xmm0 1366 ; SSE-NEXT: mulps %xmm0, %xmm3 1367 ; SSE-NEXT: subps %xmm3, %xmm4 1368 ; SSE-NEXT: mulps %xmm0, %xmm4 1369 ; SSE-NEXT: addps %xmm0, %xmm4 1370 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01] 1371 ; SSE-NEXT: mulps %xmm4, %xmm3 1372 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01] 1373 ; SSE-NEXT: mulps %xmm7, %xmm2 1374 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] 1375 ; SSE-NEXT: mulps %xmm6, %xmm1 1376 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] 1377 ; SSE-NEXT: mulps %xmm5, %xmm0 1378 ; SSE-NEXT: mulps %xmm5, %xmm0 1379 ; SSE-NEXT: mulps %xmm6, %xmm1 1380 ; SSE-NEXT: mulps %xmm7, %xmm2 1381 ; SSE-NEXT: mulps %xmm4, %xmm3 1382 ; SSE-NEXT: retq 1383 ; 1384 ; AVX-RECIP-LABEL: v16f32_one_step_2_divs: 1385 ; AVX-RECIP: # %bb.0: 1386 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1387 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 1388 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1389 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 1390 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1391 ; AVX-RECIP-NEXT: 
vaddps %ymm0, %ymm2, %ymm0 1392 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1393 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 1394 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 1395 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1396 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 1397 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 1398 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 1399 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0 1400 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1401 ; AVX-RECIP-NEXT: retq 1402 ; 1403 ; FMA-RECIP-LABEL: v16f32_one_step_2_divs: 1404 ; FMA-RECIP: # %bb.0: 1405 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1406 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1407 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 1408 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 1409 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 1410 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 1411 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 1412 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 1413 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 1414 ; FMA-RECIP-NEXT: vmulps %ymm0, %ymm3, %ymm0 1415 ; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1416 ; FMA-RECIP-NEXT: retq 1417 ; 1418 ; BTVER2-LABEL: v16f32_one_step_2_divs: 1419 ; BTVER2: # %bb.0: 1420 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 1421 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 # sched: [2:2.00] 1422 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00] 1423 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] 1424 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00] 1425 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 1426 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [2:2.00] 1427 ; 
BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00] 1428 ; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00] 1429 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [7:2.00] 1430 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00] 1431 ; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00] 1432 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [7:2.00] 1433 ; BTVER2-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [2:2.00] 1434 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00] 1435 ; BTVER2-NEXT: retq # sched: [4:1.00] 1436 ; 1437 ; SANDY-LABEL: v16f32_one_step_2_divs: 1438 ; SANDY: # %bb.0: 1439 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00] 1440 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00] 1441 ; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 1442 ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1443 ; SANDY-NEXT: vrcpps %ymm1, %ymm4 # sched: [7:2.00] 1444 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00] 1445 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1446 ; SANDY-NEXT: vmulps %ymm4, %ymm1, %ymm1 # sched: [5:1.00] 1447 ; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00] 1448 ; SANDY-NEXT: vmulps %ymm1, %ymm4, %ymm1 # sched: [5:1.00] 1449 ; SANDY-NEXT: vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00] 1450 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:1.00] 1451 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:1.00] 1452 ; SANDY-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [5:1.00] 1453 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00] 1454 ; SANDY-NEXT: retq # sched: [1:1.00] 1455 ; 1456 ; HASWELL-LABEL: v16f32_one_step_2_divs: 1457 ; HASWELL: # %bb.0: 1458 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00] 1459 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1460 ; HASWELL-NEXT: 
vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50] 1461 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50] 1462 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00] 1463 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50] 1464 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50] 1465 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50] 1466 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50] 1467 ; HASWELL-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50] 1468 ; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50] 1469 ; HASWELL-NEXT: retq # sched: [7:1.00] 1470 ; 1471 ; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs: 1472 ; HASWELL-NO-FMA: # %bb.0: 1473 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00] 1474 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50] 1475 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1476 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1477 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm4 # sched: [11:2.00] 1478 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50] 1479 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1480 ; HASWELL-NO-FMA-NEXT: vmulps %ymm4, %ymm1, %ymm1 # sched: [5:0.50] 1481 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00] 1482 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm4, %ymm1 # sched: [5:0.50] 1483 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00] 1484 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50] 1485 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50] 1486 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50] 1487 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50] 1488 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1489 ; 1490 ; KNL-LABEL: 
v16f32_one_step_2_divs: 1491 ; KNL: # %bb.0: 1492 ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00] 1493 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50] 1494 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50] 1495 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50] 1496 ; KNL-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50] 1497 ; KNL-NEXT: retq # sched: [7:1.00] 1498 ; 1499 ; SKX-LABEL: v16f32_one_step_2_divs: 1500 ; SKX: # %bb.0: 1501 ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00] 1502 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50] 1503 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50] 1504 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50] 1505 ; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.50] 1506 ; SKX-NEXT: retq # sched: [7:1.00] 1507 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x 1508 %div2 = fdiv fast <16 x float> %div, %x 1509 ret <16 x float> %div2 1510 } 1511 1512 define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { 1513 ; SSE-LABEL: v16f32_two_step2: 1514 ; SSE: # %bb.0: 1515 ; SSE-NEXT: movaps %xmm3, %xmm6 1516 ; SSE-NEXT: movaps %xmm2, %xmm5 1517 ; SSE-NEXT: movaps %xmm0, %xmm4 1518 ; SSE-NEXT: rcpps %xmm3, %xmm2 1519 ; SSE-NEXT: mulps %xmm2, %xmm3 1520 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1521 ; SSE-NEXT: movaps %xmm0, %xmm7 1522 ; SSE-NEXT: subps %xmm3, %xmm7 1523 ; SSE-NEXT: mulps %xmm2, %xmm7 1524 ; SSE-NEXT: addps %xmm2, %xmm7 1525 ; SSE-NEXT: mulps %xmm7, %xmm6 1526 ; SSE-NEXT: movaps %xmm0, %xmm3 1527 ; SSE-NEXT: subps %xmm6, %xmm3 1528 ; SSE-NEXT: mulps %xmm7, %xmm3 1529 ; SSE-NEXT: addps %xmm7, %xmm3 1530 ; SSE-NEXT: rcpps %xmm5, %xmm2 1531 ; SSE-NEXT: movaps 
%xmm5, %xmm6 1532 ; SSE-NEXT: mulps %xmm2, %xmm6 1533 ; SSE-NEXT: movaps %xmm0, %xmm7 1534 ; SSE-NEXT: subps %xmm6, %xmm7 1535 ; SSE-NEXT: mulps %xmm2, %xmm7 1536 ; SSE-NEXT: addps %xmm2, %xmm7 1537 ; SSE-NEXT: mulps %xmm7, %xmm5 1538 ; SSE-NEXT: movaps %xmm0, %xmm2 1539 ; SSE-NEXT: subps %xmm5, %xmm2 1540 ; SSE-NEXT: mulps %xmm7, %xmm2 1541 ; SSE-NEXT: addps %xmm7, %xmm2 1542 ; SSE-NEXT: rcpps %xmm1, %xmm5 1543 ; SSE-NEXT: movaps %xmm1, %xmm6 1544 ; SSE-NEXT: mulps %xmm5, %xmm6 1545 ; SSE-NEXT: movaps %xmm0, %xmm7 1546 ; SSE-NEXT: subps %xmm6, %xmm7 1547 ; SSE-NEXT: mulps %xmm5, %xmm7 1548 ; SSE-NEXT: addps %xmm5, %xmm7 1549 ; SSE-NEXT: mulps %xmm7, %xmm1 1550 ; SSE-NEXT: movaps %xmm0, %xmm5 1551 ; SSE-NEXT: subps %xmm1, %xmm5 1552 ; SSE-NEXT: mulps %xmm7, %xmm5 1553 ; SSE-NEXT: addps %xmm7, %xmm5 1554 ; SSE-NEXT: rcpps %xmm4, %xmm1 1555 ; SSE-NEXT: movaps %xmm4, %xmm6 1556 ; SSE-NEXT: mulps %xmm1, %xmm6 1557 ; SSE-NEXT: movaps %xmm0, %xmm7 1558 ; SSE-NEXT: subps %xmm6, %xmm7 1559 ; SSE-NEXT: mulps %xmm1, %xmm7 1560 ; SSE-NEXT: addps %xmm1, %xmm7 1561 ; SSE-NEXT: mulps %xmm7, %xmm4 1562 ; SSE-NEXT: subps %xmm4, %xmm0 1563 ; SSE-NEXT: mulps %xmm7, %xmm0 1564 ; SSE-NEXT: addps %xmm7, %xmm0 1565 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 1566 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm5 1567 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 1568 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 1569 ; SSE-NEXT: movaps %xmm5, %xmm1 1570 ; SSE-NEXT: retq 1571 ; 1572 ; AVX-RECIP-LABEL: v16f32_two_step2: 1573 ; AVX-RECIP: # %bb.0: 1574 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1575 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3 1576 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1577 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 1578 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 1579 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 1580 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 1581 ; AVX-RECIP-NEXT: vsubps %ymm1, 
%ymm4, %ymm1 1582 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1583 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 1584 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1585 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 1586 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 1587 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 1588 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 1589 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 1590 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0 1591 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1592 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 1593 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1594 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 1595 ; AVX-RECIP-NEXT: retq 1596 ; 1597 ; FMA-RECIP-LABEL: v16f32_two_step2: 1598 ; FMA-RECIP: # %bb.0: 1599 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 1600 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1601 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 1602 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 1603 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 1604 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 1605 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 1606 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1607 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 1608 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 1609 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 1610 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 1611 ; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 1612 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1613 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 1614 ; FMA-RECIP-NEXT: retq 1615 ; 1616 ; BTVER2-LABEL: v16f32_two_step2: 1617 ; BTVER2: # %bb.0: 1618 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 1619 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [2:2.00] 1620 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00] 1621 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00] 1622 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00] 1623 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00] 1624 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00] 1625 ; BTVER2-NEXT: vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00] 1626 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00] 1627 ; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00] 1628 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 # sched: [2:2.00] 1629 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00] 1630 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00] 1631 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00] 1632 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00] 1633 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00] 1634 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00] 1635 ; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00] 1636 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00] 1637 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 1638 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 1639 ; BTVER2-NEXT: retq # sched: [4:1.00] 1640 ; 1641 ; SANDY-LABEL: v16f32_two_step2: 1642 ; SANDY: # %bb.0: 1643 ; SANDY-NEXT: vrcpps %ymm1, %ymm2 # sched: [7:2.00] 1644 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00] 1645 ; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 1646 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00] 1647 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00] 1648 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 # 
sched: [3:1.00] 1649 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00] 1650 ; SANDY-NEXT: vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00] 1651 ; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00] 1652 ; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00] 1653 ; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00] 1654 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00] 1655 ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00] 1656 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00] 1657 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00] 1658 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00] 1659 ; SANDY-NEXT: vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00] 1660 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00] 1661 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1662 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 1663 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00] 1664 ; SANDY-NEXT: retq # sched: [1:1.00] 1665 ; 1666 ; HASWELL-LABEL: v16f32_two_step2: 1667 ; HASWELL: # %bb.0: 1668 ; HASWELL-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00] 1669 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1670 ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 # sched: [1:1.00] 1671 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50] 1672 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50] 1673 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50] 1674 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50] 1675 ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00] 1676 ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 # sched: [1:1.00] 1677 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50] 1678 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50] 1679 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = 
-(ymm4 * ymm0) + ymm3 sched: [5:0.50] 1680 ; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50] 1681 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1682 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50] 1683 ; HASWELL-NEXT: retq # sched: [7:1.00] 1684 ; 1685 ; HASWELL-NO-FMA-LABEL: v16f32_two_step2: 1686 ; HASWELL-NO-FMA: # %bb.0: 1687 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00] 1688 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm3 # sched: [5:0.50] 1689 ; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1690 ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00] 1691 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50] 1692 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00] 1693 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50] 1694 ; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00] 1695 ; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50] 1696 ; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00] 1697 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00] 1698 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm3 # sched: [5:0.50] 1699 ; HASWELL-NO-FMA-NEXT: vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00] 1700 ; HASWELL-NO-FMA-NEXT: vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50] 1701 ; HASWELL-NO-FMA-NEXT: vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00] 1702 ; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50] 1703 ; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00] 1704 ; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50] 1705 ; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1706 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1707 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50] 1708 ; HASWELL-NO-FMA-NEXT: retq # sched: 
[7:1.00] 1709 ; 1710 ; KNL-LABEL: v16f32_two_step2: 1711 ; KNL: # %bb.0: 1712 ; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00] 1713 ; KNL-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00] 1714 ; KNL-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:1.00] 1715 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50] 1716 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50] 1717 ; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50] 1718 ; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50] 1719 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50] 1720 ; KNL-NEXT: retq # sched: [7:1.00] 1721 ; 1722 ; SKX-LABEL: v16f32_two_step2: 1723 ; SKX: # %bb.0: 1724 ; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00] 1725 ; SKX-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50] 1726 ; SKX-NEXT: vmovaps %zmm1, %zmm3 # sched: [1:0.33] 1727 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50] 1728 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50] 1729 ; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50] 1730 ; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50] 1731 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] 1732 ; SKX-NEXT: retq # sched: [7:1.00] 1733 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x 1734 ret <16 x float> %div 1735 } 1736 1737 define <16 x float> @v16f32_no_step(<16 x float> %x) #3 { 1738 ; SSE-LABEL: v16f32_no_step: 1739 ; SSE: # %bb.0: 1740 ; SSE-NEXT: rcpps %xmm0, %xmm0 1741 ; SSE-NEXT: rcpps %xmm1, %xmm1 1742 ; SSE-NEXT: rcpps %xmm2, %xmm2 1743 ; SSE-NEXT: rcpps %xmm3, %xmm3 1744 ; SSE-NEXT: retq 1745 ; 1746 
; AVX-RECIP-LABEL: v16f32_no_step: 1747 ; AVX-RECIP: # %bb.0: 1748 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 1749 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1 1750 ; AVX-RECIP-NEXT: retq 1751 ; 1752 ; FMA-RECIP-LABEL: v16f32_no_step: 1753 ; FMA-RECIP: # %bb.0: 1754 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 1755 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1 1756 ; FMA-RECIP-NEXT: retq 1757 ; 1758 ; BTVER2-LABEL: v16f32_no_step: 1759 ; BTVER2: # %bb.0: 1760 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] 1761 ; BTVER2-NEXT: vrcpps %ymm1, %ymm1 # sched: [2:2.00] 1762 ; BTVER2-NEXT: retq # sched: [4:1.00] 1763 ; 1764 ; SANDY-LABEL: v16f32_no_step: 1765 ; SANDY: # %bb.0: 1766 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] 1767 ; SANDY-NEXT: vrcpps %ymm1, %ymm1 # sched: [7:2.00] 1768 ; SANDY-NEXT: retq # sched: [1:1.00] 1769 ; 1770 ; HASWELL-LABEL: v16f32_no_step: 1771 ; HASWELL: # %bb.0: 1772 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1773 ; HASWELL-NEXT: vrcpps %ymm1, %ymm1 # sched: [11:2.00] 1774 ; HASWELL-NEXT: retq # sched: [7:1.00] 1775 ; 1776 ; HASWELL-NO-FMA-LABEL: v16f32_no_step: 1777 ; HASWELL-NO-FMA: # %bb.0: 1778 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1779 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1 # sched: [11:2.00] 1780 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1781 ; 1782 ; KNL-LABEL: v16f32_no_step: 1783 ; KNL: # %bb.0: 1784 ; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [11:2.00] 1785 ; KNL-NEXT: retq # sched: [7:1.00] 1786 ; 1787 ; SKX-LABEL: v16f32_no_step: 1788 ; SKX: # %bb.0: 1789 ; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [4:2.00] 1790 ; SKX-NEXT: retq # sched: [7:1.00] 1791 %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1792 ret <16 x float> %div 1793 } 1794 1795 define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 { 1796 ; SSE-LABEL: v16f32_no_step2: 1797 ; 
SSE: # %bb.0: 1798 ; SSE-NEXT: rcpps %xmm3, %xmm3 1799 ; SSE-NEXT: rcpps %xmm2, %xmm2 1800 ; SSE-NEXT: rcpps %xmm1, %xmm1 1801 ; SSE-NEXT: rcpps %xmm0, %xmm0 1802 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 1803 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 1804 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 1805 ; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 1806 ; SSE-NEXT: retq 1807 ; 1808 ; AVX-RECIP-LABEL: v16f32_no_step2: 1809 ; AVX-RECIP: # %bb.0: 1810 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm1 1811 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 1812 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1813 ; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 1814 ; AVX-RECIP-NEXT: retq 1815 ; 1816 ; FMA-RECIP-LABEL: v16f32_no_step2: 1817 ; FMA-RECIP: # %bb.0: 1818 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm1 1819 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 1820 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1821 ; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 1822 ; FMA-RECIP-NEXT: retq 1823 ; 1824 ; BTVER2-LABEL: v16f32_no_step2: 1825 ; BTVER2: # %bb.0: 1826 ; BTVER2-NEXT: vrcpps %ymm1, %ymm1 # sched: [2:2.00] 1827 ; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] 1828 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 1829 ; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00] 1830 ; BTVER2-NEXT: retq # sched: [4:1.00] 1831 ; 1832 ; SANDY-LABEL: v16f32_no_step2: 1833 ; SANDY: # %bb.0: 1834 ; SANDY-NEXT: vrcpps %ymm1, %ymm1 # sched: [7:2.00] 1835 ; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] 1836 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 1837 ; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00] 1838 ; SANDY-NEXT: retq # sched: [1:1.00] 1839 ; 1840 ; HASWELL-LABEL: v16f32_no_step2: 1841 ; HASWELL: # %bb.0: 1842 ; HASWELL-NEXT: vrcpps %ymm1, %ymm1 # sched: [11:2.00] 1843 ; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1844 ; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1845 ; HASWELL-NEXT: vmulps 
{{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50] 1846 ; HASWELL-NEXT: retq # sched: [7:1.00] 1847 ; 1848 ; HASWELL-NO-FMA-LABEL: v16f32_no_step2: 1849 ; HASWELL-NO-FMA: # %bb.0: 1850 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm1 # sched: [11:2.00] 1851 ; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1852 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1853 ; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50] 1854 ; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1855 ; 1856 ; KNL-LABEL: v16f32_no_step2: 1857 ; KNL: # %bb.0: 1858 ; KNL-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [11:2.00] 1859 ; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50] 1860 ; KNL-NEXT: retq # sched: [7:1.00] 1861 ; 1862 ; SKX-LABEL: v16f32_no_step2: 1863 ; SKX: # %bb.0: 1864 ; SKX-NEXT: vrcp14ps %zmm0, %zmm0 # sched: [4:2.00] 1865 ; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] 1866 ; SKX-NEXT: retq # sched: [7:1.00] 1867 %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x 1868 ret <16 x float> %div 1869 } 1870 1871 attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" } 1872 attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } 1873 attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } 1874 attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" } 1875 1876