//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the itinerary class data for the POWER7 processor.
//
//===----------------------------------------------------------------------===//

// Primary reference:
//   IBM POWER7 multicore server processor
//   B. Sinharoy, et al.
//   IBM J. Res. & Dev. (55) 3. May/June 2011.

// Two kinds of resources are tracked when scheduling for the P7:
//   1. The dispatch bundle slots
//   2. The functional unit resources

//===----------------------------------------------------------------------===//
// Dispatch bundle slots (one FuncUnit per slot).
//===----------------------------------------------------------------------===//

def P7_DU1 : FuncUnit;
def P7_DU2 : FuncUnit;
def P7_DU3 : FuncUnit;
def P7_DU4 : FuncUnit;
def P7_DU5 : FuncUnit;
def P7_DU6 : FuncUnit;

//===----------------------------------------------------------------------===//
// Functional units.
//===----------------------------------------------------------------------===//

// Load/Store pipelines 1 and 2.
def P7_LS1 : FuncUnit;
def P7_LS2 : FuncUnit;

// Fixed-point (FX) pipelines 1 and 2.
def P7_FX1 : FuncUnit;
def P7_FX2 : FuncUnit;

// VS pipeline 1: vector integer ops. are always executed here.
def P7_VS1 : FuncUnit;
// VS pipeline 2: 128-bit stores and permutes are executed here.
def P7_VS2 : FuncUnit;

// CR unit: CR logicals and move-from-SPRs.
def P7_CRU : FuncUnit;
// Branch unit.
def P7_BRU : FuncUnit;

// Notes:
//
//  * Each LSU pipeline can also execute FX add and logical instructions.
//  * Each LSU pipeline can complete a load or store in one cycle.
//
//  * Each store is broken into two parts: the AGEN goes to the LSU while a
//    "data steering" op. goes to the FXU or VSU.
//
//  * FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
//  * VSU loads have a three cycle load-to-use latency (so two "bubble"
//    cycles).
//
//  * Frequent FX ops. take only one cycle and results can be used again in
//    the next cycle (there is a self-bypass). Getting results from the other
//    FX pipeline takes an additional cycle.
//
//  * The VSU XS is similar to the POWER6, but with a pipeline length of 2
//    cycles (instead of 3 cycles on the POWER6). VSU XS handles vector
//    FX-style ops. Dispatch of an instruction to VS1 that uses four single
//    prec. inputs (either to a float or XC op.) prevents dispatch in that
//    cycle to VS2 of any floating point instruction.
//
//  * The VSU PM is similar to the POWER6, but with a pipeline length of 3
//    cycles (instead of 4 cycles on the POWER6). vsel is handled by the PM
//    pipeline (unlike on the POWER6).
//
//  * FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
//    share the same write-back, and have a 5-cycle latency difference, so the
//    IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
//    op. has been dispatched to VS1.
//
//  * Three cycles after an L1 cache hit, a dependent VSU instruction can
//    issue.
//
//  * Instruction dispatch groups have (at most) four non-branch instructions,
//    and two branches. Unlike on the POWER4/5, a branch does not
//    automatically end the dispatch group, but a second branch must be the
//    last in the group.

// Itinerary data for the POWER7.
//
// Each InstrItinData entry binds one itinerary class (IIC_*) to:
//   - a list of InstrStage<cycles, [units], timeinc> records. Every entry
//     begins with one or more dispatch-slot stages (P7_DU*) followed by the
//     functional-unit stage(s). Per the InstrStage definition in
//     TargetItinerary.td, an explicit timeinc of 0 makes the following stage
//     start in the same cycle, which is how the dispatch slots are reserved
//     together with the first functional unit.
//   - a list of operand cycles, indexed by operand number; per
//     TargetItinerary.td these give the cycle after issue at which each
//     operand is defined or read.
//
// Entries that name a single P7_DUn per stage (DU1, DU2, ...) reserve each of
// those slots, whereas [P7_DU1, P7_DU2, P7_DU3, P7_DU4] in one stage reserves
// any one of them.
def P7Itineraries : ProcessorItineraries<
  [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
   P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [
  //===--------------------------------------------------------------------===//
  // Fixed-point operations.
  //===--------------------------------------------------------------------===//
  InstrItinData<IIC_IntSimple   , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2,
                                                  P7_LS1, P7_LS2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_IntISEL     , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2], 0>,
                                   InstrStage<1, [P7_BRU]>],
                                  [1, 1, 1, 1]>,
  InstrItinData<IIC_IntCompare  , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1, 1]>,
  // FIXME: Add record-form itinerary data.
  InstrItinData<IIC_IntDivW     , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<36, [P7_FX1, P7_FX2]>],
                                  [36, 1, 1]>,
  InstrItinData<IIC_IntDivD     , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<68, [P7_FX1, P7_FX2]>],
                                  [68, 1, 1]>,
  InstrItinData<IIC_IntMulHW    , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [4, 1, 1]>,
  InstrItinData<IIC_IntMulHWU   , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [4, 1, 1]>,
  InstrItinData<IIC_IntMulLI    , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [4, 1, 1]>,
  InstrItinData<IIC_IntRotate   , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_IntRotateD  , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_IntShift    , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_IntTrapW    , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1]>,
  InstrItinData<IIC_IntTrapD    , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1]>,

  //===--------------------------------------------------------------------===//
  // Branch and CR operations.
  //
  // IIC_BrMCRX (mtcr) has exactly one entry in this list; it is placed with
  // the CR/SPR moves further down.
  //===--------------------------------------------------------------------===//
  InstrItinData<IIC_BrB         , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
                                   InstrStage<1, [P7_BRU]>],
                                  [3, 1, 1]>,
  InstrItinData<IIC_BrCR        , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_CRU]>],
                                  [3, 1, 1]>,
  InstrItinData<IIC_BrMCR       , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
                                   InstrStage<1, [P7_BRU]>],
                                  [3, 1, 1]>,

  //===--------------------------------------------------------------------===//
  // Loads.
  //===--------------------------------------------------------------------===//
  InstrItinData<IIC_LdStLoad    , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [2, 1, 1]>,
  InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [2, 2, 1, 1]>,
  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLD      , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [2, 1, 1]>,
  InstrItinData<IIC_LdStLDU     , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [2, 2, 1, 1]>,
  InstrItinData<IIC_LdStLDUX    , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLFD     , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [3, 1, 1]>,
  InstrItinData<IIC_LdStLVecX   , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [3, 1, 1]>,
  InstrItinData<IIC_LdStLFDU    , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLFDUX   , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLHA     , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [3, 1, 1]>,
  InstrItinData<IIC_LdStLHAU    , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [4, 4, 1, 1]>,
  InstrItinData<IIC_LdStLHAUX   , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [4, 4, 1, 1]>,
  InstrItinData<IIC_LdStLWA     , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [3, 1, 1]>,
  InstrItinData<IIC_LdStLWARX   , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [3, 1, 1]>,
  InstrItinData<IIC_LdStLDARX   , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [3, 1, 1]>,
  InstrItinData<IIC_LdStLMW     , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [2, 1, 1]>,

  //===--------------------------------------------------------------------===//
  // Stores (an LS stage plus an FX or VS stage in every entry except the
  // store-conditional forms).
  //===--------------------------------------------------------------------===//
  InstrItinData<IIC_LdStStore   , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_LdStSTD     , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_LdStSTDU    , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTDUX   , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTFD    , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_LdStSTFDU   , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_FX1, P7_FX2], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2], 0>,
                                   InstrStage<1, [P7_VS2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_LdStSTDCX   , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [1, 1, 1]>,
  InstrItinData<IIC_LdStSTWCX   , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_LS1, P7_LS2]>],
                                  [1, 1, 1]>,

  //===--------------------------------------------------------------------===//
  // CR and SPR moves.
  //===--------------------------------------------------------------------===//
  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_DU2], 0>,
                                   InstrStage<1, [P7_DU3], 0>,
                                   InstrStage<1, [P7_DU4], 0>,
                                   InstrStage<1, [P7_CRU]>,
                                   InstrStage<1, [P7_FX1, P7_FX2]>],
                                  [3, 1]>, // mtcr
  InstrItinData<IIC_SprMFCR     , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_CRU]>],
                                  [6, 1]>,
  InstrItinData<IIC_SprMFCRF    , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_CRU]>],
                                  [3, 1]>,
  InstrItinData<IIC_SprMTSPR    , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_FX1]>],
                                  [4, 1]>, // mtctr

  //===--------------------------------------------------------------------===//
  // Scalar floating point (either VS pipeline).
  //===--------------------------------------------------------------------===//
  InstrItinData<IIC_FPGeneral   , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [5, 1, 1]>,
  InstrItinData<IIC_FPAddSub    , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [5, 1, 1]>,
  InstrItinData<IIC_FPCompare   , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [8, 1, 1]>,
  InstrItinData<IIC_FPDivD      , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [33, 1, 1]>,
  InstrItinData<IIC_FPDivS      , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [27, 1, 1]>,
  InstrItinData<IIC_FPSqrtD     , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [44, 1, 1]>,
  InstrItinData<IIC_FPSqrtS     , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [32, 1, 1]>,
  InstrItinData<IIC_FPFused     , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [5, 1, 1, 1]>,
  InstrItinData<IIC_FPRes       , [InstrStage<1, [P7_DU1, P7_DU2,
                                                  P7_DU3, P7_DU4], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [5, 1, 1]>,

  //===--------------------------------------------------------------------===//
  // Vector operations.
  //===--------------------------------------------------------------------===//
  InstrItinData<IIC_VecGeneral  , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_VS1]>],
                                  [2, 1, 1]>,
  InstrItinData<IIC_VecVSL      , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_VS1]>],
                                  [2, 1, 1]>,
  InstrItinData<IIC_VecVSR      , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_VS1]>],
                                  [2, 1, 1]>,
  InstrItinData<IIC_VecFP       , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [6, 1, 1]>,
  InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [6, 1, 1]>,
  InstrItinData<IIC_VecFPRound  , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_VS1, P7_VS2]>],
                                  [6, 1, 1]>,
  InstrItinData<IIC_VecComplex  , [InstrStage<1, [P7_DU1], 0>,
                                   InstrStage<1, [P7_VS1]>],
                                  [7, 1, 1]>,
  InstrItinData<IIC_VecPerm     , [InstrStage<1, [P7_DU1, P7_DU2], 0>,
                                   InstrStage<1, [P7_VS2]>],
                                  [3, 1, 1]>
]>;

// ===---------------------------------------------------------------------===//
// P7 machine model for scheduling and other instruction cost heuristics.

def P7Model : SchedMachineModel {
  let IssueWidth = 6;  // 4 (non-branch) instructions are dispatched per cycle.
                       // Note that the dispatch bundle size is 6 (including
                       // branches), but the total internal issue bandwidth per
                       // cycle (from all queues) is 8.

  let MinLatency = 0;  // Out-of-order dispatch.
  let LoadLatency = 3; // Optimistic load latency assuming bypass.
                       // This is overridden by OperandCycles if the
                       // Itineraries are queried instead.
  let MispredictPenalty = 16;

  // Try to make sure we have at least 10 dispatch groups in a loop.
  let LoopMicroOpBufferSize = 40;

  // Stage and operand-cycle data come from the itinerary list defined above.
  let Itineraries = P7Itineraries;
}