//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AArch64 Cyclone to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

// Top-level machine model record. The processor resources and the
// read/write definitions that follow attach their costs to this model.
def CycloneModel : SchedMachineModel {
  let IssueWidth        = 6;   // 6 micro-ops are dispatched per cycle.
  let MicroOpBufferSize = 192; // Based on the reorder buffer.
  let LoadLatency       = 4;   // Optimistic load latency.
  let MispredictPenalty = 16;  // 14-19 cycles are typical.
}

//===----------------------------------------------------------------------===//
// Define each kind of processor resource and number available on Cyclone.

// 4 integer pipes.
def CyUnitI : ProcResource<4> { let BufferSize = 48; }

// 2 branch units: I[0..1]
def CyUnitB : ProcResource<2> {
  let Super      = CyUnitI;
  let BufferSize = 24;
}

// 1 indirect-branch unit: I[0]
def CyUnitBR : ProcResource<1> { let Super = CyUnitB; }

// 2 shifter pipes: I[2..3]
// An instruction that consumes a CyUnitIS also consumes a CyUnitI.
def CyUnitIS : ProcResource<2> {
  let Super      = CyUnitI;
  let BufferSize = 24;
}

// 1 mul pipe: I[0]
def CyUnitIM : ProcResource<1> {
  let Super      = CyUnitBR;
  let BufferSize = 32;
}

// 1 div pipe: I[1]
def CyUnitID : ProcResource<1> {
  let Super      = CyUnitB;
  let BufferSize = 16;
}

// 1 integer division unit. This is driven by the ID pipe, but only
// consumes the pipe for one cycle at issue and another cycle at writeback.
def CyUnitIntDiv : ProcResource<1>;

// 2 ld/st pipes.
def CyUnitLS : ProcResource<2> { let BufferSize = 28; }

// 3 fp/vector pipes.
def CyUnitV : ProcResource<3> { let BufferSize = 48; }

// 2 fp/vector arithmetic and multiply pipes: V[0-1]
def CyUnitVM : ProcResource<2> {
  let Super      = CyUnitV;
  let BufferSize = 32;
}

// 1 fp/vector division/sqrt pipe: V[2]
def CyUnitVD : ProcResource<1> {
  let Super      = CyUnitV;
  let BufferSize = 16;
}

// 1 fp compare pipe: V[0]
def CyUnitVC : ProcResource<1> {
  let Super      = CyUnitVM;
  let BufferSize = 16;
}

// 2 fp division/square-root units. These are driven by the VD pipe,
// but only consume the pipe for one cycle at issue and a cycle at writeback.
def CyUnitFloatDiv : ProcResource<2>;

//===----------------------------------------------------------------------===//
// Define scheduler read/write resources and latency on Cyclone.
// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.

let SchedModel = CycloneModel in {

//---
// 7.8.1. Moves
//---

// A single nop micro-op (uX): no resources, zero latency.
def WriteX : SchedWriteRes<[]> { let Latency = 0; }

// Move zero is a register rename (to machine register zero).
// The move is replaced by a single nop micro-op.
//   MOVZ Rd, #0
//   AND Rd, Rzr, #imm
def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
def WriteImmZ : SchedWriteVariant<[
  SchedVar<WriteZPred,  [WriteX]>,
  SchedVar<NoSchedPred, [WriteImm]>
]>;
def : InstRW<[WriteImmZ], (instrs MOVZWi, MOVZXi, ANDWri, ANDXri)>;

// Move GPR is a register rename and single nop micro-op.
//   ORR Xd, XZR, Xm
//   ADD Xd, Xn, #0
def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
def WriteMov : SchedWriteVariant<[
  SchedVar<WriteIMovPred, [WriteX]>,
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteI]>
]>;
def : InstRW<[WriteMov], (instrs COPY, ORRXrr, ADDXrr)>;

// Move non-zero immediate is an integer ALU op.
//   MOVN,MOVZ,MOVK
def : WriteRes<WriteImm, [CyUnitI]>;

//---
// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
//              Shifts and Bitfield Operations
//---

// ADR,ADRP
// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
// ADC(S),SBC(S)
// Aliases: CMN, CMP, TST
//
// Conditional operations.
// CCMNi,CCMPi,CCMNr,CCMPr,
// CSEL,CSINC,CSINV,CSNEG
//
// Bit counting and reversal operations.
// CLS,CLZ,RBIT,REV,REV16,REV32
def : WriteRes<WriteI, [CyUnitI]>;

// ADD with shifted register operand is a single micro-op that
// consumes a shift pipeline for two cycles.
// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
// EXAMPLE: ADDrs Xn, Xm LSL #imm
def : WriteRes<WriteISReg, [CyUnitIS]> {
  let Latency        = 2;
  let ResourceCycles = [2];
}

// ADD with extended register operand is the same as shifted reg operand.
// ADD(S)re,SUB(S)re
// EXAMPLE: ADDXre Xn, Xm, UXTB #1
def : WriteRes<WriteIEReg, [CyUnitIS]> {
  let Latency        = 2;
  let ResourceCycles = [2];
}

// Variable shift and bitfield operations.
// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
def : WriteRes<WriteIS, [CyUnitIS]>;

// EXTR Shifts a pair of registers and requires two micro-ops.
// The second micro-op is delayed, as modeled by ReadExtrHi.
//   EXTR Xn, Xm, #imm
def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
  let Latency     = 2;
  let NumMicroOps = 2;
}

// EXTR's first register read is delayed by one cycle, effectively
// shortening its writer's latency.
//   EXTR Xn, Xm, #imm
def : ReadAdvance<ReadExtrHi, 1>;

//---
// 7.8.6. Multiplies
//---

// MUL/MNEG are aliases for MADD/MSUB.
// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
def : WriteRes<WriteIM32, [CyUnitIM]> { let Latency = 4; }

// MADDX,MSUBX,SMULH,UMULH
def : WriteRes<WriteIM64, [CyUnitIM]> { let Latency = 5; }

//---
// 7.8.7. Divide
//---

// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
// The ID pipe is consumed for 2 cycles: issue and writeback.
// SDIVW,UDIVW
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
  let Latency        = 10;
  let ResourceCycles = [2, 10];
}

// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
// The ID pipe is consumed for 2 cycles: issue and writeback.
// SDIVX,UDIVX
def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
  let Latency        = 13;
  let ResourceCycles = [2, 13];
}

//---
// 7.8.8,7.8.10. Load/Store, single element
//---

// Integer loads take 4 cycles and use one LS unit for one cycle.
def : WriteRes<WriteLD, [CyUnitLS]> { let Latency = 4; }

// Store-load forwarding is 4 cycles.
//
// Note: The store-exclusive sequence incorporates this
// latency. However, general heuristics should not model the
// dependence between a store and subsequent may-alias load because
// hardware speculation works.
def : WriteRes<WriteST, [CyUnitLS]> { let Latency = 4; }

// Load from base address plus an optionally scaled register offset.
// Rt latency is latency WriteIS + WriteLD.
// EXAMPLE: LDR Xn, Xm [, lsl 3]
def CyWriteLDIdx : SchedWriteVariant<[
  // Load from scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>,
  // Load from register offset.
  SchedVar<NoSchedPred,   [WriteLD]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;

// EXAMPLE: STR Xn, Xm [, lsl 3]
def CyWriteSTIdx : SchedWriteVariant<[
  // Store to scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>,
  // Store to register offset.
  SchedVar<NoSchedPred,   [WriteST]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;

// Read the (unshifted) base register Xn in the second micro-op one cycle later.
// EXAMPLE: LDR Xn, Xm [, lsl 3]
def ReadBaseRS : SchedReadAdvance<1>;
def CyReadAdrBase : SchedReadVariant<[
  // Read base reg after shifting offset.
  SchedVar<ScaledIdxPred, [ReadBaseRS]>,
  // Read base reg with no shift.
  SchedVar<NoSchedPred,   [ReadDefault]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<ReadAdrBase, CyReadAdrBase>;

//---
// 7.8.9,7.8.11. Load/Store, paired
//---

// Address pre/post increment is a simple ALU op with one cycle latency.
def : WriteRes<WriteAdr, [CyUnitI]>;

// LDP high register write is fused with the load, but a nop micro-op remains.
def : WriteRes<WriteLDHi, []> { let Latency = 4; }

// STP is a vector op and store, except for QQ, which is just two stores.
def : SchedAlias<WriteSTP, WriteVSTShuffle>;
def : InstRW<[WriteST, WriteST], (instrs STPQi)>;

//---
// 7.8.13. Branches
//---

// Branches take a single micro-op.
// The misprediction penalty is defined as a SchedMachineModel property.
def : WriteRes<WriteBr,    [CyUnitB]>  { let Latency = 0; }
def : WriteRes<WriteBrReg, [CyUnitBR]> { let Latency = 0; }

//---
// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
//---

// NOP,SEV,SEVL,WFE,WFI,YIELD
def : WriteRes<WriteHint, []> { let Latency = 0; }
// ISB
def : InstRW<[WriteI], (instrs ISB)>;
// CLREX,DMB,DSB
def : WriteRes<WriteBarrier, [CyUnitLS]>;

// System instructions get an invalid latency because the latency of
// other operations across them is meaningless.
def : WriteRes<WriteSys, []> { let Latency = -1; }

//===----------------------------------------------------------------------===//
// 7.9 Vector Unit Instructions

// Simple vector operations take 2 cycles.
def : WriteRes<WriteV, [CyUnitV]> { let Latency = 2; }

// Define some longer latency vector op types for Cyclone.
def CyWriteV3 : SchedWriteRes<[CyUnitV]> { let Latency = 3; }
def CyWriteV4 : SchedWriteRes<[CyUnitV]> { let Latency = 4; }
def CyWriteV5 : SchedWriteRes<[CyUnitV]> { let Latency = 5; }
def CyWriteV6 : SchedWriteRes<[CyUnitV]> { let Latency = 6; }

// Simple floating-point operations take 2 cycles.
def : WriteRes<WriteF, [CyUnitV]> { let Latency = 2; }

//---
// 7.9.1 Vector Moves
//---

// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
// generates expensive int-float conversion instead:
//   FMOVDi Dd, #0.0
//   FMOVv2f64ns Vd.2d, #0.0

// FMOVSi,FMOVDi
def : WriteRes<WriteFImm, [CyUnitV]> { let Latency = 2; }

// MOVI,MVNI are WriteV
// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV

// Move FPR is a register rename and single nop micro-op.
//   ORR.16b Vd,Vn,Vn
// COPY is handled above in the WriteMov Variant.
def WriteVMov : SchedWriteVariant<[
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteV]>
]>;
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;

// FMOVSr,FMOVDr are WriteF.

// MOV V,V is a WriteV.

// CPY D,V[x] is a WriteV

// INS V[x],V[y] is a WriteV.

// FMOVWSr,FMOVXDr,FMOVXDHighr
def : WriteRes<WriteFCopy, [CyUnitLS]> { let Latency = 5; }

// FMOVSWr,FMOVDXr
def : InstRW<[WriteLD], (instrs FMOVSWr, FMOVDXr, FMOVDXHighr)>;

// INS V[x],R
def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;

// SMOV,UMOV R,V[x]
def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv", "UMOVv")>;

// DUP V,R
def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;

// DUP V,V[x] is a WriteV.

//---
// 7.9.2 Integer Arithmetic, Logical, and Comparisons
//---

// BIC,ORR V,#imm are WriteV

def : InstRW<[CyWriteV3], (instregex "ABSv")>;

// MVN,NEG,NOT are WriteV

def : InstRW<[CyWriteV3], (instregex "SQABSv", "SQNEGv")>;

// ADDP is a WriteV.
def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> { let Latency = 2; }
def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv", "UADDLPv")>;

def : InstRW<[CyWriteV3],
             (instregex "ADDVv", "SMAXVv", "UMAXVv", "SMINVv", "UMINVv")>;

def : InstRW<[CyWriteV3], (instregex "SADDLV", "UADDLV")>;

// ADD,SUB are WriteV

// Forward declare: CyWriteVABD is referenced by CyReadVAccum before the
// absolute-difference instructions are mapped to it.
def CyWriteVABD : SchedWriteRes<[CyUnitV]> { let Latency = 3; }

// Add/Diff and accumulate uses the vector multiply unit.
def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> { let Latency = 3; }
def CyReadVAccum : SchedReadAdvance<1, [CyWriteVAccum,
                                        CyWriteVADDLP,
                                        CyWriteVABD]>;

def : InstRW<[CyWriteVAccum, CyReadVAccum], (instregex "SADALP", "UADALP")>;

def : InstRW<[CyWriteVAccum, CyReadVAccum],
             (instregex "SABAv", "UABAv", "SABALv", "UABALv")>;

def : InstRW<[CyWriteV3],
             (instregex "SQADDv", "SQSUBv", "UQADDv", "UQSUBv")>;

def : InstRW<[CyWriteV3], (instregex "SUQADDv", "USQADDv")>;

def : InstRW<[CyWriteV4],
             (instregex "ADDHNv", "RADDHNv", "RSUBHNv", "SUBHNv")>;

// WriteV includes:
// AND,BIC,CMTST,EOR,ORN,ORR
// ADDP
// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
// SADDL,SSUBL,UADDL,USUBL
// SADDW,SSUBW,UADDW,USUBW

def : InstRW<[CyWriteV3],
             (instregex "CMEQv", "CMGEv", "CMGTv",
                        "CMLEv", "CMLTv",
                        "CMHIv", "CMHSv")>;

def : InstRW<[CyWriteV3],
             (instregex "SMAXv", "SMINv", "UMAXv", "UMINv",
                        "SMAXPv", "SMINPv", "UMAXPv", "UMINPv")>;

def : InstRW<[CyWriteVABD],
             (instregex "SABDv", "UABDv",
                        "SABDLv", "UABDLv")>;

//---
// 7.9.3 Floating Point Arithmetic and Comparisons
//---

// FABS,FNEG are WriteF

def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;

def : InstRW<[CyWriteV3],
             (instregex "FMAXPv2i", "FMAXNMPv2i",
                        "FMINPv2i", "FMINNMPv2i")>;

def : InstRW<[CyWriteV4],
             (instregex "FMAXVv", "FMAXNMVv", "FMINVv", "FMINNMVv")>;

// 32-bit element add/sub/abs-diff.
def : InstRW<[CyWriteV4],
             (instrs FADDSrr, FADDv2f32, FADDv4f32,
                     FSUBSrr, FSUBv2f32, FSUBv4f32,
                     FADDPv2f32, FADDPv4f32,
                     FABD32, FABDv2f32, FABDv4f32)>;
// 64-bit element add/sub/abs-diff.
def : InstRW<[CyWriteV5],
             (instrs FADDDrr, FADDv2f64,
                     FSUBDrr, FSUBv2f64,
                     FADDPv2f64,
                     FABD64, FABDv2f64)>;

// NOTE(review): FCMGE is not listed here although CMGE is listed for the
// integer compares above -- confirm whether the omission is intentional.
def : InstRW<[CyWriteV3], (instregex "FCMEQ", "FCMGT", "FCMLE", "FCMLT")>;

def : InstRW<[CyWriteV3],
             (instregex "FACGE", "FACGT",
                        "FMAXS", "FMAXD", "FMAXv",
                        "FMINS", "FMIND", "FMINv",
                        "FMAXNMS", "FMAXNMD", "FMAXNMv",
                        "FMINNMS", "FMINNMD", "FMINNMv",
                        "FMAXPv2f", "FMAXPv4f",
                        "FMINPv2f", "FMINPv4f",
                        "FMAXNMPv2f", "FMAXNMPv4f",
                        "FMINNMPv2f", "FMINNMPv4f")>;

// FCMP,FCMPE,FCCMP,FCCMPE
def : WriteRes<WriteFCmp, [CyUnitVC]> { let Latency = 4; }

// FCSEL is a WriteF.

//---
// 7.9.4 Shifts and Bitfield Operations
//---

// SHL is a WriteV

def CyWriteVSHR : SchedWriteRes<[CyUnitV]> { let Latency = 2; }
def : InstRW<[CyWriteVSHR], (instregex "SSHRv", "USHRv")>;

def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> { let Latency = 3; }
def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv", "URSHRv")>;

// Shift and accumulate uses the vector multiply unit.
def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> { let Latency = 3; }
def CyReadVShiftAcc : SchedReadAdvance<1, [CyWriteVShiftAcc,
                                           CyWriteVSHR,
                                           CyWriteVSRSHR]>;
def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
             (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>;

// SSHL,USHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SRSHLv", "URSHLv")>;

// SQSHL,SQSHLU,UQSHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SQRSHLv", "UQRSHLv")>;

// WriteV includes:
// SHLL,SSHLL,USHLL
// SLI,SRI
// BIF,BIT,BSL
// EXT
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
// XTN2

def : InstRW<[CyWriteV4],
             (instregex "RSHRNv", "SHRNv",
                        "SQRSHRNv", "SQRSHRUNv", "SQSHRNv", "SQSHRUNv",
                        "UQRSHRNv", "UQSHRNv", "SQXTNv", "SQXTUNv", "UQXTNv")>;

//---
// 7.9.5 Multiplication
//---

def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[CyWriteVMul],
             (instregex "MULv", "SMULLv", "UMULLv",
                        "SQDMULLv", "SQDMULHv", "SQRDMULHv")>;

// FMUL,FMULX,FNMUL default to WriteFMul.
def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4; }

// 64-bit element FP multiplies take one cycle longer than WriteFMul.
def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5; }
def : InstRW<[CyWriteV64Mul],
             (instrs FMULDrr, FMULv2f64, FMULv2i64_indexed,
                     FNMULDrr, FMULX64, FMULXv2f64, FMULXv2i64_indexed)>;

// Integer multiply-accumulate: the read is advanced by one cycle when the
// value comes from CyWriteVMul or CyWriteV64Mul.
def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
def : InstRW<[CyWriteVMul, CyReadVMulAcc],
             (instregex "MLA", "MLS", "SMLAL", "SMLSL", "UMLAL", "UMLSL",
                        "SQDMLAL", "SQDMLSL")>;

// FP multiply-accumulate write types (8 and 10 cycle latency) and the
// matching read advances (4 and 5 cycles), each keyed on its own write type.
def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def CyReadSMul  : SchedReadAdvance<4, [CyWriteSMul]>;
def CyReadDMul  : SchedReadAdvance<5, [CyWriteDMul]>;

// NOTE(review): this list contains FMLAv1i64_indexed alongside 32-bit forms
// and has no FMLS entries, unlike the CyWriteDMul list below -- confirm.
def : InstRW<[CyWriteSMul, CyReadSMul],
             (instrs FMADDSrrr, FMSUBSrrr, FNMADDSrrr, FNMSUBSrrr,
                     FMLAv2f32, FMLAv4f32,
                     FMLAv1i32_indexed, FMLAv1i64_indexed, FMLAv2i32_indexed)>;
def : InstRW<[CyWriteDMul, CyReadDMul],
             (instrs FMADDDrrr, FMSUBDrrr, FNMADDDrrr, FNMSUBDrrr,
                     FMLAv2f64, FMLAv2i64_indexed,
                     FMLSv2f64, FMLSv2i64_indexed)>;

// PMUL/PMULL use the CyUnitVD pipe with a 3 cycle latency.
def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;

//---
// 7.9.6 Divide and Square Root
//---

// FDIV,FSQRT
// TODO: Add 64-bit variant with 19 cycle latency.
// TODO: Specialize FSQRT for longer latency.
def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
  let Latency = 17;
  let ResourceCycles = [2, 17];
}

def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;

def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;

def WriteFRECPS  : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;

//---
// 7.9.7 Integer-FP Conversions
//---

// FCVT lengthen f16/s32
def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;

// FCVT,FCVTN,FCVTXN
// SCVTF,UCVTF V,V
// FRINT(AIMNPXZ) V,V
def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}

// SCVTF/UCVTF S/D, Rd = VLD5+V4: 9 cycles.
// The source is a GPR and the result an FPR, so this is the [SU]CVTF family
// and it uses the VLD+V4 sequence defined here. (The regexes of this mapping
// and the one below were previously swapped, and this one used
// CyWriteCopyToFPR, leaving CyWriteCvtToFPR unused.)
def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;

// FCVT Rd, S/D = V6+LD4: 10 cycles
// The source is an FPR and the result a GPR: the FCVT{A,M,N,P,Z}{S,U} family.
def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;

// FCVTL is a WriteV

//---
// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
//---

def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
                                       AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
                                       SHA1SU0rrr)>;

def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;

def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
                                       SHA256Hrrr,SHA256H2rrr)>;

// TRN,UZP,ZIP are WriteV.

// TBL,TBX are WriteV.

//---
// 7.9.11-7.9.14 Load/Store, single element and paired
//---

// Loading into the vector unit takes 5 cycles vs 4 for integer loads.
def : WriteRes<WriteVLD, [CyUnitLS]> { let Latency = 5; }

// Store-load forwarding is 4 cycles.
def : WriteRes<WriteVST, [CyUnitLS]> { let Latency = 4; }

// WriteVLDPair/VSTPair sequences are expanded by the target description.

//---
// 7.9.15 Load, element operations
//---

// Only the first WriteVLD and WriteAdr for writeback matches def operands.
// Subsequent WriteVLDs consume resources. Since all loaded values have the
// same latency, this is acceptable.

// Vd is read 5 cycles after issuing the vector load.
def : ReadAdvance<ReadVLD, 5>;

def : InstRW<[WriteVLD],
             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// Register writes from the load's high half are fused micro-ops.
def : InstRW<[WriteVLD],
             (instregex "LD1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;

// Lane loads also read the destination vector (ReadVLD).
def : InstRW<[WriteVLDShuffle, ReadVLD],
             (instregex "LD1i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
             (instregex "LD1i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;

def : InstRW<[WriteVLDShuffle],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i(8|16|32)_POST")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i64_POST")>;

def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
             (instregex "LD3i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
             (instregex "LD3i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
             (instregex "LD3i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3i64_POST")>;

def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;

// 64-bit element LD3R. The post-index list previously named LD3Rv2d_POST
// twice and omitted LD3Rv1d_POST; it now mirrors the non-writeback list.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d,LD3Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)_POST")>;

// The post-index form is the non-writeback write list plus WriteAdr, as for
// every other pair in this section (it was previously one WriteV short).
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
              WriteV],
             (instrs LD4i64_POST)>;

def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d,LD4Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;

//---
// 7.9.16 Store, element operations
//---

// Only the WriteAdr for writeback matches a def operand.
// Subsequent WriteVSTs only consume resources.

// ST1, one to four registers. In the writeback forms WriteAdr comes first.
def : InstRW<[WriteVST],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTShuffle],
             (instregex "ST1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST],
             (instregex "ST1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST],
             (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVST],
             (instregex "ST1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
             (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST],
             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;

// ST1, single lane.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST1i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST1i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle],
             (instrs ST1i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instrs ST1i64_POST)>;

// ST2.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST2Twov(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTShuffle],
             (instregex "ST2i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST2i(8|16|32)_POST")>;
def : InstRW<[WriteVSTShuffle],
             (instrs ST2i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instrs ST2i64_POST)>;

// ST3.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTShuffle],
             (instregex "ST3i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST3i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST3i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST3i64_POST)>;

// ST4.
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTPairShuffle],
             (instregex "ST4i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle],
             (instregex "ST4i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST4i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST4i64_POST)>;

//---
// Unused SchedRead types
//---

// These read types get a zero-cycle advance in this model.
def : ReadAdvance<ReadI,     0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM,    0>;
def : ReadAdvance<ReadIMA,   0>;
def : ReadAdvance<ReadID,    0>;

} // SchedModel = CycloneModel