//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AArch64 Cyclone to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

// Top-level machine model record for Cyclone.
def CycloneModel : SchedMachineModel {
  let IssueWidth        =   6; // Micro-ops dispatched per cycle.
  let MicroOpBufferSize = 192; // Sized after the reorder buffer.
  let LoadLatency       =   4; // Optimistic load latency.
  let MispredictPenalty =  16; // Typical range is 14-19 cycles.
  let CompleteModel     =   1;

  list<Predicate> UnsupportedFeatures = [HasSVE];
}

//===----------------------------------------------------------------------===//
// Processor resources: one record per kind of unit, together with the number
// of units of that kind available on Cyclone.

// Integer pipes (4).
def CyUnitI : ProcResource<4> { let BufferSize = 48; }

// Branch units (2): I[0..1].
def CyUnitB : ProcResource<2> {
  let BufferSize = 24;
  let Super = CyUnitI;
}

// Indirect-branch unit (1): I[0].
def CyUnitBR : ProcResource<1> { let Super = CyUnitB; }

// Shifter pipes (2): I[2..3].
// An instruction that consumes a CyUnitIS also consumes a CyUnitI.
def CyUnitIS : ProcResource<2> {
  let BufferSize = 24;
  let Super = CyUnitI;
}

// Multiply pipe (1): I[0].
def CyUnitIM : ProcResource<1> {
  let BufferSize = 32;
  let Super = CyUnitBR;
}

// Divide pipe (1): I[1].
def CyUnitID : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitB;
}

// Integer division unit (1). It is driven by the ID pipe, but the pipe itself
// is only held for one cycle at issue and one more cycle at writeback.
def CyUnitIntDiv : ProcResource<1>;

// Load/store pipes (2).
def CyUnitLS : ProcResource<2> { let BufferSize = 28; }

// FP/vector pipes (3).
def CyUnitV : ProcResource<3> { let BufferSize = 48; }

// FP/vector arithmetic and multiply pipes (2): V[0-1].
def CyUnitVM : ProcResource<2> {
  let BufferSize = 32;
  let Super = CyUnitV;
}

// FP/vector division/sqrt pipe (1): V[2].
def CyUnitVD : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitV;
}

// FP compare pipe (1): V[0].
def CyUnitVC : ProcResource<1> {
  let BufferSize = 16;
  let Super = CyUnitVM;
}

// FP division/square-root units (2). They are driven by the VD pipe, but the
// pipe itself is only held for one cycle at issue and one cycle at writeback.
def CyUnitFloatDiv : ProcResource<2>;

//===----------------------------------------------------------------------===//
// Scheduler read/write resources and latencies for Cyclone.
// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.

let SchedModel = CycloneModel in {

//---
// 7.8.1. Moves
//---

// A single nop micro-op (uX): no resources, zero latency.
def WriteX : SchedWriteRes<[]> { let Latency = 0; }

// Move zero is a register rename (to machine register zero), so the move is
// replaced by a single nop micro-op.
// MOVZ Rd, #0
// AND Rd, Rzr, #imm
def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
def WriteImmZ : SchedWriteVariant<[SchedVar<WriteZPred,  [WriteX]>,
                                   SchedVar<NoSchedPred, [WriteImm]>]>;
def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;

// Move GPR is a register rename and single nop micro-op.
// ORR Xd, XZR, Xm
// ADD Xd, Xn, #0
def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
// Copies recognised by either predicate become the nop micro-op; anything
// else falls back to a plain integer ALU write.
def WriteMov : SchedWriteVariant<[SchedVar<WriteIMovPred, [WriteX]>,
                                  SchedVar<WriteVMovPred, [WriteX]>,
                                  SchedVar<NoSchedPred,   [WriteI]>]>;
def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;

// Moving a non-zero immediate is an integer ALU op.
// MOVN,MOVZ,MOVK
def : WriteRes<WriteImm, [CyUnitI]>;

//---
// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
//              Shifts and Bitfield Operations
//---

// Plain integer ALU operations:
// ADR,ADRP
// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
// ADC(S),SBC(S)
// Aliases: CMN, CMP, TST
//
// Conditional operations.
// CCMNi,CCMPi,CCMNr,CCMPr,
// CSEL,CSINC,CSINV,CSNEG
//
// Bit counting and reversal operations.
// CLS,CLZ,RBIT,REV,REV16,REV32
def : WriteRes<WriteI, [CyUnitI]>;

// Both of the following are a single micro-op that holds a shift pipeline
// for two cycles.
let Latency = 2, ResourceCycles = [2] in {
  // ADD with shifted register operand.
  // ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
  // EXAMPLE: ADDrs Xn, Xm LSL #imm
  def : WriteRes<WriteISReg, [CyUnitIS]>;

  // ADD with extended register operand behaves like the shifted-reg form.
  // ADD(S)re,SUB(S)re
  // EXAMPLE: ADDXre Xn, Xm, UXTB #1
  def : WriteRes<WriteIEReg, [CyUnitIS]>;
}

// Variable shift and bitfield operations.
// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
def : WriteRes<WriteIS, [CyUnitIS]>;

// EXTR shifts a pair of registers and requires two micro-ops.
// The second micro-op is delayed, as modeled by ReadExtrHi.
// EXTR Xn, Xm, #imm
def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
  let NumMicroOps = 2;
  let Latency = 2;
}

// The first register read of EXTR happens one cycle late, which effectively
// shortens the latency of whatever wrote that register.
// EXTR Xn, Xm, #imm
def : ReadAdvance<ReadExtrHi, 1>;

//---
// 7.8.6. Multiplies
//---

// MUL/MNEG are aliases for MADD/MSUB.
// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
def : WriteRes<WriteIM32, [CyUnitIM]> { let Latency = 4; }
// MADDX,MSUBX,SMULH,UMULH
def : WriteRes<WriteIM64, [CyUnitIM]> { let Latency = 5; }

//---
// 7.8.7. Divide
//---

// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
// The ID pipe is held for 2 cycles (issue and writeback); the divide unit is
// held for the whole latency.
// SDIVW,UDIVW
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
  let ResourceCycles = [2, 10];
  let Latency = 10;
}
// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
// The ID pipe is held for 2 cycles (issue and writeback); the divide unit is
// held for the whole latency.
// SDIVX,UDIVX
def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
  let ResourceCycles = [2, 13];
  let Latency = 13;
}

//---
// 7.8.8,7.8.10. Load/Store, single element
//---

// Integer loads take 4 cycles and use one LS unit for one cycle.
def : WriteRes<WriteLD, [CyUnitLS]> { let Latency = 4; }

// Store-load forwarding is 4 cycles.
//
// Note: The store-exclusive sequence incorporates this latency. However,
// general heuristics should not model the dependence between a store and a
// subsequent may-alias load because hardware speculation works.
def : WriteRes<WriteST, [CyUnitLS]> { let Latency = 4; }

// Load from base address plus an optionally scaled register offset.
// Rt latency is latency WriteIS + WriteLD.
// EXAMPLE: LDR Xn, Xm [, lsl 3]
def CyWriteLDIdx : SchedWriteVariant<[
  // Load from scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>,
  // Load from register offset.
  SchedVar<NoSchedPred,   [WriteLD]>]>;
// Map AArch64->Cyclone type.
def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;

// EXAMPLE: STR Xn, Xm [, lsl 3]
def CyWriteSTIdx : SchedWriteVariant<[
  // Store to scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>,
  // Store to register offset.
  SchedVar<NoSchedPred,   [WriteST]>]>;
// Map AArch64->Cyclone type.
def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;

// The (unshifted) base register Xn is read by the second micro-op, one cycle
// later.
// EXAMPLE: LDR Xn, Xm [, lsl 3]
def ReadBaseRS : SchedReadAdvance<1>;
def CyReadAdrBase : SchedReadVariant<[
  // Read base reg after shifting offset.
  SchedVar<ScaledIdxPred, [ReadBaseRS]>,
  // Read base reg with no shift.
  SchedVar<NoSchedPred,   [ReadDefault]>]>;
// Map AArch64->Cyclone type.
def : SchedAlias<ReadAdrBase, CyReadAdrBase>;

//---
// 7.8.9,7.8.11. Load/Store, paired
//---

// Address pre/post increment is a simple ALU op with one cycle latency.
def : WriteRes<WriteAdr, [CyUnitI]>;

// The LDP high register write is fused with the load, but a nop micro-op
// remains.
def : WriteRes<WriteLDHi, []> { let Latency = 4; }

// STP is a vector op and store, except for QQ, which is just two stores.
def : SchedAlias<WriteSTP, WriteVSTShuffle>;
def : InstRW<[WriteST, WriteST], (instrs STPQi)>;

//---
// 7.8.13. Branches
//---

// Branches take a single micro-op.
// The misprediction penalty is defined as a SchedMachineModel property.
let Latency = 0 in {
  def : WriteRes<WriteBr,    [CyUnitB]>;
  def : WriteRes<WriteBrReg, [CyUnitBR]>;
}

//---
// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
//---

// NOP,SEV,SEVL,WFE,WFI,YIELD
def : WriteRes<WriteHint, []> { let Latency = 0; }
// ISB
def : InstRW<[WriteI], (instrs ISB)>;
// CLREX,DMB,DSB
def : WriteRes<WriteBarrier, [CyUnitLS]>;

// System instructions get an invalid latency because the latency of other
// operations across them is meaningless.
def : WriteRes<WriteSys, []> { let Latency = -1; }

//===----------------------------------------------------------------------===//
// 7.9 Vector Unit Instructions

// Simple vector operations take 2 cycles.
def : WriteRes<WriteV, [CyUnitV]> { let Latency = 2; }

// Longer-latency vector op types specific to Cyclone.
def CyWriteV3 : SchedWriteRes<[CyUnitV]> { let Latency = 3; }
def CyWriteV4 : SchedWriteRes<[CyUnitV]> { let Latency = 4; }
def CyWriteV5 : SchedWriteRes<[CyUnitV]> { let Latency = 5; }
def CyWriteV6 : SchedWriteRes<[CyUnitV]> { let Latency = 6; }

// Simple floating-point operations take 2 cycles.
def : WriteRes<WriteF, [CyUnitV]> { let Latency = 2; }

//---
// 7.9.1 Vector Moves
//---

// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
// generates expensive int-float conversion instead:
// FMOVDi Dd, #0.0
// FMOVv2f64ns Vd.2d, #0.0

// FMOVSi,FMOVDi
def : WriteRes<WriteFImm, [CyUnitV]> { let Latency = 2; }

// MOVI,MVNI are WriteV
// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV

// Move FPR is a register rename and single nop micro-op.
// ORR.16b Vd,Vn,Vn
// COPY is handled above in the WriteMov Variant.
def WriteVMov : SchedWriteVariant<[SchedVar<WriteVMovPred, [WriteX]>,
                                   SchedVar<NoSchedPred,   [WriteV]>]>;
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;

// FMOVSr,FMOVDr are WriteF.

// MOV V,V is a WriteV.

// CPY D,V[x] is a WriteV

// INS V[x],V[y] is a WriteV.
// FMOVWSr,FMOVXDr,FMOVXDHighr use a load/store pipe with 5 cycle latency.
def : WriteRes<WriteFCopy, [CyUnitLS]> { let Latency = 5; }

// FMOVSWr,FMOVDXr,FMOVDXHighr are scheduled as an integer load (WriteLD).
def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;

// INS V[x],R
def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;

// SMOV,UMOV R,V[x]
def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;

// DUP V,R
def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;

// DUP V,V[x] is a WriteV.

//---
// 7.9.2 Integer Arithmetic, Logical, and Comparisons
//---

// BIC,ORR V,#imm are WriteV

def : InstRW<[CyWriteV3], (instregex "ABSv")>;

// MVN,NEG,NOT are WriteV

def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;

// ADDP is a WriteV.
// The pairwise long adds get their own write type so that the accumulate
// read advance declared further down can name it.
def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> { let Latency = 2; }
def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;

def : InstRW<[CyWriteV3], (instregex "ADDVv",
                                     "SMAXVv","UMAXVv",
                                     "SMINVv","UMINVv")>;

def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;

// ADD,SUB are WriteV

// Forward declare: used by the accumulate read advance before the
// instructions that produce it are listed.
def CyWriteVABD : SchedWriteRes<[CyUnitV]> { let Latency = 3; }

// Add/Diff and accumulate uses the vector multiply unit.
def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> { let Latency = 3; }
// The accumulator operand is read one cycle late when it was produced by one
// of the listed writes.
def CyReadVAccum : SchedReadAdvance<1, [CyWriteVAccum,
                                        CyWriteVADDLP,
                                        CyWriteVABD]>;

def : InstRW<[CyWriteVAccum, CyReadVAccum], (instregex "SADALP","UADALP")>;

def : InstRW<[CyWriteVAccum, CyReadVAccum],
             (instregex "SABAv","UABAv",
                        "SABALv","UABALv")>;

def : InstRW<[CyWriteV3],
             (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;

def : InstRW<[CyWriteV3],
             (instregex "SUQADDv","USQADDv")>;

def : InstRW<[CyWriteV4],
             (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;

// WriteV includes:
// AND,BIC,CMTST,EOR,ORN,ORR
// ADDP
// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
// SADDL,SSUBL,UADDL,USUBL
// SADDW,SSUBW,UADDW,USUBW

// Integer vector compares.
def : InstRW<[CyWriteV3],
             (instregex "CMEQv","CMGEv","CMGTv",
                        "CMLEv","CMLTv",
                        "CMHIv","CMHSv")>;

// Integer vector min/max, element-wise and pairwise.
def : InstRW<[CyWriteV3],
             (instregex "SMAXv","SMINv","UMAXv","UMINv",
                        "SMAXPv","SMINPv","UMAXPv","UMINPv")>;

// Absolute difference (feeds the accumulate read advance above).
def : InstRW<[CyWriteVABD],
             (instregex "SABDv","UABDv",
                        "SABDLv","UABDLv")>;

//---
// 7.9.3 Floating Point Arithmetic and Comparisons
//---

// FABS,FNEG are WriteF

def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;

def : InstRW<[CyWriteV3],
             (instregex "FMAXPv2i","FMAXNMPv2i",
                        "FMINPv2i","FMINNMPv2i")>;

def : InstRW<[CyWriteV4],
             (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;

// The listed 32-bit add/sub/abd forms take 4 cycles ...
def : InstRW<[CyWriteV4],
             (instrs FADDSrr,FADDv2f32,FADDv4f32,
                     FSUBSrr,FSUBv2f32,FSUBv4f32,
                     FADDPv2f32,FADDPv4f32,
                     FABD32,FABDv2f32,FABDv4f32)>;
// ... and the listed 64-bit forms take 5.
def : InstRW<[CyWriteV5],
             (instrs FADDDrr,FADDv2f64,
                     FSUBDrr,FSUBv2f64,
                     FADDPv2f64,
                     FABD64,FABDv2f64)>;

def : InstRW<[CyWriteV3],
             (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;

def : InstRW<[CyWriteV3],
             (instregex "FACGE","FACGT",
                        "FMAXS","FMAXD","FMAXv",
                        "FMINS","FMIND","FMINv",
                        "FMAXNMS","FMAXNMD","FMAXNMv",
                        "FMINNMS","FMINNMD","FMINNMv",
                        "FMAXPv2f","FMAXPv4f",
                        "FMINPv2f","FMINPv4f",
                        "FMAXNMPv2f","FMAXNMPv4f",
                        "FMINNMPv2f","FMINNMPv4f")>;

// FCMP,FCMPE,FCCMP,FCCMPE
def : WriteRes<WriteFCmp, [CyUnitVC]> { let Latency = 4; }

// FCSEL is a WriteF.

//---
// 7.9.4 Shifts and Bitfield Operations
//---

// SHL is a WriteV

def CyWriteVSHR : SchedWriteRes<[CyUnitV]> { let Latency = 2; }
def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;

def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> { let Latency = 3; }
def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;

// Shift and accumulate uses the vector multiply unit.
def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> { let Latency = 3; }
def CyReadVShiftAcc : SchedReadAdvance<1, [CyWriteVShiftAcc,
                                           CyWriteVSHR,
                                           CyWriteVSRSHR]>;
def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
             (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;

// SSHL,USHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;

// SQSHL,SQSHLU,UQSHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;

// WriteV includes:
// SHLL,SSHLL,USHLL
// SLI,SRI
// BIF,BIT,BSL
// EXT
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
// XTN2

def : InstRW<[CyWriteV4],
             (instregex "RSHRNv","SHRNv",
                        "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
                        "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;

//---
// 7.9.5 Multiplication
//---

def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[CyWriteVMul],
             (instregex "MULv","SMULLv","UMULLv",
                        "SQDMULLv","SQDMULHv","SQRDMULHv")>;

// FMUL,FMULX,FNMUL default to WriteFMul.
def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4; }

// The listed 64-bit FP multiplies take one cycle longer than WriteFMul.
def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5; }
def : InstRW<[CyWriteV64Mul],
             (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
                     FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;

// Integer multiply-accumulate: the read is advanced by one cycle when the
// value comes from one of the listed multiply writes.
def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
def : InstRW<[CyWriteVMul, CyReadVMulAcc],
             (instregex "MLA","MLS",
                        "SMLAL","SMLSL",
                        "UMLAL","UMLSL",
                        "SQDMLAL","SQDMLSL")>;

// Fused FP multiply-add: write latency and matching read advance for the
// single-precision (S) and double-precision (D) groups.
def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def CyReadSMul  : SchedReadAdvance<4, [CyWriteSMul]>;
def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def CyReadDMul  : SchedReadAdvance<5, [CyWriteDMul]>;

def : InstRW<[CyWriteSMul, CyReadSMul],
             (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
                     FMLAv2f32,FMLAv4f32,
                     FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
def : InstRW<[CyWriteDMul, CyReadDMul],
             (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
                     FMLAv2f64,FMLAv2i64_indexed,
                     FMLSv2f64,FMLSv2i64_indexed)>;

// Polynomial multiplies are issued on the VD pipe.
def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;

//---
// 7.9.6 Divide and Square Root
//---

// FDIV,FSQRT
// TODO: Add 64-bit variant with 19 cycle latency.
// TODO: Specialize FSQRT for longer latency.
def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
  let Latency = 17;
  // The VD pipe is held for 2 cycles; the divide unit for the full latency.
  let ResourceCycles = [2, 17];
}

def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;

def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;

def WriteFRECPS  : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;

//---
// 7.9.7 Integer-FP Conversions
//---

// FCVT lengthen f16/s32
def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;

// FCVT,FCVTN,FCVTXN
// SCVTF,UCVTF V,V
// FRINT(AIMNPXZ) V,V
def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}

// Conversions that cross between the GPR and FPR register files.
//
// The two InstRW records below were previously attached to the wrong
// instruction groups: the FCVT* (FPR -> GPR) regex was bound to
// CyWriteCopyToFPR, the [SU]CVTF (GPR -> FPR) regex was bound to
// CyWriteCvtToGPR, and CyWriteCvtToFPR was never referenced. Each regex is
// now bound to the sequence its comment and name describe.

// SCVTF/UCVTF S/D, Rn (GPR -> FPR) = VLD5+V4: 9 cycles.
def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;

// FCVT{A,M,N,P,Z}{S,U} Rd, S/D (FPR -> GPR) = V6+LD4: 10 cycles
def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;

// FCVTL is a WriteV

//---
// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
//---

// Crypto operations are issued on the VD pipe, grouped by latency.
def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
                                       AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
                                       SHA1SU0rrr)>;

def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;

def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
                                       SHA256Hrrr,SHA256H2rrr)>;

// TRN,UZP,ZIP are WriteV.

// TBL,TBX are WriteV.
//---
// 7.9.11-7.9.14 Load/Store, single element and paired
//---

// A load into the vector unit takes 5 cycles, versus 4 for integer loads.
def : WriteRes<WriteVLD, [CyUnitLS]> { let Latency = 5; }

// Store-load forwarding is 4 cycles.
def : WriteRes<WriteVST, [CyUnitLS]> { let Latency = 4; }

// WriteVLDPair/VSTPair sequences are expanded by the target description.

//---
// 7.9.15 Load, element operations
//---

// Only the first WriteVLD, and the WriteAdr for writeback, match def operands.
// Subsequent WriteVLDs consume resources. Since all loaded values have the
// same latency, this is acceptable.

// Vd is read 5 cycles after issuing the vector load.
def : ReadAdvance<ReadVLD, 5>;

def : InstRW<[WriteVLD], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// Register writes from the load's high half are fused micro-ops.
// In every pair below the _POST entry is the base entry with a WriteAdr
// inserted after the first write (for the writeback register).

// LD1Twov forms.
def : InstRW<[WriteVLD],
             (instregex "LD1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;

// LD1Threev forms.
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;

// LD1Fourv forms.
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;

// LD1i forms (these also read the destination vector via ReadVLD).
def : InstRW<[WriteVLDShuffle, ReadVLD],
             (instregex "LD1i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
             (instregex "LD1i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;

// LD1Rv forms.
def : InstRW<[WriteVLDShuffle],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

// LD2Twov forms.
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;

// LD2i forms.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i(8|16|32)_POST")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i64_POST")>;

// LD2Rv forms.
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// LD3Threev forms.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;

// LD3i forms.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
             (instregex "LD3i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
             (instregex "LD3i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
             (instregex "LD3i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3i64_POST")>;

// LD3Rv forms.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d,LD3Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;

// LD4Fourv forms.
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;

// LD4i forms.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)_POST")>;

// LD4i64_POST previously listed one WriteV fewer than LD4i64, which made it
// identical to LD3i64_POST. It now matches LD4i64 plus the writeback WriteAdr.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
              WriteV],
             (instrs LD4i64_POST)>;

// LD4Rv forms.
def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d,LD4Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;

//---
// 7.9.16 Store, element operations
//---

// Only the WriteAdr for writeback matches a def operand.
// Subsequent WriteVSTs only consume resources.
// ST1Onev forms.
def : InstRW<[WriteVST],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST],
             (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// ST1Twov forms.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST],
             (instregex "ST1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST],
             (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;

// ST1Threev forms.
def : InstRW<[WriteVSTShuffle, WriteVST],
             (instregex "ST1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
             (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST],
             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;

// ST1Fourv forms.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
             (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;

// ST1i forms.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST1i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST1i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle],
             (instrs ST1i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instrs ST1i64_POST)>;

// ST2Twov forms.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST2Twov(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;

// ST2i forms.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST2i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST2i(8|16|32)_POST")>;
def : InstRW<[WriteVSTShuffle],
             (instrs ST2i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instrs ST2i64_POST)>;

// ST3Threev forms.
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
             (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;

// ST3i forms.
def : InstRW<[WriteVSTShuffle],
             (instregex "ST3i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
             (instregex "ST3i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST3i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST3i64_POST)>;

// ST4Fourv forms.
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
             (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;

// ST4i forms.
def : InstRW<[WriteVSTPairShuffle],
             (instregex "ST4i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle],
             (instregex "ST4i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST4i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
             (instrs ST4i64_POST)>;

// Atomic operations are not supported.
let Unsupported = 1 in
def : WriteRes<WriteAtomic, []>;

//---
// Unused SchedRead types
//---

// Each of these SchedReads gets a zero-cycle ReadAdvance in this model.
def : ReadAdvance<ReadI,     0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM,    0>;
def : ReadAdvance<ReadIMA,   0>;
def : ReadAdvance<ReadID,    0>;

} // SchedModel = CycloneModel