Home | History | Annotate | Download | only in AArch64
      1 //=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the machine model for AArch64 Cyclone to support
     11 // instruction scheduling and other instruction cost heuristics.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 def CycloneModel : SchedMachineModel {
     16   let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
     17   let MicroOpBufferSize = 192; // Based on the reorder buffer.
     18   let LoadLatency = 4; // Optimistic load latency.
     19   let MispredictPenalty = 16; // 14-19 cycles are typical.
     20   let CompleteModel = 1;
     21 
     22   list<Predicate> UnsupportedFeatures = [HasSVE];
     23 }
     24 
     25 //===----------------------------------------------------------------------===//
     26 // Define each kind of processor resource and number available on Cyclone.
     27 
     28 // 4 integer pipes
     29 def CyUnitI : ProcResource<4> {
     30   let BufferSize = 48;
     31 }
     32 
     33 // 2 branch units: I[0..1]
     34 def CyUnitB : ProcResource<2> {
     35   let Super  = CyUnitI;
     36   let BufferSize = 24;
     37 }
     38 
     39 // 1 indirect-branch unit: I[0]
     40 def CyUnitBR : ProcResource<1> {
     41   let Super  = CyUnitB;
     42 }
     43 
     44 // 2 shifter pipes: I[2..3]
     45 // When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
     46 def CyUnitIS : ProcResource<2> {
     47   let Super = CyUnitI;
     48   let BufferSize = 24;
     49 }
     50 
     51 // 1 mul pipe: I[0]
     52 def CyUnitIM : ProcResource<1> {
     53   let Super = CyUnitBR;
     54   let BufferSize = 32;
     55 }
     56 
     57 // 1 div pipe: I[1]
     58 def CyUnitID : ProcResource<1> {
     59   let Super = CyUnitB;
     60   let BufferSize = 16;
     61 }
     62 
     63 // 1 integer division unit. This is driven by the ID pipe, but only
     64 // consumes the pipe for one cycle at issue and another cycle at writeback.
     65 def CyUnitIntDiv : ProcResource<1>;
     66 
     67 // 2 ld/st pipes.
     68 def CyUnitLS : ProcResource<2> {
     69   let BufferSize = 28;
     70 }
     71 
     72 // 3 fp/vector pipes.
     73 def CyUnitV : ProcResource<3> {
     74   let BufferSize = 48;
     75 }
     76 // 2 fp/vector arithmetic and multiply pipes: V[0-1]
     77 def CyUnitVM : ProcResource<2> {
     78   let Super = CyUnitV;
     79   let BufferSize = 32;
     80 }
     81 // 1 fp/vector division/sqrt pipe: V[2]
     82 def CyUnitVD : ProcResource<1> {
     83   let Super = CyUnitV;
     84   let BufferSize = 16;
     85 }
     86 // 1 fp compare pipe: V[0]
     87 def CyUnitVC : ProcResource<1> {
     88   let Super = CyUnitVM;
     89   let BufferSize = 16;
     90 }
     91 
     92 // 2 fp division/square-root units.  These are driven by the VD pipe,
     93 // but only consume the pipe for one cycle at issue and a cycle at writeback.
     94 def CyUnitFloatDiv : ProcResource<2>;
     95 
     96 //===----------------------------------------------------------------------===//
     97 // Define scheduler read/write resources and latency on Cyclone.
     98 // This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
     99 
    100 let SchedModel = CycloneModel in {
    101 
    102 //---
    103 // 7.8.1. Moves
    104 //---
    105 
    106 // A single nop micro-op (uX).
    107 def WriteX : SchedWriteRes<[]> { let Latency = 0; }
    108 
    109 // Move zero is a register rename (to machine register zero).
    110 // The move is replaced by a single nop micro-op.
    111 // MOVZ Rd, #0
    112 // AND Rd, Rzr, #imm
    113 def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
    114 def WriteImmZ  : SchedWriteVariant<[
    115                    SchedVar<WriteZPred, [WriteX]>,
    116                    SchedVar<NoSchedPred, [WriteImm]>]>;
    117 def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
    118 
    119 // Move GPR is a register rename and single nop micro-op.
    120 // ORR Xd, XZR, Xm
    121 // ADD Xd, Xn, #0
    122 def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
    123 def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
    124 def WriteMov      : SchedWriteVariant<[
    125                       SchedVar<WriteIMovPred, [WriteX]>,
    126                       SchedVar<WriteVMovPred, [WriteX]>,
    127                       SchedVar<NoSchedPred,   [WriteI]>]>;
    128 def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
    129 
    130 // Move non-zero immediate is an integer ALU op.
    131 // MOVN,MOVZ,MOVK
    132 def : WriteRes<WriteImm, [CyUnitI]>;
    133 
    134 //---
    135 // 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
    136 //              Shifts and Bitfield Operations
    137 //---
    138 
    139 // ADR,ADRP
    140 // ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
    141 // ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
    142 // ADC(S),SBC(S)
    143 // Aliases: CMN, CMP, TST
    144 //
    145 // Conditional operations.
    146 // CCMNi,CCMPi,CCMNr,CCMPr,
    147 // CSEL,CSINC,CSINV,CSNEG
    148 //
    149 // Bit counting and reversal operations.
    150 // CLS,CLZ,RBIT,REV,REV16,REV32
    151 def : WriteRes<WriteI, [CyUnitI]>;
    152 
    153 // ADD with shifted register operand is a single micro-op that
    154 // consumes a shift pipeline for two cycles.
    155 // ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
    156 // EXAMPLE: ADDrs Xn, Xm LSL #imm
    157 def : WriteRes<WriteISReg, [CyUnitIS]> {
    158   let Latency = 2;
    159   let ResourceCycles = [2];
    160 }
    161 
    162 // ADD with extended register operand is the same as shifted reg operand.
    163 // ADD(S)re,SUB(S)re
    164 // EXAMPLE: ADDXre Xn, Xm, UXTB #1
    165 def : WriteRes<WriteIEReg, [CyUnitIS]> {
    166   let Latency = 2;
    167   let ResourceCycles = [2];
    168 }
    169 
    170 // Variable shift and bitfield operations.
    171 // ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
    172 def : WriteRes<WriteIS, [CyUnitIS]>;
    173 
    174 // EXTR Shifts a pair of registers and requires two micro-ops.
    175 // The second micro-op is delayed, as modeled by ReadExtrHi.
    176 // EXTR Xn, Xm, #imm
    177 def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
    178   let Latency = 2;
    179   let NumMicroOps = 2;
    180 }
    181 
    182 // EXTR's first register read is delayed by one cycle, effectively
    183 // shortening its writer's latency.
    184 // EXTR Xn, Xm, #imm
    185 def : ReadAdvance<ReadExtrHi, 1>;
    186 
    187 //---
    188 // 7.8.6. Multiplies
    189 //---
    190 
    191 // MUL/MNEG are aliases for MADD/MSUB.
    192 // MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
    193 def : WriteRes<WriteIM32, [CyUnitIM]> {
    194   let Latency = 4;
    195 }
    196 // MADDX,MSUBX,SMULH,UMULH
    197 def : WriteRes<WriteIM64, [CyUnitIM]> {
    198   let Latency = 5;
    199 }
    200 
    201 //---
    202 // 7.8.7. Divide
    203 //---
    204 
    205 // 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
    206 // The ID pipe is consumed for 2 cycles: issue and writeback.
    207 // SDIVW,UDIVW
    208 def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
    209   let Latency = 10;
    210   let ResourceCycles = [2, 10];
    211 }
    212 // 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
    213 // The ID pipe is consumed for 2 cycles: issue and writeback.
    214 // SDIVX,UDIVX
    215 def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
    216   let Latency = 13;
    217   let ResourceCycles = [2, 13];
    218 }
    219 
    220 //---
    221 // 7.8.8,7.8.10. Load/Store, single element
    222 //---
    223 
    224 // Integer loads take 4 cycles and use one LS unit for one cycle.
    225 def : WriteRes<WriteLD, [CyUnitLS]> {
    226   let Latency = 4;
    227 }
    228 
    229 // Store-load forwarding is 4 cycles.
    230 //
    231 // Note: The store-exclusive sequence incorporates this
    232 // latency. However, general heuristics should not model the
    233 // dependence between a store and subsequent may-alias load because
    234 // hardware speculation works.
    235 def : WriteRes<WriteST, [CyUnitLS]> {
    236   let Latency = 4;
    237 }
    238 
    239 // Load from base address plus an optionally scaled register offset.
    240 // Rt latency is latency WriteIS + WriteLD.
    241 // EXAMPLE: LDR Xn, Xm [, lsl 3]
    242 def CyWriteLDIdx : SchedWriteVariant<[
    243   SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
    244   SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
    245 def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
    246 
    247 // EXAMPLE: STR Xn, Xm [, lsl 3]
    248 def CyWriteSTIdx : SchedWriteVariant<[
    249   SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
    250   SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
    251 def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
    252 
    253 // Read the (unshifted) base register Xn in the second micro-op one cycle later.
    254 // EXAMPLE: LDR Xn, Xm [, lsl 3]
    255 def ReadBaseRS : SchedReadAdvance<1>;
    256 def CyReadAdrBase : SchedReadVariant<[
    257   SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
    258   SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
    259 def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
    260 
    261 //---
    262 // 7.8.9,7.8.11. Load/Store, paired
    263 //---
    264 
    265 // Address pre/post increment is a simple ALU op with one cycle latency.
    266 def : WriteRes<WriteAdr, [CyUnitI]>;
    267 
    268 // LDP high register write is fused with the load, but a nop micro-op remains.
    269 def : WriteRes<WriteLDHi, []> {
    270   let Latency = 4;
    271 }
    272 
    273 // STP is a vector op and store, except for QQ, which is just two stores.
    274 def : SchedAlias<WriteSTP, WriteVSTShuffle>;
    275 def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
    276 
    277 //---
    278 // 7.8.13. Branches
    279 //---
    280 
    281 // Branches take a single micro-op.
    282 // The misprediction penalty is defined as a SchedMachineModel property.
    283 def : WriteRes<WriteBr,    [CyUnitB]>  {let Latency = 0;}
    284 def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
    285 
    286 //---
    287 // 7.8.14. Never-issued Instructions, Barrier and Hint Operations
    288 //---
    289 
    290 // NOP,SEV,SEVL,WFE,WFI,YIELD
    291 def : WriteRes<WriteHint, []> {let Latency = 0;}
    292 // ISB
    293 def : InstRW<[WriteI], (instrs ISB)>;
    294 // SLREX,DMB,DSB
    295 def : WriteRes<WriteBarrier, [CyUnitLS]>;
    296 
    297 // System instructions get an invalid latency because the latency of
    298 // other operations across them is meaningless.
    299 def : WriteRes<WriteSys, []> {let Latency = -1;}
    300 
    301 //===----------------------------------------------------------------------===//
    302 // 7.9 Vector Unit Instructions
    303 
    304 // Simple vector operations take 2 cycles.
    305 def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
    306 
    307 // Define some longer latency vector op types for Cyclone.
    308 def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
    309 def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
    310 def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
    311 def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
    312 
    313 // Simple floating-point operations take 2 cycles.
    314 def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
    315 
    316 //---
    317 // 7.9.1 Vector Moves
    318 //---
    319 
    320 // TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
    321 // generates expensive int-float conversion instead:
    322 // FMOVDi Dd, #0.0
    323 // FMOVv2f64ns Vd.2d, #0.0
    324 
    325 // FMOVSi,FMOVDi
    326 def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
    327 
    328 // MOVI,MVNI are WriteV
    329 // FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
    330 
    331 // Move FPR is a register rename and single nop micro-op.
    332 // ORR.16b Vd,Vn,Vn
    333 // COPY is handled above in the WriteMov Variant.
    334 def WriteVMov    : SchedWriteVariant<[
    335                      SchedVar<WriteVMovPred, [WriteX]>,
    336                      SchedVar<NoSchedPred,   [WriteV]>]>;
    337 def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
    338 
    339 // FMOVSr,FMOVDr are WriteF.
    340 
    341 // MOV V,V is a WriteV.
    342 
    343 // CPY D,V[x] is a WriteV
    344 
    345 // INS V[x],V[y] is a WriteV.
    346 
    347 // FMOVWSr,FMOVXDr,FMOVXDHighr
    348 def : WriteRes<WriteFCopy, [CyUnitLS]> {
    349   let Latency = 5;
    350 }
    351 
    352 // FMOVSWr,FMOVDXr
    353 def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
    354 
    355 // INS V[x],R
    356 def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
    357 def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
    358 
    359 // SMOV,UMOV R,V[x]
    360 def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
    361 def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
    362 
    363 // DUP V,R
    364 def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
    365 
    366 // DUP V,V[x] is a WriteV.
    367 
    368 //---
    369 // 7.9.2 Integer Arithmetic, Logical, and Comparisons
    370 //---
    371 
    372 // BIC,ORR V,#imm are WriteV
    373 
    374 def : InstRW<[CyWriteV3], (instregex "ABSv")>;
    375 
    376 // MVN,NEG,NOT are WriteV
    377 
    378 def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
    379 
    380 // ADDP is a WriteV.
    381 def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
    382 def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
    383 
    384 def : InstRW<[CyWriteV3],
    385              (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
    386 
    387 def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
    388 
    389 // ADD,SUB are WriteV
    390 
    391 // Forward declare.
    392 def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
    393 
    394 // Add/Diff and accumulate uses the vector multiply unit.
    395 def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
    396 def CyReadVAccum  : SchedReadAdvance<1,
    397                     [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
    398 
    399 def : InstRW<[CyWriteVAccum, CyReadVAccum],
    400              (instregex "SADALP","UADALP")>;
    401 
    402 def : InstRW<[CyWriteVAccum, CyReadVAccum],
    403              (instregex "SABAv","UABAv","SABALv","UABALv")>;
    404 
    405 def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
    406 
    407 def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
    408 
    409 def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
    410 
    411 // WriteV includes:
    412 // AND,BIC,CMTST,EOR,ORN,ORR
    413 // ADDP
    414 // SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
    415 // SADDL,SSUBL,UADDL,USUBL
    416 // SADDW,SSUBW,UADDW,USUBW
    417 
    418 def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
    419                                      "CMLEv","CMLTv",
    420                                      "CMHIv","CMHSv")>;
    421 
    422 def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
    423                                      "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
    424 
    425 def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
    426                                        "SABDLv","UABDLv")>;
    427 
    428 //---
    429 // 7.9.3 Floating Point Arithmetic and Comparisons
    430 //---
    431 
    432 // FABS,FNEG are WriteF
    433 
    434 def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
    435 def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
    436 
    437 def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
    438                                      "FMINPv2i","FMINNMPv2i")>;
    439 
    440 def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
    441 
    442 def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
    443                                   FSUBSrr,FSUBv2f32,FSUBv4f32,
    444                                   FADDPv2f32,FADDPv4f32,
    445                                   FABD32,FABDv2f32,FABDv4f32)>;
    446 def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
    447                                   FSUBDrr,FSUBv2f64,
    448                                   FADDPv2f64,
    449                                   FABD64,FABDv2f64)>;
    450 
    451 def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
    452 
    453 def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
    454                                      "FMAXS","FMAXD","FMAXv",
    455                                      "FMINS","FMIND","FMINv",
    456                                      "FMAXNMS","FMAXNMD","FMAXNMv",
    457                                      "FMINNMS","FMINNMD","FMINNMv",
    458                                      "FMAXPv2f","FMAXPv4f",
    459                                      "FMINPv2f","FMINPv4f",
    460                                      "FMAXNMPv2f","FMAXNMPv4f",
    461                                      "FMINNMPv2f","FMINNMPv4f")>;
    462 
    463 // FCMP,FCMPE,FCCMP,FCCMPE
    464 def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
    465 
    466 // FCSEL is a WriteF.
    467 
    468 //---
    469 // 7.9.4 Shifts and Bitfield Operations
    470 //---
    471 
    472 // SHL is a WriteV
    473 
    474 def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
    475 def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
    476 
    477 def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
    478 def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
    479 
    480 // Shift and accumulate uses the vector multiply unit.
    481 def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
    482 def CyReadVShiftAcc  : SchedReadAdvance<1,
    483                         [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
    484 def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
    485              (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
    486 
    487 // SSHL,USHL are WriteV.
    488 
    489 def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
    490 
    491 // SQSHL,SQSHLU,UQSHL are WriteV.
    492 
    493 def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
    494 
    495 // WriteV includes:
    496 // SHLL,SSHLL,USHLL
    497 // SLI,SRI
    498 // BIF,BIT,BSL
    499 // EXT
    500 // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
    501 // XTN2
    502 
    503 def : InstRW<[CyWriteV4],
    504              (instregex "RSHRNv","SHRNv",
    505                         "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
    506                         "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
    507 
    508 //---
    509 // 7.9.5 Multiplication
    510 //---
    511 
    512 def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
    513 def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
    514                              "SQDMULLv","SQDMULHv","SQRDMULHv")>;
    515 
    516 // FMUL,FMULX,FNMUL default to WriteFMul.
    517 def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
    518 
    519 def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
    520 def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
    521                                FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
    522 
    523 def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
    524 def : InstRW<[CyWriteVMul, CyReadVMulAcc],
    525              (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
    526               "SQDMLAL","SQDMLSL")>;
    527 
    528 def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
    529 def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
    530 def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
    531 def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
    532 
    533 def : InstRW<[CyWriteSMul, CyReadSMul],
    534              (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
    535               FMLAv2f32,FMLAv4f32,
    536               FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
    537 def : InstRW<[CyWriteDMul, CyReadDMul],
    538              (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
    539               FMLAv2f64,FMLAv2i64_indexed,
    540               FMLSv2f64,FMLSv2i64_indexed)>;
    541 
    542 def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
    543 def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
    544 
    545 //---
    546 // 7.9.6 Divide and Square Root
    547 //---
    548 
    549 // FDIV,FSQRT
    550 // TODO: Add 64-bit variant with 19 cycle latency.
    551 // TODO: Specialize FSQRT for longer latency.
    552 def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
    553   let Latency = 17;
    554   let ResourceCycles = [2, 17];
    555 }
    556 
    557 def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
    558 
    559 def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
    560 def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
    561 
    562 def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
    563 def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
    564 def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
    565 def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
    566 
    567 //---
    568 // 7.9.7 Integer-FP Conversions
    569 //---
    570 
    571 // FCVT lengthen f16/s32
    572 def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
    573 
    574 // FCVT,FCVTN,FCVTXN
    575 // SCVTF,UCVTF V,V
    576 // FRINT(AIMNPXZ) V,V
    577 def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
    578 
    579 // SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
    580 def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
    581 def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
    582 
    583 // FCVT Rd, S/D = V6+LD4: 10 cycles
    584 def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
    585 def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
    586 
    587 // FCVTL is a WriteV
    588 
    589 //---
    590 // 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
    591 //---
    592 
    593 def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
    594 def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
    595                                        AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
    596                                        SHA1SU0rrr)>;
    597 
    598 def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
    599 def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
    600 
    601 def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
    602 def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
    603                                        SHA256Hrrr,SHA256H2rrr)>;
    604 
// TRN,UZP,ZIP are WriteV.
    606 
    607 // TBL,TBX are WriteV.
    608 
    609 //---
    610 // 7.9.11-7.9.14 Load/Store, single element and paired
    611 //---
    612 
    613 // Loading into the vector unit takes 5 cycles vs 4 for integer loads.
    614 def : WriteRes<WriteVLD, [CyUnitLS]> {
    615   let Latency = 5;
    616 }
    617 
    618 // Store-load forwarding is 4 cycles.
    619 def : WriteRes<WriteVST, [CyUnitLS]> {
    620   let Latency = 4;
    621 }
    622 
    623 // WriteVLDPair/VSTPair sequences are expanded by the target description.
    624 
    625 //---
    626 // 7.9.15 Load, element operations
    627 //---
    628 
// Only the first WriteVLD and the WriteAdr for writeback match def operands.
// Subsequent WriteVLDs only consume resources. Since all loaded values have
// the same latency, this is acceptable.
    632 
    633 // Vd is read 5 cycles after issuing the vector load.
    634 def : ReadAdvance<ReadVLD, 5>;
    635 
    636 def : InstRW<[WriteVLD],
    637              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
    638 def : InstRW<[WriteVLD, WriteAdr],
    639              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
    640 
    641 // Register writes from the load's high half are fused micro-ops.
    642 def : InstRW<[WriteVLD],
    643              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
    644 def : InstRW<[WriteVLD, WriteAdr],
    645              (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
    646 def : InstRW<[WriteVLD, WriteVLD],
    647              (instregex "LD1Twov(16b|8h|4s|2d)$")>;
    648 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
    649              (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
    650 
    651 def : InstRW<[WriteVLD, WriteVLD],
    652              (instregex "LD1Threev(8b|4h|2s|1d)$")>;
    653 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
    654              (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
    655 def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
    656              (instregex "LD1Threev(16b|8h|4s|2d)$")>;
    657 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
    658              (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
    659 
    660 def : InstRW<[WriteVLD, WriteVLD],
    661              (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
    662 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
    663              (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
    664 def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
    665              (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
    666 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
    667              (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
    668 
    669 def : InstRW<[WriteVLDShuffle, ReadVLD],
    670              (instregex "LD1i(8|16|32)$")>;
    671 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
    672              (instregex "LD1i(8|16|32)_POST")>;
    673 
    674 def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
    675 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
    676 
    677 def : InstRW<[WriteVLDShuffle],
    678              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
    679 def : InstRW<[WriteVLDShuffle, WriteAdr],
    680              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
    681 
    682 def : InstRW<[WriteVLDShuffle, WriteV],
    683              (instregex "LD2Twov(8b|4h|2s)$")>;
    684 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
    685              (instregex "LD2Twov(8b|4h|2s)_POST$")>;
    686 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
    687              (instregex "LD2Twov(16b|8h|4s|2d)$")>;
    688 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
    689              (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
    690 
    691 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
    692              (instregex "LD2i(8|16|32)$")>;
    693 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
    694              (instregex "LD2i(8|16|32)_POST")>;
    695 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
    696              (instregex "LD2i64$")>;
    697 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
    698              (instregex "LD2i64_POST")>;
    699 
    700 def : InstRW<[WriteVLDShuffle, WriteV],
    701              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
    702 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
    703              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
    704 
    705 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
    706              (instregex "LD3Threev(8b|4h|2s)$")>;
    707 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
    708              (instregex "LD3Threev(8b|4h|2s)_POST")>;
    709 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
    710              (instregex "LD3Threev(16b|8h|4s|2d)$")>;
    711 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
    712              (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
    713 
    714 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
    715              (instregex "LD3i(8|16|32)$")>;
    716 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
    717              (instregex "LD3i(8|16|32)_POST")>;
    718 
    719 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
    720              (instregex "LD3i64$")>;
    721 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
    722              (instregex "LD3i64_POST")>;
    723 
    724 def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
    725              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
    726 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
    727              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
    728 
    729 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
    730              (instrs LD3Rv1d,LD3Rv2d)>;
    731 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
    732              (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
    733 
    734 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
    735              (instregex "LD4Fourv(8b|4h|2s)$")>;
    736 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
    737              (instregex "LD4Fourv(8b|4h|2s)_POST")>;
    738 def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
    739               WriteVLDPairShuffle, WriteVLDPairShuffle],
    740              (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
    741 def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
    742               WriteVLDPairShuffle, WriteVLDPairShuffle],
    743              (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
    744 
    745 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
    746              (instregex "LD4i(8|16|32)$")>;
    747 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
    748              (instregex "LD4i(8|16|32)_POST")>;
    749 
    750 
    751 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
    752              (instrs LD4i64)>;
    753 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
    754              (instrs LD4i64_POST)>;
    755 
    756 def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
    757              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
    758 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
    759              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
    760 
    761 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
    762              (instrs LD4Rv1d,LD4Rv2d)>;
    763 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
    764              (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
    765 
    766 //---
    767 // 7.9.16 Store, element operations
    768 //---
    769 
    770 // Only the WriteAdr for writeback matches a def operand.
    771 // Subsequent WriteVSTs only consume resources.
    772 // ST1Onev, all eight arrangements: one WriteVST (plus WriteAdr for _POST).
    773 def : InstRW<[WriteVST],
    774              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
    775 def : InstRW<[WriteAdr, WriteVST],
    776              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
    777 // ST1Twov: 8b/4h/2s/1d use one WriteVSTShuffle, 16b/8h/4s/2d use 2x WriteVST.
    778 def : InstRW<[WriteVSTShuffle],
    779              (instregex "ST1Twov(8b|4h|2s|1d)$")>;
    780 def : InstRW<[WriteAdr, WriteVSTShuffle],
    781              (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
    782 def : InstRW<[WriteVST, WriteVST],
    783              (instregex "ST1Twov(16b|8h|4s|2d)$")>;
    784 def : InstRW<[WriteAdr, WriteVST, WriteVST],
    785              (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
    786 // ST1Threev: 8b/4h/2s/1d use WriteVSTShuffle + WriteVST, the rest 3x WriteVST.
    787 def : InstRW<[WriteVSTShuffle, WriteVST],
    788              (instregex "ST1Threev(8b|4h|2s|1d)$")>;
    789 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
    790              (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
    791 def : InstRW<[WriteVST, WriteVST, WriteVST],
    792              (instregex "ST1Threev(16b|8h|4s|2d)$")>;
    793 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
    794              (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
    795 // ST1Fourv: 8b/4h/2s/1d use 2x WriteVSTShuffle, the rest use 4x WriteVST.
    796 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
    797              (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
    798 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
    799              (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
    800 def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
    801              (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
    802 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
    803              (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
    804 // ST1i8/16/32: one WriteVSTShuffle; _POST entry also lists WriteAdr.
    805 def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>;
    806 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
    807 // ST1i64 and ST1i64_POST, listed by name: one WriteVSTShuffle each.
    808 def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
    809 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
    810 // ST2Twov: 8b/4h/2s use one WriteVSTShuffle, 16b/8h/4s/2d use two.
    811 def : InstRW<[WriteVSTShuffle],
    812              (instregex "ST2Twov(8b|4h|2s)$")>;
    813 def : InstRW<[WriteAdr, WriteVSTShuffle],
    814              (instregex "ST2Twov(8b|4h|2s)_POST")>;
    815 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
    816              (instregex "ST2Twov(16b|8h|4s|2d)$")>;
    817 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
    818              (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
    819 // ST2i8/16/32 (by regex) and ST2i64 (by name): one WriteVSTShuffle each.
    820 def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>;
    821 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
    822 def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
    823 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
    824 // ST3Threev: 8b/4h/2s use 2x WriteVSTShuffle, 16b/8h/4s/2d use three.
    825 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
    826              (instregex "ST3Threev(8b|4h|2s)$")>;
    827 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
    828              (instregex "ST3Threev(8b|4h|2s)_POST")>;
    829 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
    830              (instregex "ST3Threev(16b|8h|4s|2d)$")>;
    831 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
    832              (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
    833 // ST3i8/16/32: one WriteVSTShuffle; _POST entry also lists WriteAdr.
    834 def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>;
    835 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
    836 // ST3i64 and ST3i64_POST, listed by name: two WriteVSTShuffle each.
    837 def :InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST3i64)>;
    838 def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
    839 // ST4Fourv: 8b/4h/2s/1d use 2x WriteVSTPairShuffle, 16b/8h/4s/2d use four.
    840 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
    841             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
    842 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
    843             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
    844 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
    845               WriteVSTPairShuffle, WriteVSTPairShuffle],
    846              (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
    847 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
    848               WriteVSTPairShuffle, WriteVSTPairShuffle],
    849              (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
    850 // ST4i8/16/32: one WriteVSTPairShuffle; _POST entry also lists WriteAdr.
    851 def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>;
    852 def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
    853 // ST4i64 and ST4i64_POST, listed by name: two WriteVSTShuffle each.
    854 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
    855 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
    856 
    857 // Atomic operations are not supported: no resources, Unsupported = 1.
    858 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
    859 
    860 //---
    861 // Unused SchedRead types
    862 //---
    863 // Each of these reads is given a ReadAdvance of 0.
    864 def : ReadAdvance<ReadI, 0>;
    865 def : ReadAdvance<ReadISReg, 0>;
    866 def : ReadAdvance<ReadIEReg, 0>;
    867 def : ReadAdvance<ReadIM, 0>;
    868 def : ReadAdvance<ReadIMA, 0>;
    869 def : ReadAdvance<ReadID, 0>;
    870 
    871 } // SchedModel = CycloneModel
    872