Home | History | Annotate | Download | only in AArch64
      1 //=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the machine model for AArch64 Cyclone to support
     11 // instruction scheduling and other instruction cost heuristics.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 def CycloneModel : SchedMachineModel {
     16   let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
     17   let MicroOpBufferSize = 192; // Based on the reorder buffer.
     18   let LoadLatency = 4; // Optimistic load latency.
     19   let MispredictPenalty = 16; // 14-19 cycles are typical.
     20 }
     21 
     22 //===----------------------------------------------------------------------===//
     23 // Define each kind of processor resource and number available on Cyclone.
     24 
     25 // 4 integer pipes
     26 def CyUnitI : ProcResource<4> {
     27   let BufferSize = 48;
     28 }
     29 
     30 // 2 branch units: I[0..1]
     31 def CyUnitB : ProcResource<2> {
     32   let Super  = CyUnitI;
     33   let BufferSize = 24;
     34 }
     35 
     36 // 1 indirect-branch unit: I[0]
     37 def CyUnitBR : ProcResource<1> {
     38   let Super  = CyUnitB;
     39 }
     40 
     41 // 2 shifter pipes: I[2..3]
     42 // When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
     43 def CyUnitIS : ProcResource<2> {
     44   let Super = CyUnitI;
     45   let BufferSize = 24;
     46 }
     47 
     48 // 1 mul pipe: I[0]
     49 def CyUnitIM : ProcResource<1> {
     50   let Super = CyUnitBR;
     51   let BufferSize = 32;
     52 }
     53 
     54 // 1 div pipe: I[1]
     55 def CyUnitID : ProcResource<1> {
     56   let Super = CyUnitB;
     57   let BufferSize = 16;
     58 }
     59 
     60 // 1 integer division unit. This is driven by the ID pipe, but only
     61 // consumes the pipe for one cycle at issue and another cycle at writeback.
     62 def CyUnitIntDiv : ProcResource<1>;
     63 
     64 // 2 ld/st pipes.
     65 def CyUnitLS : ProcResource<2> {
     66   let BufferSize = 28;
     67 }
     68 
     69 // 3 fp/vector pipes.
     70 def CyUnitV : ProcResource<3> {
     71   let BufferSize = 48;
     72 }
     73 // 2 fp/vector arithmetic and multiply pipes: V[0-1]
     74 def CyUnitVM : ProcResource<2> {
     75   let Super = CyUnitV;
     76   let BufferSize = 32;
     77 }
     78 // 1 fp/vector division/sqrt pipe: V[2]
     79 def CyUnitVD : ProcResource<1> {
     80   let Super = CyUnitV;
     81   let BufferSize = 16;
     82 }
     83 // 1 fp compare pipe: V[0]
     84 def CyUnitVC : ProcResource<1> {
     85   let Super = CyUnitVM;
     86   let BufferSize = 16;
     87 }
     88 
     89 // 2 fp division/square-root units.  These are driven by the VD pipe,
     90 // but only consume the pipe for one cycle at issue and a cycle at writeback.
     91 def CyUnitFloatDiv : ProcResource<2>;
     92 
     93 //===----------------------------------------------------------------------===//
     94 // Define scheduler read/write resources and latency on Cyclone.
     95 // This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
     96 
     97 let SchedModel = CycloneModel in {
     98 
     99 //---
    100 // 7.8.1. Moves
    101 //---
    102 
    103 // A single nop micro-op (uX).
    104 def WriteX : SchedWriteRes<[]> { let Latency = 0; }
    105 
    106 // Move zero is a register rename (to machine register zero).
    107 // The move is replaced by a single nop micro-op.
    108 // MOVZ Rd, #0
    109 // AND Rd, Rzr, #imm
    110 def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>;
    111 def WriteImmZ  : SchedWriteVariant<[
    112                    SchedVar<WriteZPred, [WriteX]>,
    113                    SchedVar<NoSchedPred, [WriteImm]>]>;
    114 def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
    115 
    116 // Move GPR is a register rename and single nop micro-op.
    117 // ORR Xd, XZR, Xm
    118 // ADD Xd, Xn, #0
    119 def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>;
    120 def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>;
    121 def WriteMov      : SchedWriteVariant<[
    122                       SchedVar<WriteIMovPred, [WriteX]>,
    123                       SchedVar<WriteVMovPred, [WriteX]>,
    124                       SchedVar<NoSchedPred,   [WriteI]>]>;
    125 def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
    126 
    127 // Move non-zero immediate is an integer ALU op.
    128 // MOVN,MOVZ,MOVK
    129 def : WriteRes<WriteImm, [CyUnitI]>;
    130 
    131 //---
    132 // 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
    133 //              Shifts and Bitfield Operations
    134 //---
    135 
    136 // ADR,ADRP
    137 // ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
    138 // ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
    139 // ADC(S),SBC(S)
    140 // Aliases: CMN, CMP, TST
    141 //
    142 // Conditional operations.
    143 // CCMNi,CCMPi,CCMNr,CCMPr,
    144 // CSEL,CSINC,CSINV,CSNEG
    145 //
    146 // Bit counting and reversal operations.
    147 // CLS,CLZ,RBIT,REV,REV16,REV32
    148 def : WriteRes<WriteI, [CyUnitI]>;
    149 
    150 // ADD with shifted register operand is a single micro-op that
    151 // consumes a shift pipeline for two cycles.
    152 // ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
    153 // EXAMPLE: ADDrs Xn, Xm LSL #imm
    154 def : WriteRes<WriteISReg, [CyUnitIS]> {
    155   let Latency = 2;
    156   let ResourceCycles = [2];
    157 }
    158 
    159 // ADD with extended register operand is the same as shifted reg operand.
    160 // ADD(S)re,SUB(S)re
    161 // EXAMPLE: ADDXre Xn, Xm, UXTB #1
    162 def : WriteRes<WriteIEReg, [CyUnitIS]> {
    163   let Latency = 2;
    164   let ResourceCycles = [2];
    165 }
    166 
    167 // Variable shift and bitfield operations.
    168 // ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
    169 def : WriteRes<WriteIS, [CyUnitIS]>;
    170 
     171 // EXTR shifts a pair of registers and requires two micro-ops.
    172 // The second micro-op is delayed, as modeled by ReadExtrHi.
    173 // EXTR Xn, Xm, #imm
    174 def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
    175   let Latency = 2;
    176   let NumMicroOps = 2;
    177 }
    178 
    179 // EXTR's first register read is delayed by one cycle, effectively
    180 // shortening its writer's latency.
    181 // EXTR Xn, Xm, #imm
    182 def : ReadAdvance<ReadExtrHi, 1>;
    183 
    184 //---
    185 // 7.8.6. Multiplies
    186 //---
    187 
    188 // MUL/MNEG are aliases for MADD/MSUB.
    189 // MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
    190 def : WriteRes<WriteIM32, [CyUnitIM]> {
    191   let Latency = 4;
    192 }
    193 // MADDX,MSUBX,SMULH,UMULH
    194 def : WriteRes<WriteIM64, [CyUnitIM]> {
    195   let Latency = 5;
    196 }
    197 
    198 //---
    199 // 7.8.7. Divide
    200 //---
    201 
    202 // 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
    203 // The ID pipe is consumed for 2 cycles: issue and writeback.
    204 // SDIVW,UDIVW
    205 def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
    206   let Latency = 10;
    207   let ResourceCycles = [2, 10];
    208 }
    209 // 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
    210 // The ID pipe is consumed for 2 cycles: issue and writeback.
    211 // SDIVX,UDIVX
    212 def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
    213   let Latency = 13;
    214   let ResourceCycles = [2, 13];
    215 }
    216 
    217 //---
    218 // 7.8.8,7.8.10. Load/Store, single element
    219 //---
    220 
    221 // Integer loads take 4 cycles and use one LS unit for one cycle.
    222 def : WriteRes<WriteLD, [CyUnitLS]> {
    223   let Latency = 4;
    224 }
    225 
    226 // Store-load forwarding is 4 cycles.
    227 //
    228 // Note: The store-exclusive sequence incorporates this
    229 // latency. However, general heuristics should not model the
    230 // dependence between a store and subsequent may-alias load because
    231 // hardware speculation works.
    232 def : WriteRes<WriteST, [CyUnitLS]> {
    233   let Latency = 4;
    234 }
    235 
    236 // Load from base address plus an optionally scaled register offset.
     237 // Rt latency is the WriteIS latency plus the WriteLD latency.
    238 // EXAMPLE: LDR Xn, Xm [, lsl 3]
    239 def CyWriteLDIdx : SchedWriteVariant<[
    240   SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
    241   SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
    242 def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
    243 
    244 // EXAMPLE: STR Xn, Xm [, lsl 3]
    245 def CyWriteSTIdx : SchedWriteVariant<[
    246   SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
    247   SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
    248 def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
    249 
    250 // Read the (unshifted) base register Xn in the second micro-op one cycle later.
    251 // EXAMPLE: LDR Xn, Xm [, lsl 3]
    252 def ReadBaseRS : SchedReadAdvance<1>;
    253 def CyReadAdrBase : SchedReadVariant<[
    254   SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
    255   SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
    256 def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
    257 
    258 //---
    259 // 7.8.9,7.8.11. Load/Store, paired
    260 //---
    261 
    262 // Address pre/post increment is a simple ALU op with one cycle latency.
    263 def : WriteRes<WriteAdr, [CyUnitI]>;
    264 
    265 // LDP high register write is fused with the load, but a nop micro-op remains.
    266 def : WriteRes<WriteLDHi, []> {
    267   let Latency = 4;
    268 }
    269 
    270 // STP is a vector op and store, except for QQ, which is just two stores.
    271 def : SchedAlias<WriteSTP, WriteVSTShuffle>;
    272 def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
    273 
    274 //---
    275 // 7.8.13. Branches
    276 //---
    277 
    278 // Branches take a single micro-op.
    279 // The misprediction penalty is defined as a SchedMachineModel property.
    280 def : WriteRes<WriteBr,    [CyUnitB]>  {let Latency = 0;}
    281 def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
    282 
    283 //---
    284 // 7.8.14. Never-issued Instructions, Barrier and Hint Operations
    285 //---
    286 
    287 // NOP,SEV,SEVL,WFE,WFI,YIELD
    288 def : WriteRes<WriteHint, []> {let Latency = 0;}
    289 // ISB
    290 def : InstRW<[WriteI], (instrs ISB)>;
     291 // CLREX,DMB,DSB
    292 def : WriteRes<WriteBarrier, [CyUnitLS]>;
    293 
    294 // System instructions get an invalid latency because the latency of
    295 // other operations across them is meaningless.
    296 def : WriteRes<WriteSys, []> {let Latency = -1;}
    297 
    298 //===----------------------------------------------------------------------===//
    299 // 7.9 Vector Unit Instructions
    300 
    301 // Simple vector operations take 2 cycles.
    302 def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
    303 
    304 // Define some longer latency vector op types for Cyclone.
    305 def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
    306 def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
    307 def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
    308 def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
    309 
    310 // Simple floating-point operations take 2 cycles.
    311 def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
    312 
    313 //---
    314 // 7.9.1 Vector Moves
    315 //---
    316 
    317 // TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
    318 // generates expensive int-float conversion instead:
    319 // FMOVDi Dd, #0.0
    320 // FMOVv2f64ns Vd.2d, #0.0
    321 
    322 // FMOVSi,FMOVDi
    323 def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
    324 
    325 // MOVI,MVNI are WriteV
    326 // FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
    327 
    328 // Move FPR is a register rename and single nop micro-op.
    329 // ORR.16b Vd,Vn,Vn
    330 // COPY is handled above in the WriteMov Variant.
    331 def WriteVMov    : SchedWriteVariant<[
    332                      SchedVar<WriteVMovPred, [WriteX]>,
    333                      SchedVar<NoSchedPred,   [WriteV]>]>;
    334 def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
    335 
    336 // FMOVSr,FMOVDr are WriteF.
    337 
    338 // MOV V,V is a WriteV.
    339 
    340 // CPY D,V[x] is a WriteV
    341 
    342 // INS V[x],V[y] is a WriteV.
    343 
    344 // FMOVWSr,FMOVXDr,FMOVXDHighr
    345 def : WriteRes<WriteFCopy, [CyUnitLS]> {
    346   let Latency = 5;
    347 }
    348 
    349 // FMOVSWr,FMOVDXr
    350 def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
    351 
     352 // INS V[x],R is modeled as a WriteVLD followed by a WriteV.
     353 def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
     354 def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
    355 
     356 // SMOV,UMOV R,V[x] is modeled as a WriteLD followed by a WriteI.
     357 def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
     358 def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
    359 
    360 // DUP V,R
    361 def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
    362 
    363 // DUP V,V[x] is a WriteV.
    364 
    365 //---
    366 // 7.9.2 Integer Arithmetic, Logical, and Comparisons
    367 //---
    368 
    369 // BIC,ORR V,#imm are WriteV
    370 
    371 def : InstRW<[CyWriteV3], (instregex "ABSv")>;
    372 
    373 // MVN,NEG,NOT are WriteV
    374 
    375 def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
    376 
    377 // ADDP is a WriteV.
    378 def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
    379 def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
    380 
    381 def : InstRW<[CyWriteV3],
    382              (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
    383 
    384 def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
    385 
    386 // ADD,SUB are WriteV
    387 
    388 // Forward declare.
    389 def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
    390 
    391 // Add/Diff and accumulate uses the vector multiply unit.
    392 def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
    393 def CyReadVAccum  : SchedReadAdvance<1,
    394                     [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
    395 
    396 def : InstRW<[CyWriteVAccum, CyReadVAccum],
    397              (instregex "SADALP","UADALP")>;
    398 
    399 def : InstRW<[CyWriteVAccum, CyReadVAccum],
    400              (instregex "SABAv","UABAv","SABALv","UABALv")>;
    401 
    402 def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
    403 
    404 def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
    405 
    406 def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
    407 
    408 // WriteV includes:
    409 // AND,BIC,CMTST,EOR,ORN,ORR
    410 // ADDP
    411 // SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
    412 // SADDL,SSUBL,UADDL,USUBL
    413 // SADDW,SSUBW,UADDW,USUBW
    414 
    415 def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
    416                                      "CMLEv","CMLTv",
    417                                      "CMHIv","CMHSv")>;
    418 
    419 def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
    420                                      "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
    421 
    422 def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
    423                                        "SABDLv","UABDLv")>;
    424 
    425 //---
    426 // 7.9.3 Floating Point Arithmetic and Comparisons
    427 //---
    428 
    429 // FABS,FNEG are WriteF
    430 
    431 def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
    432 def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
    433 
    434 def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
    435                                      "FMINPv2i","FMINNMPv2i")>;
    436 
    437 def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
    438 
    439 def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
    440                                   FSUBSrr,FSUBv2f32,FSUBv4f32,
    441                                   FADDPv2f32,FADDPv4f32,
    442                                   FABD32,FABDv2f32,FABDv4f32)>;
    443 def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
    444                                   FSUBDrr,FSUBv2f64,
    445                                   FADDPv2f64,
    446                                   FABD64,FABDv2f64)>;
    447 
    448 def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
    449 
    450 def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
    451                                      "FMAXS","FMAXD","FMAXv",
    452                                      "FMINS","FMIND","FMINv",
    453                                      "FMAXNMS","FMAXNMD","FMAXNMv",
    454                                      "FMINNMS","FMINNMD","FMINNMv",
    455                                      "FMAXPv2f","FMAXPv4f",
    456                                      "FMINPv2f","FMINPv4f",
    457                                      "FMAXNMPv2f","FMAXNMPv4f",
    458                                      "FMINNMPv2f","FMINNMPv4f")>;
    459 
    460 // FCMP,FCMPE,FCCMP,FCCMPE
    461 def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
    462 
    463 // FCSEL is a WriteF.
    464 
    465 //---
    466 // 7.9.4 Shifts and Bitfield Operations
    467 //---
    468 
    469 // SHL is a WriteV
    470 
    471 def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
    472 def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
    473 
    474 def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
    475 def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
    476 
    477 // Shift and accumulate uses the vector multiply unit.
    478 def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
    479 def CyReadVShiftAcc  : SchedReadAdvance<1,
    480                         [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
    481 def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
    482              (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
    483 
    484 // SSHL,USHL are WriteV.
    485 
    486 def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
    487 
    488 // SQSHL,SQSHLU,UQSHL are WriteV.
    489 
    490 def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
    491 
    492 // WriteV includes:
    493 // SHLL,SSHLL,USHLL
    494 // SLI,SRI
    495 // BIF,BIT,BSL
    496 // EXT
    497 // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
    498 // XTN2
    499 
    500 def : InstRW<[CyWriteV4],
    501              (instregex "RSHRNv","SHRNv",
    502                         "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
    503                         "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
    504 
    505 //---
    506 // 7.9.5 Multiplication
    507 //---
    508 
    509 def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
    510 def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
    511                              "SQDMULLv","SQDMULHv","SQRDMULHv")>;
    512 
    513 // FMUL,FMULX,FNMUL default to WriteFMul.
    514 def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
    515 
    516 def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
    517 def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
    518                                FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
    519 
    520 def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
    521 def : InstRW<[CyWriteVMul, CyReadVMulAcc],
    522              (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
    523               "SQDMLAL","SQDMLSL")>;
    524 
    525 def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
    526 def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
    527 def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
    528 def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
    529 
    530 def : InstRW<[CyWriteSMul, CyReadSMul],
    531              (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
    532               FMLAv2f32,FMLAv4f32,
    533               FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
    534 def : InstRW<[CyWriteDMul, CyReadDMul],
    535              (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
    536               FMLAv2f64,FMLAv2i64_indexed,
    537               FMLSv2f64,FMLSv2i64_indexed)>;
    538 
    539 def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
    540 def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
    541 
    542 //---
    543 // 7.9.6 Divide and Square Root
    544 //---
    545 
    546 // FDIV,FSQRT
    547 // TODO: Add 64-bit variant with 19 cycle latency.
    548 // TODO: Specialize FSQRT for longer latency.
    549 def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
    550   let Latency = 17;
    551   let ResourceCycles = [2, 17];
    552 }
    553 
    554 def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
    555 
    556 def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
    557 def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
    558 
    559 def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
    560 def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
    561 def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
    562 def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
    563 
    564 //---
    565 // 7.9.7 Integer-FP Conversions
    566 //---
    567 
    568 // FCVT lengthen f16/s32
    569 def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
    570 
    571 // FCVT,FCVTN,FCVTXN
    572 // SCVTF,UCVTF V,V
    573 // FRINT(AIMNPXZ) V,V
    574 def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
    575 
    576 // SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles.
    577 def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
    578 def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
    579 
    580 // FCVT Rd, S/D = V6+LD4: 10 cycles
    581 def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
    582 def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
    583 
    584 // FCVTL is a WriteV
    585 
    586 //---
    587 // 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
    588 //---
    589 
    590 def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
    591 def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
    592                                        AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
    593                                        SHA1SU0rrr)>;
    594 
    595 def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
    596 def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
    597 
    598 def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
    599 def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
    600                                        SHA256Hrrr,SHA256H2rrr)>;
    601 
     602 // TRN,UZP,ZIP are WriteV.
    603 
    604 // TBL,TBX are WriteV.
    605 
    606 //---
    607 // 7.9.11-7.9.14 Load/Store, single element and paired
    608 //---
    609 
    610 // Loading into the vector unit takes 5 cycles vs 4 for integer loads.
    611 def : WriteRes<WriteVLD, [CyUnitLS]> {
    612   let Latency = 5;
    613 }
    614 
    615 // Store-load forwarding is 4 cycles.
    616 def : WriteRes<WriteVST, [CyUnitLS]> {
    617   let Latency = 4;
    618 }
    619 
    620 // WriteVLDPair/VSTPair sequences are expanded by the target description.
    621 
    622 //---
    623 // 7.9.15 Load, element operations
    624 //---
    625 
     626 // Only the first WriteVLD and WriteAdr for writeback match def operands.
    627 // Subsequent WriteVLDs consume resources. Since all loaded values have the
    628 // same latency, this is acceptable.
    629 
    630 // Vd is read 5 cycles after issuing the vector load.
    631 def : ReadAdvance<ReadVLD, 5>;
    632 
    633 def : InstRW<[WriteVLD],
    634              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
    635 def : InstRW<[WriteVLD, WriteAdr],
    636              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
    637 
    638 // Register writes from the load's high half are fused micro-ops.
    639 def : InstRW<[WriteVLD],
    640              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
    641 def : InstRW<[WriteVLD, WriteAdr],
    642              (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
    643 def : InstRW<[WriteVLD, WriteVLD],
    644              (instregex "LD1Twov(16b|8h|4s|2d)$")>;
    645 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
    646              (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
    647 
    648 def : InstRW<[WriteVLD, WriteVLD],
    649              (instregex "LD1Threev(8b|4h|2s|1d)$")>;
    650 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
    651              (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
    652 def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
    653              (instregex "LD1Threev(16b|8h|4s|2d)$")>;
    654 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
    655              (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
    656 
    657 def : InstRW<[WriteVLD, WriteVLD],
    658              (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
    659 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
    660              (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
    661 def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
    662              (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
    663 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
    664              (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
    665 
    666 def : InstRW<[WriteVLDShuffle, ReadVLD],
    667              (instregex "LD1i(8|16|32)$")>;
    668 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
    669              (instregex "LD1i(8|16|32)_POST")>;
    670 
    671 def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
    672 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
    673 
    674 def : InstRW<[WriteVLDShuffle],
    675              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
    676 def : InstRW<[WriteVLDShuffle, WriteAdr],
    677              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
    678 
    679 def : InstRW<[WriteVLDShuffle, WriteV],
    680              (instregex "LD2Twov(8b|4h|2s)$")>;
    681 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
    682              (instregex "LD2Twov(8b|4h|2s)_POST$")>;
    683 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
    684              (instregex "LD2Twov(16b|8h|4s|2d)$")>;
    685 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
    686              (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
    687 
    688 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
    689              (instregex "LD2i(8|16|32)$")>;
    690 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
    691              (instregex "LD2i(8|16|32)_POST")>;
    692 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
    693              (instregex "LD2i64$")>;
    694 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
    695              (instregex "LD2i64_POST")>;
    696 
    697 def : InstRW<[WriteVLDShuffle, WriteV],
    698              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
    699 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
    700              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
    701 
    702 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
    703              (instregex "LD3Threev(8b|4h|2s)$")>;
    704 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
    705              (instregex "LD3Threev(8b|4h|2s)_POST")>;
    706 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
    707              (instregex "LD3Threev(16b|8h|4s|2d)$")>;
    708 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
    709              (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
    710 
    711 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
    712              (instregex "LD3i(8|16|32)$")>;
    713 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
    714              (instregex "LD3i(8|16|32)_POST")>;
    715 
    716 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
    717              (instregex "LD3i64$")>;
    718 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
    719              (instregex "LD3i64_POST")>;
    720 
    721 def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
    722              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
    723 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
    724              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
    725 
    726 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
    727              (instrs LD3Rv1d,LD3Rv2d)>;
    728 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
    729              (instrs LD3Rv2d_POST,LD3Rv2d_POST)>;
    730 
    731 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
    732              (instregex "LD4Fourv(8b|4h|2s)$")>;
    733 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
    734              (instregex "LD4Fourv(8b|4h|2s)_POST")>;
    735 def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
    736               WriteVLDPairShuffle, WriteVLDPairShuffle],
    737              (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
    738 def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
    739               WriteVLDPairShuffle, WriteVLDPairShuffle],
    740              (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
    741 
    742 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
    743              (instregex "LD4i(8|16|32)$")>;
    744 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
    745              (instregex "LD4i(8|16|32)_POST")>;
    746 
    747 
    748 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
    749              (instrs LD4i64)>;
    750 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
    751              (instrs LD4i64_POST)>;
    752 
    753 def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
    754              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
    755 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
    756              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
    757 
    758 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
    759              (instrs LD4Rv1d,LD4Rv2d)>;
    760 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
    761              (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
    762 
    763 //---
    764 // 7.9.16 Store, element operations
    765 //---
    766 
    767 // Only the WriteAdr for writeback matches a def operand.
    768 // Subsequent WriteVSTs only consume resources.
    769 
    770 def : InstRW<[WriteVST],
    771              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
    772 def : InstRW<[WriteAdr, WriteVST],
    773              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
    774 // ST1Twov: 8b|4h|2s|1d -> WriteVSTShuffle; 16b|8h|4s|2d -> 2x WriteVST.
    775 def : InstRW<[WriteVSTShuffle],
    776              (instregex "ST1Twov(8b|4h|2s|1d)$")>;
    777 def : InstRW<[WriteAdr, WriteVSTShuffle],
    778              (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
    779 def : InstRW<[WriteVST, WriteVST],
    780              (instregex "ST1Twov(16b|8h|4s|2d)$")>;
    781 def : InstRW<[WriteAdr, WriteVST, WriteVST],
    782              (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
    783 // ST1Threev: 8b|4h|2s|1d -> WriteVSTShuffle + WriteVST; others 3x WriteVST.
    784 def : InstRW<[WriteVSTShuffle, WriteVST],
    785              (instregex "ST1Threev(8b|4h|2s|1d)$")>;
    786 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
    787              (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
    788 def : InstRW<[WriteVST, WriteVST, WriteVST],
    789              (instregex "ST1Threev(16b|8h|4s|2d)$")>;
    790 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
    791              (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
    792 // ST1Fourv: 8b|4h|2s|1d -> 2x WriteVSTShuffle; others 4x WriteVST.
    793 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
    794              (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
    795 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
    796              (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
    797 def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
    798              (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
    799 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
    800              (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
    801 // ST1i(8|16|32) and ST1i64 use the same lists (one WriteVSTShuffle).
    802 def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>;
    803 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
    804 
    805 def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
    806 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
    807 // ST2*: only WriteVSTShuffle (2x for 16b|8h|4s|2d); _POST adds WriteAdr first.
    808 def : InstRW<[WriteVSTShuffle],
    809              (instregex "ST2Twov(8b|4h|2s)$")>;
    810 def : InstRW<[WriteAdr, WriteVSTShuffle],
    811              (instregex "ST2Twov(8b|4h|2s)_POST")>;
    812 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
    813              (instregex "ST2Twov(16b|8h|4s|2d)$")>;
    814 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
    815              (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
    816 
    817 def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>;
    818 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
    819 def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
    820 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
    821 // ST3*: only WriteVSTShuffle; ST3Threev 2x or 3x, ST3i(8|16|32) 1x, ST3i64 2x.
    822 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
    823              (instregex "ST3Threev(8b|4h|2s)$")>;
    824 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
    825              (instregex "ST3Threev(8b|4h|2s)_POST")>;
    826 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
    827              (instregex "ST3Threev(16b|8h|4s|2d)$")>;
    828 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
    829              (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
    830 
    831 def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>;
    832 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
    833 
    834 def :InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST3i64)>;
    835 def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
    836 // ST4Fourv, ST4i(8|16|32): WriteVSTPairShuffle; ST4i64: 2x WriteVSTShuffle.
    837 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
    838             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
    839 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
    840             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
    841 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
    842               WriteVSTPairShuffle, WriteVSTPairShuffle],
    843              (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
    844 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
    845               WriteVSTPairShuffle, WriteVSTPairShuffle],
    846              (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
    847 
    848 def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>;
    849 def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
    850 
    851 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
    852 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
    853 
    854 //---
    855 // Unused SchedRead types
    856 //---
    857 // Each SchedRead listed here is given a ReadAdvance of 0.
    858 def : ReadAdvance<ReadI, 0>;
    859 def : ReadAdvance<ReadISReg, 0>;
    860 def : ReadAdvance<ReadIEReg, 0>;
    861 def : ReadAdvance<ReadIM, 0>;
    862 def : ReadAdvance<ReadIMA, 0>;
    863 def : ReadAdvance<ReadID, 0>;
    864 
    865 } // SchedModel = CycloneModel
    866