Home | History | Annotate | Download | only in X86
      1 //===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines a pass that optimizes call sequences on x86.
     11 // Currently, it converts movs of function parameters onto the stack into
     12 // pushes. This is beneficial for two main reasons:
     13 // 1) The push instruction encoding is much smaller than an esp-relative mov
     14 // 2) It is possible to push memory arguments directly. So, if
     15 //    the transformation is performed pre-reg-alloc, it can help relieve
     16 //    register pressure.
     17 //
     18 //===----------------------------------------------------------------------===//
     19 
     20 #include <algorithm>
     21 
     22 #include "X86.h"
     23 #include "X86InstrInfo.h"
     24 #include "X86Subtarget.h"
     25 #include "X86MachineFunctionInfo.h"
     26 #include "llvm/ADT/Statistic.h"
     27 #include "llvm/CodeGen/MachineFunctionPass.h"
     28 #include "llvm/CodeGen/MachineInstrBuilder.h"
     29 #include "llvm/CodeGen/MachineModuleInfo.h"
     30 #include "llvm/CodeGen/MachineRegisterInfo.h"
     31 #include "llvm/CodeGen/Passes.h"
     32 #include "llvm/IR/Function.h"
     33 #include "llvm/Support/Debug.h"
     34 #include "llvm/Support/raw_ostream.h"
     35 #include "llvm/Target/TargetInstrInfo.h"
     36 
     37 using namespace llvm;
     38 
     39 #define DEBUG_TYPE "x86-cf-opt"
     40 
     41 static cl::opt<bool>
     42     NoX86CFOpt("no-x86-call-frame-opt",
     43                cl::desc("Avoid optimizing x86 call frames for size"),
     44                cl::init(false), cl::Hidden);
     45 
     46 namespace {
     47 class X86CallFrameOptimization : public MachineFunctionPass {
     48 public:
     49   X86CallFrameOptimization() : MachineFunctionPass(ID) {}
     50 
     51   bool runOnMachineFunction(MachineFunction &MF) override;
     52 
     53 private:
     54   // Information we know about a particular call site
     55   struct CallContext {
     56     CallContext()
     57         : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
     58           MovVector(4, nullptr), NoStackParams(false), UsePush(false){}
     59 
     60     // Iterator referring to the frame setup instruction
     61     MachineBasicBlock::iterator FrameSetup;
     62 
     63     // Actual call instruction
     64     MachineInstr *Call;
     65 
     66     // A copy of the stack pointer
     67     MachineInstr *SPCopy;
     68 
     69     // The total displacement of all passed parameters
     70     int64_t ExpectedDist;
     71 
     72     // The sequence of movs used to pass the parameters
     73     SmallVector<MachineInstr *, 4> MovVector;
     74 
     75     // True if this call site has no stack parameters
     76     bool NoStackParams;
     77 
     78     // True of this callsite can use push instructions
     79     bool UsePush;
     80   };
     81 
     82   typedef SmallVector<CallContext, 8> ContextVector;
     83 
     84   bool isLegal(MachineFunction &MF);
     85 
     86   bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);
     87 
     88   void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
     89                        MachineBasicBlock::iterator I, CallContext &Context);
     90 
     91   bool adjustCallSequence(MachineFunction &MF, const CallContext &Context);
     92 
     93   MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
     94                                    unsigned Reg);
     95 
     96   enum InstClassification { Convert, Skip, Exit };
     97 
     98   InstClassification classifyInstruction(MachineBasicBlock &MBB,
     99                                          MachineBasicBlock::iterator MI,
    100                                          const X86RegisterInfo &RegInfo,
    101                                          DenseSet<unsigned int> &UsedRegs);
    102 
    103   const char *getPassName() const override { return "X86 Optimize Call Frame"; }
    104 
    105   const TargetInstrInfo *TII;
    106   const X86FrameLowering *TFL;
    107   const X86Subtarget *STI;
    108   const MachineRegisterInfo *MRI;
    109   static char ID;
    110 };
    111 
    112 char X86CallFrameOptimization::ID = 0;
    113 }
    114 
    115 FunctionPass *llvm::createX86CallFrameOptimization() {
    116   return new X86CallFrameOptimization();
    117 }
    118 
    119 // This checks whether the transformation is legal.
    120 // Also returns false in cases where it's potentially legal, but
    121 // we don't even want to try.
    122 bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
    123   if (NoX86CFOpt.getValue())
    124     return false;
    125 
    126   // We currently only support call sequences where *all* parameters.
    127   // are passed on the stack.
    128   // No point in running this in 64-bit mode, since some arguments are
    129   // passed in-register in all common calling conventions, so the pattern
    130   // we're looking for will never match.
    131   if (STI->is64Bit())
    132     return false;
    133 
    134   // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
    135   // in the compact unwind encoding that Darwin uses. So, bail if there
    136   // is a danger of that being generated.
    137   if (STI->isTargetDarwin() &&
    138      (!MF.getMMI().getLandingPads().empty() ||
    139        (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
    140     return false;
    141 
    142   // You would expect straight-line code between call-frame setup and
    143   // call-frame destroy. You would be wrong. There are circumstances (e.g.
    144   // CMOV_GR8 expansion of a select that feeds a function call!) where we can
    145   // end up with the setup and the destroy in different basic blocks.
    146   // This is bad, and breaks SP adjustment.
    147   // So, check that all of the frames in the function are closed inside
    148   // the same block, and, for good measure, that there are no nested frames.
    149   unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
    150   unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
    151   for (MachineBasicBlock &BB : MF) {
    152     bool InsideFrameSequence = false;
    153     for (MachineInstr &MI : BB) {
    154       if (MI.getOpcode() == FrameSetupOpcode) {
    155         if (InsideFrameSequence)
    156           return false;
    157         InsideFrameSequence = true;
    158       } else if (MI.getOpcode() == FrameDestroyOpcode) {
    159         if (!InsideFrameSequence)
    160           return false;
    161         InsideFrameSequence = false;
    162       }
    163     }
    164 
    165     if (InsideFrameSequence)
    166       return false;
    167   }
    168 
    169   return true;
    170 }
    171 
    172 // Check whether this trasnformation is profitable for a particular
    173 // function - in terms of code size.
    174 bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
    175   ContextVector &CallSeqVector) {
    176   // This transformation is always a win when we do not expect to have
    177   // a reserved call frame. Under other circumstances, it may be either
    178   // a win or a loss, and requires a heuristic.
    179   bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
    180   if (CannotReserveFrame)
    181     return true;
    182 
    183   // Don't do this when not optimizing for size.
    184   if (!MF.getFunction()->optForSize())
    185     return false;
    186 
    187   unsigned StackAlign = TFL->getStackAlignment();
    188 
    189   int64_t Advantage = 0;
    190   for (auto CC : CallSeqVector) {
    191     // Call sites where no parameters are passed on the stack
    192     // do not affect the cost, since there needs to be no
    193     // stack adjustment.
    194     if (CC.NoStackParams)
    195       continue;
    196 
    197     if (!CC.UsePush) {
    198       // If we don't use pushes for a particular call site,
    199       // we pay for not having a reserved call frame with an
    200       // additional sub/add esp pair. The cost is ~3 bytes per instruction,
    201       // depending on the size of the constant.
    202       // TODO: Callee-pop functions should have a smaller penalty, because
    203       // an add is needed even with a reserved call frame.
    204       Advantage -= 6;
    205     } else {
    206       // We can use pushes. First, account for the fixed costs.
    207       // We'll need a add after the call.
    208       Advantage -= 3;
    209       // If we have to realign the stack, we'll also need and sub before
    210       if (CC.ExpectedDist % StackAlign)
    211         Advantage -= 3;
    212       // Now, for each push, we save ~3 bytes. For small constants, we actually,
    213       // save more (up to 5 bytes), but 3 should be a good approximation.
    214       Advantage += (CC.ExpectedDist / 4) * 3;
    215     }
    216   }
    217 
    218   return (Advantage >= 0);
    219 }
    220 
    221 bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
    222   STI = &MF.getSubtarget<X86Subtarget>();
    223   TII = STI->getInstrInfo();
    224   TFL = STI->getFrameLowering();
    225   MRI = &MF.getRegInfo();
    226 
    227   if (!isLegal(MF))
    228     return false;
    229 
    230   unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
    231 
    232   bool Changed = false;
    233 
    234   ContextVector CallSeqVector;
    235 
    236   for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
    237     for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
    238       if (I->getOpcode() == FrameSetupOpcode) {
    239         CallContext Context;
    240         collectCallInfo(MF, *BB, I, Context);
    241         CallSeqVector.push_back(Context);
    242       }
    243 
    244   if (!isProfitable(MF, CallSeqVector))
    245     return false;
    246 
    247   for (auto CC : CallSeqVector)
    248     if (CC.UsePush)
    249       Changed |= adjustCallSequence(MF, CC);
    250 
    251   return Changed;
    252 }
    253 
    254 X86CallFrameOptimization::InstClassification
    255 X86CallFrameOptimization::classifyInstruction(
    256     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    257     const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
    258   if (MI == MBB.end())
    259     return Exit;
    260 
    261   // The instructions we actually care about are movs onto the stack
    262   int Opcode = MI->getOpcode();
    263   if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr)
    264     return Convert;
    265 
    266   // Not all calling conventions have only stack MOVs between the stack
    267   // adjust and the call.
    268 
    269   // We want to tolerate other instructions, to cover more cases.
    270   // In particular:
    271   // a) PCrel calls, where we expect an additional COPY of the basereg.
    272   // b) Passing frame-index addresses.
    273   // c) Calling conventions that have inreg parameters. These generate
    274   //    both copies and movs into registers.
    275   // To avoid creating lots of special cases, allow any instruction
    276   // that does not write into memory, does not def or use the stack
    277   // pointer, and does not def any register that was used by a preceding
    278   // push.
    279   // (Reading from memory is allowed, even if referenced through a
    280   // frame index, since these will get adjusted properly in PEI)
    281 
    282   // The reason for the last condition is that the pushes can't replace
    283   // the movs in place, because the order must be reversed.
    284   // So if we have a MOV32mr that uses EDX, then an instruction that defs
    285   // EDX, and then the call, after the transformation the push will use
    286   // the modified version of EDX, and not the original one.
    287   // Since we are still in SSA form at this point, we only need to
    288   // make sure we don't clobber any *physical* registers that were
    289   // used by an earlier mov that will become a push.
    290 
    291   if (MI->isCall() || MI->mayStore())
    292     return Exit;
    293 
    294   for (const MachineOperand &MO : MI->operands()) {
    295     if (!MO.isReg())
    296       continue;
    297     unsigned int Reg = MO.getReg();
    298     if (!RegInfo.isPhysicalRegister(Reg))
    299       continue;
    300     if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
    301       return Exit;
    302     if (MO.isDef()) {
    303       for (unsigned int U : UsedRegs)
    304         if (RegInfo.regsOverlap(Reg, U))
    305           return Exit;
    306     }
    307   }
    308 
    309   return Skip;
    310 }
    311 
    312 void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
    313                                                MachineBasicBlock &MBB,
    314                                                MachineBasicBlock::iterator I,
    315                                                CallContext &Context) {
    316   // Check that this particular call sequence is amenable to the
    317   // transformation.
    318   const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
    319                                        STI->getRegisterInfo());
    320   unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
    321 
    322   // We expect to enter this at the beginning of a call sequence
    323   assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
    324   MachineBasicBlock::iterator FrameSetup = I++;
    325   Context.FrameSetup = FrameSetup;
    326 
    327   // How much do we adjust the stack? This puts an upper bound on
    328   // the number of parameters actually passed on it.
    329   unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
    330 
    331   // A zero adjustment means no stack parameters
    332   if (!MaxAdjust) {
    333     Context.NoStackParams = true;
    334     return;
    335   }
    336 
    337   // For globals in PIC mode, we can have some LEAs here.
    338   // Ignore them, they don't bother us.
    339   // TODO: Extend this to something that covers more cases.
    340   while (I->getOpcode() == X86::LEA32r)
    341     ++I;
    342 
    343   // We expect a copy instruction here.
    344   // TODO: The copy instruction is a lowering artifact.
    345   //       We should also support a copy-less version, where the stack
    346   //       pointer is used directly.
    347   if (!I->isCopy() || !I->getOperand(0).isReg())
    348     return;
    349   Context.SPCopy = I++;
    350 
    351   unsigned StackPtr = Context.SPCopy->getOperand(0).getReg();
    352 
    353   // Scan the call setup sequence for the pattern we're looking for.
    354   // We only handle a simple case - a sequence of MOV32mi or MOV32mr
    355   // instructions, that push a sequence of 32-bit values onto the stack, with
    356   // no gaps between them.
    357   if (MaxAdjust > 4)
    358     Context.MovVector.resize(MaxAdjust, nullptr);
    359 
    360   InstClassification Classification;
    361   DenseSet<unsigned int> UsedRegs;
    362 
    363   while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
    364          Exit) {
    365     if (Classification == Skip) {
    366       ++I;
    367       continue;
    368     }
    369 
    370     // We know the instruction is a MOV32mi/MOV32mr.
    371     // We only want movs of the form:
    372     // movl imm/r32, k(%esp)
    373     // If we run into something else, bail.
    374     // Note that AddrBaseReg may, counter to its name, not be a register,
    375     // but rather a frame index.
    376     // TODO: Support the fi case. This should probably work now that we
    377     // have the infrastructure to track the stack pointer within a call
    378     // sequence.
    379     if (!I->getOperand(X86::AddrBaseReg).isReg() ||
    380         (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
    381         !I->getOperand(X86::AddrScaleAmt).isImm() ||
    382         (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
    383         (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
    384         (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
    385         !I->getOperand(X86::AddrDisp).isImm())
    386       return;
    387 
    388     int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
    389     assert(StackDisp >= 0 &&
    390            "Negative stack displacement when passing parameters");
    391 
    392     // We really don't want to consider the unaligned case.
    393     if (StackDisp % 4)
    394       return;
    395     StackDisp /= 4;
    396 
    397     assert((size_t)StackDisp < Context.MovVector.size() &&
    398            "Function call has more parameters than the stack is adjusted for.");
    399 
    400     // If the same stack slot is being filled twice, something's fishy.
    401     if (Context.MovVector[StackDisp] != nullptr)
    402       return;
    403     Context.MovVector[StackDisp] = I;
    404 
    405     for (const MachineOperand &MO : I->uses()) {
    406       if (!MO.isReg())
    407         continue;
    408       unsigned int Reg = MO.getReg();
    409       if (RegInfo.isPhysicalRegister(Reg))
    410         UsedRegs.insert(Reg);
    411     }
    412 
    413     ++I;
    414   }
    415 
    416   // We now expect the end of the sequence. If we stopped early,
    417   // or reached the end of the block without finding a call, bail.
    418   if (I == MBB.end() || !I->isCall())
    419     return;
    420 
    421   Context.Call = I;
    422   if ((++I)->getOpcode() != FrameDestroyOpcode)
    423     return;
    424 
    425   // Now, go through the vector, and see that we don't have any gaps,
    426   // but only a series of 32-bit MOVs.
    427   auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end();
    428   for (; MMI != MME; ++MMI, Context.ExpectedDist += 4)
    429     if (*MMI == nullptr)
    430       break;
    431 
    432   // If the call had no parameters, do nothing
    433   if (MMI == Context.MovVector.begin())
    434     return;
    435 
    436   // We are either at the last parameter, or a gap.
    437   // Make sure it's not a gap
    438   for (; MMI != MME; ++MMI)
    439     if (*MMI != nullptr)
    440       return;
    441 
    442   Context.UsePush = true;
    443   return;
    444 }
    445 
    446 bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
    447                                                   const CallContext &Context) {
    448   // Ok, we can in fact do the transformation for this call.
    449   // Do not remove the FrameSetup instruction, but adjust the parameters.
    450   // PEI will end up finalizing the handling of this.
    451   MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
    452   MachineBasicBlock &MBB = *(FrameSetup->getParent());
    453   FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
    454 
    455   DebugLoc DL = FrameSetup->getDebugLoc();
    456   // Now, iterate through the vector in reverse order, and replace the movs
    457   // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
    458   // replace uses.
    459   for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
    460     MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
    461     MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
    462     MachineBasicBlock::iterator Push = nullptr;
    463     if (MOV->getOpcode() == X86::MOV32mi) {
    464       unsigned PushOpcode = X86::PUSHi32;
    465       // If the operand is a small (8-bit) immediate, we can use a
    466       // PUSH instruction with a shorter encoding.
    467       // Note that isImm() may fail even though this is a MOVmi, because
    468       // the operand can also be a symbol.
    469       if (PushOp.isImm()) {
    470         int64_t Val = PushOp.getImm();
    471         if (isInt<8>(Val))
    472           PushOpcode = X86::PUSH32i8;
    473       }
    474       Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
    475           .addOperand(PushOp);
    476     } else {
    477       unsigned int Reg = PushOp.getReg();
    478 
    479       // If PUSHrmm is not slow on this target, try to fold the source of the
    480       // push into the instruction.
    481       bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
    482 
    483       // Check that this is legal to fold. Right now, we're extremely
    484       // conservative about that.
    485       MachineInstr *DefMov = nullptr;
    486       if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
    487         Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm));
    488 
    489         unsigned NumOps = DefMov->getDesc().getNumOperands();
    490         for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
    491           Push->addOperand(DefMov->getOperand(i));
    492 
    493         DefMov->eraseFromParent();
    494       } else {
    495         Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r))
    496             .addReg(Reg)
    497             .getInstr();
    498       }
    499     }
    500 
    501     // For debugging, when using SP-based CFA, we need to adjust the CFA
    502     // offset after each push.
    503     // TODO: This is needed only if we require precise CFA.
    504     if (!TFL->hasFP(MF))
    505       TFL->BuildCFI(MBB, std::next(Push), DL,
    506                     MCCFIInstruction::createAdjustCfaOffset(nullptr, 4));
    507 
    508     MBB.erase(MOV);
    509   }
    510 
    511   // The stack-pointer copy is no longer used in the call sequences.
    512   // There should not be any other users, but we can't commit to that, so:
    513   if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
    514     Context.SPCopy->eraseFromParent();
    515 
    516   // Once we've done this, we need to make sure PEI doesn't assume a reserved
    517   // frame.
    518   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    519   FuncInfo->setHasPushSequences(true);
    520 
    521   return true;
    522 }
    523 
    524 MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
    525     MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
    526   // Do an extremely restricted form of load folding.
    527   // ISel will often create patterns like:
    528   // movl    4(%edi), %eax
    529   // movl    8(%edi), %ecx
    530   // movl    12(%edi), %edx
    531   // movl    %edx, 8(%esp)
    532   // movl    %ecx, 4(%esp)
    533   // movl    %eax, (%esp)
    534   // call
    535   // Get rid of those with prejudice.
    536   if (!TargetRegisterInfo::isVirtualRegister(Reg))
    537     return nullptr;
    538 
    539   // Make sure this is the only use of Reg.
    540   if (!MRI->hasOneNonDBGUse(Reg))
    541     return nullptr;
    542 
    543   MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
    544 
    545   // Make sure the def is a MOV from memory.
    546   // If the def is an another block, give up.
    547   if (DefMI->getOpcode() != X86::MOV32rm ||
    548       DefMI->getParent() != FrameSetup->getParent())
    549     return nullptr;
    550 
    551   // Make sure we don't have any instructions between DefMI and the
    552   // push that make folding the load illegal.
    553   for (auto I = DefMI; I != FrameSetup; ++I)
    554     if (I->isLoadFoldBarrier())
    555       return nullptr;
    556 
    557   return DefMI;
    558 }
    559