//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which inserts x86 AVX vzeroupper instructions
// before calls to SSE-encoded functions. This avoids the transition latency
// penalty incurred when transferring control between AVX-encoded instructions
// and the legacy SSE encoding mode.
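//
// For example (illustrative only, not taken from a particular test case),
// AVX code that dirties the upper YMM bits followed by a call
//
//     vaddps  %ymm1, %ymm2, %ymm0   # AVX op leaves the upper YMM bits dirty
//     callq   foo                   # foo may use legacy SSE encodings
//
// becomes
//
//     vaddps  %ymm1, %ymm2, %ymm0
//     vzeroupper                    # zero the upper 128 bits of all YMM regs
//     callq   foo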
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

#define DEBUG_TYPE "x86-vzeroupper"

STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");

namespace {

  class VZeroUpperInserter : public MachineFunctionPass {
  public:

    VZeroUpperInserter() : MachineFunctionPass(ID) {}
    bool runOnMachineFunction(MachineFunction &MF) override;
    MachineFunctionProperties getRequiredProperties() const override {
      return MachineFunctionProperties().set(
          MachineFunctionProperties::Property::AllVRegsAllocated);
    }
    const char *getPassName() const override {return "X86 vzeroupper inserter";}

  private:

    void processBasicBlock(MachineBasicBlock &MBB);
    void insertVZeroUpper(MachineBasicBlock::iterator I,
                          MachineBasicBlock &MBB);
    void addDirtySuccessor(MachineBasicBlock &MBB);

    typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
    static const char* getBlockExitStateName(BlockExitState ST);

    // Core algorithm state:
    // BlockState - Each block is either:
    //   - PASS_THROUGH: There are neither YMM dirtying instructions nor
    //                   vzeroupper instructions in this block.
    //   - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
    //                  block that will ensure that YMM is clean on exit.
    //   - EXITS_DIRTY: An instruction in the block dirties YMM and no
    //                  subsequent vzeroupper in the block clears it.
    //
    // AddedToDirtySuccessors - This flag is raised when a block is added to the
    //                          DirtySuccessors list to ensure that it's not
    //                          added multiple times.
    //
    // FirstUnguardedCall - Records the location of the first unguarded call in
    //                      each basic block that may need to be guarded by a
    //                      vzeroupper. We won't know whether it actually needs
    //                      to be guarded until we discover a predecessor that
    //                      is EXITS_DIRTY.
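    //
    // As an illustrative example (not describing any particular test case):
    // a block whose only interesting instruction writes a YMM register ends
    // EXITS_DIRTY; a block whose only interesting instruction is a
    // YMM-clobbering call records that call in FirstUnguardedCall and ends
    // EXITS_CLEAN; a block containing neither is PASS_THROUGH, and whether it
    // is dirty on exit is only decided later from the state of its
    // predecessors.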
    struct BlockState {
      BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
      BlockExitState ExitState;
      bool AddedToDirtySuccessors;
      MachineBasicBlock::iterator FirstUnguardedCall;
    };
    typedef SmallVector<BlockState, 8> BlockStateMap;
    typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;

    BlockStateMap BlockStates;
    DirtySuccessorsWorkList DirtySuccessors;
    bool EverMadeChange;
    bool IsX86INTR;
    const TargetInstrInfo *TII;

    static char ID;
  };

  char VZeroUpperInserter::ID = 0;
}

FunctionPass *llvm::createX86IssueVZeroUpperPass() {
  return new VZeroUpperInserter();
}

const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
  switch (ST) {
    case PASS_THROUGH: return "Pass-through";
    case EXITS_DIRTY: return "Exits-dirty";
    case EXITS_CLEAN: return "Exits-clean";
  }
  llvm_unreachable("Invalid block exit state.");
}

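/// Return true if Reg is one of the sixteen YMM registers. The pass bails out
/// early on AVX-512 targets (see runOnMachineFunction below), so registers
/// above YMM15 never need to be considered here.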
static bool isYmmReg(unsigned Reg) {
  return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
}

static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
       E = MRI.livein_end(); I != E; ++I)
    if (isYmmReg(I->first))
      return true;

  return false;
}

static bool clobbersAllYmmRegs(const MachineOperand &MO) {
  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
    if (!MO.clobbersPhysReg(reg))
      return false;
  }
  return true;
}

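/// Return true if MI touches a YMM register. A call whose register mask
/// preserves at least one YMM register is also treated as using YMM,
/// presumably because the preserved upper bits remain live across the call.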
static bool hasYmmReg(MachineInstr &MI) {
  for (const MachineOperand &MO : MI.operands()) {
    if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
      return true;
    if (!MO.isReg())
      continue;
    if (MO.isDebug())
      continue;
    if (isYmmReg(MO.getReg()))
      return true;
  }
  return false;
}

/// Check if any YMM register will be clobbered by this instruction.
static bool callClobbersAnyYmmReg(MachineInstr &MI) {
  assert(MI.isCall() && "Can only be called on call instructions.");
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isRegMask())
      continue;
    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
      if (MO.clobbersPhysReg(reg))
        return true;
    }
  }
  return false;
}

/// Insert a vzeroupper instruction before I.
void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
                                          MachineBasicBlock &MBB) {
  DebugLoc dl = I->getDebugLoc();
  BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
  ++NumVZU;
  EverMadeChange = true;
}

/// Add MBB to the DirtySuccessors list if it hasn't already been added.
void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
    DirtySuccessors.push_back(&MBB);
    BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
  }
}

/// Loop over all of the instructions in the basic block, inserting vzeroupper
/// instructions before function calls.
void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {

  // Start by assuming that the block is PASS_THROUGH which implies no unguarded
  // calls.
  BlockExitState CurState = PASS_THROUGH;
  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();

  for (MachineInstr &MI : MBB) {
    // No vzeroupper is needed before the iret in an interrupt handler
    // function; the epilogue will restore the YMM registers if needed.
    bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
    bool IsControlFlow = MI.isCall() || MI.isReturn();

    // An existing VZERO* instruction resets the state.
    if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
      CurState = EXITS_CLEAN;
      continue;
    }

    // Shortcut: don't need to check regular instructions in dirty state.
    if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
      continue;

    if (hasYmmReg(MI)) {
      // We found a ymm-using instruction; this could be an AVX instruction,
      // or it could be control flow.
      CurState = EXITS_DIRTY;
      continue;
    }

    // Check for control-flow out of the current function (which might
    // indirectly execute SSE instructions).
    if (!IsControlFlow || IsReturnFromX86INTR)
      continue;

    // If the call won't clobber any YMM register, skip it as well. This
    // usually happens with helper function calls (such as '_chkstk' or
    // '_ftol2') where the standard calling convention is not used: no RegMask
    // operand marks the clobbered registers, and register usage
    // (def/imp-def/use) is well-defined and explicitly specified.
    if (MI.isCall() && !callClobbersAnyYmmReg(MI))
      continue;

    // The VZEROUPPER instruction resets the upper 128 bits of all AVX
    // registers. In addition, the processor changes back to Clean state, after
    // which execution of SSE instructions or AVX instructions has no transition
    // penalty. Add the VZEROUPPER instruction before any function call/return
    // that might execute SSE code.
    // FIXME: In some cases, we may want to move the VZEROUPPER into a
    // predecessor block.
    if (CurState == EXITS_DIRTY) {
      // After the inserted VZEROUPPER the state becomes clean again, but
      // other YMM-using instructions may appear before subsequent calls or
      // even before the end of the BB.
      insertVZeroUpper(MI, MBB);
      CurState = EXITS_CLEAN;
    } else if (CurState == PASS_THROUGH) {
      // If this block is currently in pass-through state and we encounter a
      // call then whether we need a vzeroupper or not depends on whether this
      // block has successors that exit dirty. Record the location of the call,
      // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
      // It will be inserted later if necessary.
      BlockStates[MBB.getNumber()].FirstUnguardedCall = MI;
      CurState = EXITS_CLEAN;
    }
  }

  DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
               << getBlockExitStateName(CurState) << '\n');

  if (CurState == EXITS_DIRTY)
    for (MachineBasicBlock *Succ : MBB.successors())
      addDirtySuccessor(*Succ);

  BlockStates[MBB.getNumber()].ExitState = CurState;
}

/// Loop over all of the basic blocks, inserting vzeroupper instructions before
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
    return false;
  TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  EverMadeChange = false;
  IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;

  bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);

  // Fast check: if the function doesn't use any ymm registers, we don't need
  // to insert any VZEROUPPER instructions.  This is constant-time, so it is
  // cheap in the common case of no ymm use.
  bool YMMUsed = FnHasLiveInYmm;
  if (!YMMUsed) {
    const TargetRegisterClass *RC = &X86::VR256RegClass;
    for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
         i++) {
      if (!MRI.reg_nodbg_empty(*i)) {
        YMMUsed = true;
        break;
      }
    }
  }
  if (!YMMUsed)
    return false;

  assert(BlockStates.empty() && DirtySuccessors.empty() &&
         "X86VZeroUpper state should be clear");
  BlockStates.resize(MF.getNumBlockIDs());

  // Process all blocks. This will compute block exit states, record the first
  // unguarded call in each block, and add successors of dirty blocks to the
  // DirtySuccessors list.
  for (MachineBasicBlock &MBB : MF)
    processBasicBlock(MBB);

  // If any YMM regs are live-in to this function, add the entry block to the
  // DirtySuccessors list
  if (FnHasLiveInYmm)
    addDirtySuccessor(MF.front());

  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
  // through PASS_THROUGH blocks.
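  //
  // For example (illustrative only): given a chain A -> B -> C where A exits
  // dirty, B is PASS_THROUGH, and C recorded a FirstUnguardedCall, popping B
  // from the worklist inserts nothing (it has no unguarded call) but enqueues
  // C, and popping C then guards its first call with a vzeroupper.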
  while (!DirtySuccessors.empty()) {
    MachineBasicBlock &MBB = *DirtySuccessors.back();
    DirtySuccessors.pop_back();
    BlockState &BBState = BlockStates[MBB.getNumber()];

    // MBB is a successor of a dirty block, so its first call needs to be
    // guarded.
    if (BBState.FirstUnguardedCall != MBB.end())
      insertVZeroUpper(BBState.FirstUnguardedCall, MBB);

    // If this successor was a pass-through block, then it is now dirty. Its
    // successors need to be added to the worklist (if they haven't been
    // already).
    if (BBState.ExitState == PASS_THROUGH) {
      DEBUG(dbgs() << "MBB #" << MBB.getNumber()
                   << " was Pass-through, is now Dirty-out.\n");
      for (MachineBasicBlock *Succ : MBB.successors())
        addDirtySuccessor(*Succ);
    }
  }

  BlockStates.clear();
  return EverMadeChange;
}
    329