Home | History | Annotate | Download | only in X86
      1 //===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file defines the pass which inserts x86 AVX vzeroupper instructions
     11 // before calls to SSE encoded functions. This avoids transition latency
     12 // penalty when transferring control between AVX encoded instructions and old
     13 // SSE encoding mode.
     14 //
     15 //===----------------------------------------------------------------------===//
     16 
     17 #include "X86.h"
     18 #include "X86InstrInfo.h"
     19 #include "X86Subtarget.h"
     20 #include "llvm/ADT/Statistic.h"
     21 #include "llvm/CodeGen/MachineFunctionPass.h"
     22 #include "llvm/CodeGen/MachineInstrBuilder.h"
     23 #include "llvm/CodeGen/MachineRegisterInfo.h"
     24 #include "llvm/CodeGen/Passes.h"
     25 #include "llvm/Support/Debug.h"
     26 #include "llvm/Support/raw_ostream.h"
     27 #include "llvm/Target/TargetInstrInfo.h"
     28 using namespace llvm;
     29 
     30 #define DEBUG_TYPE "x86-vzeroupper"
     31 
     32 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
     33 
     34 namespace {
     35 
     36   class VZeroUpperInserter : public MachineFunctionPass {
     37   public:
     38 
     39     VZeroUpperInserter() : MachineFunctionPass(ID) {}
     40     bool runOnMachineFunction(MachineFunction &MF) override;
     41     const char *getPassName() const override {return "X86 vzeroupper inserter";}
     42 
     43   private:
     44 
     45     void processBasicBlock(MachineBasicBlock &MBB);
     46     void insertVZeroUpper(MachineBasicBlock::iterator I,
     47                           MachineBasicBlock &MBB);
     48     void addDirtySuccessor(MachineBasicBlock &MBB);
     49 
     50     typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
     51     static const char* getBlockExitStateName(BlockExitState ST);
     52 
     53     // Core algorithm state:
     54     // BlockState - Each block is either:
     55     //   - PASS_THROUGH: There are neither YMM dirtying instructions nor
     56     //                   vzeroupper instructions in this block.
     57     //   - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
     58     //                  block that will ensure that YMM is clean on exit.
     59     //   - EXITS_DIRTY: An instruction in the block dirties YMM and no
     60     //                  subsequent vzeroupper in the block clears it.
     61     //
     62     // AddedToDirtySuccessors - This flag is raised when a block is added to the
     63     //                          DirtySuccessors list to ensure that it's not
     64     //                          added multiple times.
     65     //
     66     // FirstUnguardedCall - Records the location of the first unguarded call in
     67     //                      each basic block that may need to be guarded by a
     68     //                      vzeroupper. We won't know whether it actually needs
     69     //                      to be guarded until we discover a predecessor that
     70     //                      is DIRTY_OUT.
     71     struct BlockState {
     72       BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
     73       BlockExitState ExitState;
     74       bool AddedToDirtySuccessors;
     75       MachineBasicBlock::iterator FirstUnguardedCall;
     76     };
     77     typedef SmallVector<BlockState, 8> BlockStateMap;
     78     typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
     79 
     80     BlockStateMap BlockStates;
     81     DirtySuccessorsWorkList DirtySuccessors;
     82     bool EverMadeChange;
     83     const TargetInstrInfo *TII;
     84 
     85     static char ID;
     86   };
     87 
     88   char VZeroUpperInserter::ID = 0;
     89 }
     90 
     91 FunctionPass *llvm::createX86IssueVZeroUpperPass() {
     92   return new VZeroUpperInserter();
     93 }
     94 
     95 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
     96   switch (ST) {
     97     case PASS_THROUGH: return "Pass-through";
     98     case EXITS_DIRTY: return "Exits-dirty";
     99     case EXITS_CLEAN: return "Exits-clean";
    100   }
    101   llvm_unreachable("Invalid block exit state.");
    102 }
    103 
    104 static bool isYmmReg(unsigned Reg) {
    105   return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
    106 }
    107 
    108 static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
    109   for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
    110        E = MRI.livein_end(); I != E; ++I)
    111     if (isYmmReg(I->first))
    112       return true;
    113 
    114   return false;
    115 }
    116 
    117 static bool clobbersAllYmmRegs(const MachineOperand &MO) {
    118   for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
    119     if (!MO.clobbersPhysReg(reg))
    120       return false;
    121   }
    122   return true;
    123 }
    124 
    125 static bool hasYmmReg(MachineInstr *MI) {
    126   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    127     const MachineOperand &MO = MI->getOperand(i);
    128     if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
    129       return true;
    130     if (!MO.isReg())
    131       continue;
    132     if (MO.isDebug())
    133       continue;
    134     if (isYmmReg(MO.getReg()))
    135       return true;
    136   }
    137   return false;
    138 }
    139 
    140 /// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
    141 /// instruction.
    142 static bool callClobbersAnyYmmReg(MachineInstr *MI) {
    143   assert(MI->isCall() && "Can only be called on call instructions.");
    144   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    145     const MachineOperand &MO = MI->getOperand(i);
    146     if (!MO.isRegMask())
    147       continue;
    148     for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
    149       if (MO.clobbersPhysReg(reg))
    150         return true;
    151     }
    152   }
    153   return false;
    154 }
    155 
    156 // Insert a vzeroupper instruction before I.
    157 void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
    158                                               MachineBasicBlock &MBB) {
    159   DebugLoc dl = I->getDebugLoc();
    160   BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
    161   ++NumVZU;
    162   EverMadeChange = true;
    163 }
    164 
    165 // Add MBB to the DirtySuccessors list if it hasn't already been added.
    166 void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
    167   if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
    168     DirtySuccessors.push_back(&MBB);
    169     BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
    170   }
    171 }
    172 
    173 /// processBasicBlock - Loop over all of the instructions in the basic block,
    174 /// inserting vzeroupper instructions before function calls.
    175 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
    176 
    177   // Start by assuming that the block PASS_THROUGH, which implies no unguarded
    178   // calls.
    179   BlockExitState CurState = PASS_THROUGH;
    180   BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
    181 
    182   for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
    183     MachineInstr *MI = I;
    184     bool isControlFlow = MI->isCall() || MI->isReturn();
    185 
    186     // Shortcut: don't need to check regular instructions in dirty state.
    187     if (!isControlFlow && CurState == EXITS_DIRTY)
    188       continue;
    189 
    190     if (hasYmmReg(MI)) {
    191       // We found a ymm-using instruction; this could be an AVX instruction,
    192       // or it could be control flow.
    193       CurState = EXITS_DIRTY;
    194       continue;
    195     }
    196 
    197     // Check for control-flow out of the current function (which might
    198     // indirectly execute SSE instructions).
    199     if (!isControlFlow)
    200       continue;
    201 
    202     // If the call won't clobber any YMM register, skip it as well. It usually
    203     // happens on helper function calls (such as '_chkstk', '_ftol2') where
    204     // standard calling convention is not used (RegMask is not used to mark
    205     // register clobbered and register usage (def/imp-def/use) is well-defined
    206     // and explicitly specified.
    207     if (MI->isCall() && !callClobbersAnyYmmReg(MI))
    208       continue;
    209 
    210     // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
    211     // registers. This instruction has zero latency. In addition, the processor
    212     // changes back to Clean state, after which execution of Intel SSE
    213     // instructions or Intel AVX instructions has no transition penalty. Add
    214     // the VZEROUPPER instruction before any function call/return that might
    215     // execute SSE code.
    216     // FIXME: In some cases, we may want to move the VZEROUPPER into a
    217     // predecessor block.
    218     if (CurState == EXITS_DIRTY) {
    219       // After the inserted VZEROUPPER the state becomes clean again, but
    220       // other YMM may appear before other subsequent calls or even before
    221       // the end of the BB.
    222       insertVZeroUpper(I, MBB);
    223       CurState = EXITS_CLEAN;
    224     } else if (CurState == PASS_THROUGH) {
    225       // If this block is currently in pass-through state and we encounter a
    226       // call then whether we need a vzeroupper or not depends on whether this
    227       // block has successors that exit dirty. Record the location of the call,
    228       // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
    229       // It will be inserted later if necessary.
    230       BlockStates[MBB.getNumber()].FirstUnguardedCall = I;
    231       CurState = EXITS_CLEAN;
    232     }
    233   }
    234 
    235   DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
    236                << getBlockExitStateName(CurState) << '\n');
    237 
    238   if (CurState == EXITS_DIRTY)
    239     for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
    240                                           SE = MBB.succ_end();
    241          SI != SE; ++SI)
    242       addDirtySuccessor(**SI);
    243 
    244   BlockStates[MBB.getNumber()].ExitState = CurState;
    245 }
    246 
    247 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
    248 /// vzeroupper instructions before function calls.
    249 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
    250   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
    251   if (!ST.hasAVX() || ST.hasAVX512())
    252     return false;
    253   TII = ST.getInstrInfo();
    254   MachineRegisterInfo &MRI = MF.getRegInfo();
    255   EverMadeChange = false;
    256 
    257   bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
    258 
    259   // Fast check: if the function doesn't use any ymm registers, we don't need
    260   // to insert any VZEROUPPER instructions.  This is constant-time, so it is
    261   // cheap in the common case of no ymm use.
    262   bool YMMUsed = FnHasLiveInYmm;
    263   if (!YMMUsed) {
    264     const TargetRegisterClass *RC = &X86::VR256RegClass;
    265     for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
    266          i++) {
    267       if (!MRI.reg_nodbg_empty(*i)) {
    268         YMMUsed = true;
    269         break;
    270       }
    271     }
    272   }
    273   if (!YMMUsed) {
    274     return false;
    275   }
    276 
    277   assert(BlockStates.empty() && DirtySuccessors.empty() &&
    278          "X86VZeroUpper state should be clear");
    279   BlockStates.resize(MF.getNumBlockIDs());
    280 
    281   // Process all blocks. This will compute block exit states, record the first
    282   // unguarded call in each block, and add successors of dirty blocks to the
    283   // DirtySuccessors list.
    284   for (MachineBasicBlock &MBB : MF)
    285     processBasicBlock(MBB);
    286 
    287   // If any YMM regs are live in to this function, add the entry block to the
    288   // DirtySuccessors list
    289   if (FnHasLiveInYmm)
    290     addDirtySuccessor(MF.front());
    291 
    292   // Re-visit all blocks that are successors of EXITS_DIRTY bsocks. Add
    293   // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
    294   // through PASS_THROUGH blocks.
    295   while (!DirtySuccessors.empty()) {
    296     MachineBasicBlock &MBB = *DirtySuccessors.back();
    297     DirtySuccessors.pop_back();
    298     BlockState &BBState = BlockStates[MBB.getNumber()];
    299 
    300     // MBB is a successor of a dirty block, so its first call needs to be
    301     // guarded.
    302     if (BBState.FirstUnguardedCall != MBB.end())
    303       insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
    304 
    305     // If this successor was a pass-through block then it is now dirty, and its
    306     // successors need to be added to the worklist (if they haven't been
    307     // already).
    308     if (BBState.ExitState == PASS_THROUGH) {
    309       DEBUG(dbgs() << "MBB #" << MBB.getNumber()
    310                    << " was Pass-through, is now Dirty-out.\n");
    311       for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
    312                                             SE = MBB.succ_end();
    313            SI != SE; ++SI)
    314         addDirtySuccessor(**SI);
    315     }
    316   }
    317 
    318   BlockStates.clear();
    319   return EverMadeChange;
    320 }
    321