1 //===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines the pass which inserts x86 AVX vzeroupper instructions 11 // before calls to SSE encoded functions. This avoids transition latency 12 // penalty when transferring control between AVX encoded instructions and old 13 // SSE encoding mode. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "X86.h" 18 #include "X86InstrInfo.h" 19 #include "X86Subtarget.h" 20 #include "llvm/ADT/Statistic.h" 21 #include "llvm/CodeGen/MachineFunctionPass.h" 22 #include "llvm/CodeGen/MachineInstrBuilder.h" 23 #include "llvm/CodeGen/MachineRegisterInfo.h" 24 #include "llvm/CodeGen/Passes.h" 25 #include "llvm/Support/Debug.h" 26 #include "llvm/Support/raw_ostream.h" 27 #include "llvm/Target/TargetInstrInfo.h" 28 using namespace llvm; 29 30 #define DEBUG_TYPE "x86-vzeroupper" 31 32 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); 33 34 namespace { 35 36 class VZeroUpperInserter : public MachineFunctionPass { 37 public: 38 39 VZeroUpperInserter() : MachineFunctionPass(ID) {} 40 bool runOnMachineFunction(MachineFunction &MF) override; 41 const char *getPassName() const override {return "X86 vzeroupper inserter";} 42 43 private: 44 45 void processBasicBlock(MachineBasicBlock &MBB); 46 void insertVZeroUpper(MachineBasicBlock::iterator I, 47 MachineBasicBlock &MBB); 48 void addDirtySuccessor(MachineBasicBlock &MBB); 49 50 typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState; 51 static const char* getBlockExitStateName(BlockExitState ST); 52 53 // Core algorithm state: 54 // BlockState - Each block is either: 55 // - PASS_THROUGH: There are neither YMM dirtying instructions nor 56 // vzeroupper instructions in this block. 57 // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this 58 // block that will ensure that YMM is clean on exit. 59 // - EXITS_DIRTY: An instruction in the block dirties YMM and no 60 // subsequent vzeroupper in the block clears it. 61 // 62 // AddedToDirtySuccessors - This flag is raised when a block is added to the 63 // DirtySuccessors list to ensure that it's not 64 // added multiple times. 65 // 66 // FirstUnguardedCall - Records the location of the first unguarded call in 67 // each basic block that may need to be guarded by a 68 // vzeroupper. We won't know whether it actually needs 69 // to be guarded until we discover a predecessor that 70 // is DIRTY_OUT. 71 struct BlockState { 72 BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {} 73 BlockExitState ExitState; 74 bool AddedToDirtySuccessors; 75 MachineBasicBlock::iterator FirstUnguardedCall; 76 }; 77 typedef SmallVector<BlockState, 8> BlockStateMap; 78 typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList; 79 80 BlockStateMap BlockStates; 81 DirtySuccessorsWorkList DirtySuccessors; 82 bool EverMadeChange; 83 const TargetInstrInfo *TII; 84 85 static char ID; 86 }; 87 88 char VZeroUpperInserter::ID = 0; 89 } 90 91 FunctionPass *llvm::createX86IssueVZeroUpperPass() { 92 return new VZeroUpperInserter(); 93 } 94 95 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { 96 switch (ST) { 97 case PASS_THROUGH: return "Pass-through"; 98 case EXITS_DIRTY: return "Exits-dirty"; 99 case EXITS_CLEAN: return "Exits-clean"; 100 } 101 llvm_unreachable("Invalid block exit state."); 102 } 103 104 static bool isYmmReg(unsigned Reg) { 105 return (Reg >= X86::YMM0 && Reg <= X86::YMM15); 106 } 107 108 static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { 109 for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), 110 E = MRI.livein_end(); I != E; ++I) 111 if (isYmmReg(I->first)) 112 return true; 113 114 return false; 115 } 116 117 static bool clobbersAllYmmRegs(const MachineOperand &MO) { 118 for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { 119 if (!MO.clobbersPhysReg(reg)) 120 return false; 121 } 122 return true; 123 } 124 125 static bool hasYmmReg(MachineInstr *MI) { 126 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 127 const MachineOperand &MO = MI->getOperand(i); 128 if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) 129 return true; 130 if (!MO.isReg()) 131 continue; 132 if (MO.isDebug()) 133 continue; 134 if (isYmmReg(MO.getReg())) 135 return true; 136 } 137 return false; 138 } 139 140 /// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this 141 /// instruction. 142 static bool callClobbersAnyYmmReg(MachineInstr *MI) { 143 assert(MI->isCall() && "Can only be called on call instructions."); 144 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { 145 const MachineOperand &MO = MI->getOperand(i); 146 if (!MO.isRegMask()) 147 continue; 148 for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { 149 if (MO.clobbersPhysReg(reg)) 150 return true; 151 } 152 } 153 return false; 154 } 155 156 // Insert a vzeroupper instruction before I. 157 void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I, 158 MachineBasicBlock &MBB) { 159 DebugLoc dl = I->getDebugLoc(); 160 BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER)); 161 ++NumVZU; 162 EverMadeChange = true; 163 } 164 165 // Add MBB to the DirtySuccessors list if it hasn't already been added. 166 void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { 167 if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) { 168 DirtySuccessors.push_back(&MBB); 169 BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true; 170 } 171 } 172 173 /// processBasicBlock - Loop over all of the instructions in the basic block, 174 /// inserting vzeroupper instructions before function calls. 175 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { 176 177 // Start by assuming that the block PASS_THROUGH, which implies no unguarded 178 // calls. 179 BlockExitState CurState = PASS_THROUGH; 180 BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); 181 182 for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { 183 MachineInstr *MI = I; 184 bool isControlFlow = MI->isCall() || MI->isReturn(); 185 186 // Shortcut: don't need to check regular instructions in dirty state. 187 if (!isControlFlow && CurState == EXITS_DIRTY) 188 continue; 189 190 if (hasYmmReg(MI)) { 191 // We found a ymm-using instruction; this could be an AVX instruction, 192 // or it could be control flow. 193 CurState = EXITS_DIRTY; 194 continue; 195 } 196 197 // Check for control-flow out of the current function (which might 198 // indirectly execute SSE instructions). 199 if (!isControlFlow) 200 continue; 201 202 // If the call won't clobber any YMM register, skip it as well. It usually 203 // happens on helper function calls (such as '_chkstk', '_ftol2') where 204 // standard calling convention is not used (RegMask is not used to mark 205 // register clobbered and register usage (def/imp-def/use) is well-defined 206 // and explicitly specified. 207 if (MI->isCall() && !callClobbersAnyYmmReg(MI)) 208 continue; 209 210 // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX 211 // registers. This instruction has zero latency. In addition, the processor 212 // changes back to Clean state, after which execution of Intel SSE 213 // instructions or Intel AVX instructions has no transition penalty. Add 214 // the VZEROUPPER instruction before any function call/return that might 215 // execute SSE code. 216 // FIXME: In some cases, we may want to move the VZEROUPPER into a 217 // predecessor block. 218 if (CurState == EXITS_DIRTY) { 219 // After the inserted VZEROUPPER the state becomes clean again, but 220 // other YMM may appear before other subsequent calls or even before 221 // the end of the BB. 222 insertVZeroUpper(I, MBB); 223 CurState = EXITS_CLEAN; 224 } else if (CurState == PASS_THROUGH) { 225 // If this block is currently in pass-through state and we encounter a 226 // call then whether we need a vzeroupper or not depends on whether this 227 // block has successors that exit dirty. Record the location of the call, 228 // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet. 229 // It will be inserted later if necessary. 230 BlockStates[MBB.getNumber()].FirstUnguardedCall = I; 231 CurState = EXITS_CLEAN; 232 } 233 } 234 235 DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: " 236 << getBlockExitStateName(CurState) << '\n'); 237 238 if (CurState == EXITS_DIRTY) 239 for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), 240 SE = MBB.succ_end(); 241 SI != SE; ++SI) 242 addDirtySuccessor(**SI); 243 244 BlockStates[MBB.getNumber()].ExitState = CurState; 245 } 246 247 /// runOnMachineFunction - Loop over all of the basic blocks, inserting 248 /// vzeroupper instructions before function calls. 249 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { 250 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); 251 if (!ST.hasAVX() || ST.hasAVX512()) 252 return false; 253 TII = ST.getInstrInfo(); 254 MachineRegisterInfo &MRI = MF.getRegInfo(); 255 EverMadeChange = false; 256 257 bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); 258 259 // Fast check: if the function doesn't use any ymm registers, we don't need 260 // to insert any VZEROUPPER instructions. This is constant-time, so it is 261 // cheap in the common case of no ymm use. 262 bool YMMUsed = FnHasLiveInYmm; 263 if (!YMMUsed) { 264 const TargetRegisterClass *RC = &X86::VR256RegClass; 265 for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; 266 i++) { 267 if (!MRI.reg_nodbg_empty(*i)) { 268 YMMUsed = true; 269 break; 270 } 271 } 272 } 273 if (!YMMUsed) { 274 return false; 275 } 276 277 assert(BlockStates.empty() && DirtySuccessors.empty() && 278 "X86VZeroUpper state should be clear"); 279 BlockStates.resize(MF.getNumBlockIDs()); 280 281 // Process all blocks. This will compute block exit states, record the first 282 // unguarded call in each block, and add successors of dirty blocks to the 283 // DirtySuccessors list. 284 for (MachineBasicBlock &MBB : MF) 285 processBasicBlock(MBB); 286 287 // If any YMM regs are live in to this function, add the entry block to the 288 // DirtySuccessors list 289 if (FnHasLiveInYmm) 290 addDirtySuccessor(MF.front()); 291 292 // Re-visit all blocks that are successors of EXITS_DIRTY bsocks. Add 293 // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY 294 // through PASS_THROUGH blocks. 295 while (!DirtySuccessors.empty()) { 296 MachineBasicBlock &MBB = *DirtySuccessors.back(); 297 DirtySuccessors.pop_back(); 298 BlockState &BBState = BlockStates[MBB.getNumber()]; 299 300 // MBB is a successor of a dirty block, so its first call needs to be 301 // guarded. 302 if (BBState.FirstUnguardedCall != MBB.end()) 303 insertVZeroUpper(BBState.FirstUnguardedCall, MBB); 304 305 // If this successor was a pass-through block then it is now dirty, and its 306 // successors need to be added to the worklist (if they haven't been 307 // already). 308 if (BBState.ExitState == PASS_THROUGH) { 309 DEBUG(dbgs() << "MBB #" << MBB.getNumber() 310 << " was Pass-through, is now Dirty-out.\n"); 311 for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), 312 SE = MBB.succ_end(); 313 SI != SE; ++SI) 314 addDirtySuccessor(**SI); 315 } 316 } 317 318 BlockStates.clear(); 319 return EverMadeChange; 320 } 321