1 //===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines the pass which inserts x86 AVX vzeroupper instructions 11 // before calls to SSE encoded functions. This avoids transition latency 12 // penalty when transferring control between AVX encoded instructions and old 13 // SSE encoding mode. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "X86.h" 18 #include "X86InstrInfo.h" 19 #include "X86Subtarget.h" 20 #include "llvm/ADT/Statistic.h" 21 #include "llvm/CodeGen/MachineFunctionPass.h" 22 #include "llvm/CodeGen/MachineInstrBuilder.h" 23 #include "llvm/CodeGen/MachineRegisterInfo.h" 24 #include "llvm/CodeGen/Passes.h" 25 #include "llvm/Support/Debug.h" 26 #include "llvm/Support/raw_ostream.h" 27 #include "llvm/Target/TargetInstrInfo.h" 28 using namespace llvm; 29 30 #define DEBUG_TYPE "x86-vzeroupper" 31 32 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); 33 34 namespace { 35 36 class VZeroUpperInserter : public MachineFunctionPass { 37 public: 38 39 VZeroUpperInserter() : MachineFunctionPass(ID) {} 40 bool runOnMachineFunction(MachineFunction &MF) override; 41 MachineFunctionProperties getRequiredProperties() const override { 42 return MachineFunctionProperties().set( 43 MachineFunctionProperties::Property::AllVRegsAllocated); 44 } 45 const char *getPassName() const override {return "X86 vzeroupper inserter";} 46 47 private: 48 49 void processBasicBlock(MachineBasicBlock &MBB); 50 void insertVZeroUpper(MachineBasicBlock::iterator I, 51 MachineBasicBlock &MBB); 52 void addDirtySuccessor(MachineBasicBlock &MBB); 53 54 typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState; 55 static const char* getBlockExitStateName(BlockExitState ST); 56 57 // Core algorithm state: 58 // BlockState - Each block is either: 59 // - PASS_THROUGH: There are neither YMM dirtying instructions nor 60 // vzeroupper instructions in this block. 61 // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this 62 // block that will ensure that YMM is clean on exit. 63 // - EXITS_DIRTY: An instruction in the block dirties YMM and no 64 // subsequent vzeroupper in the block clears it. 65 // 66 // AddedToDirtySuccessors - This flag is raised when a block is added to the 67 // DirtySuccessors list to ensure that it's not 68 // added multiple times. 69 // 70 // FirstUnguardedCall - Records the location of the first unguarded call in 71 // each basic block that may need to be guarded by a 72 // vzeroupper. We won't know whether it actually needs 73 // to be guarded until we discover a predecessor that 74 // is DIRTY_OUT. 75 struct BlockState { 76 BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {} 77 BlockExitState ExitState; 78 bool AddedToDirtySuccessors; 79 MachineBasicBlock::iterator FirstUnguardedCall; 80 }; 81 typedef SmallVector<BlockState, 8> BlockStateMap; 82 typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList; 83 84 BlockStateMap BlockStates; 85 DirtySuccessorsWorkList DirtySuccessors; 86 bool EverMadeChange; 87 bool IsX86INTR; 88 const TargetInstrInfo *TII; 89 90 static char ID; 91 }; 92 93 char VZeroUpperInserter::ID = 0; 94 } 95 96 FunctionPass *llvm::createX86IssueVZeroUpperPass() { 97 return new VZeroUpperInserter(); 98 } 99 100 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { 101 switch (ST) { 102 case PASS_THROUGH: return "Pass-through"; 103 case EXITS_DIRTY: return "Exits-dirty"; 104 case EXITS_CLEAN: return "Exits-clean"; 105 } 106 llvm_unreachable("Invalid block exit state."); 107 } 108 109 static bool isYmmReg(unsigned Reg) { 110 return (Reg >= X86::YMM0 && Reg <= X86::YMM15); 111 } 112 113 static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { 114 for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), 115 E = MRI.livein_end(); I != E; ++I) 116 if (isYmmReg(I->first)) 117 return true; 118 119 return false; 120 } 121 122 static bool clobbersAllYmmRegs(const MachineOperand &MO) { 123 for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { 124 if (!MO.clobbersPhysReg(reg)) 125 return false; 126 } 127 return true; 128 } 129 130 static bool hasYmmReg(MachineInstr &MI) { 131 for (const MachineOperand &MO : MI.operands()) { 132 if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) 133 return true; 134 if (!MO.isReg()) 135 continue; 136 if (MO.isDebug()) 137 continue; 138 if (isYmmReg(MO.getReg())) 139 return true; 140 } 141 return false; 142 } 143 144 /// Check if any YMM register will be clobbered by this instruction. 145 static bool callClobbersAnyYmmReg(MachineInstr &MI) { 146 assert(MI.isCall() && "Can only be called on call instructions."); 147 for (const MachineOperand &MO : MI.operands()) { 148 if (!MO.isRegMask()) 149 continue; 150 for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { 151 if (MO.clobbersPhysReg(reg)) 152 return true; 153 } 154 } 155 return false; 156 } 157 158 /// Insert a vzeroupper instruction before I. 159 void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I, 160 MachineBasicBlock &MBB) { 161 DebugLoc dl = I->getDebugLoc(); 162 BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER)); 163 ++NumVZU; 164 EverMadeChange = true; 165 } 166 167 /// Add MBB to the DirtySuccessors list if it hasn't already been added. 168 void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { 169 if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) { 170 DirtySuccessors.push_back(&MBB); 171 BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true; 172 } 173 } 174 175 /// Loop over all of the instructions in the basic block, inserting vzeroupper 176 /// instructions before function calls. 177 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { 178 179 // Start by assuming that the block is PASS_THROUGH which implies no unguarded 180 // calls. 181 BlockExitState CurState = PASS_THROUGH; 182 BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); 183 184 for (MachineInstr &MI : MBB) { 185 // No need for vzeroupper before iret in interrupt handler function, 186 // epilogue will restore YMM registers if needed. 187 bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn(); 188 bool IsControlFlow = MI.isCall() || MI.isReturn(); 189 190 // An existing VZERO* instruction resets the state. 191 if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) { 192 CurState = EXITS_CLEAN; 193 continue; 194 } 195 196 // Shortcut: don't need to check regular instructions in dirty state. 197 if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY) 198 continue; 199 200 if (hasYmmReg(MI)) { 201 // We found a ymm-using instruction; this could be an AVX instruction, 202 // or it could be control flow. 203 CurState = EXITS_DIRTY; 204 continue; 205 } 206 207 // Check for control-flow out of the current function (which might 208 // indirectly execute SSE instructions). 209 if (!IsControlFlow || IsReturnFromX86INTR) 210 continue; 211 212 // If the call won't clobber any YMM register, skip it as well. It usually 213 // happens on helper function calls (such as '_chkstk', '_ftol2') where 214 // standard calling convention is not used (RegMask is not used to mark 215 // register clobbered and register usage (def/imp-def/use) is well-defined 216 // and explicitly specified. 217 if (MI.isCall() && !callClobbersAnyYmmReg(MI)) 218 continue; 219 220 // The VZEROUPPER instruction resets the upper 128 bits of all AVX 221 // registers. In addition, the processor changes back to Clean state, after 222 // which execution of SSE instructions or AVX instructions has no transition 223 // penalty. Add the VZEROUPPER instruction before any function call/return 224 // that might execute SSE code. 225 // FIXME: In some cases, we may want to move the VZEROUPPER into a 226 // predecessor block. 227 if (CurState == EXITS_DIRTY) { 228 // After the inserted VZEROUPPER the state becomes clean again, but 229 // other YMM may appear before other subsequent calls or even before 230 // the end of the BB. 231 insertVZeroUpper(MI, MBB); 232 CurState = EXITS_CLEAN; 233 } else if (CurState == PASS_THROUGH) { 234 // If this block is currently in pass-through state and we encounter a 235 // call then whether we need a vzeroupper or not depends on whether this 236 // block has successors that exit dirty. Record the location of the call, 237 // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet. 238 // It will be inserted later if necessary. 239 BlockStates[MBB.getNumber()].FirstUnguardedCall = MI; 240 CurState = EXITS_CLEAN; 241 } 242 } 243 244 DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: " 245 << getBlockExitStateName(CurState) << '\n'); 246 247 if (CurState == EXITS_DIRTY) 248 for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), 249 SE = MBB.succ_end(); 250 SI != SE; ++SI) 251 addDirtySuccessor(**SI); 252 253 BlockStates[MBB.getNumber()].ExitState = CurState; 254 } 255 256 /// Loop over all of the basic blocks, inserting vzeroupper instructions before 257 /// function calls. 258 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { 259 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); 260 if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite()) 261 return false; 262 TII = ST.getInstrInfo(); 263 MachineRegisterInfo &MRI = MF.getRegInfo(); 264 EverMadeChange = false; 265 IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR; 266 267 bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); 268 269 // Fast check: if the function doesn't use any ymm registers, we don't need 270 // to insert any VZEROUPPER instructions. This is constant-time, so it is 271 // cheap in the common case of no ymm use. 272 bool YMMUsed = FnHasLiveInYmm; 273 if (!YMMUsed) { 274 const TargetRegisterClass *RC = &X86::VR256RegClass; 275 for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; 276 i++) { 277 if (!MRI.reg_nodbg_empty(*i)) { 278 YMMUsed = true; 279 break; 280 } 281 } 282 } 283 if (!YMMUsed) { 284 return false; 285 } 286 287 assert(BlockStates.empty() && DirtySuccessors.empty() && 288 "X86VZeroUpper state should be clear"); 289 BlockStates.resize(MF.getNumBlockIDs()); 290 291 // Process all blocks. This will compute block exit states, record the first 292 // unguarded call in each block, and add successors of dirty blocks to the 293 // DirtySuccessors list. 294 for (MachineBasicBlock &MBB : MF) 295 processBasicBlock(MBB); 296 297 // If any YMM regs are live-in to this function, add the entry block to the 298 // DirtySuccessors list 299 if (FnHasLiveInYmm) 300 addDirtySuccessor(MF.front()); 301 302 // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add 303 // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY 304 // through PASS_THROUGH blocks. 305 while (!DirtySuccessors.empty()) { 306 MachineBasicBlock &MBB = *DirtySuccessors.back(); 307 DirtySuccessors.pop_back(); 308 BlockState &BBState = BlockStates[MBB.getNumber()]; 309 310 // MBB is a successor of a dirty block, so its first call needs to be 311 // guarded. 312 if (BBState.FirstUnguardedCall != MBB.end()) 313 insertVZeroUpper(BBState.FirstUnguardedCall, MBB); 314 315 // If this successor was a pass-through block, then it is now dirty. Its 316 // successors need to be added to the worklist (if they haven't been 317 // already). 318 if (BBState.ExitState == PASS_THROUGH) { 319 DEBUG(dbgs() << "MBB #" << MBB.getNumber() 320 << " was Pass-through, is now Dirty-out.\n"); 321 for (MachineBasicBlock *Succ : MBB.successors()) 322 addDirtySuccessor(*Succ); 323 } 324 } 325 326 BlockStates.clear(); 327 return EverMadeChange; 328 } 329