1 //===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines the pass which inserts x86 AVX vzeroupper instructions 11 // before calls to SSE encoded functions. This avoids transition latency 12 // penalty when transferring control between AVX encoded instructions and old 13 // SSE encoding mode. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "X86.h" 18 #include "X86InstrInfo.h" 19 #include "X86Subtarget.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/CodeGen/MachineBasicBlock.h" 23 #include "llvm/CodeGen/MachineFunction.h" 24 #include "llvm/CodeGen/MachineFunctionPass.h" 25 #include "llvm/CodeGen/MachineInstr.h" 26 #include "llvm/CodeGen/MachineInstrBuilder.h" 27 #include "llvm/CodeGen/MachineOperand.h" 28 #include "llvm/CodeGen/MachineRegisterInfo.h" 29 #include "llvm/CodeGen/TargetInstrInfo.h" 30 #include "llvm/CodeGen/TargetRegisterInfo.h" 31 #include "llvm/IR/CallingConv.h" 32 #include "llvm/IR/DebugLoc.h" 33 #include "llvm/IR/Function.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Support/ErrorHandling.h" 36 #include "llvm/Support/raw_ostream.h" 37 #include <cassert> 38 39 using namespace llvm; 40 41 #define DEBUG_TYPE "x86-vzeroupper" 42 43 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); 44 45 namespace { 46 47 class VZeroUpperInserter : public MachineFunctionPass { 48 public: 49 VZeroUpperInserter() : MachineFunctionPass(ID) {} 50 51 bool runOnMachineFunction(MachineFunction &MF) override; 52 53 MachineFunctionProperties getRequiredProperties() const override { 54 return MachineFunctionProperties().set( 55 MachineFunctionProperties::Property::NoVRegs); 56 } 57 58 StringRef getPassName() const override { return "X86 vzeroupper inserter"; } 59 60 private: 61 void processBasicBlock(MachineBasicBlock &MBB); 62 void insertVZeroUpper(MachineBasicBlock::iterator I, 63 MachineBasicBlock &MBB); 64 void addDirtySuccessor(MachineBasicBlock &MBB); 65 66 using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; 67 68 static const char* getBlockExitStateName(BlockExitState ST); 69 70 // Core algorithm state: 71 // BlockState - Each block is either: 72 // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor 73 // vzeroupper instructions in this block. 74 // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this 75 // block that will ensure that YMM/ZMM is clean on exit. 76 // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no 77 // subsequent vzeroupper in the block clears it. 78 // 79 // AddedToDirtySuccessors - This flag is raised when a block is added to the 80 // DirtySuccessors list to ensure that it's not 81 // added multiple times. 82 // 83 // FirstUnguardedCall - Records the location of the first unguarded call in 84 // each basic block that may need to be guarded by a 85 // vzeroupper. We won't know whether it actually needs 86 // to be guarded until we discover a predecessor that 87 // is DIRTY_OUT. 88 struct BlockState { 89 BlockExitState ExitState = PASS_THROUGH; 90 bool AddedToDirtySuccessors = false; 91 MachineBasicBlock::iterator FirstUnguardedCall; 92 93 BlockState() = default; 94 }; 95 96 using BlockStateMap = SmallVector<BlockState, 8>; 97 using DirtySuccessorsWorkList = SmallVector<MachineBasicBlock *, 8>; 98 99 BlockStateMap BlockStates; 100 DirtySuccessorsWorkList DirtySuccessors; 101 bool EverMadeChange; 102 bool IsX86INTR; 103 const TargetInstrInfo *TII; 104 105 static char ID; 106 }; 107 108 } // end anonymous namespace 109 110 char VZeroUpperInserter::ID = 0; 111 112 FunctionPass *llvm::createX86IssueVZeroUpperPass() { 113 return new VZeroUpperInserter(); 114 } 115 116 #ifndef NDEBUG 117 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { 118 switch (ST) { 119 case PASS_THROUGH: return "Pass-through"; 120 case EXITS_DIRTY: return "Exits-dirty"; 121 case EXITS_CLEAN: return "Exits-clean"; 122 } 123 llvm_unreachable("Invalid block exit state."); 124 } 125 #endif 126 127 /// VZEROUPPER cleans state that is related to Y/ZMM0-15 only. 128 /// Thus, there is no need to check for Y/ZMM16 and above. 129 static bool isYmmOrZmmReg(unsigned Reg) { 130 return (Reg >= X86::YMM0 && Reg <= X86::YMM15) || 131 (Reg >= X86::ZMM0 && Reg <= X86::ZMM15); 132 } 133 134 static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) { 135 for (std::pair<unsigned, unsigned> LI : MRI.liveins()) 136 if (isYmmOrZmmReg(LI.first)) 137 return true; 138 139 return false; 140 } 141 142 static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) { 143 for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { 144 if (!MO.clobbersPhysReg(reg)) 145 return false; 146 } 147 for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) { 148 if (!MO.clobbersPhysReg(reg)) 149 return false; 150 } 151 return true; 152 } 153 154 static bool hasYmmOrZmmReg(MachineInstr &MI) { 155 for (const MachineOperand &MO : MI.operands()) { 156 if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO)) 157 return true; 158 if (!MO.isReg()) 159 continue; 160 if (MO.isDebug()) 161 continue; 162 if (isYmmOrZmmReg(MO.getReg())) 163 return true; 164 } 165 return false; 166 } 167 168 /// Check if given call instruction has a RegMask operand. 169 static bool callHasRegMask(MachineInstr &MI) { 170 assert(MI.isCall() && "Can only be called on call instructions."); 171 for (const MachineOperand &MO : MI.operands()) { 172 if (MO.isRegMask()) 173 return true; 174 } 175 return false; 176 } 177 178 /// Insert a vzeroupper instruction before I. 179 void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I, 180 MachineBasicBlock &MBB) { 181 DebugLoc dl = I->getDebugLoc(); 182 BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER)); 183 ++NumVZU; 184 EverMadeChange = true; 185 } 186 187 /// Add MBB to the DirtySuccessors list if it hasn't already been added. 188 void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { 189 if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) { 190 DirtySuccessors.push_back(&MBB); 191 BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true; 192 } 193 } 194 195 /// Loop over all of the instructions in the basic block, inserting vzeroupper 196 /// instructions before function calls. 197 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { 198 // Start by assuming that the block is PASS_THROUGH which implies no unguarded 199 // calls. 200 BlockExitState CurState = PASS_THROUGH; 201 BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); 202 203 for (MachineInstr &MI : MBB) { 204 bool IsCall = MI.isCall(); 205 bool IsReturn = MI.isReturn(); 206 bool IsControlFlow = IsCall || IsReturn; 207 208 // No need for vzeroupper before iret in interrupt handler function, 209 // epilogue will restore YMM/ZMM registers if needed. 210 if (IsX86INTR && IsReturn) 211 continue; 212 213 // An existing VZERO* instruction resets the state. 214 if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) { 215 CurState = EXITS_CLEAN; 216 continue; 217 } 218 219 // Shortcut: don't need to check regular instructions in dirty state. 220 if (!IsControlFlow && CurState == EXITS_DIRTY) 221 continue; 222 223 if (hasYmmOrZmmReg(MI)) { 224 // We found a ymm/zmm-using instruction; this could be an AVX/AVX512 225 // instruction, or it could be control flow. 226 CurState = EXITS_DIRTY; 227 continue; 228 } 229 230 // Check for control-flow out of the current function (which might 231 // indirectly execute SSE instructions). 232 if (!IsControlFlow) 233 continue; 234 235 // If the call has no RegMask, skip it as well. It usually happens on 236 // helper function calls (such as '_chkstk', '_ftol2') where standard 237 // calling convention is not used (RegMask is not used to mark register 238 // clobbered and register usage (def/implicit-def/use) is well-defined and 239 // explicitly specified. 240 if (IsCall && !callHasRegMask(MI)) 241 continue; 242 243 // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15 244 // registers. In addition, the processor changes back to Clean state, after 245 // which execution of SSE instructions or AVX instructions has no transition 246 // penalty. Add the VZEROUPPER instruction before any function call/return 247 // that might execute SSE code. 248 // FIXME: In some cases, we may want to move the VZEROUPPER into a 249 // predecessor block. 250 if (CurState == EXITS_DIRTY) { 251 // After the inserted VZEROUPPER the state becomes clean again, but 252 // other YMM/ZMM may appear before other subsequent calls or even before 253 // the end of the BB. 254 insertVZeroUpper(MI, MBB); 255 CurState = EXITS_CLEAN; 256 } else if (CurState == PASS_THROUGH) { 257 // If this block is currently in pass-through state and we encounter a 258 // call then whether we need a vzeroupper or not depends on whether this 259 // block has successors that exit dirty. Record the location of the call, 260 // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet. 261 // It will be inserted later if necessary. 262 BlockStates[MBB.getNumber()].FirstUnguardedCall = MI; 263 CurState = EXITS_CLEAN; 264 } 265 } 266 267 LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: " 268 << getBlockExitStateName(CurState) << '\n'); 269 270 if (CurState == EXITS_DIRTY) 271 for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), 272 SE = MBB.succ_end(); 273 SI != SE; ++SI) 274 addDirtySuccessor(**SI); 275 276 BlockStates[MBB.getNumber()].ExitState = CurState; 277 } 278 279 /// Loop over all of the basic blocks, inserting vzeroupper instructions before 280 /// function calls. 281 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { 282 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); 283 if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite()) 284 return false; 285 TII = ST.getInstrInfo(); 286 MachineRegisterInfo &MRI = MF.getRegInfo(); 287 EverMadeChange = false; 288 IsX86INTR = MF.getFunction().getCallingConv() == CallingConv::X86_INTR; 289 290 bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI); 291 292 // Fast check: if the function doesn't use any ymm/zmm registers, we don't 293 // need to insert any VZEROUPPER instructions. This is constant-time, so it 294 // is cheap in the common case of no ymm/zmm use. 295 bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm; 296 const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass}; 297 for (auto *RC : RCs) { 298 if (!YmmOrZmmUsed) { 299 for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; 300 i++) { 301 if (!MRI.reg_nodbg_empty(*i)) { 302 YmmOrZmmUsed = true; 303 break; 304 } 305 } 306 } 307 } 308 if (!YmmOrZmmUsed) { 309 return false; 310 } 311 312 assert(BlockStates.empty() && DirtySuccessors.empty() && 313 "X86VZeroUpper state should be clear"); 314 BlockStates.resize(MF.getNumBlockIDs()); 315 316 // Process all blocks. This will compute block exit states, record the first 317 // unguarded call in each block, and add successors of dirty blocks to the 318 // DirtySuccessors list. 319 for (MachineBasicBlock &MBB : MF) 320 processBasicBlock(MBB); 321 322 // If any YMM/ZMM regs are live-in to this function, add the entry block to 323 // the DirtySuccessors list 324 if (FnHasLiveInYmmOrZmm) 325 addDirtySuccessor(MF.front()); 326 327 // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add 328 // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY 329 // through PASS_THROUGH blocks. 330 while (!DirtySuccessors.empty()) { 331 MachineBasicBlock &MBB = *DirtySuccessors.back(); 332 DirtySuccessors.pop_back(); 333 BlockState &BBState = BlockStates[MBB.getNumber()]; 334 335 // MBB is a successor of a dirty block, so its first call needs to be 336 // guarded. 337 if (BBState.FirstUnguardedCall != MBB.end()) 338 insertVZeroUpper(BBState.FirstUnguardedCall, MBB); 339 340 // If this successor was a pass-through block, then it is now dirty. Its 341 // successors need to be added to the worklist (if they haven't been 342 // already). 343 if (BBState.ExitState == PASS_THROUGH) { 344 LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber() 345 << " was Pass-through, is now Dirty-out.\n"); 346 for (MachineBasicBlock *Succ : MBB.successors()) 347 addDirtySuccessor(*Succ); 348 } 349 } 350 351 BlockStates.clear(); 352 return EverMadeChange; 353 } 354