1 //===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines the pass which inserts x86 AVX vzeroupper instructions 11 // before calls to SSE encoded functions. This avoids transition latency 12 // penalty when tranfering control between AVX encoded instructions and old 13 // SSE encoding mode. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #define DEBUG_TYPE "x86-vzeroupper" 18 #include "X86.h" 19 #include "X86InstrInfo.h" 20 #include "llvm/ADT/Statistic.h" 21 #include "llvm/CodeGen/MachineFunctionPass.h" 22 #include "llvm/CodeGen/MachineInstrBuilder.h" 23 #include "llvm/CodeGen/MachineRegisterInfo.h" 24 #include "llvm/CodeGen/Passes.h" 25 #include "llvm/Support/Debug.h" 26 #include "llvm/Support/raw_ostream.h" 27 #include "llvm/Target/TargetInstrInfo.h" 28 using namespace llvm; 29 30 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); 31 32 namespace { 33 struct VZeroUpperInserter : public MachineFunctionPass { 34 static char ID; 35 VZeroUpperInserter() : MachineFunctionPass(ID) {} 36 37 virtual bool runOnMachineFunction(MachineFunction &MF); 38 39 bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); 40 41 virtual const char *getPassName() const { return "X86 vzeroupper inserter";} 42 43 private: 44 const TargetInstrInfo *TII; // Machine instruction info. 45 MachineBasicBlock *MBB; // Current basic block 46 47 // Any YMM register live-in to this function? 48 bool FnHasLiveInYmm; 49 50 // BBState - Contains the state of each MBB: unknown, clean, dirty 51 SmallVector<uint8_t, 8> BBState; 52 53 // BBSolved - Keep track of all MBB which had been already analyzed 54 // and there is no further processing required. 55 BitVector BBSolved; 56 57 // Machine Basic Blocks are classified according this pass: 58 // 59 // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state 60 // until the MBB exit there isn't a instruction using YMM to change 61 // the state to dirty, or one of the incoming predecessors is unknown 62 // and there's not a dirty predecessor between them. 63 // 64 // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have 65 // instructions using YMM and be marked ST_CLEAN, as long as the state 66 // is cleaned by a vzeroupper before any call. 67 // 68 // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a 69 // vzeroupper instruction. 70 // 71 // ST_INIT - Placeholder for an empty state set 72 // 73 enum { 74 ST_UNKNOWN = 0, 75 ST_CLEAN = 1, 76 ST_DIRTY = 2, 77 ST_INIT = 3 78 }; 79 80 // computeState - Given two states, compute the resulting state, in 81 // the following way 82 // 83 // 1) One dirty state yields another dirty state 84 // 2) All states must be clean for the result to be clean 85 // 3) If none above and one unknown, the result state is also unknown 86 // 87 unsigned computeState(unsigned PrevState, unsigned CurState) { 88 if (PrevState == ST_INIT) 89 return CurState; 90 91 if (PrevState == ST_DIRTY || CurState == ST_DIRTY) 92 return ST_DIRTY; 93 94 if (PrevState == ST_CLEAN && CurState == ST_CLEAN) 95 return ST_CLEAN; 96 97 return ST_UNKNOWN; 98 } 99 100 }; 101 char VZeroUpperInserter::ID = 0; 102 } 103 104 FunctionPass *llvm::createX86IssueVZeroUpperPass() { 105 return new VZeroUpperInserter(); 106 } 107 108 static bool isYmmReg(unsigned Reg) { 109 if (Reg >= X86::YMM0 && Reg <= X86::YMM15) 110 return true; 111 112 return false; 113 } 114 115 static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { 116 for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), 117 E = MRI.livein_end(); I != E; ++I) 118 if (isYmmReg(I->first)) 119 return true; 120 121 return false; 122 } 123 124 static bool hasYmmReg(MachineInstr *MI) { 125 for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { 126 const MachineOperand &MO = MI->getOperand(i); 127 if (!MO.isReg()) 128 continue; 129 if (MO.isDebug()) 130 continue; 131 if (isYmmReg(MO.getReg())) 132 return true; 133 } 134 return false; 135 } 136 137 /// runOnMachineFunction - Loop over all of the basic blocks, inserting 138 /// vzero upper instructions before function calls. 139 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { 140 TII = MF.getTarget().getInstrInfo(); 141 MachineRegisterInfo &MRI = MF.getRegInfo(); 142 bool EverMadeChange = false; 143 144 // Fast check: if the function doesn't use any ymm registers, we don't need 145 // to insert any VZEROUPPER instructions. This is constant-time, so it is 146 // cheap in the common case of no ymm use. 147 bool YMMUsed = false; 148 const TargetRegisterClass *RC = X86::VR256RegisterClass; 149 for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); 150 i != e; i++) { 151 if (MRI.isPhysRegUsed(*i)) { 152 YMMUsed = true; 153 break; 154 } 155 } 156 if (!YMMUsed) 157 return EverMadeChange; 158 159 // Pre-compute the existence of any live-in YMM registers to this function 160 FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); 161 162 assert(BBState.empty()); 163 BBState.resize(MF.getNumBlockIDs(), 0); 164 BBSolved.resize(MF.getNumBlockIDs(), 0); 165 166 // Each BB state depends on all predecessors, loop over until everything 167 // converges. (Once we converge, we can implicitly mark everything that is 168 // still ST_UNKNOWN as ST_CLEAN.) 169 while (1) { 170 bool MadeChange = false; 171 172 // Process all basic blocks. 173 for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) 174 MadeChange |= processBasicBlock(MF, *I); 175 176 // If this iteration over the code changed anything, keep iterating. 177 if (!MadeChange) break; 178 EverMadeChange = true; 179 } 180 181 BBState.clear(); 182 BBSolved.clear(); 183 return EverMadeChange; 184 } 185 186 /// processBasicBlock - Loop over all of the instructions in the basic block, 187 /// inserting vzero upper instructions before function calls. 188 bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, 189 MachineBasicBlock &BB) { 190 bool Changed = false; 191 unsigned BBNum = BB.getNumber(); 192 MBB = &BB; 193 194 // Don't process already solved BBs 195 if (BBSolved[BBNum]) 196 return false; // No changes 197 198 // Check the state of all predecessors 199 unsigned EntryState = ST_INIT; 200 for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(), 201 PE = BB.pred_end(); PI != PE; ++PI) { 202 EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]); 203 if (EntryState == ST_DIRTY) 204 break; 205 } 206 207 208 // The entry MBB for the function may set the inital state to dirty if 209 // the function receives any YMM incoming arguments 210 if (MBB == MF.begin()) { 211 EntryState = ST_CLEAN; 212 if (FnHasLiveInYmm) 213 EntryState = ST_DIRTY; 214 } 215 216 // The current state is initialized according to the predecessors 217 unsigned CurState = EntryState; 218 bool BBHasCall = false; 219 220 for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { 221 MachineInstr *MI = I; 222 DebugLoc dl = I->getDebugLoc(); 223 bool isControlFlow = MI->isCall() || MI->isReturn(); 224 225 // Shortcut: don't need to check regular instructions in dirty state. 226 if (!isControlFlow && CurState == ST_DIRTY) 227 continue; 228 229 if (hasYmmReg(MI)) { 230 // We found a ymm-using instruction; this could be an AVX instruction, 231 // or it could be control flow. 232 CurState = ST_DIRTY; 233 continue; 234 } 235 236 // Check for control-flow out of the current function (which might 237 // indirectly execute SSE instructions). 238 if (!isControlFlow) 239 continue; 240 241 BBHasCall = true; 242 243 // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX 244 // registers. This instruction has zero latency. In addition, the processor 245 // changes back to Clean state, after which execution of Intel SSE 246 // instructions or Intel AVX instructions has no transition penalty. Add 247 // the VZEROUPPER instruction before any function call/return that might 248 // execute SSE code. 249 // FIXME: In some cases, we may want to move the VZEROUPPER into a 250 // predecessor block. 251 if (CurState == ST_DIRTY) { 252 // Only insert the VZEROUPPER in case the entry state isn't unknown. 253 // When unknown, only compute the information within the block to have 254 // it available in the exit if possible, but don't change the block. 255 if (EntryState != ST_UNKNOWN) { 256 BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER)); 257 ++NumVZU; 258 } 259 260 // After the inserted VZEROUPPER the state becomes clean again, but 261 // other YMM may appear before other subsequent calls or even before 262 // the end of the BB. 263 CurState = ST_CLEAN; 264 } 265 } 266 267 DEBUG(dbgs() << "MBB #" << BBNum 268 << ", current state: " << CurState << '\n'); 269 270 // A BB can only be considered solved when we both have done all the 271 // necessary transformations, and have computed the exit state. This happens 272 // in two cases: 273 // 1) We know the entry state: this immediately implies the exit state and 274 // all the necessary transformations. 275 // 2) There are no calls, and and a non-call instruction marks this block: 276 // no transformations are necessary, and we know the exit state. 277 if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN)) 278 BBSolved[BBNum] = true; 279 280 if (CurState != BBState[BBNum]) 281 Changed = true; 282 283 BBState[BBNum] = CurState; 284 return Changed; 285 } 286