//====- X86FlagsCopyLowering.cpp - Lowers COPY nodes of EFLAGS ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Lowers COPY nodes of EFLAGS by directly extracting and preserving individual
/// flag bits.
///
/// We have to do this by carefully analyzing and rewriting the usage of the
/// copied EFLAGS register because there is no general way to rematerialize the
/// entire EFLAGS register safely and efficiently. Using `popf` both forces
/// dynamic stack adjustment and can create correctness issues due to IF, TF,
/// and other non-status flags being overwritten. Sequences involving SAHF
/// don't work on all x86 processors and are often quite slow compared to
/// directly testing a single status flag preserved in its own GPR.
///
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define PASS_KEY "x86-flags-copy-lowering"
#define DEBUG_TYPE PASS_KEY

STATISTIC(NumCopiesEliminated, "Number of copies of EFLAGS eliminated");
STATISTIC(NumSetCCsInserted, "Number of setCC instructions inserted");
STATISTIC(NumTestsInserted, "Number of test instructions inserted");
STATISTIC(NumAddsInserted, "Number of adds instructions inserted");

namespace llvm {

void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);

} // end namespace llvm

namespace {

// Convenient array type for storing registers associated with each condition.
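// The array is indexed by X86::CondCode; a zero entry means no register has
// been allocated to hold that condition yet.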
using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;

class X86FlagsCopyLoweringPass : public MachineFunctionPass {
public:
  X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {
    initializeX86FlagsCopyLoweringPassPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI;
  const X86Subtarget *Subtarget;
  const X86InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const TargetRegisterClass *PromoteRC;
  MachineDominatorTree *MDT;

  CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator CopyDefI);

  unsigned promoteCondToReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator TestPos,
                            DebugLoc TestLoc, X86::CondCode Cond);
  std::pair<unsigned, bool>
  getCondOrInverseInReg(MachineBasicBlock &TestMBB,
                        MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                        X86::CondCode Cond, CondRegArray &CondRegs);
  void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
                  DebugLoc Loc, unsigned Reg);

  void rewriteArithmetic(MachineBasicBlock &TestMBB,
                         MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                         MachineInstr &MI, MachineOperand &FlagUse,
                         CondRegArray &CondRegs);
  void rewriteCMov(MachineBasicBlock &TestMBB,
                   MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                   MachineInstr &CMovI, MachineOperand &FlagUse,
                   CondRegArray &CondRegs);
  void rewriteCondJmp(MachineBasicBlock &TestMBB,
                      MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                      MachineInstr &JmpI, CondRegArray &CondRegs);
  void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
                   MachineInstr &CopyDefI);
  void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
                               MachineBasicBlock::iterator TestPos,
                               DebugLoc TestLoc, MachineInstr &SetBI,
                               MachineOperand &FlagUse, CondRegArray &CondRegs);
  void rewriteSetCC(MachineBasicBlock &TestMBB,
                    MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
                    MachineInstr &SetCCI, MachineOperand &FlagUse,
                    CondRegArray &CondRegs);
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(X86FlagsCopyLoweringPass, DEBUG_TYPE,
                      "X86 EFLAGS copy lowering", false, false)
INITIALIZE_PASS_END(X86FlagsCopyLoweringPass, DEBUG_TYPE,
                    "X86 EFLAGS copy lowering", false, false)

FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
  return new X86FlagsCopyLoweringPass();
}

char X86FlagsCopyLoweringPass::ID = 0;

void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineDominatorTree>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

namespace {
/// An enumeration of the arithmetic instruction mnemonics which have
/// interesting flag semantics.
///
/// We can map instruction opcodes into these mnemonics to make it easy to
/// dispatch with specific functionality.
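///
/// All of these mnemonics *read* CF (or, for ADOX, OF) in addition to writing
/// flags, which is why they need special handling when the flags they consume
/// come from a lowered EFLAGS copy.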
enum class FlagArithMnemonic {
  ADC,
  ADCX,
  ADOX,
  RCL,
  RCR,
  SBB,
};
} // namespace

static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    report_fatal_error("No support for lowering a copy into EFLAGS when used "
                       "by this instruction!");

#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \
  case X86::MNEMONIC##8##SUFFIX:                  \
  case X86::MNEMONIC##16##SUFFIX:                 \
  case X86::MNEMONIC##32##SUFFIX:                 \
  case X86::MNEMONIC##64##SUFFIX:

#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)     \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)     \
  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)     \
  case X86::MNEMONIC##8ri:                  \
  case X86::MNEMONIC##16ri8:                \
  case X86::MNEMONIC##32ri8:                \
  case X86::MNEMONIC##64ri8:                \
  case X86::MNEMONIC##16ri:                 \
  case X86::MNEMONIC##32ri:                 \
  case X86::MNEMONIC##64ri32:               \
  case X86::MNEMONIC##8mi:                  \
  case X86::MNEMONIC##16mi8:                \
  case X86::MNEMONIC##32mi8:                \
  case X86::MNEMONIC##64mi8:                \
  case X86::MNEMONIC##16mi:                 \
  case X86::MNEMONIC##32mi:                 \
  case X86::MNEMONIC##64mi32:               \
  case X86::MNEMONIC##8i8:                  \
  case X86::MNEMONIC##16i16:                \
  case X86::MNEMONIC##32i32:                \
  case X86::MNEMONIC##64i32:

    LLVM_EXPAND_ADC_SBB_INSTR(ADC)
    return FlagArithMnemonic::ADC;

    LLVM_EXPAND_ADC_SBB_INSTR(SBB)
    return FlagArithMnemonic::SBB;

#undef LLVM_EXPAND_ADC_SBB_INSTR

    LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
    LLVM_EXPAND_INSTR_SIZES(RCL, r1)
    LLVM_EXPAND_INSTR_SIZES(RCL, ri)
    return FlagArithMnemonic::RCL;

    LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
    LLVM_EXPAND_INSTR_SIZES(RCR, r1)
    LLVM_EXPAND_INSTR_SIZES(RCR, ri)
    return FlagArithMnemonic::RCR;

#undef LLVM_EXPAND_INSTR_SIZES

  case X86::ADCX32rr:
  case X86::ADCX64rr:
  case X86::ADCX32rm:
  case X86::ADCX64rm:
    return FlagArithMnemonic::ADCX;

  case X86::ADOX32rr:
  case X86::ADOX64rr:
  case X86::ADOX32rm:
  case X86::ADOX64rm:
    return FlagArithMnemonic::ADOX;
  }
}

static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
                                     MachineInstr &SplitI,
                                     const X86InstrInfo &TII) {
  MachineFunction &MF = *MBB.getParent();

  assert(SplitI.getParent() == &MBB &&
         "Split instruction must be in the split block!");
  assert(SplitI.isBranch() &&
         "Only designed to split a tail of branch instructions!");
  assert(X86::getCondFromBranchOpc(SplitI.getOpcode()) != X86::COND_INVALID &&
         "Must split on an actual jCC instruction!");

  // Dig out the previous instruction to the split point.
  MachineInstr &PrevI = *std::prev(SplitI.getIterator());
  assert(PrevI.isBranch() && "Must split after a branch!");
  assert(X86::getCondFromBranchOpc(PrevI.getOpcode()) != X86::COND_INVALID &&
         "Must split after an actual jCC instruction!");
  assert(!std::prev(PrevI.getIterator())->isTerminator() &&
         "Must only have this one terminator prior to the split!");

  // Grab the one successor edge that will stay in `MBB`.
  MachineBasicBlock &UnsplitSucc = *PrevI.getOperand(0).getMBB();

  // Analyze the original block to see if we are actually splitting an edge
  // into two edges. This can happen when we have multiple conditional jumps to
  // the same successor.
  bool IsEdgeSplit =
      std::any_of(SplitI.getIterator(), MBB.instr_end(),
                  [&](MachineInstr &MI) {
                    assert(MI.isTerminator() &&
                           "Should only have spliced terminators!");
                    return llvm::any_of(
                        MI.operands(), [&](MachineOperand &MOp) {
                          return MOp.isMBB() && MOp.getMBB() == &UnsplitSucc;
                        });
                  }) ||
      MBB.getFallThrough() == &UnsplitSucc;

  MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();

  // Insert the new block immediately after the current one. Any existing
  // fallthrough will be sunk into this new block anyway.
  MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);

  // Splice the tail of instructions into the new block.
  NewMBB.splice(NewMBB.end(), &MBB, SplitI.getIterator(), MBB.end());

  // Copy the necessary successors (and their probability info) into the new
  // block.
  for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI)
    if (IsEdgeSplit || *SI != &UnsplitSucc)
      NewMBB.copySuccessor(&MBB, SI);
  // Normalize the probabilities if we didn't end up splitting the edge.
  if (!IsEdgeSplit)
    NewMBB.normalizeSuccProbs();

  // Now replace all of the moved successors in the original block with the new
  // block. This will merge their probabilities.
  for (MachineBasicBlock *Succ : NewMBB.successors())
    if (Succ != &UnsplitSucc)
      MBB.replaceSuccessor(Succ, &NewMBB);

  // We should always end up replacing at least one successor.
  assert(MBB.isSuccessor(&NewMBB) &&
         "Failed to make the new block a successor!");

  // Now update all the PHIs.
  for (MachineBasicBlock *Succ : NewMBB.successors()) {
    for (MachineInstr &MI : *Succ) {
      if (!MI.isPHI())
        break;

      for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
           OpIdx += 2) {
        MachineOperand &OpV = MI.getOperand(OpIdx);
        MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
        assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
        if (OpMBB.getMBB() != &MBB)
          continue;

        // Replace the operand for unsplit successors.
        if (!IsEdgeSplit || Succ != &UnsplitSucc) {
          OpMBB.setMBB(&NewMBB);

          // We have to continue scanning as there may be multiple entries in
          // the PHI.
          continue;
        }

        // When we have split the edge, append a new successor.
        MI.addOperand(MF, OpV);
        MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
        break;
      }
    }
  }

  return NewMBB;
}

bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << " **********\n");

  Subtarget = &MF.getSubtarget<X86Subtarget>();
  MRI = &MF.getRegInfo();
  TII = Subtarget->getInstrInfo();
  TRI = Subtarget->getRegisterInfo();
  MDT = &getAnalysis<MachineDominatorTree>();
  PromoteRC = &X86::GR8RegClass;

  if (MF.begin() == MF.end())
    // Nothing to do for a degenerate empty function...
    return false;

  // Collect the copies in RPO so that when there are chains where a copy is in
  // turn copied again we visit the first one first. This ensures we can find
  // viable locations for testing the original EFLAGS that dominate all the
  // uses across complex CFGs.
  SmallVector<MachineInstr *, 4> Copies;
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT)
    for (MachineInstr &MI : *MBB)
      if (MI.getOpcode() == TargetOpcode::COPY &&
          MI.getOperand(0).getReg() == X86::EFLAGS)
        Copies.push_back(&MI);

  for (MachineInstr *CopyI : Copies) {
    MachineBasicBlock &MBB = *CopyI->getParent();

    MachineOperand &VOp = CopyI->getOperand(1);
    assert(VOp.isReg() &&
           "The input to the copy for EFLAGS should always be a register!");
    MachineInstr &CopyDefI = *MRI->getVRegDef(VOp.getReg());
    if (CopyDefI.getOpcode() != TargetOpcode::COPY) {
      // FIXME: The big likely candidates here are PHI nodes. We could in
      // theory handle PHI nodes, but it gets really, really hard. Insanely
      // hard. Hard enough that it is probably better to change every other
      // part of LLVM to avoid creating them. The issue is that once we have
      // PHIs we won't know which original EFLAGS value we need to capture
      // with our setCCs below. The end result will be computing a complete
      // set of setCCs that we *might* want, computing them in every place
      // where we copy *out* of EFLAGS and then doing SSA formation on all of
      // them to insert necessary PHI nodes and consume those here. Then
      // hoping that somehow we DCE the unnecessary ones. This DCE seems very
      // unlikely to be successful and so we will almost certainly end up with
      // a glut of dead setCC instructions. Until we have a motivating test
      // case and fail to avoid it by changing other parts of LLVM's lowering,
      // we refuse to handle this complex case here.
      LLVM_DEBUG(
          dbgs() << "ERROR: Encountered unexpected def of an eflags copy: ";
          CopyDefI.dump());
      report_fatal_error(
          "Cannot lower EFLAGS copy unless it is defined in turn by a copy!");
    }

    auto Cleanup = make_scope_exit([&] {
      // All uses of the EFLAGS copy are now rewritten; kill the copy into
      // EFLAGS and, if it is now dead, the copy it reads from.
      CopyI->eraseFromParent();
      if (MRI->use_empty(CopyDefI.getOperand(0).getReg()))
        CopyDefI.eraseFromParent();
      ++NumCopiesEliminated;
    });

    MachineOperand &DOp = CopyI->getOperand(0);
    assert(DOp.isDef() && "Expected register def!");
    assert(DOp.getReg() == X86::EFLAGS && "Unexpected copy def register!");
    if (DOp.isDead())
      continue;

    MachineBasicBlock *TestMBB = CopyDefI.getParent();
    auto TestPos = CopyDefI.getIterator();
    DebugLoc TestLoc = CopyDefI.getDebugLoc();

    LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());

    // Walk up across live-in EFLAGS to find where they were actually def'ed.
    //
    // This copy's def may just be part of a region of blocks covered by
    // a single def of EFLAGS and we want to find the top of that region where
    // possible.
    //
    // This is essentially a search for a *candidate* reaching definition
    // location. We don't need to ever find the actual reaching definition
    // here, but we want to walk up the dominator tree to find the highest
    // point which would be viable for such a definition.
    auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
                                MachineBasicBlock::iterator End) {
      // Scan backwards as we expect these to be relatively short and often
      // find a clobber near the end.
      return llvm::any_of(
          llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
            // Flag any instruction (other than the copy we are
            // currently rewriting) that defs EFLAGS.
            return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
          });
    };
    auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
                                    MachineBasicBlock *EndMBB) {
      assert(MDT->dominates(BeginMBB, EndMBB) &&
             "Only support paths down the dominator tree!");
      SmallPtrSet<MachineBasicBlock *, 4> Visited;
      SmallVector<MachineBasicBlock *, 4> Worklist;
      // We terminate at the beginning. No need to scan it.
      Visited.insert(BeginMBB);
      Worklist.push_back(EndMBB);
      do {
        auto *MBB = Worklist.pop_back_val();
        for (auto *PredMBB : MBB->predecessors()) {
          if (!Visited.insert(PredMBB).second)
            continue;
          if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
            return true;
          // Enqueue this block to walk its predecessors.
          Worklist.push_back(PredMBB);
        }
      } while (!Worklist.empty());
      // No clobber found along a path from the begin to end.
      return false;
    };
    while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
           !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
      // Find the nearest common dominator of the predecessors, as
      // that will be the best candidate to hoist into.
      MachineBasicBlock *HoistMBB =
          std::accumulate(std::next(TestMBB->pred_begin()),
                          TestMBB->pred_end(), *TestMBB->pred_begin(),
                          [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
                            return MDT->findNearestCommonDominator(LHS, RHS);
                          });

      // Now we need to scan all predecessors that may be reached along paths
      // to the hoist block. A clobber anywhere in any of these blocks blocks
      // the hoist. Note that this even handles loops because we require *no*
      // clobbers.
      if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
        break;

      // We also need the terminators to not sneakily clobber flags.
      if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
                           HoistMBB->instr_end()))
        break;

      // We found a viable location, hoist our test position to it.
      TestMBB = HoistMBB;
      TestPos = TestMBB->getFirstTerminator()->getIterator();
      // Clear the debug location as it would just be confusing after hoisting.
      TestLoc = DebugLoc();
    }
    LLVM_DEBUG({
      auto DefIt = llvm::find_if(
          llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
          [&](MachineInstr &MI) {
            return MI.findRegisterDefOperand(X86::EFLAGS);
          });
      if (DefIt.base() != TestMBB->instr_begin()) {
        dbgs() << "  Using EFLAGS defined by: ";
        DefIt->dump();
      } else {
        dbgs() << "  Using live-in flags for BB:\n";
        TestMBB->dump();
      }
    });

    // While rewriting uses, we buffer jumps and rewrite them in a second pass
    // because doing so will perturb the CFG that we are walking to find the
    // uses in the first place.
    SmallVector<MachineInstr *, 4> JmpIs;

    // Gather the condition flags that have already been preserved in
    // registers. We do this from scratch each time as we expect there to be
    // very few of them and we expect to not revisit the same copy definition
    // many times. If either of those change sufficiently we could build a map
    // of these up front instead.
    CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);

    // Collect the basic blocks we need to scan.
    // Typically this will just be a single basic block, but we may have to
    // scan multiple blocks if the EFLAGS copy lives into successors.
    SmallVector<MachineBasicBlock *, 2> Blocks;
    SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
    Blocks.push_back(&MBB);

    do {
      MachineBasicBlock &UseMBB = *Blocks.pop_back_val();

      // Track if/when we find a kill of the flags in this block.
      bool FlagsKilled = false;

      // In most cases, we walk from the beginning to the end of the block. But
      // when the block is the same block as the copy is from, we will visit it
      // twice. The first time we start from the copy and go to the end. The
      // second time we start from the beginning and go to the copy. This lets
      // us handle copies inside of cycles.
      // FIXME: This loop is *super* confusing. This is at least in part
      // a symptom of all of this routine needing to be refactored into
      // documentable components. Once done, there may be a better way to write
      // this loop.
      for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
                          ? std::next(CopyI->getIterator())
                          : UseMBB.instr_begin(),
                MIE = UseMBB.instr_end();
           MII != MIE;) {
        MachineInstr &MI = *MII++;
        // If we are in the original copy block and encounter either the copy
        // def or the copy itself, break so that we don't re-process any part
        // of the block or process the instructions in the range that was
        // copied over.
        if (&MI == CopyI || &MI == &CopyDefI) {
          assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
                 "Should only encounter these on the second pass over the "
                 "original block.");
          break;
        }

        MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
        if (!FlagUse) {
          if (MI.findRegisterDefOperand(X86::EFLAGS)) {
            // If EFLAGS are defined, it's as-if they were killed. We can stop
            // scanning here.
            //
            // NB!!! Many instructions only modify some flags. LLVM currently
            // models this as clobbering all flags, but if that ever changes
            // this will need to be carefully updated to handle that more
            // complex logic.
            FlagsKilled = true;
            break;
          }
          continue;
        }

        LLVM_DEBUG(dbgs() << "  Rewriting use: "; MI.dump());

        // Check the kill flag before we rewrite as that may change it.
        if (FlagUse->isKill())
          FlagsKilled = true;

        // Once we encounter a branch, the rest of the instructions must also
        // be branches. We can't rewrite in place here, so we handle them
        // below.
        //
        // Note that we don't have to handle tail calls here, even conditional
        // tail calls, as those are not introduced into the X86 MI until
        // post-RA branch folding or block placement. As a consequence, we get
        // to deal with the simpler formulation of conditional branches
        // followed by tail calls.
        if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
          auto JmpIt = MI.getIterator();
          do {
            JmpIs.push_back(&*JmpIt);
            ++JmpIt;
          } while (JmpIt != UseMBB.instr_end() &&
                   X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
                       X86::COND_INVALID);
          break;
        }

        // Otherwise we can just rewrite in-place.
        if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
          rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
                   X86::COND_INVALID) {
          rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
        } else if (MI.getOpcode() == TargetOpcode::COPY) {
          rewriteCopy(MI, *FlagUse, CopyDefI);
        } else {
          // We assume all other instructions that use flags also def them.
          assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
                 "Expected a def of EFLAGS for this instruction!");

          // NB!!! Several arithmetic instructions only *partially* update
          // flags. Theoretically, we could generate MI code sequences that
          // would rely on this fact and observe different flags independently.
          // But currently LLVM models all of these instructions as clobbering
          // all the flags in an undef way. We rely on that to simplify the
          // logic.
          FlagsKilled = true;

          switch (MI.getOpcode()) {
          case X86::SETB_C8r:
          case X86::SETB_C16r:
          case X86::SETB_C32r:
          case X86::SETB_C64r:
            // Use custom lowering for arithmetic that is merely extending the
            // carry flag. We model this as the SETB_C* pseudo instructions.
            rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                                    CondRegs);
            break;

          default:
            // Generically handle remaining uses as arithmetic instructions.
            rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                              CondRegs);
            break;
          }
          break;
        }

        // If this was the last use of the flags, we're done.
        if (FlagsKilled)
          break;
      }

      // If the flags were killed, we're done with this block.
      if (FlagsKilled)
        continue;

      // Otherwise we need to scan successors for ones where the flags live-in
      // and queue those up for processing.
      for (MachineBasicBlock *SuccMBB : UseMBB.successors())
        if (SuccMBB->isLiveIn(X86::EFLAGS) &&
            VisitedBlocks.insert(SuccMBB).second) {
          // We currently don't do any PHI insertion and so we require that the
          // test basic block dominates all of the use basic blocks. Further,
          // we can't have a cycle from the test block back to itself as that
          // would create a cycle requiring a PHI to break it.
          //
          // We could in theory do PHI insertion here if it becomes useful by
          // just taking undef values in along every edge that we don't trace
          // this EFLAGS copy along. This isn't as bad as fully general PHI
          // insertion, but still seems like a great deal of complexity.
          //
          // Because it is theoretically possible that some earlier MI pass or
          // other lowering transformation could induce this to happen, we do
          // a hard check even in non-debug builds here.
          if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
            LLVM_DEBUG({
              dbgs()
                  << "ERROR: Encountered use that is not dominated by our test "
                     "basic block! Rewriting this would require inserting PHI "
                     "nodes to track the flag state across the CFG.\n\nTest "
                     "block:\n";
              TestMBB->dump();
              dbgs() << "Use block:\n";
              SuccMBB->dump();
            });
            report_fatal_error(
                "Cannot lower EFLAGS copy when original copy def "
                "does not dominate all uses.");
          }

          Blocks.push_back(SuccMBB);
        }
    } while (!Blocks.empty());

    // Now rewrite the jumps that use the flags. These we handle specially
    // because if there are multiple jumps in a single basic block we'll have
    // to do surgery on the CFG.
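    // After the first rewritten jump in a block, each subsequent conditional
    // jump gets the tail of the block split off so that it can receive its
    // own TEST of the saved condition register.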
    MachineBasicBlock *LastJmpMBB = nullptr;
    for (MachineInstr *JmpI : JmpIs) {
      // Past the first jump within a basic block we need to split the blocks
      // apart.
      if (JmpI->getParent() == LastJmpMBB)
        splitBlock(*JmpI->getParent(), *JmpI, *TII);
      else
        LastJmpMBB = JmpI->getParent();

      rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
    }

    // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
    // the copy's def operand is itself a kill.
  }

#ifndef NDEBUG
  for (MachineBasicBlock &MBB : MF)
    for (MachineInstr &MI : MBB)
      if (MI.getOpcode() == TargetOpcode::COPY &&
          (MI.getOperand(0).getReg() == X86::EFLAGS ||
           MI.getOperand(1).getReg() == X86::EFLAGS)) {
        LLVM_DEBUG(dbgs() << "ERROR: Found a COPY involving EFLAGS: ";
                   MI.dump());
        llvm_unreachable("Unlowered EFLAGS copy!");
      }
#endif

  return true;
}

/// Collect any conditions that have already been set in registers so that we
/// can re-use them rather than adding duplicates.
CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
  CondRegArray CondRegs = {};

  // Scan backwards across the range of instructions with live EFLAGS.
  for (MachineInstr &MI :
       llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
    X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
    if (Cond != X86::COND_INVALID && !MI.mayStore() &&
        MI.getOperand(0).isReg() &&
        TRI->isVirtualRegister(MI.getOperand(0).getReg())) {
      assert(MI.getOperand(0).isDef() &&
             "A non-storing SETcc should always define a register!");
      CondRegs[Cond] = MI.getOperand(0).getReg();
    }

    // Stop scanning when we see the first definition of the EFLAGS as prior to
    // this we would potentially capture the wrong flag state.
    if (MI.findRegisterDefOperand(X86::EFLAGS))
      break;
  }
  return CondRegs;
}

unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, X86::CondCode Cond) {
  unsigned Reg = MRI->createVirtualRegister(PromoteRC);
  auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
                      TII->get(X86::getSETFromCond(Cond)), Reg);
  (void)SetI;
  LLVM_DEBUG(dbgs() << "    save cond: "; SetI->dump());
  ++NumSetCCsInserted;
  return Reg;
}

std::pair<unsigned, bool> X86FlagsCopyLoweringPass::getCondOrInverseInReg(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, X86::CondCode Cond, CondRegArray &CondRegs) {
  unsigned &CondReg = CondRegs[Cond];
  unsigned &InvCondReg = CondRegs[X86::GetOppositeBranchCondition(Cond)];
  if (!CondReg && !InvCondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  if (CondReg)
    return {CondReg, false};
  else
    return {InvCondReg, true};
}

void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator Pos,
                                          DebugLoc Loc, unsigned Reg) {
  auto TestI =
      BuildMI(MBB, Pos, Loc, TII->get(X86::TEST8rr)).addReg(Reg).addReg(Reg);
  (void)TestI;
  LLVM_DEBUG(dbgs() << "    test cond: "; TestI->dump());
  ++NumTestsInserted;
}

void X86FlagsCopyLoweringPass::rewriteArithmetic(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
    CondRegArray &CondRegs) {
  // Arithmetic is either reading CF or OF. Figure out which condition we need
  // to preserve in a register.
  X86::CondCode Cond;

  // The addend to use to reset CF or OF when added to the flag value.
  int Addend;

  switch (getMnemonicFromOpcode(MI.getOpcode())) {
  case FlagArithMnemonic::ADC:
  case FlagArithMnemonic::ADCX:
  case FlagArithMnemonic::RCL:
  case FlagArithMnemonic::RCR:
  case FlagArithMnemonic::SBB:
    Cond = X86::COND_B; // CF == 1
    // Set up an addend that when one is added will need a carry due to not
    // having a higher bit available.
    Addend = 255;
    break;

  case FlagArithMnemonic::ADOX:
    Cond = X86::COND_O; // OF == 1
    // Set up an addend that when one is added will turn from positive to
    // negative and thus overflow in the signed domain.
    Addend = 127;
    break;
  }

  // Now get a register that contains the value of the flag input to the
  // arithmetic. We require exactly this flag to simplify the arithmetic
  // required to materialize it back into the flag.
  unsigned &CondReg = CondRegs[Cond];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  MachineBasicBlock &MBB = *MI.getParent();

  // Insert an instruction that will set the flag back to the desired value.
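  // Because the saved condition register holds 0 or 1, adding 255 carries out
  // of bit 7 (setting CF) exactly when it holds 1, and adding 127 flips the
  // sign bit (setting OF) exactly when it holds 1.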
  unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
  auto AddI =
      BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
          .addDef(TmpReg, RegState::Dead)
          .addReg(CondReg)
          .addImm(Addend);
  (void)AddI;
  LLVM_DEBUG(dbgs() << "    add cond: "; AddI->dump());
  ++NumAddsInserted;
  FlagUse.setIsKill(true);
}

void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
                                           MachineBasicBlock::iterator TestPos,
                                           DebugLoc TestLoc,
                                           MachineInstr &CMovI,
                                           MachineOperand &FlagUse,
                                           CondRegArray &CondRegs) {
  // First get the register containing this specific condition.
  X86::CondCode Cond = X86::getCondFromCMovOpc(CMovI.getOpcode());
  unsigned CondReg;
  bool Inverted;
  std::tie(CondReg, Inverted) =
      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);

  MachineBasicBlock &MBB = *CMovI.getParent();

  // Insert a direct test of the saved register.
  insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);

  // Rewrite the CMov to use the !ZF flag from the test (but match register
  // size and memory operand), and then kill its use of the flags afterward.
  auto &CMovRC = *MRI->getRegClass(CMovI.getOperand(0).getReg());
  CMovI.setDesc(TII->get(X86::getCMovFromCond(
      Inverted ? X86::COND_E : X86::COND_NE, TRI->getRegSizeInBits(CMovRC) / 8,
      !CMovI.memoperands_empty())));
  FlagUse.setIsKill(true);
  LLVM_DEBUG(dbgs() << "    fixed cmov: "; CMovI.dump());
}

void X86FlagsCopyLoweringPass::rewriteCondJmp(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
  // First get the register containing this specific condition.
  X86::CondCode Cond = X86::getCondFromBranchOpc(JmpI.getOpcode());
  unsigned CondReg;
  bool Inverted;
  std::tie(CondReg, Inverted) =
      getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);

  MachineBasicBlock &JmpMBB = *JmpI.getParent();

  // Insert a direct test of the saved register.
  insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);

  // Rewrite the jump to use the !ZF flag from the test, and kill its use of
  // flags afterward.
  JmpI.setDesc(TII->get(
      X86::GetCondBranchFromCond(Inverted ? X86::COND_E : X86::COND_NE)));
  const int ImplicitEFLAGSOpIdx = 1;
  JmpI.getOperand(ImplicitEFLAGSOpIdx).setIsKill(true);
  LLVM_DEBUG(dbgs() << "    fixed jCC: "; JmpI.dump());
}

void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
                                           MachineOperand &FlagUse,
                                           MachineInstr &CopyDefI) {
  // Just replace this copy with the original copy def.
  MRI->replaceRegWith(MI.getOperand(0).getReg(),
                      CopyDefI.getOperand(0).getReg());
  MI.eraseFromParent();
}

void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
    MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
    DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
    CondRegArray &CondRegs) {
  // This routine is only used to handle pseudos for setting a register to zero
  // or all ones based on CF. This is essentially the sign-extended-from-1-bit
  // form of SETB, modeled with the SETB_C* pseudos. They require special
  // handling as they aren't normal SETcc instructions and are lowered to an
  // EFLAGS-clobbering operation (typically SBB). One simplifying aspect is
  // that they are only provided in reg-defining forms.
  // A complicating factor is that they can define many different register
  // widths.
  assert(SetBI.getOperand(0).isReg() &&
         "Cannot have a non-register defined operand to this variant of SETB!");

  // Little helper to do the common final step of replacing the register def'ed
  // by this SETB instruction with a new register and removing the SETB
  // instruction.
  auto RewriteToReg = [&](unsigned Reg) {
    MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
    SetBI.eraseFromParent();
  };

  // Grab the register class used for this particular instruction.
  auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());

  MachineBasicBlock &MBB = *SetBI.getParent();
  auto SetPos = SetBI.getIterator();
  auto SetLoc = SetBI.getDebugLoc();

  auto AdjustReg = [&](unsigned Reg) {
    auto &OrigRC = *MRI->getRegClass(Reg);
    if (&OrigRC == &SetBRC)
      return Reg;

    unsigned NewReg;

    int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
    int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
    assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
    assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
    int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
                       X86::NoSubRegister, X86::sub_32bit};

    // If the original size is smaller than the target *and* is smaller than 4
    // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
    // to maximize the chance of being able to CSE that operation and to avoid
    // partial dependency stalls extending to 2-bytes.
    if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
      NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
      BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
          .addReg(Reg);
      if (&SetBRC == &X86::GR32RegClass)
        return NewReg;
      Reg = NewReg;
      OrigRegSize = 4;
    }

    NewReg = MRI->createVirtualRegister(&SetBRC);
    if (OrigRegSize < TargetRegSize) {
      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
              NewReg)
          .addImm(0)
          .addReg(Reg)
          .addImm(SubRegIdx[OrigRegSize]);
    } else if (OrigRegSize > TargetRegSize) {
      if (TargetRegSize == 1 && !Subtarget->is64Bit()) {
        // Need to constrain the register class.
        MRI->constrainRegClass(Reg, &X86::GR32_ABCDRegClass);
      }

      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
          .addReg(Reg, 0, SubRegIdx[TargetRegSize]);
    } else {
      BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
          .addReg(Reg);
    }
    return NewReg;
  };

  unsigned &CondReg = CondRegs[X86::COND_B];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);

  // Adjust the condition to have the desired register width by zero-extending
  // as needed.
  // FIXME: We should use a better API to avoid the local reference and using a
  // different variable here.
  unsigned ExtCondReg = AdjustReg(CondReg);

  // Now we need to turn this into a bitmask. We do this by subtracting it from
  // zero.
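  // Subtracting 1 from 0 borrows through every bit and yields all-ones, while
  // subtracting 0 leaves 0, which reproduces the SETB_C semantics of
  // broadcasting CF into the full register.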
  unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
  ZeroReg = AdjustReg(ZeroReg);

  unsigned Sub;
  switch (SetBI.getOpcode()) {
  case X86::SETB_C8r:
    Sub = X86::SUB8rr;
    break;

  case X86::SETB_C16r:
    Sub = X86::SUB16rr;
    break;

  case X86::SETB_C32r:
    Sub = X86::SUB32rr;
    break;

  case X86::SETB_C64r:
    Sub = X86::SUB64rr;
    break;

  default:
    llvm_unreachable("Invalid SETB_C* opcode!");
  }
  unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
  BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
      .addReg(ZeroReg)
      .addReg(ExtCondReg);
  return RewriteToReg(ResultReg);
}

void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
                                            MachineBasicBlock::iterator TestPos,
                                            DebugLoc TestLoc,
                                            MachineInstr &SetCCI,
                                            MachineOperand &FlagUse,
                                            CondRegArray &CondRegs) {
  X86::CondCode Cond = X86::getCondFromSETOpc(SetCCI.getOpcode());
  // Note that we can't usefully rewrite this to the inverse without complex
  // analysis of the users of the setCC. Largely we rely on any avoidable
  // duplicates having already been avoided before we get here.
  unsigned &CondReg = CondRegs[Cond];
  if (!CondReg)
    CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);

  // Rewriting a register def is trivial: we just replace the register and
  // remove the setcc.
  if (!SetCCI.mayStore()) {
    assert(SetCCI.getOperand(0).isReg() &&
           "Cannot have a non-register defined operand to SETcc!");
    MRI->replaceRegWith(SetCCI.getOperand(0).getReg(), CondReg);
    SetCCI.eraseFromParent();
    return;
  }

  // Otherwise, we need to emit a store.
  auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
                     SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
  // Copy the address operands.
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(SetCCI.getOperand(i));

  MIB.addReg(CondReg);

  MIB->setMemRefs(SetCCI.memoperands_begin(), SetCCI.memoperands_end());

  SetCCI.eraseFromParent();
  return;
}