1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file contains an optimization for div and rem on architectures that 11 // execute short instructions significantly faster than longer instructions. 12 // For example, on Intel Atom 32-bit divides are slow enough that during 13 // runtime it is profitable to check the value of the operands, and if they are 14 // positive and less than 256 use an unsigned 8-bit divide. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "llvm/Transforms/Utils/BypassSlowDivision.h" 19 #include "llvm/ADT/DenseMap.h" 20 #include "llvm/ADT/None.h" 21 #include "llvm/ADT/Optional.h" 22 #include "llvm/ADT/STLExtras.h" 23 #include "llvm/ADT/SmallPtrSet.h" 24 #include "llvm/Transforms/Utils/Local.h" 25 #include "llvm/Analysis/ValueTracking.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/Constants.h" 28 #include "llvm/IR/DerivedTypes.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/IRBuilder.h" 31 #include "llvm/IR/Instruction.h" 32 #include "llvm/IR/Instructions.h" 33 #include "llvm/IR/Module.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/IR/Value.h" 36 #include "llvm/Support/Casting.h" 37 #include "llvm/Support/KnownBits.h" 38 #include <cassert> 39 #include <cstdint> 40 41 using namespace llvm; 42 43 #define DEBUG_TYPE "bypass-slow-division" 44 45 namespace { 46 47 struct QuotRemPair { 48 Value *Quotient; 49 Value *Remainder; 50 51 QuotRemPair(Value *InQuotient, Value *InRemainder) 52 : Quotient(InQuotient), Remainder(InRemainder) {} 53 }; 54 55 /// A quotient and remainder, plus a BB from which they logically "originate". 56 /// If you use Quotient or Remainder in a Phi node, you should use BB as its 57 /// corresponding predecessor. 58 struct QuotRemWithBB { 59 BasicBlock *BB = nullptr; 60 Value *Quotient = nullptr; 61 Value *Remainder = nullptr; 62 }; 63 64 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; 65 using BypassWidthsTy = DenseMap<unsigned, unsigned>; 66 using VisitedSetTy = SmallPtrSet<Instruction *, 4>; 67 68 enum ValueRange { 69 /// Operand definitely fits into BypassType. No runtime checks are needed. 70 VALRNG_KNOWN_SHORT, 71 /// A runtime check is required, as value range is unknown. 72 VALRNG_UNKNOWN, 73 /// Operand is unlikely to fit into BypassType. The bypassing should be 74 /// disabled. 75 VALRNG_LIKELY_LONG 76 }; 77 78 class FastDivInsertionTask { 79 bool IsValidTask = false; 80 Instruction *SlowDivOrRem = nullptr; 81 IntegerType *BypassType = nullptr; 82 BasicBlock *MainBB = nullptr; 83 84 bool isHashLikeValue(Value *V, VisitedSetTy &Visited); 85 ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); 86 QuotRemWithBB createSlowBB(BasicBlock *Successor); 87 QuotRemWithBB createFastBB(BasicBlock *Successor); 88 QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, 89 BasicBlock *PhiBB); 90 Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); 91 Optional<QuotRemPair> insertFastDivAndRem(); 92 93 bool isSignedOp() { 94 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 95 SlowDivOrRem->getOpcode() == Instruction::SRem; 96 } 97 98 bool isDivisionOp() { 99 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 100 SlowDivOrRem->getOpcode() == Instruction::UDiv; 101 } 102 103 Type *getSlowType() { return SlowDivOrRem->getType(); } 104 105 public: 106 FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); 107 108 Value *getReplacement(DivCacheTy &Cache); 109 }; 110 111 } // end anonymous namespace 112 113 FastDivInsertionTask::FastDivInsertionTask(Instruction *I, 114 const BypassWidthsTy &BypassWidths) { 115 switch (I->getOpcode()) { 116 case Instruction::UDiv: 117 case Instruction::SDiv: 118 case Instruction::URem: 119 case Instruction::SRem: 120 SlowDivOrRem = I; 121 break; 122 default: 123 // I is not a div/rem operation. 124 return; 125 } 126 127 // Skip division on vector types. Only optimize integer instructions. 128 IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); 129 if (!SlowType) 130 return; 131 132 // Skip if this bitwidth is not bypassed. 133 auto BI = BypassWidths.find(SlowType->getBitWidth()); 134 if (BI == BypassWidths.end()) 135 return; 136 137 // Get type for div/rem instruction with bypass bitwidth. 138 IntegerType *BT = IntegerType::get(I->getContext(), BI->second); 139 BypassType = BT; 140 141 // The original basic block. 142 MainBB = I->getParent(); 143 144 // The instruction is indeed a slow div or rem operation. 145 IsValidTask = true; 146 } 147 148 /// Reuses previously-computed dividend or remainder from the current BB if 149 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to 150 /// perform the optimization and caches the resulting dividend and remainder. 151 /// If no replacement can be generated, nullptr is returned. 152 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { 153 // First, make sure that the task is valid. 154 if (!IsValidTask) 155 return nullptr; 156 157 // Then, look for a value in Cache. 158 Value *Dividend = SlowDivOrRem->getOperand(0); 159 Value *Divisor = SlowDivOrRem->getOperand(1); 160 DivRemMapKey Key(isSignedOp(), Dividend, Divisor); 161 auto CacheI = Cache.find(Key); 162 163 if (CacheI == Cache.end()) { 164 // If previous instance does not exist, try to insert fast div. 165 Optional<QuotRemPair> OptResult = insertFastDivAndRem(); 166 // Bail out if insertFastDivAndRem has failed. 167 if (!OptResult) 168 return nullptr; 169 CacheI = Cache.insert({Key, *OptResult}).first; 170 } 171 172 QuotRemPair &Value = CacheI->second; 173 return isDivisionOp() ? Value.Quotient : Value.Remainder; 174 } 175 176 /// Check if a value looks like a hash. 177 /// 178 /// The routine is expected to detect values computed using the most common hash 179 /// algorithms. Typically, hash computations end with one of the following 180 /// instructions: 181 /// 182 /// 1) MUL with a constant wider than BypassType 183 /// 2) XOR instruction 184 /// 185 /// And even if we are wrong and the value is not a hash, it is still quite 186 /// unlikely that such values will fit into BypassType. 187 /// 188 /// To detect string hash algorithms like FNV we have to look through PHI-nodes. 189 /// It is implemented as a depth-first search for values that look neither long 190 /// nor hash-like. 191 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { 192 Instruction *I = dyn_cast<Instruction>(V); 193 if (!I) 194 return false; 195 196 switch (I->getOpcode()) { 197 case Instruction::Xor: 198 return true; 199 case Instruction::Mul: { 200 // After Constant Hoisting pass, long constants may be represented as 201 // bitcast instructions. As a result, some constants may look like an 202 // instruction at first, and an additional check is necessary to find out if 203 // an operand is actually a constant. 204 Value *Op1 = I->getOperand(1); 205 ConstantInt *C = dyn_cast<ConstantInt>(Op1); 206 if (!C && isa<BitCastInst>(Op1)) 207 C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); 208 return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); 209 } 210 case Instruction::PHI: 211 // Stop IR traversal in case of a crazy input code. This limits recursion 212 // depth. 213 if (Visited.size() >= 16) 214 return false; 215 // Do not visit nodes that have been visited already. We return true because 216 // it means that we couldn't find any value that doesn't look hash-like. 217 if (Visited.find(I) != Visited.end()) 218 return true; 219 Visited.insert(I); 220 return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { 221 // Ignore undef values as they probably don't affect the division 222 // operands. 223 return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || 224 isa<UndefValue>(V); 225 }); 226 default: 227 return false; 228 } 229 } 230 231 /// Check if an integer value fits into our bypass type. 232 ValueRange FastDivInsertionTask::getValueRange(Value *V, 233 VisitedSetTy &Visited) { 234 unsigned ShortLen = BypassType->getBitWidth(); 235 unsigned LongLen = V->getType()->getIntegerBitWidth(); 236 237 assert(LongLen > ShortLen && "Value type must be wider than BypassType"); 238 unsigned HiBits = LongLen - ShortLen; 239 240 const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); 241 KnownBits Known(LongLen); 242 243 computeKnownBits(V, Known, DL); 244 245 if (Known.countMinLeadingZeros() >= HiBits) 246 return VALRNG_KNOWN_SHORT; 247 248 if (Known.countMaxLeadingZeros() < HiBits) 249 return VALRNG_LIKELY_LONG; 250 251 // Long integer divisions are often used in hashtable implementations. It's 252 // not worth bypassing such divisions because hash values are extremely 253 // unlikely to have enough leading zeros. The call below tries to detect 254 // values that are unlikely to fit BypassType (including hashes). 255 if (isHashLikeValue(V, Visited)) 256 return VALRNG_LIKELY_LONG; 257 258 return VALRNG_UNKNOWN; 259 } 260 261 /// Add new basic block for slow div and rem operations and put it before 262 /// SuccessorBB. 263 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { 264 QuotRemWithBB DivRemPair; 265 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 266 MainBB->getParent(), SuccessorBB); 267 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 268 269 Value *Dividend = SlowDivOrRem->getOperand(0); 270 Value *Divisor = SlowDivOrRem->getOperand(1); 271 272 if (isSignedOp()) { 273 DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); 274 DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); 275 } else { 276 DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); 277 DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); 278 } 279 280 Builder.CreateBr(SuccessorBB); 281 return DivRemPair; 282 } 283 284 /// Add new basic block for fast div and rem operations and put it before 285 /// SuccessorBB. 286 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { 287 QuotRemWithBB DivRemPair; 288 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 289 MainBB->getParent(), SuccessorBB); 290 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 291 292 Value *Dividend = SlowDivOrRem->getOperand(0); 293 Value *Divisor = SlowDivOrRem->getOperand(1); 294 Value *ShortDivisorV = 295 Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); 296 Value *ShortDividendV = 297 Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); 298 299 // udiv/urem because this optimization only handles positive numbers. 300 Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); 301 Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); 302 DivRemPair.Quotient = 303 Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); 304 DivRemPair.Remainder = 305 Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); 306 Builder.CreateBr(SuccessorBB); 307 308 return DivRemPair; 309 } 310 311 /// Creates Phi nodes for result of Div and Rem. 312 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, 313 QuotRemWithBB &RHS, 314 BasicBlock *PhiBB) { 315 IRBuilder<> Builder(PhiBB, PhiBB->begin()); 316 PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); 317 QuoPhi->addIncoming(LHS.Quotient, LHS.BB); 318 QuoPhi->addIncoming(RHS.Quotient, RHS.BB); 319 PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); 320 RemPhi->addIncoming(LHS.Remainder, LHS.BB); 321 RemPhi->addIncoming(RHS.Remainder, RHS.BB); 322 return QuotRemPair(QuoPhi, RemPhi); 323 } 324 325 /// Creates a runtime check to test whether both the divisor and dividend fit 326 /// into BypassType. The check is inserted at the end of MainBB. True return 327 /// value means that the operands fit. Either of the operands may be NULL if it 328 /// doesn't need a runtime check. 329 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { 330 assert((Op1 || Op2) && "Nothing to check"); 331 IRBuilder<> Builder(MainBB, MainBB->end()); 332 333 Value *OrV; 334 if (Op1 && Op2) 335 OrV = Builder.CreateOr(Op1, Op2); 336 else 337 OrV = Op1 ? Op1 : Op2; 338 339 // BitMask is inverted to check if the operands are 340 // larger than the bypass type 341 uint64_t BitMask = ~BypassType->getBitMask(); 342 Value *AndV = Builder.CreateAnd(OrV, BitMask); 343 344 // Compare operand values 345 Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); 346 return Builder.CreateICmpEQ(AndV, ZeroV); 347 } 348 349 /// Substitutes the div/rem instruction with code that checks the value of the 350 /// operands and uses a shorter-faster div/rem instruction when possible. 351 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { 352 Value *Dividend = SlowDivOrRem->getOperand(0); 353 Value *Divisor = SlowDivOrRem->getOperand(1); 354 355 VisitedSetTy SetL; 356 ValueRange DividendRange = getValueRange(Dividend, SetL); 357 if (DividendRange == VALRNG_LIKELY_LONG) 358 return None; 359 360 VisitedSetTy SetR; 361 ValueRange DivisorRange = getValueRange(Divisor, SetR); 362 if (DivisorRange == VALRNG_LIKELY_LONG) 363 return None; 364 365 bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); 366 bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); 367 368 if (DividendShort && DivisorShort) { 369 // If both operands are known to be short then just replace the long 370 // division with a short one in-place. Since we're not introducing control 371 // flow in this case, narrowing the division is always a win, even if the 372 // divisor is a constant (and will later get replaced by a multiplication). 373 374 IRBuilder<> Builder(SlowDivOrRem); 375 Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); 376 Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); 377 Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); 378 Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); 379 Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); 380 Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); 381 return QuotRemPair(ExtDiv, ExtRem); 382 } 383 384 if (isa<ConstantInt>(Divisor)) { 385 // If the divisor is not a constant, DAGCombiner will convert it to a 386 // multiplication by a magic constant. It isn't clear if it is worth 387 // introducing control flow to get a narrower multiply. 388 return None; 389 } 390 391 // After Constant Hoisting pass, long constants may be represented as 392 // bitcast instructions. As a result, some constants may look like an 393 // instruction at first, and an additional check is necessary to find out if 394 // an operand is actually a constant. 395 if (auto *BCI = dyn_cast<BitCastInst>(Divisor)) 396 if (BCI->getParent() == SlowDivOrRem->getParent() && 397 isa<ConstantInt>(BCI->getOperand(0))) 398 return None; 399 400 if (DividendShort && !isSignedOp()) { 401 // If the division is unsigned and Dividend is known to be short, then 402 // either 403 // 1) Divisor is less or equal to Dividend, and the result can be computed 404 // with a short division. 405 // 2) Divisor is greater than Dividend. In this case, no division is needed 406 // at all: The quotient is 0 and the remainder is equal to Dividend. 407 // 408 // So instead of checking at runtime whether Divisor fits into BypassType, 409 // we emit a runtime check to differentiate between these two cases. This 410 // lets us entirely avoid a long div. 411 412 // Split the basic block before the div/rem. 413 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 414 // Remove the unconditional branch from MainBB to SuccessorBB. 415 MainBB->getInstList().back().eraseFromParent(); 416 QuotRemWithBB Long; 417 Long.BB = MainBB; 418 Long.Quotient = ConstantInt::get(getSlowType(), 0); 419 Long.Remainder = Dividend; 420 QuotRemWithBB Fast = createFastBB(SuccessorBB); 421 QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); 422 IRBuilder<> Builder(MainBB, MainBB->end()); 423 Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); 424 Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); 425 return Result; 426 } else { 427 // General case. Create both slow and fast div/rem pairs and choose one of 428 // them at runtime. 429 430 // Split the basic block before the div/rem. 431 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 432 // Remove the unconditional branch from MainBB to SuccessorBB. 433 MainBB->getInstList().back().eraseFromParent(); 434 QuotRemWithBB Fast = createFastBB(SuccessorBB); 435 QuotRemWithBB Slow = createSlowBB(SuccessorBB); 436 QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); 437 Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, 438 DivisorShort ? nullptr : Divisor); 439 IRBuilder<> Builder(MainBB, MainBB->end()); 440 Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); 441 return Result; 442 } 443 } 444 445 /// This optimization identifies DIV/REM instructions in a BB that can be 446 /// profitably bypassed and carried out with a shorter, faster divide. 447 bool llvm::bypassSlowDivision(BasicBlock *BB, 448 const BypassWidthsTy &BypassWidths) { 449 DivCacheTy PerBBDivCache; 450 451 bool MadeChange = false; 452 Instruction* Next = &*BB->begin(); 453 while (Next != nullptr) { 454 // We may add instructions immediately after I, but we want to skip over 455 // them. 456 Instruction* I = Next; 457 Next = Next->getNextNode(); 458 459 FastDivInsertionTask Task(I, BypassWidths); 460 if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { 461 I->replaceAllUsesWith(Replacement); 462 I->eraseFromParent(); 463 MadeChange = true; 464 } 465 } 466 467 // Above we eagerly create divs and rems, as pairs, so that we can efficiently 468 // create divrem machine instructions. Now erase any unused divs / rems so we 469 // don't leave extra instructions sitting around. 470 for (auto &KV : PerBBDivCache) 471 for (Value *V : {KV.second.Quotient, KV.second.Remainder}) 472 RecursivelyDeleteTriviallyDeadInstructions(V); 473 474 return MadeChange; 475 } 476