Home | History | Annotate | Download | only in Utils
      1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file contains an optimization for div and rem on architectures that
     11 // execute short instructions significantly faster than longer instructions.
     12 // For example, on Intel Atom 32-bit divides are slow enough that during
     13 // runtime it is profitable to check the value of the operands, and if they are
     14 // positive and less than 256 use an unsigned 8-bit divide.
     15 //
     16 //===----------------------------------------------------------------------===//
     17 
     18 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
     19 #include "llvm/ADT/DenseMap.h"
     20 #include "llvm/ADT/None.h"
     21 #include "llvm/ADT/Optional.h"
     22 #include "llvm/ADT/STLExtras.h"
     23 #include "llvm/ADT/SmallPtrSet.h"
     24 #include "llvm/Transforms/Utils/Local.h"
     25 #include "llvm/Analysis/ValueTracking.h"
     26 #include "llvm/IR/BasicBlock.h"
     27 #include "llvm/IR/Constants.h"
     28 #include "llvm/IR/DerivedTypes.h"
     29 #include "llvm/IR/Function.h"
     30 #include "llvm/IR/IRBuilder.h"
     31 #include "llvm/IR/Instruction.h"
     32 #include "llvm/IR/Instructions.h"
     33 #include "llvm/IR/Module.h"
     34 #include "llvm/IR/Type.h"
     35 #include "llvm/IR/Value.h"
     36 #include "llvm/Support/Casting.h"
     37 #include "llvm/Support/KnownBits.h"
     38 #include <cassert>
     39 #include <cstdint>
     40 
     41 using namespace llvm;
     42 
     43 #define DEBUG_TYPE "bypass-slow-division"
     44 
     45 namespace {
     46 
     47   struct QuotRemPair {
     48     Value *Quotient;
     49     Value *Remainder;
     50 
     51     QuotRemPair(Value *InQuotient, Value *InRemainder)
     52         : Quotient(InQuotient), Remainder(InRemainder) {}
     53   };
     54 
     55   /// A quotient and remainder, plus a BB from which they logically "originate".
     56   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
     57   /// corresponding predecessor.
     58   struct QuotRemWithBB {
     59     BasicBlock *BB = nullptr;
     60     Value *Quotient = nullptr;
     61     Value *Remainder = nullptr;
     62   };
     63 
     64 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
     65 using BypassWidthsTy = DenseMap<unsigned, unsigned>;
     66 using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
     67 
     68 enum ValueRange {
     69   /// Operand definitely fits into BypassType. No runtime checks are needed.
     70   VALRNG_KNOWN_SHORT,
     71   /// A runtime check is required, as value range is unknown.
     72   VALRNG_UNKNOWN,
     73   /// Operand is unlikely to fit into BypassType. The bypassing should be
     74   /// disabled.
     75   VALRNG_LIKELY_LONG
     76 };
     77 
     78 class FastDivInsertionTask {
     79   bool IsValidTask = false;
     80   Instruction *SlowDivOrRem = nullptr;
     81   IntegerType *BypassType = nullptr;
     82   BasicBlock *MainBB = nullptr;
     83 
     84   bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
     85   ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
     86   QuotRemWithBB createSlowBB(BasicBlock *Successor);
     87   QuotRemWithBB createFastBB(BasicBlock *Successor);
     88   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
     89                                    BasicBlock *PhiBB);
     90   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
     91   Optional<QuotRemPair> insertFastDivAndRem();
     92 
     93   bool isSignedOp() {
     94     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
     95            SlowDivOrRem->getOpcode() == Instruction::SRem;
     96   }
     97 
     98   bool isDivisionOp() {
     99     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
    100            SlowDivOrRem->getOpcode() == Instruction::UDiv;
    101   }
    102 
    103   Type *getSlowType() { return SlowDivOrRem->getType(); }
    104 
    105 public:
    106   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
    107 
    108   Value *getReplacement(DivCacheTy &Cache);
    109 };
    110 
    111 } // end anonymous namespace
    112 
    113 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
    114                                            const BypassWidthsTy &BypassWidths) {
    115   switch (I->getOpcode()) {
    116   case Instruction::UDiv:
    117   case Instruction::SDiv:
    118   case Instruction::URem:
    119   case Instruction::SRem:
    120     SlowDivOrRem = I;
    121     break;
    122   default:
    123     // I is not a div/rem operation.
    124     return;
    125   }
    126 
    127   // Skip division on vector types. Only optimize integer instructions.
    128   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
    129   if (!SlowType)
    130     return;
    131 
    132   // Skip if this bitwidth is not bypassed.
    133   auto BI = BypassWidths.find(SlowType->getBitWidth());
    134   if (BI == BypassWidths.end())
    135     return;
    136 
    137   // Get type for div/rem instruction with bypass bitwidth.
    138   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
    139   BypassType = BT;
    140 
    141   // The original basic block.
    142   MainBB = I->getParent();
    143 
    144   // The instruction is indeed a slow div or rem operation.
    145   IsValidTask = true;
    146 }
    147 
    148 /// Reuses previously-computed dividend or remainder from the current BB if
    149 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
    150 /// perform the optimization and caches the resulting dividend and remainder.
    151 /// If no replacement can be generated, nullptr is returned.
    152 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
    153   // First, make sure that the task is valid.
    154   if (!IsValidTask)
    155     return nullptr;
    156 
    157   // Then, look for a value in Cache.
    158   Value *Dividend = SlowDivOrRem->getOperand(0);
    159   Value *Divisor = SlowDivOrRem->getOperand(1);
    160   DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
    161   auto CacheI = Cache.find(Key);
    162 
    163   if (CacheI == Cache.end()) {
    164     // If previous instance does not exist, try to insert fast div.
    165     Optional<QuotRemPair> OptResult = insertFastDivAndRem();
    166     // Bail out if insertFastDivAndRem has failed.
    167     if (!OptResult)
    168       return nullptr;
    169     CacheI = Cache.insert({Key, *OptResult}).first;
    170   }
    171 
    172   QuotRemPair &Value = CacheI->second;
    173   return isDivisionOp() ? Value.Quotient : Value.Remainder;
    174 }
    175 
    176 /// Check if a value looks like a hash.
    177 ///
    178 /// The routine is expected to detect values computed using the most common hash
    179 /// algorithms. Typically, hash computations end with one of the following
    180 /// instructions:
    181 ///
    182 /// 1) MUL with a constant wider than BypassType
    183 /// 2) XOR instruction
    184 ///
    185 /// And even if we are wrong and the value is not a hash, it is still quite
    186 /// unlikely that such values will fit into BypassType.
    187 ///
    188 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
    189 /// It is implemented as a depth-first search for values that look neither long
    190 /// nor hash-like.
    191 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
    192   Instruction *I = dyn_cast<Instruction>(V);
    193   if (!I)
    194     return false;
    195 
    196   switch (I->getOpcode()) {
    197   case Instruction::Xor:
    198     return true;
    199   case Instruction::Mul: {
    200     // After Constant Hoisting pass, long constants may be represented as
    201     // bitcast instructions. As a result, some constants may look like an
    202     // instruction at first, and an additional check is necessary to find out if
    203     // an operand is actually a constant.
    204     Value *Op1 = I->getOperand(1);
    205     ConstantInt *C = dyn_cast<ConstantInt>(Op1);
    206     if (!C && isa<BitCastInst>(Op1))
    207       C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
    208     return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
    209   }
    210   case Instruction::PHI:
    211     // Stop IR traversal in case of a crazy input code. This limits recursion
    212     // depth.
    213     if (Visited.size() >= 16)
    214       return false;
    215     // Do not visit nodes that have been visited already. We return true because
    216     // it means that we couldn't find any value that doesn't look hash-like.
    217     if (Visited.find(I) != Visited.end())
    218       return true;
    219     Visited.insert(I);
    220     return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
    221       // Ignore undef values as they probably don't affect the division
    222       // operands.
    223       return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
    224              isa<UndefValue>(V);
    225     });
    226   default:
    227     return false;
    228   }
    229 }
    230 
    231 /// Check if an integer value fits into our bypass type.
    232 ValueRange FastDivInsertionTask::getValueRange(Value *V,
    233                                                VisitedSetTy &Visited) {
    234   unsigned ShortLen = BypassType->getBitWidth();
    235   unsigned LongLen = V->getType()->getIntegerBitWidth();
    236 
    237   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
    238   unsigned HiBits = LongLen - ShortLen;
    239 
    240   const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
    241   KnownBits Known(LongLen);
    242 
    243   computeKnownBits(V, Known, DL);
    244 
    245   if (Known.countMinLeadingZeros() >= HiBits)
    246     return VALRNG_KNOWN_SHORT;
    247 
    248   if (Known.countMaxLeadingZeros() < HiBits)
    249     return VALRNG_LIKELY_LONG;
    250 
    251   // Long integer divisions are often used in hashtable implementations. It's
    252   // not worth bypassing such divisions because hash values are extremely
    253   // unlikely to have enough leading zeros. The call below tries to detect
    254   // values that are unlikely to fit BypassType (including hashes).
    255   if (isHashLikeValue(V, Visited))
    256     return VALRNG_LIKELY_LONG;
    257 
    258   return VALRNG_UNKNOWN;
    259 }
    260 
    261 /// Add new basic block for slow div and rem operations and put it before
    262 /// SuccessorBB.
    263 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
    264   QuotRemWithBB DivRemPair;
    265   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
    266                                      MainBB->getParent(), SuccessorBB);
    267   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
    268 
    269   Value *Dividend = SlowDivOrRem->getOperand(0);
    270   Value *Divisor = SlowDivOrRem->getOperand(1);
    271 
    272   if (isSignedOp()) {
    273     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
    274     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
    275   } else {
    276     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
    277     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
    278   }
    279 
    280   Builder.CreateBr(SuccessorBB);
    281   return DivRemPair;
    282 }
    283 
    284 /// Add new basic block for fast div and rem operations and put it before
    285 /// SuccessorBB.
    286 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
    287   QuotRemWithBB DivRemPair;
    288   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
    289                                      MainBB->getParent(), SuccessorBB);
    290   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
    291 
    292   Value *Dividend = SlowDivOrRem->getOperand(0);
    293   Value *Divisor = SlowDivOrRem->getOperand(1);
    294   Value *ShortDivisorV =
    295       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
    296   Value *ShortDividendV =
    297       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
    298 
    299   // udiv/urem because this optimization only handles positive numbers.
    300   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
    301   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
    302   DivRemPair.Quotient =
    303       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
    304   DivRemPair.Remainder =
    305       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
    306   Builder.CreateBr(SuccessorBB);
    307 
    308   return DivRemPair;
    309 }
    310 
    311 /// Creates Phi nodes for result of Div and Rem.
    312 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
    313                                                        QuotRemWithBB &RHS,
    314                                                        BasicBlock *PhiBB) {
    315   IRBuilder<> Builder(PhiBB, PhiBB->begin());
    316   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
    317   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
    318   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
    319   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
    320   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
    321   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
    322   return QuotRemPair(QuoPhi, RemPhi);
    323 }
    324 
    325 /// Creates a runtime check to test whether both the divisor and dividend fit
    326 /// into BypassType. The check is inserted at the end of MainBB. True return
    327 /// value means that the operands fit. Either of the operands may be NULL if it
    328 /// doesn't need a runtime check.
    329 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
    330   assert((Op1 || Op2) && "Nothing to check");
    331   IRBuilder<> Builder(MainBB, MainBB->end());
    332 
    333   Value *OrV;
    334   if (Op1 && Op2)
    335     OrV = Builder.CreateOr(Op1, Op2);
    336   else
    337     OrV = Op1 ? Op1 : Op2;
    338 
    339   // BitMask is inverted to check if the operands are
    340   // larger than the bypass type
    341   uint64_t BitMask = ~BypassType->getBitMask();
    342   Value *AndV = Builder.CreateAnd(OrV, BitMask);
    343 
    344   // Compare operand values
    345   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
    346   return Builder.CreateICmpEQ(AndV, ZeroV);
    347 }
    348 
    349 /// Substitutes the div/rem instruction with code that checks the value of the
    350 /// operands and uses a shorter-faster div/rem instruction when possible.
    351 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
    352   Value *Dividend = SlowDivOrRem->getOperand(0);
    353   Value *Divisor = SlowDivOrRem->getOperand(1);
    354 
    355   VisitedSetTy SetL;
    356   ValueRange DividendRange = getValueRange(Dividend, SetL);
    357   if (DividendRange == VALRNG_LIKELY_LONG)
    358     return None;
    359 
    360   VisitedSetTy SetR;
    361   ValueRange DivisorRange = getValueRange(Divisor, SetR);
    362   if (DivisorRange == VALRNG_LIKELY_LONG)
    363     return None;
    364 
    365   bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
    366   bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
    367 
    368   if (DividendShort && DivisorShort) {
    369     // If both operands are known to be short then just replace the long
    370     // division with a short one in-place.  Since we're not introducing control
    371     // flow in this case, narrowing the division is always a win, even if the
    372     // divisor is a constant (and will later get replaced by a multiplication).
    373 
    374     IRBuilder<> Builder(SlowDivOrRem);
    375     Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
    376     Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
    377     Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
    378     Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
    379     Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
    380     Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
    381     return QuotRemPair(ExtDiv, ExtRem);
    382   }
    383 
    384   if (isa<ConstantInt>(Divisor)) {
    385     // If the divisor is not a constant, DAGCombiner will convert it to a
    386     // multiplication by a magic constant.  It isn't clear if it is worth
    387     // introducing control flow to get a narrower multiply.
    388     return None;
    389   }
    390 
    391   // After Constant Hoisting pass, long constants may be represented as
    392   // bitcast instructions. As a result, some constants may look like an
    393   // instruction at first, and an additional check is necessary to find out if
    394   // an operand is actually a constant.
    395   if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
    396     if (BCI->getParent() == SlowDivOrRem->getParent() &&
    397         isa<ConstantInt>(BCI->getOperand(0)))
    398       return None;
    399 
    400   if (DividendShort && !isSignedOp()) {
    401     // If the division is unsigned and Dividend is known to be short, then
    402     // either
    403     // 1) Divisor is less or equal to Dividend, and the result can be computed
    404     //    with a short division.
    405     // 2) Divisor is greater than Dividend. In this case, no division is needed
    406     //    at all: The quotient is 0 and the remainder is equal to Dividend.
    407     //
    408     // So instead of checking at runtime whether Divisor fits into BypassType,
    409     // we emit a runtime check to differentiate between these two cases. This
    410     // lets us entirely avoid a long div.
    411 
    412     // Split the basic block before the div/rem.
    413     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    414     // Remove the unconditional branch from MainBB to SuccessorBB.
    415     MainBB->getInstList().back().eraseFromParent();
    416     QuotRemWithBB Long;
    417     Long.BB = MainBB;
    418     Long.Quotient = ConstantInt::get(getSlowType(), 0);
    419     Long.Remainder = Dividend;
    420     QuotRemWithBB Fast = createFastBB(SuccessorBB);
    421     QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
    422     IRBuilder<> Builder(MainBB, MainBB->end());
    423     Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
    424     Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
    425     return Result;
    426   } else {
    427     // General case. Create both slow and fast div/rem pairs and choose one of
    428     // them at runtime.
    429 
    430     // Split the basic block before the div/rem.
    431     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    432     // Remove the unconditional branch from MainBB to SuccessorBB.
    433     MainBB->getInstList().back().eraseFromParent();
    434     QuotRemWithBB Fast = createFastBB(SuccessorBB);
    435     QuotRemWithBB Slow = createSlowBB(SuccessorBB);
    436     QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
    437     Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
    438                                             DivisorShort ? nullptr : Divisor);
    439     IRBuilder<> Builder(MainBB, MainBB->end());
    440     Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
    441     return Result;
    442   }
    443 }
    444 
    445 /// This optimization identifies DIV/REM instructions in a BB that can be
    446 /// profitably bypassed and carried out with a shorter, faster divide.
    447 bool llvm::bypassSlowDivision(BasicBlock *BB,
    448                               const BypassWidthsTy &BypassWidths) {
    449   DivCacheTy PerBBDivCache;
    450 
    451   bool MadeChange = false;
    452   Instruction* Next = &*BB->begin();
    453   while (Next != nullptr) {
    454     // We may add instructions immediately after I, but we want to skip over
    455     // them.
    456     Instruction* I = Next;
    457     Next = Next->getNextNode();
    458 
    459     FastDivInsertionTask Task(I, BypassWidths);
    460     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
    461       I->replaceAllUsesWith(Replacement);
    462       I->eraseFromParent();
    463       MadeChange = true;
    464     }
    465   }
    466 
    467   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
    468   // create divrem machine instructions.  Now erase any unused divs / rems so we
    469   // don't leave extra instructions sitting around.
    470   for (auto &KV : PerBBDivCache)
    471     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
    472       RecursivelyDeleteTriviallyDeadInstructions(V);
    473 
    474   return MadeChange;
    475 }
    476