      1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This transformation analyzes and transforms the induction variables (and
     11 // computations derived from them) into forms suitable for efficient execution
     12 // on the target.
     13 //
     14 // This pass performs strength reduction on array references inside loops that
     15 // use the loop induction variable as one or more of their components. It
     16 // rewrites expressions to take advantage of scaled-index addressing modes
     17 // available on the target, and it performs a variety of other optimizations
     18 // related to loop induction variables.
     19 //
     20 // Terminology note: this code has a lot of handling for "post-increment" or
     21 // "post-inc" users. This is not talking about post-increment addressing modes;
     22 // it is instead talking about code like this:
     23 //
     24 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
     25 //   ...
     26 //   %i.next = add %i, 1
     27 //   %c = icmp eq %i.next, %n
     28 //
     29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
     30 // it's useful to think about these as the same register, with some uses using
     31 // the value of the register before the add and some using it after. In this
     32 // example, the icmp is a post-increment user, since it uses %i.next, which is
     33 // the value of the induction variable after the increment. The other common
     34 // case of post-increment users is users outside the loop.
     35 //
     36 // TODO: More sophistication in the way Formulae are generated and filtered.
     37 //
     38 // TODO: Handle multiple loops at a time.
     39 //
     40 // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
     41 //       instead of a GlobalValue?
     42 //
     43 // TODO: When truncation is free, truncate ICmp users' operands to make it a
     44 //       smaller encoding (on x86 at least).
     45 //
     46 // TODO: When a negated register is used by an add (such as in a list of
     47 //       multiple base registers, or as the increment expression in an addrec),
     48 //       we may not actually need both reg and (-1 * reg) in registers; the
     49 //       negation can be implemented by using a sub instead of an add. The
     50 //       lack of support for taking this into consideration when making
     51 //       register pressure decisions is partly worked around by the "Special"
     52 //       use kind.
     53 //
     54 //===----------------------------------------------------------------------===//
     55 
     56 #define DEBUG_TYPE "loop-reduce"
     57 #include "llvm/Transforms/Scalar.h"
     58 #include "llvm/Constants.h"
     59 #include "llvm/Instructions.h"
     60 #include "llvm/IntrinsicInst.h"
     61 #include "llvm/DerivedTypes.h"
     62 #include "llvm/Analysis/IVUsers.h"
     63 #include "llvm/Analysis/Dominators.h"
     64 #include "llvm/Analysis/LoopPass.h"
     65 #include "llvm/Analysis/ScalarEvolutionExpander.h"
     66 #include "llvm/Assembly/Writer.h"
     67 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
     68 #include "llvm/Transforms/Utils/Local.h"
     69 #include "llvm/ADT/SmallBitVector.h"
     70 #include "llvm/ADT/SetVector.h"
     71 #include "llvm/ADT/DenseSet.h"
     72 #include "llvm/Support/Debug.h"
     73 #include "llvm/Support/CommandLine.h"
     74 #include "llvm/Support/ValueHandle.h"
     75 #include "llvm/Support/raw_ostream.h"
     76 #include "llvm/Target/TargetLowering.h"
     77 #include <algorithm>
     78 using namespace llvm;
     79 
     80 namespace llvm {
     81 cl::opt<bool> EnableNested(
     82   "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops"));
     83 
     84 cl::opt<bool> EnableRetry(
     85     "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry"));
     86 
     87 // Temporary flag to clean up congruent phis after LSR phi expansion.
     88 // It's currently disabled until we can determine whether it's truly useful or
     89 // not. The flag should be removed after the v3.0 release.
     90 cl::opt<bool> EnablePhiElim(
     91     "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination"));
     92 }
     93 
     94 namespace {
     95 
     96 /// RegSortData - This class holds data which is used to order reuse candidates.
     97 class RegSortData {
     98 public:
     99   /// UsedByIndices - This represents the set of LSRUse indices which reference
    100   /// a particular register.
    101   SmallBitVector UsedByIndices;
    102 
    103   RegSortData() {}
    104 
    105   void print(raw_ostream &OS) const;
    106   void dump() const;
    107 };
    108 
    109 }
    110 
    111 void RegSortData::print(raw_ostream &OS) const {
    112   OS << "[NumUses=" << UsedByIndices.count() << ']';
    113 }
    114 
    115 void RegSortData::dump() const {
    116   print(errs()); errs() << '\n';
    117 }
    118 
    119 namespace {
    120 
    121 /// RegUseTracker - Map register candidates to information about how they are
    122 /// used.
    123 class RegUseTracker {
    124   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
    125 
    126   RegUsesTy RegUsesMap;
    127   SmallVector<const SCEV *, 16> RegSequence;
    128 
    129 public:
    130   void CountRegister(const SCEV *Reg, size_t LUIdx);
    131   void DropRegister(const SCEV *Reg, size_t LUIdx);
    132   void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
    133 
    134   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
    135 
    136   const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
    137 
    138   void clear();
    139 
    140   typedef SmallVectorImpl<const SCEV *>::iterator iterator;
    141   typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
    142   iterator begin() { return RegSequence.begin(); }
    143   iterator end()   { return RegSequence.end(); }
    144   const_iterator begin() const { return RegSequence.begin(); }
    145   const_iterator end() const   { return RegSequence.end(); }
    146 };
    147 
    148 }
    149 
    150 void
    151 RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
    152   std::pair<RegUsesTy::iterator, bool> Pair =
    153     RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
    154   RegSortData &RSD = Pair.first->second;
    155   if (Pair.second)
    156     RegSequence.push_back(Reg);
    157   RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
    158   RSD.UsedByIndices.set(LUIdx);
    159 }
    160 
    161 void
    162 RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
    163   RegUsesTy::iterator It = RegUsesMap.find(Reg);
    164   assert(It != RegUsesMap.end());
    165   RegSortData &RSD = It->second;
    166   assert(RSD.UsedByIndices.size() > LUIdx);
    167   RSD.UsedByIndices.reset(LUIdx);
    168 }
    169 
    170 void
    171 RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
    172   assert(LUIdx <= LastLUIdx);
    173 
    174   // Update RegUses. The data structure is not optimized for this purpose;
    175   // we must iterate through it and update each of the bit vectors.
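        // In effect this mirrors a swap-and-pop on the uses list: each register's
        // bit at LUIdx takes the value previously held at LastLUIdx, and the bit
        // vector is then truncated so LastLUIdx no longer has an entry.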
    176   for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
    177        I != E; ++I) {
    178     SmallBitVector &UsedByIndices = I->second.UsedByIndices;
    179     if (LUIdx < UsedByIndices.size())
    180       UsedByIndices[LUIdx] =
    181         LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
    182     UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
    183   }
    184 }
    185 
    186 bool
    187 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
    188   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
    189   if (I == RegUsesMap.end())
    190     return false;
    191   const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
    192   int i = UsedByIndices.find_first();
    193   if (i == -1) return false;
    194   if ((size_t)i != LUIdx) return true;
    195   return UsedByIndices.find_next(i) != -1;
    196 }
    197 
    198 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
    199   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
    200   assert(I != RegUsesMap.end() && "Unknown register!");
    201   return I->second.UsedByIndices;
    202 }
    203 
    204 void RegUseTracker::clear() {
    205   RegUsesMap.clear();
    206   RegSequence.clear();
    207 }
    208 
    209 namespace {
    210 
    211 /// Formula - This class holds information that describes a formula for
    212 /// satisfying a use. It may include broken-out immediates and scaled
    213 /// registers.
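        /// As an illustrative (not exhaustive) example, a use of an address such as
        /// "GV + Base + 4*IV + 7" could be represented with AM.BaseGV = GV, one base
        /// register holding Base, ScaledReg = IV with AM.Scale = 4, and
        /// AM.BaseOffs = 7, provided the target can fold that addressing mode.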
    214 struct Formula {
    215   /// AM - This is used to represent complex addressing, as well as other kinds
    216   /// of interesting uses.
    217   TargetLowering::AddrMode AM;
    218 
    219   /// BaseRegs - The list of "base" registers for this use. When this is
    220   /// non-empty, AM.HasBaseReg should be set to true.
    221   SmallVector<const SCEV *, 2> BaseRegs;
    222 
    223   /// ScaledReg - The 'scaled' register for this use. This should be non-null
    224   /// when AM.Scale is not zero.
    225   const SCEV *ScaledReg;
    226 
    227   /// UnfoldedOffset - An additional constant offset which is added near the
    228   /// use. This requires a temporary register, but the offset itself can
    229   /// live in an add immediate field rather than a register.
    230   int64_t UnfoldedOffset;
    231 
    232   Formula() : ScaledReg(0), UnfoldedOffset(0) {}
    233 
    234   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
    235 
    236   unsigned getNumRegs() const;
    237   Type *getType() const;
    238 
    239   void DeleteBaseReg(const SCEV *&S);
    240 
    241   bool referencesReg(const SCEV *S) const;
    242   bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
    243                                   const RegUseTracker &RegUses) const;
    244 
    245   void print(raw_ostream &OS) const;
    246   void dump() const;
    247 };
    248 
    249 }
    250 
    251 /// DoInitialMatch - Recursion helper for InitialMatch.
    252 static void DoInitialMatch(const SCEV *S, Loop *L,
    253                            SmallVectorImpl<const SCEV *> &Good,
    254                            SmallVectorImpl<const SCEV *> &Bad,
    255                            ScalarEvolution &SE) {
    256   // Collect expressions which properly dominate the loop header.
    257   if (SE.properlyDominates(S, L->getHeader())) {
    258     Good.push_back(S);
    259     return;
    260   }
    261 
    262   // Look at add operands.
    263   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    264     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
    265          I != E; ++I)
    266       DoInitialMatch(*I, L, Good, Bad, SE);
    267     return;
    268   }
    269 
    270   // Look at addrec operands.
    271   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
    272     if (!AR->getStart()->isZero()) {
    273       DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
    274       DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
    275                                       AR->getStepRecurrence(SE),
    276                                       // FIXME: AR->getNoWrapFlags()
    277                                       AR->getLoop(), SCEV::FlagAnyWrap),
    278                      L, Good, Bad, SE);
    279       return;
    280     }
    281 
    282   // Handle a multiplication by -1 (negation) if it didn't fold.
    283   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
    284     if (Mul->getOperand(0)->isAllOnesValue()) {
    285       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
    286       const SCEV *NewMul = SE.getMulExpr(Ops);
    287 
    288       SmallVector<const SCEV *, 4> MyGood;
    289       SmallVector<const SCEV *, 4> MyBad;
    290       DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
    291       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
    292         SE.getEffectiveSCEVType(NewMul->getType())));
    293       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
    294            E = MyGood.end(); I != E; ++I)
    295         Good.push_back(SE.getMulExpr(NegOne, *I));
    296       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
    297            E = MyBad.end(); I != E; ++I)
    298         Bad.push_back(SE.getMulExpr(NegOne, *I));
    299       return;
    300     }
    301 
    302   // Ok, we can't do anything interesting. Just stuff the whole thing into a
    303   // register and hope for the best.
    304   Bad.push_back(S);
    305 }
    306 
    307 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
    308 /// attempting to keep all loop-invariant and loop-computable values in a
    309 /// single base register.
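        /// For example (illustrative only), matching ({2,+,4}<%L> + %x) with %x
        /// loop-invariant puts %x and the constant 2 into the "good" base register,
        /// while the loop-varying {0,+,4}<%L> part lands in the "bad" one; the exact
        /// split depends on what ScalarEvolution can prove.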
    310 void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
    311   SmallVector<const SCEV *, 4> Good;
    312   SmallVector<const SCEV *, 4> Bad;
    313   DoInitialMatch(S, L, Good, Bad, SE);
    314   if (!Good.empty()) {
    315     const SCEV *Sum = SE.getAddExpr(Good);
    316     if (!Sum->isZero())
    317       BaseRegs.push_back(Sum);
    318     AM.HasBaseReg = true;
    319   }
    320   if (!Bad.empty()) {
    321     const SCEV *Sum = SE.getAddExpr(Bad);
    322     if (!Sum->isZero())
    323       BaseRegs.push_back(Sum);
    324     AM.HasBaseReg = true;
    325   }
    326 }
    327 
    328 /// getNumRegs - Return the total number of register operands used by this
    329 /// formula. This does not include register uses implied by non-constant
    330 /// addrec strides.
    331 unsigned Formula::getNumRegs() const {
    332   return !!ScaledReg + BaseRegs.size();
    333 }
    334 
    335 /// getType - Return the type of this formula, if it has one, or null
    336 /// otherwise. This type is meaningless except for the bit size.
    337 Type *Formula::getType() const {
    338   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
    339          ScaledReg ? ScaledReg->getType() :
    340          AM.BaseGV ? AM.BaseGV->getType() :
    341          0;
    342 }
    343 
    344 /// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
    345 void Formula::DeleteBaseReg(const SCEV *&S) {
    346   if (&S != &BaseRegs.back())
    347     std::swap(S, BaseRegs.back());
    348   BaseRegs.pop_back();
    349 }
    350 
    351 /// referencesReg - Test if this formula references the given register.
    352 bool Formula::referencesReg(const SCEV *S) const {
    353   return S == ScaledReg ||
    354          std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
    355 }
    356 
    357 /// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
    358 /// which are used by uses other than the use with the given index.
    359 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
    360                                          const RegUseTracker &RegUses) const {
    361   if (ScaledReg)
    362     if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
    363       return true;
    364   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
    365        E = BaseRegs.end(); I != E; ++I)
    366     if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
    367       return true;
    368   return false;
    369 }
    370 
    371 void Formula::print(raw_ostream &OS) const {
    372   bool First = true;
    373   if (AM.BaseGV) {
    374     if (!First) OS << " + "; else First = false;
    375     WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
    376   }
    377   if (AM.BaseOffs != 0) {
    378     if (!First) OS << " + "; else First = false;
    379     OS << AM.BaseOffs;
    380   }
    381   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
    382        E = BaseRegs.end(); I != E; ++I) {
    383     if (!First) OS << " + "; else First = false;
    384     OS << "reg(" << **I << ')';
    385   }
    386   if (AM.HasBaseReg && BaseRegs.empty()) {
    387     if (!First) OS << " + "; else First = false;
    388     OS << "**error: HasBaseReg**";
    389   } else if (!AM.HasBaseReg && !BaseRegs.empty()) {
    390     if (!First) OS << " + "; else First = false;
    391     OS << "**error: !HasBaseReg**";
    392   }
    393   if (AM.Scale != 0) {
    394     if (!First) OS << " + "; else First = false;
    395     OS << AM.Scale << "*reg(";
    396     if (ScaledReg)
    397       OS << *ScaledReg;
    398     else
    399       OS << "<unknown>";
    400     OS << ')';
    401   }
    402   if (UnfoldedOffset != 0) {
    403     if (!First) OS << " + "; else First = false;
    404     OS << "imm(" << UnfoldedOffset << ')';
    405   }
    406 }
    407 
    408 void Formula::dump() const {
    409   print(errs()); errs() << '\n';
    410 }
    411 
    412 /// isAddRecSExtable - Return true if the given addrec can be sign-extended
    413 /// without changing its value.
    414 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
    415   Type *WideTy =
    416     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
    417   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
    418 }
    419 
    420 /// isAddSExtable - Return true if the given add can be sign-extended
    421 /// without changing its value.
    422 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
    423   Type *WideTy =
    424     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
    425   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
    426 }
    427 
    428 /// isMulSExtable - Return true if the given mul can be sign-extended
    429 /// without changing its value.
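        /// (A product of N operands of B bits each always fits in N*B bits, which is
        /// why that width is used to test whether the extension distributes.)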
    430 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
    431   Type *WideTy =
    432     IntegerType::get(SE.getContext(),
    433                      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
    434   return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
    435 }
    436 
    437 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
    438 /// and if the remainder is known to be zero, or null otherwise. If
    439 /// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
    440 /// to Y, ignoring that the multiplication may overflow, which is useful when
    441 /// the result will be used in a context where the most significant bits are
    442 /// ignored.
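        /// For example, (8*i + 4) /s 4 yields 2*i + 1, while (8*i + 3) /s 4 yields
        /// null because the remainder is not known to be zero (a simple illustration;
        /// the folding itself goes through ScalarEvolution).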
    443 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
    444                                 ScalarEvolution &SE,
    445                                 bool IgnoreSignificantBits = false) {
    446   // Handle the trivial case, which works for any SCEV type.
    447   if (LHS == RHS)
    448     return SE.getConstant(LHS->getType(), 1);
    449 
    450   // Handle a few RHS special cases.
    451   const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
    452   if (RC) {
    453     const APInt &RA = RC->getValue()->getValue();
    454     // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
    455     // some folding.
    456     if (RA.isAllOnesValue())
    457       return SE.getMulExpr(LHS, RC);
    458     // Handle x /s 1 as x.
    459     if (RA == 1)
    460       return LHS;
    461   }
    462 
    463   // Check for a division of a constant by a constant.
    464   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
    465     if (!RC)
    466       return 0;
    467     const APInt &LA = C->getValue()->getValue();
    468     const APInt &RA = RC->getValue()->getValue();
    469     if (LA.srem(RA) != 0)
    470       return 0;
    471     return SE.getConstant(LA.sdiv(RA));
    472   }
    473 
    474   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
    475   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
    476     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
    477       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
    478                                       IgnoreSignificantBits);
    479       if (!Step) return 0;
    480       const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
    481                                        IgnoreSignificantBits);
    482       if (!Start) return 0;
    483       // FlagNW is independent of the start value and step direction, and is
    484       // preserved with smaller magnitude steps.
    485       // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
    486       return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
    487     }
    488     return 0;
    489   }
    490 
    491   // Distribute the sdiv over add operands, if the add doesn't overflow.
    492   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
    493     if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
    494       SmallVector<const SCEV *, 8> Ops;
    495       for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
    496            I != E; ++I) {
    497         const SCEV *Op = getExactSDiv(*I, RHS, SE,
    498                                       IgnoreSignificantBits);
    499         if (!Op) return 0;
    500         Ops.push_back(Op);
    501       }
    502       return SE.getAddExpr(Ops);
    503     }
    504     return 0;
    505   }
    506 
    507   // Check for a multiply operand that we can pull RHS out of.
    508   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
    509     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
    510       SmallVector<const SCEV *, 4> Ops;
    511       bool Found = false;
    512       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
    513            I != E; ++I) {
    514         const SCEV *S = *I;
    515         if (!Found)
    516           if (const SCEV *Q = getExactSDiv(S, RHS, SE,
    517                                            IgnoreSignificantBits)) {
    518             S = Q;
    519             Found = true;
    520           }
    521         Ops.push_back(S);
    522       }
    523       return Found ? SE.getMulExpr(Ops) : 0;
    524     }
    525     return 0;
    526   }
    527 
    528   // Otherwise we don't know.
    529   return 0;
    530 }
    531 
    532 /// ExtractImmediate - If S involves the addition of a constant integer value,
    533 /// return that integer value, and mutate S to point to a new SCEV with that
    534 /// value excluded.
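        /// For example, given S = (4 + %a), this returns 4 and mutates S to %a (an
        /// illustrative case; addrecs are handled by extracting from their start).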
    535 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
    536   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
    537     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
    538       S = SE.getConstant(C->getType(), 0);
    539       return C->getValue()->getSExtValue();
    540     }
    541   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    542     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    543     int64_t Result = ExtractImmediate(NewOps.front(), SE);
    544     if (Result != 0)
    545       S = SE.getAddExpr(NewOps);
    546     return Result;
    547   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    548     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    549     int64_t Result = ExtractImmediate(NewOps.front(), SE);
    550     if (Result != 0)
    551       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
    552                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
    553                            SCEV::FlagAnyWrap);
    554     return Result;
    555   }
    556   return 0;
    557 }
    558 
    559 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
    560 /// return that symbol, and mutate S to point to a new SCEV with that
    561 /// value excluded.
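        /// For example, given an expression like (@g + %a), this can return @g and
        /// leave just %a behind in S (illustrative; the symbol is looked for in the
        /// last operand of an add).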
    562 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
    563   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
    564     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
    565       S = SE.getConstant(GV->getType(), 0);
    566       return GV;
    567     }
    568   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    569     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    570     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
    571     if (Result)
    572       S = SE.getAddExpr(NewOps);
    573     return Result;
    574   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    575     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    576     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
    577     if (Result)
    578       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
    579                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
    580                            SCEV::FlagAnyWrap);
    581     return Result;
    582   }
    583   return 0;
    584 }
    585 
    586 /// isAddressUse - Returns true if the specified instruction is using the
    587 /// specified value as an address.
    588 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
    589   bool isAddress = isa<LoadInst>(Inst);
    590   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    591     if (SI->getOperand(1) == OperandVal)
    592       isAddress = true;
    593   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    594     // Addressing modes can also be folded into prefetches and a variety
    595     // of intrinsics.
    596     switch (II->getIntrinsicID()) {
    597       default: break;
    598       case Intrinsic::prefetch:
    599       case Intrinsic::x86_sse_storeu_ps:
    600       case Intrinsic::x86_sse2_storeu_pd:
    601       case Intrinsic::x86_sse2_storeu_dq:
    602       case Intrinsic::x86_sse2_storel_dq:
    603         if (II->getArgOperand(0) == OperandVal)
    604           isAddress = true;
    605         break;
    606     }
    607   }
    608   return isAddress;
    609 }
    610 
    611 /// getAccessType - Return the type of the memory being accessed.
    612 static Type *getAccessType(const Instruction *Inst) {
    613   Type *AccessTy = Inst->getType();
    614   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
    615     AccessTy = SI->getOperand(0)->getType();
    616   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    617     // Addressing modes can also be folded into prefetches and a variety
    618     // of intrinsics.
    619     switch (II->getIntrinsicID()) {
    620     default: break;
    621     case Intrinsic::x86_sse_storeu_ps:
    622     case Intrinsic::x86_sse2_storeu_pd:
    623     case Intrinsic::x86_sse2_storeu_dq:
    624     case Intrinsic::x86_sse2_storel_dq:
    625       AccessTy = II->getArgOperand(0)->getType();
    626       break;
    627     }
    628   }
    629 
    630   // All pointers have the same requirements, so canonicalize them to an
    631   // arbitrary pointer type to minimize variation.
    632   if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
    633     AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
    634                                 PTy->getAddressSpace());
    635 
    636   return AccessTy;
    637 }
    638 
    639 /// DeleteTriviallyDeadInstructions - If any of the instructions in the
    640 /// specified set are trivially dead, delete them and see if this makes any of
    641 /// their operands subsequently dead.
    642 static bool
    643 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
    644   bool Changed = false;
    645 
    646   while (!DeadInsts.empty()) {
    647     Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val());
    648 
    649     if (I == 0 || !isInstructionTriviallyDead(I))
    650       continue;
    651 
    652     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
    653       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
    654         *OI = 0;
    655         if (U->use_empty())
    656           DeadInsts.push_back(U);
    657       }
    658 
    659     I->eraseFromParent();
    660     Changed = true;
    661   }
    662 
    663   return Changed;
    664 }
    665 
    666 namespace {
    667 
    668 /// Cost - This class is used to measure and compare candidate formulae.
    669 class Cost {
    670   /// TODO: Some of these could be merged. Also, a lexical ordering
    671   /// isn't always optimal.
    672   unsigned NumRegs;
    673   unsigned AddRecCost;
    674   unsigned NumIVMuls;
    675   unsigned NumBaseAdds;
    676   unsigned ImmCost;
    677   unsigned SetupCost;
    678 
    679 public:
    680   Cost()
    681     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
    682       SetupCost(0) {}
    683 
    684   bool operator<(const Cost &Other) const;
    685 
    686   void Loose();
    687 
    688 #ifndef NDEBUG
    689   // Once any of the metrics loses, they must all remain losers.
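          // In other words, a Cost is valid iff either no metric is ~0u (the
          // bitwise-OR below is not all-ones) or every metric is ~0u (the
          // bitwise-AND below is all-ones).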
    690   bool isValid() {
    691     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
    692              | ImmCost | SetupCost) != ~0u)
    693       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
    694            & ImmCost & SetupCost) == ~0u);
    695   }
    696 #endif
    697 
    698   bool isLoser() {
    699     assert(isValid() && "invalid cost");
    700     return NumRegs == ~0u;
    701   }
    702 
    703   void RateFormula(const Formula &F,
    704                    SmallPtrSet<const SCEV *, 16> &Regs,
    705                    const DenseSet<const SCEV *> &VisitedRegs,
    706                    const Loop *L,
    707                    const SmallVectorImpl<int64_t> &Offsets,
    708                    ScalarEvolution &SE, DominatorTree &DT);
    709 
    710   void print(raw_ostream &OS) const;
    711   void dump() const;
    712 
    713 private:
    714   void RateRegister(const SCEV *Reg,
    715                     SmallPtrSet<const SCEV *, 16> &Regs,
    716                     const Loop *L,
    717                     ScalarEvolution &SE, DominatorTree &DT);
    718   void RatePrimaryRegister(const SCEV *Reg,
    719                            SmallPtrSet<const SCEV *, 16> &Regs,
    720                            const Loop *L,
    721                            ScalarEvolution &SE, DominatorTree &DT);
    722 };
    723 
    724 }
    725 
    726 /// RateRegister - Tally up interesting quantities from the given register.
    727 void Cost::RateRegister(const SCEV *Reg,
    728                         SmallPtrSet<const SCEV *, 16> &Regs,
    729                         const Loop *L,
    730                         ScalarEvolution &SE, DominatorTree &DT) {
    731   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
    732     if (AR->getLoop() == L)
    733       AddRecCost += 1; /// TODO: This should be a function of the stride.
    734 
    735     // If this is an addrec for another loop, don't second-guess its addrec phi
    736     // nodes. LSR isn't currently smart enough to reason about more than one
    737     // loop at a time. LSR has either already run on inner loops or will not
    738     // run on other loops, and it cannot be expected to change sibling loops.
    739     // If the AddRec already exists, consider its register free and leave it
    740     // alone. Otherwise, do not consider this formula at all.
    741     // FIXME: why do we need to generate such formulae?
    742     else if (!EnableNested || L->contains(AR->getLoop()) ||
    743              (!AR->getLoop()->contains(L) &&
    744               DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
    745       for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
    746            PHINode *PN = dyn_cast<PHINode>(I); ++I) {
    747         if (SE.isSCEVable(PN->getType()) &&
    748             (SE.getEffectiveSCEVType(PN->getType()) ==
    749              SE.getEffectiveSCEVType(AR->getType())) &&
    750             SE.getSCEV(PN) == AR)
    751           return;
    752       }
    753       if (!EnableNested) {
    754         Loose();
    755         return;
    756       }
    757       // If this isn't one of the addrecs that the loop already has, it
    758       // would require a costly new phi and add. TODO: This isn't
    759       // precisely modeled right now.
    760       ++NumBaseAdds;
    761       if (!Regs.count(AR->getStart())) {
    762         RateRegister(AR->getStart(), Regs, L, SE, DT);
    763         if (isLoser())
    764           return;
    765       }
    766     }
    767 
    768     // Add the step value register, if it needs one.
    769     // TODO: The non-affine case isn't precisely modeled here.
    770     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
    771       if (!Regs.count(AR->getOperand(1))) {
    772         RateRegister(AR->getOperand(1), Regs, L, SE, DT);
    773         if (isLoser())
    774           return;
    775       }
    776     }
    777   }
    778   ++NumRegs;
    779 
    780   // Rough heuristic; favor registers which don't require extra setup
    781   // instructions in the preheader.
    782   if (!isa<SCEVUnknown>(Reg) &&
    783       !isa<SCEVConstant>(Reg) &&
    784       !(isa<SCEVAddRecExpr>(Reg) &&
    785         (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
    786          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
    787     ++SetupCost;
    788 
    789   NumIVMuls += isa<SCEVMulExpr>(Reg) &&
    790                SE.hasComputableLoopEvolution(Reg, L);
    791 }
    792 
    793 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
    794 /// before, rate it.
    795 void Cost::RatePrimaryRegister(const SCEV *Reg,
    796                                SmallPtrSet<const SCEV *, 16> &Regs,
    797                                const Loop *L,
    798                                ScalarEvolution &SE, DominatorTree &DT) {
    799   if (Regs.insert(Reg))
    800     RateRegister(Reg, Regs, L, SE, DT);
    801 }
    802 
    803 void Cost::RateFormula(const Formula &F,
    804                        SmallPtrSet<const SCEV *, 16> &Regs,
    805                        const DenseSet<const SCEV *> &VisitedRegs,
    806                        const Loop *L,
    807                        const SmallVectorImpl<int64_t> &Offsets,
    808                        ScalarEvolution &SE, DominatorTree &DT) {
    809   // Tally up the registers.
    810   if (const SCEV *ScaledReg = F.ScaledReg) {
    811     if (VisitedRegs.count(ScaledReg)) {
    812       Loose();
    813       return;
    814     }
    815     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
    816     if (isLoser())
    817       return;
    818   }
    819   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
    820        E = F.BaseRegs.end(); I != E; ++I) {
    821     const SCEV *BaseReg = *I;
    822     if (VisitedRegs.count(BaseReg)) {
    823       Loose();
    824       return;
    825     }
    826     RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
    827     if (isLoser())
    828       return;
    829   }
    830 
    831   // Determine how many (unfolded) adds we'll need inside the loop.
    832   size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
    833   if (NumBaseParts > 1)
    834     NumBaseAdds += NumBaseParts - 1;
    835 
    836   // Tally up the non-zero immediates.
    837   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
    838        E = Offsets.end(); I != E; ++I) {
    839     int64_t Offset = (uint64_t)*I + F.AM.BaseOffs;
    840     if (F.AM.BaseGV)
    841       ImmCost += 64; // Handle symbolic values conservatively.
    842                      // TODO: This should probably be the pointer size.
    843     else if (Offset != 0)
    844       ImmCost += APInt(64, Offset, true).getMinSignedBits();
    845   }
    846   assert(isValid() && "invalid cost");
    847 }
    848 
    849 /// Loose - Set this cost to a losing value.
    850 void Cost::Loose() {
    851   NumRegs = ~0u;
    852   AddRecCost = ~0u;
    853   NumIVMuls = ~0u;
    854   NumBaseAdds = ~0u;
    855   ImmCost = ~0u;
    856   SetupCost = ~0u;
    857 }
    858 
    859 /// operator< - Choose the lower cost.
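        /// Metrics are compared lexicographically, in the order they are declared in
        /// the Cost class, so an earlier metric always dominates a later one.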
    860 bool Cost::operator<(const Cost &Other) const {
    861   if (NumRegs != Other.NumRegs)
    862     return NumRegs < Other.NumRegs;
    863   if (AddRecCost != Other.AddRecCost)
    864     return AddRecCost < Other.AddRecCost;
    865   if (NumIVMuls != Other.NumIVMuls)
    866     return NumIVMuls < Other.NumIVMuls;
    867   if (NumBaseAdds != Other.NumBaseAdds)
    868     return NumBaseAdds < Other.NumBaseAdds;
    869   if (ImmCost != Other.ImmCost)
    870     return ImmCost < Other.ImmCost;
    871   if (SetupCost != Other.SetupCost)
    872     return SetupCost < Other.SetupCost;
    873   return false;
    874 }
    875 
    876 void Cost::print(raw_ostream &OS) const {
    877   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
    878   if (AddRecCost != 0)
    879     OS << ", with addrec cost " << AddRecCost;
    880   if (NumIVMuls != 0)
    881     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
    882   if (NumBaseAdds != 0)
    883     OS << ", plus " << NumBaseAdds << " base add"
    884        << (NumBaseAdds == 1 ? "" : "s");
    885   if (ImmCost != 0)
    886     OS << ", plus " << ImmCost << " imm cost";
    887   if (SetupCost != 0)
    888     OS << ", plus " << SetupCost << " setup cost";
    889 }
    890 
    891 void Cost::dump() const {
    892   print(errs()); errs() << '\n';
    893 }
    894 
    895 namespace {
    896 
    897 /// LSRFixup - An operand value in an instruction which is to be replaced
    898 /// with some equivalent, possibly strength-reduced, replacement.
    899 struct LSRFixup {
    900   /// UserInst - The instruction which will be updated.
    901   Instruction *UserInst;
    902 
    903   /// OperandValToReplace - The operand of the instruction which will
    904   /// be replaced. The operand may be used more than once; every instance
    905   /// will be replaced.
    906   Value *OperandValToReplace;
    907 
    908   /// PostIncLoops - If this user is to use the post-incremented value of an
    909   /// induction variable, this variable is non-null and holds the loop
    910   /// associated with the induction variable.
    911   PostIncLoopSet PostIncLoops;
    912 
    913   /// LUIdx - The index of the LSRUse describing the expression which
    914   /// this fixup needs, minus an offset (below).
    915   size_t LUIdx;
    916 
    917   /// Offset - A constant offset to be added to the LSRUse expression.
    918   /// This allows multiple fixups to share the same LSRUse with different
    919   /// offsets, for example in an unrolled loop.
    920   int64_t Offset;
    921 
    922   bool isUseFullyOutsideLoop(const Loop *L) const;
    923 
    924   LSRFixup();
    925 
    926   void print(raw_ostream &OS) const;
    927   void dump() const;
    928 };
    929 
    930 }
    931 
    932 LSRFixup::LSRFixup()
    933   : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {}
    934 
    935 /// isUseFullyOutsideLoop - Test whether this fixup always uses its
    936 /// value outside of the given loop.
    937 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
    938   // PHI nodes use their value in their incoming blocks.
    939   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
    940     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    941       if (PN->getIncomingValue(i) == OperandValToReplace &&
    942           L->contains(PN->getIncomingBlock(i)))
    943         return false;
    944     return true;
    945   }
    946 
    947   return !L->contains(UserInst);
    948 }
    949 
    950 void LSRFixup::print(raw_ostream &OS) const {
    951   OS << "UserInst=";
    952   // Store is common and interesting enough to be worth special-casing.
    953   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    954     OS << "store ";
    955     WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
    956   } else if (UserInst->getType()->isVoidTy())
    957     OS << UserInst->getOpcodeName();
    958   else
    959     WriteAsOperand(OS, UserInst, /*PrintType=*/false);
    960 
    961   OS << ", OperandValToReplace=";
    962   WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
    963 
    964   for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
    965        E = PostIncLoops.end(); I != E; ++I) {
    966     OS << ", PostIncLoop=";
    967     WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
    968   }
    969 
    970   if (LUIdx != ~size_t(0))
    971     OS << ", LUIdx=" << LUIdx;
    972 
    973   if (Offset != 0)
    974     OS << ", Offset=" << Offset;
    975 }
    976 
    977 void LSRFixup::dump() const {
    978   print(errs()); errs() << '\n';
    979 }
    980 
    981 namespace {
    982 
    983 /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
    984 /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
    985 struct UniquifierDenseMapInfo {
    986   static SmallVector<const SCEV *, 2> getEmptyKey() {
    987     SmallVector<const SCEV *, 2> V;
    988     V.push_back(reinterpret_cast<const SCEV *>(-1));
    989     return V;
    990   }
    991 
    992   static SmallVector<const SCEV *, 2> getTombstoneKey() {
    993     SmallVector<const SCEV *, 2> V;
    994     V.push_back(reinterpret_cast<const SCEV *>(-2));
    995     return V;
    996   }
    997 
    998   static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) {
    999     unsigned Result = 0;
   1000     for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
   1001          E = V.end(); I != E; ++I)
   1002       Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
   1003     return Result;
   1004   }
   1005 
   1006   static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
   1007                       const SmallVector<const SCEV *, 2> &RHS) {
   1008     return LHS == RHS;
   1009   }
   1010 };
   1011 
   1012 /// LSRUse - This class holds the state that LSR keeps for each use in
   1013 /// IVUsers, as well as uses invented by LSR itself. It includes information
   1014 /// about what kinds of things can be folded into the user, information about
   1015 /// the user itself, and information about how the use may be satisfied.
   1016 /// TODO: Represent multiple users of the same expression in common?
   1017 class LSRUse {
   1018   DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;
   1019 
   1020 public:
   1021   /// KindType - An enum for a kind of use, indicating what types of
   1022   /// scaled and immediate operands it might support.
   1023   enum KindType {
   1024     Basic,   ///< A normal use, with no folding.
   1025     Special, ///< A special case of basic, allowing -1 scales.
   1026     Address, ///< An address use; folding according to TargetLowering
   1027     ICmpZero ///< An equality icmp with both operands folded into one.
   1028     // TODO: Add a generic icmp too?
   1029   };
   1030 
   1031   KindType Kind;
   1032   Type *AccessTy;
   1033 
   1034   SmallVector<int64_t, 8> Offsets;
   1035   int64_t MinOffset;
   1036   int64_t MaxOffset;
   1037 
   1038   /// AllFixupsOutsideLoop - This records whether all of the fixups using this
   1039   /// LSRUse are outside of the loop, in which case some special-case heuristics
   1040   /// may be used.
   1041   bool AllFixupsOutsideLoop;
   1042 
   1043   /// WidestFixupType - This records the widest use type for any fixup using
   1044   /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
   1045   /// max fixup widths to be equivalent, because the narrower one may be relying
   1046   /// on the implicit truncation to truncate away bogus bits.
   1047   Type *WidestFixupType;
   1048 
   1049   /// Formulae - A list of ways to build a value that can satisfy this user.
   1050   /// After the list is populated, one of these is selected heuristically and
   1051   /// used to formulate a replacement for OperandValToReplace in UserInst.
   1052   SmallVector<Formula, 12> Formulae;
   1053 
   1054   /// Regs - The set of register candidates used by all formulae in this LSRUse.
   1055   SmallPtrSet<const SCEV *, 4> Regs;
   1056 
   1057   LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
   1058                                       MinOffset(INT64_MAX),
   1059                                       MaxOffset(INT64_MIN),
   1060                                       AllFixupsOutsideLoop(true),
   1061                                       WidestFixupType(0) {}
   1062 
   1063   bool HasFormulaWithSameRegs(const Formula &F) const;
   1064   bool InsertFormula(const Formula &F);
   1065   void DeleteFormula(Formula &F);
   1066   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
   1067 
   1068   void print(raw_ostream &OS) const;
   1069   void dump() const;
   1070 };
   1071 
   1072 }
   1073 
   1074 /// HasFormulaWithSameRegs - Test whether this use has a formula with the same
   1075 /// registers as the given formula.
   1076 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
   1077   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
   1078   if (F.ScaledReg) Key.push_back(F.ScaledReg);
   1079   // Unstable sort by host order ok, because this is only used for uniquifying.
   1080   std::sort(Key.begin(), Key.end());
   1081   return Uniquifier.count(Key);
   1082 }
   1083 
   1084 /// InsertFormula - If the given formula has not yet been inserted, add it to
   1085 /// the list, and return true. Return false otherwise.
   1086 bool LSRUse::InsertFormula(const Formula &F) {
   1087   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
   1088   if (F.ScaledReg) Key.push_back(F.ScaledReg);
   1089   // Unstable sort by host order ok, because this is only used for uniquifying.
   1090   std::sort(Key.begin(), Key.end());
   1091 
   1092   if (!Uniquifier.insert(Key).second)
   1093     return false;
   1094 
   1095   // Using a register to hold the value of 0 is not profitable.
   1096   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
   1097          "Zero allocated in a scaled register!");
   1098 #ifndef NDEBUG
   1099   for (SmallVectorImpl<const SCEV *>::const_iterator I =
   1100        F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
   1101     assert(!(*I)->isZero() && "Zero allocated in a base register!");
   1102 #endif
   1103 
   1104   // Add the formula to the list.
   1105   Formulae.push_back(F);
   1106 
   1107   // Record registers now being used by this use.
   1108   if (F.ScaledReg) Regs.insert(F.ScaledReg);
   1109   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
   1110 
   1111   return true;
   1112 }
   1113 
   1114 /// DeleteFormula - Remove the given formula from this use's list.
   1115 void LSRUse::DeleteFormula(Formula &F) {
   1116   if (&F != &Formulae.back())
   1117     std::swap(F, Formulae.back());
   1118   Formulae.pop_back();
   1119   assert(!Formulae.empty() && "LSRUse has no formulae left!");
   1120 }
   1121 
   1122 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
   1123 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
   1124   // Now that we've filtered out some formulae, recompute the Regs set.
   1125   SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
   1126   Regs.clear();
   1127   for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
   1128        E = Formulae.end(); I != E; ++I) {
   1129     const Formula &F = *I;
   1130     if (F.ScaledReg) Regs.insert(F.ScaledReg);
   1131     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
   1132   }
   1133 
   1134   // Update the RegTracker.
   1135   for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
   1136        E = OldRegs.end(); I != E; ++I)
   1137     if (!Regs.count(*I))
   1138       RegUses.DropRegister(*I, LUIdx);
   1139 }
   1140 
   1141 void LSRUse::print(raw_ostream &OS) const {
   1142   OS << "LSR Use: Kind=";
   1143   switch (Kind) {
   1144   case Basic:    OS << "Basic"; break;
   1145   case Special:  OS << "Special"; break;
   1146   case ICmpZero: OS << "ICmpZero"; break;
   1147   case Address:
   1148     OS << "Address of ";
   1149     if (AccessTy->isPointerTy())
   1150       OS << "pointer"; // the full pointer type could be really verbose
   1151     else
   1152       OS << *AccessTy;
   1153   }
   1154 
   1155   OS << ", Offsets={";
   1156   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
   1157        E = Offsets.end(); I != E; ++I) {
   1158     OS << *I;
   1159     if (llvm::next(I) != E)
   1160       OS << ',';
   1161   }
   1162   OS << '}';
   1163 
   1164   if (AllFixupsOutsideLoop)
   1165     OS << ", all-fixups-outside-loop";
   1166 
   1167   if (WidestFixupType)
   1168     OS << ", widest fixup type: " << *WidestFixupType;
   1169 }
   1170 
   1171 void LSRUse::dump() const {
   1172   print(errs()); errs() << '\n';
   1173 }
   1174 
   1175 /// isLegalUse - Test whether the use described by AM is "legal", meaning it can
   1176 /// be completely folded into the user instruction at isel time. This includes
   1177 /// address-mode folding and special icmp tricks.
   1178 static bool isLegalUse(const TargetLowering::AddrMode &AM,
   1179                        LSRUse::KindType Kind, Type *AccessTy,
   1180                        const TargetLowering *TLI) {
   1181   switch (Kind) {
   1182   case LSRUse::Address:
   1183     // If we have low-level target information, ask the target if it can
   1184     // completely fold this address.
   1185     if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
   1186 
   1187     // Otherwise, just guess that reg+reg addressing is legal.
   1188     return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
   1189 
   1190   case LSRUse::ICmpZero:
   1191     // There's not even a target hook for querying whether it would be legal to
   1192     // fold a GV into an ICmp.
   1193     if (AM.BaseGV)
   1194       return false;
   1195 
   1196     // ICmp only has two operands; don't allow more than two non-trivial parts.
   1197     if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
   1198       return false;
   1199 
   1200     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
   1201     // putting the scaled register in the other operand of the icmp.
   1202     if (AM.Scale != 0 && AM.Scale != -1)
   1203       return false;
   1204 
   1205     // If we have low-level target information, ask the target if it can fold an
   1206     // integer immediate on an icmp.
   1207     if (AM.BaseOffs != 0) {
   1208       if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs);
   1209       return false;
   1210     }
   1211 
   1212     return true;
   1213 
   1214   case LSRUse::Basic:
   1215     // Only handle single-register values.
   1216     return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
   1217 
   1218   case LSRUse::Special:
   1219     // Only handle -1 scales, or no scale.
   1220     return AM.Scale == 0 || AM.Scale == -1;
   1221   }
   1222 
   1223   return false;
   1224 }
   1225 
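        /// isLegalUse - Test whether folding remains legal across the whole range of
        /// fixup offsets [MinOffset, MaxOffset]; only the two extremes are checked,
        /// which assumes legality does not vary in between.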
   1226 static bool isLegalUse(TargetLowering::AddrMode AM,
   1227                        int64_t MinOffset, int64_t MaxOffset,
   1228                        LSRUse::KindType Kind, Type *AccessTy,
   1229                        const TargetLowering *TLI) {
   1230   // Check for overflow.
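          // Adding MinOffset should move BaseOffs in the direction of MinOffset's
          // sign; if the comparison below disagrees, the signed addition wrapped.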
   1231   if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
   1232       (MinOffset > 0))
   1233     return false;
   1234   AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
   1235   if (isLegalUse(AM, Kind, AccessTy, TLI)) {
   1236     AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
   1237     // Check for overflow.
   1238     if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
   1239         (MaxOffset > 0))
   1240       return false;
   1241     AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
   1242     return isLegalUse(AM, Kind, AccessTy, TLI);
   1243   }
   1244   return false;
   1245 }
   1246 
   1247 static bool isAlwaysFoldable(int64_t BaseOffs,
   1248                              GlobalValue *BaseGV,
   1249                              bool HasBaseReg,
   1250                              LSRUse::KindType Kind, Type *AccessTy,
   1251                              const TargetLowering *TLI) {
   1252   // Fast-path: zero is always foldable.
   1253   if (BaseOffs == 0 && !BaseGV) return true;
   1254 
   1255   // Conservatively, create an address with an immediate and a
   1256   // base and a scale.
   1257   TargetLowering::AddrMode AM;
   1258   AM.BaseOffs = BaseOffs;
   1259   AM.BaseGV = BaseGV;
   1260   AM.HasBaseReg = HasBaseReg;
   1261   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
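          // (ICmpZero folds a scale of -1 by moving the scaled register to the other
          // side of the compare, so that is the conservative scale to probe with.)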
   1262 
   1263   // Canonicalize a scale of 1 to a base register if the formula doesn't
   1264   // already have a base register.
   1265   if (!AM.HasBaseReg && AM.Scale == 1) {
   1266     AM.Scale = 0;
   1267     AM.HasBaseReg = true;
   1268   }
   1269 
   1270   return isLegalUse(AM, Kind, AccessTy, TLI);
   1271 }
   1272 
   1273 static bool isAlwaysFoldable(const SCEV *S,
   1274                              int64_t MinOffset, int64_t MaxOffset,
   1275                              bool HasBaseReg,
   1276                              LSRUse::KindType Kind, Type *AccessTy,
   1277                              const TargetLowering *TLI,
   1278                              ScalarEvolution &SE) {
   1279   // Fast-path: zero is always foldable.
   1280   if (S->isZero()) return true;
   1281 
   1282   // Split any immediate and symbolic parts out of the expression.
   1284   int64_t BaseOffs = ExtractImmediate(S, SE);
   1285   GlobalValue *BaseGV = ExtractSymbol(S, SE);
   1286 
   1287   // If there's anything else involved, it's not foldable.
   1288   if (!S->isZero()) return false;
   1289 
   1290   // Fast-path: zero is always foldable.
   1291   if (BaseOffs == 0 && !BaseGV) return true;
   1292 
   1293   // Conservatively, create an address with an immediate and a
   1294   // base and a scale.
   1295   TargetLowering::AddrMode AM;
   1296   AM.BaseOffs = BaseOffs;
   1297   AM.BaseGV = BaseGV;
   1298   AM.HasBaseReg = HasBaseReg;
   1299   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
   1300 
   1301   return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
   1302 }
   1303 
   1304 namespace {
   1305 
   1306 /// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
   1307 /// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
   1308 struct UseMapDenseMapInfo {
   1309   static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
   1310     return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
   1311   }
   1312 
   1313   static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
   1314     return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
   1315   }
   1316 
   1317   static unsigned
   1318   getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
   1319     unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
   1320     Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
   1321     return Result;
   1322   }
   1323 
   1324   static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
   1325                       const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
   1326     return LHS == RHS;
   1327   }
   1328 };
   1329 
   1330 /// LSRInstance - This class holds state for the main loop strength reduction
   1331 /// logic.
   1332 class LSRInstance {
   1333   IVUsers &IU;
   1334   ScalarEvolution &SE;
   1335   DominatorTree &DT;
   1336   LoopInfo &LI;
   1337   const TargetLowering *const TLI;
   1338   Loop *const L;
   1339   bool Changed;
   1340 
    1341   /// IVIncInsertPos - This is the insert position at which the current loop's
    1342   /// induction variable increment should be placed. In simple loops, this is
   1343   /// the latch block's terminator. But in more complicated cases, this is a
   1344   /// position which will dominate all the in-loop post-increment users.
   1345   Instruction *IVIncInsertPos;
   1346 
   1347   /// Factors - Interesting factors between use strides.
   1348   SmallSetVector<int64_t, 8> Factors;
   1349 
   1350   /// Types - Interesting use types, to facilitate truncation reuse.
   1351   SmallSetVector<Type *, 4> Types;
   1352 
   1353   /// Fixups - The list of operands which are to be replaced.
   1354   SmallVector<LSRFixup, 16> Fixups;
   1355 
   1356   /// Uses - The list of interesting uses.
   1357   SmallVector<LSRUse, 16> Uses;
   1358 
   1359   /// RegUses - Track which uses use which register candidates.
   1360   RegUseTracker RegUses;
   1361 
   1362   void OptimizeShadowIV();
   1363   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
   1364   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
   1365   void OptimizeLoopTermCond();
   1366 
   1367   void CollectInterestingTypesAndFactors();
   1368   void CollectFixupsAndInitialFormulae();
   1369 
   1370   LSRFixup &getNewFixup() {
   1371     Fixups.push_back(LSRFixup());
   1372     return Fixups.back();
   1373   }
   1374 
   1375   // Support for sharing of LSRUses between LSRFixups.
   1376   typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
   1377                    size_t,
   1378                    UseMapDenseMapInfo> UseMapTy;
   1379   UseMapTy UseMap;
   1380 
   1381   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
   1382                           LSRUse::KindType Kind, Type *AccessTy);
   1383 
   1384   std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
   1385                                     LSRUse::KindType Kind,
   1386                                     Type *AccessTy);
   1387 
   1388   void DeleteUse(LSRUse &LU, size_t LUIdx);
   1389 
   1390   LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
   1391 
   1392 public:
   1393   void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   1394   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
   1395   void CountRegisters(const Formula &F, size_t LUIdx);
   1396   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
   1397 
   1398   void CollectLoopInvariantFixupsAndFormulae();
   1399 
   1400   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
   1401                               unsigned Depth = 0);
   1402   void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
   1403   void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
   1404   void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
   1405   void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
   1406   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
   1407   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
   1408   void GenerateCrossUseConstantOffsets();
   1409   void GenerateAllReuseFormulae();
   1410 
   1411   void FilterOutUndesirableDedicatedRegisters();
   1412 
   1413   size_t EstimateSearchSpaceComplexity() const;
   1414   void NarrowSearchSpaceByDetectingSupersets();
   1415   void NarrowSearchSpaceByCollapsingUnrolledCode();
   1416   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
   1417   void NarrowSearchSpaceByPickingWinnerRegs();
   1418   void NarrowSearchSpaceUsingHeuristics();
   1419 
   1420   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
   1421                     Cost &SolutionCost,
   1422                     SmallVectorImpl<const Formula *> &Workspace,
   1423                     const Cost &CurCost,
   1424                     const SmallPtrSet<const SCEV *, 16> &CurRegs,
   1425                     DenseSet<const SCEV *> &VisitedRegs) const;
   1426   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
   1427 
   1428   BasicBlock::iterator
   1429     HoistInsertPosition(BasicBlock::iterator IP,
   1430                         const SmallVectorImpl<Instruction *> &Inputs) const;
   1431   BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
   1432                                                      const LSRFixup &LF,
   1433                                                      const LSRUse &LU) const;
   1434 
   1435   Value *Expand(const LSRFixup &LF,
   1436                 const Formula &F,
   1437                 BasicBlock::iterator IP,
   1438                 SCEVExpander &Rewriter,
   1439                 SmallVectorImpl<WeakVH> &DeadInsts) const;
   1440   void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
   1441                      const Formula &F,
   1442                      SCEVExpander &Rewriter,
   1443                      SmallVectorImpl<WeakVH> &DeadInsts,
   1444                      Pass *P) const;
   1445   void Rewrite(const LSRFixup &LF,
   1446                const Formula &F,
   1447                SCEVExpander &Rewriter,
   1448                SmallVectorImpl<WeakVH> &DeadInsts,
   1449                Pass *P) const;
   1450   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
   1451                          Pass *P);
   1452 
   1453   LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
   1454 
   1455   bool getChanged() const { return Changed; }
   1456 
   1457   void print_factors_and_types(raw_ostream &OS) const;
   1458   void print_fixups(raw_ostream &OS) const;
   1459   void print_uses(raw_ostream &OS) const;
   1460   void print(raw_ostream &OS) const;
   1461   void dump() const;
   1462 };
   1463 
   1464 }
   1465 
    1466 /// OptimizeShadowIV - If the IV is used in an int-to-float cast
    1467 /// inside the loop, then try to eliminate the cast operation.
   1468 void LSRInstance::OptimizeShadowIV() {
   1469   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
   1470   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
   1471     return;
   1472 
   1473   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
   1474        UI != E; /* empty */) {
   1475     IVUsers::const_iterator CandidateUI = UI;
   1476     ++UI;
   1477     Instruction *ShadowUse = CandidateUI->getUser();
   1478     Type *DestTy = NULL;
   1479     bool IsSigned = false;
   1480 
    1481     /* If the shadow use is an int->float cast, then insert a second IV
   1482        to eliminate this cast.
   1483 
   1484          for (unsigned i = 0; i < n; ++i)
   1485            foo((double)i);
   1486 
   1487        is transformed into
   1488 
   1489          double d = 0.0;
   1490          for (unsigned i = 0; i < n; ++i, ++d)
   1491            foo(d);
   1492     */
   1493     if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
   1494       IsSigned = false;
   1495       DestTy = UCast->getDestTy();
   1496     }
   1497     else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
   1498       IsSigned = true;
   1499       DestTy = SCast->getDestTy();
   1500     }
   1501     if (!DestTy) continue;
   1502 
   1503     if (TLI) {
    1504       // If the target does not support DestTy natively, then do not apply
   1505       // this transformation.
   1506       EVT DVT = TLI->getValueType(DestTy);
   1507       if (!TLI->isTypeLegal(DVT)) continue;
   1508     }
   1509 
   1510     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
   1511     if (!PH) continue;
   1512     if (PH->getNumIncomingValues() != 2) continue;
   1513 
   1514     Type *SrcTy = PH->getType();
   1515     int Mantissa = DestTy->getFPMantissaWidth();
   1516     if (Mantissa == -1) continue;
   1517     if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
   1518       continue;
   1519 
   1520     unsigned Entry, Latch;
   1521     if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
   1522       Entry = 0;
   1523       Latch = 1;
   1524     } else {
   1525       Entry = 1;
   1526       Latch = 0;
   1527     }
   1528 
   1529     ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
   1530     if (!Init) continue;
   1531     Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
   1532                                         (double)Init->getSExtValue() :
   1533                                         (double)Init->getZExtValue());
   1534 
   1535     BinaryOperator *Incr =
   1536       dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
   1537     if (!Incr) continue;
   1538     if (Incr->getOpcode() != Instruction::Add
   1539         && Incr->getOpcode() != Instruction::Sub)
   1540       continue;
   1541 
   1542     /* Initialize new IV, double d = 0.0 in above example. */
   1543     ConstantInt *C = NULL;
   1544     if (Incr->getOperand(0) == PH)
   1545       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
   1546     else if (Incr->getOperand(1) == PH)
   1547       C = dyn_cast<ConstantInt>(Incr->getOperand(0));
   1548     else
   1549       continue;
   1550 
   1551     if (!C) continue;
   1552 
   1553     // Ignore negative constants, as the code below doesn't handle them
   1554     // correctly. TODO: Remove this restriction.
   1555     if (!C->getValue().isStrictlyPositive()) continue;
   1556 
   1557     /* Add new PHINode. */
   1558     PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
   1559 
    1560     /* Create the new increment, '++d' in the above example. */
   1561     Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
   1562     BinaryOperator *NewIncr =
   1563       BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
   1564                                Instruction::FAdd : Instruction::FSub,
   1565                              NewPH, CFP, "IV.S.next.", Incr);
   1566 
   1567     NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
   1568     NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
   1569 
   1570     /* Remove cast operation */
   1571     ShadowUse->replaceAllUsesWith(NewPH);
   1572     ShadowUse->eraseFromParent();
   1573     Changed = true;
   1574     break;
   1575   }
   1576 }
   1577 
   1578 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
   1579 /// set the IV user and stride information and return true, otherwise return
   1580 /// false.
   1581 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
   1582   for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
   1583     if (UI->getUser() == Cond) {
    1584       // NOTE: we could handle setcc instructions with multiple uses here, but
    1585       // InstCombine already handles the simple uses, and it's not clear that
    1586       // this case occurs often enough in real life to be worth handling.
   1587       CondUse = UI;
   1588       return true;
   1589     }
   1590   return false;
   1591 }
   1592 
   1593 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
   1594 /// a max computation.
   1595 ///
   1596 /// This is a narrow solution to a specific, but acute, problem. For loops
   1597 /// like this:
   1598 ///
   1599 ///   i = 0;
   1600 ///   do {
   1601 ///     p[i] = 0.0;
   1602 ///   } while (++i < n);
   1603 ///
   1604 /// the trip count isn't just 'n', because 'n' might not be positive. And
   1605 /// unfortunately this can come up even for loops where the user didn't use
   1606 /// a C do-while loop. For example, seemingly well-behaved top-test loops
   1607 /// will commonly be lowered like this:
    1608 ///
   1609 ///   if (n > 0) {
   1610 ///     i = 0;
   1611 ///     do {
   1612 ///       p[i] = 0.0;
   1613 ///     } while (++i < n);
   1614 ///   }
   1615 ///
   1616 /// and then it's possible for subsequent optimization to obscure the if
   1617 /// test in such a way that indvars can't find it.
   1618 ///
   1619 /// When indvars can't find the if test in loops like this, it creates a
   1620 /// max expression, which allows it to give the loop a canonical
   1621 /// induction variable:
   1622 ///
   1623 ///   i = 0;
   1624 ///   max = n < 1 ? 1 : n;
   1625 ///   do {
   1626 ///     p[i] = 0.0;
   1627 ///   } while (++i != max);
   1628 ///
   1629 /// Canonical induction variables are necessary because the loop passes
   1630 /// are designed around them. The most obvious example of this is the
   1631 /// LoopInfo analysis, which doesn't remember trip count values. It
   1632 /// expects to be able to rediscover the trip count each time it is
   1633 /// needed, and it does this using a simple analysis that only succeeds if
   1634 /// the loop has a canonical induction variable.
   1635 ///
   1636 /// However, when it comes time to generate code, the maximum operation
   1637 /// can be quite costly, especially if it's inside of an outer loop.
   1638 ///
    1639 /// This function solves this problem by detecting loops of this type and
    1640 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and by deleting
   1641 /// the instructions for the maximum computation.
   1642 ///
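         /// As a rough sketch of the rewrite (the IR names below are purely
         /// illustrative), an exit test like
         ///
         ///   %max = select i1 %guard, i64 1, i64 %n     ; max = n < 1 ? 1 : n
         ///   ...
         ///   %exitcond = icmp ne i64 %i.next, %max
         ///
         /// becomes
         ///
         ///   %exitcond = icmp slt i64 %i.next, %n
         ///
         /// and the select (and its compare, if otherwise unused) is deleted.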
   1643 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
   1644   // Check that the loop matches the pattern we're looking for.
   1645   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
   1646       Cond->getPredicate() != CmpInst::ICMP_NE)
   1647     return Cond;
   1648 
   1649   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
   1650   if (!Sel || !Sel->hasOneUse()) return Cond;
   1651 
   1652   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
   1653   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
   1654     return Cond;
   1655   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
   1656 
   1657   // Add one to the backedge-taken count to get the trip count.
   1658   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
   1659   if (IterationCount != SE.getSCEV(Sel)) return Cond;
   1660 
   1661   // Check for a max calculation that matches the pattern. There's no check
   1662   // for ICMP_ULE here because the comparison would be with zero, which
   1663   // isn't interesting.
   1664   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
   1665   const SCEVNAryExpr *Max = 0;
   1666   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
   1667     Pred = ICmpInst::ICMP_SLE;
   1668     Max = S;
   1669   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
   1670     Pred = ICmpInst::ICMP_SLT;
   1671     Max = S;
   1672   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
   1673     Pred = ICmpInst::ICMP_ULT;
   1674     Max = U;
   1675   } else {
   1676     // No match; bail.
   1677     return Cond;
   1678   }
   1679 
   1680   // To handle a max with more than two operands, this optimization would
   1681   // require additional checking and setup.
   1682   if (Max->getNumOperands() != 2)
   1683     return Cond;
   1684 
   1685   const SCEV *MaxLHS = Max->getOperand(0);
   1686   const SCEV *MaxRHS = Max->getOperand(1);
   1687 
   1688   // ScalarEvolution canonicalizes constants to the left. For < and >, look
   1689   // for a comparison with 1. For <= and >=, a comparison with zero.
   1690   if (!MaxLHS ||
   1691       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
   1692     return Cond;
   1693 
   1694   // Check the relevant induction variable for conformance to
   1695   // the pattern.
   1696   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
   1697   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
   1698   if (!AR || !AR->isAffine() ||
   1699       AR->getStart() != One ||
   1700       AR->getStepRecurrence(SE) != One)
   1701     return Cond;
   1702 
   1703   assert(AR->getLoop() == L &&
   1704          "Loop condition operand is an addrec in a different loop!");
   1705 
   1706   // Check the right operand of the select, and remember it, as it will
   1707   // be used in the new comparison instruction.
   1708   Value *NewRHS = 0;
   1709   if (ICmpInst::isTrueWhenEqual(Pred)) {
   1710     // Look for n+1, and grab n.
   1711     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
   1712       if (isa<ConstantInt>(BO->getOperand(1)) &&
   1713           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
   1714           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
   1715         NewRHS = BO->getOperand(0);
   1716     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
   1717       if (isa<ConstantInt>(BO->getOperand(1)) &&
   1718           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
   1719           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
   1720         NewRHS = BO->getOperand(0);
   1721     if (!NewRHS)
   1722       return Cond;
   1723   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
   1724     NewRHS = Sel->getOperand(1);
   1725   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
   1726     NewRHS = Sel->getOperand(2);
   1727   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
   1728     NewRHS = SU->getValue();
   1729   else
   1730     // Max doesn't match expected pattern.
   1731     return Cond;
   1732 
   1733   // Determine the new comparison opcode. It may be signed or unsigned,
   1734   // and the original comparison may be either equality or inequality.
   1735   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
   1736     Pred = CmpInst::getInversePredicate(Pred);
   1737 
    1738   // OK, everything checks out; change the condition into an SLT or SGE and
   1739   // delete the max calculation.
   1740   ICmpInst *NewCond =
   1741     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
   1742 
   1743   // Delete the max calculation instructions.
   1744   Cond->replaceAllUsesWith(NewCond);
   1745   CondUse->setUser(NewCond);
   1746   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
   1747   Cond->eraseFromParent();
   1748   Sel->eraseFromParent();
   1749   if (Cmp->use_empty())
   1750     Cmp->eraseFromParent();
   1751   return NewCond;
   1752 }
   1753 
    1754 /// OptimizeLoopTermCond - Change the loop terminating condition to use the
    1755 /// post-inc IV when possible.
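         ///
         /// A rough sketch of the effect (not tied to a specific test case):
         /// for an IV whose SCEV is {0,+,1}<%L>, the terminating icmp initially
         /// uses the pre-increment value. After this runs, the compare sits
         /// immediately before the exiting branch and its IVUse is transformed
         /// to the post-increment expression {1,+,1}<%L>, so the pre-increment
         /// value need not be kept live across the increment.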
   1756 void
   1757 LSRInstance::OptimizeLoopTermCond() {
   1758   SmallPtrSet<Instruction *, 4> PostIncs;
   1759 
   1760   BasicBlock *LatchBlock = L->getLoopLatch();
   1761   SmallVector<BasicBlock*, 8> ExitingBlocks;
   1762   L->getExitingBlocks(ExitingBlocks);
   1763 
   1764   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
   1765     BasicBlock *ExitingBlock = ExitingBlocks[i];
   1766 
   1767     // Get the terminating condition for the loop if possible.  If we
   1768     // can, we want to change it to use a post-incremented version of its
   1769     // induction variable, to allow coalescing the live ranges for the IV into
   1770     // one register value.
   1771 
   1772     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
   1773     if (!TermBr)
   1774       continue;
    1775     // FIXME: Overly conservative; the termination condition could be an 'or', etc.
   1776     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
   1777       continue;
   1778 
   1779     // Search IVUsesByStride to find Cond's IVUse if there is one.
   1780     IVStrideUse *CondUse = 0;
   1781     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
   1782     if (!FindIVUserForCond(Cond, CondUse))
   1783       continue;
   1784 
   1785     // If the trip count is computed in terms of a max (due to ScalarEvolution
   1786     // being unable to find a sufficient guard, for example), change the loop
   1787     // comparison to use SLT or ULT instead of NE.
   1788     // One consequence of doing this now is that it disrupts the count-down
   1789     // optimization. That's not always a bad thing though, because in such
   1790     // cases it may still be worthwhile to avoid a max.
   1791     Cond = OptimizeMax(Cond, CondUse);
   1792 
   1793     // If this exiting block dominates the latch block, it may also use
   1794     // the post-inc value if it won't be shared with other uses.
   1795     // Check for dominance.
   1796     if (!DT.dominates(ExitingBlock, LatchBlock))
   1797       continue;
   1798 
   1799     // Conservatively avoid trying to use the post-inc value in non-latch
   1800     // exits if there may be pre-inc users in intervening blocks.
   1801     if (LatchBlock != ExitingBlock)
   1802       for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
   1803         // Test if the use is reachable from the exiting block. This dominator
   1804         // query is a conservative approximation of reachability.
   1805         if (&*UI != CondUse &&
   1806             !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
   1807           // Conservatively assume there may be reuse if the quotient of their
   1808           // strides could be a legal scale.
   1809           const SCEV *A = IU.getStride(*CondUse, L);
   1810           const SCEV *B = IU.getStride(*UI, L);
   1811           if (!A || !B) continue;
   1812           if (SE.getTypeSizeInBits(A->getType()) !=
   1813               SE.getTypeSizeInBits(B->getType())) {
   1814             if (SE.getTypeSizeInBits(A->getType()) >
   1815                 SE.getTypeSizeInBits(B->getType()))
   1816               B = SE.getSignExtendExpr(B, A->getType());
   1817             else
   1818               A = SE.getSignExtendExpr(A, B->getType());
   1819           }
   1820           if (const SCEVConstant *D =
   1821                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
   1822             const ConstantInt *C = D->getValue();
   1823             // Stride of one or negative one can have reuse with non-addresses.
   1824             if (C->isOne() || C->isAllOnesValue())
   1825               goto decline_post_inc;
    1826             // Avoid strides that need 64 or more bits or whose negation overflows.
   1827             if (C->getValue().getMinSignedBits() >= 64 ||
   1828                 C->getValue().isMinSignedValue())
   1829               goto decline_post_inc;
   1830             // Without TLI, assume that any stride might be valid, and so any
   1831             // use might be shared.
   1832             if (!TLI)
   1833               goto decline_post_inc;
   1834             // Check for possible scaled-address reuse.
   1835             Type *AccessTy = getAccessType(UI->getUser());
   1836             TargetLowering::AddrMode AM;
   1837             AM.Scale = C->getSExtValue();
   1838             if (TLI->isLegalAddressingMode(AM, AccessTy))
   1839               goto decline_post_inc;
   1840             AM.Scale = -AM.Scale;
   1841             if (TLI->isLegalAddressingMode(AM, AccessTy))
   1842               goto decline_post_inc;
   1843           }
   1844         }
   1845 
   1846     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
   1847                  << *Cond << '\n');
   1848 
   1849     // It's possible for the setcc instruction to be anywhere in the loop, and
   1850     // possible for it to have multiple users.  If it is not immediately before
   1851     // the exiting block branch, move it.
   1852     if (&*++BasicBlock::iterator(Cond) != TermBr) {
   1853       if (Cond->hasOneUse()) {
   1854         Cond->moveBefore(TermBr);
   1855       } else {
    1856         // Clone the terminating condition and insert it before the exiting branch.
   1857         ICmpInst *OldCond = Cond;
   1858         Cond = cast<ICmpInst>(Cond->clone());
   1859         Cond->setName(L->getHeader()->getName() + ".termcond");
   1860         ExitingBlock->getInstList().insert(TermBr, Cond);
   1861 
   1862         // Clone the IVUse, as the old use still exists!
   1863         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
   1864         TermBr->replaceUsesOfWith(OldCond, Cond);
   1865       }
   1866     }
   1867 
   1868     // If we get to here, we know that we can transform the setcc instruction to
   1869     // use the post-incremented version of the IV, allowing us to coalesce the
   1870     // live ranges for the IV correctly.
   1871     CondUse->transformToPostInc(L);
   1872     Changed = true;
   1873 
   1874     PostIncs.insert(Cond);
   1875   decline_post_inc:;
   1876   }
   1877 
   1878   // Determine an insertion point for the loop induction variable increment. It
   1879   // must dominate all the post-inc comparisons we just set up, and it must
   1880   // dominate the loop latch edge.
   1881   IVIncInsertPos = L->getLoopLatch()->getTerminator();
   1882   for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
   1883        E = PostIncs.end(); I != E; ++I) {
   1884     BasicBlock *BB =
   1885       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
   1886                                     (*I)->getParent());
   1887     if (BB == (*I)->getParent())
   1888       IVIncInsertPos = *I;
   1889     else if (BB != IVIncInsertPos->getParent())
   1890       IVIncInsertPos = BB->getTerminator();
   1891   }
   1892 }
   1893 
   1894 /// reconcileNewOffset - Determine if the given use can accommodate a fixup
   1895 /// at the given offset and other details. If so, update the use and
   1896 /// return true.
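         ///
         /// A sketch with illustrative numbers: if LU currently covers offsets
         /// [0, 8] and a new fixup needs offset 24, the use is reused only if a
         /// delta of 24 - 0 = 24 is always foldable for this use's kind; if so,
         /// the range grows to [0, 24], otherwise the caller falls back to
         /// creating a separate use.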
   1897 bool
   1898 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
   1899                                 LSRUse::KindType Kind, Type *AccessTy) {
   1900   int64_t NewMinOffset = LU.MinOffset;
   1901   int64_t NewMaxOffset = LU.MaxOffset;
   1902   Type *NewAccessTy = AccessTy;
   1903 
    1904   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
    1905   // something conservative; however, this can pessimize the result when, for
    1906   // example, one of the uses has all of its fixups outside the loop.
   1907   if (LU.Kind != Kind)
   1908     return false;
   1909   // Conservatively assume HasBaseReg is true for now.
   1910   if (NewOffset < LU.MinOffset) {
   1911     if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
   1912                           Kind, AccessTy, TLI))
   1913       return false;
   1914     NewMinOffset = NewOffset;
   1915   } else if (NewOffset > LU.MaxOffset) {
   1916     if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
   1917                           Kind, AccessTy, TLI))
   1918       return false;
   1919     NewMaxOffset = NewOffset;
   1920   }
   1921   // Check for a mismatched access type, and fall back conservatively as needed.
   1922   // TODO: Be less conservative when the type is similar and can use the same
   1923   // addressing modes.
   1924   if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
   1925     NewAccessTy = Type::getVoidTy(AccessTy->getContext());
   1926 
   1927   // Update the use.
   1928   LU.MinOffset = NewMinOffset;
   1929   LU.MaxOffset = NewMaxOffset;
   1930   LU.AccessTy = NewAccessTy;
   1931   if (NewOffset != LU.Offsets.back())
   1932     LU.Offsets.push_back(NewOffset);
   1933   return true;
   1934 }
   1935 
   1936 /// getUse - Return an LSRUse index and an offset value for a fixup which
   1937 /// needs the given expression, with the given kind and optional access type.
   1938 /// Either reuse an existing use or create a new one, as needed.
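         ///
         /// For example (a sketch with made-up values): for Expr = {(16 + %p),+,4},
         /// ExtractImmediate peels off 16, leaving {%p,+,4} as the base
         /// expression; if a use with that base and kind already exists and can
         /// absorb an offset of 16, its index is returned along with offset 16,
         /// otherwise a new use is created.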
   1939 std::pair<size_t, int64_t>
   1940 LSRInstance::getUse(const SCEV *&Expr,
   1941                     LSRUse::KindType Kind, Type *AccessTy) {
   1942   const SCEV *Copy = Expr;
   1943   int64_t Offset = ExtractImmediate(Expr, SE);
   1944 
   1945   // Basic uses can't accept any offset, for example.
   1946   if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
   1947     Expr = Copy;
   1948     Offset = 0;
   1949   }
   1950 
   1951   std::pair<UseMapTy::iterator, bool> P =
   1952     UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
   1953   if (!P.second) {
   1954     // A use already existed with this base.
   1955     size_t LUIdx = P.first->second;
   1956     LSRUse &LU = Uses[LUIdx];
   1957     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
   1958       // Reuse this use.
   1959       return std::make_pair(LUIdx, Offset);
   1960   }
   1961 
   1962   // Create a new use.
   1963   size_t LUIdx = Uses.size();
   1964   P.first->second = LUIdx;
   1965   Uses.push_back(LSRUse(Kind, AccessTy));
   1966   LSRUse &LU = Uses[LUIdx];
   1967 
   1968   // We don't need to track redundant offsets, but we don't need to go out
   1969   // of our way here to avoid them.
   1970   if (LU.Offsets.empty() || Offset != LU.Offsets.back())
   1971     LU.Offsets.push_back(Offset);
   1972 
   1973   LU.MinOffset = Offset;
   1974   LU.MaxOffset = Offset;
   1975   return std::make_pair(LUIdx, Offset);
   1976 }
   1977 
   1978 /// DeleteUse - Delete the given use from the Uses list.
   1979 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
   1980   if (&LU != &Uses.back())
   1981     std::swap(LU, Uses.back());
   1982   Uses.pop_back();
   1983 
   1984   // Update RegUses.
   1985   RegUses.SwapAndDropUse(LUIdx, Uses.size());
   1986 }
   1987 
    1988 /// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has
    1989 /// a formula with the same registers as the given formula.
   1990 LSRUse *
   1991 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
   1992                                        const LSRUse &OrigLU) {
   1993   // Search all uses for the formula. This could be more clever.
   1994   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   1995     LSRUse &LU = Uses[LUIdx];
   1996     // Check whether this use is close enough to OrigLU, to see whether it's
   1997     // worthwhile looking through its formulae.
   1998     // Ignore ICmpZero uses because they may contain formulae generated by
   1999     // GenerateICmpZeroScales, in which case adding fixup offsets may
   2000     // be invalid.
   2001     if (&LU != &OrigLU &&
   2002         LU.Kind != LSRUse::ICmpZero &&
   2003         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
   2004         LU.WidestFixupType == OrigLU.WidestFixupType &&
   2005         LU.HasFormulaWithSameRegs(OrigF)) {
   2006       // Scan through this use's formulae.
   2007       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
   2008            E = LU.Formulae.end(); I != E; ++I) {
   2009         const Formula &F = *I;
   2010         // Check to see if this formula has the same registers and symbols
   2011         // as OrigF.
   2012         if (F.BaseRegs == OrigF.BaseRegs &&
   2013             F.ScaledReg == OrigF.ScaledReg &&
   2014             F.AM.BaseGV == OrigF.AM.BaseGV &&
   2015             F.AM.Scale == OrigF.AM.Scale &&
   2016             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
   2017           if (F.AM.BaseOffs == 0)
   2018             return &LU;
   2019           // This is the formula where all the registers and symbols matched;
   2020           // there aren't going to be any others. Since we declined it, we
    2021           // can skip the rest of the formulae and proceed to the next LSRUse.
   2022           break;
   2023         }
   2024       }
   2025     }
   2026   }
   2027 
   2028   // Nothing looked good.
   2029   return 0;
   2030 }
   2031 
   2032 void LSRInstance::CollectInterestingTypesAndFactors() {
   2033   SmallSetVector<const SCEV *, 4> Strides;
   2034 
   2035   // Collect interesting types and strides.
   2036   SmallVector<const SCEV *, 4> Worklist;
   2037   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
   2038     const SCEV *Expr = IU.getExpr(*UI);
   2039 
   2040     // Collect interesting types.
   2041     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
   2042 
   2043     // Add strides for mentioned loops.
   2044     Worklist.push_back(Expr);
   2045     do {
   2046       const SCEV *S = Worklist.pop_back_val();
   2047       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
   2048         Strides.insert(AR->getStepRecurrence(SE));
   2049         Worklist.push_back(AR->getStart());
   2050       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
   2051         Worklist.append(Add->op_begin(), Add->op_end());
   2052       }
   2053     } while (!Worklist.empty());
   2054   }
   2055 
   2056   // Compute interesting factors from the set of interesting strides.
   2057   for (SmallSetVector<const SCEV *, 4>::const_iterator
   2058        I = Strides.begin(), E = Strides.end(); I != E; ++I)
   2059     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
   2060          llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
   2061       const SCEV *OldStride = *I;
   2062       const SCEV *NewStride = *NewStrideIter;
   2063 
   2064       if (SE.getTypeSizeInBits(OldStride->getType()) !=
   2065           SE.getTypeSizeInBits(NewStride->getType())) {
   2066         if (SE.getTypeSizeInBits(OldStride->getType()) >
   2067             SE.getTypeSizeInBits(NewStride->getType()))
   2068           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
   2069         else
   2070           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
   2071       }
   2072       if (const SCEVConstant *Factor =
   2073             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
   2074                                                         SE, true))) {
   2075         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
   2076           Factors.insert(Factor->getValue()->getValue().getSExtValue());
   2077       } else if (const SCEVConstant *Factor =
   2078                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
   2079                                                                NewStride,
   2080                                                                SE, true))) {
   2081         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
   2082           Factors.insert(Factor->getValue()->getValue().getSExtValue());
   2083       }
   2084     }
   2085 
   2086   // If all uses use the same type, don't bother looking for truncation-based
   2087   // reuse.
   2088   if (Types.size() == 1)
   2089     Types.clear();
   2090 
   2091   DEBUG(print_factors_and_types(dbgs()));
   2092 }
   2093 
   2094 void LSRInstance::CollectFixupsAndInitialFormulae() {
   2095   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
   2096     // Record the uses.
   2097     LSRFixup &LF = getNewFixup();
   2098     LF.UserInst = UI->getUser();
   2099     LF.OperandValToReplace = UI->getOperandValToReplace();
   2100     LF.PostIncLoops = UI->getPostIncLoops();
   2101 
   2102     LSRUse::KindType Kind = LSRUse::Basic;
   2103     Type *AccessTy = 0;
   2104     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
   2105       Kind = LSRUse::Address;
   2106       AccessTy = getAccessType(LF.UserInst);
   2107     }
   2108 
   2109     const SCEV *S = IU.getExpr(*UI);
   2110 
   2111     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
   2112     // (N - i == 0), and this allows (N - i) to be the expression that we work
   2113     // with rather than just N or i, so we can consider the register
   2114     // requirements for both N and i at the same time. Limiting this code to
   2115     // equality icmps is not a problem because all interesting loops use
   2116     // equality icmps, thanks to IndVarSimplify.
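             // For example (a sketch): for "i == N" with N loop-invariant and
             // i's SCEV {0,+,1}, S becomes (N - {0,+,1}) and the use kind
             // becomes ICmpZero, i.e. the compare is treated as "(N - i) == 0"
             // so N and i are considered together in a single expression.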
   2117     if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
   2118       if (CI->isEquality()) {
   2119         // Swap the operands if needed to put the OperandValToReplace on the
   2120         // left, for consistency.
   2121         Value *NV = CI->getOperand(1);
   2122         if (NV == LF.OperandValToReplace) {
   2123           CI->setOperand(1, CI->getOperand(0));
   2124           CI->setOperand(0, NV);
   2125           NV = CI->getOperand(1);
   2126           Changed = true;
   2127         }
   2128 
   2129         // x == y  -->  x - y == 0
   2130         const SCEV *N = SE.getSCEV(NV);
   2131         if (SE.isLoopInvariant(N, L)) {
   2132           // S is normalized, so normalize N before folding it into S
   2133           // to keep the result normalized.
   2134           N = TransformForPostIncUse(Normalize, N, CI, 0,
   2135                                      LF.PostIncLoops, SE, DT);
   2136           Kind = LSRUse::ICmpZero;
   2137           S = SE.getMinusSCEV(N, S);
   2138         }
   2139 
   2140         // -1 and the negations of all interesting strides (except the negation
   2141         // of -1) are now also interesting.
   2142         for (size_t i = 0, e = Factors.size(); i != e; ++i)
   2143           if (Factors[i] != -1)
   2144             Factors.insert(-(uint64_t)Factors[i]);
   2145         Factors.insert(-1);
   2146       }
   2147 
   2148     // Set up the initial formula for this use.
   2149     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
   2150     LF.LUIdx = P.first;
   2151     LF.Offset = P.second;
   2152     LSRUse &LU = Uses[LF.LUIdx];
   2153     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
   2154     if (!LU.WidestFixupType ||
   2155         SE.getTypeSizeInBits(LU.WidestFixupType) <
   2156         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
   2157       LU.WidestFixupType = LF.OperandValToReplace->getType();
   2158 
   2159     // If this is the first use of this LSRUse, give it a formula.
   2160     if (LU.Formulae.empty()) {
   2161       InsertInitialFormula(S, LU, LF.LUIdx);
   2162       CountRegisters(LU.Formulae.back(), LF.LUIdx);
   2163     }
   2164   }
   2165 
   2166   DEBUG(print_fixups(dbgs()));
   2167 }
   2168 
   2169 /// InsertInitialFormula - Insert a formula for the given expression into
   2170 /// the given use, separating out loop-variant portions from loop-invariant
   2171 /// and loop-computable portions.
   2172 void
   2173 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
   2174   Formula F;
   2175   F.InitialMatch(S, L, SE);
   2176   bool Inserted = InsertFormula(LU, LUIdx, F);
   2177   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
   2178 }
   2179 
   2180 /// InsertSupplementalFormula - Insert a simple single-register formula for
   2181 /// the given expression into the given use.
   2182 void
   2183 LSRInstance::InsertSupplementalFormula(const SCEV *S,
   2184                                        LSRUse &LU, size_t LUIdx) {
   2185   Formula F;
   2186   F.BaseRegs.push_back(S);
   2187   F.AM.HasBaseReg = true;
   2188   bool Inserted = InsertFormula(LU, LUIdx, F);
   2189   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
   2190 }
   2191 
   2192 /// CountRegisters - Note which registers are used by the given formula,
   2193 /// updating RegUses.
   2194 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
   2195   if (F.ScaledReg)
   2196     RegUses.CountRegister(F.ScaledReg, LUIdx);
   2197   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
   2198        E = F.BaseRegs.end(); I != E; ++I)
   2199     RegUses.CountRegister(*I, LUIdx);
   2200 }
   2201 
   2202 /// InsertFormula - If the given formula has not yet been inserted, add it to
   2203 /// the list, and return true. Return false otherwise.
   2204 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
   2205   if (!LU.InsertFormula(F))
   2206     return false;
   2207 
   2208   CountRegisters(F, LUIdx);
   2209   return true;
   2210 }
   2211 
   2212 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
   2213 /// loop-invariant values which we're tracking. These other uses will pin these
   2214 /// values in registers, making them less profitable for elimination.
   2215 /// TODO: This currently misses non-constant addrec step registers.
   2216 /// TODO: Should this give more weight to users inside the loop?
   2217 void
   2218 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
   2219   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
   2220   SmallPtrSet<const SCEV *, 8> Inserted;
   2221 
   2222   while (!Worklist.empty()) {
   2223     const SCEV *S = Worklist.pop_back_val();
   2224 
   2225     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
   2226       Worklist.append(N->op_begin(), N->op_end());
   2227     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
   2228       Worklist.push_back(C->getOperand());
   2229     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
   2230       Worklist.push_back(D->getLHS());
   2231       Worklist.push_back(D->getRHS());
   2232     } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
   2233       if (!Inserted.insert(U)) continue;
   2234       const Value *V = U->getValue();
   2235       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
   2236         // Look for instructions defined outside the loop.
   2237         if (L->contains(Inst)) continue;
   2238       } else if (isa<UndefValue>(V))
   2239         // Undef doesn't have a live range, so it doesn't matter.
   2240         continue;
   2241       for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
   2242            UI != UE; ++UI) {
   2243         const Instruction *UserInst = dyn_cast<Instruction>(*UI);
   2244         // Ignore non-instructions.
   2245         if (!UserInst)
   2246           continue;
   2247         // Ignore instructions in other functions (as can happen with
   2248         // Constants).
   2249         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
   2250           continue;
   2251         // Ignore instructions not dominated by the loop.
   2252         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
   2253           UserInst->getParent() :
   2254           cast<PHINode>(UserInst)->getIncomingBlock(
   2255             PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
   2256         if (!DT.dominates(L->getHeader(), UseBB))
   2257           continue;
   2258         // Ignore uses which are part of other SCEV expressions, to avoid
   2259         // analyzing them multiple times.
   2260         if (SE.isSCEVable(UserInst->getType())) {
   2261           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
   2262           // If the user is a no-op, look through to its uses.
   2263           if (!isa<SCEVUnknown>(UserS))
   2264             continue;
   2265           if (UserS == U) {
   2266             Worklist.push_back(
   2267               SE.getUnknown(const_cast<Instruction *>(UserInst)));
   2268             continue;
   2269           }
   2270         }
   2271         // Ignore icmp instructions which are already being analyzed.
   2272         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
   2273           unsigned OtherIdx = !UI.getOperandNo();
   2274           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
   2275           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
   2276             continue;
   2277         }
   2278 
   2279         LSRFixup &LF = getNewFixup();
   2280         LF.UserInst = const_cast<Instruction *>(UserInst);
   2281         LF.OperandValToReplace = UI.getUse();
   2282         std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
   2283         LF.LUIdx = P.first;
   2284         LF.Offset = P.second;
   2285         LSRUse &LU = Uses[LF.LUIdx];
   2286         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
   2287         if (!LU.WidestFixupType ||
   2288             SE.getTypeSizeInBits(LU.WidestFixupType) <
   2289             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
   2290           LU.WidestFixupType = LF.OperandValToReplace->getType();
   2291         InsertSupplementalFormula(U, LU, LF.LUIdx);
    2292         CountRegisters(LU.Formulae.back(), LF.LUIdx);
   2293         break;
   2294       }
   2295     }
   2296   }
   2297 }
   2298 
   2299 /// CollectSubexprs - Split S into subexpressions which can be pulled out into
   2300 /// separate registers. If C is non-null, multiply each subexpression by C.
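         ///
         /// For example (a sketch): given S = ((4 * %x) + %y + {%a,+,8}), this
         /// collects (4 * %x), %y, %a, and {0,+,8}, splitting the addrec's
         /// non-zero start out into its own subexpression.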
   2301 static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
   2302                             SmallVectorImpl<const SCEV *> &Ops,
   2303                             const Loop *L,
   2304                             ScalarEvolution &SE) {
   2305   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
   2306     // Break out add operands.
   2307     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
   2308          I != E; ++I)
   2309       CollectSubexprs(*I, C, Ops, L, SE);
   2310     return;
   2311   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
   2312     // Split a non-zero base out of an addrec.
   2313     if (!AR->getStart()->isZero()) {
   2314       CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
   2315                                        AR->getStepRecurrence(SE),
   2316                                        AR->getLoop(),
    2317                                        // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
   2318                                        SCEV::FlagAnyWrap),
   2319                       C, Ops, L, SE);
   2320       CollectSubexprs(AR->getStart(), C, Ops, L, SE);
   2321       return;
   2322     }
   2323   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
   2324     // Break (C * (a + b + c)) into C*a + C*b + C*c.
   2325     if (Mul->getNumOperands() == 2)
   2326       if (const SCEVConstant *Op0 =
   2327             dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
   2328         CollectSubexprs(Mul->getOperand(1),
   2329                         C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
   2330                         Ops, L, SE);
   2331         return;
   2332       }
   2333   }
   2334 
   2335   // Otherwise use the value itself, optionally with a scale applied.
   2336   Ops.push_back(C ? SE.getMulExpr(C, S) : S);
   2337 }
   2338 
   2339 /// GenerateReassociations - Split out subexpressions from adds and the bases of
   2340 /// addrecs.
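         ///
         /// A sketch of the effect (the registers are illustrative): a formula
         /// whose base register is (%a + %b + {0,+,4}) can be split so that,
         /// say, %b becomes its own base register while (%a + {0,+,4}) stays
         /// behind, giving the solver an additional candidate formula to weigh.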
   2341 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
   2342                                          Formula Base,
   2343                                          unsigned Depth) {
   2344   // Arbitrarily cap recursion to protect compile time.
   2345   if (Depth >= 3) return;
   2346 
   2347   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
   2348     const SCEV *BaseReg = Base.BaseRegs[i];
   2349 
   2350     SmallVector<const SCEV *, 8> AddOps;
   2351     CollectSubexprs(BaseReg, 0, AddOps, L, SE);
   2352 
   2353     if (AddOps.size() == 1) continue;
   2354 
   2355     for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
   2356          JE = AddOps.end(); J != JE; ++J) {
   2357 
   2358       // Loop-variant "unknown" values are uninteresting; we won't be able to
   2359       // do anything meaningful with them.
   2360       if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
   2361         continue;
   2362 
   2363       // Don't pull a constant into a register if the constant could be folded
   2364       // into an immediate field.
   2365       if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
   2366                            Base.getNumRegs() > 1,
   2367                            LU.Kind, LU.AccessTy, TLI, SE))
   2368         continue;
   2369 
   2370       // Collect all operands except *J.
   2371       SmallVector<const SCEV *, 8> InnerAddOps
   2372         (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
   2373       InnerAddOps.append
   2374         (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
   2375 
   2376       // Don't leave just a constant behind in a register if the constant could
   2377       // be folded into an immediate field.
   2378       if (InnerAddOps.size() == 1 &&
   2379           isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
   2380                            Base.getNumRegs() > 1,
   2381                            LU.Kind, LU.AccessTy, TLI, SE))
   2382         continue;
   2383 
   2384       const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
   2385       if (InnerSum->isZero())
   2386         continue;
   2387       Formula F = Base;
   2388 
   2389       // Add the remaining pieces of the add back into the new formula.
   2390       const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
   2391       if (TLI && InnerSumSC &&
   2392           SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
   2393           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
   2394                                    InnerSumSC->getValue()->getZExtValue())) {
   2395         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
   2396                            InnerSumSC->getValue()->getZExtValue();
   2397         F.BaseRegs.erase(F.BaseRegs.begin() + i);
   2398       } else
   2399         F.BaseRegs[i] = InnerSum;
   2400 
   2401       // Add J as its own register, or an unfolded immediate.
   2402       const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
   2403       if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
   2404           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
   2405                                    SC->getValue()->getZExtValue()))
   2406         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
   2407                            SC->getValue()->getZExtValue();
   2408       else
   2409         F.BaseRegs.push_back(*J);
   2410 
   2411       if (InsertFormula(LU, LUIdx, F))
   2412         // If that formula hadn't been seen before, recurse to find more like
   2413         // it.
   2414         GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
   2415     }
   2416   }
   2417 }
   2418 
   2419 /// GenerateCombinations - Generate a formula consisting of all of the
   2420 /// loop-dominating registers added into a single register.
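         ///
         /// A sketch of the effect: if a formula's base registers include two
         /// loop-invariant values %p and %q that are both available above the
         /// loop, they can be combined into the single register (%p + %q),
         /// trading one register in the formula for an add outside the loop.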
   2421 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
   2422                                        Formula Base) {
    2423   // This method is only interesting when there is more than one base register.
   2424   if (Base.BaseRegs.size() <= 1) return;
   2425 
   2426   Formula F = Base;
   2427   F.BaseRegs.clear();
   2428   SmallVector<const SCEV *, 4> Ops;
   2429   for (SmallVectorImpl<const SCEV *>::const_iterator
   2430        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
   2431     const SCEV *BaseReg = *I;
   2432     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
   2433         !SE.hasComputableLoopEvolution(BaseReg, L))
   2434       Ops.push_back(BaseReg);
   2435     else
   2436       F.BaseRegs.push_back(BaseReg);
   2437   }
   2438   if (Ops.size() > 1) {
   2439     const SCEV *Sum = SE.getAddExpr(Ops);
   2440     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
   2441     // opportunity to fold something. For now, just ignore such cases
   2442     // rather than proceed with zero in a register.
   2443     if (!Sum->isZero()) {
   2444       F.BaseRegs.push_back(Sum);
   2445       (void)InsertFormula(LU, LUIdx, F);
   2446     }
   2447   }
   2448 }
   2449 
   2450 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
   2451 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
   2452                                           Formula Base) {
   2453   // We can't add a symbolic offset if the address already contains one.
   2454   if (Base.AM.BaseGV) return;
   2455 
   2456   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
   2457     const SCEV *G = Base.BaseRegs[i];
   2458     GlobalValue *GV = ExtractSymbol(G, SE);
   2459     if (G->isZero() || !GV)
   2460       continue;
   2461     Formula F = Base;
   2462     F.AM.BaseGV = GV;
   2463     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
   2464                     LU.Kind, LU.AccessTy, TLI))
   2465       continue;
   2466     F.BaseRegs[i] = G;
   2467     (void)InsertFormula(LU, LUIdx, F);
   2468   }
   2469 }
   2470 
    2471 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
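         ///
         /// A sketch of the effect (the offsets are illustrative): if a use's
         /// fixup offsets span [8, 24], a base register G can be rewritten as
         /// (G + 8) with the formula's immediate reduced by 8, so the fixups
         /// can share a register that already has the common offset folded in.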
   2472 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
   2473                                           Formula Base) {
   2474   // TODO: For now, just add the min and max offset, because it usually isn't
    2475   // worthwhile looking at everything in between.
   2476   SmallVector<int64_t, 2> Worklist;
   2477   Worklist.push_back(LU.MinOffset);
   2478   if (LU.MaxOffset != LU.MinOffset)
   2479     Worklist.push_back(LU.MaxOffset);
   2480 
   2481   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
   2482     const SCEV *G = Base.BaseRegs[i];
   2483 
   2484     for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
   2485          E = Worklist.end(); I != E; ++I) {
   2486       Formula F = Base;
   2487       F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
   2488       if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
   2489                      LU.Kind, LU.AccessTy, TLI)) {
   2490         // Add the offset to the base register.
   2491         const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
   2492         // If it cancelled out, drop the base register, otherwise update it.
   2493         if (NewG->isZero()) {
   2494           std::swap(F.BaseRegs[i], F.BaseRegs.back());
   2495           F.BaseRegs.pop_back();
   2496         } else
   2497           F.BaseRegs[i] = NewG;
   2498 
   2499         (void)InsertFormula(LU, LUIdx, F);
   2500       }
   2501     }
   2502 
   2503     int64_t Imm = ExtractImmediate(G, SE);
   2504     if (G->isZero() || Imm == 0)
   2505       continue;
   2506     Formula F = Base;
   2507     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
   2508     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
   2509                     LU.Kind, LU.AccessTy, TLI))
   2510       continue;
   2511     F.BaseRegs[i] = G;
   2512     (void)InsertFormula(LU, LUIdx, F);
   2513   }
   2514 }
   2515 
   2516 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
   2517 /// the comparison. For example, x == y -> x*c == y*c.
   2518 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
   2519                                          Formula Base) {
   2520   if (LU.Kind != LSRUse::ICmpZero) return;
   2521 
   2522   // Determine the integer type for the base formula.
   2523   Type *IntTy = Base.getType();
   2524   if (!IntTy) return;
   2525   if (SE.getTypeSizeInBits(IntTy) > 64) return;
   2526 
   2527   // Don't do this if there is more than one offset.
   2528   if (LU.MinOffset != LU.MaxOffset) return;
   2529 
   2530   assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
   2531 
   2532   // Check each interesting stride.
   2533   for (SmallSetVector<int64_t, 8>::const_iterator
   2534        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
   2535     int64_t Factor = *I;
   2536 
   2537     // Check that the multiplication doesn't overflow.
   2538     if (Base.AM.BaseOffs == INT64_MIN && Factor == -1)
   2539       continue;
   2540     int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
   2541     if (NewBaseOffs / Factor != Base.AM.BaseOffs)
   2542       continue;
   2543 
   2544     // Check that multiplying with the use offset doesn't overflow.
   2545     int64_t Offset = LU.MinOffset;
   2546     if (Offset == INT64_MIN && Factor == -1)
   2547       continue;
   2548     Offset = (uint64_t)Offset * Factor;
   2549     if (Offset / Factor != LU.MinOffset)
   2550       continue;
   2551 
   2552     Formula F = Base;
   2553     F.AM.BaseOffs = NewBaseOffs;
   2554 
   2555     // Check that this scale is legal.
   2556     if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
   2557       continue;
   2558 
   2559     // Compensate for the use having MinOffset built into it.
   2560     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
   2561 
   2562     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
   2563 
   2564     // Check that multiplying with each base register doesn't overflow.
   2565     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
   2566       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
   2567       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
   2568         goto next;
   2569     }
   2570 
   2571     // Check that multiplying with the scaled register doesn't overflow.
   2572     if (F.ScaledReg) {
   2573       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
   2574       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
   2575         continue;
   2576     }
   2577 
   2578     // Check that multiplying with the unfolded offset doesn't overflow.
   2579     if (F.UnfoldedOffset != 0) {
   2580       if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
   2581         continue;
   2582       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
   2583       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
   2584         continue;
   2585     }
   2586 
   2587     // If we make it here and it's legal, add it.
   2588     (void)InsertFormula(LU, LUIdx, F);
   2589   next:;
   2590   }
   2591 }
   2592 
   2593 /// GenerateScales - Generate stride factor reuse formulae by making use of
   2594 /// scaled-offset address modes, for example.
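        /// For example, with an interesting factor of 4, a base register {0,+,4}
        /// can be restated as the scaled register {0,+,1} with AM.Scale = 4,
        /// matching a base + 4*index addressing mode where the target has one.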
   2595 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
   2596   // Determine the integer type for the base formula.
   2597   Type *IntTy = Base.getType();
   2598   if (!IntTy) return;
   2599 
   2600   // If this Formula already has a scaled register, we can't add another one.
   2601   if (Base.AM.Scale != 0) return;
   2602 
   2603   // Check each interesting stride.
   2604   for (SmallSetVector<int64_t, 8>::const_iterator
   2605        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
   2606     int64_t Factor = *I;
   2607 
   2608     Base.AM.Scale = Factor;
   2609     Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
   2610     // Check whether this scale is going to be legal.
   2611     if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
   2612                     LU.Kind, LU.AccessTy, TLI)) {
   2613       // As a special case, reclassify out-of-loop Basic users as Special uses.
   2614       // TODO: Reconsider this special case.
   2615       if (LU.Kind == LSRUse::Basic &&
   2616           isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
   2617                      LSRUse::Special, LU.AccessTy, TLI) &&
   2618           LU.AllFixupsOutsideLoop)
   2619         LU.Kind = LSRUse::Special;
   2620       else
   2621         continue;
   2622     }
   2623     // For an ICmpZero, negating a solitary base register won't lead to
   2624     // new solutions.
   2625     if (LU.Kind == LSRUse::ICmpZero &&
   2626         !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
   2627       continue;
   2628     // For each addrec base reg, apply the scale, if possible.
   2629     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
   2630       if (const SCEVAddRecExpr *AR =
   2631             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
   2632         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
   2633         if (FactorS->isZero())
   2634           continue;
   2635         // Divide out the factor, ignoring high bits, since we'll be
   2636         // scaling the value back up in the end.
   2637         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
   2638           // TODO: This could be optimized to avoid all the copying.
   2639           Formula F = Base;
   2640           F.ScaledReg = Quotient;
   2641           F.DeleteBaseReg(F.BaseRegs[i]);
   2642           (void)InsertFormula(LU, LUIdx, F);
   2643         }
   2644       }
   2645   }
   2646 }
   2647 
   2648 /// GenerateTruncates - Generate reuse formulae from different IV types.
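        /// For example, if a use needs an i32 value, i64 is among the interesting
        /// types, and the target reports truncation from i64 to i32 as free, a
        /// formula can be restated in terms of the i64 registers, letting this
        /// use share registers with the i64 uses.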
   2649 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
   2650   // This requires TargetLowering to tell us which truncates are free.
   2651   if (!TLI) return;
   2652 
   2653   // Don't bother truncating symbolic values.
   2654   if (Base.AM.BaseGV) return;
   2655 
   2656   // Determine the integer type for the base formula.
   2657   Type *DstTy = Base.getType();
   2658   if (!DstTy) return;
   2659   DstTy = SE.getEffectiveSCEVType(DstTy);
   2660 
   2661   for (SmallSetVector<Type *, 4>::const_iterator
   2662        I = Types.begin(), E = Types.end(); I != E; ++I) {
   2663     Type *SrcTy = *I;
   2664     if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
   2665       Formula F = Base;
   2666 
   2667       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
   2668       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
   2669            JE = F.BaseRegs.end(); J != JE; ++J)
   2670         *J = SE.getAnyExtendExpr(*J, SrcTy);
   2671 
   2672       // TODO: This assumes we've done basic processing on all uses and
   2673       // have an idea what the register usage is.
   2674       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
   2675         continue;
   2676 
   2677       (void)InsertFormula(LU, LUIdx, F);
   2678     }
   2679   }
   2680 }
   2681 
   2682 namespace {
   2683 
   2684 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
   2685 /// defer modifications so that the search phase doesn't have to worry about
   2686 /// the data structures moving underneath it.
   2687 struct WorkItem {
   2688   size_t LUIdx;
   2689   int64_t Imm;
   2690   const SCEV *OrigReg;
   2691 
   2692   WorkItem(size_t LI, int64_t I, const SCEV *R)
   2693     : LUIdx(LI), Imm(I), OrigReg(R) {}
   2694 
   2695   void print(raw_ostream &OS) const;
   2696   void dump() const;
   2697 };
   2698 
   2699 }
   2700 
   2701 void WorkItem::print(raw_ostream &OS) const {
   2702   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
   2703      << ", add offset " << Imm;
   2704 }
   2705 
   2706 void WorkItem::dump() const {
   2707   print(errs()); errs() << '\n';
   2708 }
   2709 
   2710 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
   2711 /// distance apart and try to form reuse opportunities between them.
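        /// For example, if one formula uses the register {0,+,4} and another uses
        /// {8,+,4}, the two differ only by the constant 8, so the second formula
        /// can be restated in terms of {0,+,4} plus an immediate offset of 8,
        /// letting both uses share a single register.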
   2712 void LSRInstance::GenerateCrossUseConstantOffsets() {
   2713   // Group the registers by their value without any added constant offset.
   2714   typedef std::map<int64_t, const SCEV *> ImmMapTy;
   2715   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
   2716   RegMapTy Map;
   2717   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
   2718   SmallVector<const SCEV *, 8> Sequence;
   2719   for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
   2720        I != E; ++I) {
   2721     const SCEV *Reg = *I;
   2722     int64_t Imm = ExtractImmediate(Reg, SE);
   2723     std::pair<RegMapTy::iterator, bool> Pair =
   2724       Map.insert(std::make_pair(Reg, ImmMapTy()));
   2725     if (Pair.second)
   2726       Sequence.push_back(Reg);
   2727     Pair.first->second.insert(std::make_pair(Imm, *I));
   2728     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
   2729   }
   2730 
   2731   // Now examine each set of registers with the same base value. Build up
   2732   // a list of work to do and do the work in a separate step so that we're
   2733   // not adding formulae and register counts while we're searching.
   2734   SmallVector<WorkItem, 32> WorkItems;
   2735   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
   2736   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
   2737        E = Sequence.end(); I != E; ++I) {
   2738     const SCEV *Reg = *I;
   2739     const ImmMapTy &Imms = Map.find(Reg)->second;
   2740 
   2741     // It's not worthwhile looking for reuse if there's only one offset.
   2742     if (Imms.size() == 1)
   2743       continue;
   2744 
   2745     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
   2746           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
   2747                J != JE; ++J)
   2748             dbgs() << ' ' << J->first;
   2749           dbgs() << '\n');
   2750 
   2751     // Examine each offset.
   2752     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
   2753          J != JE; ++J) {
   2754       const SCEV *OrigReg = J->second;
   2755 
   2756       int64_t JImm = J->first;
   2757       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
   2758 
   2759       if (!isa<SCEVConstant>(OrigReg) &&
   2760           UsedByIndicesMap[Reg].count() == 1) {
   2761         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
   2762         continue;
   2763       }
   2764 
   2765       // Conservatively examine offsets between this orig reg and a few
   2766       // selected other orig regs.
   2767       ImmMapTy::const_iterator OtherImms[] = {
   2768         Imms.begin(), prior(Imms.end()),
   2769         Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
   2770       };
   2771       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
   2772         ImmMapTy::const_iterator M = OtherImms[i];
   2773         if (M == J || M == JE) continue;
   2774 
   2775         // Compute the difference between the two.
   2776         int64_t Imm = (uint64_t)JImm - M->first;
   2777         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
   2778              LUIdx = UsedByIndices.find_next(LUIdx))
   2779           // Make a memo of this use, offset, and register tuple.
   2780           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
   2781             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
   2782       }
   2783     }
   2784   }
   2785 
   2786   Map.clear();
   2787   Sequence.clear();
   2788   UsedByIndicesMap.clear();
   2789   UniqueItems.clear();
   2790 
   2791   // Now iterate through the worklist and add new formulae.
   2792   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
   2793        E = WorkItems.end(); I != E; ++I) {
   2794     const WorkItem &WI = *I;
   2795     size_t LUIdx = WI.LUIdx;
   2796     LSRUse &LU = Uses[LUIdx];
   2797     int64_t Imm = WI.Imm;
   2798     const SCEV *OrigReg = WI.OrigReg;
   2799 
   2800     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
   2801     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
   2802     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
   2803 
   2804     // TODO: Use a more targeted data structure.
   2805     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
   2806       const Formula &F = LU.Formulae[L];
   2807       // Use the immediate in the scaled register.
   2808       if (F.ScaledReg == OrigReg) {
   2809         int64_t Offs = (uint64_t)F.AM.BaseOffs +
   2810                        Imm * (uint64_t)F.AM.Scale;
   2811         // Don't create 50 + reg(-50).
   2812         if (F.referencesReg(SE.getSCEV(
   2813                    ConstantInt::get(IntTy, -(uint64_t)Offs))))
   2814           continue;
   2815         Formula NewF = F;
   2816         NewF.AM.BaseOffs = Offs;
   2817         if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
   2818                         LU.Kind, LU.AccessTy, TLI))
   2819           continue;
   2820         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
   2821 
   2822         // If the new scaled register is a constant, and adding the constant
   2823         // value to the immediate would produce a value closer to zero than the
   2824         // immediate itself, then the formula isn't worthwhile.
   2825         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
   2826           if (C->getValue()->isNegative() !=
   2827                 (NewF.AM.BaseOffs < 0) &&
   2828               (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
   2829                 .ule(abs64(NewF.AM.BaseOffs)))
   2830             continue;
   2831 
   2832         // OK, looks good.
   2833         (void)InsertFormula(LU, LUIdx, NewF);
   2834       } else {
   2835         // Use the immediate in a base register.
   2836         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
   2837           const SCEV *BaseReg = F.BaseRegs[N];
   2838           if (BaseReg != OrigReg)
   2839             continue;
   2840           Formula NewF = F;
   2841           NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
   2842           if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
   2843                           LU.Kind, LU.AccessTy, TLI)) {
   2844             if (!TLI ||
   2845                 !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
   2846               continue;
   2847             NewF = F;
   2848             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
   2849           }
   2850           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
   2851 
   2852           // If the new formula has a constant in a register, and adding the
   2853           // constant value to the immediate would produce a value closer to
   2854           // zero than the immediate itself, then the formula isn't worthwhile.
   2855           for (SmallVectorImpl<const SCEV *>::const_iterator
   2856                J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
   2857                J != JE; ++J)
   2858             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
   2859               if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt(
   2860                    abs64(NewF.AM.BaseOffs)) &&
   2861                   (C->getValue()->getValue() +
   2862                    NewF.AM.BaseOffs).countTrailingZeros() >=
   2863                    CountTrailingZeros_64(NewF.AM.BaseOffs))
   2864                 goto skip_formula;
   2865 
   2866           // Ok, looks good.
   2867           (void)InsertFormula(LU, LUIdx, NewF);
   2868           break;
   2869         skip_formula:;
   2870         }
   2871       }
   2872     }
   2873   }
   2874 }
   2875 
   2876 /// GenerateAllReuseFormulae - Generate formulae for each use.
   2877 void
   2878 LSRInstance::GenerateAllReuseFormulae() {
   2879   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
   2880   // queries are more precise.
   2881   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2882     LSRUse &LU = Uses[LUIdx];
   2883     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2884       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
   2885     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2886       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
   2887   }
   2888   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2889     LSRUse &LU = Uses[LUIdx];
   2890     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2891       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
   2892     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2893       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
   2894     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2895       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
   2896     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2897       GenerateScales(LU, LUIdx, LU.Formulae[i]);
   2898   }
   2899   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2900     LSRUse &LU = Uses[LUIdx];
   2901     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2902       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
   2903   }
   2904 
   2905   GenerateCrossUseConstantOffsets();
   2906 
   2907   DEBUG(dbgs() << "\n"
   2908                   "After generating reuse formulae:\n";
   2909         print_uses(dbgs()));
   2910 }
   2911 
   2912 /// FilterOutUndesirableDedicatedRegisters - If multiple formulae use the same
   2913 /// set of registers shared with other uses, keep only the best one.
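        /// Formulae are keyed by the subset of their registers that other uses
        /// also reference; formulae with the same key offer the same sharing
        /// opportunities, so only the cheapest one (by Cost) needs to be kept.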
   2914 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
   2915   DenseSet<const SCEV *> VisitedRegs;
   2916   SmallPtrSet<const SCEV *, 16> Regs;
   2917 #ifndef NDEBUG
   2918   bool ChangedFormulae = false;
   2919 #endif
   2920 
   2921   // Collect the best formula for each unique set of shared registers. This
   2922   // is reset for each use.
   2923   typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
   2924     BestFormulaeTy;
   2925   BestFormulaeTy BestFormulae;
   2926 
   2927   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2928     LSRUse &LU = Uses[LUIdx];
   2929     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
   2930 
   2931     bool Any = false;
   2932     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
   2933          FIdx != NumForms; ++FIdx) {
   2934       Formula &F = LU.Formulae[FIdx];
   2935 
   2936       SmallVector<const SCEV *, 2> Key;
   2937       for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
   2938            JE = F.BaseRegs.end(); J != JE; ++J) {
   2939         const SCEV *Reg = *J;
   2940         if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
   2941           Key.push_back(Reg);
   2942       }
   2943       if (F.ScaledReg &&
   2944           RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
   2945         Key.push_back(F.ScaledReg);
   2946       // An unstable sort by host pointer order is fine here, because the
   2947       // order is only used for uniquifying.
   2948       std::sort(Key.begin(), Key.end());
   2949 
   2950       std::pair<BestFormulaeTy::const_iterator, bool> P =
   2951         BestFormulae.insert(std::make_pair(Key, FIdx));
   2952       if (!P.second) {
   2953         Formula &Best = LU.Formulae[P.first->second];
   2954 
   2955         Cost CostF;
   2956         CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
   2957         Regs.clear();
   2958         Cost CostBest;
   2959         CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
   2960         Regs.clear();
   2961         if (CostF < CostBest)
   2962           std::swap(F, Best);
   2963         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
   2964               dbgs() << "\n"
   2965                         "    in favor of formula "; Best.print(dbgs());
   2966               dbgs() << '\n');
   2967 #ifndef NDEBUG
   2968         ChangedFormulae = true;
   2969 #endif
   2970         LU.DeleteFormula(F);
   2971         --FIdx;
   2972         --NumForms;
   2973         Any = true;
   2974         continue;
   2975       }
   2976     }
   2977 
   2978     // Now that we've filtered out some formulae, recompute the Regs set.
   2979     if (Any)
   2980       LU.RecomputeRegs(LUIdx, RegUses);
   2981 
   2982     // Reset this to prepare for the next use.
   2983     BestFormulae.clear();
   2984   }
   2985 
   2986   DEBUG(if (ChangedFormulae) {
   2987           dbgs() << "\n"
   2988                     "After filtering out undesirable candidates:\n";
   2989           print_uses(dbgs());
   2990         });
   2991 }
   2992 
   2993 // This is a rough guess that seems to work fairly well.
   2994 static const size_t ComplexityLimit = UINT16_MAX;
   2995 
   2996 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of
   2997 /// solutions the solver might have to consider. It almost never considers
   2998 /// this many solutions because it prunes the search space, but the pruning
   2999 /// isn't always sufficient.
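        /// The estimate is the product, over all uses, of the number of formulae
        /// for that use, capped once it reaches ComplexityLimit.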
   3000 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
   3001   size_t Power = 1;
   3002   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
   3003        E = Uses.end(); I != E; ++I) {
   3004     size_t FSize = I->Formulae.size();
   3005     if (FSize >= ComplexityLimit) {
   3006       Power = ComplexityLimit;
   3007       break;
   3008     }
   3009     Power *= FSize;
   3010     if (Power >= ComplexityLimit)
   3011       break;
   3012   }
   3013   return Power;
   3014 }
   3015 
   3016 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
   3017 /// of the registers of another formula, the superset formula won't help
   3018 /// reduce register pressure (though it won't necessarily hurt it either);
   3019 /// remove it to simplify the system.
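        /// For example, a formula whose base registers are {A, (constant 4)} is
        /// deleted when this use also has a formula with base register {A} and
        /// the 4 folded into its immediate field; the latter needs fewer
        /// registers for the same value.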
   3020 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
   3021   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
   3022     DEBUG(dbgs() << "The search space is too complex.\n");
   3023 
   3024     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
   3025                     "which use a superset of registers used by other "
   3026                     "formulae.\n");
   3027 
   3028     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   3029       LSRUse &LU = Uses[LUIdx];
   3030       bool Any = false;
   3031       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
   3032         Formula &F = LU.Formulae[i];
   3033         // Look for a formula with a constant or GV in a register. If the use
   3034         // also has a formula with that same value in an immediate field,
   3035         // delete the one that uses a register.
   3036         for (SmallVectorImpl<const SCEV *>::const_iterator
   3037              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
   3038           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
   3039             Formula NewF = F;
   3040             NewF.AM.BaseOffs += C->getValue()->getSExtValue();
   3041             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
   3042                                 (I - F.BaseRegs.begin()));
   3043             if (LU.HasFormulaWithSameRegs(NewF)) {
   3044               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
   3045               LU.DeleteFormula(F);
   3046               --i;
   3047               --e;
   3048               Any = true;
   3049               break;
   3050             }
   3051           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
   3052             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
   3053               if (!F.AM.BaseGV) {
   3054                 Formula NewF = F;
   3055                 NewF.AM.BaseGV = GV;
   3056                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
   3057                                     (I - F.BaseRegs.begin()));
   3058                 if (LU.HasFormulaWithSameRegs(NewF)) {
   3059                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
   3060                         dbgs() << '\n');
   3061                   LU.DeleteFormula(F);
   3062                   --i;
   3063                   --e;
   3064                   Any = true;
   3065                   break;
   3066                 }
   3067               }
   3068           }
   3069         }
   3070       }
   3071       if (Any)
   3072         LU.RecomputeRegs(LUIdx, RegUses);
   3073     }
   3074 
   3075     DEBUG(dbgs() << "After pre-selection:\n";
   3076           print_uses(dbgs()));
   3077   }
   3078 }
   3079 
   3080 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
   3081 /// for expressions like A, A+1, A+2, etc., allocate a single register for
   3082 /// them.
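        /// For example, after unrolling, uses at A, A+4, and A+8 can be folded
        /// into a single use by moving the constant differences into the
        /// per-fixup offsets, so the unrolled iterations share one register.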
   3083 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
   3084   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
   3085     DEBUG(dbgs() << "The search space is too complex.\n");
   3086 
   3087     DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
   3088                     "separated by a constant offset will use the same "
   3089                     "registers.\n");
   3090 
   3091     // This is especially useful for unrolled loops.
   3092 
   3093     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   3094       LSRUse &LU = Uses[LUIdx];
   3095       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
   3096            E = LU.Formulae.end(); I != E; ++I) {
   3097         const Formula &F = *I;
   3098         if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
   3099           if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
   3100             if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
   3101                                    /*HasBaseReg=*/false,
   3102                                    LU.Kind, LU.AccessTy)) {
   3103               DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
   3104                     dbgs() << '\n');
   3105 
   3106               LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
   3107 
   3108               // Update the relocs to reference the new use.
   3109               for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
   3110                    E = Fixups.end(); I != E; ++I) {
   3111                 LSRFixup &Fixup = *I;
   3112                 if (Fixup.LUIdx == LUIdx) {
   3113                   Fixup.LUIdx = LUThatHas - &Uses.front();
   3114                   Fixup.Offset += F.AM.BaseOffs;
   3115                   // Add the new offset to LUThatHas' offset list.
   3116                   if (LUThatHas->Offsets.back() != Fixup.Offset) {
   3117                     LUThatHas->Offsets.push_back(Fixup.Offset);
   3118                     if (Fixup.Offset > LUThatHas->MaxOffset)
   3119                       LUThatHas->MaxOffset = Fixup.Offset;
   3120                     if (Fixup.Offset < LUThatHas->MinOffset)
   3121                       LUThatHas->MinOffset = Fixup.Offset;
   3122                   }
   3123                   DEBUG(dbgs() << "New fixup has offset "
   3124                                << Fixup.Offset << '\n');
   3125                 }
   3126                 if (Fixup.LUIdx == NumUses-1)
   3127                   Fixup.LUIdx = LUIdx;
   3128               }
   3129 
   3130               // Delete formulae from the new use which are no longer legal.
   3131               bool Any = false;
   3132               for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
   3133                 Formula &F = LUThatHas->Formulae[i];
   3134                 if (!isLegalUse(F.AM,
   3135                                 LUThatHas->MinOffset, LUThatHas->MaxOffset,
   3136                                 LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
   3137                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
   3138                         dbgs() << '\n');
   3139                   LUThatHas->DeleteFormula(F);
   3140                   --i;
   3141                   --e;
   3142                   Any = true;
   3143                 }
   3144               }
   3145               if (Any)
   3146                 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
   3147 
   3148               // Delete the old use.
   3149               DeleteUse(LU, LUIdx);
   3150               --LUIdx;
   3151               --NumUses;
   3152               break;
   3153             }
   3154           }
   3155         }
   3156       }
   3157     }
   3158 
   3159     DEBUG(dbgs() << "After pre-selection:\n";
   3160           print_uses(dbgs()));
   3161   }
   3162 }
   3163 
   3164 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
   3165 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
   3166 /// we've done more filtering, as it may be able to find more formulae to
   3167 /// eliminate.
   3168 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
   3169   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
   3170     DEBUG(dbgs() << "The search space is too complex.\n");
   3171 
   3172     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
   3173                     "undesirable dedicated registers.\n");
   3174 
   3175     FilterOutUndesirableDedicatedRegisters();
   3176 
   3177     DEBUG(dbgs() << "After pre-selection:\n";
   3178           print_uses(dbgs()));
   3179   }
   3180 }
   3181 
   3182 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
   3183 /// to be profitable, and then in any use which has any reference to that
   3184 /// register, delete all formulae which do not reference that register.
   3185 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
   3186   // With all other options exhausted, loop until the system is simple
   3187   // enough to handle.
   3188   SmallPtrSet<const SCEV *, 4> Taken;
   3189   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
   3190     // Ok, we have too many formulae on our hands to conveniently handle.
   3191     // Use a rough heuristic to thin out the list.
   3192     DEBUG(dbgs() << "The search space is too complex.\n");
   3193 
   3194     // Pick the register which is used by the most LSRUses, which is likely
   3195     // to be a good reuse register candidate.
   3196     const SCEV *Best = 0;
   3197     unsigned BestNum = 0;
   3198     for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
   3199          I != E; ++I) {
   3200       const SCEV *Reg = *I;
   3201       if (Taken.count(Reg))
   3202         continue;
   3203       if (!Best)
   3204         Best = Reg;
   3205       else {
   3206         unsigned Count = RegUses.getUsedByIndices(Reg).count();
   3207         if (Count > BestNum) {
   3208           Best = Reg;
   3209           BestNum = Count;
   3210         }
   3211       }
   3212     }
   3213 
   3214     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
   3215                  << " will yield profitable reuse.\n");
   3216     Taken.insert(Best);
   3217 
   3218     // In any use with formulae which reference this register, delete formulae
   3219     // which don't reference it.
   3220     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   3221       LSRUse &LU = Uses[LUIdx];
   3222       if (!LU.Regs.count(Best)) continue;
   3223 
   3224       bool Any = false;
   3225       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
   3226         Formula &F = LU.Formulae[i];
   3227         if (!F.referencesReg(Best)) {
   3228           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
   3229           LU.DeleteFormula(F);
   3230           --e;
   3231           --i;
   3232           Any = true;
   3233           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
   3234           continue;
   3235         }
   3236       }
   3237 
   3238       if (Any)
   3239         LU.RecomputeRegs(LUIdx, RegUses);
   3240     }
   3241 
   3242     DEBUG(dbgs() << "After pre-selection:\n";
   3243           print_uses(dbgs()));
   3244   }
   3245 }
   3246 
   3247 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
   3248 /// formulae to choose from, use some rough heuristics to prune down the number
   3249 /// of formulae. This keeps the main solver from taking an extraordinary amount
   3250 /// of time in some worst-case scenarios.
   3251 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
   3252   NarrowSearchSpaceByDetectingSupersets();
   3253   NarrowSearchSpaceByCollapsingUnrolledCode();
   3254   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
   3255   NarrowSearchSpaceByPickingWinnerRegs();
   3256 }
   3257 
   3258 /// SolveRecurse - This is the recursive solver.
   3259 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
   3260                                Cost &SolutionCost,
   3261                                SmallVectorImpl<const Formula *> &Workspace,
   3262                                const Cost &CurCost,
   3263                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
   3264                                DenseSet<const SCEV *> &VisitedRegs) const {
   3265   // Some ideas:
   3266   //  - prune more:
   3267   //    - use more aggressive filtering
   3268   //    - sort the formulae so that the most profitable solutions are found first
   3269   //    - sort the uses too
   3270   //  - search faster:
   3271   //    - don't compute a cost, and then compare. compare while computing a cost
   3272   //      and bail early.
   3273   //    - track register sets with SmallBitVector
   3274 
   3275   const LSRUse &LU = Uses[Workspace.size()];
   3276 
   3277   // If this use references any register that's already a part of the
   3278   // in-progress solution, consider it a requirement that a formula must
   3279   // reference that register in order to be considered. This prunes out
   3280   // unprofitable searching.
   3281   SmallSetVector<const SCEV *, 4> ReqRegs;
   3282   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
   3283        E = CurRegs.end(); I != E; ++I)
   3284     if (LU.Regs.count(*I))
   3285       ReqRegs.insert(*I);
   3286 
   3287   bool AnySatisfiedReqRegs = false;
   3288   SmallPtrSet<const SCEV *, 16> NewRegs;
   3289   Cost NewCost;
   3290 retry:
   3291   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
   3292        E = LU.Formulae.end(); I != E; ++I) {
   3293     const Formula &F = *I;
   3294 
   3295     // Ignore formulae which do not use any of the required registers.
   3296     for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
   3297          JE = ReqRegs.end(); J != JE; ++J) {
   3298       const SCEV *Reg = *J;
   3299       if ((!F.ScaledReg || F.ScaledReg != Reg) &&
   3300           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
   3301           F.BaseRegs.end())
   3302         goto skip;
   3303     }
   3304     AnySatisfiedReqRegs = true;
   3305 
   3306     // Evaluate the cost of the current formula. If it's already worse than
   3307     // the current best, prune the search at that point.
   3308     NewCost = CurCost;
   3309     NewRegs = CurRegs;
   3310     NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
   3311     if (NewCost < SolutionCost) {
   3312       Workspace.push_back(&F);
   3313       if (Workspace.size() != Uses.size()) {
   3314         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
   3315                      NewRegs, VisitedRegs);
   3316         if (F.getNumRegs() == 1 && Workspace.size() == 1)
   3317           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
   3318       } else {
   3319         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
   3320               dbgs() << ". Regs:";
   3321               for (SmallPtrSet<const SCEV *, 16>::const_iterator
   3322                    I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
   3323                 dbgs() << ' ' << **I;
   3324               dbgs() << '\n');
   3325 
   3326         SolutionCost = NewCost;
   3327         Solution = Workspace;
   3328       }
   3329       Workspace.pop_back();
   3330     }
   3331   skip:;
   3332   }
   3333 
   3334   if (!EnableRetry && !AnySatisfiedReqRegs)
   3335     return;
   3336 
   3337   // If none of the formulae had all of the required registers, relax the
   3338   // constraint so that we don't exclude all formulae.
   3339   if (!AnySatisfiedReqRegs) {
   3340     assert(!ReqRegs.empty() && "Solver failed even without required registers");
   3341     ReqRegs.clear();
   3342     goto retry;
   3343   }
   3344 }
   3345 
   3346 /// Solve - Choose one formula from each use. Return the results in the given
   3347 /// Solution vector.
   3348 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
   3349   SmallVector<const Formula *, 8> Workspace;
   3350   Cost SolutionCost;
   3351   SolutionCost.Loose();
   3352   Cost CurCost;
   3353   SmallPtrSet<const SCEV *, 16> CurRegs;
   3354   DenseSet<const SCEV *> VisitedRegs;
   3355   Workspace.reserve(Uses.size());
   3356 
   3357   // SolveRecurse does all the work.
   3358   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
   3359                CurRegs, VisitedRegs);
   3360   if (Solution.empty()) {
   3361     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
   3362     return;
   3363   }
   3364 
   3365   // Ok, we've now made all our decisions.
   3366   DEBUG(dbgs() << "\n"
   3367                   "The chosen solution requires "; SolutionCost.print(dbgs());
   3368         dbgs() << ":\n";
   3369         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
   3370           dbgs() << "  ";
   3371           Uses[i].print(dbgs());
   3372           dbgs() << "\n"
   3373                     "    ";
   3374           Solution[i]->print(dbgs());
   3375           dbgs() << '\n';
   3376         });
   3377 
   3378   assert(Solution.size() == Uses.size() && "Malformed solution!");
   3379 }
   3380 
   3381 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
   3382 /// the dominator tree far as we can go while still being dominated by the
   3383 /// input positions. This helps canonicalize the insert position, which
   3384 /// encourages sharing.
   3385 BasicBlock::iterator
   3386 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
   3387                                  const SmallVectorImpl<Instruction *> &Inputs)
   3388                                                                          const {
   3389   for (;;) {
   3390     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
   3391     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
   3392 
   3393     BasicBlock *IDom;
   3394     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
   3395       if (!Rung) return IP;
   3396       Rung = Rung->getIDom();
   3397       if (!Rung) return IP;
   3398       IDom = Rung->getBlock();
   3399 
   3400       // Don't climb into a loop though.
   3401       const Loop *IDomLoop = LI.getLoopFor(IDom);
   3402       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
   3403       if (IDomDepth <= IPLoopDepth &&
   3404           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
   3405         break;
   3406     }
   3407 
   3408     bool AllDominate = true;
   3409     Instruction *BetterPos = 0;
   3410     Instruction *Tentative = IDom->getTerminator();
   3411     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
   3412          E = Inputs.end(); I != E; ++I) {
   3413       Instruction *Inst = *I;
   3414       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
   3415         AllDominate = false;
   3416         break;
   3417       }
   3418       // Attempt to find an insert position in the middle of the block,
   3419       // instead of at the end, so that it can be used for other expansions.
   3420       if (IDom == Inst->getParent() &&
   3421           (!BetterPos || DT.dominates(BetterPos, Inst)))
   3422         BetterPos = llvm::next(BasicBlock::iterator(Inst));
   3423     }
   3424     if (!AllDominate)
   3425       break;
   3426     if (BetterPos)
   3427       IP = BetterPos;
   3428     else
   3429       IP = Tentative;
   3430   }
   3431 
   3432   return IP;
   3433 }
   3434 
   3435 /// AdjustInsertPositionForExpand - Determine an input position which will be
   3436 /// dominated by the operands and which will dominate the result.
   3437 BasicBlock::iterator
   3438 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
   3439                                            const LSRFixup &LF,
   3440                                            const LSRUse &LU) const {
   3441   // Collect some instructions which must be dominated by the
   3442   // expanding replacement. These must be dominated by any operands that
   3443   // will be required in the expansion.
   3444   SmallVector<Instruction *, 4> Inputs;
   3445   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
   3446     Inputs.push_back(I);
   3447   if (LU.Kind == LSRUse::ICmpZero)
   3448     if (Instruction *I =
   3449           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
   3450       Inputs.push_back(I);
   3451   if (LF.PostIncLoops.count(L)) {
   3452     if (LF.isUseFullyOutsideLoop(L))
   3453       Inputs.push_back(L->getLoopLatch()->getTerminator());
   3454     else
   3455       Inputs.push_back(IVIncInsertPos);
   3456   }
   3457   // The expansion must also be dominated by the increment positions of any
   3458   // loops for which it is using post-inc mode.
   3459   for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
   3460        E = LF.PostIncLoops.end(); I != E; ++I) {
   3461     const Loop *PIL = *I;
   3462     if (PIL == L) continue;
   3463 
   3464     // Be dominated by the loop exit.
   3465     SmallVector<BasicBlock *, 4> ExitingBlocks;
   3466     PIL->getExitingBlocks(ExitingBlocks);
   3467     if (!ExitingBlocks.empty()) {
   3468       BasicBlock *BB = ExitingBlocks[0];
   3469       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
   3470         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
   3471       Inputs.push_back(BB->getTerminator());
   3472     }
   3473   }
   3474 
   3475   // Then, climb up the immediate dominator tree as far as we can go while
   3476   // still being dominated by the input positions.
   3477   IP = HoistInsertPosition(IP, Inputs);
   3478 
   3479   // Don't insert instructions before PHI nodes.
   3480   while (isa<PHINode>(IP)) ++IP;
   3481 
   3482   // Ignore landingpad instructions.
   3483   while (isa<LandingPadInst>(IP)) ++IP;
   3484 
   3485   // Ignore debug intrinsics.
   3486   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
   3487 
   3488   return IP;
   3489 }
   3490 
   3491 /// Expand - Emit instructions for the leading candidate expression for this
   3492 /// LSRUse (this is called "expanding").
   3493 Value *LSRInstance::Expand(const LSRFixup &LF,
   3494                            const Formula &F,
   3495                            BasicBlock::iterator IP,
   3496                            SCEVExpander &Rewriter,
   3497                            SmallVectorImpl<WeakVH> &DeadInsts) const {
   3498   const LSRUse &LU = Uses[LF.LUIdx];
   3499 
   3500   // Determine an input position which will be dominated by the operands and
   3501   // which will dominate the result.
   3502   IP = AdjustInsertPositionForExpand(IP, LF, LU);
   3503 
   3504   // Inform the Rewriter if we have a post-increment use, so that it can
   3505   // perform an advantageous expansion.
   3506   Rewriter.setPostInc(LF.PostIncLoops);
   3507 
   3508   // This is the type that the user actually needs.
   3509   Type *OpTy = LF.OperandValToReplace->getType();
   3510   // This will be the type that we'll initially expand to.
   3511   Type *Ty = F.getType();
   3512   if (!Ty)
   3513     // No type known; just expand directly to the ultimate type.
   3514     Ty = OpTy;
   3515   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
   3516     // Expand directly to the ultimate type if it's the right size.
   3517     Ty = OpTy;
   3518   // This is the type to do integer arithmetic in.
   3519   Type *IntTy = SE.getEffectiveSCEVType(Ty);
   3520 
   3521   // Build up a list of operands to add together to form the full base.
   3522   SmallVector<const SCEV *, 8> Ops;
   3523 
   3524   // Expand the BaseRegs portion.
   3525   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
   3526        E = F.BaseRegs.end(); I != E; ++I) {
   3527     const SCEV *Reg = *I;
   3528     assert(!Reg->isZero() && "Zero allocated in a base register!");
   3529 
   3530     // If we're expanding for a post-inc user, make the post-inc adjustment.
   3531     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
   3532     Reg = TransformForPostIncUse(Denormalize, Reg,
   3533                                  LF.UserInst, LF.OperandValToReplace,
   3534                                  Loops, SE, DT);
   3535 
   3536     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
   3537   }
   3538 
   3539   // Flush the operand list to suppress SCEVExpander hoisting.
   3540   if (!Ops.empty()) {
   3541     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
   3542     Ops.clear();
   3543     Ops.push_back(SE.getUnknown(FullV));
   3544   }
   3545 
   3546   // Expand the ScaledReg portion.
   3547   Value *ICmpScaledV = 0;
   3548   if (F.AM.Scale != 0) {
   3549     const SCEV *ScaledS = F.ScaledReg;
   3550 
   3551     // If we're expanding for a post-inc user, make the post-inc adjustment.
   3552     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
   3553     ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
   3554                                      LF.UserInst, LF.OperandValToReplace,
   3555                                      Loops, SE, DT);
   3556 
   3557     if (LU.Kind == LSRUse::ICmpZero) {
   3558       // An interesting way of "folding" with an icmp is to use a negated
   3559       // scale, which we'll implement by inserting it into the other operand
   3560       // of the icmp.
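              // For example, the formula (A + -1*B) compared against zero is
              // emitted as "icmp A, B": B is expanded here and stored into the
              // icmp's second operand below.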
   3561       assert(F.AM.Scale == -1 &&
   3562              "The only scale supported by ICmpZero uses is -1!");
   3563       ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
   3564     } else {
   3565       // Otherwise just expand the scaled register and an explicit scale,
   3566       // which is expected to be matched as part of the address.
   3567       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
   3568       ScaledS = SE.getMulExpr(ScaledS,
   3569                               SE.getConstant(ScaledS->getType(), F.AM.Scale));
   3570       Ops.push_back(ScaledS);
   3571 
   3572       // Flush the operand list to suppress SCEVExpander hoisting.
   3573       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
   3574       Ops.clear();
   3575       Ops.push_back(SE.getUnknown(FullV));
   3576     }
   3577   }
   3578 
   3579   // Expand the GV portion.
   3580   if (F.AM.BaseGV) {
   3581     Ops.push_back(SE.getUnknown(F.AM.BaseGV));
   3582 
   3583     // Flush the operand list to suppress SCEVExpander hoisting.
   3584     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
   3585     Ops.clear();
   3586     Ops.push_back(SE.getUnknown(FullV));
   3587   }
   3588 
   3589   // Expand the immediate portion.
   3590   int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
   3591   if (Offset != 0) {
   3592     if (LU.Kind == LSRUse::ICmpZero) {
   3593       // The other interesting way of "folding" with an ICmpZero is to use a
   3594       // negated immediate.
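              // For example, a formula A + (-16) compared against zero is
              // emitted as "icmp A, 16"; the negated offset becomes the icmp's
              // second operand.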
   3595       if (!ICmpScaledV)
   3596         ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
   3597       else {
   3598         Ops.push_back(SE.getUnknown(ICmpScaledV));
   3599         ICmpScaledV = ConstantInt::get(IntTy, Offset);
   3600       }
   3601     } else {
   3602       // Just add the immediate values. These again are expected to be matched
   3603       // as part of the address.
   3604       Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
   3605     }
   3606   }
   3607 
   3608   // Expand the unfolded offset portion.
   3609   int64_t UnfoldedOffset = F.UnfoldedOffset;
   3610   if (UnfoldedOffset != 0) {
   3611     // Just add the immediate values.
   3612     Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
   3613                                                        UnfoldedOffset)));
   3614   }
   3615 
   3616   // Emit instructions summing all the operands.
   3617   const SCEV *FullS = Ops.empty() ?
   3618                       SE.getConstant(IntTy, 0) :
   3619                       SE.getAddExpr(Ops);
   3620   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
   3621 
   3622   // We're done expanding now, so reset the rewriter.
   3623   Rewriter.clearPostInc();
   3624 
   3625   // An ICmpZero Formula represents an ICmp which we're handling as a
   3626   // comparison against zero. Now that we've expanded an expression for that
   3627   // form, update the ICmp's other operand.
   3628   if (LU.Kind == LSRUse::ICmpZero) {
   3629     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
   3630     DeadInsts.push_back(CI->getOperand(1));
   3631     assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
   3632                            "a scale at the same time!");
   3633     if (F.AM.Scale == -1) {
   3634       if (ICmpScaledV->getType() != OpTy) {
   3635         Instruction *Cast =
   3636           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
   3637                                                    OpTy, false),
   3638                            ICmpScaledV, OpTy, "tmp", CI);
   3639         ICmpScaledV = Cast;
   3640       }
   3641       CI->setOperand(1, ICmpScaledV);
   3642     } else {
   3643       assert(F.AM.Scale == 0 &&
   3644              "ICmp does not support folding a global value and "
   3645              "a scale at the same time!");
   3646       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
   3647                                            -(uint64_t)Offset);
   3648       if (C->getType() != OpTy)
   3649         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
   3650                                                           OpTy, false),
   3651                                   C, OpTy);
   3652 
   3653       CI->setOperand(1, C);
   3654     }
   3655   }
   3656 
   3657   return FullV;
   3658 }
   3659 
   3660 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
   3661 /// of their operands effectively happens in their predecessor blocks, so the
   3662 /// expression may need to be expanded in multiple places.
   3663 void LSRInstance::RewriteForPHI(PHINode *PN,
   3664                                 const LSRFixup &LF,
   3665                                 const Formula &F,
   3666                                 SCEVExpander &Rewriter,
   3667                                 SmallVectorImpl<WeakVH> &DeadInsts,
   3668                                 Pass *P) const {
   3669   DenseMap<BasicBlock *, Value *> Inserted;
   3670   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
   3671     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
   3672       BasicBlock *BB = PN->getIncomingBlock(i);
   3673 
   3674       // If this is a critical edge, split the edge so that we do not insert
   3675       // the code on all predecessor/successor paths.  We do this unless this
   3676       // is the canonical backedge for this loop, which complicates post-inc
   3677       // users.
   3678       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
   3679           !isa<IndirectBrInst>(BB->getTerminator())) {
   3680         BasicBlock *Parent = PN->getParent();
   3681         Loop *PNLoop = LI.getLoopFor(Parent);
   3682         if (!PNLoop || Parent != PNLoop->getHeader()) {
   3683           // Split the critical edge.
   3684           BasicBlock *NewBB = 0;
   3685           if (!Parent->isLandingPad()) {
   3686             NewBB = SplitCriticalEdge(BB, Parent, P,
   3687                                       /*MergeIdenticalEdges=*/true,
   3688                                       /*DontDeleteUselessPhis=*/true);
   3689           } else {
   3690             SmallVector<BasicBlock*, 2> NewBBs;
   3691             SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
   3692             NewBB = NewBBs[0];
   3693           }
   3694 
   3695           // If PN is outside of the loop and BB is in the loop, we want to
   3696           // move the block to be immediately before the PHI block, not
   3697           // immediately after BB.
   3698           if (L->contains(BB) && !L->contains(PN))
   3699             NewBB->moveBefore(PN->getParent());
   3700 
   3701           // Splitting the edge can reduce the number of PHI entries we have.
   3702           e = PN->getNumIncomingValues();
   3703           BB = NewBB;
   3704           i = PN->getBasicBlockIndex(BB);
   3705         }
   3706       }
   3707 
   3708       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
   3709         Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
   3710       if (!Pair.second)
   3711         PN->setIncomingValue(i, Pair.first->second);
   3712       else {
   3713         Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
   3714 
   3715         // If this is reuse-by-noop-cast, insert the noop cast.
   3716         Type *OpTy = LF.OperandValToReplace->getType();
   3717         if (FullV->getType() != OpTy)
   3718           FullV =
   3719             CastInst::Create(CastInst::getCastOpcode(FullV, false,
   3720                                                      OpTy, false),
   3721                              FullV, LF.OperandValToReplace->getType(),
   3722                              "tmp", BB->getTerminator());
   3723 
   3724         PN->setIncomingValue(i, FullV);
   3725         Pair.first->second = FullV;
   3726       }
   3727     }
   3728 }
   3729 
   3730 /// Rewrite - Emit instructions for the leading candidate expression for this
   3731 /// LSRUse (this is called "expanding"), and update the UserInst to reference
   3732 /// the newly expanded value.
   3733 void LSRInstance::Rewrite(const LSRFixup &LF,
   3734                           const Formula &F,
   3735                           SCEVExpander &Rewriter,
   3736                           SmallVectorImpl<WeakVH> &DeadInsts,
   3737                           Pass *P) const {
   3738   // First, find an insertion point that dominates UserInst. For PHI nodes,
   3739   // find the nearest block which dominates all the relevant uses.
   3740   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
   3741     RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
   3742   } else {
   3743     Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
   3744 
   3745     // If this is reuse-by-noop-cast, insert the noop cast.
   3746     Type *OpTy = LF.OperandValToReplace->getType();
   3747     if (FullV->getType() != OpTy) {
   3748       Instruction *Cast =
   3749         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
   3750                          FullV, OpTy, "tmp", LF.UserInst);
   3751       FullV = Cast;
   3752     }
   3753 
   3754     // Update the user. ICmpZero is handled specially here (for now) because
   3755     // Expand may have updated one of the operands of the icmp already, and
   3756     // its new value may happen to be equal to LF.OperandValToReplace, in
   3757     // which case doing replaceUsesOfWith leads to replacing both operands
   3758     // with the same value. TODO: Reorganize this.
   3759     if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
   3760       LF.UserInst->setOperand(0, FullV);
   3761     else
   3762       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
   3763   }
   3764 
   3765   DeadInsts.push_back(LF.OperandValToReplace);
   3766 }
   3767 
   3768 /// ImplementSolution - Rewrite all the fixup locations with new values,
   3769 /// following the chosen solution.
   3770 void
   3771 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
   3772                                Pass *P) {
   3773   // Keep track of instructions we may have made dead, so that
   3774   // we can remove them after we are done working.
   3775   SmallVector<WeakVH, 16> DeadInsts;
   3776 
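          // Put the expander into the mode LSR wants: leave the solver's chosen
          // expressions alone instead of rewriting them into canonical induction
          // variables, and emit IV increments at the position selected earlier
          // (IVIncInsertPos).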
   3777   SCEVExpander Rewriter(SE, "lsr");
   3778   Rewriter.disableCanonicalMode();
   3779   Rewriter.enableLSRMode();
   3780   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
   3781 
   3782   // Expand the new value definitions and update the users.
   3783   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
   3784        E = Fixups.end(); I != E; ++I) {
   3785     const LSRFixup &Fixup = *I;
   3786 
   3787     Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);
   3788 
   3789     Changed = true;
   3790   }
   3791 
   3792   // Clean up after ourselves. This must be done before deleting any
   3793   // instructions.
   3794   Rewriter.clear();
   3795 
   3796   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
   3797 }
   3798 
   3799 LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
   3800   : IU(P->getAnalysis<IVUsers>()),
   3801     SE(P->getAnalysis<ScalarEvolution>()),
   3802     DT(P->getAnalysis<DominatorTree>()),
   3803     LI(P->getAnalysis<LoopInfo>()),
   3804     TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {
   3805 
   3806   // If LoopSimplify form is not available, stay out of trouble.
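          // (Loop-simplify form guarantees a preheader, a single backedge, and
          // dedicated exit blocks, which much of the code below relies on.)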
   3807   if (!L->isLoopSimplifyForm()) return;
   3808 
   3809   // If there's no interesting work to be done, bail early.
   3810   if (IU.empty()) return;
   3811 
   3812   DEBUG(dbgs() << "\nLSR on loop ";
   3813         WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
   3814         dbgs() << ":\n");
   3815 
   3816   // First, perform some low-level loop optimizations.
   3817   OptimizeShadowIV();
   3818   OptimizeLoopTermCond();
   3819 
   3820   // If loop preparation eliminates all interesting IV users, bail.
   3821   if (IU.empty()) return;
   3822 
   3823   // Skip nested loops until we can model them better with formulae.
   3824   if (!EnableNested && !L->empty()) {
   3825 
   3826     if (EnablePhiElim) {
   3827       // Remove any extra phis created by processing inner loops.
   3828       SmallVector<WeakVH, 16> DeadInsts;
   3829       SCEVExpander Rewriter(SE, "lsr");
   3830       Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts);
   3831       Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
   3832     }
   3833     DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
   3834     return;
   3835   }
   3836 
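          // The main pipeline from here on: collect uses, fixups, and initial
          // formulae; generate alternative formulae for each use; prune the
          // candidates that are unlikely to pay off; solve for one formula per
          // use; and finally rewrite the IR to match the chosen solution.
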
   3837   // Start collecting data and preparing for the solver.
   3838   CollectInterestingTypesAndFactors();
   3839   CollectFixupsAndInitialFormulae();
   3840   CollectLoopInvariantFixupsAndFormulae();
   3841 
   3842   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
   3843         print_uses(dbgs()));
   3844 
   3845   // Now use the reuse data to generate a bunch of interesting ways
   3846   // to formulate the values needed for the uses.
   3847   GenerateAllReuseFormulae();
   3848 
   3849   FilterOutUndesirableDedicatedRegisters();
   3850   NarrowSearchSpaceUsingHeuristics();
   3851 
   3852   SmallVector<const Formula *, 8> Solution;
   3853   Solve(Solution);
   3854 
   3855   // Release memory that is no longer needed.
   3856   Factors.clear();
   3857   Types.clear();
   3858   RegUses.clear();
   3859 
   3860   if (Solution.empty())
   3861     return;
   3862 
   3863 #ifndef NDEBUG
   3864   // Formulae should be legal.
   3865   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
   3866        E = Uses.end(); I != E; ++I) {
   3867     const LSRUse &LU = *I;
   3868     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
   3869          JE = LU.Formulae.end(); J != JE; ++J)
   3870       assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
   3871                         LU.Kind, LU.AccessTy, TLI) &&
   3872              "Illegal formula generated!");
   3873   }
   3874 #endif
   3875 
   3876   // Now that we've decided what we want, make it so.
   3877   ImplementSolution(Solution, P);
   3878 
   3879   if (EnablePhiElim) {
   3880     // Remove any extra phis created by processing inner loops.
   3881     SmallVector<WeakVH, 16> DeadInsts;
   3882     SCEVExpander Rewriter(SE, "lsr");
   3883     Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts);
   3884     Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
   3885   }
   3886 }
   3887 
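        // Print the factors and types that CollectInterestingTypesAndFactors
        // recorded; the output looks roughly like this (values are illustrative
        // only):
        //
        //   LSR has identified the following interesting factors and types: *2, *4, (i64)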
   3888 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
   3889   if (Factors.empty() && Types.empty()) return;
   3890 
   3891   OS << "LSR has identified the following interesting factors and types: ";
   3892   bool First = true;
   3893 
   3894   for (SmallSetVector<int64_t, 8>::const_iterator
   3895        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
   3896     if (!First) OS << ", ";
   3897     First = false;
   3898     OS << '*' << *I;
   3899   }
   3900 
   3901   for (SmallSetVector<Type *, 4>::const_iterator
   3902        I = Types.begin(), E = Types.end(); I != E; ++I) {
   3903     if (!First) OS << ", ";
   3904     First = false;
   3905     OS << '(' << **I << ')';
   3906   }
   3907   OS << '\n';
   3908 }
   3909 
   3910 void LSRInstance::print_fixups(raw_ostream &OS) const {
   3911   OS << "LSR is examining the following fixup sites:\n";
   3912   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
   3913        E = Fixups.end(); I != E; ++I) {
   3914     OS << "  ";
   3915     I->print(OS);
   3916     OS << '\n';
   3917   }
   3918 }
   3919 
   3920 void LSRInstance::print_uses(raw_ostream &OS) const {
   3921   OS << "LSR is examining the following uses:\n";
   3922   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
   3923        E = Uses.end(); I != E; ++I) {
   3924     const LSRUse &LU = *I;
   3925     OS << "  ";
   3926     LU.print(OS);
   3927     OS << '\n';
   3928     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
   3929          JE = LU.Formulae.end(); J != JE; ++J) {
   3930       OS << "    ";
   3931       J->print(OS);
   3932       OS << '\n';
   3933     }
   3934   }
   3935 }
   3936 
   3937 void LSRInstance::print(raw_ostream &OS) const {
   3938   print_factors_and_types(OS);
   3939   print_fixups(OS);
   3940   print_uses(OS);
   3941 }
   3942 
   3943 void LSRInstance::dump() const {
   3944   print(errs()); errs() << '\n';
   3945 }
   3946 
   3947 namespace {
   3948 
   3949 class LoopStrengthReduce : public LoopPass {
   3950   /// TLI - Keep a pointer to a TargetLowering to consult for determining
   3951   /// transformation profitability.
   3952   const TargetLowering *const TLI;
   3953 
   3954 public:
   3955   static char ID; // Pass ID, replacement for typeid
   3956   explicit LoopStrengthReduce(const TargetLowering *tli = 0);
   3957 
   3958 private:
   3959   bool runOnLoop(Loop *L, LPPassManager &LPM);
   3960   void getAnalysisUsage(AnalysisUsage &AU) const;
   3961 };
   3962 
   3963 }
   3964 
   3965 char LoopStrengthReduce::ID = 0;
   3966 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
   3967                 "Loop Strength Reduction", false, false)
   3968 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
   3969 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
   3970 INITIALIZE_PASS_DEPENDENCY(IVUsers)
   3971 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
   3972 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
   3973 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
   3974                 "Loop Strength Reduction", false, false)
   3975 
   3976 
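        // createLoopStrengthReducePass - Public factory for this pass. As a
        // sketch of typical use (PM and getTargetLowering() stand in for the
        // caller's pass manager and TargetLowering; they are not names defined
        // in this file):
        //
        //   PM.add(createLoopStrengthReducePass(getTargetLowering()));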
   3977 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
   3978   return new LoopStrengthReduce(TLI);
   3979 }
   3980 
   3981 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
   3982   : LoopPass(ID), TLI(tli) {
   3983   initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
   3984 }
   3985 
   3986 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   3987   // We split critical edges, so we change the CFG.  However, we do update
   3988   // many analyses if they are around.
   3989   AU.addPreservedID(LoopSimplifyID);
   3990 
   3991   AU.addRequired<LoopInfo>();
   3992   AU.addPreserved<LoopInfo>();
   3993   AU.addRequiredID(LoopSimplifyID);
   3994   AU.addRequired<DominatorTree>();
   3995   AU.addPreserved<DominatorTree>();
   3996   AU.addRequired<ScalarEvolution>();
   3997   AU.addPreserved<ScalarEvolution>();
   3998   // Requiring LoopSimplify a second time here prevents IVUsers from running
   3999   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
   4000   AU.addRequiredID(LoopSimplifyID);
   4001   AU.addRequired<IVUsers>();
   4002   AU.addPreserved<IVUsers>();
   4003 }
   4004 
   4005 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
   4006   bool Changed = false;
   4007 
   4008   // Run the main LSR transformation.
   4009   Changed |= LSRInstance(TLI, L, this).getChanged();
   4010 
   4011   // At this point, it is worth checking to see if any recurrence PHIs are also
   4012   // dead, so that we can remove them as well.
   4013   Changed |= DeleteDeadPHIs(L->getHeader());
   4014 
   4015   return Changed;
   4016 }
   4017