Home | History | Annotate | Download | only in Scalar
      1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This transformation analyzes and transforms the induction variables (and
     11 // computations derived from them) into forms suitable for efficient execution
     12 // on the target.
     13 //
     14 // This pass performs a strength reduction on array references inside loops that
     15 // have as one or more of their components the loop induction variable, it
     16 // rewrites expressions to take advantage of scaled-index addressing modes
     17 // available on the target, and it performs a variety of other optimizations
     18 // related to loop induction variables.
     19 //
     20 // Terminology note: this code has a lot of handling for "post-increment" or
     21 // "post-inc" users. This is not talking about post-increment addressing modes;
     22 // it is instead talking about code like this:
     23 //
     24 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
     25 //   ...
     26 //   %i.next = add %i, 1
     27 //   %c = icmp eq %i.next, %n
     28 //
     29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
     30 // it's useful to think about these as the same register, with some uses using
// the value of the register before the add and some using it after. In this
     32 // example, the icmp is a post-increment user, since it uses %i.next, which is
     33 // the value of the induction variable after the increment. The other common
     34 // case of post-increment users is users outside the loop.
     35 //
     36 // TODO: More sophistication in the way Formulae are generated and filtered.
     37 //
     38 // TODO: Handle multiple loops at a time.
     39 //
     40 // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
     41 //       instead of a GlobalValue?
     42 //
     43 // TODO: When truncation is free, truncate ICmp users' operands to make it a
     44 //       smaller encoding (on x86 at least).
     45 //
     46 // TODO: When a negated register is used by an add (such as in a list of
     47 //       multiple base registers, or as the increment expression in an addrec),
     48 //       we may not actually need both reg and (-1 * reg) in registers; the
     49 //       negation can be implemented by using a sub instead of an add. The
     50 //       lack of support for taking this into consideration when making
     51 //       register pressure decisions is partly worked around by the "Special"
     52 //       use kind.
     53 //
     54 //===----------------------------------------------------------------------===//
     55 
     56 #define DEBUG_TYPE "loop-reduce"
     57 #include "llvm/Transforms/Scalar.h"
     58 #include "llvm/Constants.h"
     59 #include "llvm/Instructions.h"
     60 #include "llvm/IntrinsicInst.h"
     61 #include "llvm/DerivedTypes.h"
     62 #include "llvm/Analysis/IVUsers.h"
     63 #include "llvm/Analysis/Dominators.h"
     64 #include "llvm/Analysis/LoopPass.h"
     65 #include "llvm/Analysis/ScalarEvolutionExpander.h"
     66 #include "llvm/Assembly/Writer.h"
     67 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
     68 #include "llvm/Transforms/Utils/Local.h"
     69 #include "llvm/ADT/SmallBitVector.h"
     70 #include "llvm/ADT/SetVector.h"
     71 #include "llvm/ADT/DenseSet.h"
     72 #include "llvm/Support/Debug.h"
     73 #include "llvm/Support/ValueHandle.h"
     74 #include "llvm/Support/raw_ostream.h"
     75 #include "llvm/Target/TargetLowering.h"
     76 #include <algorithm>
     77 using namespace llvm;
     78 
     79 namespace {
     80 
     81 /// RegSortData - This class holds data which is used to order reuse candidates.
     82 class RegSortData {
     83 public:
     84   /// UsedByIndices - This represents the set of LSRUse indices which reference
     85   /// a particular register.
     86   SmallBitVector UsedByIndices;
     87 
     88   RegSortData() {}
     89 
     90   void print(raw_ostream &OS) const;
     91   void dump() const;
     92 };
     93 
     94 }
     95 
     96 void RegSortData::print(raw_ostream &OS) const {
     97   OS << "[NumUses=" << UsedByIndices.count() << ']';
     98 }
     99 
    100 void RegSortData::dump() const {
    101   print(errs()); errs() << '\n';
    102 }
    103 
    104 namespace {
    105 
    106 /// RegUseTracker - Map register candidates to information about how they are
    107 /// used.
    108 class RegUseTracker {
    109   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
    110 
    111   RegUsesTy RegUsesMap;
    112   SmallVector<const SCEV *, 16> RegSequence;
    113 
    114 public:
    115   void CountRegister(const SCEV *Reg, size_t LUIdx);
    116   void DropRegister(const SCEV *Reg, size_t LUIdx);
    117   void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
    118 
    119   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
    120 
    121   const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
    122 
    123   void clear();
    124 
    125   typedef SmallVectorImpl<const SCEV *>::iterator iterator;
    126   typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
    127   iterator begin() { return RegSequence.begin(); }
    128   iterator end()   { return RegSequence.end(); }
    129   const_iterator begin() const { return RegSequence.begin(); }
    130   const_iterator end() const   { return RegSequence.end(); }
    131 };
    132 
    133 }
    134 
    135 void
    136 RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
    137   std::pair<RegUsesTy::iterator, bool> Pair =
    138     RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
    139   RegSortData &RSD = Pair.first->second;
    140   if (Pair.second)
    141     RegSequence.push_back(Reg);
    142   RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
    143   RSD.UsedByIndices.set(LUIdx);
    144 }
    145 
    146 void
    147 RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
    148   RegUsesTy::iterator It = RegUsesMap.find(Reg);
    149   assert(It != RegUsesMap.end());
    150   RegSortData &RSD = It->second;
    151   assert(RSD.UsedByIndices.size() > LUIdx);
    152   RSD.UsedByIndices.reset(LUIdx);
    153 }
    154 
    155 void
    156 RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
    157   assert(LUIdx <= LastLUIdx);
    158 
    159   // Update RegUses. The data structure is not optimized for this purpose;
    160   // we must iterate through it and update each of the bit vectors.
    161   for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
    162        I != E; ++I) {
    163     SmallBitVector &UsedByIndices = I->second.UsedByIndices;
    164     if (LUIdx < UsedByIndices.size())
    165       UsedByIndices[LUIdx] =
    166         LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
    167     UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
    168   }
    169 }
    170 
    171 bool
    172 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
    173   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
    174   if (I == RegUsesMap.end())
    175     return false;
    176   const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
    177   int i = UsedByIndices.find_first();
    178   if (i == -1) return false;
    179   if ((size_t)i != LUIdx) return true;
    180   return UsedByIndices.find_next(i) != -1;
    181 }
    182 
    183 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
    184   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
    185   assert(I != RegUsesMap.end() && "Unknown register!");
    186   return I->second.UsedByIndices;
    187 }
    188 
    189 void RegUseTracker::clear() {
    190   RegUsesMap.clear();
    191   RegSequence.clear();
    192 }
    193 
    194 namespace {
    195 
    196 /// Formula - This class holds information that describes a formula for
    197 /// computing satisfying a use. It may include broken-out immediates and scaled
    198 /// registers.
    199 struct Formula {
    200   /// AM - This is used to represent complex addressing, as well as other kinds
    201   /// of interesting uses.
    202   TargetLowering::AddrMode AM;
    203 
    204   /// BaseRegs - The list of "base" registers for this use. When this is
    205   /// non-empty, AM.HasBaseReg should be set to true.
    206   SmallVector<const SCEV *, 2> BaseRegs;
    207 
    208   /// ScaledReg - The 'scaled' register for this use. This should be non-null
    209   /// when AM.Scale is not zero.
    210   const SCEV *ScaledReg;
    211 
    212   /// UnfoldedOffset - An additional constant offset which added near the
    213   /// use. This requires a temporary register, but the offset itself can
    214   /// live in an add immediate field rather than a register.
    215   int64_t UnfoldedOffset;
    216 
    217   Formula() : ScaledReg(0), UnfoldedOffset(0) {}
    218 
    219   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
    220 
    221   unsigned getNumRegs() const;
    222   Type *getType() const;
    223 
    224   void DeleteBaseReg(const SCEV *&S);
    225 
    226   bool referencesReg(const SCEV *S) const;
    227   bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
    228                                   const RegUseTracker &RegUses) const;
    229 
    230   void print(raw_ostream &OS) const;
    231   void dump() const;
    232 };
    233 
    234 }
    235 
    236 /// DoInitialMatch - Recursion helper for InitialMatch.
    237 static void DoInitialMatch(const SCEV *S, Loop *L,
    238                            SmallVectorImpl<const SCEV *> &Good,
    239                            SmallVectorImpl<const SCEV *> &Bad,
    240                            ScalarEvolution &SE) {
    241   // Collect expressions which properly dominate the loop header.
    242   if (SE.properlyDominates(S, L->getHeader())) {
    243     Good.push_back(S);
    244     return;
    245   }
    246 
    247   // Look at add operands.
    248   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    249     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
    250          I != E; ++I)
    251       DoInitialMatch(*I, L, Good, Bad, SE);
    252     return;
    253   }
    254 
    255   // Look at addrec operands.
    256   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
    257     if (!AR->getStart()->isZero()) {
    258       DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
    259       DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
    260                                       AR->getStepRecurrence(SE),
    261                                       // FIXME: AR->getNoWrapFlags()
    262                                       AR->getLoop(), SCEV::FlagAnyWrap),
    263                      L, Good, Bad, SE);
    264       return;
    265     }
    266 
    267   // Handle a multiplication by -1 (negation) if it didn't fold.
    268   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
    269     if (Mul->getOperand(0)->isAllOnesValue()) {
    270       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
    271       const SCEV *NewMul = SE.getMulExpr(Ops);
    272 
    273       SmallVector<const SCEV *, 4> MyGood;
    274       SmallVector<const SCEV *, 4> MyBad;
    275       DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
    276       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
    277         SE.getEffectiveSCEVType(NewMul->getType())));
    278       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
    279            E = MyGood.end(); I != E; ++I)
    280         Good.push_back(SE.getMulExpr(NegOne, *I));
    281       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
    282            E = MyBad.end(); I != E; ++I)
    283         Bad.push_back(SE.getMulExpr(NegOne, *I));
    284       return;
    285     }
    286 
    287   // Ok, we can't do anything interesting. Just stuff the whole thing into a
    288   // register and hope for the best.
    289   Bad.push_back(S);
    290 }
    291 
    292 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
    293 /// attempting to keep all loop-invariant and loop-computable values in a
    294 /// single base register.
    295 void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
    296   SmallVector<const SCEV *, 4> Good;
    297   SmallVector<const SCEV *, 4> Bad;
    298   DoInitialMatch(S, L, Good, Bad, SE);
    299   if (!Good.empty()) {
    300     const SCEV *Sum = SE.getAddExpr(Good);
    301     if (!Sum->isZero())
    302       BaseRegs.push_back(Sum);
    303     AM.HasBaseReg = true;
    304   }
    305   if (!Bad.empty()) {
    306     const SCEV *Sum = SE.getAddExpr(Bad);
    307     if (!Sum->isZero())
    308       BaseRegs.push_back(Sum);
    309     AM.HasBaseReg = true;
    310   }
    311 }
    312 
    313 /// getNumRegs - Return the total number of register operands used by this
    314 /// formula. This does not include register uses implied by non-constant
    315 /// addrec strides.
    316 unsigned Formula::getNumRegs() const {
    317   return !!ScaledReg + BaseRegs.size();
    318 }
    319 
    320 /// getType - Return the type of this formula, if it has one, or null
    321 /// otherwise. This type is meaningless except for the bit size.
    322 Type *Formula::getType() const {
    323   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
    324          ScaledReg ? ScaledReg->getType() :
    325          AM.BaseGV ? AM.BaseGV->getType() :
    326          0;
    327 }
    328 
    329 /// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
    330 void Formula::DeleteBaseReg(const SCEV *&S) {
    331   if (&S != &BaseRegs.back())
    332     std::swap(S, BaseRegs.back());
    333   BaseRegs.pop_back();
    334 }
    335 
    336 /// referencesReg - Test if this formula references the given register.
    337 bool Formula::referencesReg(const SCEV *S) const {
    338   return S == ScaledReg ||
    339          std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
    340 }
    341 
    342 /// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
    343 /// which are used by uses other than the use with the given index.
    344 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
    345                                          const RegUseTracker &RegUses) const {
    346   if (ScaledReg)
    347     if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
    348       return true;
    349   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
    350        E = BaseRegs.end(); I != E; ++I)
    351     if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
    352       return true;
    353   return false;
    354 }
    355 
    356 void Formula::print(raw_ostream &OS) const {
    357   bool First = true;
    358   if (AM.BaseGV) {
    359     if (!First) OS << " + "; else First = false;
    360     WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
    361   }
    362   if (AM.BaseOffs != 0) {
    363     if (!First) OS << " + "; else First = false;
    364     OS << AM.BaseOffs;
    365   }
    366   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
    367        E = BaseRegs.end(); I != E; ++I) {
    368     if (!First) OS << " + "; else First = false;
    369     OS << "reg(" << **I << ')';
    370   }
    371   if (AM.HasBaseReg && BaseRegs.empty()) {
    372     if (!First) OS << " + "; else First = false;
    373     OS << "**error: HasBaseReg**";
    374   } else if (!AM.HasBaseReg && !BaseRegs.empty()) {
    375     if (!First) OS << " + "; else First = false;
    376     OS << "**error: !HasBaseReg**";
    377   }
    378   if (AM.Scale != 0) {
    379     if (!First) OS << " + "; else First = false;
    380     OS << AM.Scale << "*reg(";
    381     if (ScaledReg)
    382       OS << *ScaledReg;
    383     else
    384       OS << "<unknown>";
    385     OS << ')';
    386   }
    387   if (UnfoldedOffset != 0) {
    388     if (!First) OS << " + "; else First = false;
    389     OS << "imm(" << UnfoldedOffset << ')';
    390   }
    391 }
    392 
    393 void Formula::dump() const {
    394   print(errs()); errs() << '\n';
    395 }
    396 
    397 /// isAddRecSExtable - Return true if the given addrec can be sign-extended
    398 /// without changing its value.
    399 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
    400   Type *WideTy =
    401     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
    402   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
    403 }
    404 
    405 /// isAddSExtable - Return true if the given add can be sign-extended
    406 /// without changing its value.
    407 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
    408   Type *WideTy =
    409     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
    410   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
    411 }
    412 
    413 /// isMulSExtable - Return true if the given mul can be sign-extended
    414 /// without changing its value.
    415 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
    416   Type *WideTy =
    417     IntegerType::get(SE.getContext(),
    418                      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
    419   return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
    420 }
    421 
    422 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
    423 /// and if the remainder is known to be zero,  or null otherwise. If
    424 /// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
    425 /// to Y, ignoring that the multiplication may overflow, which is useful when
    426 /// the result will be used in a context where the most significant bits are
    427 /// ignored.
    428 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
    429                                 ScalarEvolution &SE,
    430                                 bool IgnoreSignificantBits = false) {
    431   // Handle the trivial case, which works for any SCEV type.
    432   if (LHS == RHS)
    433     return SE.getConstant(LHS->getType(), 1);
    434 
    435   // Handle a few RHS special cases.
    436   const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
    437   if (RC) {
    438     const APInt &RA = RC->getValue()->getValue();
    439     // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
    440     // some folding.
    441     if (RA.isAllOnesValue())
    442       return SE.getMulExpr(LHS, RC);
    443     // Handle x /s 1 as x.
    444     if (RA == 1)
    445       return LHS;
    446   }
    447 
    448   // Check for a division of a constant by a constant.
    449   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
    450     if (!RC)
    451       return 0;
    452     const APInt &LA = C->getValue()->getValue();
    453     const APInt &RA = RC->getValue()->getValue();
    454     if (LA.srem(RA) != 0)
    455       return 0;
    456     return SE.getConstant(LA.sdiv(RA));
    457   }
    458 
    459   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
    460   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
    461     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
    462       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
    463                                       IgnoreSignificantBits);
    464       if (!Step) return 0;
    465       const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
    466                                        IgnoreSignificantBits);
    467       if (!Start) return 0;
    468       // FlagNW is independent of the start value, step direction, and is
    469       // preserved with smaller magnitude steps.
    470       // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
    471       return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
    472     }
    473     return 0;
    474   }
    475 
    476   // Distribute the sdiv over add operands, if the add doesn't overflow.
    477   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
    478     if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
    479       SmallVector<const SCEV *, 8> Ops;
    480       for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
    481            I != E; ++I) {
    482         const SCEV *Op = getExactSDiv(*I, RHS, SE,
    483                                       IgnoreSignificantBits);
    484         if (!Op) return 0;
    485         Ops.push_back(Op);
    486       }
    487       return SE.getAddExpr(Ops);
    488     }
    489     return 0;
    490   }
    491 
    492   // Check for a multiply operand that we can pull RHS out of.
    493   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
    494     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
    495       SmallVector<const SCEV *, 4> Ops;
    496       bool Found = false;
    497       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
    498            I != E; ++I) {
    499         const SCEV *S = *I;
    500         if (!Found)
    501           if (const SCEV *Q = getExactSDiv(S, RHS, SE,
    502                                            IgnoreSignificantBits)) {
    503             S = Q;
    504             Found = true;
    505           }
    506         Ops.push_back(S);
    507       }
    508       return Found ? SE.getMulExpr(Ops) : 0;
    509     }
    510     return 0;
    511   }
    512 
    513   // Otherwise we don't know.
    514   return 0;
    515 }
    516 
    517 /// ExtractImmediate - If S involves the addition of a constant integer value,
    518 /// return that integer value, and mutate S to point to a new SCEV with that
    519 /// value excluded.
    520 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
    521   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
    522     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
    523       S = SE.getConstant(C->getType(), 0);
    524       return C->getValue()->getSExtValue();
    525     }
    526   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    527     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    528     int64_t Result = ExtractImmediate(NewOps.front(), SE);
    529     if (Result != 0)
    530       S = SE.getAddExpr(NewOps);
    531     return Result;
    532   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    533     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    534     int64_t Result = ExtractImmediate(NewOps.front(), SE);
    535     if (Result != 0)
    536       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
    537                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
    538                            SCEV::FlagAnyWrap);
    539     return Result;
    540   }
    541   return 0;
    542 }
    543 
    544 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
    545 /// return that symbol, and mutate S to point to a new SCEV with that
    546 /// value excluded.
    547 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
    548   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
    549     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
    550       S = SE.getConstant(GV->getType(), 0);
    551       return GV;
    552     }
    553   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    554     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    555     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
    556     if (Result)
    557       S = SE.getAddExpr(NewOps);
    558     return Result;
    559   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    560     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    561     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
    562     if (Result)
    563       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
    564                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
    565                            SCEV::FlagAnyWrap);
    566     return Result;
    567   }
    568   return 0;
    569 }
    570 
    571 /// isAddressUse - Returns true if the specified instruction is using the
    572 /// specified value as an address.
    573 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
    574   bool isAddress = isa<LoadInst>(Inst);
    575   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    576     if (SI->getOperand(1) == OperandVal)
    577       isAddress = true;
    578   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    579     // Addressing modes can also be folded into prefetches and a variety
    580     // of intrinsics.
    581     switch (II->getIntrinsicID()) {
    582       default: break;
    583       case Intrinsic::prefetch:
    584       case Intrinsic::x86_sse_storeu_ps:
    585       case Intrinsic::x86_sse2_storeu_pd:
    586       case Intrinsic::x86_sse2_storeu_dq:
    587       case Intrinsic::x86_sse2_storel_dq:
    588         if (II->getArgOperand(0) == OperandVal)
    589           isAddress = true;
    590         break;
    591     }
    592   }
    593   return isAddress;
    594 }
    595 
    596 /// getAccessType - Return the type of the memory being accessed.
    597 static Type *getAccessType(const Instruction *Inst) {
    598   Type *AccessTy = Inst->getType();
    599   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
    600     AccessTy = SI->getOperand(0)->getType();
    601   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    602     // Addressing modes can also be folded into prefetches and a variety
    603     // of intrinsics.
    604     switch (II->getIntrinsicID()) {
    605     default: break;
    606     case Intrinsic::x86_sse_storeu_ps:
    607     case Intrinsic::x86_sse2_storeu_pd:
    608     case Intrinsic::x86_sse2_storeu_dq:
    609     case Intrinsic::x86_sse2_storel_dq:
    610       AccessTy = II->getArgOperand(0)->getType();
    611       break;
    612     }
    613   }
    614 
    615   // All pointers have the same requirements, so canonicalize them to an
    616   // arbitrary pointer type to minimize variation.
    617   if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
    618     AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
    619                                 PTy->getAddressSpace());
    620 
    621   return AccessTy;
    622 }
    623 
    624 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
    625 /// specified set are trivially dead, delete them and see if this makes any of
    626 /// their operands subsequently dead.
    627 static bool
    628 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
    629   bool Changed = false;
    630 
    631   while (!DeadInsts.empty()) {
    632     Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val());
    633 
    634     if (I == 0 || !isInstructionTriviallyDead(I))
    635       continue;
    636 
    637     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
    638       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
    639         *OI = 0;
    640         if (U->use_empty())
    641           DeadInsts.push_back(U);
    642       }
    643 
    644     I->eraseFromParent();
    645     Changed = true;
    646   }
    647 
    648   return Changed;
    649 }
    650 
    651 namespace {
    652 
    653 /// Cost - This class is used to measure and compare candidate formulae.
    654 class Cost {
    655   /// TODO: Some of these could be merged. Also, a lexical ordering
    656   /// isn't always optimal.
    657   unsigned NumRegs;
    658   unsigned AddRecCost;
    659   unsigned NumIVMuls;
    660   unsigned NumBaseAdds;
    661   unsigned ImmCost;
    662   unsigned SetupCost;
    663 
    664 public:
    665   Cost()
    666     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
    667       SetupCost(0) {}
    668 
    669   bool operator<(const Cost &Other) const;
    670 
    671   void Loose();
    672 
    673   void RateFormula(const Formula &F,
    674                    SmallPtrSet<const SCEV *, 16> &Regs,
    675                    const DenseSet<const SCEV *> &VisitedRegs,
    676                    const Loop *L,
    677                    const SmallVectorImpl<int64_t> &Offsets,
    678                    ScalarEvolution &SE, DominatorTree &DT);
    679 
    680   void print(raw_ostream &OS) const;
    681   void dump() const;
    682 
    683 private:
    684   void RateRegister(const SCEV *Reg,
    685                     SmallPtrSet<const SCEV *, 16> &Regs,
    686                     const Loop *L,
    687                     ScalarEvolution &SE, DominatorTree &DT);
    688   void RatePrimaryRegister(const SCEV *Reg,
    689                            SmallPtrSet<const SCEV *, 16> &Regs,
    690                            const Loop *L,
    691                            ScalarEvolution &SE, DominatorTree &DT);
    692 };
    693 
    694 }
    695 
    696 /// RateRegister - Tally up interesting quantities from the given register.
    697 void Cost::RateRegister(const SCEV *Reg,
    698                         SmallPtrSet<const SCEV *, 16> &Regs,
    699                         const Loop *L,
    700                         ScalarEvolution &SE, DominatorTree &DT) {
    701   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
    702     if (AR->getLoop() == L)
    703       AddRecCost += 1; /// TODO: This should be a function of the stride.
    704 
    705     // If this is an addrec for a loop that's already been visited by LSR,
    706     // don't second-guess its addrec phi nodes. LSR isn't currently smart
    707     // enough to reason about more than one loop at a time. Consider these
    708     // registers free and leave them alone.
    709     else if (L->contains(AR->getLoop()) ||
    710              (!AR->getLoop()->contains(L) &&
    711               DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
    712       for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
    713            PHINode *PN = dyn_cast<PHINode>(I); ++I)
    714         if (SE.isSCEVable(PN->getType()) &&
    715             (SE.getEffectiveSCEVType(PN->getType()) ==
    716              SE.getEffectiveSCEVType(AR->getType())) &&
    717             SE.getSCEV(PN) == AR)
    718           return;
    719 
    720       // If this isn't one of the addrecs that the loop already has, it
    721       // would require a costly new phi and add. TODO: This isn't
    722       // precisely modeled right now.
    723       ++NumBaseAdds;
    724       if (!Regs.count(AR->getStart()))
    725         RateRegister(AR->getStart(), Regs, L, SE, DT);
    726     }
    727 
    728     // Add the step value register, if it needs one.
    729     // TODO: The non-affine case isn't precisely modeled here.
    730     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
    731       if (!Regs.count(AR->getStart()))
    732         RateRegister(AR->getOperand(1), Regs, L, SE, DT);
    733   }
    734   ++NumRegs;
    735 
    736   // Rough heuristic; favor registers which don't require extra setup
    737   // instructions in the preheader.
    738   if (!isa<SCEVUnknown>(Reg) &&
    739       !isa<SCEVConstant>(Reg) &&
    740       !(isa<SCEVAddRecExpr>(Reg) &&
    741         (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
    742          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
    743     ++SetupCost;
    744 
    745     NumIVMuls += isa<SCEVMulExpr>(Reg) &&
    746                  SE.hasComputableLoopEvolution(Reg, L);
    747 }
    748 
    749 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
    750 /// before, rate it.
    751 void Cost::RatePrimaryRegister(const SCEV *Reg,
    752                                SmallPtrSet<const SCEV *, 16> &Regs,
    753                                const Loop *L,
    754                                ScalarEvolution &SE, DominatorTree &DT) {
    755   if (Regs.insert(Reg))
    756     RateRegister(Reg, Regs, L, SE, DT);
    757 }
    758 
    759 void Cost::RateFormula(const Formula &F,
    760                        SmallPtrSet<const SCEV *, 16> &Regs,
    761                        const DenseSet<const SCEV *> &VisitedRegs,
    762                        const Loop *L,
    763                        const SmallVectorImpl<int64_t> &Offsets,
    764                        ScalarEvolution &SE, DominatorTree &DT) {
    765   // Tally up the registers.
    766   if (const SCEV *ScaledReg = F.ScaledReg) {
    767     if (VisitedRegs.count(ScaledReg)) {
    768       Loose();
    769       return;
    770     }
    771     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
    772   }
    773   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
    774        E = F.BaseRegs.end(); I != E; ++I) {
    775     const SCEV *BaseReg = *I;
    776     if (VisitedRegs.count(BaseReg)) {
    777       Loose();
    778       return;
    779     }
    780     RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
    781   }
    782 
    783   // Determine how many (unfolded) adds we'll need inside the loop.
    784   size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
    785   if (NumBaseParts > 1)
    786     NumBaseAdds += NumBaseParts - 1;
    787 
    788   // Tally up the non-zero immediates.
    789   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
    790        E = Offsets.end(); I != E; ++I) {
    791     int64_t Offset = (uint64_t)*I + F.AM.BaseOffs;
    792     if (F.AM.BaseGV)
    793       ImmCost += 64; // Handle symbolic values conservatively.
    794                      // TODO: This should probably be the pointer size.
    795     else if (Offset != 0)
    796       ImmCost += APInt(64, Offset, true).getMinSignedBits();
    797   }
    798 }
    799 
    800 /// Loose - Set this cost to a losing value.
    801 void Cost::Loose() {
    802   NumRegs = ~0u;
    803   AddRecCost = ~0u;
    804   NumIVMuls = ~0u;
    805   NumBaseAdds = ~0u;
    806   ImmCost = ~0u;
    807   SetupCost = ~0u;
    808 }
    809 
    810 /// operator< - Choose the lower cost.
    811 bool Cost::operator<(const Cost &Other) const {
    812   if (NumRegs != Other.NumRegs)
    813     return NumRegs < Other.NumRegs;
    814   if (AddRecCost != Other.AddRecCost)
    815     return AddRecCost < Other.AddRecCost;
    816   if (NumIVMuls != Other.NumIVMuls)
    817     return NumIVMuls < Other.NumIVMuls;
    818   if (NumBaseAdds != Other.NumBaseAdds)
    819     return NumBaseAdds < Other.NumBaseAdds;
    820   if (ImmCost != Other.ImmCost)
    821     return ImmCost < Other.ImmCost;
    822   if (SetupCost != Other.SetupCost)
    823     return SetupCost < Other.SetupCost;
    824   return false;
    825 }
    826 
    827 void Cost::print(raw_ostream &OS) const {
    828   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
    829   if (AddRecCost != 0)
    830     OS << ", with addrec cost " << AddRecCost;
    831   if (NumIVMuls != 0)
    832     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
    833   if (NumBaseAdds != 0)
    834     OS << ", plus " << NumBaseAdds << " base add"
    835        << (NumBaseAdds == 1 ? "" : "s");
    836   if (ImmCost != 0)
    837     OS << ", plus " << ImmCost << " imm cost";
    838   if (SetupCost != 0)
    839     OS << ", plus " << SetupCost << " setup cost";
    840 }
    841 
    842 void Cost::dump() const {
    843   print(errs()); errs() << '\n';
    844 }
    845 
    846 namespace {
    847 
/// LSRFixup - An operand value in an instruction which is to be replaced
/// with some equivalent, possibly strength-reduced, replacement.
struct LSRFixup {
  /// UserInst - The instruction which will be updated.
  Instruction *UserInst;

  /// OperandValToReplace - The operand of the instruction which will
  /// be replaced. The operand may be used more than once; every instance
  /// will be replaced.
  Value *OperandValToReplace;

  /// PostIncLoops - The set of loops for which this user is to use the
  /// post-incremented value of an induction variable. (Presumably empty when
  /// the user is not a post-increment user.)
  PostIncLoopSet PostIncLoops;

  /// LUIdx - The index of the LSRUse describing the expression which
  /// this fixup needs, minus an offset (below). The constructor initializes
  /// it to ~size_t(0), which print() treats as "not set".
  size_t LUIdx;

  /// Offset - A constant offset to be added to the LSRUse expression.
  /// This allows multiple fixups to share the same LSRUse with different
  /// offsets, for example in an unrolled loop.
  int64_t Offset;

  /// isUseFullyOutsideLoop - Test whether this fixup always uses its value
  /// outside of the given loop.
  bool isUseFullyOutsideLoop(const Loop *L) const;

  /// Construct a fixup with null instruction/operand, an unset LUIdx and a
  /// zero Offset.
  LSRFixup();

  /// print - Write a one-line description of this fixup to OS.
  void print(raw_ostream &OS) const;
  /// dump - print() to the error stream, followed by a newline.
  void dump() const;
};
    880 
    881 }
    882 
    883 LSRFixup::LSRFixup()
    884   : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {}
    885 
    886 /// isUseFullyOutsideLoop - Test whether this fixup always uses its
    887 /// value outside of the given loop.
    888 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
    889   // PHI nodes use their value in their incoming blocks.
    890   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
    891     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    892       if (PN->getIncomingValue(i) == OperandValToReplace &&
    893           L->contains(PN->getIncomingBlock(i)))
    894         return false;
    895     return true;
    896   }
    897 
    898   return !L->contains(UserInst);
    899 }
    900 
    901 void LSRFixup::print(raw_ostream &OS) const {
    902   OS << "UserInst=";
    903   // Store is common and interesting enough to be worth special-casing.
    904   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    905     OS << "store ";
    906     WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
    907   } else if (UserInst->getType()->isVoidTy())
    908     OS << UserInst->getOpcodeName();
    909   else
    910     WriteAsOperand(OS, UserInst, /*PrintType=*/false);
    911 
    912   OS << ", OperandValToReplace=";
    913   WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
    914 
    915   for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
    916        E = PostIncLoops.end(); I != E; ++I) {
    917     OS << ", PostIncLoop=";
    918     WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
    919   }
    920 
    921   if (LUIdx != ~size_t(0))
    922     OS << ", LUIdx=" << LUIdx;
    923 
    924   if (Offset != 0)
    925     OS << ", Offset=" << Offset;
    926 }
    927 
    928 void LSRFixup::dump() const {
    929   print(errs()); errs() << '\n';
    930 }
    931 
    932 namespace {
    933 
    934 /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
    935 /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
    936 struct UniquifierDenseMapInfo {
    937   static SmallVector<const SCEV *, 2> getEmptyKey() {
    938     SmallVector<const SCEV *, 2> V;
    939     V.push_back(reinterpret_cast<const SCEV *>(-1));
    940     return V;
    941   }
    942 
    943   static SmallVector<const SCEV *, 2> getTombstoneKey() {
    944     SmallVector<const SCEV *, 2> V;
    945     V.push_back(reinterpret_cast<const SCEV *>(-2));
    946     return V;
    947   }
    948 
    949   static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) {
    950     unsigned Result = 0;
    951     for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
    952          E = V.end(); I != E; ++I)
    953       Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
    954     return Result;
    955   }
    956 
    957   static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
    958                       const SmallVector<const SCEV *, 2> &RHS) {
    959     return LHS == RHS;
    960   }
    961 };
    962 
/// LSRUse - This class holds the state that LSR keeps for each use in
/// IVUsers, as well as uses invented by LSR itself. It includes information
/// about what kinds of things can be folded into the user, information about
/// the user itself, and information about how the use may be satisfied.
/// TODO: Represent multiple users of the same expression in common?
class LSRUse {
  /// Uniquifier - The set of sorted register lists (base registers plus the
  /// scaled register) of the formulae inserted so far. InsertFormula uses it
  /// to reject a formula whose register set has already been seen.
  DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;

public:
  /// KindType - An enum for a kind of use, indicating what types of
  /// scaled and immediate operands it might support.
  enum KindType {
    Basic,   ///< A normal use, with no folding.
    Special, ///< A special case of basic, allowing -1 scales.
    Address, ///< An address use; folding according to TargetLowering
    ICmpZero ///< An equality icmp with both operands folded into one.
    // TODO: Add a generic icmp too?
  };

  /// Kind - What kind of use this is (see KindType).
  KindType Kind;
  /// AccessTy - The type associated with the use; print() shows it for
  /// Address uses. (Presumably the accessed memory type -- confirm.)
  Type *AccessTy;

  /// Offsets - The constant offsets recorded for this use; print() lists
  /// them and the cost model charges an immediate for each one.
  SmallVector<int64_t, 8> Offsets;
  /// MinOffset / MaxOffset - Initialized by the constructor to INT64_MAX and
  /// INT64_MIN respectively, i.e. an empty range. (Presumably the bounds of
  /// the values in Offsets once fixups are added -- confirm at the callers.)
  int64_t MinOffset;
  int64_t MaxOffset;

  /// AllFixupsOutsideLoop - This records whether all of the fixups using this
  /// LSRUse are outside of the loop, in which case some special-case heuristics
  /// may be used.
  bool AllFixupsOutsideLoop;

  /// WidestFixupType - This records the widest use type for any fixup using
  /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
  /// max fixup widths to be equivalent, because the narrower one may be relying
  /// on the implicit truncation to truncate away bogus bits.
  Type *WidestFixupType;

  /// Formulae - A list of ways to build a value that can satisfy this user.
  /// After the list is populated, one of these is selected heuristically and
  /// used to formulate a replacement for OperandValToReplace in UserInst.
  SmallVector<Formula, 12> Formulae;

  /// Regs - The set of register candidates used by all formulae in this LSRUse.
  SmallPtrSet<const SCEV *, 4> Regs;

  /// Construct a use of kind K with access type T, an empty offset range,
  /// AllFixupsOutsideLoop set, and no widest fixup type recorded yet.
  LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
                                      MinOffset(INT64_MAX),
                                      MaxOffset(INT64_MIN),
                                      AllFixupsOutsideLoop(true),
                                      WidestFixupType(0) {}

  /// HasFormulaWithSameRegs - Test whether a formula using exactly the same
  /// registers as F has already been inserted.
  bool HasFormulaWithSameRegs(const Formula &F) const;
  /// InsertFormula - Add F unless a formula with the same registers exists;
  /// returns true if F was added.
  bool InsertFormula(const Formula &F);
  /// DeleteFormula - Remove F (an element of Formulae) from the list.
  void DeleteFormula(Formula &F);
  /// RecomputeRegs - Rebuild Regs from the remaining formulae and drop
  /// registers that are no longer used from the register tracker.
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);

  /// print - Write a one-line description of this use to OS.
  void print(raw_ostream &OS) const;
  /// dump - print() to the error stream, followed by a newline.
  void dump() const;
};
   1022 
   1023 }
   1024 
   1025 /// HasFormula - Test whether this use as a formula which has the same
   1026 /// registers as the given formula.
   1027 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
   1028   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
   1029   if (F.ScaledReg) Key.push_back(F.ScaledReg);
   1030   // Unstable sort by host order ok, because this is only used for uniquifying.
   1031   std::sort(Key.begin(), Key.end());
   1032   return Uniquifier.count(Key);
   1033 }
   1034 
   1035 /// InsertFormula - If the given formula has not yet been inserted, add it to
   1036 /// the list, and return true. Return false otherwise.
   1037 bool LSRUse::InsertFormula(const Formula &F) {
   1038   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
   1039   if (F.ScaledReg) Key.push_back(F.ScaledReg);
   1040   // Unstable sort by host order ok, because this is only used for uniquifying.
   1041   std::sort(Key.begin(), Key.end());
   1042 
   1043   if (!Uniquifier.insert(Key).second)
   1044     return false;
   1045 
   1046   // Using a register to hold the value of 0 is not profitable.
   1047   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
   1048          "Zero allocated in a scaled register!");
   1049 #ifndef NDEBUG
   1050   for (SmallVectorImpl<const SCEV *>::const_iterator I =
   1051        F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
   1052     assert(!(*I)->isZero() && "Zero allocated in a base register!");
   1053 #endif
   1054 
   1055   // Add the formula to the list.
   1056   Formulae.push_back(F);
   1057 
   1058   // Record registers now being used by this use.
   1059   if (F.ScaledReg) Regs.insert(F.ScaledReg);
   1060   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
   1061 
   1062   return true;
   1063 }
   1064 
   1065 /// DeleteFormula - Remove the given formula from this use's list.
   1066 void LSRUse::DeleteFormula(Formula &F) {
   1067   if (&F != &Formulae.back())
   1068     std::swap(F, Formulae.back());
   1069   Formulae.pop_back();
   1070   assert(!Formulae.empty() && "LSRUse has no formulae left!");
   1071 }
   1072 
   1073 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
   1074 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
   1075   // Now that we've filtered out some formulae, recompute the Regs set.
   1076   SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
   1077   Regs.clear();
   1078   for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
   1079        E = Formulae.end(); I != E; ++I) {
   1080     const Formula &F = *I;
   1081     if (F.ScaledReg) Regs.insert(F.ScaledReg);
   1082     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
   1083   }
   1084 
   1085   // Update the RegTracker.
   1086   for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
   1087        E = OldRegs.end(); I != E; ++I)
   1088     if (!Regs.count(*I))
   1089       RegUses.DropRegister(*I, LUIdx);
   1090 }
   1091 
   1092 void LSRUse::print(raw_ostream &OS) const {
   1093   OS << "LSR Use: Kind=";
   1094   switch (Kind) {
   1095   case Basic:    OS << "Basic"; break;
   1096   case Special:  OS << "Special"; break;
   1097   case ICmpZero: OS << "ICmpZero"; break;
   1098   case Address:
   1099     OS << "Address of ";
   1100     if (AccessTy->isPointerTy())
   1101       OS << "pointer"; // the full pointer type could be really verbose
   1102     else
   1103       OS << *AccessTy;
   1104   }
   1105 
   1106   OS << ", Offsets={";
   1107   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
   1108        E = Offsets.end(); I != E; ++I) {
   1109     OS << *I;
   1110     if (llvm::next(I) != E)
   1111       OS << ',';
   1112   }
   1113   OS << '}';
   1114 
   1115   if (AllFixupsOutsideLoop)
   1116     OS << ", all-fixups-outside-loop";
   1117 
   1118   if (WidestFixupType)
   1119     OS << ", widest fixup type: " << *WidestFixupType;
   1120 }
   1121 
   1122 void LSRUse::dump() const {
   1123   print(errs()); errs() << '\n';
   1124 }
   1125 
   1126 /// isLegalUse - Test whether the use described by AM is "legal", meaning it can
   1127 /// be completely folded into the user instruction at isel time. This includes
   1128 /// address-mode folding and special icmp tricks.
   1129 static bool isLegalUse(const TargetLowering::AddrMode &AM,
   1130                        LSRUse::KindType Kind, Type *AccessTy,
   1131                        const TargetLowering *TLI) {
   1132   switch (Kind) {
   1133   case LSRUse::Address:
   1134     // If we have low-level target information, ask the target if it can
   1135     // completely fold this address.
   1136     if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
   1137 
   1138     // Otherwise, just guess that reg+reg addressing is legal.
   1139     return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
   1140 
   1141   case LSRUse::ICmpZero:
   1142     // There's not even a target hook for querying whether it would be legal to
   1143     // fold a GV into an ICmp.
   1144     if (AM.BaseGV)
   1145       return false;
   1146 
   1147     // ICmp only has two operands; don't allow more than two non-trivial parts.
   1148     if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
   1149       return false;
   1150 
   1151     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
   1152     // putting the scaled register in the other operand of the icmp.
   1153     if (AM.Scale != 0 && AM.Scale != -1)
   1154       return false;
   1155 
   1156     // If we have low-level target information, ask the target if it can fold an
   1157     // integer immediate on an icmp.
   1158     if (AM.BaseOffs != 0) {
   1159       if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
   1160       return false;
   1161     }
   1162 
   1163     return true;
   1164 
   1165   case LSRUse::Basic:
   1166     // Only handle single-register values.
   1167     return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
   1168 
   1169   case LSRUse::Special:
   1170     // Only handle -1 scales, or no scale.
   1171     return AM.Scale == 0 || AM.Scale == -1;
   1172   }
   1173 
   1174   return false;
   1175 }
   1176 
   1177 static bool isLegalUse(TargetLowering::AddrMode AM,
   1178                        int64_t MinOffset, int64_t MaxOffset,
   1179                        LSRUse::KindType Kind, Type *AccessTy,
   1180                        const TargetLowering *TLI) {
   1181   // Check for overflow.
   1182   if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
   1183       (MinOffset > 0))
   1184     return false;
   1185   AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
   1186   if (isLegalUse(AM, Kind, AccessTy, TLI)) {
   1187     AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
   1188     // Check for overflow.
   1189     if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
   1190         (MaxOffset > 0))
   1191       return false;
   1192     AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
   1193     return isLegalUse(AM, Kind, AccessTy, TLI);
   1194   }
   1195   return false;
   1196 }
   1197 
   1198 static bool isAlwaysFoldable(int64_t BaseOffs,
   1199                              GlobalValue *BaseGV,
   1200                              bool HasBaseReg,
   1201                              LSRUse::KindType Kind, Type *AccessTy,
   1202                              const TargetLowering *TLI) {
   1203   // Fast-path: zero is always foldable.
   1204   if (BaseOffs == 0 && !BaseGV) return true;
   1205 
   1206   // Conservatively, create an address with an immediate and a
   1207   // base and a scale.
   1208   TargetLowering::AddrMode AM;
   1209   AM.BaseOffs = BaseOffs;
   1210   AM.BaseGV = BaseGV;
   1211   AM.HasBaseReg = HasBaseReg;
   1212   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
   1213 
   1214   // Canonicalize a scale of 1 to a base register if the formula doesn't
   1215   // already have a base register.
   1216   if (!AM.HasBaseReg && AM.Scale == 1) {
   1217     AM.Scale = 0;
   1218     AM.HasBaseReg = true;
   1219   }
   1220 
   1221   return isLegalUse(AM, Kind, AccessTy, TLI);
   1222 }
   1223 
   1224 static bool isAlwaysFoldable(const SCEV *S,
   1225                              int64_t MinOffset, int64_t MaxOffset,
   1226                              bool HasBaseReg,
   1227                              LSRUse::KindType Kind, Type *AccessTy,
   1228                              const TargetLowering *TLI,
   1229                              ScalarEvolution &SE) {
   1230   // Fast-path: zero is always foldable.
   1231   if (S->isZero()) return true;
   1232 
   1233   // Conservatively, create an address with an immediate and a
   1234   // base and a scale.
   1235   int64_t BaseOffs = ExtractImmediate(S, SE);
   1236   GlobalValue *BaseGV = ExtractSymbol(S, SE);
   1237 
   1238   // If there's anything else involved, it's not foldable.
   1239   if (!S->isZero()) return false;
   1240 
   1241   // Fast-path: zero is always foldable.
   1242   if (BaseOffs == 0 && !BaseGV) return true;
   1243 
   1244   // Conservatively, create an address with an immediate and a
   1245   // base and a scale.
   1246   TargetLowering::AddrMode AM;
   1247   AM.BaseOffs = BaseOffs;
   1248   AM.BaseGV = BaseGV;
   1249   AM.HasBaseReg = HasBaseReg;
   1250   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
   1251 
   1252   return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
   1253 }
   1254 
   1255 namespace {
   1256 
   1257 /// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
   1258 /// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
   1259 struct UseMapDenseMapInfo {
   1260   static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
   1261     return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
   1262   }
   1263 
   1264   static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
   1265     return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
   1266   }
   1267 
   1268   static unsigned
   1269   getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
   1270     unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
   1271     Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
   1272     return Result;
   1273   }
   1274 
   1275   static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
   1276                       const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
   1277     return LHS == RHS;
   1278   }
   1279 };
   1280 
/// LSRInstance - This class holds state for the main loop strength reduction
/// logic. One instance is constructed per loop (see the constructor, which
/// takes the loop and the owning pass).
class LSRInstance {
  // Analyses and context used while processing the loop.
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  LoopInfo &LI;
  /// TLI - Target lowering information; may be null, in which case code that
  /// consults it falls back to target-independent guesses.
  const TargetLowering *const TLI;
  /// L - The loop being processed.
  Loop *const L;
  /// Changed - Set when the IR has been modified; reported by getChanged().
  bool Changed;

  /// IVIncInsertPos - This is the insert position that the current loop's
  /// induction variable increment should be placed. In simple loops, this is
  /// the latch block's terminator. But in more complicated cases, this is a
  /// position which will dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos;

  /// Factors - Interesting factors between use strides.
  SmallSetVector<int64_t, 8> Factors;

  /// Types - Interesting use types, to facilitate truncation reuse.
  SmallSetVector<Type *, 4> Types;

  /// Fixups - The list of operands which are to be replaced.
  SmallVector<LSRFixup, 16> Fixups;

  /// Uses - The list of interesting uses.
  SmallVector<LSRUse, 16> Uses;

  /// RegUses - Track which uses use which register candidates.
  RegUseTracker RegUses;

  // Preliminary transformations applied to the loop's induction variables.
  void OptimizeShadowIV();
  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
  void OptimizeLoopTermCond();

  // Collection of the inputs to the formula search.
  void CollectInterestingTypesAndFactors();
  void CollectFixupsAndInitialFormulae();

  /// getNewFixup - Append a default-constructed fixup to Fixups and return a
  /// reference to it. The reference is into the Fixups vector, so a later
  /// call that grows the vector may invalidate it.
  LSRFixup &getNewFixup() {
    Fixups.push_back(LSRFixup());
    return Fixups.back();
  }

  // Support for sharing of LSRUses between LSRFixups.
  // UseMap maps an (expression, kind) pair to a size_t value (presumably the
  // index of the corresponding LSRUse in Uses -- confirm in getUse).
  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
                   size_t,
                   UseMapDenseMapInfo> UseMapTy;
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
                          LSRUse::KindType Kind, Type *AccessTy);

  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
                                    LSRUse::KindType Kind,
                                    Type *AccessTy);

  void DeleteUse(LSRUse &LU, size_t LUIdx);

  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);

public:
  // Formula insertion and register bookkeeping.
  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);

  void CollectLoopInvariantFixupsAndFormulae();

  // Formula generators: each takes a use, its index and a base formula.
  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                              unsigned Depth = 0);
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateCrossUseConstantOffsets();
  void GenerateAllReuseFormulae();

  void FilterOutUndesirableDedicatedRegisters();

  // Search-space estimation and pruning.
  size_t EstimateSearchSpaceComplexity() const;
  void NarrowSearchSpaceByDetectingSupersets();
  void NarrowSearchSpaceByCollapsingUnrolledCode();
  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  void NarrowSearchSpaceByPickingWinnerRegs();
  void NarrowSearchSpaceUsingHeuristics();

  // Solver: selects formulae into Solution.
  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                    Cost &SolutionCost,
                    SmallVectorImpl<const Formula *> &Workspace,
                    const Cost &CurCost,
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
                    DenseSet<const SCEV *> &VisitedRegs) const;
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;

  // Insert-position selection for expanded code.
  BasicBlock::iterator
    HoistInsertPosition(BasicBlock::iterator IP,
                        const SmallVectorImpl<Instruction *> &Inputs) const;
  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
                                                     const LSRFixup &LF,
                                                     const LSRUse &LU) const;

  // Expansion of the chosen formulae and rewriting of the fixups.
  Value *Expand(const LSRFixup &LF,
                const Formula &F,
                BasicBlock::iterator IP,
                SCEVExpander &Rewriter,
                SmallVectorImpl<WeakVH> &DeadInsts) const;
  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
                     const Formula &F,
                     SCEVExpander &Rewriter,
                     SmallVectorImpl<WeakVH> &DeadInsts,
                     Pass *P) const;
  void Rewrite(const LSRFixup &LF,
               const Formula &F,
               SCEVExpander &Rewriter,
               SmallVectorImpl<WeakVH> &DeadInsts,
               Pass *P) const;
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                         Pass *P);

  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);

  /// getChanged - Return whether any change has been recorded.
  bool getChanged() const { return Changed; }

  // Debug printing.
  void print_factors_and_types(raw_ostream &OS) const;
  void print_fixups(raw_ostream &OS) const;
  void print_uses(raw_ostream &OS) const;
  void print(raw_ostream &OS) const;
  void dump() const;
};
   1414 
   1415 }
   1416 
   1417 /// OptimizeShadowIV - If IV is used in a int-to-float cast
   1418 /// inside the loop then try to eliminate the cast operation.
   1419 void LSRInstance::OptimizeShadowIV() {
   1420   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
   1421   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
   1422     return;
   1423 
   1424   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
   1425        UI != E; /* empty */) {
   1426     IVUsers::const_iterator CandidateUI = UI;
   1427     ++UI;
   1428     Instruction *ShadowUse = CandidateUI->getUser();
   1429     Type *DestTy = NULL;
   1430 
   1431     /* If shadow use is a int->float cast then insert a second IV
   1432        to eliminate this cast.
   1433 
   1434          for (unsigned i = 0; i < n; ++i)
   1435            foo((double)i);
   1436 
   1437        is transformed into
   1438 
   1439          double d = 0.0;
   1440          for (unsigned i = 0; i < n; ++i, ++d)
   1441            foo(d);
   1442     */
   1443     if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
   1444       DestTy = UCast->getDestTy();
   1445     else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
   1446       DestTy = SCast->getDestTy();
   1447     if (!DestTy) continue;
   1448 
   1449     if (TLI) {
   1450       // If target does not support DestTy natively then do not apply
   1451       // this transformation.
   1452       EVT DVT = TLI->getValueType(DestTy);
   1453       if (!TLI->isTypeLegal(DVT)) continue;
   1454     }
   1455 
   1456     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
   1457     if (!PH) continue;
   1458     if (PH->getNumIncomingValues() != 2) continue;
   1459 
   1460     Type *SrcTy = PH->getType();
   1461     int Mantissa = DestTy->getFPMantissaWidth();
   1462     if (Mantissa == -1) continue;
   1463     if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
   1464       continue;
   1465 
   1466     unsigned Entry, Latch;
   1467     if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
   1468       Entry = 0;
   1469       Latch = 1;
   1470     } else {
   1471       Entry = 1;
   1472       Latch = 0;
   1473     }
   1474 
   1475     ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
   1476     if (!Init) continue;
   1477     Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
   1478 
   1479     BinaryOperator *Incr =
   1480       dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
   1481     if (!Incr) continue;
   1482     if (Incr->getOpcode() != Instruction::Add
   1483         && Incr->getOpcode() != Instruction::Sub)
   1484       continue;
   1485 
   1486     /* Initialize new IV, double d = 0.0 in above example. */
   1487     ConstantInt *C = NULL;
   1488     if (Incr->getOperand(0) == PH)
   1489       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
   1490     else if (Incr->getOperand(1) == PH)
   1491       C = dyn_cast<ConstantInt>(Incr->getOperand(0));
   1492     else
   1493       continue;
   1494 
   1495     if (!C) continue;
   1496 
   1497     // Ignore negative constants, as the code below doesn't handle them
   1498     // correctly. TODO: Remove this restriction.
   1499     if (!C->getValue().isStrictlyPositive()) continue;
   1500 
   1501     /* Add new PHINode. */
   1502     PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
   1503 
   1504     /* create new increment. '++d' in above example. */
   1505     Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
   1506     BinaryOperator *NewIncr =
   1507       BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
   1508                                Instruction::FAdd : Instruction::FSub,
   1509                              NewPH, CFP, "IV.S.next.", Incr);
   1510 
   1511     NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
   1512     NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
   1513 
   1514     /* Remove cast operation */
   1515     ShadowUse->replaceAllUsesWith(NewPH);
   1516     ShadowUse->eraseFromParent();
   1517     Changed = true;
   1518     break;
   1519   }
   1520 }
   1521 
   1522 /// FindIVUserForCond - Scan the recorded IV users for one whose user
   1523 /// instruction is Cond. On the first match, store it in CondUse and return
   1524 /// true; if no IV use belongs to Cond, return false and leave CondUse alone.
   1525 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
   1526   IVUsers::iterator Use = IU.begin();
   1527   IVUsers::iterator End = IU.end();
   1528   // NOTE: setcc instructions with multiple uses could be handled here too,
   1529   // but InstCombine already does so for simple uses, and it isn't clear the
   1530   // situation is common enough in practice to be worth it.
   1531   while (Use != End && Use->getUser() != Cond)
   1532     ++Use;
   1533   if (Use != End) CondUse = Use;
   1534   return Use != End;
   1535 }
   1536 
   1537 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
   1538 /// a max computation.
   1539 ///
   1540 /// This is a narrow solution to a specific, but acute, problem. For loops
   1541 /// like this:
   1542 ///
   1543 ///   i = 0;
   1544 ///   do {
   1545 ///     p[i] = 0.0;
   1546 ///   } while (++i < n);
   1547 ///
   1548 /// the trip count isn't just 'n', because 'n' might not be positive. And
   1549 /// unfortunately this can come up even for loops where the user didn't use
   1550 /// a C do-while loop. For example, seemingly well-behaved top-test loops
   1551 /// will commonly be lowered like this:
   1552 ///
   1553 ///   if (n > 0) {
   1554 ///     i = 0;
   1555 ///     do {
   1556 ///       p[i] = 0.0;
   1557 ///     } while (++i < n);
   1558 ///   }
   1559 ///
   1560 /// and then it's possible for subsequent optimization to obscure the if
   1561 /// test in such a way that indvars can't find it.
   1562 ///
   1563 /// When indvars can't find the if test in loops like this, it creates a
   1564 /// max expression, which allows it to give the loop a canonical
   1565 /// induction variable:
   1566 ///
   1567 ///   i = 0;
   1568 ///   max = n < 1 ? 1 : n;
   1569 ///   do {
   1570 ///     p[i] = 0.0;
   1571 ///   } while (++i != max);
   1572 ///
   1573 /// Canonical induction variables are necessary because the loop passes
   1574 /// are designed around them. The most obvious example of this is the
   1575 /// LoopInfo analysis, which doesn't remember trip count values. It
   1576 /// expects to be able to rediscover the trip count each time it is
   1577 /// needed, and it does this using a simple analysis that only succeeds if
   1578 /// the loop has a canonical induction variable.
   1579 ///
   1580 /// However, when it comes time to generate code, the maximum operation
   1581 /// can be quite costly, especially if it's inside of an outer loop.
   1582 ///
   1583 /// This function solves this problem by detecting this type of loop and
   1584 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
   1585 /// the instructions for the maximum computation.
        ///
        /// Cond is the exit compare and CondUse is its IV use. When the pattern
        /// matches, a replacement compare is inserted before Cond, Cond and the
        /// select feeding it are erased (along with the select's own condition if
        /// that becomes unused), CondUse is retargeted to the replacement, and the
        /// replacement is returned. In every other case Cond is returned untouched.
   1586 ///
   1587 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
   1588   // Check that the loop matches the pattern we're looking for.
   1589   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
   1590       Cond->getPredicate() != CmpInst::ICMP_NE)
   1591     return Cond;
   1592 
        // The right-hand side of the compare must be a select with no other users,
        // since the select is erased below.
   1593   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
   1594   if (!Sel || !Sel->hasOneUse()) return Cond;
   1595 
   1596   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
   1597   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
   1598     return Cond;
   1599   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
   1600 
   1601   // Add one to the backedge-taken count to get the trip count.
   1602   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
        // The select must compute exactly that trip count.
   1603   if (IterationCount != SE.getSCEV(Sel)) return Cond;
   1604 
   1605   // Check for a max calculation that matches the pattern. There's no check
   1606   // for ICMP_ULE here because the comparison would be with zero, which
   1607   // isn't interesting.
   1608   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
   1609   const SCEVNAryExpr *Max = 0;
   1610   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
   1611     Pred = ICmpInst::ICMP_SLE;
   1612     Max = S;
   1613   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
   1614     Pred = ICmpInst::ICMP_SLT;
   1615     Max = S;
   1616   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
   1617     Pred = ICmpInst::ICMP_ULT;
   1618     Max = U;
   1619   } else {
   1620     // No match; bail.
   1621     return Cond;
   1622   }
   1623 
   1624   // To handle a max with more than two operands, this optimization would
   1625   // require additional checking and setup.
   1626   if (Max->getNumOperands() != 2)
   1627     return Cond;
   1628 
   1629   const SCEV *MaxLHS = Max->getOperand(0);
   1630   const SCEV *MaxRHS = Max->getOperand(1);
   1631 
   1632   // ScalarEvolution canonicalizes constants to the left. For < and >, look
   1633   // for a comparison with 1. For <= and >=, a comparison with zero.
   1634   if (!MaxLHS ||
   1635       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
   1636     return Cond;
   1637 
   1638   // Check the relevant induction variable for conformance to
   1639   // the pattern.
        // The left operand of the compare must be the affine recurrence that
        // starts at one and steps by one.
   1640   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
   1641   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
   1642   if (!AR || !AR->isAffine() ||
   1643       AR->getStart() != One ||
   1644       AR->getStepRecurrence(SE) != One)
   1645     return Cond;
   1646 
   1647   assert(AR->getLoop() == L &&
   1648          "Loop condition operand is an addrec in a different loop!");
   1649 
   1650   // Check the right operand of the select, and remember it, as it will
   1651   // be used in the new comparison instruction.
   1652   Value *NewRHS = 0;
   1653   if (ICmpInst::isTrueWhenEqual(Pred)) {
   1654     // Look for n+1, and grab n.
        // Both select arms are examined; if both match, the second one wins.
   1655     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
   1656       if (isa<ConstantInt>(BO->getOperand(1)) &&
   1657           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
   1658           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
   1659         NewRHS = BO->getOperand(0);
   1660     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
   1661       if (isa<ConstantInt>(BO->getOperand(1)) &&
   1662           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
   1663           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
   1664         NewRHS = BO->getOperand(0);
   1665     if (!NewRHS)
   1666       return Cond;
   1667   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
   1668     NewRHS = Sel->getOperand(1);
   1669   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
   1670     NewRHS = Sel->getOperand(2);
   1671   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
   1672     NewRHS = SU->getValue();
   1673   else
   1674     // Max doesn't match expected pattern.
   1675     return Cond;
   1676 
   1677   // Determine the new comparison opcode. It may be signed or unsigned,
   1678   // and the original comparison may be either equality or inequality.
        // Pred was chosen as if the original compare were ICMP_NE; an ICMP_EQ
        // original needs the inverse.
   1679   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
   1680     Pred = CmpInst::getInversePredicate(Pred);
   1681 
   1682   // Ok, everything looks ok to change the condition into an SLT or SGE and
   1683   // delete the max calculation.
   1684   ICmpInst *NewCond =
   1685     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
   1686 
   1687   // Delete the max calculation instructions.
   1688   Cond->replaceAllUsesWith(NewCond);
   1689   CondUse->setUser(NewCond);
        // Fetch the select's condition before the select is erased, so that it can
        // be removed as well if nothing else uses it.
   1690   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
   1691   Cond->eraseFromParent();
   1692   Sel->eraseFromParent();
   1693   if (Cmp->use_empty())
   1694     Cmp->eraseFromParent();
   1695   return NewCond;
   1696 }
   1697 
   1698 /// OptimizeLoopTermCond - Change loop terminating condition to use the
   1699 /// postinc iv when possible.
        ///
        /// Every exiting block that ends in a conditional branch on an icmp with a
        /// recorded IV use is considered. The icmp is first passed through
        /// OptimizeMax, and then, unless the checks below decline it, moved or
        /// cloned next to the branch and its IV use is switched to post-increment
        /// form. Finally IVIncInsertPos is set to a position chosen from the latch
        /// terminator and the compares that were switched.
   1700 void
   1701 LSRInstance::OptimizeLoopTermCond() {
        // Compares that were switched to the post-inc IV; consulted at the end to
        // pick the IV increment insertion point.
   1702   SmallPtrSet<Instruction *, 4> PostIncs;
   1703 
   1704   BasicBlock *LatchBlock = L->getLoopLatch();
   1705   SmallVector<BasicBlock*, 8> ExitingBlocks;
   1706   L->getExitingBlocks(ExitingBlocks);
   1707 
   1708   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
   1709     BasicBlock *ExitingBlock = ExitingBlocks[i];
   1710 
   1711     // Get the terminating condition for the loop if possible.  If we
   1712     // can, we want to change it to use a post-incremented version of its
   1713     // induction variable, to allow coalescing the live ranges for the IV into
   1714     // one register value.
   1715 
   1716     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
   1717     if (!TermBr)
   1718       continue;
   1719     // FIXME: Overly conservative, termination condition could be an 'or' etc..
   1720     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
   1721       continue;
   1722 
   1723     // Search the recorded IV users to find Cond's IVUse if there is one.
   1724     IVStrideUse *CondUse = 0;
   1725     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
   1726     if (!FindIVUserForCond(Cond, CondUse))
   1727       continue;
   1728 
   1729     // If the trip count is computed in terms of a max (due to ScalarEvolution
   1730     // being unable to find a sufficient guard, for example), change the loop
   1731     // comparison to use SLT or ULT instead of NE.
   1732     // One consequence of doing this now is that it disrupts the count-down
   1733     // optimization. That's not always a bad thing though, because in such
   1734     // cases it may still be worthwhile to avoid a max.
   1735     Cond = OptimizeMax(Cond, CondUse);
   1736 
   1737     // If this exiting block dominates the latch block, it may also use
   1738     // the post-inc value if it won't be shared with other uses.
   1739     // Check for dominance.
   1740     if (!DT.dominates(ExitingBlock, LatchBlock))
   1741       continue;
   1742 
   1743     // Conservatively avoid trying to use the post-inc value in non-latch
   1744     // exits if there may be pre-inc users in intervening blocks.
   1745     if (LatchBlock != ExitingBlock)
   1746       for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
   1747         // Test if the use is reachable from the exiting block. This dominator
   1748         // query is a conservative approximation of reachability.
   1749         if (&*UI != CondUse &&
   1750             !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
   1751           // Conservatively assume there may be reuse if the quotient of their
   1752           // strides could be a legal scale.
   1753           const SCEV *A = IU.getStride(*CondUse, L);
   1754           const SCEV *B = IU.getStride(*UI, L);
   1755           if (!A || !B) continue;
                // Sign-extend the narrower stride so both have the same width
                // before dividing.
   1756           if (SE.getTypeSizeInBits(A->getType()) !=
   1757               SE.getTypeSizeInBits(B->getType())) {
   1758             if (SE.getTypeSizeInBits(A->getType()) >
   1759                 SE.getTypeSizeInBits(B->getType()))
   1760               B = SE.getSignExtendExpr(B, A->getType());
   1761             else
   1762               A = SE.getSignExtendExpr(A, B->getType());
   1763           }
   1764           if (const SCEVConstant *D =
   1765                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
   1766             const ConstantInt *C = D->getValue();
   1767             // Stride of one or negative one can have reuse with non-addresses.
   1768             if (C->isOne() || C->isAllOnesValue())
   1769               goto decline_post_inc;
   1770             // Avoid weird situations.
   1771             if (C->getValue().getMinSignedBits() >= 64 ||
   1772                 C->getValue().isMinSignedValue())
   1773               goto decline_post_inc;
   1774             // Without TLI, assume that any stride might be valid, and so any
   1775             // use might be shared.
   1776             if (!TLI)
   1777               goto decline_post_inc;
   1778             // Check for possible scaled-address reuse.
                  // Both the quotient and its negation are tried as the scale.
   1779             Type *AccessTy = getAccessType(UI->getUser());
   1780             TargetLowering::AddrMode AM;
   1781             AM.Scale = C->getSExtValue();
   1782             if (TLI->isLegalAddressingMode(AM, AccessTy))
   1783               goto decline_post_inc;
   1784             AM.Scale = -AM.Scale;
   1785             if (TLI->isLegalAddressingMode(AM, AccessTy))
   1786               goto decline_post_inc;
   1787           }
   1788         }
   1789 
   1790     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
   1791                  << *Cond << '\n');
   1792 
   1793     // It's possible for the setcc instruction to be anywhere in the loop, and
   1794     // possible for it to have multiple users.  If it is not immediately before
   1795     // the exiting block branch, move it.
   1796     if (&*++BasicBlock::iterator(Cond) != TermBr) {
   1797       if (Cond->hasOneUse()) {
   1798         Cond->moveBefore(TermBr);
   1799       } else {
   1800         // Clone the terminating condition and insert into the loopend.
   1801         ICmpInst *OldCond = Cond;
   1802         Cond = cast<ICmpInst>(Cond->clone());
   1803         Cond->setName(L->getHeader()->getName() + ".termcond");
   1804         ExitingBlock->getInstList().insert(TermBr, Cond);
   1805 
   1806         // Clone the IVUse, as the old use still exists!
   1807         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
   1808         TermBr->replaceUsesOfWith(OldCond, Cond);
   1809       }
   1810     }
   1811 
   1812     // If we get to here, we know that we can transform the setcc instruction to
   1813     // use the post-incremented version of the IV, allowing us to coalesce the
   1814     // live ranges for the IV correctly.
   1815     CondUse->transformToPostInc(L);
   1816     Changed = true;
   1817 
   1818     PostIncs.insert(Cond);
        // Target of the gotos above: this exiting block is left alone and the
        // loop moves on to the next one.
   1819   decline_post_inc:;
   1820   }
   1821 
   1822   // Determine an insertion point for the loop induction variable increment. It
   1823   // must dominate all the post-inc comparisons we just set up, and it must
   1824   // dominate the loop latch edge.
        // Start at the latch terminator and hoist as needed: to the compare itself
        // when its block is the nearest common dominator, otherwise to the
        // terminator of that dominator block if it differs from the current one.
   1825   IVIncInsertPos = L->getLoopLatch()->getTerminator();
   1826   for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
   1827        E = PostIncs.end(); I != E; ++I) {
   1828     BasicBlock *BB =
   1829       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
   1830                                     (*I)->getParent());
   1831     if (BB == (*I)->getParent())
   1832       IVIncInsertPos = *I;
   1833     else if (BB != IVIncInsertPos->getParent())
   1834       IVIncInsertPos = BB->getTerminator();
   1835   }
   1836 }
   1837 
   1838 /// reconcileNewOffset - Try to fold a fixup at NewOffset into the existing
   1839 /// use LU. On success LU's offset range, access type and offset list are
   1840 /// updated and true is returned; on failure LU is untouched and false returned.
   1841 bool
   1842 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
   1843                                 LSRUse::KindType Kind, Type *AccessTy) {
   1844   // Uses of different kinds are never merged. Collapsing them to something
   1845   // conservative is tempting, but it can pessimize, e.g. when one of the uses
   1846   // will have all of its uses outside the loop.
   1847   if (Kind != LU.Kind)
   1848     return false;
   1849 
   1850   // Work out the offset range LU would cover once NewOffset is included.
   1851   int64_t Lo = LU.MinOffset;
   1852   int64_t Hi = LU.MaxOffset;
   1853   if (NewOffset < Lo)
   1854     Lo = NewOffset;
   1855   else if (NewOffset > Hi)
   1856     Hi = NewOffset;
   1857 
   1858   // If the range grew, the distance between its ends must always be foldable.
   1859   const bool Grew = Lo != LU.MinOffset || Hi != LU.MaxOffset;
   1860   if (Grew && !isAlwaysFoldable(Hi - Lo, 0, HasBaseReg, Kind, AccessTy, TLI))
   1861     return false;
   1862 
   1863   // A differing access type on an address use falls back conservatively to
   1864   // void; every other case simply adopts the incoming type.
   1865   // TODO: Be less conservative when the type is similar and can use the same
   1866   // addressing modes.
   1867   Type *MergedTy = AccessTy;
   1868   if (Kind == LSRUse::Address && LU.AccessTy != AccessTy)
   1869     MergedTy = Type::getVoidTy(AccessTy->getContext());
   1870 
   1871   // Commit the changes.
   1872   LU.MinOffset = Lo;
   1873   LU.MaxOffset = Hi;
   1874   LU.AccessTy = MergedTy;
   1875   if (LU.Offsets.back() != NewOffset)
   1876     LU.Offsets.push_back(NewOffset);
   1877   return true;
   1878 }
   1879 
   1880 /// getUse - Find or create the LSRUse that a fixup needing Expr (of the
   1881 /// given kind and optional access type) should belong to. Returns the index
   1882 /// of that use in Uses together with the immediate offset split off Expr.
   1883 std::pair<size_t, int64_t>
   1884 LSRInstance::getUse(const SCEV *&Expr,
   1885                     LSRUse::KindType Kind, Type *AccessTy) {
   1886   const SCEV *Original = Expr;
   1887   int64_t Imm = ExtractImmediate(Expr, SE);
   1888 
   1889   // Undo the split when the immediate can't always be folded; Basic uses,
   1890   // for example, accept no offset at all.
   1891   if (!isAlwaysFoldable(Imm, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
   1892     Imm = 0;
   1893     Expr = Original;
   1894   }
   1895 
   1896   std::pair<UseMapTy::iterator, bool> Slot =
   1897     UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
   1898   const bool AlreadyMapped = !Slot.second;
   1899   if (AlreadyMapped) {
   1900     // Some use already has this base; join it if the offset is compatible.
   1901     size_t OldIdx = Slot.first->second;
   1902     if (reconcileNewOffset(Uses[OldIdx], Imm, /*HasBaseReg=*/true, Kind,
   1903                            AccessTy))
   1904       return std::make_pair(OldIdx, Imm);
   1905   }
   1906 
   1907   // Otherwise append a fresh use and point the map entry at it.
   1908   size_t NewIdx = Uses.size();
   1909   Slot.first->second = NewIdx;
   1910   Uses.push_back(LSRUse(Kind, AccessTy));
   1911   LSRUse &Fresh = Uses.back();
   1912 
   1913   // Redundant offsets needn't be tracked, but there's no reason to go out of
   1914   // our way to avoid them here either.
   1915   if (Fresh.Offsets.empty() || Fresh.Offsets.back() != Imm)
   1916     Fresh.Offsets.push_back(Imm);
   1917   Fresh.MinOffset = Fresh.MaxOffset = Imm;
   1918 
   1919   return std::make_pair(NewIdx, Imm);
   1920 }
   1921 
   1922 /// DeleteUse - Drop LU, which lives at index LUIdx, from the Uses list.
   1923 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
   1924   // Move the victim into the last slot (unless it is already there), then
   1925   // shrink the list by one.
   1926   LSRUse &Last = Uses.back();
   1927   if (&Last != &LU) std::swap(LU, Last);
   1928   Uses.pop_back();
   1929   RegUses.SwapAndDropUse(LUIdx, Uses.size());  // Mirror the swap in RegUses.
   1930 }
   1931 
   1932 /// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has
   1933 /// a formula with the same registers as OrigF and a zero base offset.
   1934 LSRUse *
   1935 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
   1936                                        const LSRUse &OrigLU) {
   1937   // A plain linear scan over every use. This could be more clever.
   1938   for (size_t Idx = 0, End = Uses.size(); Idx != End; ++Idx) {
   1939     LSRUse &Candidate = Uses[Idx];
   1940 
   1941     // Only look through the formulae of uses that are close enough to OrigLU.
   1942     // ICmpZero uses are ignored because they may contain formulae generated
   1943     // by GenerateICmpZeroScales, in which case adding fixup offsets may be
   1944     // invalid.
   1945     if (&Candidate == &OrigLU ||
   1946         Candidate.Kind == LSRUse::ICmpZero ||
   1947         Candidate.Kind != OrigLU.Kind ||
   1948         OrigLU.AccessTy != Candidate.AccessTy ||
   1949         Candidate.WidestFixupType != OrigLU.WidestFixupType ||
   1950         !Candidate.HasFormulaWithSameRegs(OrigF))
   1951       continue;
   1952 
   1953     // Walk this use's formulae looking for the one whose registers and
   1954     // symbols all agree with OrigF.
   1955     for (SmallVectorImpl<Formula>::const_iterator FI =
   1956            Candidate.Formulae.begin(), FE = Candidate.Formulae.end();
   1957          FI != FE; ++FI) {
   1958       if (!(FI->BaseRegs == OrigF.BaseRegs) ||
   1959           FI->ScaledReg != OrigF.ScaledReg ||
   1960           FI->AM.BaseGV != OrigF.AM.BaseGV ||
   1961           FI->AM.Scale != OrigF.AM.Scale ||
   1962           FI->UnfoldedOffset != OrigF.UnfoldedOffset)
   1963         continue;
   1964 
   1965       // At most one formula can match like this, so the outcome for this use
   1966       // is settled here: accept it if the base offset is zero, otherwise stop
   1967       // scanning it and proceed to the next LSRUse.
   1968       if (FI->AM.BaseOffs == 0)
   1969         return &Candidate;
   1970       break;
   1971     }
   1972   }
   1973   return 0;  // Nothing looked good.
   1974 }
   1975 
        /// CollectInterestingTypesAndFactors - Walk every IV user expression and
        /// record, in Types, its effective SCEV type and, in Factors, the constant
        /// quotients obtained by dividing pairs of addrec strides found in those
        /// expressions. If only a single type is found, Types is emptied again.
   1976 void LSRInstance::CollectInterestingTypesAndFactors() {
   1977   SmallSetVector<const SCEV *, 4> Strides;
   1978 
   1979   // Collect interesting types and strides.
   1980   SmallVector<const SCEV *, 4> Worklist;
   1981   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
   1982     const SCEV *Expr = IU.getExpr(*UI);
   1983 
   1984     // Collect interesting types.
   1985     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
   1986 
   1987     // Add strides for mentioned loops.
          // The walk descends into addrec start values and add operands only;
          // other expression kinds are not looked through.
   1988     Worklist.push_back(Expr);
   1989     do {
   1990       const SCEV *S = Worklist.pop_back_val();
   1991       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
   1992         Strides.insert(AR->getStepRecurrence(SE));
   1993         Worklist.push_back(AR->getStart());
   1994       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
   1995         Worklist.append(Add->op_begin(), Add->op_end());
   1996       }
   1997     } while (!Worklist.empty());
   1998   }
   1999 
   2000   // Compute interesting factors from the set of interesting strides.
        // Each unordered pair of distinct strides is visited once.
   2001   for (SmallSetVector<const SCEV *, 4>::const_iterator
   2002        I = Strides.begin(), E = Strides.end(); I != E; ++I)
   2003     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
   2004          llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
   2005       const SCEV *OldStride = *I;
   2006       const SCEV *NewStride = *NewStrideIter;
   2007 
            // Sign-extend the narrower stride so both have the same bit width.
   2008       if (SE.getTypeSizeInBits(OldStride->getType()) !=
   2009           SE.getTypeSizeInBits(NewStride->getType())) {
   2010         if (SE.getTypeSizeInBits(OldStride->getType()) >
   2011             SE.getTypeSizeInBits(NewStride->getType()))
   2012           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
   2013         else
   2014           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
   2015       }
            // Try New/Old first and Old/New second; a constant quotient is kept
            // only when it fits in 64 signed bits.
            // NOTE(review): the meaning of the trailing 'true' passed to
            // getExactSDiv is not visible here -- confirm against its definition.
   2016       if (const SCEVConstant *Factor =
   2017             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
   2018                                                         SE, true))) {
   2019         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
   2020           Factors.insert(Factor->getValue()->getValue().getSExtValue());
   2021       } else if (const SCEVConstant *Factor =
   2022                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
   2023                                                                NewStride,
   2024                                                                SE, true))) {
   2025         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
   2026           Factors.insert(Factor->getValue()->getValue().getSExtValue());
   2027       }
   2028     }
   2029 
   2030   // If all uses use the same type, don't bother looking for truncation-based
   2031   // reuse.
   2032   if (Types.size() == 1)
   2033     Types.clear();
   2034 
   2035   DEBUG(print_factors_and_types(dbgs()));
   2036 }
   2037 
        /// CollectFixupsAndInitialFormulae - For every IV user, create an LSRFixup,
        /// classify it as a Basic, Address or ICmpZero use, attach it to an LSRUse
        /// obtained from getUse, and give that use an initial formula if it does
        /// not have one yet. Equality compares may have their operands swapped in
        /// the IR, in which case Changed is set.
   2038 void LSRInstance::CollectFixupsAndInitialFormulae() {
   2039   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
   2040     // Record the uses.
   2041     LSRFixup &LF = getNewFixup();
   2042     LF.UserInst = UI->getUser();
   2043     LF.OperandValToReplace = UI->getOperandValToReplace();
   2044     LF.PostIncLoops = UI->getPostIncLoops();
   2045 
          // Default to a Basic use with no access type; address uses override
          // both.
   2046     LSRUse::KindType Kind = LSRUse::Basic;
   2047     Type *AccessTy = 0;
   2048     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
   2049       Kind = LSRUse::Address;
   2050       AccessTy = getAccessType(LF.UserInst);
   2051     }
   2052 
   2053     const SCEV *S = IU.getExpr(*UI);
   2054 
   2055     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
   2056     // (N - i == 0), and this allows (N - i) to be the expression that we work
   2057     // with rather than just N or i, so we can consider the register
   2058     // requirements for both N and i at the same time. Limiting this code to
   2059     // equality icmps is not a problem because all interesting loops use
   2060     // equality icmps, thanks to IndVarSimplify.
   2061     if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
   2062       if (CI->isEquality()) {
   2063         // Swap the operands if needed to put the OperandValToReplace on the
   2064         // left, for consistency.
   2065         Value *NV = CI->getOperand(1);
   2066         if (NV == LF.OperandValToReplace) {
   2067           CI->setOperand(1, CI->getOperand(0));
   2068           CI->setOperand(0, NV);
   2069           NV = CI->getOperand(1);
   2070           Changed = true;
   2071         }
   2072 
   2073         // x == y  -->  x - y == 0
              // Only done when the other operand is loop-invariant; otherwise the
              // kind chosen above is kept.
   2074         const SCEV *N = SE.getSCEV(NV);
   2075         if (SE.isLoopInvariant(N, L)) {
   2076           // S is normalized, so normalize N before folding it into S
   2077           // to keep the result normalized.
   2078           N = TransformForPostIncUse(Normalize, N, CI, 0,
   2079                                      LF.PostIncLoops, SE, DT);
   2080           Kind = LSRUse::ICmpZero;
   2081           S = SE.getMinusSCEV(N, S);
   2082         }
   2083 
   2084         // -1 and the negations of all interesting strides (except the negation
   2085         // of -1) are now also interesting.
              // The bound 'e' is captured before the loop, so factors inserted
              // during the loop are not themselves revisited.
   2086         for (size_t i = 0, e = Factors.size(); i != e; ++i)
   2087           if (Factors[i] != -1)
   2088             Factors.insert(-(uint64_t)Factors[i]);
   2089         Factors.insert(-1);
   2090       }
   2091 
   2092     // Set up the initial formula for this use.
          // getUse takes S by reference and may strip an immediate from it; the
          // stripped offset comes back as the second element of the pair.
   2093     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
   2094     LF.LUIdx = P.first;
   2095     LF.Offset = P.second;
   2096     LSRUse &LU = Uses[LF.LUIdx];
   2097     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
          // Track the widest operand type among this use's fixups.
   2098     if (!LU.WidestFixupType ||
   2099         SE.getTypeSizeInBits(LU.WidestFixupType) <
   2100         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
   2101       LU.WidestFixupType = LF.OperandValToReplace->getType();
   2102 
   2103     // If this is the first use of this LSRUse, give it a formula.
   2104     if (LU.Formulae.empty()) {
   2105       InsertInitialFormula(S, LU, LF.LUIdx);
   2106       CountRegisters(LU.Formulae.back(), LF.LUIdx);
   2107     }
   2108   }
   2109 
   2110   DEBUG(print_fixups(dbgs()));
   2111 }
   2112 
   2113 /// InsertInitialFormula - Build the starting formula for LU by matching S
   2114 /// against loop L (Formula::InitialMatch does the splitting of S into its
   2115 /// parts) and add it to the use. The formula is asserted to be new.
   2116 void
   2117 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
   2118   Formula Initial;
   2119   Initial.InitialMatch(S, L, SE);
   2120   const bool IsNew = InsertFormula(LU, LUIdx, Initial);
   2121   (void)IsNew; assert(IsNew && "Initial formula already exists!");
   2122 }
   2123 
   2124 /// InsertSupplementalFormula - Give LU an extra formula whose only content
   2125 /// is S held in a single base register. Asserts the formula was not present.
   2126 void
   2127 LSRInstance::InsertSupplementalFormula(const SCEV *S,
   2128                                        LSRUse &LU, size_t LUIdx) {
   2129   Formula Single;
   2130   Single.AM.HasBaseReg = true;
   2131   Single.BaseRegs.push_back(S);
   2132   const bool IsNew = InsertFormula(LU, LUIdx, Single);
   2133   (void)IsNew; assert(IsNew && "Supplemental formula already exists!");
   2134 }
   2135 
   2136 /// CountRegisters - Record in RegUses that use LUIdx references every
   2137 /// register of F: the scaled register, if any, then each base register.
   2138 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
   2139   if (const SCEV *Scaled = F.ScaledReg)
   2140     RegUses.CountRegister(Scaled, LUIdx);
   2141 
   2142   for (size_t Idx = 0, NumBase = F.BaseRegs.size(); Idx != NumBase; ++Idx)
   2143     RegUses.CountRegister(F.BaseRegs[Idx], LUIdx);
   2144 }
   2145 
   2146 /// InsertFormula - Add F to LU unless LU rejects it as already present.
   2147 /// Returns true, after counting F's registers, only when F was inserted.
   2148 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
   2149   const bool WasAdded = LU.InsertFormula(F);
   2150   if (WasAdded)
   2151     CountRegisters(F, LUIdx);
   2152 
   2153   return WasAdded;
   2154 }
   2155 
   2156 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
   2157 /// loop-invariant values which we're tracking. These other uses will pin these
   2158 /// values in registers, making them less profitable for elimination.
   2159 /// TODO: This currently misses non-constant addrec step registers.
   2160 /// TODO: Should this give more weight to users inside the loop?
   2161 void
   2162 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
   2163   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
   2164   SmallPtrSet<const SCEV *, 8> Inserted;
   2165 
   2166   while (!Worklist.empty()) {
   2167     const SCEV *S = Worklist.pop_back_val();
   2168 
   2169     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
   2170       Worklist.append(N->op_begin(), N->op_end());
   2171     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
   2172       Worklist.push_back(C->getOperand());
   2173     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
   2174       Worklist.push_back(D->getLHS());
   2175       Worklist.push_back(D->getRHS());
   2176     } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
   2177       if (!Inserted.insert(U)) continue;
   2178       const Value *V = U->getValue();
   2179       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
   2180         // Look for instructions defined outside the loop.
   2181         if (L->contains(Inst)) continue;
   2182       } else if (isa<UndefValue>(V))
   2183         // Undef doesn't have a live range, so it doesn't matter.
   2184         continue;
   2185       for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
   2186            UI != UE; ++UI) {
   2187         const Instruction *UserInst = dyn_cast<Instruction>(*UI);
   2188         // Ignore non-instructions.
   2189         if (!UserInst)
   2190           continue;
   2191         // Ignore instructions in other functions (as can happen with
   2192         // Constants).
   2193         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
   2194           continue;
   2195         // Ignore instructions not dominated by the loop.
   2196         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
   2197           UserInst->getParent() :
   2198           cast<PHINode>(UserInst)->getIncomingBlock(
   2199             PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
   2200         if (!DT.dominates(L->getHeader(), UseBB))
   2201           continue;
   2202         // Ignore uses which are part of other SCEV expressions, to avoid
   2203         // analyzing them multiple times.
   2204         if (SE.isSCEVable(UserInst->getType())) {
   2205           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
   2206           // If the user is a no-op, look through to its uses.
   2207           if (!isa<SCEVUnknown>(UserS))
   2208             continue;
   2209           if (UserS == U) {
   2210             Worklist.push_back(
   2211               SE.getUnknown(const_cast<Instruction *>(UserInst)));
   2212             continue;
   2213           }
   2214         }
   2215         // Ignore icmp instructions which are already being analyzed.
   2216         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
   2217           unsigned OtherIdx = !UI.getOperandNo();
   2218           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
   2219           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
   2220             continue;
   2221         }
   2222 
   2223         LSRFixup &LF = getNewFixup();
   2224         LF.UserInst = const_cast<Instruction *>(UserInst);
   2225         LF.OperandValToReplace = UI.getUse();
   2226         std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
   2227         LF.LUIdx = P.first;
   2228         LF.Offset = P.second;
   2229         LSRUse &LU = Uses[LF.LUIdx];
   2230         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
   2231         if (!LU.WidestFixupType ||
   2232             SE.getTypeSizeInBits(LU.WidestFixupType) <
   2233             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
   2234           LU.WidestFixupType = LF.OperandValToReplace->getType();
   2235         InsertSupplementalFormula(U, LU, LF.LUIdx);
   2236         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
   2237         break;
   2238       }
   2239     }
   2240   }
   2241 }
   2242 
   2243 /// CollectSubexprs - Split S into subexpressions which can be pulled out into
   2244 /// separate registers. If C is non-null, multiply each subexpression by C.
   2245 static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
   2246                             SmallVectorImpl<const SCEV *> &Ops,
   2247                             const Loop *L,
   2248                             ScalarEvolution &SE) {
   2249   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
   2250     // Break out add operands.
   2251     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
   2252          I != E; ++I)
   2253       CollectSubexprs(*I, C, Ops, L, SE);
   2254     return;
   2255   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
   2256     // Split a non-zero base out of an addrec.
   2257     if (!AR->getStart()->isZero()) {
   2258       CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
   2259                                        AR->getStepRecurrence(SE),
   2260                                        AR->getLoop(),
   2261                                        //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
   2262                                        SCEV::FlagAnyWrap),
   2263                       C, Ops, L, SE);
   2264       CollectSubexprs(AR->getStart(), C, Ops, L, SE);
   2265       return;
   2266     }
   2267   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
   2268     // Break (C * (a + b + c)) into C*a + C*b + C*c.
   2269     if (Mul->getNumOperands() == 2)
   2270       if (const SCEVConstant *Op0 =
   2271             dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
   2272         CollectSubexprs(Mul->getOperand(1),
   2273                         C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
   2274                         Ops, L, SE);
   2275         return;
   2276       }
   2277   }
   2278 
   2279   // Otherwise use the value itself, optionally with a scale applied.
   2280   Ops.push_back(C ? SE.getMulExpr(C, S) : S);
   2281 }
   2282 
   2283 /// GenerateReassociations - Split out subexpressions from adds and the bases of
   2284 /// addrecs.
   2285 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
   2286                                          Formula Base,
   2287                                          unsigned Depth) {
   2288   // Arbitrarily cap recursion to protect compile time.
   2289   if (Depth >= 3) return;
   2290 
   2291   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
   2292     const SCEV *BaseReg = Base.BaseRegs[i];
   2293 
   2294     SmallVector<const SCEV *, 8> AddOps;
   2295     CollectSubexprs(BaseReg, 0, AddOps, L, SE);
   2296 
   2297     if (AddOps.size() == 1) continue;
   2298 
   2299     for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
   2300          JE = AddOps.end(); J != JE; ++J) {
   2301 
   2302       // Loop-variant "unknown" values are uninteresting; we won't be able to
   2303       // do anything meaningful with them.
   2304       if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
   2305         continue;
   2306 
   2307       // Don't pull a constant into a register if the constant could be folded
   2308       // into an immediate field.
   2309       if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
   2310                            Base.getNumRegs() > 1,
   2311                            LU.Kind, LU.AccessTy, TLI, SE))
   2312         continue;
   2313 
   2314       // Collect all operands except *J.
   2315       SmallVector<const SCEV *, 8> InnerAddOps
   2316         (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
   2317       InnerAddOps.append
   2318         (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
   2319 
   2320       // Don't leave just a constant behind in a register if the constant could
   2321       // be folded into an immediate field.
   2322       if (InnerAddOps.size() == 1 &&
   2323           isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
   2324                            Base.getNumRegs() > 1,
   2325                            LU.Kind, LU.AccessTy, TLI, SE))
   2326         continue;
   2327 
   2328       const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
   2329       if (InnerSum->isZero())
   2330         continue;
   2331       Formula F = Base;
   2332 
   2333       // Add the remaining pieces of the add back into the new formula.
   2334       const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
   2335       if (TLI && InnerSumSC &&
   2336           SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
   2337           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
   2338                                    InnerSumSC->getValue()->getZExtValue())) {
   2339         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
   2340                            InnerSumSC->getValue()->getZExtValue();
   2341         F.BaseRegs.erase(F.BaseRegs.begin() + i);
   2342       } else
   2343         F.BaseRegs[i] = InnerSum;
   2344 
   2345       // Add J as its own register, or an unfolded immediate.
   2346       const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
   2347       if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
   2348           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
   2349                                    SC->getValue()->getZExtValue()))
   2350         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
   2351                            SC->getValue()->getZExtValue();
   2352       else
   2353         F.BaseRegs.push_back(*J);
   2354 
   2355       if (InsertFormula(LU, LUIdx, F))
   2356         // If that formula hadn't been seen before, recurse to find more like
   2357         // it.
   2358         GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
   2359     }
   2360   }
   2361 }
   2362 
   2363 /// GenerateCombinations - Generate a formula consisting of all of the
   2364 /// loop-dominating registers added into a single register.
   2365 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
   2366                                        Formula Base) {
   2367   // This method is only interesting on a plurality of registers.
   2368   if (Base.BaseRegs.size() <= 1) return;
   2369 
   2370   Formula F = Base;
   2371   F.BaseRegs.clear();
   2372   SmallVector<const SCEV *, 4> Ops;
   2373   for (SmallVectorImpl<const SCEV *>::const_iterator
   2374        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
   2375     const SCEV *BaseReg = *I;
   2376     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
   2377         !SE.hasComputableLoopEvolution(BaseReg, L))
   2378       Ops.push_back(BaseReg);
   2379     else
   2380       F.BaseRegs.push_back(BaseReg);
   2381   }
   2382   if (Ops.size() > 1) {
   2383     const SCEV *Sum = SE.getAddExpr(Ops);
   2384     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
   2385     // opportunity to fold something. For now, just ignore such cases
   2386     // rather than proceed with zero in a register.
   2387     if (!Sum->isZero()) {
   2388       F.BaseRegs.push_back(Sum);
   2389       (void)InsertFormula(LU, LUIdx, F);
   2390     }
   2391   }
   2392 }
   2393 
   2394 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
   2395 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
   2396                                           Formula Base) {
   2397   // We can't add a symbolic offset if the address already contains one.
   2398   if (Base.AM.BaseGV) return;
   2399 
   2400   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
   2401     const SCEV *G = Base.BaseRegs[i];
   2402     GlobalValue *GV = ExtractSymbol(G, SE);
   2403     if (G->isZero() || !GV)
   2404       continue;
   2405     Formula F = Base;
   2406     F.AM.BaseGV = GV;
   2407     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
   2408                     LU.Kind, LU.AccessTy, TLI))
   2409       continue;
   2410     F.BaseRegs[i] = G;
   2411     (void)InsertFormula(LU, LUIdx, F);
   2412   }
   2413 }
   2414 
   2415 /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
   2416 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
   2417                                           Formula Base) {
   2418   // TODO: For now, just add the min and max offset, because it usually isn't
   2419   // worthwhile looking at everything inbetween.
   2420   SmallVector<int64_t, 2> Worklist;
   2421   Worklist.push_back(LU.MinOffset);
   2422   if (LU.MaxOffset != LU.MinOffset)
   2423     Worklist.push_back(LU.MaxOffset);
   2424 
   2425   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
   2426     const SCEV *G = Base.BaseRegs[i];
   2427 
   2428     for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
   2429          E = Worklist.end(); I != E; ++I) {
   2430       Formula F = Base;
   2431       F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
   2432       if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
   2433                      LU.Kind, LU.AccessTy, TLI)) {
   2434         // Add the offset to the base register.
   2435         const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
   2436         // If it cancelled out, drop the base register, otherwise update it.
   2437         if (NewG->isZero()) {
   2438           std::swap(F.BaseRegs[i], F.BaseRegs.back());
   2439           F.BaseRegs.pop_back();
   2440         } else
   2441           F.BaseRegs[i] = NewG;
   2442 
   2443         (void)InsertFormula(LU, LUIdx, F);
   2444       }
   2445     }
   2446 
   2447     int64_t Imm = ExtractImmediate(G, SE);
   2448     if (G->isZero() || Imm == 0)
   2449       continue;
   2450     Formula F = Base;
   2451     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
   2452     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
   2453                     LU.Kind, LU.AccessTy, TLI))
   2454       continue;
   2455     F.BaseRegs[i] = G;
   2456     (void)InsertFormula(LU, LUIdx, F);
   2457   }
   2458 }
   2459 
   2460 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
   2461 /// the comparison. For example, x == y -> x*c == y*c.
   2462 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
   2463                                          Formula Base) {
   2464   if (LU.Kind != LSRUse::ICmpZero) return;
   2465 
   2466   // Determine the integer type for the base formula.
   2467   Type *IntTy = Base.getType();
   2468   if (!IntTy) return;
   2469   if (SE.getTypeSizeInBits(IntTy) > 64) return;
   2470 
   2471   // Don't do this if there is more than one offset.
   2472   if (LU.MinOffset != LU.MaxOffset) return;
   2473 
   2474   assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
   2475 
   2476   // Check each interesting stride.
   2477   for (SmallSetVector<int64_t, 8>::const_iterator
   2478        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
   2479     int64_t Factor = *I;
   2480 
   2481     // Check that the multiplication doesn't overflow.
   2482     if (Base.AM.BaseOffs == INT64_MIN && Factor == -1)
   2483       continue;
   2484     int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
   2485     if (NewBaseOffs / Factor != Base.AM.BaseOffs)
   2486       continue;
   2487 
   2488     // Check that multiplying with the use offset doesn't overflow.
   2489     int64_t Offset = LU.MinOffset;
   2490     if (Offset == INT64_MIN && Factor == -1)
   2491       continue;
   2492     Offset = (uint64_t)Offset * Factor;
   2493     if (Offset / Factor != LU.MinOffset)
   2494       continue;
   2495 
   2496     Formula F = Base;
   2497     F.AM.BaseOffs = NewBaseOffs;
   2498 
   2499     // Check that this scale is legal.
   2500     if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
   2501       continue;
   2502 
   2503     // Compensate for the use having MinOffset built into it.
   2504     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
   2505 
   2506     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
   2507 
   2508     // Check that multiplying with each base register doesn't overflow.
   2509     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
   2510       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
   2511       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
   2512         goto next;
   2513     }
   2514 
   2515     // Check that multiplying with the scaled register doesn't overflow.
   2516     if (F.ScaledReg) {
   2517       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
   2518       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
   2519         continue;
   2520     }
   2521 
   2522     // Check that multiplying with the unfolded offset doesn't overflow.
   2523     if (F.UnfoldedOffset != 0) {
   2524       if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
   2525         continue;
   2526       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
   2527       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
   2528         continue;
   2529     }
   2530 
   2531     // If we make it here and it's legal, add it.
   2532     (void)InsertFormula(LU, LUIdx, F);
   2533   next:;
   2534   }
   2535 }
   2536 
   2537 /// GenerateScales - Generate stride factor reuse formulae by making use of
   2538 /// scaled-offset address modes, for example.
   2539 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
   2540   // Determine the integer type for the base formula.
   2541   Type *IntTy = Base.getType();
   2542   if (!IntTy) return;
   2543 
   2544   // If this Formula already has a scaled register, we can't add another one.
   2545   if (Base.AM.Scale != 0) return;
   2546 
   2547   // Check each interesting stride.
   2548   for (SmallSetVector<int64_t, 8>::const_iterator
   2549        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
   2550     int64_t Factor = *I;
   2551 
   2552     Base.AM.Scale = Factor;
   2553     Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
   2554     // Check whether this scale is going to be legal.
   2555     if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
   2556                     LU.Kind, LU.AccessTy, TLI)) {
   2557       // As a special-case, handle special out-of-loop Basic users specially.
   2558       // TODO: Reconsider this special case.
   2559       if (LU.Kind == LSRUse::Basic &&
   2560           isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
   2561                      LSRUse::Special, LU.AccessTy, TLI) &&
   2562           LU.AllFixupsOutsideLoop)
   2563         LU.Kind = LSRUse::Special;
   2564       else
   2565         continue;
   2566     }
   2567     // For an ICmpZero, negating a solitary base register won't lead to
   2568     // new solutions.
   2569     if (LU.Kind == LSRUse::ICmpZero &&
   2570         !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
   2571       continue;
   2572     // For each addrec base reg, apply the scale, if possible.
   2573     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
   2574       if (const SCEVAddRecExpr *AR =
   2575             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
   2576         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
   2577         if (FactorS->isZero())
   2578           continue;
   2579         // Divide out the factor, ignoring high bits, since we'll be
   2580         // scaling the value back up in the end.
   2581         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
   2582           // TODO: This could be optimized to avoid all the copying.
   2583           Formula F = Base;
   2584           F.ScaledReg = Quotient;
   2585           F.DeleteBaseReg(F.BaseRegs[i]);
   2586           (void)InsertFormula(LU, LUIdx, F);
   2587         }
   2588       }
   2589   }
   2590 }
   2591 
   2592 /// GenerateTruncates - Generate reuse formulae from different IV types.
   2593 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
   2594   // This requires TargetLowering to tell us which truncates are free.
   2595   if (!TLI) return;
   2596 
   2597   // Don't bother truncating symbolic values.
   2598   if (Base.AM.BaseGV) return;
   2599 
   2600   // Determine the integer type for the base formula.
   2601   Type *DstTy = Base.getType();
   2602   if (!DstTy) return;
   2603   DstTy = SE.getEffectiveSCEVType(DstTy);
   2604 
   2605   for (SmallSetVector<Type *, 4>::const_iterator
   2606        I = Types.begin(), E = Types.end(); I != E; ++I) {
   2607     Type *SrcTy = *I;
   2608     if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
   2609       Formula F = Base;
   2610 
   2611       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
   2612       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
   2613            JE = F.BaseRegs.end(); J != JE; ++J)
   2614         *J = SE.getAnyExtendExpr(*J, SrcTy);
   2615 
   2616       // TODO: This assumes we've done basic processing on all uses and
   2617       // have an idea what the register usage is.
   2618       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
   2619         continue;
   2620 
   2621       (void)InsertFormula(LU, LUIdx, F);
   2622     }
   2623   }
   2624 }
   2625 
   2626 namespace {
   2627 
   2628 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
   2629 /// defer modifications so that the search phase doesn't have to worry about
   2630 /// the data structures moving underneath it.
   2631 struct WorkItem {
   2632   size_t LUIdx;
   2633   int64_t Imm;
   2634   const SCEV *OrigReg;
   2635 
   2636   WorkItem(size_t LI, int64_t I, const SCEV *R)
   2637     : LUIdx(LI), Imm(I), OrigReg(R) {}
   2638 
   2639   void print(raw_ostream &OS) const;
   2640   void dump() const;
   2641 };
   2642 
   2643 }
   2644 
   2645 void WorkItem::print(raw_ostream &OS) const {
   2646   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
   2647      << " , add offset " << Imm;
   2648 }
   2649 
   2650 void WorkItem::dump() const {
   2651   print(errs()); errs() << '\n';
   2652 }
   2653 
   2654 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
   2655 /// distance apart and try to form reuse opportunities between them.
   2656 void LSRInstance::GenerateCrossUseConstantOffsets() {
   2657   // Group the registers by their value without any added constant offset.
   2658   typedef std::map<int64_t, const SCEV *> ImmMapTy;
   2659   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
   2660   RegMapTy Map;
   2661   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
   2662   SmallVector<const SCEV *, 8> Sequence;
   2663   for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
   2664        I != E; ++I) {
   2665     const SCEV *Reg = *I;
   2666     int64_t Imm = ExtractImmediate(Reg, SE);
   2667     std::pair<RegMapTy::iterator, bool> Pair =
   2668       Map.insert(std::make_pair(Reg, ImmMapTy()));
   2669     if (Pair.second)
   2670       Sequence.push_back(Reg);
   2671     Pair.first->second.insert(std::make_pair(Imm, *I));
   2672     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
   2673   }
   2674 
   2675   // Now examine each set of registers with the same base value. Build up
   2676   // a list of work to do and do the work in a separate step so that we're
   2677   // not adding formulae and register counts while we're searching.
   2678   SmallVector<WorkItem, 32> WorkItems;
   2679   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
   2680   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
   2681        E = Sequence.end(); I != E; ++I) {
   2682     const SCEV *Reg = *I;
   2683     const ImmMapTy &Imms = Map.find(Reg)->second;
   2684 
   2685     // It's not worthwhile looking for reuse if there's only one offset.
   2686     if (Imms.size() == 1)
   2687       continue;
   2688 
   2689     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
   2690           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
   2691                J != JE; ++J)
   2692             dbgs() << ' ' << J->first;
   2693           dbgs() << '\n');
   2694 
   2695     // Examine each offset.
   2696     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
   2697          J != JE; ++J) {
   2698       const SCEV *OrigReg = J->second;
   2699 
   2700       int64_t JImm = J->first;
   2701       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
   2702 
   2703       if (!isa<SCEVConstant>(OrigReg) &&
   2704           UsedByIndicesMap[Reg].count() == 1) {
   2705         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
   2706         continue;
   2707       }
   2708 
   2709       // Conservatively examine offsets between this orig reg a few selected
   2710       // other orig regs.
   2711       ImmMapTy::const_iterator OtherImms[] = {
   2712         Imms.begin(), prior(Imms.end()),
   2713         Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
   2714       };
   2715       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
   2716         ImmMapTy::const_iterator M = OtherImms[i];
   2717         if (M == J || M == JE) continue;
   2718 
   2719         // Compute the difference between the two.
   2720         int64_t Imm = (uint64_t)JImm - M->first;
   2721         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
   2722              LUIdx = UsedByIndices.find_next(LUIdx))
   2723           // Make a memo of this use, offset, and register tuple.
   2724           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
   2725             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
   2726       }
   2727     }
   2728   }
   2729 
   2730   Map.clear();
   2731   Sequence.clear();
   2732   UsedByIndicesMap.clear();
   2733   UniqueItems.clear();
   2734 
   2735   // Now iterate through the worklist and add new formulae.
   2736   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
   2737        E = WorkItems.end(); I != E; ++I) {
   2738     const WorkItem &WI = *I;
   2739     size_t LUIdx = WI.LUIdx;
   2740     LSRUse &LU = Uses[LUIdx];
   2741     int64_t Imm = WI.Imm;
   2742     const SCEV *OrigReg = WI.OrigReg;
   2743 
   2744     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
   2745     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
   2746     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
   2747 
   2748     // TODO: Use a more targeted data structure.
   2749     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
   2750       const Formula &F = LU.Formulae[L];
   2751       // Use the immediate in the scaled register.
   2752       if (F.ScaledReg == OrigReg) {
   2753         int64_t Offs = (uint64_t)F.AM.BaseOffs +
   2754                        Imm * (uint64_t)F.AM.Scale;
   2755         // Don't create 50 + reg(-50).
   2756         if (F.referencesReg(SE.getSCEV(
   2757                    ConstantInt::get(IntTy, -(uint64_t)Offs))))
   2758           continue;
   2759         Formula NewF = F;
   2760         NewF.AM.BaseOffs = Offs;
   2761         if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
   2762                         LU.Kind, LU.AccessTy, TLI))
   2763           continue;
   2764         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
   2765 
   2766         // If the new scale is a constant in a register, and adding the constant
   2767         // value to the immediate would produce a value closer to zero than the
   2768         // immediate itself, then the formula isn't worthwhile.
   2769         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
   2770           if (C->getValue()->isNegative() !=
   2771                 (NewF.AM.BaseOffs < 0) &&
   2772               (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
   2773                 .ule(abs64(NewF.AM.BaseOffs)))
   2774             continue;
   2775 
   2776         // OK, looks good.
   2777         (void)InsertFormula(LU, LUIdx, NewF);
   2778       } else {
   2779         // Use the immediate in a base register.
   2780         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
   2781           const SCEV *BaseReg = F.BaseRegs[N];
   2782           if (BaseReg != OrigReg)
   2783             continue;
   2784           Formula NewF = F;
   2785           NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
   2786           if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
   2787                           LU.Kind, LU.AccessTy, TLI)) {
   2788             if (!TLI ||
   2789                 !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
   2790               continue;
   2791             NewF = F;
   2792             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
   2793           }
   2794           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
   2795 
   2796           // If the new formula has a constant in a register, and adding the
   2797           // constant value to the immediate would produce a value closer to
   2798           // zero than the immediate itself, then the formula isn't worthwhile.
   2799           for (SmallVectorImpl<const SCEV *>::const_iterator
   2800                J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
   2801                J != JE; ++J)
   2802             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
   2803               if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt(
   2804                    abs64(NewF.AM.BaseOffs)) &&
   2805                   (C->getValue()->getValue() +
   2806                    NewF.AM.BaseOffs).countTrailingZeros() >=
   2807                    CountTrailingZeros_64(NewF.AM.BaseOffs))
   2808                 goto skip_formula;
   2809 
   2810           // Ok, looks good.
   2811           (void)InsertFormula(LU, LUIdx, NewF);
   2812           break;
   2813         skip_formula:;
   2814         }
   2815       }
   2816     }
   2817   }
   2818 }
   2819 
   2820 /// GenerateAllReuseFormulae - Generate formulae for each use.
   2821 void
   2822 LSRInstance::GenerateAllReuseFormulae() {
   2823   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
   2824   // queries are more precise.
   2825   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2826     LSRUse &LU = Uses[LUIdx];
   2827     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2828       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
   2829     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2830       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
   2831   }
   2832   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2833     LSRUse &LU = Uses[LUIdx];
   2834     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2835       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
   2836     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2837       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
   2838     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2839       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
   2840     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2841       GenerateScales(LU, LUIdx, LU.Formulae[i]);
   2842   }
   2843   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2844     LSRUse &LU = Uses[LUIdx];
   2845     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
   2846       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
   2847   }
   2848 
   2849   GenerateCrossUseConstantOffsets();
   2850 
   2851   DEBUG(dbgs() << "\n"
   2852                   "After generating reuse formulae:\n";
   2853         print_uses(dbgs()));
   2854 }
   2855 
   2856 /// If there are multiple formulae with the same set of registers used
   2857 /// by other uses, pick the best one and delete the others.
   2858 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
   2859   DenseSet<const SCEV *> VisitedRegs;
   2860   SmallPtrSet<const SCEV *, 16> Regs;
   2861 #ifndef NDEBUG
   2862   bool ChangedFormulae = false;
   2863 #endif
   2864 
   2865   // Collect the best formula for each unique set of shared registers. This
   2866   // is reset for each use.
   2867   typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
   2868     BestFormulaeTy;
   2869   BestFormulaeTy BestFormulae;
   2870 
   2871   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2872     LSRUse &LU = Uses[LUIdx];
   2873     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
   2874 
   2875     bool Any = false;
   2876     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
   2877          FIdx != NumForms; ++FIdx) {
   2878       Formula &F = LU.Formulae[FIdx];
   2879 
   2880       SmallVector<const SCEV *, 2> Key;
   2881       for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
   2882            JE = F.BaseRegs.end(); J != JE; ++J) {
   2883         const SCEV *Reg = *J;
   2884         if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
   2885           Key.push_back(Reg);
   2886       }
   2887       if (F.ScaledReg &&
   2888           RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
   2889         Key.push_back(F.ScaledReg);
   2890       // Unstable sort by host order ok, because this is only used for
   2891       // uniquifying.
   2892       std::sort(Key.begin(), Key.end());
   2893 
   2894       std::pair<BestFormulaeTy::const_iterator, bool> P =
   2895         BestFormulae.insert(std::make_pair(Key, FIdx));
   2896       if (!P.second) {
   2897         Formula &Best = LU.Formulae[P.first->second];
   2898 
   2899         Cost CostF;
   2900         CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
   2901         Regs.clear();
   2902         Cost CostBest;
   2903         CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
   2904         Regs.clear();
   2905         if (CostF < CostBest)
   2906           std::swap(F, Best);
   2907         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
   2908               dbgs() << "\n"
   2909                         "    in favor of formula "; Best.print(dbgs());
   2910               dbgs() << '\n');
   2911 #ifndef NDEBUG
   2912         ChangedFormulae = true;
   2913 #endif
   2914         LU.DeleteFormula(F);
   2915         --FIdx;
   2916         --NumForms;
   2917         Any = true;
   2918         continue;
   2919       }
   2920     }
   2921 
   2922     // Now that we've filtered out some formulae, recompute the Regs set.
   2923     if (Any)
   2924       LU.RecomputeRegs(LUIdx, RegUses);
   2925 
   2926     // Reset this to prepare for the next use.
   2927     BestFormulae.clear();
   2928   }
   2929 
   2930   DEBUG(if (ChangedFormulae) {
   2931           dbgs() << "\n"
   2932                     "After filtering out undesirable candidates:\n";
   2933           print_uses(dbgs());
   2934         });
   2935 }
   2936 
// Threshold for EstimateSearchSpaceComplexity(): the NarrowSearchSpaceBy*
// heuristics only prune while the estimate is at or above this value.
// This is a rough guess that seems to work fairly well.
static const size_t ComplexityLimit = UINT16_MAX;
   2939 
   2940 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of
   2941 /// solutions the solver might have to consider. It almost never considers
   2942 /// this many solutions because it prune the search space, but the pruning
   2943 /// isn't always sufficient.
   2944 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
   2945   size_t Power = 1;
   2946   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
   2947        E = Uses.end(); I != E; ++I) {
   2948     size_t FSize = I->Formulae.size();
   2949     if (FSize >= ComplexityLimit) {
   2950       Power = ComplexityLimit;
   2951       break;
   2952     }
   2953     Power *= FSize;
   2954     if (Power >= ComplexityLimit)
   2955       break;
   2956   }
   2957   return Power;
   2958 }
   2959 
   2960 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
   2961 /// of the registers of another formula, it won't help reduce register
   2962 /// pressure (though it may not necessarily hurt register pressure); remove
   2963 /// it to simplify the system.
   2964 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
   2965   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
   2966     DEBUG(dbgs() << "The search space is too complex.\n");
   2967 
   2968     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
   2969                     "which use a superset of registers used by other "
   2970                     "formulae.\n");
   2971 
   2972     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   2973       LSRUse &LU = Uses[LUIdx];
   2974       bool Any = false;
   2975       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
   2976         Formula &F = LU.Formulae[i];
   2977         // Look for a formula with a constant or GV in a register. If the use
   2978         // also has a formula with that same value in an immediate field,
   2979         // delete the one that uses a register.
   2980         for (SmallVectorImpl<const SCEV *>::const_iterator
   2981              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
   2982           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
   2983             Formula NewF = F;
   2984             NewF.AM.BaseOffs += C->getValue()->getSExtValue();
   2985             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
   2986                                 (I - F.BaseRegs.begin()));
   2987             if (LU.HasFormulaWithSameRegs(NewF)) {
   2988               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
   2989               LU.DeleteFormula(F);
   2990               --i;
   2991               --e;
   2992               Any = true;
   2993               break;
   2994             }
   2995           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
   2996             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
   2997               if (!F.AM.BaseGV) {
   2998                 Formula NewF = F;
   2999                 NewF.AM.BaseGV = GV;
   3000                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
   3001                                     (I - F.BaseRegs.begin()));
   3002                 if (LU.HasFormulaWithSameRegs(NewF)) {
   3003                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
   3004                         dbgs() << '\n');
   3005                   LU.DeleteFormula(F);
   3006                   --i;
   3007                   --e;
   3008                   Any = true;
   3009                   break;
   3010                 }
   3011               }
   3012           }
   3013         }
   3014       }
   3015       if (Any)
   3016         LU.RecomputeRegs(LUIdx, RegUses);
   3017     }
   3018 
   3019     DEBUG(dbgs() << "After pre-selection:\n";
   3020           print_uses(dbgs()));
   3021   }
   3022 }
   3023 
   3024 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
   3025 /// for expressions like A, A+1, A+2, etc., allocate a single register for
   3026 /// them.
   3027 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
   3028   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
   3029     DEBUG(dbgs() << "The search space is too complex.\n");
   3030 
   3031     DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
   3032                     "separated by a constant offset will use the same "
   3033                     "registers.\n");
   3034 
   3035     // This is especially useful for unrolled loops.
   3036 
   3037     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   3038       LSRUse &LU = Uses[LUIdx];
   3039       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
   3040            E = LU.Formulae.end(); I != E; ++I) {
   3041         const Formula &F = *I;
   3042         if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
   3043           if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
   3044             if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
   3045                                    /*HasBaseReg=*/false,
   3046                                    LU.Kind, LU.AccessTy)) {
   3047               DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
   3048                     dbgs() << '\n');
   3049 
   3050               LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
   3051 
   3052               // Update the relocs to reference the new use.
   3053               for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
   3054                    E = Fixups.end(); I != E; ++I) {
   3055                 LSRFixup &Fixup = *I;
   3056                 if (Fixup.LUIdx == LUIdx) {
   3057                   Fixup.LUIdx = LUThatHas - &Uses.front();
   3058                   Fixup.Offset += F.AM.BaseOffs;
   3059                   // Add the new offset to LUThatHas' offset list.
   3060                   if (LUThatHas->Offsets.back() != Fixup.Offset) {
   3061                     LUThatHas->Offsets.push_back(Fixup.Offset);
   3062                     if (Fixup.Offset > LUThatHas->MaxOffset)
   3063                       LUThatHas->MaxOffset = Fixup.Offset;
   3064                     if (Fixup.Offset < LUThatHas->MinOffset)
   3065                       LUThatHas->MinOffset = Fixup.Offset;
   3066                   }
   3067                   DEBUG(dbgs() << "New fixup has offset "
   3068                                << Fixup.Offset << '\n');
   3069                 }
   3070                 if (Fixup.LUIdx == NumUses-1)
   3071                   Fixup.LUIdx = LUIdx;
   3072               }
   3073 
   3074               // Delete formulae from the new use which are no longer legal.
   3075               bool Any = false;
   3076               for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
   3077                 Formula &F = LUThatHas->Formulae[i];
   3078                 if (!isLegalUse(F.AM,
   3079                                 LUThatHas->MinOffset, LUThatHas->MaxOffset,
   3080                                 LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
   3081                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
   3082                         dbgs() << '\n');
   3083                   LUThatHas->DeleteFormula(F);
   3084                   --i;
   3085                   --e;
   3086                   Any = true;
   3087                 }
   3088               }
   3089               if (Any)
   3090                 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
   3091 
   3092               // Delete the old use.
   3093               DeleteUse(LU, LUIdx);
   3094               --LUIdx;
   3095               --NumUses;
   3096               break;
   3097             }
   3098           }
   3099         }
   3100       }
   3101     }
   3102 
   3103     DEBUG(dbgs() << "After pre-selection:\n";
   3104           print_uses(dbgs()));
   3105   }
   3106 }
   3107 
   3108 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
   3109 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
   3110 /// we've done more filtering, as it may be able to find more formulae to
   3111 /// eliminate.
   3112 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
   3113   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
   3114     DEBUG(dbgs() << "The search space is too complex.\n");
   3115 
   3116     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
   3117                     "undesirable dedicated registers.\n");
   3118 
   3119     FilterOutUndesirableDedicatedRegisters();
   3120 
   3121     DEBUG(dbgs() << "After pre-selection:\n";
   3122           print_uses(dbgs()));
   3123   }
   3124 }
   3125 
   3126 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
   3127 /// to be profitable, and then in any use which has any reference to that
   3128 /// register, delete all formulae which do not reference that register.
   3129 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
   3130   // With all other options exhausted, loop until the system is simple
   3131   // enough to handle.
   3132   SmallPtrSet<const SCEV *, 4> Taken;
   3133   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
   3134     // Ok, we have too many of formulae on our hands to conveniently handle.
   3135     // Use a rough heuristic to thin out the list.
   3136     DEBUG(dbgs() << "The search space is too complex.\n");
   3137 
   3138     // Pick the register which is used by the most LSRUses, which is likely
   3139     // to be a good reuse register candidate.
   3140     const SCEV *Best = 0;
   3141     unsigned BestNum = 0;
   3142     for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
   3143          I != E; ++I) {
   3144       const SCEV *Reg = *I;
   3145       if (Taken.count(Reg))
   3146         continue;
   3147       if (!Best)
   3148         Best = Reg;
   3149       else {
   3150         unsigned Count = RegUses.getUsedByIndices(Reg).count();
   3151         if (Count > BestNum) {
   3152           Best = Reg;
   3153           BestNum = Count;
   3154         }
   3155       }
   3156     }
   3157 
   3158     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
   3159                  << " will yield profitable reuse.\n");
   3160     Taken.insert(Best);
   3161 
   3162     // In any use with formulae which references this register, delete formulae
   3163     // which don't reference it.
   3164     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
   3165       LSRUse &LU = Uses[LUIdx];
   3166       if (!LU.Regs.count(Best)) continue;
   3167 
   3168       bool Any = false;
   3169       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
   3170         Formula &F = LU.Formulae[i];
   3171         if (!F.referencesReg(Best)) {
   3172           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
   3173           LU.DeleteFormula(F);
   3174           --e;
   3175           --i;
   3176           Any = true;
   3177           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
   3178           continue;
   3179         }
   3180       }
   3181 
   3182       if (Any)
   3183         LU.RecomputeRegs(LUIdx, RegUses);
   3184     }
   3185 
   3186     DEBUG(dbgs() << "After pre-selection:\n";
   3187           print_uses(dbgs()));
   3188   }
   3189 }
   3190 
/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
/// formulae to choose from, use some rough heuristics to prune down the number
/// of formulae. This keeps the main solver from taking an extraordinary amount
/// of time in some worst-case scenarios.
///
/// The narrowing steps run in a fixed order. Each of the first three acts
/// only if EstimateSearchSpaceComplexity() is at or above ComplexityLimit at
/// the time it is called; the last one repeats until the estimate drops below
/// that limit.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  NarrowSearchSpaceByDetectingSupersets();
  NarrowSearchSpaceByCollapsingUnrolledCode();
  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  NarrowSearchSpaceByPickingWinnerRegs();
}
   3201 
   3202 /// SolveRecurse - This is the recursive solver.
   3203 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
   3204                                Cost &SolutionCost,
   3205                                SmallVectorImpl<const Formula *> &Workspace,
   3206                                const Cost &CurCost,
   3207                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
   3208                                DenseSet<const SCEV *> &VisitedRegs) const {
   3209   // Some ideas:
   3210   //  - prune more:
   3211   //    - use more aggressive filtering
   3212   //    - sort the formula so that the most profitable solutions are found first
   3213   //    - sort the uses too
   3214   //  - search faster:
   3215   //    - don't compute a cost, and then compare. compare while computing a cost
   3216   //      and bail early.
   3217   //    - track register sets with SmallBitVector
   3218 
   3219   const LSRUse &LU = Uses[Workspace.size()];
   3220 
   3221   // If this use references any register that's already a part of the
   3222   // in-progress solution, consider it a requirement that a formula must
   3223   // reference that register in order to be considered. This prunes out
   3224   // unprofitable searching.
   3225   SmallSetVector<const SCEV *, 4> ReqRegs;
   3226   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
   3227        E = CurRegs.end(); I != E; ++I)
   3228     if (LU.Regs.count(*I))
   3229       ReqRegs.insert(*I);
   3230 
   3231   bool AnySatisfiedReqRegs = false;
   3232   SmallPtrSet<const SCEV *, 16> NewRegs;
   3233   Cost NewCost;
   3234 retry:
   3235   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
   3236        E = LU.Formulae.end(); I != E; ++I) {
   3237     const Formula &F = *I;
   3238 
   3239     // Ignore formulae which do not use any of the required registers.
   3240     for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
   3241          JE = ReqRegs.end(); J != JE; ++J) {
   3242       const SCEV *Reg = *J;
   3243       if ((!F.ScaledReg || F.ScaledReg != Reg) &&
   3244           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
   3245           F.BaseRegs.end())
   3246         goto skip;
   3247     }
   3248     AnySatisfiedReqRegs = true;
   3249 
   3250     // Evaluate the cost of the current formula. If it's already worse than
   3251     // the current best, prune the search at that point.
   3252     NewCost = CurCost;
   3253     NewRegs = CurRegs;
   3254     NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
   3255     if (NewCost < SolutionCost) {
   3256       Workspace.push_back(&F);
   3257       if (Workspace.size() != Uses.size()) {
   3258         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
   3259                      NewRegs, VisitedRegs);
   3260         if (F.getNumRegs() == 1 && Workspace.size() == 1)
   3261           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
   3262       } else {
   3263         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
   3264               dbgs() << ". Regs:";
   3265               for (SmallPtrSet<const SCEV *, 16>::const_iterator
   3266                    I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
   3267                 dbgs() << ' ' << **I;
   3268               dbgs() << '\n');
   3269 
   3270         SolutionCost = NewCost;
   3271         Solution = Workspace;
   3272       }
   3273       Workspace.pop_back();
   3274     }
   3275   skip:;
   3276   }
   3277 
   3278   // If none of the formulae had all of the required registers, relax the
   3279   // constraint so that we don't exclude all formulae.
   3280   if (!AnySatisfiedReqRegs) {
   3281     assert(!ReqRegs.empty() && "Solver failed even without required registers");
   3282     ReqRegs.clear();
   3283     goto retry;
   3284   }
   3285 }
   3286 
   3287 /// Solve - Choose one formula from each use. Return the results in the given
   3288 /// Solution vector.
   3289 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
   3290   SmallVector<const Formula *, 8> Workspace;
   3291   Cost SolutionCost;
   3292   SolutionCost.Loose();
   3293   Cost CurCost;
   3294   SmallPtrSet<const SCEV *, 16> CurRegs;
   3295   DenseSet<const SCEV *> VisitedRegs;
   3296   Workspace.reserve(Uses.size());
   3297 
   3298   // SolveRecurse does all the work.
   3299   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
   3300                CurRegs, VisitedRegs);
   3301 
   3302   // Ok, we've now made all our decisions.
   3303   DEBUG(dbgs() << "\n"
   3304                   "The chosen solution requires "; SolutionCost.print(dbgs());
   3305         dbgs() << ":\n";
   3306         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
   3307           dbgs() << "  ";
   3308           Uses[i].print(dbgs());
   3309           dbgs() << "\n"
   3310                     "    ";
   3311           Solution[i]->print(dbgs());
   3312           dbgs() << '\n';
   3313         });
   3314 
   3315   assert(Solution.size() == Uses.size() && "Malformed solution!");
   3316 }
   3317 
   3318 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
   3319 /// the dominator tree far as we can go while still being dominated by the
   3320 /// input positions. This helps canonicalize the insert position, which
   3321 /// encourages sharing.
   3322 BasicBlock::iterator
   3323 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
   3324                                  const SmallVectorImpl<Instruction *> &Inputs)
   3325                                                                          const {
   3326   for (;;) {
   3327     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
   3328     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
   3329 
   3330     BasicBlock *IDom;
   3331     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
   3332       if (!Rung) return IP;
   3333       Rung = Rung->getIDom();
   3334       if (!Rung) return IP;
   3335       IDom = Rung->getBlock();
   3336 
   3337       // Don't climb into a loop though.
   3338       const Loop *IDomLoop = LI.getLoopFor(IDom);
   3339       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
   3340       if (IDomDepth <= IPLoopDepth &&
   3341           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
   3342         break;
   3343     }
   3344 
   3345     bool AllDominate = true;
   3346     Instruction *BetterPos = 0;
   3347     Instruction *Tentative = IDom->getTerminator();
   3348     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
   3349          E = Inputs.end(); I != E; ++I) {
   3350       Instruction *Inst = *I;
   3351       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
   3352         AllDominate = false;
   3353         break;
   3354       }
   3355       // Attempt to find an insert position in the middle of the block,
   3356       // instead of at the end, so that it can be used for other expansions.
   3357       if (IDom == Inst->getParent() &&
   3358           (!BetterPos || DT.dominates(BetterPos, Inst)))
   3359         BetterPos = llvm::next(BasicBlock::iterator(Inst));
   3360     }
   3361     if (!AllDominate)
   3362       break;
   3363     if (BetterPos)
   3364       IP = BetterPos;
   3365     else
   3366       IP = Tentative;
   3367   }
   3368 
   3369   return IP;
   3370 }
   3371 
   3372 /// AdjustInsertPositionForExpand - Determine an input position which will be
   3373 /// dominated by the operands and which will dominate the result.
   3374 BasicBlock::iterator
   3375 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
   3376                                            const LSRFixup &LF,
   3377                                            const LSRUse &LU) const {
   3378   // Collect some instructions which must be dominated by the
   3379   // expanding replacement. These must be dominated by any operands that
   3380   // will be required in the expansion.
   3381   SmallVector<Instruction *, 4> Inputs;
   3382   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
   3383     Inputs.push_back(I);
   3384   if (LU.Kind == LSRUse::ICmpZero)
   3385     if (Instruction *I =
   3386           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
   3387       Inputs.push_back(I);
   3388   if (LF.PostIncLoops.count(L)) {
   3389     if (LF.isUseFullyOutsideLoop(L))
   3390       Inputs.push_back(L->getLoopLatch()->getTerminator());
   3391     else
   3392       Inputs.push_back(IVIncInsertPos);
   3393   }
   3394   // The expansion must also be dominated by the increment positions of any
   3395   // loops it for which it is using post-inc mode.
   3396   for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
   3397        E = LF.PostIncLoops.end(); I != E; ++I) {
   3398     const Loop *PIL = *I;
   3399     if (PIL == L) continue;
   3400 
   3401     // Be dominated by the loop exit.
   3402     SmallVector<BasicBlock *, 4> ExitingBlocks;
   3403     PIL->getExitingBlocks(ExitingBlocks);
   3404     if (!ExitingBlocks.empty()) {
   3405       BasicBlock *BB = ExitingBlocks[0];
   3406       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
   3407         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
   3408       Inputs.push_back(BB->getTerminator());
   3409     }
   3410   }
   3411 
   3412   // Then, climb up the immediate dominator tree as far as we can go while
   3413   // still being dominated by the input positions.
   3414   IP = HoistInsertPosition(IP, Inputs);
   3415 
   3416   // Don't insert instructions before PHI nodes.
   3417   while (isa<PHINode>(IP)) ++IP;
   3418 
   3419   // Ignore debug intrinsics.
   3420   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
   3421 
   3422   return IP;
   3423 }
   3424 
   3425 /// Expand - Emit instructions for the leading candidate expression for this
   3426 /// LSRUse (this is called "expanding").
   3427 Value *LSRInstance::Expand(const LSRFixup &LF,
   3428                            const Formula &F,
   3429                            BasicBlock::iterator IP,
   3430                            SCEVExpander &Rewriter,
   3431                            SmallVectorImpl<WeakVH> &DeadInsts) const {
   3432   const LSRUse &LU = Uses[LF.LUIdx];
   3433 
   3434   // Determine an input position which will be dominated by the operands and
   3435   // which will dominate the result.
   3436   IP = AdjustInsertPositionForExpand(IP, LF, LU);
   3437 
   3438   // Inform the Rewriter if we have a post-increment use, so that it can
   3439   // perform an advantageous expansion.
   3440   Rewriter.setPostInc(LF.PostIncLoops);
   3441 
   3442   // This is the type that the user actually needs.
   3443   Type *OpTy = LF.OperandValToReplace->getType();
   3444   // This will be the type that we'll initially expand to.
   3445   Type *Ty = F.getType();
   3446   if (!Ty)
   3447     // No type known; just expand directly to the ultimate type.
   3448     Ty = OpTy;
   3449   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
   3450     // Expand directly to the ultimate type if it's the right size.
   3451     Ty = OpTy;
   3452   // This is the type to do integer arithmetic in.
   3453   Type *IntTy = SE.getEffectiveSCEVType(Ty);
   3454 
   3455   // Build up a list of operands to add together to form the full base.
   3456   SmallVector<const SCEV *, 8> Ops;
   3457 
   3458   // Expand the BaseRegs portion.
   3459   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
   3460        E = F.BaseRegs.end(); I != E; ++I) {
   3461     const SCEV *Reg = *I;
   3462     assert(!Reg->isZero() && "Zero allocated in a base register!");
   3463 
   3464     // If we're expanding for a post-inc user, make the post-inc adjustment.
   3465     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
   3466     Reg = TransformForPostIncUse(Denormalize, Reg,
   3467                                  LF.UserInst, LF.OperandValToReplace,
   3468                                  Loops, SE, DT);
   3469 
   3470     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
   3471   }
   3472 
   3473   // Flush the operand list to suppress SCEVExpander hoisting.
   3474   if (!Ops.empty()) {
   3475     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
   3476     Ops.clear();
   3477     Ops.push_back(SE.getUnknown(FullV));
   3478   }
   3479 
   3480   // Expand the ScaledReg portion.
   3481   Value *ICmpScaledV = 0;
   3482   if (F.AM.Scale != 0) {
   3483     const SCEV *ScaledS = F.ScaledReg;
   3484 
   3485     // If we're expanding for a post-inc user, make the post-inc adjustment.
   3486     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
   3487     ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
   3488                                      LF.UserInst, LF.OperandValToReplace,
   3489                                      Loops, SE, DT);
   3490 
   3491     if (LU.Kind == LSRUse::ICmpZero) {
   3492       // An interesting way of "folding" with an icmp is to use a negated
   3493       // scale, which we'll implement by inserting it into the other operand
   3494       // of the icmp.
   3495       assert(F.AM.Scale == -1 &&
   3496              "The only scale supported by ICmpZero uses is -1!");
   3497       ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
   3498     } else {
   3499       // Otherwise just expand the scaled register and an explicit scale,
   3500       // which is expected to be matched as part of the address.
   3501       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
   3502       ScaledS = SE.getMulExpr(ScaledS,
   3503                               SE.getConstant(ScaledS->getType(), F.AM.Scale));
   3504       Ops.push_back(ScaledS);
   3505 
   3506       // Flush the operand list to suppress SCEVExpander hoisting.
   3507       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
   3508       Ops.clear();
   3509       Ops.push_back(SE.getUnknown(FullV));
   3510     }
   3511   }
   3512 
   3513   // Expand the GV portion.
   3514   if (F.AM.BaseGV) {
   3515     Ops.push_back(SE.getUnknown(F.AM.BaseGV));
   3516 
   3517     // Flush the operand list to suppress SCEVExpander hoisting.
   3518     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
   3519     Ops.clear();
   3520     Ops.push_back(SE.getUnknown(FullV));
   3521   }
   3522 
   3523   // Expand the immediate portion.
   3524   int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
   3525   if (Offset != 0) {
   3526     if (LU.Kind == LSRUse::ICmpZero) {
   3527       // The other interesting way of "folding" with an ICmpZero is to use a
   3528       // negated immediate.
   3529       if (!ICmpScaledV)
   3530         ICmpScaledV = ConstantInt::get(IntTy, -Offset);
   3531       else {
   3532         Ops.push_back(SE.getUnknown(ICmpScaledV));
   3533         ICmpScaledV = ConstantInt::get(IntTy, Offset);
   3534       }
   3535     } else {
   3536       // Just add the immediate values. These again are expected to be matched
   3537       // as part of the address.
   3538       Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
   3539     }
   3540   }
   3541 
   3542   // Expand the unfolded offset portion.
   3543   int64_t UnfoldedOffset = F.UnfoldedOffset;
   3544   if (UnfoldedOffset != 0) {
   3545     // Just add the immediate values.
   3546     Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
   3547                                                        UnfoldedOffset)));
   3548   }
   3549 
   3550   // Emit instructions summing all the operands.
   3551   const SCEV *FullS = Ops.empty() ?
   3552                       SE.getConstant(IntTy, 0) :
   3553                       SE.getAddExpr(Ops);
   3554   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
   3555 
   3556   // We're done expanding now, so reset the rewriter.
   3557   Rewriter.clearPostInc();
   3558 
   3559   // An ICmpZero Formula represents an ICmp which we're handling as a
   3560   // comparison against zero. Now that we've expanded an expression for that
   3561   // form, update the ICmp's other operand.
   3562   if (LU.Kind == LSRUse::ICmpZero) {
   3563     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
   3564     DeadInsts.push_back(CI->getOperand(1));
   3565     assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
   3566                            "a scale at the same time!");
   3567     if (F.AM.Scale == -1) {
   3568       if (ICmpScaledV->getType() != OpTy) {
   3569         Instruction *Cast =
   3570           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
   3571                                                    OpTy, false),
   3572                            ICmpScaledV, OpTy, "tmp", CI);
   3573         ICmpScaledV = Cast;
   3574       }
   3575       CI->setOperand(1, ICmpScaledV);
   3576     } else {
   3577       assert(F.AM.Scale == 0 &&
   3578              "ICmp does not support folding a global value and "
   3579              "a scale at the same time!");
   3580       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
   3581                                            -(uint64_t)Offset);
   3582       if (C->getType() != OpTy)
   3583         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
   3584                                                           OpTy, false),
   3585                                   C, OpTy);
   3586 
   3587       CI->setOperand(1, C);
   3588     }
   3589   }
   3590 
   3591   return FullV;
   3592 }
   3593 
   3594 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
   3595 /// of their operands effectively happens in their predecessor blocks, so the
   3596 /// expression may need to be expanded in multiple places.
   3597 void LSRInstance::RewriteForPHI(PHINode *PN,
   3598                                 const LSRFixup &LF,
   3599                                 const Formula &F,
   3600                                 SCEVExpander &Rewriter,
   3601                                 SmallVectorImpl<WeakVH> &DeadInsts,
   3602                                 Pass *P) const {
   3603   DenseMap<BasicBlock *, Value *> Inserted;
   3604   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
   3605     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
   3606       BasicBlock *BB = PN->getIncomingBlock(i);
   3607 
   3608       // If this is a critical edge, split the edge so that we do not insert
   3609       // the code on all predecessor/successor paths.  We do this unless this
   3610       // is the canonical backedge for this loop, which complicates post-inc
   3611       // users.
   3612       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
   3613           !isa<IndirectBrInst>(BB->getTerminator())) {
   3614         Loop *PNLoop = LI.getLoopFor(PN->getParent());
   3615         if (!PNLoop || PN->getParent() != PNLoop->getHeader()) {
   3616           // Split the critical edge.
   3617           BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
   3618 
   3619           // If PN is outside of the loop and BB is in the loop, we want to
   3620           // move the block to be immediately before the PHI block, not
   3621           // immediately after BB.
   3622           if (L->contains(BB) && !L->contains(PN))
   3623             NewBB->moveBefore(PN->getParent());
   3624 
   3625           // Splitting the edge can reduce the number of PHI entries we have.
   3626           e = PN->getNumIncomingValues();
   3627           BB = NewBB;
   3628           i = PN->getBasicBlockIndex(BB);
   3629         }
   3630       }
   3631 
   3632       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
   3633         Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
   3634       if (!Pair.second)
   3635         PN->setIncomingValue(i, Pair.first->second);
   3636       else {
   3637         Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
   3638 
   3639         // If this is reuse-by-noop-cast, insert the noop cast.
   3640         Type *OpTy = LF.OperandValToReplace->getType();
   3641         if (FullV->getType() != OpTy)
   3642           FullV =
   3643             CastInst::Create(CastInst::getCastOpcode(FullV, false,
   3644                                                      OpTy, false),
   3645                              FullV, LF.OperandValToReplace->getType(),
   3646                              "tmp", BB->getTerminator());
   3647 
   3648         PN->setIncomingValue(i, FullV);
   3649         Pair.first->second = FullV;
   3650       }
   3651     }
   3652 }
   3653 
   3654 /// Rewrite - Emit instructions for the leading candidate expression for this
   3655 /// LSRUse (this is called "expanding"), and update the UserInst to reference
   3656 /// the newly expanded value.
   3657 void LSRInstance::Rewrite(const LSRFixup &LF,
   3658                           const Formula &F,
   3659                           SCEVExpander &Rewriter,
   3660                           SmallVectorImpl<WeakVH> &DeadInsts,
   3661                           Pass *P) const {
   3662   // First, find an insertion point that dominates UserInst. For PHI nodes,
   3663   // find the nearest block which dominates all the relevant uses.
   3664   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
   3665     RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
   3666   } else {
   3667     Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
   3668 
   3669     // If this is reuse-by-noop-cast, insert the noop cast.
   3670     Type *OpTy = LF.OperandValToReplace->getType();
   3671     if (FullV->getType() != OpTy) {
   3672       Instruction *Cast =
   3673         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
   3674                          FullV, OpTy, "tmp", LF.UserInst);
   3675       FullV = Cast;
   3676     }
   3677 
   3678     // Update the user. ICmpZero is handled specially here (for now) because
   3679     // Expand may have updated one of the operands of the icmp already, and
   3680     // its new value may happen to be equal to LF.OperandValToReplace, in
   3681     // which case doing replaceUsesOfWith leads to replacing both operands
   3682     // with the same value. TODO: Reorganize this.
   3683     if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
   3684       LF.UserInst->setOperand(0, FullV);
   3685     else
   3686       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
   3687   }
   3688 
   3689   DeadInsts.push_back(LF.OperandValToReplace);
   3690 }
   3691 
   3692 /// ImplementSolution - Rewrite all the fixup locations with new values,
   3693 /// following the chosen solution.
   3694 void
   3695 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
   3696                                Pass *P) {
   3697   // Keep track of instructions we may have made dead, so that
   3698   // we can remove them after we are done working.
   3699   SmallVector<WeakVH, 16> DeadInsts;
   3700 
   3701   SCEVExpander Rewriter(SE, "lsr");
   3702   Rewriter.disableCanonicalMode();
   3703   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
   3704 
   3705   // Expand the new value definitions and update the users.
   3706   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
   3707        E = Fixups.end(); I != E; ++I) {
   3708     const LSRFixup &Fixup = *I;
   3709 
   3710     Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);
   3711 
   3712     Changed = true;
   3713   }
   3714 
   3715   // Clean up after ourselves. This must be done before deleting any
   3716   // instructions.
   3717   Rewriter.clear();
   3718 
   3719   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
   3720 }
   3721 
   3722 LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
   3723   : IU(P->getAnalysis<IVUsers>()),
   3724     SE(P->getAnalysis<ScalarEvolution>()),
   3725     DT(P->getAnalysis<DominatorTree>()),
   3726     LI(P->getAnalysis<LoopInfo>()),
   3727     TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {
   3728 
   3729   // If LoopSimplify form is not available, stay out of trouble.
   3730   if (!L->isLoopSimplifyForm()) return;
   3731 
   3732   // If there's no interesting work to be done, bail early.
   3733   if (IU.empty()) return;
   3734 
   3735   DEBUG(dbgs() << "\nLSR on loop ";
   3736         WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
   3737         dbgs() << ":\n");
   3738 
   3739   // First, perform some low-level loop optimizations.
   3740   OptimizeShadowIV();
   3741   OptimizeLoopTermCond();
   3742 
   3743   // Start collecting data and preparing for the solver.
   3744   CollectInterestingTypesAndFactors();
   3745   CollectFixupsAndInitialFormulae();
   3746   CollectLoopInvariantFixupsAndFormulae();
   3747 
   3748   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
   3749         print_uses(dbgs()));
   3750 
   3751   // Now use the reuse data to generate a bunch of interesting ways
   3752   // to formulate the values needed for the uses.
   3753   GenerateAllReuseFormulae();
   3754 
   3755   FilterOutUndesirableDedicatedRegisters();
   3756   NarrowSearchSpaceUsingHeuristics();
   3757 
   3758   SmallVector<const Formula *, 8> Solution;
   3759   Solve(Solution);
   3760 
   3761   // Release memory that is no longer needed.
   3762   Factors.clear();
   3763   Types.clear();
   3764   RegUses.clear();
   3765 
   3766 #ifndef NDEBUG
   3767   // Formulae should be legal.
   3768   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
   3769        E = Uses.end(); I != E; ++I) {
   3770      const LSRUse &LU = *I;
   3771      for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
   3772           JE = LU.Formulae.end(); J != JE; ++J)
   3773         assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
   3774                           LU.Kind, LU.AccessTy, TLI) &&
   3775                "Illegal formula generated!");
   3776   };
   3777 #endif
   3778 
   3779   // Now that we've decided what we want, make it so.
   3780   ImplementSolution(Solution, P);
   3781 }
   3782 
   3783 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
   3784   if (Factors.empty() && Types.empty()) return;
   3785 
   3786   OS << "LSR has identified the following interesting factors and types: ";
   3787   bool First = true;
   3788 
   3789   for (SmallSetVector<int64_t, 8>::const_iterator
   3790        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
   3791     if (!First) OS << ", ";
   3792     First = false;
   3793     OS << '*' << *I;
   3794   }
   3795 
   3796   for (SmallSetVector<Type *, 4>::const_iterator
   3797        I = Types.begin(), E = Types.end(); I != E; ++I) {
   3798     if (!First) OS << ", ";
   3799     First = false;
   3800     OS << '(' << **I << ')';
   3801   }
   3802   OS << '\n';
   3803 }
   3804 
   3805 void LSRInstance::print_fixups(raw_ostream &OS) const {
   3806   OS << "LSR is examining the following fixup sites:\n";
   3807   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
   3808        E = Fixups.end(); I != E; ++I) {
   3809     dbgs() << "  ";
   3810     I->print(OS);
   3811     OS << '\n';
   3812   }
   3813 }
   3814 
   3815 void LSRInstance::print_uses(raw_ostream &OS) const {
   3816   OS << "LSR is examining the following uses:\n";
   3817   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
   3818        E = Uses.end(); I != E; ++I) {
   3819     const LSRUse &LU = *I;
   3820     dbgs() << "  ";
   3821     LU.print(OS);
   3822     OS << '\n';
   3823     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
   3824          JE = LU.Formulae.end(); J != JE; ++J) {
   3825       OS << "    ";
   3826       J->print(OS);
   3827       OS << '\n';
   3828     }
   3829   }
   3830 }
   3831 
   3832 void LSRInstance::print(raw_ostream &OS) const {
   3833   print_factors_and_types(OS);
   3834   print_fixups(OS);
   3835   print_uses(OS);
   3836 }
   3837 
   3838 void LSRInstance::dump() const {
   3839   print(errs()); errs() << '\n';
   3840 }
   3841 
   3842 namespace {
   3843 
   3844 class LoopStrengthReduce : public LoopPass {
   3845   /// TLI - Keep a pointer of a TargetLowering to consult for determining
   3846   /// transformation profitability.
   3847   const TargetLowering *const TLI;
   3848 
   3849 public:
   3850   static char ID; // Pass ID, replacement for typeid
   3851   explicit LoopStrengthReduce(const TargetLowering *tli = 0);
   3852 
   3853 private:
   3854   bool runOnLoop(Loop *L, LPPassManager &LPM);
   3855   void getAnalysisUsage(AnalysisUsage &AU) const;
   3856 };
   3857 
   3858 }
   3859 
   3860 char LoopStrengthReduce::ID = 0;
   3861 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
   3862                 "Loop Strength Reduction", false, false)
   3863 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
   3864 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
   3865 INITIALIZE_PASS_DEPENDENCY(IVUsers)
   3866 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
   3867 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
   3868 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
   3869                 "Loop Strength Reduction", false, false)
   3870 
   3871 
   3872 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
   3873   return new LoopStrengthReduce(TLI);
   3874 }
   3875 
   3876 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
   3877   : LoopPass(ID), TLI(tli) {
   3878     initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
   3879   }
   3880 
   3881 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   3882   // We split critical edges, so we change the CFG.  However, we do update
   3883   // many analyses if they are around.
   3884   AU.addPreservedID(LoopSimplifyID);
   3885 
   3886   AU.addRequired<LoopInfo>();
   3887   AU.addPreserved<LoopInfo>();
   3888   AU.addRequiredID(LoopSimplifyID);
   3889   AU.addRequired<DominatorTree>();
   3890   AU.addPreserved<DominatorTree>();
   3891   AU.addRequired<ScalarEvolution>();
   3892   AU.addPreserved<ScalarEvolution>();
   3893   // Requiring LoopSimplify a second time here prevents IVUsers from running
   3894   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
   3895   AU.addRequiredID(LoopSimplifyID);
   3896   AU.addRequired<IVUsers>();
   3897   AU.addPreserved<IVUsers>();
   3898 }
   3899 
   3900 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
   3901   bool Changed = false;
   3902 
   3903   // Run the main LSR transformation.
   3904   Changed |= LSRInstance(TLI, L, this).getChanged();
   3905 
   3906   // At this point, it is worth checking to see if any recurrence PHIs are also
   3907   // dead, so that we can remove them as well.
   3908   Changed |= DeleteDeadPHIs(L->getHeader());
   3909 
   3910   return Changed;
   3911 }
   3912