Home | History | Annotate | Download | only in Scalar
      1 //===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This pass implements a simple loop reroller.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "llvm/Transforms/Scalar.h"
     15 #include "llvm/ADT/MapVector.h"
     16 #include "llvm/ADT/STLExtras.h"
     17 #include "llvm/ADT/SmallBitVector.h"
     18 #include "llvm/ADT/SmallSet.h"
     19 #include "llvm/ADT/Statistic.h"
     20 #include "llvm/Analysis/AliasAnalysis.h"
     21 #include "llvm/Analysis/AliasSetTracker.h"
     22 #include "llvm/Analysis/LoopPass.h"
     23 #include "llvm/Analysis/ScalarEvolution.h"
     24 #include "llvm/Analysis/ScalarEvolutionExpander.h"
     25 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
     26 #include "llvm/Analysis/TargetLibraryInfo.h"
     27 #include "llvm/Analysis/ValueTracking.h"
     28 #include "llvm/IR/DataLayout.h"
     29 #include "llvm/IR/Dominators.h"
     30 #include "llvm/IR/IntrinsicInst.h"
     31 #include "llvm/Support/CommandLine.h"
     32 #include "llvm/Support/Debug.h"
     33 #include "llvm/Support/raw_ostream.h"
     34 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
     35 #include "llvm/Transforms/Utils/Local.h"
     36 #include "llvm/Transforms/Utils/LoopUtils.h"
     37 
     38 using namespace llvm;
     39 
     40 #define DEBUG_TYPE "loop-reroll"
     41 
     42 STATISTIC(NumRerolledLoops, "Number of rerolled loops");
     43 
     44 static cl::opt<unsigned>
     45 MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
     46   cl::desc("The maximum increment for loop rerolling"));
     47 
     48 static cl::opt<unsigned>
     49 NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
     50                           cl::Hidden,
     51                           cl::desc("The maximum number of failures to tolerate"
     52                                    " during fuzzy matching. (default: 400)"));
     53 
     54 // This loop re-rolling transformation aims to transform loops like this:
     55 //
     56 // int foo(int a);
     57 // void bar(int *x) {
     58 //   for (int i = 0; i < 500; i += 3) {
     59 //     foo(i);
     60 //     foo(i+1);
     61 //     foo(i+2);
     62 //   }
     63 // }
     64 //
     65 // into a loop like this:
     66 //
     67 // void bar(int *x) {
     68 //   for (int i = 0; i < 500; ++i)
     69 //     foo(i);
     70 // }
     71 //
     72 // It does this by looking for loops that, besides the latch code, are composed
     73 // of isomorphic DAGs of instructions, with each DAG rooted at some increment
     74 // to the induction variable, and where each DAG is isomorphic to the DAG
     75 // rooted at the induction variable (excepting the sub-DAGs which root the
     76 // other induction-variable increments). In other words, we're looking for loop
     77 // bodies of the form:
     78 //
     79 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
     80 // f(%iv)
     81 // %iv.1 = add %iv, 1                <-- a root increment
     82 // f(%iv.1)
     83 // %iv.2 = add %iv, 2                <-- a root increment
     84 // f(%iv.2)
     85 // %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
     86 // f(%iv.scale_m_1)
     87 // ...
     88 // %iv.next = add %iv, scale
     89 // %cmp = icmp(%iv, ...)
     90 // br %cmp, header, exit
     91 //
     92 // where each f(i) is a set of instructions that, collectively, are a function
     93 // only of i (and other loop-invariant values).
     94 //
     95 // As a special case, we can also reroll loops like this:
     96 //
     97 // int foo(int);
     98 // void bar(int *x) {
     99 //   for (int i = 0; i < 500; ++i) {
    100 //     x[3*i] = foo(0);
    101 //     x[3*i+1] = foo(0);
    102 //     x[3*i+2] = foo(0);
    103 //   }
    104 // }
    105 //
    106 // into this:
    107 //
    108 // void bar(int *x) {
    109 //   for (int i = 0; i < 1500; ++i)
    110 //     x[i] = foo(0);
    111 // }
    112 //
    113 // in which case, we're looking for inputs like this:
    114 //
    115 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
    116 // %scaled.iv = mul %iv, scale
    117 // f(%scaled.iv)
    118 // %scaled.iv.1 = add %scaled.iv, 1
    119 // f(%scaled.iv.1)
    120 // %scaled.iv.2 = add %scaled.iv, 2
    121 // f(%scaled.iv.2)
    122 // %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
    123 // f(%scaled.iv.scale_m_1)
    124 // ...
    125 // %iv.next = add %iv, 1
    126 // %cmp = icmp(%iv, ...)
    127 // br %cmp, header, exit
    128 
    129 namespace {
    130   enum IterationLimits {
    131     /// The maximum number of iterations that we'll try and reroll. This
    132     /// has to be less than 25 in order to fit into a SmallBitVector.
    133     IL_MaxRerollIterations = 16,
    134     /// The bitvector index used by loop induction variables and other
    135     /// instructions that belong to all iterations.
    136     IL_All,
    137     IL_End
    138   };
    139 
    140   class LoopReroll : public LoopPass {
    141   public:
    142     static char ID; // Pass ID, replacement for typeid
    143     LoopReroll() : LoopPass(ID) {
    144       initializeLoopRerollPass(*PassRegistry::getPassRegistry());
    145     }
    146 
    147     bool runOnLoop(Loop *L, LPPassManager &LPM) override;
    148 
    149     void getAnalysisUsage(AnalysisUsage &AU) const override {
    150       AU.addRequired<AAResultsWrapperPass>();
    151       AU.addRequired<LoopInfoWrapperPass>();
    152       AU.addPreserved<LoopInfoWrapperPass>();
    153       AU.addRequired<DominatorTreeWrapperPass>();
    154       AU.addPreserved<DominatorTreeWrapperPass>();
    155       AU.addRequired<ScalarEvolutionWrapperPass>();
    156       AU.addRequired<TargetLibraryInfoWrapperPass>();
    157     }
    158 
    159   protected:
    160     AliasAnalysis *AA;
    161     LoopInfo *LI;
    162     ScalarEvolution *SE;
    163     TargetLibraryInfo *TLI;
    164     DominatorTree *DT;
    165     bool PreserveLCSSA;
    166 
    167     typedef SmallVector<Instruction *, 16> SmallInstructionVector;
    168     typedef SmallSet<Instruction *, 16>   SmallInstructionSet;
    169 
    170     // Map between induction variable and its increment
    171     DenseMap<Instruction *, int64_t> IVToIncMap;
    172 
    173     // A chain of isomorphic instructions, identified by a single-use PHI
    174     // representing a reduction. Only the last value may be used outside the
    175     // loop.
    176     struct SimpleLoopReduction {
    177       SimpleLoopReduction(Instruction *P, Loop *L)
    178         : Valid(false), Instructions(1, P) {
    179         assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
    180         add(L);
    181       }
    182 
    183       bool valid() const {
    184         return Valid;
    185       }
    186 
    187       Instruction *getPHI() const {
    188         assert(Valid && "Using invalid reduction");
    189         return Instructions.front();
    190       }
    191 
    192       Instruction *getReducedValue() const {
    193         assert(Valid && "Using invalid reduction");
    194         return Instructions.back();
    195       }
    196 
    197       Instruction *get(size_t i) const {
    198         assert(Valid && "Using invalid reduction");
    199         return Instructions[i+1];
    200       }
    201 
    202       Instruction *operator [] (size_t i) const { return get(i); }
    203 
    204       // The size, ignoring the initial PHI.
    205       size_t size() const {
    206         assert(Valid && "Using invalid reduction");
    207         return Instructions.size()-1;
    208       }
    209 
    210       typedef SmallInstructionVector::iterator iterator;
    211       typedef SmallInstructionVector::const_iterator const_iterator;
    212 
    213       iterator begin() {
    214         assert(Valid && "Using invalid reduction");
    215         return std::next(Instructions.begin());
    216       }
    217 
    218       const_iterator begin() const {
    219         assert(Valid && "Using invalid reduction");
    220         return std::next(Instructions.begin());
    221       }
    222 
    223       iterator end() { return Instructions.end(); }
    224       const_iterator end() const { return Instructions.end(); }
    225 
    226     protected:
    227       bool Valid;
    228       SmallInstructionVector Instructions;
    229 
    230       void add(Loop *L);
    231     };
    232 
    233     // The set of all reductions, and state tracking of possible reductions
    234     // during loop instruction processing.
    235     struct ReductionTracker {
    236       typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
    237 
    238       // Add a new possible reduction.
    239       void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
    240 
    241       // Setup to track possible reductions corresponding to the provided
    242       // rerolling scale. Only reductions with a number of non-PHI instructions
    243       // that is divisible by the scale are considered. Three instructions sets
    244       // are filled in:
    245       //   - A set of all possible instructions in eligible reductions.
    246       //   - A set of all PHIs in eligible reductions
    247       //   - A set of all reduced values (last instructions) in eligible
    248       //     reductions.
    249       void restrictToScale(uint64_t Scale,
    250                            SmallInstructionSet &PossibleRedSet,
    251                            SmallInstructionSet &PossibleRedPHISet,
    252                            SmallInstructionSet &PossibleRedLastSet) {
    253         PossibleRedIdx.clear();
    254         PossibleRedIter.clear();
    255         Reds.clear();
    256 
    257         for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
    258           if (PossibleReds[i].size() % Scale == 0) {
    259             PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
    260             PossibleRedPHISet.insert(PossibleReds[i].getPHI());
    261 
    262             PossibleRedSet.insert(PossibleReds[i].getPHI());
    263             PossibleRedIdx[PossibleReds[i].getPHI()] = i;
    264             for (Instruction *J : PossibleReds[i]) {
    265               PossibleRedSet.insert(J);
    266               PossibleRedIdx[J] = i;
    267             }
    268           }
    269       }
    270 
    271       // The functions below are used while processing the loop instructions.
    272 
    273       // Are the two instructions both from reductions, and furthermore, from
    274       // the same reduction?
    275       bool isPairInSame(Instruction *J1, Instruction *J2) {
    276         DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
    277         if (J1I != PossibleRedIdx.end()) {
    278           DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
    279           if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
    280             return true;
    281         }
    282 
    283         return false;
    284       }
    285 
    286       // The two provided instructions, the first from the base iteration, and
    287       // the second from iteration i, form a matched pair. If these are part of
    288       // a reduction, record that fact.
    289       void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
    290         if (PossibleRedIdx.count(J1)) {
    291           assert(PossibleRedIdx.count(J2) &&
    292                  "Recording reduction vs. non-reduction instruction?");
    293 
    294           PossibleRedIter[J1] = 0;
    295           PossibleRedIter[J2] = i;
    296 
    297           int Idx = PossibleRedIdx[J1];
    298           assert(Idx == PossibleRedIdx[J2] &&
    299                  "Recording pair from different reductions?");
    300           Reds.insert(Idx);
    301         }
    302       }
    303 
    304       // The functions below can be called after we've finished processing all
    305       // instructions in the loop, and we know which reductions were selected.
    306 
    307       bool validateSelected();
    308       void replaceSelected();
    309 
    310     protected:
    311       // The vector of all possible reductions (for any scale).
    312       SmallReductionVector PossibleReds;
    313 
    314       DenseMap<Instruction *, int> PossibleRedIdx;
    315       DenseMap<Instruction *, int> PossibleRedIter;
    316       DenseSet<int> Reds;
    317     };
    318 
    319     // A DAGRootSet models an induction variable being used in a rerollable
    320     // loop. For example,
    321     //
    322     //   x[i*3+0] = y1
    323     //   x[i*3+1] = y2
    324     //   x[i*3+2] = y3
    325     //
    326     //   Base instruction -> i*3
    327     //                    +---+----+
    328     //                   /    |     \
    329     //               ST[y1]  +1     +2  <-- Roots
    330     //                        |      |
    331     //                      ST[y2] ST[y3]
    332     //
    333     // There may be multiple DAGRoots, for example:
    334     //
    335     //   x[i*2+0] = ...   (1)
    336     //   x[i*2+1] = ...   (1)
    337     //   x[i*2+4] = ...   (2)
    338     //   x[i*2+5] = ...   (2)
    339     //   x[(i+1234)*2+5678] = ... (3)
    340     //   x[(i+1234)*2+5679] = ... (3)
    341     //
    342     // The loop will be rerolled by adding a new loop induction variable,
    343     // one for the Base instruction in each DAGRootSet.
    344     //
    345     struct DAGRootSet {
    346       Instruction *BaseInst;
    347       SmallInstructionVector Roots;
    348       // The instructions between IV and BaseInst (but not including BaseInst).
    349       SmallInstructionSet SubsumedInsts;
    350     };
    351 
    352     // The set of all DAG roots, and state tracking of all roots
    353     // for a particular induction variable.
    354     struct DAGRootTracker {
    355       DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
    356                      ScalarEvolution *SE, AliasAnalysis *AA,
    357                      TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
    358                      bool PreserveLCSSA,
    359                      DenseMap<Instruction *, int64_t> &IncrMap)
    360           : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
    361             PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {}
    362 
    363       /// Stage 1: Find all the DAG roots for the induction variable.
    364       bool findRoots();
    365       /// Stage 2: Validate if the found roots are valid.
    366       bool validate(ReductionTracker &Reductions);
    367       /// Stage 3: Assuming validate() returned true, perform the
    368       /// replacement.
    369       /// @param IterCount The maximum iteration count of L.
    370       void replace(const SCEV *IterCount);
    371 
    372     protected:
    373       typedef MapVector<Instruction*, SmallBitVector> UsesTy;
    374 
    375       bool findRootsRecursive(Instruction *IVU,
    376                               SmallInstructionSet SubsumedInsts);
    377       bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
    378       bool collectPossibleRoots(Instruction *Base,
    379                                 std::map<int64_t,Instruction*> &Roots);
    380 
    381       bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
    382       void collectInLoopUserSet(const SmallInstructionVector &Roots,
    383                                 const SmallInstructionSet &Exclude,
    384                                 const SmallInstructionSet &Final,
    385                                 DenseSet<Instruction *> &Users);
    386       void collectInLoopUserSet(Instruction *Root,
    387                                 const SmallInstructionSet &Exclude,
    388                                 const SmallInstructionSet &Final,
    389                                 DenseSet<Instruction *> &Users);
    390 
    391       UsesTy::iterator nextInstr(int Val, UsesTy &In,
    392                                  const SmallInstructionSet &Exclude,
    393                                  UsesTy::iterator *StartI=nullptr);
    394       bool isBaseInst(Instruction *I);
    395       bool isRootInst(Instruction *I);
    396       bool instrDependsOn(Instruction *I,
    397                           UsesTy::iterator Start,
    398                           UsesTy::iterator End);
    399 
    400       LoopReroll *Parent;
    401 
    402       // Members of Parent, replicated here for brevity.
    403       Loop *L;
    404       ScalarEvolution *SE;
    405       AliasAnalysis *AA;
    406       TargetLibraryInfo *TLI;
    407       DominatorTree *DT;
    408       LoopInfo *LI;
    409       bool PreserveLCSSA;
    410 
    411       // The loop induction variable.
    412       Instruction *IV;
    413       // Loop step amount.
    414       int64_t Inc;
    415       // Loop reroll count; if Inc == 1, this records the scaling applied
    416       // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
    417       // If Inc is not 1, Scale = Inc.
    418       uint64_t Scale;
    419       // The roots themselves.
    420       SmallVector<DAGRootSet,16> RootSets;
    421       // All increment instructions for IV.
    422       SmallInstructionVector LoopIncs;
    423       // Map of all instructions in the loop (in order) to the iterations
    424       // they are used in (or specially, IL_All for instructions
    425       // used in the loop increment mechanism).
    426       UsesTy Uses;
    427       // Map between induction variable and its increment
    428       DenseMap<Instruction *, int64_t> &IVToIncMap;
    429     };
    430 
    431     void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
    432     void collectPossibleReductions(Loop *L,
    433            ReductionTracker &Reductions);
    434     bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
    435                 ReductionTracker &Reductions);
    436   };
    437 }
    438 
    439 char LoopReroll::ID = 0;
    440 INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
    441 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
    442 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
    443 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
    444 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
    445 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
    446 INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
    447 
    448 Pass *llvm::createLoopRerollPass() {
    449   return new LoopReroll;
    450 }
    451 
    452 // Returns true if the provided instruction is used outside the given loop.
    453 // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
    454 // non-loop blocks to be outside the loop.
    455 static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
    456   for (User *U : I->users()) {
    457     if (!L->contains(cast<Instruction>(U)))
    458       return true;
    459   }
    460   return false;
    461 }
    462 
    463 // Collect the list of loop induction variables with respect to which it might
    464 // be possible to reroll the loop.
    465 void LoopReroll::collectPossibleIVs(Loop *L,
    466                                     SmallInstructionVector &PossibleIVs) {
    467   BasicBlock *Header = L->getHeader();
    468   for (BasicBlock::iterator I = Header->begin(),
    469        IE = Header->getFirstInsertionPt(); I != IE; ++I) {
    470     if (!isa<PHINode>(I))
    471       continue;
    472     if (!I->getType()->isIntegerTy())
    473       continue;
    474 
    475     if (const SCEVAddRecExpr *PHISCEV =
    476             dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
    477       if (PHISCEV->getLoop() != L)
    478         continue;
    479       if (!PHISCEV->isAffine())
    480         continue;
    481       if (const SCEVConstant *IncSCEV =
    482           dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) {
    483         const APInt &AInt = IncSCEV->getAPInt().abs();
    484         if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
    485           continue;
    486         IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
    487         DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
    488                      << "\n");
    489         PossibleIVs.push_back(&*I);
    490       }
    491     }
    492   }
    493 }
    494 
    495 // Add the remainder of the reduction-variable chain to the instruction vector
    496 // (the initial PHINode has already been added). If successful, the object is
    497 // marked as valid.
    498 void LoopReroll::SimpleLoopReduction::add(Loop *L) {
    499   assert(!Valid && "Cannot add to an already-valid chain");
    500 
    501   // The reduction variable must be a chain of single-use instructions
    502   // (including the PHI), except for the last value (which is used by the PHI
    503   // and also outside the loop).
    504   Instruction *C = Instructions.front();
    505   if (C->user_empty())
    506     return;
    507 
    508   do {
    509     C = cast<Instruction>(*C->user_begin());
    510     if (C->hasOneUse()) {
    511       if (!C->isBinaryOp())
    512         return;
    513 
    514       if (!(isa<PHINode>(Instructions.back()) ||
    515             C->isSameOperationAs(Instructions.back())))
    516         return;
    517 
    518       Instructions.push_back(C);
    519     }
    520   } while (C->hasOneUse());
    521 
    522   if (Instructions.size() < 2 ||
    523       !C->isSameOperationAs(Instructions.back()) ||
    524       C->use_empty())
    525     return;
    526 
    527   // C is now the (potential) last instruction in the reduction chain.
    528   for (User *U : C->users()) {
    529     // The only in-loop user can be the initial PHI.
    530     if (L->contains(cast<Instruction>(U)))
    531       if (cast<Instruction>(U) != Instructions.front())
    532         return;
    533   }
    534 
    535   Instructions.push_back(C);
    536   Valid = true;
    537 }
    538 
    539 // Collect the vector of possible reduction variables.
    540 void LoopReroll::collectPossibleReductions(Loop *L,
    541   ReductionTracker &Reductions) {
    542   BasicBlock *Header = L->getHeader();
    543   for (BasicBlock::iterator I = Header->begin(),
    544        IE = Header->getFirstInsertionPt(); I != IE; ++I) {
    545     if (!isa<PHINode>(I))
    546       continue;
    547     if (!I->getType()->isSingleValueType())
    548       continue;
    549 
    550     SimpleLoopReduction SLR(&*I, L);
    551     if (!SLR.valid())
    552       continue;
    553 
    554     DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
    555           SLR.size() << " chained instructions)\n");
    556     Reductions.addSLR(SLR);
    557   }
    558 }
    559 
    560 // Collect the set of all users of the provided root instruction. This set of
    561 // users contains not only the direct users of the root instruction, but also
    562 // all users of those users, and so on. There are two exceptions:
    563 //
    564 //   1. Instructions in the set of excluded instructions are never added to the
    565 //   use set (even if they are users). This is used, for example, to exclude
    566 //   including root increments in the use set of the primary IV.
    567 //
    568 //   2. Instructions in the set of final instructions are added to the use set
    569 //   if they are users, but their users are not added. This is used, for
    570 //   example, to prevent a reduction update from forcing all later reduction
    571 //   updates into the use set.
    572 void LoopReroll::DAGRootTracker::collectInLoopUserSet(
    573   Instruction *Root, const SmallInstructionSet &Exclude,
    574   const SmallInstructionSet &Final,
    575   DenseSet<Instruction *> &Users) {
    576   SmallInstructionVector Queue(1, Root);
    577   while (!Queue.empty()) {
    578     Instruction *I = Queue.pop_back_val();
    579     if (!Users.insert(I).second)
    580       continue;
    581 
    582     if (!Final.count(I))
    583       for (Use &U : I->uses()) {
    584         Instruction *User = cast<Instruction>(U.getUser());
    585         if (PHINode *PN = dyn_cast<PHINode>(User)) {
    586           // Ignore "wrap-around" uses to PHIs of this loop's header.
    587           if (PN->getIncomingBlock(U) == L->getHeader())
    588             continue;
    589         }
    590 
    591         if (L->contains(User) && !Exclude.count(User)) {
    592           Queue.push_back(User);
    593         }
    594       }
    595 
    596     // We also want to collect single-user "feeder" values.
    597     for (User::op_iterator OI = I->op_begin(),
    598          OIE = I->op_end(); OI != OIE; ++OI) {
    599       if (Instruction *Op = dyn_cast<Instruction>(*OI))
    600         if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
    601             !Final.count(Op))
    602           Queue.push_back(Op);
    603     }
    604   }
    605 }
    606 
    607 // Collect all of the users of all of the provided root instructions (combined
    608 // into a single set).
    609 void LoopReroll::DAGRootTracker::collectInLoopUserSet(
    610   const SmallInstructionVector &Roots,
    611   const SmallInstructionSet &Exclude,
    612   const SmallInstructionSet &Final,
    613   DenseSet<Instruction *> &Users) {
    614   for (SmallInstructionVector::const_iterator I = Roots.begin(),
    615        IE = Roots.end(); I != IE; ++I)
    616     collectInLoopUserSet(*I, Exclude, Final, Users);
    617 }
    618 
    619 static bool isSimpleLoadStore(Instruction *I) {
    620   if (LoadInst *LI = dyn_cast<LoadInst>(I))
    621     return LI->isSimple();
    622   if (StoreInst *SI = dyn_cast<StoreInst>(I))
    623     return SI->isSimple();
    624   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    625     return !MI->isVolatile();
    626   return false;
    627 }
    628 
    629 /// Return true if IVU is a "simple" arithmetic operation.
    630 /// This is used for narrowing the search space for DAGRoots; only arithmetic
    631 /// and GEPs can be part of a DAGRoot.
    632 static bool isSimpleArithmeticOp(User *IVU) {
    633   if (Instruction *I = dyn_cast<Instruction>(IVU)) {
    634     switch (I->getOpcode()) {
    635     default: return false;
    636     case Instruction::Add:
    637     case Instruction::Sub:
    638     case Instruction::Mul:
    639     case Instruction::Shl:
    640     case Instruction::AShr:
    641     case Instruction::LShr:
    642     case Instruction::GetElementPtr:
    643     case Instruction::Trunc:
    644     case Instruction::ZExt:
    645     case Instruction::SExt:
    646       return true;
    647     }
    648   }
    649   return false;
    650 }
    651 
    652 static bool isLoopIncrement(User *U, Instruction *IV) {
    653   BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
    654   if (!BO || BO->getOpcode() != Instruction::Add)
    655     return false;
    656 
    657   for (auto *UU : BO->users()) {
    658     PHINode *PN = dyn_cast<PHINode>(UU);
    659     if (PN && PN == IV)
    660       return true;
    661   }
    662   return false;
    663 }
    664 
    665 bool LoopReroll::DAGRootTracker::
    666 collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
    667   SmallInstructionVector BaseUsers;
    668 
    669   for (auto *I : Base->users()) {
    670     ConstantInt *CI = nullptr;
    671 
    672     if (isLoopIncrement(I, IV)) {
    673       LoopIncs.push_back(cast<Instruction>(I));
    674       continue;
    675     }
    676 
    677     // The root nodes must be either GEPs, ORs or ADDs.
    678     if (auto *BO = dyn_cast<BinaryOperator>(I)) {
    679       if (BO->getOpcode() == Instruction::Add ||
    680           BO->getOpcode() == Instruction::Or)
    681         CI = dyn_cast<ConstantInt>(BO->getOperand(1));
    682     } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
    683       Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
    684       CI = dyn_cast<ConstantInt>(LastOperand);
    685     }
    686 
    687     if (!CI) {
    688       if (Instruction *II = dyn_cast<Instruction>(I)) {
    689         BaseUsers.push_back(II);
    690         continue;
    691       } else {
    692         DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
    693         return false;
    694       }
    695     }
    696 
    697     int64_t V = std::abs(CI->getValue().getSExtValue());
    698     if (Roots.find(V) != Roots.end())
    699       // No duplicates, please.
    700       return false;
    701 
    702     Roots[V] = cast<Instruction>(I);
    703   }
    704 
    705   if (Roots.empty())
    706     return false;
    707 
    708   // If we found non-loop-inc, non-root users of Base, assume they are
    709   // for the zeroth root index. This is because "add %a, 0" gets optimized
    710   // away.
    711   if (BaseUsers.size()) {
    712     if (Roots.find(0) != Roots.end()) {
    713       DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
    714       return false;
    715     }
    716     Roots[0] = Base;
    717   }
    718 
    719   // Calculate the number of users of the base, or lowest indexed, iteration.
    720   unsigned NumBaseUses = BaseUsers.size();
    721   if (NumBaseUses == 0)
    722     NumBaseUses = Roots.begin()->second->getNumUses();
    723 
    724   // Check that every node has the same number of users.
    725   for (auto &KV : Roots) {
    726     if (KV.first == 0)
    727       continue;
    728     if (KV.second->getNumUses() != NumBaseUses) {
    729       DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
    730             << "#Base=" << NumBaseUses << ", #Root=" <<
    731             KV.second->getNumUses() << "\n");
    732       return false;
    733     }
    734   }
    735 
    736   return true;
    737 }
    738 
    739 bool LoopReroll::DAGRootTracker::
    740 findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
    741   // Does the user look like it could be part of a root set?
    742   // All its users must be simple arithmetic ops.
    743   if (I->getNumUses() > IL_MaxRerollIterations)
    744     return false;
    745 
    746   if ((I->getOpcode() == Instruction::Mul ||
    747        I->getOpcode() == Instruction::PHI) &&
    748       I != IV &&
    749       findRootsBase(I, SubsumedInsts))
    750     return true;
    751 
    752   SubsumedInsts.insert(I);
    753 
    754   for (User *V : I->users()) {
    755     Instruction *I = dyn_cast<Instruction>(V);
    756     if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end())
    757       continue;
    758 
    759     if (!I || !isSimpleArithmeticOp(I) ||
    760         !findRootsRecursive(I, SubsumedInsts))
    761       return false;
    762   }
    763   return true;
    764 }
    765 
    766 bool LoopReroll::DAGRootTracker::
    767 findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
    768 
    769   // The base instruction needs to be a multiply so
    770   // that we can erase it.
    771   if (IVU->getOpcode() != Instruction::Mul &&
    772       IVU->getOpcode() != Instruction::PHI)
    773     return false;
    774 
    775   std::map<int64_t, Instruction*> V;
    776   if (!collectPossibleRoots(IVU, V))
    777     return false;
    778 
    779   // If we didn't get a root for index zero, then IVU must be
    780   // subsumed.
    781   if (V.find(0) == V.end())
    782     SubsumedInsts.insert(IVU);
    783 
    784   // Partition the vector into monotonically increasing indexes.
    785   DAGRootSet DRS;
    786   DRS.BaseInst = nullptr;
    787 
    788   for (auto &KV : V) {
    789     if (!DRS.BaseInst) {
    790       DRS.BaseInst = KV.second;
    791       DRS.SubsumedInsts = SubsumedInsts;
    792     } else if (DRS.Roots.empty()) {
    793       DRS.Roots.push_back(KV.second);
    794     } else if (V.find(KV.first - 1) != V.end()) {
    795       DRS.Roots.push_back(KV.second);
    796     } else {
    797       // Linear sequence terminated.
    798       RootSets.push_back(DRS);
    799       DRS.BaseInst = KV.second;
    800       DRS.SubsumedInsts = SubsumedInsts;
    801       DRS.Roots.clear();
    802     }
    803   }
    804   RootSets.push_back(DRS);
    805 
    806   return true;
    807 }
    808 
    809 bool LoopReroll::DAGRootTracker::findRoots() {
    810   Inc = IVToIncMap[IV];
    811 
    812   assert(RootSets.empty() && "Unclean state!");
    813   if (std::abs(Inc) == 1) {
    814     for (auto *IVU : IV->users()) {
    815       if (isLoopIncrement(IVU, IV))
    816         LoopIncs.push_back(cast<Instruction>(IVU));
    817     }
    818     if (!findRootsRecursive(IV, SmallInstructionSet()))
    819       return false;
    820     LoopIncs.push_back(IV);
    821   } else {
    822     if (!findRootsBase(IV, SmallInstructionSet()))
    823       return false;
    824   }
    825 
    826   // Ensure all sets have the same size.
    827   if (RootSets.empty()) {
    828     DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
    829     return false;
    830   }
    831   for (auto &V : RootSets) {
    832     if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
    833       DEBUG(dbgs()
    834             << "LRR: Aborting because not all root sets have the same size\n");
    835       return false;
    836     }
    837   }
    838 
    839   // And ensure all loop iterations are consecutive. We rely on std::map
    840   // providing ordered traversal.
    841   for (auto &V : RootSets) {
    842     const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst));
    843     if (!ADR)
    844       return false;
    845 
    846     // Consider a DAGRootSet with N-1 roots (so N different values including
    847     //   BaseInst).
    848     // Define d = Roots[0] - BaseInst, which should be the same as
    849     //   Roots[I] - Roots[I-1] for all I in [1..N).
    850     // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
    851     //   loop iteration J.
    852     //
    853     // Now, For the loop iterations to be consecutive:
    854     //   D = d * N
    855 
    856     unsigned N = V.Roots.size() + 1;
    857     const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR);
    858     const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
    859     if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) {
    860       DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n");
    861       return false;
    862     }
    863   }
    864   Scale = RootSets[0].Roots.size() + 1;
    865 
    866   if (Scale > IL_MaxRerollIterations) {
    867     DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
    868           << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
    869           << "\n");
    870     return false;
    871   }
    872 
    873   DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
    874 
    875   return true;
    876 }
    877 
    878 bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
    879   // Populate the MapVector with all instructions in the block, in order first,
    880   // so we can iterate over the contents later in perfect order.
    881   for (auto &I : *L->getHeader()) {
    882     Uses[&I].resize(IL_End);
    883   }
    884 
    885   SmallInstructionSet Exclude;
    886   for (auto &DRS : RootSets) {
    887     Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
    888     Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
    889     Exclude.insert(DRS.BaseInst);
    890   }
    891   Exclude.insert(LoopIncs.begin(), LoopIncs.end());
    892 
    893   for (auto &DRS : RootSets) {
    894     DenseSet<Instruction*> VBase;
    895     collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
    896     for (auto *I : VBase) {
    897       Uses[I].set(0);
    898     }
    899 
    900     unsigned Idx = 1;
    901     for (auto *Root : DRS.Roots) {
    902       DenseSet<Instruction*> V;
    903       collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
    904 
    905       // While we're here, check the use sets are the same size.
    906       if (V.size() != VBase.size()) {
    907         DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
    908         return false;
    909       }
    910 
    911       for (auto *I : V) {
    912         Uses[I].set(Idx);
    913       }
    914       ++Idx;
    915     }
    916 
    917     // Make sure our subsumed instructions are remembered too.
    918     for (auto *I : DRS.SubsumedInsts) {
    919       Uses[I].set(IL_All);
    920     }
    921   }
    922 
    923   // Make sure the loop increments are also accounted for.
    924 
    925   Exclude.clear();
    926   for (auto &DRS : RootSets) {
    927     Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
    928     Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
    929     Exclude.insert(DRS.BaseInst);
    930   }
    931 
    932   DenseSet<Instruction*> V;
    933   collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
    934   for (auto *I : V) {
    935     Uses[I].set(IL_All);
    936   }
    937 
    938   return true;
    939 
    940 }
    941 
    942 /// Get the next instruction in "In" that is a member of set Val.
    943 /// Start searching from StartI, and do not return anything in Exclude.
    944 /// If StartI is not given, start from In.begin().
    945 LoopReroll::DAGRootTracker::UsesTy::iterator
    946 LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
    947                                       const SmallInstructionSet &Exclude,
    948                                       UsesTy::iterator *StartI) {
    949   UsesTy::iterator I = StartI ? *StartI : In.begin();
    950   while (I != In.end() && (I->second.test(Val) == 0 ||
    951                            Exclude.count(I->first) != 0))
    952     ++I;
    953   return I;
    954 }
    955 
    956 bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
    957   for (auto &DRS : RootSets) {
    958     if (DRS.BaseInst == I)
    959       return true;
    960   }
    961   return false;
    962 }
    963 
    964 bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
    965   for (auto &DRS : RootSets) {
    966     if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end())
    967       return true;
    968   }
    969   return false;
    970 }
    971 
    972 /// Return true if instruction I depends on any instruction between
    973 /// Start and End.
    974 bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
    975                                                 UsesTy::iterator Start,
    976                                                 UsesTy::iterator End) {
    977   for (auto *U : I->users()) {
    978     for (auto It = Start; It != End; ++It)
    979       if (U == It->first)
    980         return true;
    981   }
    982   return false;
    983 }
    984 
    985 static bool isIgnorableInst(const Instruction *I) {
    986   if (isa<DbgInfoIntrinsic>(I))
    987     return true;
    988   const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
    989   if (!II)
    990     return false;
    991   switch (II->getIntrinsicID()) {
    992     default:
    993       return false;
    994     case llvm::Intrinsic::annotation:
    995     case Intrinsic::ptr_annotation:
    996     case Intrinsic::var_annotation:
    997     // TODO: the following intrinsics may also be whitelisted:
    998     //   lifetime_start, lifetime_end, invariant_start, invariant_end
    999       return true;
   1000   }
   1001   return false;
   1002 }
   1003 
   1004 bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
   1005   // We now need to check for equivalence of the use graph of each root with
   1006   // that of the primary induction variable (excluding the roots). Our goal
   1007   // here is not to solve the full graph isomorphism problem, but rather to
   1008   // catch common cases without a lot of work. As a result, we will assume
   1009   // that the relative order of the instructions in each unrolled iteration
   1010   // is the same (although we will not make an assumption about how the
   1011   // different iterations are intermixed). Note that while the order must be
   1012   // the same, the instructions may not be in the same basic block.
   1013 
   1014   // An array of just the possible reductions for this scale factor. When we
   1015   // collect the set of all users of some root instructions, these reduction
   1016   // instructions are treated as 'final' (their uses are not considered).
   1017   // This is important because we don't want the root use set to search down
   1018   // the reduction chain.
   1019   SmallInstructionSet PossibleRedSet;
   1020   SmallInstructionSet PossibleRedLastSet;
   1021   SmallInstructionSet PossibleRedPHISet;
   1022   Reductions.restrictToScale(Scale, PossibleRedSet,
   1023                              PossibleRedPHISet, PossibleRedLastSet);
   1024 
   1025   // Populate "Uses" with where each instruction is used.
   1026   if (!collectUsedInstructions(PossibleRedSet))
   1027     return false;
   1028 
   1029   // Make sure we mark the reduction PHIs as used in all iterations.
   1030   for (auto *I : PossibleRedPHISet) {
   1031     Uses[I].set(IL_All);
   1032   }
   1033 
   1034   // Make sure all instructions in the loop are in one and only one
   1035   // set.
   1036   for (auto &KV : Uses) {
   1037     if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
   1038       DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
   1039             << *KV.first << " (#uses=" << KV.second.count() << ")\n");
   1040       return false;
   1041     }
   1042   }
   1043 
   1044   DEBUG(
   1045     for (auto &KV : Uses) {
   1046       dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
   1047     }
   1048     );
   1049 
   1050   for (unsigned Iter = 1; Iter < Scale; ++Iter) {
   1051     // In addition to regular aliasing information, we need to look for
   1052     // instructions from later (future) iterations that have side effects
   1053     // preventing us from reordering them past other instructions with side
   1054     // effects.
   1055     bool FutureSideEffects = false;
   1056     AliasSetTracker AST(*AA);
   1057     // The map between instructions in f(%iv.(i+1)) and f(%iv).
   1058     DenseMap<Value *, Value *> BaseMap;
   1059 
   1060     // Compare iteration Iter to the base.
   1061     SmallInstructionSet Visited;
   1062     auto BaseIt = nextInstr(0, Uses, Visited);
   1063     auto RootIt = nextInstr(Iter, Uses, Visited);
   1064     auto LastRootIt = Uses.begin();
   1065 
   1066     while (BaseIt != Uses.end() && RootIt != Uses.end()) {
   1067       Instruction *BaseInst = BaseIt->first;
   1068       Instruction *RootInst = RootIt->first;
   1069 
   1070       // Skip over the IV or root instructions; only match their users.
   1071       bool Continue = false;
   1072       if (isBaseInst(BaseInst)) {
   1073         Visited.insert(BaseInst);
   1074         BaseIt = nextInstr(0, Uses, Visited);
   1075         Continue = true;
   1076       }
   1077       if (isRootInst(RootInst)) {
   1078         LastRootIt = RootIt;
   1079         Visited.insert(RootInst);
   1080         RootIt = nextInstr(Iter, Uses, Visited);
   1081         Continue = true;
   1082       }
   1083       if (Continue) continue;
   1084 
   1085       if (!BaseInst->isSameOperationAs(RootInst)) {
   1086         // Last chance saloon. We don't try and solve the full isomorphism
   1087         // problem, but try and at least catch the case where two instructions
   1088         // *of different types* are round the wrong way. We won't be able to
   1089         // efficiently tell, given two ADD instructions, which way around we
   1090         // should match them, but given an ADD and a SUB, we can at least infer
   1091         // which one is which.
   1092         //
   1093         // This should allow us to deal with a greater subset of the isomorphism
   1094         // problem. It does however change a linear algorithm into a quadratic
   1095         // one, so limit the number of probes we do.
   1096         auto TryIt = RootIt;
   1097         unsigned N = NumToleratedFailedMatches;
   1098         while (TryIt != Uses.end() &&
   1099                !BaseInst->isSameOperationAs(TryIt->first) &&
   1100                N--) {
   1101           ++TryIt;
   1102           TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
   1103         }
   1104 
   1105         if (TryIt == Uses.end() || TryIt == RootIt ||
   1106             instrDependsOn(TryIt->first, RootIt, TryIt)) {
   1107           DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
   1108                 " vs. " << *RootInst << "\n");
   1109           return false;
   1110         }
   1111 
   1112         RootIt = TryIt;
   1113         RootInst = TryIt->first;
   1114       }
   1115 
   1116       // All instructions between the last root and this root
   1117       // may belong to some other iteration. If they belong to a
   1118       // future iteration, then they're dangerous to alias with.
   1119       //
   1120       // Note that because we allow a limited amount of flexibility in the order
   1121       // that we visit nodes, LastRootIt might be *before* RootIt, in which
   1122       // case we've already checked this set of instructions so we shouldn't
   1123       // do anything.
   1124       for (; LastRootIt < RootIt; ++LastRootIt) {
   1125         Instruction *I = LastRootIt->first;
   1126         if (LastRootIt->second.find_first() < (int)Iter)
   1127           continue;
   1128         if (I->mayWriteToMemory())
   1129           AST.add(I);
   1130         // Note: This is specifically guarded by a check on isa<PHINode>,
   1131         // which while a valid (somewhat arbitrary) micro-optimization, is
   1132         // needed because otherwise isSafeToSpeculativelyExecute returns
   1133         // false on PHI nodes.
   1134         if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
   1135             !isSafeToSpeculativelyExecute(I))
   1136           // Intervening instructions cause side effects.
   1137           FutureSideEffects = true;
   1138       }
   1139 
   1140       // Make sure that this instruction, which is in the use set of this
   1141       // root instruction, does not also belong to the base set or the set of
   1142       // some other root instruction.
   1143       if (RootIt->second.count() > 1) {
   1144         DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
   1145                         " vs. " << *RootInst << " (prev. case overlap)\n");
   1146         return false;
   1147       }
   1148 
   1149       // Make sure that we don't alias with any instruction in the alias set
   1150       // tracker. If we do, then we depend on a future iteration, and we
   1151       // can't reroll.
   1152       if (RootInst->mayReadFromMemory())
   1153         for (auto &K : AST) {
   1154           if (K.aliasesUnknownInst(RootInst, *AA)) {
   1155             DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
   1156                             " vs. " << *RootInst << " (depends on future store)\n");
   1157             return false;
   1158           }
   1159         }
   1160 
   1161       // If we've past an instruction from a future iteration that may have
   1162       // side effects, and this instruction might also, then we can't reorder
   1163       // them, and this matching fails. As an exception, we allow the alias
   1164       // set tracker to handle regular (simple) load/store dependencies.
   1165       if (FutureSideEffects && ((!isSimpleLoadStore(BaseInst) &&
   1166                                  !isSafeToSpeculativelyExecute(BaseInst)) ||
   1167                                 (!isSimpleLoadStore(RootInst) &&
   1168                                  !isSafeToSpeculativelyExecute(RootInst)))) {
   1169         DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
   1170                         " vs. " << *RootInst <<
   1171                         " (side effects prevent reordering)\n");
   1172         return false;
   1173       }
   1174 
   1175       // For instructions that are part of a reduction, if the operation is
   1176       // associative, then don't bother matching the operands (because we
   1177       // already know that the instructions are isomorphic, and the order
   1178       // within the iteration does not matter). For non-associative reductions,
   1179       // we do need to match the operands, because we need to reject
   1180       // out-of-order instructions within an iteration!
   1181       // For example (assume floating-point addition), we need to reject this:
   1182       //   x += a[i]; x += b[i];
   1183       //   x += a[i+1]; x += b[i+1];
   1184       //   x += b[i+2]; x += a[i+2];
   1185       bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
   1186 
   1187       if (!(InReduction && BaseInst->isAssociative())) {
   1188         bool Swapped = false, SomeOpMatched = false;
   1189         for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
   1190           Value *Op2 = RootInst->getOperand(j);
   1191 
   1192           // If this is part of a reduction (and the operation is not
   1193           // associatve), then we match all operands, but not those that are
   1194           // part of the reduction.
   1195           if (InReduction)
   1196             if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
   1197               if (Reductions.isPairInSame(RootInst, Op2I))
   1198                 continue;
   1199 
   1200           DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
   1201           if (BMI != BaseMap.end()) {
   1202             Op2 = BMI->second;
   1203           } else {
   1204             for (auto &DRS : RootSets) {
   1205               if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
   1206                 Op2 = DRS.BaseInst;
   1207                 break;
   1208               }
   1209             }
   1210           }
   1211 
   1212           if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
   1213             // If we've not already decided to swap the matched operands, and
   1214             // we've not already matched our first operand (note that we could
   1215             // have skipped matching the first operand because it is part of a
   1216             // reduction above), and the instruction is commutative, then try
   1217             // the swapped match.
   1218             if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
   1219                 BaseInst->getOperand(!j) == Op2) {
   1220               Swapped = true;
   1221             } else {
   1222               DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
   1223                     << " vs. " << *RootInst << " (operand " << j << ")\n");
   1224               return false;
   1225             }
   1226           }
   1227 
   1228           SomeOpMatched = true;
   1229         }
   1230       }
   1231 
   1232       if ((!PossibleRedLastSet.count(BaseInst) &&
   1233            hasUsesOutsideLoop(BaseInst, L)) ||
   1234           (!PossibleRedLastSet.count(RootInst) &&
   1235            hasUsesOutsideLoop(RootInst, L))) {
   1236         DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
   1237                         " vs. " << *RootInst << " (uses outside loop)\n");
   1238         return false;
   1239       }
   1240 
   1241       Reductions.recordPair(BaseInst, RootInst, Iter);
   1242       BaseMap.insert(std::make_pair(RootInst, BaseInst));
   1243 
   1244       LastRootIt = RootIt;
   1245       Visited.insert(BaseInst);
   1246       Visited.insert(RootInst);
   1247       BaseIt = nextInstr(0, Uses, Visited);
   1248       RootIt = nextInstr(Iter, Uses, Visited);
   1249     }
   1250     assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
   1251             "Mismatched set sizes!");
   1252   }
   1253 
   1254   DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
   1255                   *IV << "\n");
   1256 
   1257   return true;
   1258 }
   1259 
   1260 void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
   1261   BasicBlock *Header = L->getHeader();
   1262   // Remove instructions associated with non-base iterations.
   1263   for (BasicBlock::reverse_iterator J = Header->rbegin();
   1264        J != Header->rend();) {
   1265     unsigned I = Uses[&*J].find_first();
   1266     if (I > 0 && I < IL_All) {
   1267       Instruction *D = &*J;
   1268       DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
   1269       D->eraseFromParent();
   1270       continue;
   1271     }
   1272 
   1273     ++J;
   1274   }
   1275   bool Negative = IVToIncMap[IV] < 0;
   1276   const DataLayout &DL = Header->getModule()->getDataLayout();
   1277 
   1278   // We need to create a new induction variable for each different BaseInst.
   1279   for (auto &DRS : RootSets) {
   1280     // Insert the new induction variable.
   1281     const SCEVAddRecExpr *RealIVSCEV =
   1282       cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
   1283     const SCEV *Start = RealIVSCEV->getStart();
   1284     const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr(
   1285         Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L,
   1286         SCEV::FlagAnyWrap));
   1287     { // Limit the lifetime of SCEVExpander.
   1288       SCEVExpander Expander(*SE, DL, "reroll");
   1289       Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front());
   1290 
   1291       for (auto &KV : Uses) {
   1292         if (KV.second.find_first() == 0)
   1293           KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV);
   1294       }
   1295 
   1296       if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
   1297         // FIXME: Why do we need this check?
   1298         if (Uses[BI].find_first() == IL_All) {
   1299           const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
   1300 
   1301           // Iteration count SCEV minus 1
   1302           const SCEV *ICMinus1SCEV = SE->getMinusSCEV(
   1303               ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1));
   1304 
   1305           Value *ICMinus1; // Iteration count minus 1
   1306           if (isa<SCEVConstant>(ICMinus1SCEV)) {
   1307             ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
   1308           } else {
   1309             BasicBlock *Preheader = L->getLoopPreheader();
   1310             if (!Preheader)
   1311               Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
   1312 
   1313             ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
   1314                                               Preheader->getTerminator());
   1315           }
   1316 
   1317           Value *Cond =
   1318             new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
   1319           BI->setCondition(Cond);
   1320 
   1321           if (BI->getSuccessor(1) != Header)
   1322             BI->swapSuccessors();
   1323         }
   1324       }
   1325     }
   1326   }
   1327 
   1328   SimplifyInstructionsInBlock(Header, TLI);
   1329   DeleteDeadPHIs(Header, TLI);
   1330 }
   1331 
   1332 // Validate the selected reductions. All iterations must have an isomorphic
   1333 // part of the reduction chain and, for non-associative reductions, the chain
   1334 // entries must appear in order.
   1335 bool LoopReroll::ReductionTracker::validateSelected() {
   1336   // For a non-associative reduction, the chain entries must appear in order.
   1337   for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
   1338        RI != RIE; ++RI) {
   1339     int i = *RI;
   1340     int PrevIter = 0, BaseCount = 0, Count = 0;
   1341     for (Instruction *J : PossibleReds[i]) {
   1342       // Note that all instructions in the chain must have been found because
   1343       // all instructions in the function must have been assigned to some
   1344       // iteration.
   1345       int Iter = PossibleRedIter[J];
   1346       if (Iter != PrevIter && Iter != PrevIter + 1 &&
   1347           !PossibleReds[i].getReducedValue()->isAssociative()) {
   1348         DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
   1349                         J << "\n");
   1350         return false;
   1351       }
   1352 
   1353       if (Iter != PrevIter) {
   1354         if (Count != BaseCount) {
   1355           DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
   1356                 " reduction use count " << Count <<
   1357                 " is not equal to the base use count " <<
   1358                 BaseCount << "\n");
   1359           return false;
   1360         }
   1361 
   1362         Count = 0;
   1363       }
   1364 
   1365       ++Count;
   1366       if (Iter == 0)
   1367         ++BaseCount;
   1368 
   1369       PrevIter = Iter;
   1370     }
   1371   }
   1372 
   1373   return true;
   1374 }
   1375 
   1376 // For all selected reductions, remove all parts except those in the first
   1377 // iteration (and the PHI). Replace outside uses of the reduced value with uses
   1378 // of the first-iteration reduced value (in other words, reroll the selected
   1379 // reductions).
   1380 void LoopReroll::ReductionTracker::replaceSelected() {
   1381   // Fixup reductions to refer to the last instruction associated with the
   1382   // first iteration (not the last).
   1383   for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
   1384        RI != RIE; ++RI) {
   1385     int i = *RI;
   1386     int j = 0;
   1387     for (int e = PossibleReds[i].size(); j != e; ++j)
   1388       if (PossibleRedIter[PossibleReds[i][j]] != 0) {
   1389         --j;
   1390         break;
   1391       }
   1392 
   1393     // Replace users with the new end-of-chain value.
   1394     SmallInstructionVector Users;
   1395     for (User *U : PossibleReds[i].getReducedValue()->users()) {
   1396       Users.push_back(cast<Instruction>(U));
   1397     }
   1398 
   1399     for (SmallInstructionVector::iterator J = Users.begin(),
   1400          JE = Users.end(); J != JE; ++J)
   1401       (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
   1402                               PossibleReds[i][j]);
   1403   }
   1404 }
   1405 
   1406 // Reroll the provided loop with respect to the provided induction variable.
   1407 // Generally, we're looking for a loop like this:
   1408 //
   1409 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
   1410 // f(%iv)
   1411 // %iv.1 = add %iv, 1                <-- a root increment
   1412 // f(%iv.1)
   1413 // %iv.2 = add %iv, 2                <-- a root increment
   1414 // f(%iv.2)
   1415 // %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
   1416 // f(%iv.scale_m_1)
   1417 // ...
   1418 // %iv.next = add %iv, scale
   1419 // %cmp = icmp(%iv, ...)
   1420 // br %cmp, header, exit
   1421 //
   1422 // Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
   1423 // instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
   1424 // be intermixed with eachother. The restriction imposed by this algorithm is
   1425 // that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
   1426 // etc. be the same.
   1427 //
   1428 // First, we collect the use set of %iv, excluding the other increment roots.
   1429 // This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
   1430 // times, having collected the use set of f(%iv.(i+1)), during which we:
   1431 //   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
   1432 //     the next unmatched instruction in f(%iv.(i+1)).
   1433 //   - Ensure that both matched instructions don't have any external users
   1434 //     (with the exception of last-in-chain reduction instructions).
   1435 //   - Track the (aliasing) write set, and other side effects, of all
   1436 //     instructions that belong to future iterations that come before the matched
   1437 //     instructions. If the matched instructions read from that write set, then
   1438 //     f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
   1439 //     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
   1440 //     if any of these future instructions had side effects (could not be
   1441 //     speculatively executed), and so do the matched instructions, when we
   1442 //     cannot reorder those side-effect-producing instructions, and rerolling
   1443 //     fails.
   1444 //
   1445 // Finally, we make sure that all loop instructions are either loop increment
   1446 // roots, belong to simple latch code, parts of validated reductions, part of
   1447 // f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
   1448 // have been validated), then we reroll the loop.
   1449 bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
   1450                         const SCEV *IterCount,
   1451                         ReductionTracker &Reductions) {
   1452   DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
   1453                           IVToIncMap);
   1454 
   1455   if (!DAGRoots.findRoots())
   1456     return false;
   1457   DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
   1458                   *IV << "\n");
   1459 
   1460   if (!DAGRoots.validate(Reductions))
   1461     return false;
   1462   if (!Reductions.validateSelected())
   1463     return false;
   1464   // At this point, we've validated the rerolling, and we're committed to
   1465   // making changes!
   1466 
   1467   Reductions.replaceSelected();
   1468   DAGRoots.replace(IterCount);
   1469 
   1470   ++NumRerolledLoops;
   1471   return true;
   1472 }
   1473 
   1474 bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   1475   if (skipOptnoneFunction(L))
   1476     return false;
   1477 
   1478   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   1479   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   1480   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   1481   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   1482   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   1483   PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
   1484 
   1485   BasicBlock *Header = L->getHeader();
   1486   DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
   1487         "] Loop %" << Header->getName() << " (" <<
   1488         L->getNumBlocks() << " block(s))\n");
   1489 
   1490   bool Changed = false;
   1491 
   1492   // For now, we'll handle only single BB loops.
   1493   if (L->getNumBlocks() > 1)
   1494     return Changed;
   1495 
   1496   if (!SE->hasLoopInvariantBackedgeTakenCount(L))
   1497     return Changed;
   1498 
   1499   const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
   1500   const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
   1501   DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
   1502 
   1503   // First, we need to find the induction variable with respect to which we can
   1504   // reroll (there may be several possible options).
   1505   SmallInstructionVector PossibleIVs;
   1506   IVToIncMap.clear();
   1507   collectPossibleIVs(L, PossibleIVs);
   1508 
   1509   if (PossibleIVs.empty()) {
   1510     DEBUG(dbgs() << "LRR: No possible IVs found\n");
   1511     return Changed;
   1512   }
   1513 
   1514   ReductionTracker Reductions;
   1515   collectPossibleReductions(L, Reductions);
   1516 
   1517   // For each possible IV, collect the associated possible set of 'root' nodes
   1518   // (i+1, i+2, etc.).
   1519   for (SmallInstructionVector::iterator I = PossibleIVs.begin(),
   1520        IE = PossibleIVs.end(); I != IE; ++I)
   1521     if (reroll(*I, L, Header, IterCount, Reductions)) {
   1522       Changed = true;
   1523       break;
   1524     }
   1525 
   1526   return Changed;
   1527 }
   1528