1 //===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass implements a simple loop reroller. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/Scalar.h" 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/SmallSet.h" 17 #include "llvm/ADT/Statistic.h" 18 #include "llvm/Analysis/AliasAnalysis.h" 19 #include "llvm/Analysis/AliasSetTracker.h" 20 #include "llvm/Analysis/LoopPass.h" 21 #include "llvm/Analysis/ScalarEvolution.h" 22 #include "llvm/Analysis/ScalarEvolutionExpander.h" 23 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 24 #include "llvm/Analysis/ValueTracking.h" 25 #include "llvm/IR/DataLayout.h" 26 #include "llvm/IR/Dominators.h" 27 #include "llvm/IR/IntrinsicInst.h" 28 #include "llvm/Support/CommandLine.h" 29 #include "llvm/Support/Debug.h" 30 #include "llvm/Support/raw_ostream.h" 31 #include "llvm/Target/TargetLibraryInfo.h" 32 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 33 #include "llvm/Transforms/Utils/Local.h" 34 #include "llvm/Transforms/Utils/LoopUtils.h" 35 36 using namespace llvm; 37 38 #define DEBUG_TYPE "loop-reroll" 39 40 STATISTIC(NumRerolledLoops, "Number of rerolled loops"); 41 42 static cl::opt<unsigned> 43 MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, 44 cl::desc("The maximum increment for loop rerolling")); 45 46 // This loop re-rolling transformation aims to transform loops like this: 47 // 48 // int foo(int a); 49 // void bar(int *x) { 50 // for (int i = 0; i < 500; i += 3) { 51 // foo(i); 52 // foo(i+1); 53 // foo(i+2); 54 // } 55 // } 56 // 57 // into a loop like this: 58 // 59 // void bar(int *x) { 60 // for (int i = 0; i < 500; ++i) 61 // foo(i); 62 // } 63 // 64 // It does this by looking for loops that, besides the latch code, are composed 65 // of isomorphic DAGs of instructions, with each DAG rooted at some increment 66 // to the induction variable, and where each DAG is isomorphic to the DAG 67 // rooted at the induction variable (excepting the sub-DAGs which root the 68 // other induction-variable increments). In other words, we're looking for loop 69 // bodies of the form: 70 // 71 // %iv = phi [ (preheader, ...), (body, %iv.next) ] 72 // f(%iv) 73 // %iv.1 = add %iv, 1 <-- a root increment 74 // f(%iv.1) 75 // %iv.2 = add %iv, 2 <-- a root increment 76 // f(%iv.2) 77 // %iv.scale_m_1 = add %iv, scale-1 <-- a root increment 78 // f(%iv.scale_m_1) 79 // ... 80 // %iv.next = add %iv, scale 81 // %cmp = icmp(%iv, ...) 82 // br %cmp, header, exit 83 // 84 // where each f(i) is a set of instructions that, collectively, are a function 85 // only of i (and other loop-invariant values). 86 // 87 // As a special case, we can also reroll loops like this: 88 // 89 // int foo(int); 90 // void bar(int *x) { 91 // for (int i = 0; i < 500; ++i) { 92 // x[3*i] = foo(0); 93 // x[3*i+1] = foo(0); 94 // x[3*i+2] = foo(0); 95 // } 96 // } 97 // 98 // into this: 99 // 100 // void bar(int *x) { 101 // for (int i = 0; i < 1500; ++i) 102 // x[i] = foo(0); 103 // } 104 // 105 // in which case, we're looking for inputs like this: 106 // 107 // %iv = phi [ (preheader, ...), (body, %iv.next) ] 108 // %scaled.iv = mul %iv, scale 109 // f(%scaled.iv) 110 // %scaled.iv.1 = add %scaled.iv, 1 111 // f(%scaled.iv.1) 112 // %scaled.iv.2 = add %scaled.iv, 2 113 // f(%scaled.iv.2) 114 // %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 115 // f(%scaled.iv.scale_m_1) 116 // ... 117 // %iv.next = add %iv, 1 118 // %cmp = icmp(%iv, ...) 119 // br %cmp, header, exit 120 121 namespace { 122 class LoopReroll : public LoopPass { 123 public: 124 static char ID; // Pass ID, replacement for typeid 125 LoopReroll() : LoopPass(ID) { 126 initializeLoopRerollPass(*PassRegistry::getPassRegistry()); 127 } 128 129 bool runOnLoop(Loop *L, LPPassManager &LPM) override; 130 131 void getAnalysisUsage(AnalysisUsage &AU) const override { 132 AU.addRequired<AliasAnalysis>(); 133 AU.addRequired<LoopInfo>(); 134 AU.addPreserved<LoopInfo>(); 135 AU.addRequired<DominatorTreeWrapperPass>(); 136 AU.addPreserved<DominatorTreeWrapperPass>(); 137 AU.addRequired<ScalarEvolution>(); 138 AU.addRequired<TargetLibraryInfo>(); 139 } 140 141 protected: 142 AliasAnalysis *AA; 143 LoopInfo *LI; 144 ScalarEvolution *SE; 145 const DataLayout *DL; 146 TargetLibraryInfo *TLI; 147 DominatorTree *DT; 148 149 typedef SmallVector<Instruction *, 16> SmallInstructionVector; 150 typedef SmallSet<Instruction *, 16> SmallInstructionSet; 151 152 // A chain of isomorphic instructions, indentified by a single-use PHI, 153 // representing a reduction. Only the last value may be used outside the 154 // loop. 155 struct SimpleLoopReduction { 156 SimpleLoopReduction(Instruction *P, Loop *L) 157 : Valid(false), Instructions(1, P) { 158 assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); 159 add(L); 160 } 161 162 bool valid() const { 163 return Valid; 164 } 165 166 Instruction *getPHI() const { 167 assert(Valid && "Using invalid reduction"); 168 return Instructions.front(); 169 } 170 171 Instruction *getReducedValue() const { 172 assert(Valid && "Using invalid reduction"); 173 return Instructions.back(); 174 } 175 176 Instruction *get(size_t i) const { 177 assert(Valid && "Using invalid reduction"); 178 return Instructions[i+1]; 179 } 180 181 Instruction *operator [] (size_t i) const { return get(i); } 182 183 // The size, ignoring the initial PHI. 184 size_t size() const { 185 assert(Valid && "Using invalid reduction"); 186 return Instructions.size()-1; 187 } 188 189 typedef SmallInstructionVector::iterator iterator; 190 typedef SmallInstructionVector::const_iterator const_iterator; 191 192 iterator begin() { 193 assert(Valid && "Using invalid reduction"); 194 return std::next(Instructions.begin()); 195 } 196 197 const_iterator begin() const { 198 assert(Valid && "Using invalid reduction"); 199 return std::next(Instructions.begin()); 200 } 201 202 iterator end() { return Instructions.end(); } 203 const_iterator end() const { return Instructions.end(); } 204 205 protected: 206 bool Valid; 207 SmallInstructionVector Instructions; 208 209 void add(Loop *L); 210 }; 211 212 // The set of all reductions, and state tracking of possible reductions 213 // during loop instruction processing. 214 struct ReductionTracker { 215 typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; 216 217 // Add a new possible reduction. 218 void addSLR(SimpleLoopReduction &SLR) { 219 PossibleReds.push_back(SLR); 220 } 221 222 // Setup to track possible reductions corresponding to the provided 223 // rerolling scale. Only reductions with a number of non-PHI instructions 224 // that is divisible by the scale are considered. Three instructions sets 225 // are filled in: 226 // - A set of all possible instructions in eligible reductions. 227 // - A set of all PHIs in eligible reductions 228 // - A set of all reduced values (last instructions) in eligible reductions. 229 void restrictToScale(uint64_t Scale, 230 SmallInstructionSet &PossibleRedSet, 231 SmallInstructionSet &PossibleRedPHISet, 232 SmallInstructionSet &PossibleRedLastSet) { 233 PossibleRedIdx.clear(); 234 PossibleRedIter.clear(); 235 Reds.clear(); 236 237 for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) 238 if (PossibleReds[i].size() % Scale == 0) { 239 PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); 240 PossibleRedPHISet.insert(PossibleReds[i].getPHI()); 241 242 PossibleRedSet.insert(PossibleReds[i].getPHI()); 243 PossibleRedIdx[PossibleReds[i].getPHI()] = i; 244 for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), 245 JE = PossibleReds[i].end(); J != JE; ++J) { 246 PossibleRedSet.insert(*J); 247 PossibleRedIdx[*J] = i; 248 } 249 } 250 } 251 252 // The functions below are used while processing the loop instructions. 253 254 // Are the two instructions both from reductions, and furthermore, from 255 // the same reduction? 256 bool isPairInSame(Instruction *J1, Instruction *J2) { 257 DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); 258 if (J1I != PossibleRedIdx.end()) { 259 DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); 260 if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) 261 return true; 262 } 263 264 return false; 265 } 266 267 // The two provided instructions, the first from the base iteration, and 268 // the second from iteration i, form a matched pair. If these are part of 269 // a reduction, record that fact. 270 void recordPair(Instruction *J1, Instruction *J2, unsigned i) { 271 if (PossibleRedIdx.count(J1)) { 272 assert(PossibleRedIdx.count(J2) && 273 "Recording reduction vs. non-reduction instruction?"); 274 275 PossibleRedIter[J1] = 0; 276 PossibleRedIter[J2] = i; 277 278 int Idx = PossibleRedIdx[J1]; 279 assert(Idx == PossibleRedIdx[J2] && 280 "Recording pair from different reductions?"); 281 Reds.insert(Idx); 282 } 283 } 284 285 // The functions below can be called after we've finished processing all 286 // instructions in the loop, and we know which reductions were selected. 287 288 // Is the provided instruction the PHI of a reduction selected for 289 // rerolling? 290 bool isSelectedPHI(Instruction *J) { 291 if (!isa<PHINode>(J)) 292 return false; 293 294 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 295 RI != RIE; ++RI) { 296 int i = *RI; 297 if (cast<Instruction>(J) == PossibleReds[i].getPHI()) 298 return true; 299 } 300 301 return false; 302 } 303 304 bool validateSelected(); 305 void replaceSelected(); 306 307 protected: 308 // The vector of all possible reductions (for any scale). 309 SmallReductionVector PossibleReds; 310 311 DenseMap<Instruction *, int> PossibleRedIdx; 312 DenseMap<Instruction *, int> PossibleRedIter; 313 DenseSet<int> Reds; 314 }; 315 316 void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); 317 void collectPossibleReductions(Loop *L, 318 ReductionTracker &Reductions); 319 void collectInLoopUserSet(Loop *L, 320 const SmallInstructionVector &Roots, 321 const SmallInstructionSet &Exclude, 322 const SmallInstructionSet &Final, 323 DenseSet<Instruction *> &Users); 324 void collectInLoopUserSet(Loop *L, 325 Instruction * Root, 326 const SmallInstructionSet &Exclude, 327 const SmallInstructionSet &Final, 328 DenseSet<Instruction *> &Users); 329 bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, 330 Instruction *&IV, 331 SmallInstructionVector &LoopIncs); 332 bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, 333 SmallVector<SmallInstructionVector, 32> &Roots, 334 SmallInstructionSet &AllRoots, 335 SmallInstructionVector &LoopIncs); 336 bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, 337 ReductionTracker &Reductions); 338 }; 339 } 340 341 char LoopReroll::ID = 0; 342 INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) 343 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 344 INITIALIZE_PASS_DEPENDENCY(LoopInfo) 345 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 346 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 347 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) 348 INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) 349 350 Pass *llvm::createLoopRerollPass() { 351 return new LoopReroll; 352 } 353 354 // Returns true if the provided instruction is used outside the given loop. 355 // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in 356 // non-loop blocks to be outside the loop. 357 static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { 358 for (User *U : I->users()) 359 if (!L->contains(cast<Instruction>(U))) 360 return true; 361 362 return false; 363 } 364 365 // Collect the list of loop induction variables with respect to which it might 366 // be possible to reroll the loop. 367 void LoopReroll::collectPossibleIVs(Loop *L, 368 SmallInstructionVector &PossibleIVs) { 369 BasicBlock *Header = L->getHeader(); 370 for (BasicBlock::iterator I = Header->begin(), 371 IE = Header->getFirstInsertionPt(); I != IE; ++I) { 372 if (!isa<PHINode>(I)) 373 continue; 374 if (!I->getType()->isIntegerTy()) 375 continue; 376 377 if (const SCEVAddRecExpr *PHISCEV = 378 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { 379 if (PHISCEV->getLoop() != L) 380 continue; 381 if (!PHISCEV->isAffine()) 382 continue; 383 if (const SCEVConstant *IncSCEV = 384 dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { 385 if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) 386 continue; 387 if (IncSCEV->getValue()->uge(MaxInc)) 388 continue; 389 390 DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << 391 *PHISCEV << "\n"); 392 PossibleIVs.push_back(I); 393 } 394 } 395 } 396 } 397 398 // Add the remainder of the reduction-variable chain to the instruction vector 399 // (the initial PHINode has already been added). If successful, the object is 400 // marked as valid. 401 void LoopReroll::SimpleLoopReduction::add(Loop *L) { 402 assert(!Valid && "Cannot add to an already-valid chain"); 403 404 // The reduction variable must be a chain of single-use instructions 405 // (including the PHI), except for the last value (which is used by the PHI 406 // and also outside the loop). 407 Instruction *C = Instructions.front(); 408 409 do { 410 C = cast<Instruction>(*C->user_begin()); 411 if (C->hasOneUse()) { 412 if (!C->isBinaryOp()) 413 return; 414 415 if (!(isa<PHINode>(Instructions.back()) || 416 C->isSameOperationAs(Instructions.back()))) 417 return; 418 419 Instructions.push_back(C); 420 } 421 } while (C->hasOneUse()); 422 423 if (Instructions.size() < 2 || 424 !C->isSameOperationAs(Instructions.back()) || 425 C->use_empty()) 426 return; 427 428 // C is now the (potential) last instruction in the reduction chain. 429 for (User *U : C->users()) 430 // The only in-loop user can be the initial PHI. 431 if (L->contains(cast<Instruction>(U))) 432 if (cast<Instruction>(U) != Instructions.front()) 433 return; 434 435 Instructions.push_back(C); 436 Valid = true; 437 } 438 439 // Collect the vector of possible reduction variables. 440 void LoopReroll::collectPossibleReductions(Loop *L, 441 ReductionTracker &Reductions) { 442 BasicBlock *Header = L->getHeader(); 443 for (BasicBlock::iterator I = Header->begin(), 444 IE = Header->getFirstInsertionPt(); I != IE; ++I) { 445 if (!isa<PHINode>(I)) 446 continue; 447 if (!I->getType()->isSingleValueType()) 448 continue; 449 450 SimpleLoopReduction SLR(I, L); 451 if (!SLR.valid()) 452 continue; 453 454 DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " << 455 SLR.size() << " chained instructions)\n"); 456 Reductions.addSLR(SLR); 457 } 458 } 459 460 // Collect the set of all users of the provided root instruction. This set of 461 // users contains not only the direct users of the root instruction, but also 462 // all users of those users, and so on. There are two exceptions: 463 // 464 // 1. Instructions in the set of excluded instructions are never added to the 465 // use set (even if they are users). This is used, for example, to exclude 466 // including root increments in the use set of the primary IV. 467 // 468 // 2. Instructions in the set of final instructions are added to the use set 469 // if they are users, but their users are not added. This is used, for 470 // example, to prevent a reduction update from forcing all later reduction 471 // updates into the use set. 472 void LoopReroll::collectInLoopUserSet(Loop *L, 473 Instruction *Root, const SmallInstructionSet &Exclude, 474 const SmallInstructionSet &Final, 475 DenseSet<Instruction *> &Users) { 476 SmallInstructionVector Queue(1, Root); 477 while (!Queue.empty()) { 478 Instruction *I = Queue.pop_back_val(); 479 if (!Users.insert(I).second) 480 continue; 481 482 if (!Final.count(I)) 483 for (Use &U : I->uses()) { 484 Instruction *User = cast<Instruction>(U.getUser()); 485 if (PHINode *PN = dyn_cast<PHINode>(User)) { 486 // Ignore "wrap-around" uses to PHIs of this loop's header. 487 if (PN->getIncomingBlock(U) == L->getHeader()) 488 continue; 489 } 490 491 if (L->contains(User) && !Exclude.count(User)) { 492 Queue.push_back(User); 493 } 494 } 495 496 // We also want to collect single-user "feeder" values. 497 for (User::op_iterator OI = I->op_begin(), 498 OIE = I->op_end(); OI != OIE; ++OI) { 499 if (Instruction *Op = dyn_cast<Instruction>(*OI)) 500 if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && 501 !Final.count(Op)) 502 Queue.push_back(Op); 503 } 504 } 505 } 506 507 // Collect all of the users of all of the provided root instructions (combined 508 // into a single set). 509 void LoopReroll::collectInLoopUserSet(Loop *L, 510 const SmallInstructionVector &Roots, 511 const SmallInstructionSet &Exclude, 512 const SmallInstructionSet &Final, 513 DenseSet<Instruction *> &Users) { 514 for (SmallInstructionVector::const_iterator I = Roots.begin(), 515 IE = Roots.end(); I != IE; ++I) 516 collectInLoopUserSet(L, *I, Exclude, Final, Users); 517 } 518 519 static bool isSimpleLoadStore(Instruction *I) { 520 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 521 return LI->isSimple(); 522 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 523 return SI->isSimple(); 524 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) 525 return !MI->isVolatile(); 526 return false; 527 } 528 529 // Recognize loops that are setup like this: 530 // 531 // %iv = phi [ (preheader, ...), (body, %iv.next) ] 532 // %scaled.iv = mul %iv, scale 533 // f(%scaled.iv) 534 // %scaled.iv.1 = add %scaled.iv, 1 535 // f(%scaled.iv.1) 536 // %scaled.iv.2 = add %scaled.iv, 2 537 // f(%scaled.iv.2) 538 // %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 539 // f(%scaled.iv.scale_m_1) 540 // ... 541 // %iv.next = add %iv, 1 542 // %cmp = icmp(%iv, ...) 543 // br %cmp, header, exit 544 // 545 // and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs. 546 bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, 547 Instruction *&IV, 548 SmallInstructionVector &LoopIncs) { 549 // This is a special case: here we're looking for all uses (except for 550 // the increment) to be multiplied by a common factor. The increment must 551 // be by one. This is to capture loops like: 552 // for (int i = 0; i < 500; ++i) { 553 // foo(3*i); foo(3*i+1); foo(3*i+2); 554 // } 555 if (RealIV->getNumUses() != 2) 556 return false; 557 const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); 558 Instruction *User1 = cast<Instruction>(*RealIV->user_begin()), 559 *User2 = cast<Instruction>(*std::next(RealIV->user_begin())); 560 if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) 561 return false; 562 const SCEVAddRecExpr *User1SCEV = 563 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)), 564 *User2SCEV = 565 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2)); 566 if (!User1SCEV || !User1SCEV->isAffine() || 567 !User2SCEV || !User2SCEV->isAffine()) 568 return false; 569 570 // We assume below that User1 is the scale multiply and User2 is the 571 // increment. If this can't be true, then swap them. 572 if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) { 573 std::swap(User1, User2); 574 std::swap(User1SCEV, User2SCEV); 575 } 576 577 if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE)) 578 return false; 579 assert(User2SCEV->getStepRecurrence(*SE)->isOne() && 580 "Invalid non-unit step for multiplicative scaling"); 581 LoopIncs.push_back(User2); 582 583 if (const SCEVConstant *MulScale = 584 dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) { 585 // Make sure that both the start and step have the same multiplier. 586 if (RealIVSCEV->getStart()->getType() != MulScale->getType()) 587 return false; 588 if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != 589 User1SCEV->getStart()) 590 return false; 591 592 ConstantInt *MulScaleCI = MulScale->getValue(); 593 if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) 594 return false; 595 Scale = MulScaleCI->getZExtValue(); 596 IV = User1; 597 } else 598 return false; 599 600 DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); 601 return true; 602 } 603 604 // Collect all root increments with respect to the provided induction variable 605 // (normally the PHI, but sometimes a multiply). A root increment is an 606 // instruction, normally an add, with a positive constant less than Scale. In a 607 // rerollable loop, each of these increments is the root of an instruction 608 // graph isomorphic to the others. Also, we collect the final induction 609 // increment (the increment equal to the Scale), and its users in LoopIncs. 610 bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, 611 Instruction *IV, 612 SmallVector<SmallInstructionVector, 32> &Roots, 613 SmallInstructionSet &AllRoots, 614 SmallInstructionVector &LoopIncs) { 615 for (User *U : IV->users()) { 616 Instruction *UI = cast<Instruction>(U); 617 if (!SE->isSCEVable(UI->getType())) 618 continue; 619 if (UI->getType() != IV->getType()) 620 continue; 621 if (!L->contains(UI)) 622 continue; 623 if (hasUsesOutsideLoop(UI, L)) 624 continue; 625 626 if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( 627 SE->getSCEV(UI), SE->getSCEV(IV)))) { 628 uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); 629 if (Idx > 0 && Idx < Scale) { 630 Roots[Idx-1].push_back(UI); 631 AllRoots.insert(UI); 632 } else if (Idx == Scale && Inc > 1) { 633 LoopIncs.push_back(UI); 634 } 635 } 636 } 637 638 if (Roots[0].empty()) 639 return false; 640 bool AllSame = true; 641 for (unsigned i = 1; i < Scale-1; ++i) 642 if (Roots[i].size() != Roots[0].size()) { 643 AllSame = false; 644 break; 645 } 646 647 if (!AllSame) 648 return false; 649 650 return true; 651 } 652 653 // Validate the selected reductions. All iterations must have an isomorphic 654 // part of the reduction chain and, for non-associative reductions, the chain 655 // entries must appear in order. 656 bool LoopReroll::ReductionTracker::validateSelected() { 657 // For a non-associative reduction, the chain entries must appear in order. 658 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 659 RI != RIE; ++RI) { 660 int i = *RI; 661 int PrevIter = 0, BaseCount = 0, Count = 0; 662 for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), 663 JE = PossibleReds[i].end(); J != JE; ++J) { 664 // Note that all instructions in the chain must have been found because 665 // all instructions in the function must have been assigned to some 666 // iteration. 667 int Iter = PossibleRedIter[*J]; 668 if (Iter != PrevIter && Iter != PrevIter + 1 && 669 !PossibleReds[i].getReducedValue()->isAssociative()) { 670 DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << 671 *J << "\n"); 672 return false; 673 } 674 675 if (Iter != PrevIter) { 676 if (Count != BaseCount) { 677 DEBUG(dbgs() << "LRR: Iteration " << PrevIter << 678 " reduction use count " << Count << 679 " is not equal to the base use count " << 680 BaseCount << "\n"); 681 return false; 682 } 683 684 Count = 0; 685 } 686 687 ++Count; 688 if (Iter == 0) 689 ++BaseCount; 690 691 PrevIter = Iter; 692 } 693 } 694 695 return true; 696 } 697 698 // For all selected reductions, remove all parts except those in the first 699 // iteration (and the PHI). Replace outside uses of the reduced value with uses 700 // of the first-iteration reduced value (in other words, reroll the selected 701 // reductions). 702 void LoopReroll::ReductionTracker::replaceSelected() { 703 // Fixup reductions to refer to the last instruction associated with the 704 // first iteration (not the last). 705 for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); 706 RI != RIE; ++RI) { 707 int i = *RI; 708 int j = 0; 709 for (int e = PossibleReds[i].size(); j != e; ++j) 710 if (PossibleRedIter[PossibleReds[i][j]] != 0) { 711 --j; 712 break; 713 } 714 715 // Replace users with the new end-of-chain value. 716 SmallInstructionVector Users; 717 for (User *U : PossibleReds[i].getReducedValue()->users()) 718 Users.push_back(cast<Instruction>(U)); 719 720 for (SmallInstructionVector::iterator J = Users.begin(), 721 JE = Users.end(); J != JE; ++J) 722 (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), 723 PossibleReds[i][j]); 724 } 725 } 726 727 // Reroll the provided loop with respect to the provided induction variable. 728 // Generally, we're looking for a loop like this: 729 // 730 // %iv = phi [ (preheader, ...), (body, %iv.next) ] 731 // f(%iv) 732 // %iv.1 = add %iv, 1 <-- a root increment 733 // f(%iv.1) 734 // %iv.2 = add %iv, 2 <-- a root increment 735 // f(%iv.2) 736 // %iv.scale_m_1 = add %iv, scale-1 <-- a root increment 737 // f(%iv.scale_m_1) 738 // ... 739 // %iv.next = add %iv, scale 740 // %cmp = icmp(%iv, ...) 741 // br %cmp, header, exit 742 // 743 // Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of 744 // instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can 745 // be intermixed with eachother. The restriction imposed by this algorithm is 746 // that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), 747 // etc. be the same. 748 // 749 // First, we collect the use set of %iv, excluding the other increment roots. 750 // This gives us f(%iv). Then we iterate over the loop instructions (scale-1) 751 // times, having collected the use set of f(%iv.(i+1)), during which we: 752 // - Ensure that the next unmatched instruction in f(%iv) is isomorphic to 753 // the next unmatched instruction in f(%iv.(i+1)). 754 // - Ensure that both matched instructions don't have any external users 755 // (with the exception of last-in-chain reduction instructions). 756 // - Track the (aliasing) write set, and other side effects, of all 757 // instructions that belong to future iterations that come before the matched 758 // instructions. If the matched instructions read from that write set, then 759 // f(%iv) or f(%iv.(i+1)) has some dependency on instructions in 760 // f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, 761 // if any of these future instructions had side effects (could not be 762 // speculatively executed), and so do the matched instructions, when we 763 // cannot reorder those side-effect-producing instructions, and rerolling 764 // fails. 765 // 766 // Finally, we make sure that all loop instructions are either loop increment 767 // roots, belong to simple latch code, parts of validated reductions, part of 768 // f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions 769 // have been validated), then we reroll the loop. 770 bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, 771 const SCEV *IterCount, 772 ReductionTracker &Reductions) { 773 const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); 774 uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> 775 getValue()->getZExtValue(); 776 // The collection of loop increment instructions. 777 SmallInstructionVector LoopIncs; 778 uint64_t Scale = Inc; 779 780 // The effective induction variable, IV, is normally also the real induction 781 // variable. When we're dealing with a loop like: 782 // for (int i = 0; i < 500; ++i) 783 // x[3*i] = ...; 784 // x[3*i+1] = ...; 785 // x[3*i+2] = ...; 786 // then the real IV is still i, but the effective IV is (3*i). 787 Instruction *RealIV = IV; 788 if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs)) 789 return false; 790 791 assert(Scale <= MaxInc && "Scale is too large"); 792 assert(Scale > 1 && "Scale must be at least 2"); 793 794 // The set of increment instructions for each increment value. 795 SmallVector<SmallInstructionVector, 32> Roots(Scale-1); 796 SmallInstructionSet AllRoots; 797 if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs)) 798 return false; 799 800 DEBUG(dbgs() << "LRR: Found all root induction increments for: " << 801 *RealIV << "\n"); 802 803 // An array of just the possible reductions for this scale factor. When we 804 // collect the set of all users of some root instructions, these reduction 805 // instructions are treated as 'final' (their uses are not considered). 806 // This is important because we don't want the root use set to search down 807 // the reduction chain. 808 SmallInstructionSet PossibleRedSet; 809 SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet; 810 Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet, 811 PossibleRedLastSet); 812 813 // We now need to check for equivalence of the use graph of each root with 814 // that of the primary induction variable (excluding the roots). Our goal 815 // here is not to solve the full graph isomorphism problem, but rather to 816 // catch common cases without a lot of work. As a result, we will assume 817 // that the relative order of the instructions in each unrolled iteration 818 // is the same (although we will not make an assumption about how the 819 // different iterations are intermixed). Note that while the order must be 820 // the same, the instructions may not be in the same basic block. 821 SmallInstructionSet Exclude(AllRoots); 822 Exclude.insert(LoopIncs.begin(), LoopIncs.end()); 823 824 DenseSet<Instruction *> BaseUseSet; 825 collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); 826 827 DenseSet<Instruction *> AllRootUses; 828 std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1); 829 830 bool MatchFailed = false; 831 for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { 832 DenseSet<Instruction *> &RootUseSet = RootUseSets[i]; 833 collectInLoopUserSet(L, Roots[i], SmallInstructionSet(), 834 PossibleRedSet, RootUseSet); 835 836 DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << 837 " vs. iteration increment " << (i+1) << 838 " use set size: " << RootUseSet.size() << "\n"); 839 840 if (BaseUseSet.size() != RootUseSet.size()) { 841 MatchFailed = true; 842 break; 843 } 844 845 // In addition to regular aliasing information, we need to look for 846 // instructions from later (future) iterations that have side effects 847 // preventing us from reordering them past other instructions with side 848 // effects. 849 bool FutureSideEffects = false; 850 AliasSetTracker AST(*AA); 851 852 // The map between instructions in f(%iv.(i+1)) and f(%iv). 853 DenseMap<Value *, Value *> BaseMap; 854 855 assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); 856 for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), 857 JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { 858 if (cast<Instruction>(J1) == RealIV) 859 continue; 860 if (cast<Instruction>(J1) == IV) 861 continue; 862 if (!BaseUseSet.count(J1)) 863 continue; 864 if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs. 865 continue; 866 867 while (J2 != JE && (!RootUseSet.count(J2) || 868 std::find(Roots[i].begin(), Roots[i].end(), J2) != 869 Roots[i].end())) { 870 // As we iterate through the instructions, instructions that don't 871 // belong to previous iterations (or the base case), must belong to 872 // future iterations. We want to track the alias set of writes from 873 // previous iterations. 874 if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) && 875 !AllRootUses.count(J2)) { 876 if (J2->mayWriteToMemory()) 877 AST.add(J2); 878 879 // Note: This is specifically guarded by a check on isa<PHINode>, 880 // which while a valid (somewhat arbitrary) micro-optimization, is 881 // needed because otherwise isSafeToSpeculativelyExecute returns 882 // false on PHI nodes. 883 if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) 884 FutureSideEffects = true; 885 } 886 887 ++J2; 888 } 889 890 if (!J1->isSameOperationAs(J2)) { 891 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 892 " vs. " << *J2 << "\n"); 893 MatchFailed = true; 894 break; 895 } 896 897 // Make sure that this instruction, which is in the use set of this 898 // root instruction, does not also belong to the base set or the set of 899 // some previous root instruction. 900 if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { 901 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 902 " vs. " << *J2 << " (prev. case overlap)\n"); 903 MatchFailed = true; 904 break; 905 } 906 907 // Make sure that we don't alias with any instruction in the alias set 908 // tracker. If we do, then we depend on a future iteration, and we 909 // can't reroll. 910 if (J2->mayReadFromMemory()) { 911 for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); 912 K != KE && !MatchFailed; ++K) { 913 if (K->aliasesUnknownInst(J2, *AA)) { 914 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 915 " vs. " << *J2 << " (depends on future store)\n"); 916 MatchFailed = true; 917 break; 918 } 919 } 920 } 921 922 // If we've past an instruction from a future iteration that may have 923 // side effects, and this instruction might also, then we can't reorder 924 // them, and this matching fails. As an exception, we allow the alias 925 // set tracker to handle regular (simple) load/store dependencies. 926 if (FutureSideEffects && 927 ((!isSimpleLoadStore(J1) && 928 !isSafeToSpeculativelyExecute(J1, DL)) || 929 (!isSimpleLoadStore(J2) && 930 !isSafeToSpeculativelyExecute(J2, DL)))) { 931 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 932 " vs. " << *J2 << 933 " (side effects prevent reordering)\n"); 934 MatchFailed = true; 935 break; 936 } 937 938 // For instructions that are part of a reduction, if the operation is 939 // associative, then don't bother matching the operands (because we 940 // already know that the instructions are isomorphic, and the order 941 // within the iteration does not matter). For non-associative reductions, 942 // we do need to match the operands, because we need to reject 943 // out-of-order instructions within an iteration! 944 // For example (assume floating-point addition), we need to reject this: 945 // x += a[i]; x += b[i]; 946 // x += a[i+1]; x += b[i+1]; 947 // x += b[i+2]; x += a[i+2]; 948 bool InReduction = Reductions.isPairInSame(J1, J2); 949 950 if (!(InReduction && J1->isAssociative())) { 951 bool Swapped = false, SomeOpMatched = false; 952 for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { 953 Value *Op2 = J2->getOperand(j); 954 955 // If this is part of a reduction (and the operation is not 956 // associatve), then we match all operands, but not those that are 957 // part of the reduction. 958 if (InReduction) 959 if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) 960 if (Reductions.isPairInSame(J2, Op2I)) 961 continue; 962 963 DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); 964 if (BMI != BaseMap.end()) 965 Op2 = BMI->second; 966 else if (std::find(Roots[i].begin(), Roots[i].end(), 967 (Instruction*) Op2) != Roots[i].end()) 968 Op2 = IV; 969 970 if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) { 971 // If we've not already decided to swap the matched operands, and 972 // we've not already matched our first operand (note that we could 973 // have skipped matching the first operand because it is part of a 974 // reduction above), and the instruction is commutative, then try 975 // the swapped match. 976 if (!Swapped && J1->isCommutative() && !SomeOpMatched && 977 J1->getOperand(!j) == Op2) { 978 Swapped = true; 979 } else { 980 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 981 " vs. " << *J2 << " (operand " << j << ")\n"); 982 MatchFailed = true; 983 break; 984 } 985 } 986 987 SomeOpMatched = true; 988 } 989 } 990 991 if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || 992 (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { 993 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << 994 " vs. " << *J2 << " (uses outside loop)\n"); 995 MatchFailed = true; 996 break; 997 } 998 999 if (!MatchFailed) 1000 BaseMap.insert(std::pair<Value *, Value *>(J2, J1)); 1001 1002 AllRootUses.insert(J2); 1003 Reductions.recordPair(J1, J2, i+1); 1004 1005 ++J2; 1006 } 1007 } 1008 1009 if (MatchFailed) 1010 return false; 1011 1012 DEBUG(dbgs() << "LRR: Matched all iteration increments for " << 1013 *RealIV << "\n"); 1014 1015 DenseSet<Instruction *> LoopIncUseSet; 1016 collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(), 1017 SmallInstructionSet(), LoopIncUseSet); 1018 DEBUG(dbgs() << "LRR: Loop increment set size: " << 1019 LoopIncUseSet.size() << "\n"); 1020 1021 // Make sure that all instructions in the loop have been included in some 1022 // use set. 1023 for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); 1024 J != JE; ++J) { 1025 if (isa<DbgInfoIntrinsic>(J)) 1026 continue; 1027 if (cast<Instruction>(J) == RealIV) 1028 continue; 1029 if (cast<Instruction>(J) == IV) 1030 continue; 1031 if (BaseUseSet.count(J) || AllRootUses.count(J) || 1032 (LoopIncUseSet.count(J) && (J->isTerminator() || 1033 isSafeToSpeculativelyExecute(J, DL)))) 1034 continue; 1035 1036 if (AllRoots.count(J)) 1037 continue; 1038 1039 if (Reductions.isSelectedPHI(J)) 1040 continue; 1041 1042 DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV << 1043 " unprocessed instruction found: " << *J << "\n"); 1044 MatchFailed = true; 1045 break; 1046 } 1047 1048 if (MatchFailed) 1049 return false; 1050 1051 DEBUG(dbgs() << "LRR: all instructions processed from " << 1052 *RealIV << "\n"); 1053 1054 if (!Reductions.validateSelected()) 1055 return false; 1056 1057 // At this point, we've validated the rerolling, and we're committed to 1058 // making changes! 1059 1060 Reductions.replaceSelected(); 1061 1062 // Remove instructions associated with non-base iterations. 1063 for (BasicBlock::reverse_iterator J = Header->rbegin(); 1064 J != Header->rend();) { 1065 if (AllRootUses.count(&*J)) { 1066 Instruction *D = &*J; 1067 DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); 1068 D->eraseFromParent(); 1069 continue; 1070 } 1071 1072 ++J; 1073 } 1074 1075 // Insert the new induction variable. 1076 const SCEV *Start = RealIVSCEV->getStart(); 1077 if (Inc == 1) 1078 Start = SE->getMulExpr(Start, 1079 SE->getConstant(Start->getType(), Scale)); 1080 const SCEVAddRecExpr *H = 1081 cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, 1082 SE->getConstant(RealIVSCEV->getType(), 1), 1083 L, SCEV::FlagAnyWrap)); 1084 { // Limit the lifetime of SCEVExpander. 1085 SCEVExpander Expander(*SE, "reroll"); 1086 Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); 1087 1088 for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), 1089 JE = BaseUseSet.end(); J != JE; ++J) 1090 (*J)->replaceUsesOfWith(IV, NewIV); 1091 1092 if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { 1093 if (LoopIncUseSet.count(BI)) { 1094 const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); 1095 if (Inc == 1) 1096 ICSCEV = 1097 SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); 1098 // Iteration count SCEV minus 1 1099 const SCEV *ICMinus1SCEV = 1100 SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); 1101 1102 Value *ICMinus1; // Iteration count minus 1 1103 if (isa<SCEVConstant>(ICMinus1SCEV)) { 1104 ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI); 1105 } else { 1106 BasicBlock *Preheader = L->getLoopPreheader(); 1107 if (!Preheader) 1108 Preheader = InsertPreheaderForLoop(L, this); 1109 1110 ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), 1111 Preheader->getTerminator()); 1112 } 1113 1114 Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, 1115 "exitcond"); 1116 BI->setCondition(Cond); 1117 1118 if (BI->getSuccessor(1) != Header) 1119 BI->swapSuccessors(); 1120 } 1121 } 1122 } 1123 1124 SimplifyInstructionsInBlock(Header, DL, TLI); 1125 DeleteDeadPHIs(Header, TLI); 1126 ++NumRerolledLoops; 1127 return true; 1128 } 1129 1130 bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { 1131 if (skipOptnoneFunction(L)) 1132 return false; 1133 1134 AA = &getAnalysis<AliasAnalysis>(); 1135 LI = &getAnalysis<LoopInfo>(); 1136 SE = &getAnalysis<ScalarEvolution>(); 1137 TLI = &getAnalysis<TargetLibraryInfo>(); 1138 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 1139 DL = DLP ? &DLP->getDataLayout() : nullptr; 1140 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1141 1142 BasicBlock *Header = L->getHeader(); 1143 DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << 1144 "] Loop %" << Header->getName() << " (" << 1145 L->getNumBlocks() << " block(s))\n"); 1146 1147 bool Changed = false; 1148 1149 // For now, we'll handle only single BB loops. 1150 if (L->getNumBlocks() > 1) 1151 return Changed; 1152 1153 if (!SE->hasLoopInvariantBackedgeTakenCount(L)) 1154 return Changed; 1155 1156 const SCEV *LIBETC = SE->getBackedgeTakenCount(L); 1157 const SCEV *IterCount = 1158 SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); 1159 DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); 1160 1161 // First, we need to find the induction variable with respect to which we can 1162 // reroll (there may be several possible options). 1163 SmallInstructionVector PossibleIVs; 1164 collectPossibleIVs(L, PossibleIVs); 1165 1166 if (PossibleIVs.empty()) { 1167 DEBUG(dbgs() << "LRR: No possible IVs found\n"); 1168 return Changed; 1169 } 1170 1171 ReductionTracker Reductions; 1172 collectPossibleReductions(L, Reductions); 1173 1174 // For each possible IV, collect the associated possible set of 'root' nodes 1175 // (i+1, i+2, etc.). 1176 for (SmallInstructionVector::iterator I = PossibleIVs.begin(), 1177 IE = PossibleIVs.end(); I != IE; ++I) 1178 if (reroll(*I, L, Header, IterCount, Reductions)) { 1179 Changed = true; 1180 break; 1181 } 1182 1183 return Changed; 1184 } 1185 1186