//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass.  Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles.  It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis.  Because this analysis presumes no change in the order
// of memory operations, special care is taken to preserve the lexical order
// of these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// disambiguate potentially overlapping arrays at run time.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution for loops marked with #pragma loop "
             "distribute(enable)"));

// Note that the initial value for this depends on whether the pass is invoked
// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");
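// As an illustration only (this loop is not taken from an actual test case),
// a loop such as
//
//   for (i = 0; i < n; i++) {
//     A[i + 1] = A[i] * B[i];  // S1: loop-carried dependence, not vectorizable
//     C[i] = D[i] * E[i];      // S2: no dependence cycle, vectorizable
//   }
//
// may be distributed into
//
//   for (i = 0; i < n; i++)
//     A[i + 1] = A[i] * B[i];  // new loop isolating the dependence cycle
//   for (i = 0; i < n; i++)
//     C[i] = D[i] * E[i];      // remaining loop, now vectorizable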
namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning.  After cloning, it hosts the new loop.
class InstPartition {
  typedef SmallPtrSet<Instruction *, 8> InstructionSet;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other.  This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// \brief Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief The cloned loop.  If this partition is mapped to the original
  /// loop, this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop then this is the
  /// original loop.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }
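  // Illustrative use of the cloning API above, mirroring its use in
  // InstPartitionContainer::cloneLoops() below:
  //
  //   Loop *NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
  //   Part->getVMap()[ExitBlock] = TopPH;  // redirect the clone's exit
  //   Part->remapInstructions();           // rewrite operands via the VMap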
  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as this reduces the likelihood of
    // having to update def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop.  If this partition is mapped to the original
  /// loop, this is null.
  Loop *ClonedLoop;

  /// \brief The blocks of ClonedLoop including the preheader.  If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized.  If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// \brief Holds the set of Partitions.  It populates them, merges them and
/// then clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles.  Otherwise starts a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }
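  // Illustrative seeding (see processLoop below): given instructions I1 and
  // I2 that participate in a dependence cycle, followed by stores S3 and S4
  // that do not:
  //
  //   addToCyclicPartition(I1);        // starts cyclic partition {I1}
  //   addToCyclicPartition(I2);        // grows it to {I1, I2}
  //   addToNewNonCyclicPartition(S3);  // new partition {S3}
  //   addToNewNonCyclicPartition(S4);  // new partition {S4}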
  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition.  We could later allow more distribution among these
  /// partitions too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't
  /// vectorize it.  Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load.  Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader.  This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }
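  // For example (illustrative): with partitions P0, P1, P2, P3 in program
  // order and the same load occurring in both P0 and P3, the partitions P1,
  // P2 and P3 are all merged into P0, so that the load is not duplicated and
  // no memory operation is reordered across it.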
  /// \brief Sets up the mapping between instructions and partitions.  If the
  /// instruction is duplicated across multiple partitions, sets the entry
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partitions with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last.  Clone the original
    // loop before PH along with adding a preheader for the cloned loop.  Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop.  Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }
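  // After cloneLoops() above with two partitions, the CFG looks roughly as
  // follows (illustrative):
  //
  //   Pred -> ClonedPH -> ClonedLoop -> OrigPH -> OrigLoop -> ExitBlock
  //
  // i.e. the cloned loop for the first partition is chained in front of the
  // original loop (which serves the last partition), with the clone's exit
  // redirected to the next loop's preheader.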
  /// \brief For each memory pointer, computes the partitionId the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking().  If the pointer is used
  /// in multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id.  If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if they both
  /// satisfy the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
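  // For example (illustrative): if Predicate matches the partitions marked M
  // in the sequence M M N M M, the first two merge into one partition and the
  // last two merge into another, yielding M N M.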
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start at this instruction
/// and the number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the
        // program order, i.e. source is always first.  (The direction is
        // given by the DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE)
      : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE) {
    setForced();
  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop(LoopAccessLegacyAnalysis *LAA) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    BasicBlock *PH = L->getLoopPreheader();
    if (!PH)
      return fail("no preheader");
    if (!L->getExitBlock())
      return fail("multiple exit blocks");

    // LAA will check that we only have a single exiting block.
    LAI = &LAA->getInfo(L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order).  Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations.
    // This is to preserve the original program order after distribution.
    // E.g.:
    //
    //            NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1 -.                1                         0->1
    //  Load2  | /Unsafe/       0                         1
    //  Store3 -'              -1                         1->0
    //  Load4                   0                         0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this
    // case we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction; catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside.  These partitions can be out of
    // order from the original program order.  This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return fail("cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("too many SCEV run-time checks needed");

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple, have an empty preheader before we version or
    // clone the loop.  (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);
    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();
    }

    // Create identical copies of the original loop for each partition and
    // hook them up sequentially.
    Partitions.cloneLoops();

    // Now remove the instructions from each loop that don't belong to its
    // partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify();
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    // Report the success.
    emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(),
                           "distributed loop");
    return true;
  }

  /// \brief Provide diagnostics, then \return false.
  bool fail(llvm::StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With -Rpass-missed, report that distribution failed.
    emitOptimizationRemarkMissed(
        Ctx, LDIST_NAME, *F, L->getStartLoc(),
        "loop not distributed: use -Rpass-analysis=loop-distribute for more "
        "info");

    // With -Rpass-analysis, report why.  This is on by default if
    // distribution was requested explicitly.
    emitOptimizationRemarkAnalysis(
        Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint
                    : LDIST_NAME,
        *F, L->getStartLoc(), Twine("loop not distributed: ") + Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// \brief Returns whether distribution was forced to be enabled/disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was
  /// forced to be enabled (true) or disabled (false).  If the optional has no
  /// value distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers.  Partition
  /// number -1 means that the pointer is used in multiple partitions.  In
  /// this case we can't safely omit the check.
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    std::copy_if(AllChecks.begin(), AllChecks.end(),
                 std::back_inserter(Checks),
                 [&](const RuntimePointerChecking::PointerCheck &Check) {
                   for (unsigned PtrIdx1 : Check.first->Members)
                     for (unsigned PtrIdx2 : Check.second->Members)
                       // Only include this check if there is a pair of
                       // pointers that require checking and the pointers
                       // fall into separate partitions.
                       //
                       // (Note that we already know at this point that the
                       // two pointer groups need checking but it doesn't
                       // follow that each pair of pointers within the two
                       // groups need checking as well.
                       //
                       // In other words we don't want to include a check
                       // just because there is a pair of pointers between
                       // the two pointer groups that require checks and a
                       // different pair whose pointers fall into different
                       // partitions.)
                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                           !RuntimePointerChecking::arePointersInSamePartition(
                               PtrToPartition, PtrIdx1, PtrIdx2))
                         return true;
                   return false;
                 });

    return Checks;
  }

  /// \brief Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }
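  // The metadata checked above looks like this in the IR (illustrative; the
  // loop ID metadata is attached to the loop's latch branch):
  //
  //   br i1 %cond, label %header, label %exit, !llvm.loop !0
  //   ...
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.distribute.enable", i1 true}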
  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI;
  DominatorTree *DT;
  ScalarEvolution *SE;

  /// \brief Indicates whether distribution is forced to be enabled/disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was
  /// forced to be enabled (true) or disabled (false).  If the optional has no
  /// value distribution was not forced either way.
  Optional<bool> IsForced;
};

/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
  /// \p ProcessAllLoopsByDefault specifies whether loop distribution should
  /// be performed by default.  Pass -enable-loop-distribute={0,1} overrides
  /// this default.  We use this to keep LoopDistribution off by default when
  /// invoked from the optimization pipeline but on when invoked explicitly
  /// from opt.
  LoopDistribute(bool ProcessAllLoopsByDefault = true)
      : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
    // The default is set by the caller.
    if (EnableLoopDistribute.getNumOccurrences() > 0)
      ProcessAllLoops = EnableLoopDistribute;
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();

    // Build up a worklist of inner-most loops to distribute.  This is
    // necessary as the act of distributing a loop creates new loops and can
    // invalidate iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *TopLevelLoop : *LI)
      for (Loop *L : depth_first(TopLevelLoop))
        // We only handle inner-most loops.
        if (L->empty())
          Worklist.push_back(L);

    // Now walk the identified inner loops.
    bool Changed = false;
    for (Loop *L : Worklist) {
      LoopDistributeForLoop LDL(L, &F, LI, DT, SE);

      // If distribution was forced for the specific loop to be
      // enabled/disabled, follow that.  Otherwise use the global flag.
      if (LDL.isForced().getValueOr(ProcessAllLoops))
        Changed |= LDL.processLoop(LAA);
    }

    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  static char ID;

private:
  /// \brief Whether distribution should be on in this function.  The per-loop
  /// pragma can override this.
  bool ProcessAllLoops;
};
} // anonymous namespace

char LoopDistribute::ID;
static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)

namespace llvm {
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
  return new LoopDistribute(ProcessAllLoopsByDefault);
}
}