1 //===- HexagonVectorLoopCarriedReuse.cpp ----------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass removes the computation of provably redundant expressions that have 11 // been computed earlier in a previous iteration. It relies on the use of PHIs 12 // to identify loop carried dependences. This is scalar replacement for vector 13 // types. 14 // 15 //----------------------------------------------------------------------------- 16 // Motivation: Consider the case where we have the following loop structure. 17 // 18 // Loop: 19 // t0 = a[i]; 20 // t1 = f(t0); 21 // t2 = g(t1); 22 // ... 23 // t3 = a[i+1]; 24 // t4 = f(t3); 25 // t5 = g(t4); 26 // t6 = op(t2, t5) 27 // cond_branch <Loop> 28 // 29 // This can be converted to 30 // t00 = a[0]; 31 // t10 = f(t00); 32 // t20 = g(t10); 33 // Loop: 34 // t2 = t20; 35 // t3 = a[i+1]; 36 // t4 = f(t3); 37 // t5 = g(t4); 38 // t6 = op(t2, t5) 39 // t20 = t5 40 // cond_branch <Loop> 41 // 42 // SROA does a good job of reusing a[i+1] as a[i] in the next iteration. 43 // Such a loop comes to this pass in the following form. 44 // 45 // LoopPreheader: 46 // X0 = a[0]; 47 // Loop: 48 // X2 = PHI<(X0, LoopPreheader), (X1, Loop)> 49 // t1 = f(X2) <-- I1 50 // t2 = g(t1) 51 // ... 52 // X1 = a[i+1] 53 // t4 = f(X1) <-- I2 54 // t5 = g(t4) 55 // t6 = op(t2, t5) 56 // cond_branch <Loop> 57 // 58 // In this pass, we look for PHIs such as X2 whose incoming values come only 59 // from the Loop Preheader and over the backedge and additionaly, both these 60 // values are the results of the same operation in terms of opcode. We call such 61 // a PHI node a dependence chain or DepChain. In this case, the dependence of X2 62 // over X1 is carried over only one iteration and so the DepChain is only one 63 // PHI node long. 64 // 65 // Then, we traverse the uses of the PHI (X2) and the uses of the value of the 66 // PHI coming over the backedge (X1). We stop at the first pair of such users 67 // I1 (of X2) and I2 (of X1) that meet the following conditions. 68 // 1. I1 and I2 are the same operation, but with different operands. 69 // 2. X2 and X1 are used at the same operand number in the two instructions. 70 // 3. All other operands Op1 of I1 and Op2 of I2 are also such that there is a 71 // a DepChain from Op1 to Op2 of the same length as that between X2 and X1. 72 // 73 // We then make the following transformation 74 // LoopPreheader: 75 // X0 = a[0]; 76 // Y0 = f(X0); 77 // Loop: 78 // X2 = PHI<(X0, LoopPreheader), (X1, Loop)> 79 // Y2 = PHI<(Y0, LoopPreheader), (t4, Loop)> 80 // t1 = f(X2) <-- Will be removed by DCE. 81 // t2 = g(Y2) 82 // ... 83 // X1 = a[i+1] 84 // t4 = f(X1) 85 // t5 = g(t4) 86 // t6 = op(t2, t5) 87 // cond_branch <Loop> 88 // 89 // We proceed until we cannot find any more such instructions I1 and I2. 90 // 91 // --- DepChains & Loop carried dependences --- 92 // Consider a single basic block loop such as 93 // 94 // LoopPreheader: 95 // X0 = ... 96 // Y0 = ... 97 // Loop: 98 // X2 = PHI<(X0, LoopPreheader), (X1, Loop)> 99 // Y2 = PHI<(Y0, LoopPreheader), (X2, Loop)> 100 // ... 101 // X1 = ... 102 // ... 103 // cond_branch <Loop> 104 // 105 // Then there is a dependence between X2 and X1 that goes back one iteration, 106 // i.e. X1 is used as X2 in the very next iteration. We represent this as a 107 // DepChain from X2 to X1 (X2->X1). 108 // Similarly, there is a dependence between Y2 and X1 that goes back two 109 // iterations. X1 is used as Y2 two iterations after it is computed. This is 110 // represented by a DepChain as (Y2->X2->X1). 111 // 112 // A DepChain has the following properties. 113 // 1. Num of edges in DepChain = Number of Instructions in DepChain = Number of 114 // iterations of carried dependence + 1. 115 // 2. All instructions in the DepChain except the last are PHIs. 116 // 117 //===----------------------------------------------------------------------===// 118 119 #include "llvm/ADT/SetVector.h" 120 #include "llvm/ADT/SmallVector.h" 121 #include "llvm/ADT/Statistic.h" 122 #include "llvm/Analysis/LoopInfo.h" 123 #include "llvm/Analysis/LoopPass.h" 124 #include "llvm/IR/BasicBlock.h" 125 #include "llvm/IR/DerivedTypes.h" 126 #include "llvm/IR/IRBuilder.h" 127 #include "llvm/IR/Instruction.h" 128 #include "llvm/IR/Instructions.h" 129 #include "llvm/IR/IntrinsicInst.h" 130 #include "llvm/IR/Intrinsics.h" 131 #include "llvm/IR/Use.h" 132 #include "llvm/IR/User.h" 133 #include "llvm/IR/Value.h" 134 #include "llvm/Pass.h" 135 #include "llvm/Support/Casting.h" 136 #include "llvm/Support/CommandLine.h" 137 #include "llvm/Support/Compiler.h" 138 #include "llvm/Support/Debug.h" 139 #include "llvm/Support/raw_ostream.h" 140 #include "llvm/Transforms/Scalar.h" 141 #include "llvm/Transforms/Utils.h" 142 #include <algorithm> 143 #include <cassert> 144 #include <cstddef> 145 #include <map> 146 #include <memory> 147 #include <set> 148 149 using namespace llvm; 150 151 #define DEBUG_TYPE "hexagon-vlcr" 152 153 STATISTIC(HexagonNumVectorLoopCarriedReuse, 154 "Number of values that were reused from a previous iteration."); 155 156 static cl::opt<int> HexagonVLCRIterationLim("hexagon-vlcr-iteration-lim", 157 cl::Hidden, 158 cl::desc("Maximum distance of loop carried dependences that are handled"), 159 cl::init(2), cl::ZeroOrMore); 160 161 namespace llvm { 162 163 void initializeHexagonVectorLoopCarriedReusePass(PassRegistry&); 164 Pass *createHexagonVectorLoopCarriedReusePass(); 165 166 } // end namespace llvm 167 168 namespace { 169 170 // See info about DepChain in the comments at the top of this file. 171 using ChainOfDependences = SmallVector<Instruction *, 4>; 172 173 class DepChain { 174 ChainOfDependences Chain; 175 176 public: 177 bool isIdentical(DepChain &Other) const { 178 if (Other.size() != size()) 179 return false; 180 ChainOfDependences &OtherChain = Other.getChain(); 181 for (int i = 0; i < size(); ++i) { 182 if (Chain[i] != OtherChain[i]) 183 return false; 184 } 185 return true; 186 } 187 188 ChainOfDependences &getChain() { 189 return Chain; 190 } 191 192 int size() const { 193 return Chain.size(); 194 } 195 196 void clear() { 197 Chain.clear(); 198 } 199 200 void push_back(Instruction *I) { 201 Chain.push_back(I); 202 } 203 204 int iterations() const { 205 return size() - 1; 206 } 207 208 Instruction *front() const { 209 return Chain.front(); 210 } 211 212 Instruction *back() const { 213 return Chain.back(); 214 } 215 216 Instruction *&operator[](const int index) { 217 return Chain[index]; 218 } 219 220 friend raw_ostream &operator<< (raw_ostream &OS, const DepChain &D); 221 }; 222 223 LLVM_ATTRIBUTE_UNUSED 224 raw_ostream &operator<<(raw_ostream &OS, const DepChain &D) { 225 const ChainOfDependences &CD = D.Chain; 226 int ChainSize = CD.size(); 227 OS << "**DepChain Start::**\n"; 228 for (int i = 0; i < ChainSize -1; ++i) { 229 OS << *(CD[i]) << " -->\n"; 230 } 231 OS << *CD[ChainSize-1] << "\n"; 232 return OS; 233 } 234 235 struct ReuseValue { 236 Instruction *Inst2Replace = nullptr; 237 238 // In the new PHI node that we'll construct this is the value that'll be 239 // used over the backedge. This is teh value that gets reused from a 240 // previous iteration. 241 Instruction *BackedgeInst = nullptr; 242 243 ReuseValue() = default; 244 245 void reset() { Inst2Replace = nullptr; BackedgeInst = nullptr; } 246 bool isDefined() { return Inst2Replace != nullptr; } 247 }; 248 249 LLVM_ATTRIBUTE_UNUSED 250 raw_ostream &operator<<(raw_ostream &OS, const ReuseValue &RU) { 251 OS << "** ReuseValue ***\n"; 252 OS << "Instruction to Replace: " << *(RU.Inst2Replace) << "\n"; 253 OS << "Backedge Instruction: " << *(RU.BackedgeInst) << "\n"; 254 return OS; 255 } 256 257 class HexagonVectorLoopCarriedReuse : public LoopPass { 258 public: 259 static char ID; 260 261 explicit HexagonVectorLoopCarriedReuse() : LoopPass(ID) { 262 PassRegistry *PR = PassRegistry::getPassRegistry(); 263 initializeHexagonVectorLoopCarriedReusePass(*PR); 264 } 265 266 StringRef getPassName() const override { 267 return "Hexagon-specific loop carried reuse for HVX vectors"; 268 } 269 270 void getAnalysisUsage(AnalysisUsage &AU) const override { 271 AU.addRequired<LoopInfoWrapperPass>(); 272 AU.addRequiredID(LoopSimplifyID); 273 AU.addRequiredID(LCSSAID); 274 AU.addPreservedID(LCSSAID); 275 AU.setPreservesCFG(); 276 } 277 278 bool runOnLoop(Loop *L, LPPassManager &LPM) override; 279 280 private: 281 SetVector<DepChain *> Dependences; 282 std::set<Instruction *> ReplacedInsts; 283 Loop *CurLoop; 284 ReuseValue ReuseCandidate; 285 286 bool doVLCR(); 287 void findLoopCarriedDeps(); 288 void findValueToReuse(); 289 void findDepChainFromPHI(Instruction *I, DepChain &D); 290 void reuseValue(); 291 Value *findValueInBlock(Value *Op, BasicBlock *BB); 292 bool isDepChainBtwn(Instruction *I1, Instruction *I2, int Iters); 293 DepChain *getDepChainBtwn(Instruction *I1, Instruction *I2); 294 bool isEquivalentOperation(Instruction *I1, Instruction *I2); 295 bool canReplace(Instruction *I); 296 }; 297 298 } // end anonymous namespace 299 300 char HexagonVectorLoopCarriedReuse::ID = 0; 301 302 INITIALIZE_PASS_BEGIN(HexagonVectorLoopCarriedReuse, "hexagon-vlcr", 303 "Hexagon-specific predictive commoning for HVX vectors", false, false) 304 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 305 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 306 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) 307 INITIALIZE_PASS_END(HexagonVectorLoopCarriedReuse, "hexagon-vlcr", 308 "Hexagon-specific predictive commoning for HVX vectors", false, false) 309 310 bool HexagonVectorLoopCarriedReuse::runOnLoop(Loop *L, LPPassManager &LPM) { 311 if (skipLoop(L)) 312 return false; 313 314 if (!L->getLoopPreheader()) 315 return false; 316 317 // Work only on innermost loops. 318 if (!L->getSubLoops().empty()) 319 return false; 320 321 // Work only on single basic blocks loops. 322 if (L->getNumBlocks() != 1) 323 return false; 324 325 CurLoop = L; 326 327 return doVLCR(); 328 } 329 330 bool HexagonVectorLoopCarriedReuse::isEquivalentOperation(Instruction *I1, 331 Instruction *I2) { 332 if (!I1->isSameOperationAs(I2)) 333 return false; 334 // This check is in place specifically for intrinsics. isSameOperationAs will 335 // return two for any two hexagon intrinsics because they are essentially the 336 // same instruciton (CallInst). We need to scratch the surface to see if they 337 // are calls to the same function. 338 if (CallInst *C1 = dyn_cast<CallInst>(I1)) { 339 if (CallInst *C2 = dyn_cast<CallInst>(I2)) { 340 if (C1->getCalledFunction() != C2->getCalledFunction()) 341 return false; 342 } 343 } 344 345 // If both the Instructions are of Vector Type and any of the element 346 // is integer constant, check their values too for equivalence. 347 if (I1->getType()->isVectorTy() && I2->getType()->isVectorTy()) { 348 unsigned NumOperands = I1->getNumOperands(); 349 for (unsigned i = 0; i < NumOperands; ++i) { 350 ConstantInt *C1 = dyn_cast<ConstantInt>(I1->getOperand(i)); 351 ConstantInt *C2 = dyn_cast<ConstantInt>(I2->getOperand(i)); 352 if(!C1) continue; 353 assert(C2); 354 if (C1->getSExtValue() != C2->getSExtValue()) 355 return false; 356 } 357 } 358 359 return true; 360 } 361 362 bool HexagonVectorLoopCarriedReuse::canReplace(Instruction *I) { 363 const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); 364 if (II && 365 (II->getIntrinsicID() == Intrinsic::hexagon_V6_hi || 366 II->getIntrinsicID() == Intrinsic::hexagon_V6_lo)) { 367 LLVM_DEBUG(dbgs() << "Not considering for reuse: " << *II << "\n"); 368 return false; 369 } 370 return true; 371 } 372 void HexagonVectorLoopCarriedReuse::findValueToReuse() { 373 for (auto *D : Dependences) { 374 LLVM_DEBUG(dbgs() << "Processing dependence " << *(D->front()) << "\n"); 375 if (D->iterations() > HexagonVLCRIterationLim) { 376 LLVM_DEBUG( 377 dbgs() 378 << ".. Skipping because number of iterations > than the limit\n"); 379 continue; 380 } 381 382 PHINode *PN = cast<PHINode>(D->front()); 383 Instruction *BEInst = D->back(); 384 int Iters = D->iterations(); 385 BasicBlock *BB = PN->getParent(); 386 LLVM_DEBUG(dbgs() << "Checking if any uses of " << *PN 387 << " can be reused\n"); 388 389 SmallVector<Instruction *, 4> PNUsers; 390 for (auto UI = PN->use_begin(), E = PN->use_end(); UI != E; ++UI) { 391 Use &U = *UI; 392 Instruction *User = cast<Instruction>(U.getUser()); 393 394 if (User->getParent() != BB) 395 continue; 396 if (ReplacedInsts.count(User)) { 397 LLVM_DEBUG(dbgs() << *User 398 << " has already been replaced. Skipping...\n"); 399 continue; 400 } 401 if (isa<PHINode>(User)) 402 continue; 403 if (User->mayHaveSideEffects()) 404 continue; 405 if (!canReplace(User)) 406 continue; 407 408 PNUsers.push_back(User); 409 } 410 LLVM_DEBUG(dbgs() << PNUsers.size() << " use(s) of the PHI in the block\n"); 411 412 // For each interesting use I of PN, find an Instruction BEUser that 413 // performs the same operation as I on BEInst and whose other operands, 414 // if any, can also be rematerialized in OtherBB. We stop when we find the 415 // first such Instruction BEUser. This is because once BEUser is 416 // rematerialized in OtherBB, we may find more such "fixup" opportunities 417 // in this block. So, we'll start over again. 418 for (Instruction *I : PNUsers) { 419 for (auto UI = BEInst->use_begin(), E = BEInst->use_end(); UI != E; 420 ++UI) { 421 Use &U = *UI; 422 Instruction *BEUser = cast<Instruction>(U.getUser()); 423 424 if (BEUser->getParent() != BB) 425 continue; 426 if (!isEquivalentOperation(I, BEUser)) 427 continue; 428 429 int NumOperands = I->getNumOperands(); 430 431 for (int OpNo = 0; OpNo < NumOperands; ++OpNo) { 432 Value *Op = I->getOperand(OpNo); 433 Instruction *OpInst = dyn_cast<Instruction>(Op); 434 if (!OpInst) 435 continue; 436 437 Value *BEOp = BEUser->getOperand(OpNo); 438 Instruction *BEOpInst = dyn_cast<Instruction>(BEOp); 439 440 if (!isDepChainBtwn(OpInst, BEOpInst, Iters)) { 441 BEUser = nullptr; 442 break; 443 } 444 } 445 if (BEUser) { 446 LLVM_DEBUG(dbgs() << "Found Value for reuse.\n"); 447 ReuseCandidate.Inst2Replace = I; 448 ReuseCandidate.BackedgeInst = BEUser; 449 return; 450 } else 451 ReuseCandidate.reset(); 452 } 453 } 454 } 455 ReuseCandidate.reset(); 456 } 457 458 Value *HexagonVectorLoopCarriedReuse::findValueInBlock(Value *Op, 459 BasicBlock *BB) { 460 PHINode *PN = dyn_cast<PHINode>(Op); 461 assert(PN); 462 Value *ValueInBlock = PN->getIncomingValueForBlock(BB); 463 return ValueInBlock; 464 } 465 466 void HexagonVectorLoopCarriedReuse::reuseValue() { 467 LLVM_DEBUG(dbgs() << ReuseCandidate); 468 Instruction *Inst2Replace = ReuseCandidate.Inst2Replace; 469 Instruction *BEInst = ReuseCandidate.BackedgeInst; 470 int NumOperands = Inst2Replace->getNumOperands(); 471 std::map<Instruction *, DepChain *> DepChains; 472 int Iterations = -1; 473 BasicBlock *LoopPH = CurLoop->getLoopPreheader(); 474 475 for (int i = 0; i < NumOperands; ++i) { 476 Instruction *I = dyn_cast<Instruction>(Inst2Replace->getOperand(i)); 477 if(!I) 478 continue; 479 else { 480 Instruction *J = cast<Instruction>(BEInst->getOperand(i)); 481 DepChain *D = getDepChainBtwn(I, J); 482 483 assert(D && 484 "No DepChain between corresponding operands in ReuseCandidate\n"); 485 if (Iterations == -1) 486 Iterations = D->iterations(); 487 assert(Iterations == D->iterations() && "Iterations mismatch"); 488 DepChains[I] = D; 489 } 490 } 491 492 LLVM_DEBUG(dbgs() << "reuseValue is making the following changes\n"); 493 494 SmallVector<Instruction *, 4> InstsInPreheader; 495 for (int i = 0; i < Iterations; ++i) { 496 Instruction *InstInPreheader = Inst2Replace->clone(); 497 SmallVector<Value *, 4> Ops; 498 for (int j = 0; j < NumOperands; ++j) { 499 Instruction *I = dyn_cast<Instruction>(Inst2Replace->getOperand(j)); 500 if (!I) 501 continue; 502 // Get the DepChain corresponding to this operand. 503 DepChain &D = *DepChains[I]; 504 // Get the PHI for the iteration number and find 505 // the incoming value from the Loop Preheader for 506 // that PHI. 507 Value *ValInPreheader = findValueInBlock(D[i], LoopPH); 508 InstInPreheader->setOperand(j, ValInPreheader); 509 } 510 InstsInPreheader.push_back(InstInPreheader); 511 InstInPreheader->setName(Inst2Replace->getName() + ".hexagon.vlcr"); 512 InstInPreheader->insertBefore(LoopPH->getTerminator()); 513 LLVM_DEBUG(dbgs() << "Added " << *InstInPreheader << " to " 514 << LoopPH->getName() << "\n"); 515 } 516 BasicBlock *BB = BEInst->getParent(); 517 IRBuilder<> IRB(BB); 518 IRB.SetInsertPoint(BB->getFirstNonPHI()); 519 Value *BEVal = BEInst; 520 PHINode *NewPhi; 521 for (int i = Iterations-1; i >=0 ; --i) { 522 Instruction *InstInPreheader = InstsInPreheader[i]; 523 NewPhi = IRB.CreatePHI(InstInPreheader->getType(), 2); 524 NewPhi->addIncoming(InstInPreheader, LoopPH); 525 NewPhi->addIncoming(BEVal, BB); 526 LLVM_DEBUG(dbgs() << "Adding " << *NewPhi << " to " << BB->getName() 527 << "\n"); 528 BEVal = NewPhi; 529 } 530 // We are in LCSSA form. So, a value defined inside the Loop is used only 531 // inside the loop. So, the following is safe. 532 Inst2Replace->replaceAllUsesWith(NewPhi); 533 ReplacedInsts.insert(Inst2Replace); 534 ++HexagonNumVectorLoopCarriedReuse; 535 } 536 537 bool HexagonVectorLoopCarriedReuse::doVLCR() { 538 assert(CurLoop->getSubLoops().empty() && 539 "Can do VLCR on the innermost loop only"); 540 assert((CurLoop->getNumBlocks() == 1) && 541 "Can do VLCR only on single block loops"); 542 543 bool Changed = false; 544 bool Continue; 545 546 LLVM_DEBUG(dbgs() << "Working on Loop: " << *CurLoop->getHeader() << "\n"); 547 do { 548 // Reset datastructures. 549 Dependences.clear(); 550 Continue = false; 551 552 findLoopCarriedDeps(); 553 findValueToReuse(); 554 if (ReuseCandidate.isDefined()) { 555 reuseValue(); 556 Changed = true; 557 Continue = true; 558 } 559 llvm::for_each(Dependences, std::default_delete<DepChain>()); 560 } while (Continue); 561 return Changed; 562 } 563 564 void HexagonVectorLoopCarriedReuse::findDepChainFromPHI(Instruction *I, 565 DepChain &D) { 566 PHINode *PN = dyn_cast<PHINode>(I); 567 if (!PN) { 568 D.push_back(I); 569 return; 570 } else { 571 auto NumIncomingValues = PN->getNumIncomingValues(); 572 if (NumIncomingValues != 2) { 573 D.clear(); 574 return; 575 } 576 577 BasicBlock *BB = PN->getParent(); 578 if (BB != CurLoop->getHeader()) { 579 D.clear(); 580 return; 581 } 582 583 Value *BEVal = PN->getIncomingValueForBlock(BB); 584 Instruction *BEInst = dyn_cast<Instruction>(BEVal); 585 // This is a single block loop with a preheader, so at least 586 // one value should come over the backedge. 587 assert(BEInst && "There should be a value over the backedge"); 588 589 Value *PreHdrVal = 590 PN->getIncomingValueForBlock(CurLoop->getLoopPreheader()); 591 if(!PreHdrVal || !isa<Instruction>(PreHdrVal)) { 592 D.clear(); 593 return; 594 } 595 D.push_back(PN); 596 findDepChainFromPHI(BEInst, D); 597 } 598 } 599 600 bool HexagonVectorLoopCarriedReuse::isDepChainBtwn(Instruction *I1, 601 Instruction *I2, 602 int Iters) { 603 for (auto *D : Dependences) { 604 if (D->front() == I1 && D->back() == I2 && D->iterations() == Iters) 605 return true; 606 } 607 return false; 608 } 609 610 DepChain *HexagonVectorLoopCarriedReuse::getDepChainBtwn(Instruction *I1, 611 Instruction *I2) { 612 for (auto *D : Dependences) { 613 if (D->front() == I1 && D->back() == I2) 614 return D; 615 } 616 return nullptr; 617 } 618 619 void HexagonVectorLoopCarriedReuse::findLoopCarriedDeps() { 620 BasicBlock *BB = CurLoop->getHeader(); 621 for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) { 622 auto *PN = cast<PHINode>(I); 623 if (!isa<VectorType>(PN->getType())) 624 continue; 625 626 DepChain *D = new DepChain(); 627 findDepChainFromPHI(PN, *D); 628 if (D->size() != 0) 629 Dependences.insert(D); 630 else 631 delete D; 632 } 633 LLVM_DEBUG(dbgs() << "Found " << Dependences.size() << " dependences\n"); 634 LLVM_DEBUG(for (size_t i = 0; i < Dependences.size(); 635 ++i) { dbgs() << *Dependences[i] << "\n"; }); 636 } 637 638 Pass *llvm::createHexagonVectorLoopCarriedReusePass() { 639 return new HexagonVectorLoopCarriedReuse(); 640 } 641