//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs various transformations related to eliminating memcpy
// calls, or transforming sets of stores into memset's.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "memcpyopt"

STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");

static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
                                  bool &VariableIdxFound,
                                  const DataLayout &DL) {
  // Skip over the first indices.
  gep_type_iterator GTI = gep_type_begin(GEP);
  for (unsigned i = 1; i != Idx; ++i, ++GTI)
    /*skip along*/;

  // Compute the offset implied by the rest of the indices.
  int64_t Offset = 0;
  for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
    if (!OpC)
      return VariableIdxFound = true;
    if (OpC->isZero()) continue;  // No offset.

    // Handle struct indices, which add their field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
      continue;
    }

    // Otherwise, we have a sequential type like an array or vector.  Multiply
    // the index by the element size.
    uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
    Offset += Size*OpC->getSExtValue();
  }

  return Offset;
}

/// Return true if Ptr2 is provably equal to Ptr1 plus a constant offset, and
/// return that constant offset.  For example, Ptr1 might be &A[42], and Ptr2
/// might be &A[40].  In this case the offset would be -8 (assuming 4-byte
/// array elements).
static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
                            const DataLayout &DL) {
  Ptr1 = Ptr1->stripPointerCasts();
  Ptr2 = Ptr2->stripPointerCasts();

  // Handle the trivial case first.
  if (Ptr1 == Ptr2) {
    Offset = 0;
    return true;
  }

  GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
  GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);

  bool VariableIdxFound = false;

  // If one pointer is a GEP and the other isn't, then see if the GEP is a
  // constant offset from the base, as in "P" and "gep P, 1".
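  // For instance (illustrative IR; the names are hypothetical):
  //   %q = getelementptr i8, i8* %p, i64 4
  // With Ptr1 == %q and Ptr2 == %p, GetOffsetFromIndex returns 4, so the
  // reported offset is -4, i.e. Ptr2 == Ptr1 + (-4).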
  if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
    Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL);
    return !VariableIdxFound;
  }

  if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
    Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL);
    return !VariableIdxFound;
  }

  // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an
  // identical base.  After that base, they may have some number of common (and
  // potentially variable) indices.  Any remaining indices must be constant,
  // and those constants determine the pointers' offset from each other.  We
  // handle no other case at this point.
  if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
    return false;

  // Skip any common indices and track the GEP types.
  unsigned Idx = 1;
  for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
    if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
      break;

  int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL);
  int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL);
  if (VariableIdxFound) return false;

  Offset = Offset2-Offset1;
  return true;
}


/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
///   store 0 -> P+1
///   store 0 -> P+0
///   store 0 -> P+3
///   store 0 -> P+2
/// which sometimes happens with stores to arrays of structs etc.  When we see
/// the first store, we make a range [1, 2).  The second store extends the
/// range to [0, 2).  The third makes a new range [3, 4).  The fourth store
/// joins the two ranges into [0, 4), which is memset'able.
namespace {
struct MemsetRange {
  // Start/End - A half-open range that describes the span this range covers.
  // The range is closed at the start and open at the end: [Start, End).
  int64_t Start, End;

  /// StartPtr - The getelementptr instruction that points to the start of the
  /// range.
  Value *StartPtr;

  /// Alignment - The known alignment of the first store.
  unsigned Alignment;

  /// TheStores - The actual stores that make up this range.
  SmallVector<Instruction*, 16> TheStores;

  bool isProfitableToUseMemset(const DataLayout &DL) const;
};
} // end anon namespace

bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
  // If we found at least 4 stores to merge, or the range covers 16 bytes or
  // more, use memset.
  if (TheStores.size() >= 4 || End-Start >= 16) return true;

  // If there is nothing to merge, don't do anything.
  if (TheStores.size() < 2) return false;

  // If any of the stores are a memset, then it is always good to extend the
  // memset.
  for (Instruction *SI : TheStores)
    if (!isa<StoreInst>(SI))
      return true;

  // Assume that the code generator is capable of merging pairs of stores
  // together if it wants to.
  if (TheStores.size() == 2) return false;

  // If we have fewer than 8 stores, it can still be worthwhile to do this.
  // For example, merging 4 i8 stores into an i32 store is useful almost
  // always.  However, merging 2 32-bit stores isn't useful on a 32-bit
  // architecture (the memset will be split into 2 32-bit stores anyway) and
  // doing so can pessimize the llvm optimizer.
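  //
  // Worked example for the heuristic below (hypothetical sizes): with an
  // 8-byte largest legal integer, an i32 store plus two i16 stores covering 8
  // contiguous bytes could be done as a single word-sized store, and 3 > 1,
  // so the memset is emitted.  With a 4-byte largest legal integer, three i32
  // stores covering 12 bytes would still need three word-sized stores, and
  // 3 > 3 fails, so they are left alone.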
  //
  // Since we don't have perfect knowledge here, make some assumptions: assume
  // the maximum GPR width is the same size as the largest legal integer
  // size.  If so, check to see whether we will end up actually reducing the
  // number of stores used.
  unsigned Bytes = unsigned(End-Start);
  unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
  if (MaxIntSize == 0)
    MaxIntSize = 1;
  unsigned NumPointerStores = Bytes / MaxIntSize;

  // Assume the remaining bytes, if any, are stored a byte at a time.
  unsigned NumByteStores = Bytes % MaxIntSize;

  // If we will reduce the # stores (according to this heuristic), do the
  // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
  // etc.
  return TheStores.size() > NumPointerStores+NumByteStores;
}


namespace {
class MemsetRanges {
  /// A sorted list of the memset ranges.
  SmallVector<MemsetRange, 8> Ranges;
  typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
  const DataLayout &DL;
public:
  MemsetRanges(const DataLayout &DL) : DL(DL) {}

  typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;
  const_iterator begin() const { return Ranges.begin(); }
  const_iterator end() const { return Ranges.end(); }
  bool empty() const { return Ranges.empty(); }

  void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
      addStore(OffsetFromFirst, SI);
    else
      addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
  }

  void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
    int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());

    addRange(OffsetFromFirst, StoreSize,
             SI->getPointerOperand(), SI->getAlignment(), SI);
  }

  void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
    int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
  }

  void addRange(int64_t Start, int64_t Size, Value *Ptr,
                unsigned Alignment, Instruction *Inst);

};

} // end anon namespace


/// Add a new store to the MemsetRanges data structure.  This adds a
/// new range for the specified store at the specified offset, merging into
/// existing ranges as appropriate.
void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                            unsigned Alignment, Instruction *Inst) {
  int64_t End = Start+Size;

  range_iterator I = std::lower_bound(
      Ranges.begin(), Ranges.end(), Start,
      [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });

  // We now know that I == E, in which case we didn't find anything to merge
  // with, or that Start <= I->End.  If End < I->Start or I == E, then we need
  // to insert a new range.  Handle this now.
  if (I == Ranges.end() || End < I->Start) {
    MemsetRange &R = *Ranges.insert(I, MemsetRange());
    R.Start = Start;
    R.End = End;
    R.StartPtr = Ptr;
    R.Alignment = Alignment;
    R.TheStores.push_back(Inst);
    return;
  }

  // This store overlaps with I, add it.
  I->TheStores.push_back(Inst);

  // At this point, we may have an interval that completely contains our store.
  // If so, just add it to the interval and return.
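  // (For example, an existing range [0, 8) completely contains a new 4-byte
  // store at offset 2, i.e. [2, 6), and no range bounds need updating.)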
  if (I->Start <= Start && I->End >= End)
    return;

  // Now we know that Start <= I->End and End >= I->Start, so the store
  // overlaps the range but is not entirely contained within it.

  // See if this store extends the start of the range.  In this case, it
  // couldn't possibly cause it to join the prior range, because otherwise we
  // would have stopped on *it*.
  if (Start < I->Start) {
    I->Start = Start;
    I->StartPtr = Ptr;
    I->Alignment = Alignment;
  }

  // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
  // is in or right at the end of I) and that End >= I->Start.  Extend I out to
  // End.
  if (End > I->End) {
    I->End = End;
    range_iterator NextI = I;
    while (++NextI != Ranges.end() && End >= NextI->Start) {
      // Merge the range in.
      I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
      if (NextI->End > I->End)
        I->End = NextI->End;
      Ranges.erase(NextI);
      NextI = I;
    }
  }
}

//===----------------------------------------------------------------------===//
//                         MemCpyOptLegacyPass Pass
//===----------------------------------------------------------------------===//

namespace {
class MemCpyOptLegacyPass : public FunctionPass {
  MemCpyOptPass Impl;
public:
  static char ID; // Pass identification, replacement for typeid
  MemCpyOptLegacyPass() : FunctionPass(ID) {
    initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

private:
  // This transformation requires dominator info.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<MemoryDependenceWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<MemoryDependenceWrapperPass>();
  }

  // Helper functions
  bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
  bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
  bool processMemCpy(MemCpyInst *M);
  bool processMemMove(MemMoveInst *M);
  bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
                            uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
  bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
  bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
  bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
  bool processByValArgument(CallSite CS, unsigned ArgNo);
  Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                    Value *ByteVal);

  bool iterateOnFunction(Function &F);
};

char MemCpyOptLegacyPass::ID = 0;
}

/// The public interface to this file...
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }

INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
                    false, false)

/// When scanning forward over instructions, we look for some other patterns to
/// fold away.  In particular, this looks for stores to neighboring locations
/// of memory.  If it sees enough consecutive ones, it attempts to merge them
/// together into a memcpy/memset.
Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
                                                 Value *StartPtr,
                                                 Value *ByteVal) {
  const DataLayout &DL = StartInst->getModule()->getDataLayout();

  // Okay, so we now have a single store of a splattable value.  Scan to find
  // all subsequent stores of the same value to offsets from the same pointer.
  // Join these together into ranges, so we can decide whether contiguous
  // blocks are stored.
  MemsetRanges Ranges(DL);

  BasicBlock::iterator BI(StartInst);
  for (++BI; !isa<TerminatorInst>(BI); ++BI) {
    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
      // If the instruction is readnone, ignore it, otherwise bail out.  We
      // don't even allow readonly here because we don't want something like:
      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
        break;
      continue;
    }

    if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
      // If this is a store, see if we can merge it in.
      if (!NextStore->isSimple()) break;

      // Check that the stored value is the same byte-splattable value.
      if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
        break;

      // Check that this store is to a constant offset from the start ptr.
      int64_t Offset;
      if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
                           DL))
        break;

      Ranges.addStore(Offset, NextStore);
    } else {
      MemSetInst *MSI = cast<MemSetInst>(BI);

      if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
          !isa<ConstantInt>(MSI->getLength()))
        break;

      // Check that this memset is to a constant offset from the start ptr.
      int64_t Offset;
      if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
        break;

      Ranges.addMemSet(Offset, MSI);
    }
  }

  // If we have no ranges, then we just had a single store with nothing that
  // could be merged in.  This is a very common case of course.
  if (Ranges.empty())
    return nullptr;

  // If we had at least one store that could be merged in, add the starting
  // store as well.  We try to avoid this unless there is at least something
  // interesting as a small compile-time optimization.
  Ranges.addInst(0, StartInst);

  // If we create any memsets, we put them right before the first instruction
  // that isn't part of the memset block.  This ensures that the memset is
  // dominated by any addressing instruction needed by the start of the block.
  IRBuilder<> Builder(&*BI);

  // Now that we have full information about ranges, loop over the ranges and
  // emit memset's for anything big enough to be worthwhile.
  Instruction *AMemSet = nullptr;
  for (const MemsetRange &Range : Ranges) {

    if (Range.TheStores.size() == 1) continue;

    // If it is profitable to lower this range to memset, do so now.
    if (!Range.isProfitableToUseMemset(DL))
      continue;

    // Otherwise, we do want to transform this!  Create a new memset.
    // Get the starting pointer of the block.
    StartPtr = Range.StartPtr;

    // Determine the alignment.
    unsigned Alignment = Range.Alignment;
    if (Alignment == 0) {
      Type *EltType =
          cast<PointerType>(StartPtr->getType())->getElementType();
      Alignment = DL.getABITypeAlignment(EltType);
    }

    AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start,
                                   Alignment);

    DEBUG(dbgs() << "Replace stores:\n";
          for (Instruction *SI : Range.TheStores)
            dbgs() << *SI << '\n';
          dbgs() << "With: " << *AMemSet << '\n');

    if (!Range.TheStores.empty())
      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());

    // Zap all the stores.
    for (Instruction *SI : Range.TheStores) {
      MD->removeInstruction(SI);
      SI->eraseFromParent();
    }
    ++NumMemSetInfer;
  }

  return AMemSet;
}

static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
                                    const LoadInst *LI) {
  unsigned StoreAlign = SI->getAlignment();
  if (!StoreAlign)
    StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
  unsigned LoadAlign = LI->getAlignment();
  if (!LoadAlign)
    LoadAlign = DL.getABITypeAlignment(LI->getType());

  return std::min(StoreAlign, LoadAlign);
}

// This method tries to lift a store instruction to just before position P.
// It lifts the store, the computation of its operands, and anything else that
// may alias with these.
// It returns true on success.
static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P) {
  // If the store would alias the position we are lifting to, bail out early.
  MemoryLocation StoreLoc = MemoryLocation::get(SI);
  if (AA.getModRefInfo(P, StoreLoc) != MRI_NoModRef)
    return false;

  // Keep track of the arguments of all instructions we plan to lift
  // so we can make sure to lift them as well if appropriate.
  DenseSet<Instruction*> Args;
  if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
    if (Ptr->getParent() == SI->getParent())
      Args.insert(Ptr);

  // Instructions to lift before P.
  SmallVector<Instruction*, 8> ToLift;

  // Memory locations of lifted instructions.
  SmallVector<MemoryLocation, 8> MemLocs;
  MemLocs.push_back(StoreLoc);

  // Lifted callsites.
  SmallVector<ImmutableCallSite, 8> CallSites;

  for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
    auto *C = &*I;

    bool MayAlias = AA.getModRefInfo(C) != MRI_NoModRef;

    bool NeedLift = false;
    if (Args.erase(C))
      NeedLift = true;
    else if (MayAlias) {
      NeedLift = std::any_of(MemLocs.begin(), MemLocs.end(),
                             [C, &AA](const MemoryLocation &ML) {
                               return AA.getModRefInfo(C, ML);
                             });

      if (!NeedLift)
        NeedLift = std::any_of(CallSites.begin(), CallSites.end(),
                               [C, &AA](const ImmutableCallSite &CS) {
                                 return AA.getModRefInfo(C, CS);
                               });
    }

    if (!NeedLift)
      continue;

    if (MayAlias) {
      if (auto CS = ImmutableCallSite(C)) {
        // If we can't lift this before P, it's game over.
        if (AA.getModRefInfo(P, CS) != MRI_NoModRef)
          return false;

        CallSites.push_back(CS);
      } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
        // If we can't lift this before P, it's game over.
        auto ML = MemoryLocation::get(C);
        if (AA.getModRefInfo(P, ML) != MRI_NoModRef)
          return false;

        MemLocs.push_back(ML);
      } else
        // We don't know how to lift this instruction.
        return false;
    }

    ToLift.push_back(C);
    for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
      if (auto *A = dyn_cast<Instruction>(C->getOperand(k)))
        if (A->getParent() == SI->getParent())
          Args.insert(A);
  }

  // We made it; lift everything we collected, in order.
  for (auto *I : reverse(ToLift)) {
    DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
    I->moveBefore(P);
  }

  return true;
}

bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
  if (!SI->isSimple()) return false;

  // Avoid merging nontemporal stores since the resulting
  // memcpy/memset would not be able to preserve the nontemporal hint.
  // In theory we could propagate the !nontemporal metadata to the memset
  // call.  However, that change would force the backend to
  // conservatively expand !nontemporal memset calls back to sequences of
  // store instructions (effectively undoing the merging).
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return false;

  const DataLayout &DL = SI->getModule()->getDataLayout();

  // Load to store forwarding can be interpreted as memcpy.
  if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
    if (LI->isSimple() && LI->hasOneUse() &&
        LI->getParent() == SI->getParent()) {

      auto *T = LI->getType();
      if (T->isAggregateType()) {
        AliasAnalysis &AA = LookupAliasAnalysis();
        MemoryLocation LoadLoc = MemoryLocation::get(LI);

        // We use alias analysis to check if an instruction may store to
        // the memory we load from in between the load and the store.  If
        // such an instruction is found, we try to promote there instead
        // of at the store position.
        Instruction *P = SI;
        for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
          if (AA.getModRefInfo(&I, LoadLoc) & MRI_Mod) {
            P = &I;
            break;
          }
        }

        // We found an instruction that may write to the loaded memory.
        // We can try to promote at this position instead of the store
        // position if nothing aliases the store memory after this and the
        // store destination is not in the range.
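        // For example (illustrative IR; %src and %dst are hypothetical):
        //   %v = load %struct.S, %struct.S* %src
        //   store %struct.S %v, %struct.S* %dst
        // is rewritten as a call to @llvm.memcpy (or @llvm.memmove when the
        // two locations may alias) of the struct's store size, emitted at
        // the chosen position P.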
        if (P && P != SI) {
          if (!moveUp(AA, SI, P))
            P = nullptr;
        }

        // If a valid insertion position is found, then we can promote
        // the load/store pair to a memcpy.
        if (P) {
          // If we load from memory that may alias the memory we store to,
          // memmove must be used to preserve semantics.  If not, memcpy can
          // be used.
          bool UseMemMove = false;
          if (!AA.isNoAlias(MemoryLocation::get(SI), LoadLoc))
            UseMemMove = true;

          unsigned Align = findCommonAlignment(DL, SI, LI);
          uint64_t Size = DL.getTypeStoreSize(T);

          IRBuilder<> Builder(P);
          Instruction *M;
          if (UseMemMove)
            M = Builder.CreateMemMove(SI->getPointerOperand(),
                                      LI->getPointerOperand(), Size,
                                      Align, SI->isVolatile());
          else
            M = Builder.CreateMemCpy(SI->getPointerOperand(),
                                     LI->getPointerOperand(), Size,
                                     Align, SI->isVolatile());

          DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI
                       << " => " << *M << "\n");

          MD->removeInstruction(SI);
          SI->eraseFromParent();
          MD->removeInstruction(LI);
          LI->eraseFromParent();
          ++NumMemCpyInstr;

          // Make sure we do not invalidate the iterator.
          BBI = M->getIterator();
          return true;
        }
      }

      // Detect cases where we're performing call slot forwarding, but
      // happen to be using a load-store pair to implement it, rather than
      // a memcpy.
      MemDepResult ldep = MD->getDependency(LI);
      CallInst *C = nullptr;
      if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
        C = dyn_cast<CallInst>(ldep.getInst());

      if (C) {
        // Check that nothing touches the dest of the "copy" between
        // the call and the store.
        Value *CpyDest = SI->getPointerOperand()->stripPointerCasts();
        bool CpyDestIsLocal = isa<AllocaInst>(CpyDest);
        AliasAnalysis &AA = LookupAliasAnalysis();
        MemoryLocation StoreLoc = MemoryLocation::get(SI);
        for (BasicBlock::iterator I = --SI->getIterator(),
                                  E = C->getIterator();
             I != E; --I) {
          if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
            C = nullptr;
            break;
          }
          // The store to dest may never happen if an exception can be thrown
          // between the load and the store.
          if (I->mayThrow() && !CpyDestIsLocal) {
            C = nullptr;
            break;
          }
        }
      }

      if (C) {
        bool changed = performCallSlotOptzn(
            LI, SI->getPointerOperand()->stripPointerCasts(),
            LI->getPointerOperand()->stripPointerCasts(),
            DL.getTypeStoreSize(SI->getOperand(0)->getType()),
            findCommonAlignment(DL, SI, LI), C);
        if (changed) {
          MD->removeInstruction(SI);
          SI->eraseFromParent();
          MD->removeInstruction(LI);
          LI->eraseFromParent();
          ++NumMemCpyInstr;
          return true;
        }
      }
    }
  }

  // There are two cases that are interesting for this code to handle: memcpy
  // and memset.  Right now we only handle memset.

  // Ensure that the value being stored is something that can be memset a byte
  // at a time, like "0" or "-1" of any width, as well as things like
  // 0xA0A0A0A0 and 0.0.
  auto *V = SI->getOperand(0);
  if (Value *ByteVal = isBytewiseValue(V)) {
    if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
                                              ByteVal)) {
      BBI = I->getIterator();  // Don't invalidate iterator.
      return true;
    }

    // If we have an aggregate, we try to promote it to memset regardless
    // of opportunity for merging, as it can expose optimization opportunities
    // in subsequent passes.
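    // E.g. (illustrative IR; %p is hypothetical):
    //   store %struct.S zeroinitializer, %struct.S* %p
    // becomes a call to @llvm.memset of %p with byte value 0 and the struct's
    // store size.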
    auto *T = V->getType();
    if (T->isAggregateType()) {
      uint64_t Size = DL.getTypeStoreSize(T);
      unsigned Align = SI->getAlignment();
      if (!Align)
        Align = DL.getABITypeAlignment(T);
      IRBuilder<> Builder(SI);
      auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal,
                                     Size, Align, SI->isVolatile());

      DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");

      MD->removeInstruction(SI);
      SI->eraseFromParent();
      NumMemSetInfer++;

      // Make sure we do not invalidate the iterator.
      BBI = M->getIterator();
      return true;
    }
  }

  return false;
}

bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
  // See if there is another memset or store neighboring this memset which
  // allows us to widen out the memset to do a single larger store.
  if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
    if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
                                              MSI->getValue())) {
      BBI = I->getIterator();  // Don't invalidate iterator.
      return true;
    }
  return false;
}


/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
                                         Value *cpySrc, uint64_t cpyLen,
                                         unsigned cpyAlign, CallInst *C) {
  // The general transformation to keep in mind is
  //
  //   call @func(..., src, ...)
  //   memcpy(dest, src, ...)
  //
  // ->
  //
  //   memcpy(dest, src, ...)
  //   call @func(..., dest, ...)
  //
  // Since moving the memcpy is technically awkward, we additionally check that
  // src only holds uninitialized values at the moment of the call, meaning
  // that the memcpy can be discarded rather than moved.

  // Lifetime marks shouldn't be operated on.
  if (Function *F = C->getCalledFunction())
    if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
      return false;

  // Deliberately get the source and destination with bitcasts stripped away,
  // because we'll need to do type comparisons based on the underlying type.
  CallSite CS(C);

  // Require that src be an alloca.  This simplifies the reasoning
  // considerably.
  AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
  if (!srcAlloca)
    return false;

  ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
  if (!srcArraySize)
    return false;

  const DataLayout &DL = cpy->getModule()->getDataLayout();
  uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
                     srcArraySize->getZExtValue();

  if (cpyLen < srcSize)
    return false;

  // Check that accessing the first srcSize bytes of dest will not cause a
  // trap.  Otherwise the transform is invalid since it might cause a trap
  // to occur earlier than it otherwise would.
  if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
    // The destination is an alloca.  Check that it is no smaller than srcSize.
    ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
    if (!destArraySize)
      return false;

    uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
                        destArraySize->getZExtValue();

    if (destSize < srcSize)
      return false;
  } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
    // The store to dest may never happen if the call can throw.
    if (C->mayThrow())
      return false;

    if (A->getDereferenceableBytes() < srcSize) {
      // If the destination is an sret parameter then only accesses that are
      // outside of the returned struct type can trap.
      if (!A->hasStructRetAttr())
        return false;

      Type *StructTy = cast<PointerType>(A->getType())->getElementType();
      if (!StructTy->isSized()) {
        // The call may never return and hence the copy-instruction may never
        // be executed, and therefore it's not safe to say "the destination
        // has at least <cpyLen> bytes, as implied by the copy-instruction".
        return false;
      }

      uint64_t destSize = DL.getTypeAllocSize(StructTy);
      if (destSize < srcSize)
        return false;
    }
  } else {
    return false;
  }

  // Check that dest points to memory that is at least as aligned as src.
  unsigned srcAlign = srcAlloca->getAlignment();
  if (!srcAlign)
    srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
  bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
  // If dest is not aligned enough and we can't increase its alignment then
  // bail out.
  if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
    return false;

  // Check that src is not accessed except via the call and the memcpy.  This
  // guarantees that it holds only undefined values when passed in (so the
  // final memcpy can be dropped), that it is not read or written between the
  // call and the memcpy, and that writing beyond the end of it is undefined.
  SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
                                   srcAlloca->user_end());
  while (!srcUseList.empty()) {
    User *U = srcUseList.pop_back_val();

    if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
      for (User *UU : U->users())
        srcUseList.push_back(UU);
      continue;
    }
    if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
      if (!G->hasAllZeroIndices())
        return false;

      for (User *UU : U->users())
        srcUseList.push_back(UU);
      continue;
    }
    if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
      if (IT->getIntrinsicID() == Intrinsic::lifetime_start ||
          IT->getIntrinsicID() == Intrinsic::lifetime_end)
        continue;

    if (U != C && U != cpy)
      return false;
  }

  // Check that src isn't captured by the called function since the
  // transformation can cause aliasing issues in that case.
  for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
    if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
      return false;

  // Since we're changing the parameter to the callsite, we need to make sure
  // that what would be the new parameter dominates the callsite.
  DominatorTree &DT = LookupDomTree();
  if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
    if (!DT.dominates(cpyDestInst, C))
      return false;

  // In addition to knowing that the call does not access src in some
  // unexpected manner, for example via a global, which we deduce from
  // the use analysis, we also need to know that it does not sneakily
  // access dest.  We rely on AA to figure this out for us.
  AliasAnalysis &AA = LookupAliasAnalysis();
  ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);
  // If necessary, perform additional analysis.
  if (MR != MRI_NoModRef)
    MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
  if (MR != MRI_NoModRef)
    return false;

  // All the checks have passed, so do the transformation.
  bool changedArgument = false;
  for (unsigned i = 0; i < CS.arg_size(); ++i)
    if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
      Value *Dest = cpySrc->getType() == cpyDest->getType()
          ? cpyDest
          : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
                                        cpyDest->getName(), C);
      changedArgument = true;
      if (CS.getArgument(i)->getType() == Dest->getType())
        CS.setArgument(i, Dest);
      else
        CS.setArgument(i, CastInst::CreatePointerCast(
                              Dest, CS.getArgument(i)->getType(),
                              Dest->getName(), C));
    }

  if (!changedArgument)
    return false;

  // If the destination wasn't sufficiently aligned then increase its
  // alignment.
  if (!isDestSufficientlyAligned) {
    assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
  }

  // Drop any cached information about the call, because we may have changed
  // its dependence information by changing its parameter.
  MD->removeInstruction(C);

  // Update AA metadata.
  // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
  // handled here, but combineMetadata doesn't support them yet.
  unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
                         LLVMContext::MD_noalias,
                         LLVMContext::MD_invariant_group};
  combineMetadata(C, cpy, KnownIDs);

  // Remove the memcpy.
  MD->removeInstruction(cpy);
  ++NumMemCpyInstr;

  return true;
}

/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
/// the memcpy 'MDep'.  Try to simplify M to copy from MDep's input if we can.
bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
                                                  MemCpyInst *MDep) {
  // We can only transform memcpy's where the dest of one is the source of the
  // other.
  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
    return false;

  // If the dep instruction is reading from our current input, then it is a
  // noop transfer and substituting the input won't change this instruction.
  // Just ignore the input and let someone else zap MDep.  This handles cases
  // like:
  //   memcpy(a <- a)
  //   memcpy(b <- a)
  if (M->getSource() == MDep->getSource())
    return false;

  // Second, the lengths of the memcpy's must be the same, or the preceding one
  // must be larger than the following one.
  ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
  ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
  if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
    return false;

  AliasAnalysis &AA = LookupAliasAnalysis();

  // Verify that the copied-from memory doesn't change in between the two
  // transfers.  For example, in:
  //   memcpy(a <- b)
  //   *b = 42;
  //   memcpy(c <- a)
  // It would be invalid to transform the second memcpy into memcpy(c <- b).
  //
  // TODO: If the code between M and MDep is transparent to the destination
  // "c", then we could still perform the xform by moving M up to the first
  // memcpy.
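  //
  // (When the checks below succeed, the rewrite itself is simply:
  //   memcpy(a <- b)
  //   memcpy(c <- a)
  // becomes
  //   memcpy(a <- b)
  //   memcpy(c <- b)
  // and the first copy can later be removed by DSE if 'a' is otherwise dead.)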
  //
  // NOTE: This is conservative; it will stop on any read from the source loc,
  // not just the defining memcpy.
  MemDepResult SourceDep =
      MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
                                   M->getIterator(), M->getParent());
  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
    return false;

  // If the dest of the second might alias the source of the first, then the
  // source and dest might overlap.  We still want to eliminate the
  // intermediate value, but we have to generate a memmove instead of memcpy.
  bool UseMemMove = false;
  if (!AA.isNoAlias(MemoryLocation::getForDest(M),
                    MemoryLocation::getForSource(MDep)))
    UseMemMove = true;

  // If all checks passed, then we can transform M.

  // Make sure to use the lesser of the alignment of the source and the dest
  // since we're changing where we're reading from, but don't want to increase
  // the alignment past what can be read from or written to.
  // TODO: Is this worth it if we're creating a less aligned memcpy?  For
  // example we could be moving from movaps -> movq on x86.
  unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());

  IRBuilder<> Builder(M);
  if (UseMemMove)
    Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(),
                          M->getLength(), Align, M->isVolatile());
  else
    Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(),
                         M->getLength(), Align, M->isVolatile());

  // Remove the instruction we're replacing.
  MD->removeInstruction(M);
  M->eraseFromParent();
  ++NumMemCpyInstr;
  return true;
}

/// We've found that the (upward scanning) memory dependence of \p MemCpy is
/// \p MemSet.  Try to simplify \p MemSet to only set the trailing bytes that
/// weren't copied over by \p MemCpy.
///
/// In other words, transform:
/// \code
///   memset(dst, c, dst_size);
///   memcpy(dst, src, src_size);
/// \endcode
/// into:
/// \code
///   memcpy(dst, src, src_size);
///   memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
/// \endcode
bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
                                                  MemSetInst *MemSet) {
  // We can only transform memset/memcpy with the same destination.
  if (MemSet->getDest() != MemCpy->getDest())
    return false;

  // Check that there are no other dependencies on the memset destination.
  MemDepResult DstDepInfo =
      MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
                                   MemCpy->getIterator(), MemCpy->getParent());
  if (DstDepInfo.getInst() != MemSet)
    return false;

  // Use the same i8* dest as the memcpy, killing the memset dest if different.
  Value *Dest = MemCpy->getRawDest();
  Value *DestSize = MemSet->getLength();
  Value *SrcSize = MemCpy->getLength();

  // By default, create an unaligned memset.
  unsigned Align = 1;
  // If Dest is aligned, and SrcSize is constant, use the minimum alignment
  // of the sum.
  const unsigned DestAlign =
      std::max(MemSet->getAlignment(), MemCpy->getAlignment());
  if (DestAlign > 1)
    if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
      Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);

  IRBuilder<> Builder(MemCpy);

  // If the sizes have different types, zext the smaller one.
  if (DestSize->getType() != SrcSize->getType()) {
    if (DestSize->getType()->getIntegerBitWidth() >
        SrcSize->getType()->getIntegerBitWidth())
      SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
    else
      DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
  }

  Value *MemsetLen =
      Builder.CreateSelect(Builder.CreateICmpULE(DestSize, SrcSize),
                           ConstantInt::getNullValue(DestSize->getType()),
                           Builder.CreateSub(DestSize, SrcSize));
  Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1),
                       MemsetLen, Align);

  MD->removeInstruction(MemSet);
  MemSet->eraseFromParent();
  return true;
}

/// Transform memcpy to memset when its source was just memset.
/// In other words, turn:
/// \code
///   memset(dst1, c, dst1_size);
///   memcpy(dst2, dst1, dst2_size);
/// \endcode
/// into:
/// \code
///   memset(dst1, c, dst1_size);
///   memset(dst2, c, dst2_size);
/// \endcode
/// when dst2_size <= dst1_size.
///
/// The \p MemCpy must have a constant length.
bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
                                               MemSetInst *MemSet) {
  // This only makes sense on memcpy(..., memset(...), ...).
  if (MemSet->getRawDest() != MemCpy->getRawSource())
    return false;

  ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
  ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
  // Make sure the memcpy doesn't read any more than what the memset wrote.
  // Don't worry about sizes larger than i64.
  if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
    return false;

  IRBuilder<> Builder(MemCpy);
  Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
                       CopySize, MemCpy->getAlignment());
  return true;
}

/// Perform simplification of memcpy's.  If we have memcpy A
/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
/// circumstances).  This allows later passes to remove the first memcpy
/// altogether.
bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
  // We can only optimize non-volatile memcpy's.
  if (M->isVolatile()) return false;

  // If the source and destination of the memcpy are the same, then zap it.
  if (M->getSource() == M->getDest()) {
    MD->removeInstruction(M);
    M->eraseFromParent();
    return false;
  }

  // If copying from a constant, try to turn the memcpy into a memset.
  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
    if (GV->isConstant() && GV->hasDefinitiveInitializer())
      if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
        IRBuilder<> Builder(M);
        Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
                             M->getAlignment(), false);
        MD->removeInstruction(M);
        M->eraseFromParent();
        ++NumCpyToSet;
        return true;
      }

  MemDepResult DepInfo = MD->getDependency(M);

  // Try to turn a partially redundant memset + memcpy into
  // memcpy + smaller memset.  We don't need the memcpy size for this.
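  // For example (hypothetical sizes):
  //   memset(dst, c, 16)
  //   memcpy(dst, src, 8)
  // becomes
  //   memcpy(dst, src, 8)
  //   memset(dst + 8, c, 8)
  // so the memset no longer writes bytes that the memcpy immediately
  // overwrites.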
  if (DepInfo.isClobber())
    if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
      if (processMemSetMemCpyDependence(M, MDep))
        return true;

  // The optimizations after this point require the memcpy size.
  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
  if (!CopySize) return false;

  // There are four possible optimizations we can do for memcpy:
  //   a) memcpy-memcpy xform which exposes redundancy for DSE.
  //   b) call-memcpy xform for return slot optimization.
  //   c) memcpy from freshly alloca'd space or space that has just started
  //      its lifetime copies undefined data, and we can therefore eliminate
  //      the memcpy in favor of the data that was already at the destination.
  //   d) memcpy from a just-memset'd source can be turned into memset.
  if (DepInfo.isClobber()) {
    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
                               CopySize->getZExtValue(), M->getAlignment(),
                               C)) {
        MD->removeInstruction(M);
        M->eraseFromParent();
        return true;
      }
    }
  }

  MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
      SrcLoc, true, M->getIterator(), M->getParent());

  if (SrcDepInfo.isClobber()) {
    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
      return processMemCpyMemCpyDependence(M, MDep);
  } else if (SrcDepInfo.isDef()) {
    Instruction *I = SrcDepInfo.getInst();
    bool hasUndefContents = false;

    if (isa<AllocaInst>(I)) {
      hasUndefContents = true;
    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      if (II->getIntrinsicID() == Intrinsic::lifetime_start)
        if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
          if (LTSize->getZExtValue() >= CopySize->getZExtValue())
            hasUndefContents = true;
    }

    if (hasUndefContents) {
      MD->removeInstruction(M);
      M->eraseFromParent();
      ++NumMemCpyInstr;
      return true;
    }
  }

  if (SrcDepInfo.isClobber())
    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
      if (performMemCpyToMemSetOptzn(M, MDep)) {
        MD->removeInstruction(M);
        M->eraseFromParent();
        ++NumCpyToSet;
        return true;
      }

  return false;
}

/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
/// not to alias.
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
  AliasAnalysis &AA = LookupAliasAnalysis();

  if (!TLI->has(LibFunc::memmove))
    return false;

  // See if the pointers alias.
  if (!AA.isNoAlias(MemoryLocation::getForDest(M),
                    MemoryLocation::getForSource(M)))
    return false;

  DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
               << "\n");

  // The pointers provably don't alias, so we know we can transform this.
  Type *ArgTys[3] = { M->getRawDest()->getType(),
                      M->getRawSource()->getType(),
                      M->getLength()->getType() };
  M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
                                                 Intrinsic::memcpy, ArgTys));

  // MemDep may have overly conservative information about this instruction;
  // just conservatively flush it from the cache.
  MD->removeInstruction(M);

  ++NumMoveToCpy;
  return true;
}

/// This is called on every byval argument in call sites.
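/// For example (an illustrative sketch; %tmp and %src are hypothetical):
/// \code
///   memcpy(%tmp, %src, size);
///   call void @foo(%struct.S* byval %tmp)
/// \endcode
/// can, when the checks below succeed, pass the source directly:
/// \code
///   memcpy(%tmp, %src, size);
///   call void @foo(%struct.S* byval %src)
/// \endcode
/// leaving the memcpy to be removed later if %tmp becomes dead.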
bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
  const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
  // Find out what feeds this byval argument.
  Value *ByValArg = CS.getArgument(ArgNo);
  Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
  uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
  MemDepResult DepInfo = MD->getPointerDependencyFrom(
      MemoryLocation(ByValArg, ByValSize), true,
      CS.getInstruction()->getIterator(), CS.getInstruction()->getParent());
  if (!DepInfo.isClobber())
    return false;

  // If the byval argument isn't fed by a memcpy, ignore it.  If it is fed by
  // a memcpy, see if we can byval from the source of the memcpy instead of
  // the result.
  MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
  if (!MDep || MDep->isVolatile() ||
      ByValArg->stripPointerCasts() != MDep->getDest())
    return false;

  // The length of the memcpy must be greater than or equal to the size of the
  // byval.
  ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
  if (!C1 || C1->getValue().getZExtValue() < ByValSize)
    return false;

  // Get the alignment of the byval.  If the call doesn't specify the
  // alignment, then it is some target-specific value that we can't know.
  unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
  if (ByValAlign == 0) return false;

  // If the required alignment is greater than the memcpy's, then we check to
  // see if we can force the source of the memcpy to the alignment we need.
  // If we fail, we bail out.
  AssumptionCache &AC = LookupAssumptionCache();
  DominatorTree &DT = LookupDomTree();
  if (MDep->getAlignment() < ByValAlign &&
      getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
                                 CS.getInstruction(), &AC, &DT) < ByValAlign)
    return false;

  // Verify that the copied-from memory doesn't change in between the memcpy
  // and the byval call.
  //   memcpy(a <- b)
  //   *b = 42;
  //   foo(*a)
  // It would be invalid to transform the call into foo(*b).
  //
  // NOTE: This is conservative; it will stop on any read from the source loc,
  // not just the defining memcpy.
  MemDepResult SourceDep = MD->getPointerDependencyFrom(
      MemoryLocation::getForSource(MDep), false,
      CS.getInstruction()->getIterator(), MDep->getParent());
  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
    return false;

  Value *TmpCast = MDep->getSource();
  if (MDep->getSource()->getType() != ByValArg->getType())
    TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
                              "tmpcast", CS.getInstruction());

  DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
               << "  " << *MDep << "\n"
               << "  " << *CS.getInstruction() << "\n");

  // Otherwise we're good!  Update the byval argument.
  CS.setArgument(ArgNo, TmpCast);
  ++NumMemCpyInstr;
  return true;
}

/// Executes one iteration of MemCpyOptPass.
bool MemCpyOptPass::iterateOnFunction(Function &F) {
  bool MadeChange = false;

  // Walk all instructions in the function.
  for (BasicBlock &BB : F) {
    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
      // Avoid invalidating the iterator.
      Instruction *I = &*BI++;

      bool RepeatInstruction = false;

      if (StoreInst *SI = dyn_cast<StoreInst>(I))
        MadeChange |= processStore(SI, BI);
      else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
        RepeatInstruction = processMemSet(M, BI);
      else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
        RepeatInstruction = processMemCpy(M);
      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
        RepeatInstruction = processMemMove(M);
      else if (auto CS = CallSite(I)) {
        for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
          if (CS.isByValArgument(i))
            MadeChange |= processByValArgument(CS, i);
      }

      // Reprocess the instruction if desired.
      if (RepeatInstruction) {
        if (BI != BB.begin())
          --BI;
        MadeChange = true;
      }
    }
  }

  return MadeChange;
}

PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {

  auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);

  auto LookupAliasAnalysis = [&]() -> AliasAnalysis & {
    return AM.getResult<AAManager>(F);
  };
  auto LookupAssumptionCache = [&]() -> AssumptionCache & {
    return AM.getResult<AssumptionAnalysis>(F);
  };
  auto LookupDomTree = [&]() -> DominatorTree & {
    return AM.getResult<DominatorTreeAnalysis>(F);
  };

  bool MadeChange = runImpl(F, &MD, &TLI, LookupAliasAnalysis,
                            LookupAssumptionCache, LookupDomTree);
  if (!MadeChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<GlobalsAA>();
  PA.preserve<MemoryDependenceAnalysis>();
  return PA;
}

bool MemCpyOptPass::runImpl(
    Function &F, MemoryDependenceResults *MD_, TargetLibraryInfo *TLI_,
    std::function<AliasAnalysis &()> LookupAliasAnalysis_,
    std::function<AssumptionCache &()> LookupAssumptionCache_,
    std::function<DominatorTree &()> LookupDomTree_) {
  bool MadeChange = false;
  MD = MD_;
  TLI = TLI_;
  LookupAliasAnalysis = std::move(LookupAliasAnalysis_);
  LookupAssumptionCache = std::move(LookupAssumptionCache_);
  LookupDomTree = std::move(LookupDomTree_);

  // If we don't have at least memset and memcpy, there is little point in
  // doing anything here.  These are required by a freestanding
  // implementation, so if even they are disabled, there is no point in trying
  // hard.
  if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
    return false;

  while (1) {
    if (!iterateOnFunction(F))
      break;
    MadeChange = true;
  }

  MD = nullptr;
  return MadeChange;
}

/// This is the main transformation entry point for a function.
bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();

  auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
    return getAnalysis<AAResultsWrapperPass>().getAAResults();
  };
  auto LookupAssumptionCache = [this, &F]() -> AssumptionCache & {
    return getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  };
  auto LookupDomTree = [this]() -> DominatorTree & {
    return getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  };

  return Impl.runImpl(F, MD, TLI, LookupAliasAnalysis, LookupAssumptionCache,
                      LookupDomTree);
}