//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs various transformations related to eliminating memcpy
// calls, or transforming sets of stores into memsets.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include <list>
using namespace llvm;

#define DEBUG_TYPE "memcpyopt"

STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");

static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
                                  bool &VariableIdxFound,
                                  const DataLayout &TD) {
  // Skip over the first indices.
  gep_type_iterator GTI = gep_type_begin(GEP);
  for (unsigned i = 1; i != Idx; ++i, ++GTI)
    /*skip along*/;

  // Compute the offset implied by the rest of the indices.
  int64_t Offset = 0;
  for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
    if (!OpC)
      return VariableIdxFound = true;
    if (OpC->isZero()) continue;  // No offset.

    // Handle struct indices, which add their field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
      continue;
    }

    // Otherwise, we have a sequential type like an array or vector.  Multiply
    // the index by the ElementSize.
    uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
    Offset += Size*OpC->getSExtValue();
  }

  return Offset;
}

/// IsPointerOffset - Return true if Ptr2 is provably equal to Ptr1 plus a
/// constant offset, and return that constant offset.  For example, Ptr1 might
/// be &A[42] and Ptr2 might be &A[40]; with 4-byte ints the offset would be
/// -8, since Ptr2 == Ptr1 + (-8).
static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
                            const DataLayout &TD) {
  Ptr1 = Ptr1->stripPointerCasts();
  Ptr2 = Ptr2->stripPointerCasts();

  // Handle the trivial case first.
  if (Ptr1 == Ptr2) {
    Offset = 0;
    return true;
  }

  GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
  GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);

  bool VariableIdxFound = false;

  // If one pointer is a GEP and the other isn't, then see if the GEP is a
  // constant offset from the base, as in "P" and "gep P, 1".
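  // For instance (illustrative IR), if Ptr1 is "gep i32* %P, i64 1" and Ptr2
  // is plain "%P", the code below produces Offset == -4, i.e.
  // Ptr2 == Ptr1 + (-4).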
  if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
    Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD);
    return !VariableIdxFound;
  }

  if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
    Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD);
    return !VariableIdxFound;
  }

  // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an
  // identical base.  After that base, they may have some number of common (and
  // potentially variable) indices.  After that, each may have a trailing
  // constant offset; the difference between those offsets determines the
  // offset between the pointers.  We handle no other case.
  if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
    return false;

  // Skip any common indices and track the GEP types.
  unsigned Idx = 1;
  for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
    if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
      break;

  int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
  int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
  if (VariableIdxFound) return false;

  Offset = Offset2-Offset1;
  return true;
}


/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
///   store 0 -> P+1
///   store 0 -> P+0
///   store 0 -> P+3
///   store 0 -> P+2
/// which sometimes happens with stores to arrays of structs etc.  When we see
/// the first store, we make a range [1, 2).  The second store extends the
/// range to [0, 2).  The third makes a new range [2, 3).  The fourth store
/// joins the two ranges into [0, 3), which is memset'able.
namespace {
struct MemsetRange {
  // Start/End - A semi-open range that describes the span this range covers.
  // The range is closed at the start and open at the end: [Start, End).
  int64_t Start, End;

  /// StartPtr - The getelementptr instruction that points to the start of the
  /// range.
  Value *StartPtr;

  /// Alignment - The known alignment of the first store.
  unsigned Alignment;

  /// TheStores - The actual stores that make up this range.
  SmallVector<Instruction*, 16> TheStores;

  bool isProfitableToUseMemset(const DataLayout &TD) const;
};
} // end anon namespace

bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const {
  // If we found at least 4 stores to merge, or 16 bytes or more, use memset.
  if (TheStores.size() >= 4 || End-Start >= 16) return true;

  // If there is nothing to merge, don't do anything.
  if (TheStores.size() < 2) return false;

  // If any of the stores are a memset, then it is always good to extend the
  // memset.
  for (unsigned i = 0, e = TheStores.size(); i != e; ++i)
    if (!isa<StoreInst>(TheStores[i]))
      return true;

  // Assume that the code generator is capable of merging pairs of stores
  // together if it wants to.
  if (TheStores.size() == 2) return false;

  // If we have fewer than 8 stores, it can still be worthwhile to do this.
  // For example, merging 4 i8 stores into an i32 store is useful almost
  // always.  However, merging 2 32-bit stores isn't useful on a 32-bit
  // architecture (the memset will be split into 2 32-bit stores anyway) and
  // doing so can pessimize the llvm optimizer.
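  //
  // A worked example of the heuristic below (illustrative): on a target whose
  // largest legal integer is 4 bytes, three i8 stores covering 3 bytes yield
  // NumPointerStores == 0 and NumByteStores == 3, and 3 > 3 fails, so the
  // stores are kept.  An i16 store plus two i8 stores covering 4 bytes yield
  // 1 and 0, and 3 > 1 succeeds, so they become a single memset.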
  //
  // Since we don't have perfect knowledge here, make some assumptions: assume
  // the maximum GPR width is the same size as the largest legal integer
  // size.  If so, check to see whether we will end up actually reducing the
  // number of stores used.
  unsigned Bytes = unsigned(End-Start);
  unsigned MaxIntSize = TD.getLargestLegalIntTypeSize();
  if (MaxIntSize == 0)
    MaxIntSize = 1;
  unsigned NumPointerStores = Bytes / MaxIntSize;

  // Assume the remaining bytes, if any, are stored a byte at a time.
  unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize;

  // If we will reduce the # stores (according to this heuristic), do the
  // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
  // etc.
  return TheStores.size() > NumPointerStores+NumByteStores;
}


namespace {
class MemsetRanges {
  /// Ranges - A sorted list of the memset ranges.  We use std::list here
  /// because each element is relatively large and expensive to copy.
  std::list<MemsetRange> Ranges;
  typedef std::list<MemsetRange>::iterator range_iterator;
  const DataLayout &DL;
public:
  MemsetRanges(const DataLayout &DL) : DL(DL) {}

  typedef std::list<MemsetRange>::const_iterator const_iterator;
  const_iterator begin() const { return Ranges.begin(); }
  const_iterator end() const { return Ranges.end(); }
  bool empty() const { return Ranges.empty(); }

  void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
      addStore(OffsetFromFirst, SI);
    else
      addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
  }

  void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
    int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());

    addRange(OffsetFromFirst, StoreSize,
             SI->getPointerOperand(), SI->getAlignment(), SI);
  }

  void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
    int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
  }

  void addRange(int64_t Start, int64_t Size, Value *Ptr,
                unsigned Alignment, Instruction *Inst);
};

} // end anon namespace


/// addRange - Add a new store to the MemsetRanges data structure.  This adds a
/// new range for the specified store at the specified offset, merging into
/// existing ranges as appropriate.
///
/// Do a linear search of the ranges to see if this can be joined and/or to
/// find the insertion point in the list.  We keep the ranges sorted for
/// simplicity here.  This is a linear search of a linked list, which is ugly;
/// however, the number of ranges is limited, so this won't get crazy slow.
void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
                            unsigned Alignment, Instruction *Inst) {
  int64_t End = Start+Size;
  range_iterator I = Ranges.begin(), E = Ranges.end();

  while (I != E && Start > I->End)
    ++I;

  // We now know that I == E, in which case we didn't find anything to merge
  // with, or that Start <= I->End.  If End < I->Start or I == E, then we need
  // to insert a new range.  Handle this now.
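  //
  // For example (illustrative), with existing ranges [0,4) and [8,12): adding
  // [5,7) falls between them and becomes a new range, while adding [2,9)
  // overlaps the first range, extends it, and then swallows [8,12), leaving
  // the single range [0,12).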
  if (I == E || End < I->Start) {
    MemsetRange &R = *Ranges.insert(I, MemsetRange());
    R.Start = Start;
    R.End = End;
    R.StartPtr = Ptr;
    R.Alignment = Alignment;
    R.TheStores.push_back(Inst);
    return;
  }

  // This store overlaps with I, add it.
  I->TheStores.push_back(Inst);

  // At this point, we may have an interval that completely contains our store.
  // If so, just add it to the interval and return.
  if (I->Start <= Start && I->End >= End)
    return;

  // Now we know that Start <= I->End and End >= I->Start, so the store
  // overlaps the range but is not entirely contained within it.

  // See if this store extends the start of the range.  In this case, it
  // couldn't possibly cause it to join the prior range, because otherwise we
  // would have stopped on *it*.
  if (Start < I->Start) {
    I->Start = Start;
    I->StartPtr = Ptr;
    I->Alignment = Alignment;
  }

  // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
  // is in or right at the end of I) and that End >= I->Start.  Extend I out to
  // End.
  if (End > I->End) {
    I->End = End;
    range_iterator NextI = I;
    while (++NextI != E && End >= NextI->Start) {
      // Merge the range in.
      I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
      if (NextI->End > I->End)
        I->End = NextI->End;
      Ranges.erase(NextI);
      NextI = I;
    }
  }
}

//===----------------------------------------------------------------------===//
//                         MemCpyOpt Pass
//===----------------------------------------------------------------------===//

namespace {
class MemCpyOpt : public FunctionPass {
  MemoryDependenceAnalysis *MD;
  TargetLibraryInfo *TLI;
  const DataLayout *DL;
public:
  static char ID; // Pass identification, replacement for typeid
  MemCpyOpt() : FunctionPass(ID) {
    initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
    MD = nullptr;
    TLI = nullptr;
    DL = nullptr;
  }

  bool runOnFunction(Function &F) override;

private:
  // This transformation requires dominator info.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<MemoryDependenceAnalysis>();
    AU.addRequired<AliasAnalysis>();
    AU.addRequired<TargetLibraryInfo>();
    AU.addPreserved<AliasAnalysis>();
    AU.addPreserved<MemoryDependenceAnalysis>();
  }

  // Helper functions.
  bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
  bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
  bool processMemCpy(MemCpyInst *M);
  bool processMemMove(MemMoveInst *M);
  bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
                            uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
  bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                     uint64_t MSize);
  bool processByValArgument(CallSite CS, unsigned ArgNo);
  Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                    Value *ByteVal);

  bool iterateOnFunction(Function &F);
};

char MemCpyOpt::ID = 0;
}

// createMemCpyOptPass - The public interface to this file...
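// For example (illustrative), a legacy-PassManager client would schedule the
// pass with:
//   PM.add(createMemCpyOptPass());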
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }

INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                    false, false)

/// tryMergingIntoMemset - When scanning forward over instructions, we look for
/// some other patterns to fold away.  In particular, this looks for stores to
/// neighboring locations of memory.  If it sees enough consecutive ones, it
/// attempts to merge them together into a memcpy/memset.
Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
                                             Value *StartPtr, Value *ByteVal) {
  if (!DL) return nullptr;

  // Okay, so we now have a single store of a splattable value.  Scan to find
  // all subsequent stores of the same value to offsets from the same pointer.
  // Join these together into ranges, so we can decide whether contiguous
  // blocks are stored.
  MemsetRanges Ranges(*DL);

  BasicBlock::iterator BI = StartInst;
  for (++BI; !isa<TerminatorInst>(BI); ++BI) {
    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
      // If the instruction is readnone, ignore it, otherwise bail out.  We
      // don't even allow readonly here because we don't want something like:
      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
        break;
      continue;
    }

    if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
      // If this is a store, see if we can merge it in.
      if (!NextStore->isSimple()) break;

      // Check to see if this stored value is of the same byte-splattable
      // value.
      if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
        break;

      // Check to see if this store is to a constant offset from the start ptr.
      int64_t Offset;
      if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(),
                           Offset, *DL))
        break;

      Ranges.addStore(Offset, NextStore);
    } else {
      MemSetInst *MSI = cast<MemSetInst>(BI);

      if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
          !isa<ConstantInt>(MSI->getLength()))
        break;

      // Check to see if this store is to a constant offset from the start ptr.
      int64_t Offset;
      if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *DL))
        break;

      Ranges.addMemSet(Offset, MSI);
    }
  }

  // If we have no ranges, then we just had a single store with nothing that
  // could be merged in.  This is a very common case of course.
  if (Ranges.empty())
    return nullptr;

  // If we had at least one store that could be merged in, add the starting
  // store as well.  We try to avoid this unless there is at least something
  // interesting as a small compile-time optimization.
  Ranges.addInst(0, StartInst);

  // If we create any memsets, we put them right before the first instruction
  // that isn't part of the memset block.  This ensures that the memset is
  // dominated by any addressing instruction needed by the start of the block.
  IRBuilder<> Builder(BI);

  // Now that we have full information about ranges, loop over the ranges and
  // emit memsets for anything big enough to be worthwhile.
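  //
  // For instance (illustrative IR), four adjacent byte stores such as
  //   store i8 0, i8* %p
  //   store i8 0, i8* %p1   ; %p+1
  //   store i8 0, i8* %p2   ; %p+2
  //   store i8 0, i8* %p3   ; %p+3
  // collapse into a single call to @llvm.memset covering [%p, %p+4).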
  Instruction *AMemSet = nullptr;
  for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
       I != E; ++I) {
    const MemsetRange &Range = *I;

    if (Range.TheStores.size() == 1) continue;

    // If it is profitable to lower this range to memset, do so now.
    if (!Range.isProfitableToUseMemset(*DL))
      continue;

    // Otherwise, we do want to transform this!  Create a new memset.
    // Get the starting pointer of the block.
    StartPtr = Range.StartPtr;

    // Determine alignment.
    unsigned Alignment = Range.Alignment;
    if (Alignment == 0) {
      Type *EltType =
          cast<PointerType>(StartPtr->getType())->getElementType();
      Alignment = DL->getABITypeAlignment(EltType);
    }

    AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start,
                                   Alignment);

    DEBUG(dbgs() << "Replace stores:\n";
          for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
            dbgs() << *Range.TheStores[i] << '\n';
          dbgs() << "With: " << *AMemSet << '\n');

    if (!Range.TheStores.empty())
      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());

    // Zap all the stores.
    for (SmallVectorImpl<Instruction *>::const_iterator
             SI = Range.TheStores.begin(),
             SE = Range.TheStores.end(); SI != SE; ++SI) {
      MD->removeInstruction(*SI);
      (*SI)->eraseFromParent();
    }
    ++NumMemSetInfer;
  }

  return AMemSet;
}


bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
  if (!SI->isSimple()) return false;

  if (!DL) return false;

  // Detect cases where we're performing call slot forwarding, but
  // happen to be using a load-store pair to implement it, rather than
  // a memcpy.
  if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
    if (LI->isSimple() && LI->hasOneUse() &&
        LI->getParent() == SI->getParent()) {
      MemDepResult ldep = MD->getDependency(LI);
      CallInst *C = nullptr;
      if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
        C = dyn_cast<CallInst>(ldep.getInst());

      if (C) {
        // Check that nothing touches the dest of the "copy" between
        // the call and the store.
        AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
        AliasAnalysis::Location StoreLoc = AA.getLocation(SI);
        for (BasicBlock::iterator I = --BasicBlock::iterator(SI),
             E = C; I != E; --I) {
          if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) {
            C = nullptr;
            break;
          }
        }
      }

      if (C) {
        unsigned storeAlign = SI->getAlignment();
        if (!storeAlign)
          storeAlign = DL->getABITypeAlignment(SI->getOperand(0)->getType());
        unsigned loadAlign = LI->getAlignment();
        if (!loadAlign)
          loadAlign = DL->getABITypeAlignment(LI->getType());

        bool changed = performCallSlotOptzn(LI,
                          SI->getPointerOperand()->stripPointerCasts(),
                          LI->getPointerOperand()->stripPointerCasts(),
                          DL->getTypeStoreSize(SI->getOperand(0)->getType()),
                          std::min(storeAlign, loadAlign), C);
        if (changed) {
          MD->removeInstruction(SI);
          SI->eraseFromParent();
          MD->removeInstruction(LI);
          LI->eraseFromParent();
          ++NumMemCpyInstr;
          return true;
        }
      }
    }
  }

  // There are two cases that are interesting for this code to handle: memcpy
  // and memset.  Right now we only handle memset.

  // Ensure that the value being stored is something that can be memset'd a
  // byte at a time, like "0" or "-1" of any width, as well as things like
  // 0xA0A0A0A0 and 0.0.
  if (Value *ByteVal = isBytewiseValue(SI->getOperand(0)))
    if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
                                              ByteVal)) {
      BBI = I;  // Don't invalidate iterator.
      return true;
    }

  return false;
}

bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
  // See if there is another memset or store neighboring this memset which
  // allows us to widen out the memset to do a single larger store.
  if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
    if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
                                              MSI->getValue())) {
      BBI = I;  // Don't invalidate iterator.
      return true;
    }
  return false;
}


/// performCallSlotOptzn - takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
                                     Value *cpyDest, Value *cpySrc,
                                     uint64_t cpyLen, unsigned cpyAlign,
                                     CallInst *C) {
  // The general transformation to keep in mind is
  //
  //   call @func(..., src, ...)
  //   memcpy(dest, src, ...)
  //
  // ->
  //
  //   memcpy(dest, src, ...)
  //   call @func(..., dest, ...)
  //
  // Since moving the memcpy is technically awkward, we additionally check that
  // src only holds uninitialized values at the moment of the call, meaning
  // that the memcpy can be discarded rather than moved.

  // Deliberately get the source and destination with bitcasts stripped away,
  // because we'll need to do type comparisons based on the underlying type.
  CallSite CS(C);

  // Require that src be an alloca.  This simplifies the reasoning
  // considerably.
  AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
  if (!srcAlloca)
    return false;

  // Check that all of src is copied to dest.
  if (!DL) return false;

  ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
  if (!srcArraySize)
    return false;

  uint64_t srcSize = DL->getTypeAllocSize(srcAlloca->getAllocatedType()) *
                     srcArraySize->getZExtValue();

  if (cpyLen < srcSize)
    return false;

  // Check that accessing the first srcSize bytes of dest will not cause a
  // trap.  Otherwise the transform is invalid since it might cause a trap
  // to occur earlier than it otherwise would.
  if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
    // The destination is an alloca.  Check that it is at least as large as
    // srcSize.
    ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
    if (!destArraySize)
      return false;

    uint64_t destSize = DL->getTypeAllocSize(A->getAllocatedType()) *
                        destArraySize->getZExtValue();

    if (destSize < srcSize)
      return false;
  } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
    // If the destination is an sret parameter then only accesses that are
    // outside of the returned struct type can trap.
    if (!A->hasStructRetAttr())
      return false;

    Type *StructTy = cast<PointerType>(A->getType())->getElementType();
    if (!StructTy->isSized()) {
      // The call may never return and hence the copy-instruction may never
      // be executed, and therefore it's not safe to say "the destination
      // has at least <cpyLen> bytes, as implied by the copy-instruction".
      return false;
    }

    uint64_t destSize = DL->getTypeAllocSize(StructTy);
    if (destSize < srcSize)
      return false;
  } else {
    return false;
  }

  // Check that dest points to memory that is at least as aligned as src.
  unsigned srcAlign = srcAlloca->getAlignment();
  if (!srcAlign)
    srcAlign = DL->getABITypeAlignment(srcAlloca->getAllocatedType());
  bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
  // If dest is not aligned enough and we can't increase its alignment then
  // bail out.
  if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
    return false;

  // Check that src is not accessed except via the call and the memcpy.  This
  // guarantees that it holds only undefined values when passed in (so the
  // final memcpy can be dropped), that it is not read or written between the
  // call and the memcpy, and that writing beyond the end of it is undefined.
  SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
                                   srcAlloca->user_end());
  while (!srcUseList.empty()) {
    User *U = srcUseList.pop_back_val();

    if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
      for (User *UU : U->users())
        srcUseList.push_back(UU);
    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
      if (G->hasAllZeroIndices())
        for (User *UU : U->users())
          srcUseList.push_back(UU);
      else
        return false;
    } else if (U != C && U != cpy) {
      return false;
    }
  }

  // Since we're changing the parameter to the callsite, we need to make sure
  // that what would be the new parameter dominates the callsite.
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
    if (!DT.dominates(cpyDestInst, C))
      return false;

  // In addition to knowing that the call does not access src in some
  // unexpected manner, for example via a global, which we deduce from
  // the use analysis, we also need to know that it does not sneakily
  // access dest.  We rely on AA to figure this out for us.
  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
  AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize);
  // If necessary, perform additional analysis.
  if (MR != AliasAnalysis::NoModRef)
    MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
  if (MR != AliasAnalysis::NoModRef)
    return false;

  // All the checks have passed, so do the transformation.
  bool changedArgument = false;
  for (unsigned i = 0; i < CS.arg_size(); ++i)
    if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
      Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
          : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
                                        cpyDest->getName(), C);
      changedArgument = true;
      if (CS.getArgument(i)->getType() == Dest->getType())
        CS.setArgument(i, Dest);
      else
        CS.setArgument(i, CastInst::CreatePointerCast(Dest,
                           CS.getArgument(i)->getType(), Dest->getName(), C));
    }

  if (!changedArgument)
    return false;

  // If the destination wasn't sufficiently aligned then increase its
  // alignment.
  if (!isDestSufficientlyAligned) {
    assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
  }

  // Drop any cached information about the call, because we may have changed
  // its dependence information by changing its parameter.
  MD->removeInstruction(C);

  // Remove the memcpy.
  MD->removeInstruction(cpy);
  ++NumMemCpyInstr;

  return true;
}

/// processMemCpyMemCpyDependence - We've found that the (upward scanning)
/// memory dependence of memcpy 'M' is the memcpy 'MDep'.  Try to simplify M to
/// copy from MDep's input if we can.  MSize is the size of M's copy.
///
bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                              uint64_t MSize) {
  // We can only transform memcpys where the dest of one is the source of the
  // other.
  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
    return false;

  // If the dep instruction is reading from our current input, then it is a
  // noop transfer and substituting the input won't change this instruction.
  // Just ignore the input and let someone else zap MDep.  This handles cases
  // like:
  //   memcpy(a <- a)
  //   memcpy(b <- a)
  if (M->getSource() == MDep->getSource())
    return false;

  // The lengths of the memcpys must be the same, or the preceding one must be
  // larger than the following one.
  ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
  ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
  if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
    return false;

  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();

  // Verify that the copied-from memory doesn't change in between the two
  // transfers.  For example, in:
  //   memcpy(a <- b)
  //   *b = 42;
  //   memcpy(c <- a)
  // it would be invalid to transform the second memcpy into memcpy(c <- b).
  //
  // TODO: If the code between M and MDep is transparent to the destination
  // "c", then we could still perform the xform by moving M up to the first
  // memcpy.
  //
  // NOTE: This is conservative, it will stop on any read from the source loc,
  // not just the defining memcpy.
  MemDepResult SourceDep =
      MD->getPointerDependencyFrom(AA.getLocationForSource(MDep),
                                   false, M, M->getParent());
  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
    return false;

  // If the dest of the second might alias the source of the first, then the
  // source and dest might overlap.  We still want to eliminate the
  // intermediate value, but we have to generate a memmove instead of memcpy.
  bool UseMemMove = false;
  if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep)))
    UseMemMove = true;

  // If all checks passed, then we can transform M.
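  //
  // For example (illustrative):
  //   memcpy(b <- a, 64)
  //   memcpy(c <- b, 64)
  // becomes
  //   memcpy(b <- a, 64)
  //   memcpy(c <- a, 64)
  // after which DSE can often delete the first copy entirely.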
  // Make sure to use the lesser of the alignment of the source and the dest
  // since we're changing where we're reading from, but don't want to increase
  // the alignment past what can be read from or written to.
  // TODO: Is this worth it if we're creating a less aligned memcpy?  For
  // example we could be moving from movaps -> movq on x86.
  unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());

  IRBuilder<> Builder(M);
  if (UseMemMove)
    Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(),
                          M->getLength(), Align, M->isVolatile());
  else
    Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
                         Align, M->isVolatile());

  // Remove the instruction we're replacing.
  MD->removeInstruction(M);
  M->eraseFromParent();
  ++NumMemCpyInstr;
  return true;
}


/// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
/// circumstances).  This allows later passes to remove the first memcpy
/// altogether.
bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
  // We can only optimize non-volatile memcpy's.
  if (M->isVolatile()) return false;

  // If the source and destination of the memcpy are the same, then zap it.
  if (M->getSource() == M->getDest()) {
    MD->removeInstruction(M);
    M->eraseFromParent();
    return false;
  }

  // If copying from a constant, try to turn the memcpy into a memset.
  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
    if (GV->isConstant() && GV->hasDefinitiveInitializer())
      if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
        IRBuilder<> Builder(M);
        Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
                             M->getAlignment(), false);
        MD->removeInstruction(M);
        M->eraseFromParent();
        ++NumCpyToSet;
        return true;
      }

  // The optimizations after this point require the memcpy size.
  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
  if (!CopySize) return false;

  // There are three possible optimizations we can do for memcpy:
  //   a) memcpy-memcpy xform which exposes redundancy for DSE.
  //   b) call-memcpy xform for return slot optimization.
  //   c) memcpy from freshly alloca'd space or space that has just started
  //      its lifetime copies undefined data, and we can therefore eliminate
  //      the memcpy in favor of the data that was already at the destination.
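  //
  // An illustrative instance of (c):
  //   %tmp = alloca [64 x i8]
  //   call void @llvm.memcpy(... %dst, ... %tmp, i64 64, ...)
  // with no intervening store to %tmp: the copy reads undef, so it can simply
  // be deleted.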
  MemDepResult DepInfo = MD->getDependency(M);
  if (DepInfo.isClobber()) {
    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
                               CopySize->getZExtValue(), M->getAlignment(),
                               C)) {
        MD->removeInstruction(M);
        M->eraseFromParent();
        return true;
      }
    }
  }

  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
                                                         M, M->getParent());
  if (SrcDepInfo.isClobber()) {
    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
      return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
  } else if (SrcDepInfo.isDef()) {
    Instruction *I = SrcDepInfo.getInst();
    bool hasUndefContents = false;

    if (isa<AllocaInst>(I)) {
      hasUndefContents = true;
    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      if (II->getIntrinsicID() == Intrinsic::lifetime_start)
        if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
          if (LTSize->getZExtValue() >= CopySize->getZExtValue())
            hasUndefContents = true;
    }

    if (hasUndefContents) {
      MD->removeInstruction(M);
      M->eraseFromParent();
      ++NumMemCpyInstr;
      return true;
    }
  }

  return false;
}

/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst
/// are guaranteed not to alias.
bool MemCpyOpt::processMemMove(MemMoveInst *M) {
  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();

  if (!TLI->has(LibFunc::memmove))
    return false;

  // See if the pointers alias.
  if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M)))
    return false;

  DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");

  // If not, then we know we can transform this.
  Module *Mod = M->getParent()->getParent()->getParent();
  Type *ArgTys[3] = { M->getRawDest()->getType(),
                      M->getRawSource()->getType(),
                      M->getLength()->getType() };
  M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy,
                                                 ArgTys));

  // MemDep may have overly conservative information about this instruction;
  // just conservatively flush it from the cache.
  MD->removeInstruction(M);

  ++NumMoveToCpy;
  return true;
}

/// processByValArgument - This is called on every byval argument in call sites.
bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
  if (!DL) return false;

  // Find out what feeds this byval argument.
  Value *ByValArg = CS.getArgument(ArgNo);
  Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
  uint64_t ByValSize = DL->getTypeAllocSize(ByValTy);
  MemDepResult DepInfo =
      MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg,
                                                           ByValSize),
                                   true, CS.getInstruction(),
                                   CS.getInstruction()->getParent());
  if (!DepInfo.isClobber())
    return false;

  // If the byval argument isn't fed by a memcpy, ignore it.  If it is fed by
  // a memcpy, see if we can byval from the source of the memcpy instead of the
  // result.
  MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
  if (!MDep || MDep->isVolatile() ||
      ByValArg->stripPointerCasts() != MDep->getDest())
    return false;

  // The length of the memcpy must be at least as large as the size of the
  // byval.
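  //
  // (For reference, the transformation we are working toward, illustratively:
  //    memcpy(%tmp <- %src, n)
  //    call @f(%tmp byval)
  //  =>
  //    call @f(%src byval)
  //  provided %src does not change between the memcpy and the call.)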
  ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
  if (!C1 || C1->getValue().getZExtValue() < ByValSize)
    return false;

  // Get the alignment of the byval.  If the call doesn't specify the
  // alignment, then it is some target-specific value that we can't know.
  unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
  if (ByValAlign == 0) return false;

  // If the byval alignment is greater than the memcpy's, then we check to see
  // if we can force the source of the memcpy to the alignment we need.  If we
  // fail, we bail out.
  if (MDep->getAlignment() < ByValAlign &&
      getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL) <
          ByValAlign)
    return false;

  // Verify that the copied-from memory doesn't change in between the memcpy
  // and the byval call.
  //   memcpy(a <- b)
  //   *b = 42;
  //   foo(*a)
  // It would be invalid to pass b instead of a, i.e. to transform this into
  // foo(*b).
  //
  // NOTE: This is conservative, it will stop on any read from the source loc,
  // not just the defining memcpy.
  MemDepResult SourceDep =
      MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep),
                                   false, CS.getInstruction(),
                                   MDep->getParent());
  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
    return false;

  Value *TmpCast = MDep->getSource();
  if (MDep->getSource()->getType() != ByValArg->getType())
    TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
                              "tmpcast", CS.getInstruction());

  DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
               << "  " << *MDep << "\n"
               << "  " << *CS.getInstruction() << "\n");

  // Otherwise we're good!  Update the byval argument.
  CS.setArgument(ArgNo, TmpCast);
  ++NumMemCpyInstr;
  return true;
}

/// iterateOnFunction - Executes one iteration of MemCpyOpt.
bool MemCpyOpt::iterateOnFunction(Function &F) {
  bool MadeChange = false;

  // Walk all instructions in the function.
  for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
      // Avoid invalidating the iterator.
      Instruction *I = BI++;

      bool RepeatInstruction = false;

      if (StoreInst *SI = dyn_cast<StoreInst>(I))
        MadeChange |= processStore(SI, BI);
      else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
        RepeatInstruction = processMemSet(M, BI);
      else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
        RepeatInstruction = processMemCpy(M);
      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
        RepeatInstruction = processMemMove(M);
      else if (CallSite CS = (Value*)I) {
        for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
          if (CS.isByValArgument(i))
            MadeChange |= processByValArgument(CS, i);
      }

      // Reprocess the instruction if desired.
      if (RepeatInstruction) {
        if (BI != BB->begin()) --BI;
        MadeChange = true;
      }
    }
  }

  return MadeChange;
}

// MemCpyOpt::runOnFunction - This is the main transformation entry point for
// a function.
//
bool MemCpyOpt::runOnFunction(Function &F) {
  if (skipOptnoneFunction(F))
    return false;

  bool MadeChange = false;
  MD = &getAnalysis<MemoryDependenceAnalysis>();
  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
  DL = DLP ? &DLP->getDataLayout() : nullptr;
  TLI = &getAnalysis<TargetLibraryInfo>();

  // If we don't have at least memset and memcpy, there is little point in
  // doing anything here.  These are required by a freestanding
  // implementation, so if even they are disabled, there is no point in trying
  // hard.
  if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
    return false;

  while (1) {
    if (!iterateOnFunction(F))
      break;
    MadeChange = true;
  }

  MD = nullptr;
  return MadeChange;
}