      1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 /// \file
     10 /// This transformation implements the well known scalar replacement of
     11 /// aggregates transformation. It tries to identify promotable elements of an
     12 /// aggregate alloca, and promote them to registers. It will also try to
     13 /// convert uses of an element (or set of elements) of an alloca into a vector
     14 /// or bitfield-style integer scalar if appropriate.
     15 ///
     16 /// It works to do this with minimal slicing of the alloca so that regions
     17 /// which are merely transferred in and out of external memory remain unchanged
     18 /// and are not decomposed to scalar code.
     19 ///
     20 /// Because this also performs alloca promotion, it can be thought of as also
     21 /// serving the purpose of SSA formation. The algorithm iterates on the
     22 /// function until all opportunities for promotion have been realized.
     23 ///
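/// As a purely hypothetical illustration (not taken from a test case), an
/// aggregate alloca such as:
///
///   %agg = alloca { i32, i32 }
///   %f0 = getelementptr { i32, i32 }, { i32, i32 }* %agg, i32 0, i32 0
///   store i32 %x, i32* %f0
///   %v = load i32, i32* %f0
///
/// can be sliced into its two i32 elements; the [0,4) slice is then promoted
/// so that %v simply becomes %x and the store, GEP, and alloca are deleted.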
     24 //===----------------------------------------------------------------------===//
     25 
     26 #include "llvm/Transforms/Scalar/SROA.h"
     27 #include "llvm/ADT/STLExtras.h"
     28 #include "llvm/ADT/SmallVector.h"
     29 #include "llvm/ADT/Statistic.h"
     30 #include "llvm/Analysis/AssumptionCache.h"
     31 #include "llvm/Analysis/GlobalsModRef.h"
     32 #include "llvm/Analysis/Loads.h"
     33 #include "llvm/Analysis/PtrUseVisitor.h"
     34 #include "llvm/Analysis/ValueTracking.h"
     35 #include "llvm/IR/Constants.h"
     36 #include "llvm/IR/DIBuilder.h"
     37 #include "llvm/IR/DataLayout.h"
     38 #include "llvm/IR/DebugInfo.h"
     39 #include "llvm/IR/DerivedTypes.h"
     40 #include "llvm/IR/IRBuilder.h"
     41 #include "llvm/IR/InstVisitor.h"
     42 #include "llvm/IR/Instructions.h"
     43 #include "llvm/IR/IntrinsicInst.h"
     44 #include "llvm/IR/LLVMContext.h"
     45 #include "llvm/IR/Operator.h"
     46 #include "llvm/Pass.h"
     47 #include "llvm/Support/CommandLine.h"
     48 #include "llvm/Support/Compiler.h"
     49 #include "llvm/Support/Debug.h"
     50 #include "llvm/Support/ErrorHandling.h"
     51 #include "llvm/Support/MathExtras.h"
     52 #include "llvm/Support/TimeValue.h"
     53 #include "llvm/Support/raw_ostream.h"
     54 #include "llvm/Transforms/Scalar.h"
     55 #include "llvm/Transforms/Utils/Local.h"
     56 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
     57 
     58 #ifndef NDEBUG
     59 // We only use this for a debug check.
     60 #include <random>
     61 #endif
     62 
     63 using namespace llvm;
     64 using namespace llvm::sroa;
     65 
     66 #define DEBUG_TYPE "sroa"
     67 
     68 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
     69 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
     70 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
     71 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
     72 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
     73 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
     74 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
     75 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
     76 STATISTIC(NumDeleted, "Number of instructions deleted");
     77 STATISTIC(NumVectorized, "Number of vectorized aggregates");
     78 
     79 /// Hidden option to enable randomly shuffling the slices to help uncover
     80 /// instability in their order.
     81 static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
     82                                              cl::init(false), cl::Hidden);
     83 
     84 /// Hidden option to experiment with completely strict handling of inbounds
     85 /// GEPs.
     86 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
     87                                         cl::Hidden);
     88 
     89 namespace {
     90 /// \brief A custom IRBuilder inserter which prefixes all names, but only in
     91 /// Assert builds.
     92 class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
     93   std::string Prefix;
     94   const Twine getNameWithPrefix(const Twine &Name) const {
     95     return Name.isTriviallyEmpty() ? Name : Prefix + Name;
     96   }
     97 
     98 public:
     99   void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
    100 
    101 protected:
    102   void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
    103                     BasicBlock::iterator InsertPt) const {
    104     IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
    105                                            InsertPt);
    106   }
    107 };
    108 
    109 /// \brief Provide a typedef for IRBuilder that drops names in release builds.
    110 using IRBuilderTy = llvm::IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
    111 }
    112 
    113 namespace {
    114 /// \brief A used slice of an alloca.
    115 ///
    116 /// This structure represents a slice of an alloca used by some instruction. It
    117 /// stores both the begin and end offsets of this use, a pointer to the use
    118 /// itself, and a flag indicating whether we can classify the use as splittable
    119 /// or not when forming partitions of the alloca.
    120 class Slice {
    121   /// \brief The beginning offset of the range.
    122   uint64_t BeginOffset;
    123 
    124   /// \brief The ending offset, not included in the range.
    125   uint64_t EndOffset;
    126 
    127   /// \brief Storage for both the use of this slice and whether it can be
    128   /// split.
    129   PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
    130 
    131 public:
    132   Slice() : BeginOffset(), EndOffset() {}
    133   Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
    134       : BeginOffset(BeginOffset), EndOffset(EndOffset),
    135         UseAndIsSplittable(U, IsSplittable) {}
    136 
    137   uint64_t beginOffset() const { return BeginOffset; }
    138   uint64_t endOffset() const { return EndOffset; }
    139 
    140   bool isSplittable() const { return UseAndIsSplittable.getInt(); }
    141   void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
    142 
    143   Use *getUse() const { return UseAndIsSplittable.getPointer(); }
    144 
    145   bool isDead() const { return getUse() == nullptr; }
    146   void kill() { UseAndIsSplittable.setPointer(nullptr); }
    147 
    148   /// \brief Support for ordering ranges.
    149   ///
    150   /// This provides an ordering over ranges such that start offsets are
    151   /// always increasing, and within equal start offsets, the end offsets are
    152   /// decreasing. Thus the spanning range comes first in a cluster with the
    153   /// same start position.
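  ///
  /// For example, with hypothetical offsets: an unsplittable slice over [0,4)
  /// orders before a splittable slice over [0,8) (equal starts are broken by
  /// splittability first), and a splittable [0,8) slice orders before a
  /// splittable [0,4) one, so the spanning slice leads its cluster.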
    154   bool operator<(const Slice &RHS) const {
    155     if (beginOffset() < RHS.beginOffset())
    156       return true;
    157     if (beginOffset() > RHS.beginOffset())
    158       return false;
    159     if (isSplittable() != RHS.isSplittable())
    160       return !isSplittable();
    161     if (endOffset() > RHS.endOffset())
    162       return true;
    163     return false;
    164   }
    165 
    166   /// \brief Support comparison with a single offset to allow binary searches.
    167   friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
    168                                               uint64_t RHSOffset) {
    169     return LHS.beginOffset() < RHSOffset;
    170   }
    171   friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
    172                                               const Slice &RHS) {
    173     return LHSOffset < RHS.beginOffset();
    174   }
    175 
    176   bool operator==(const Slice &RHS) const {
    177     return isSplittable() == RHS.isSplittable() &&
    178            beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
    179   }
    180   bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
    181 };
    182 } // end anonymous namespace
    183 
    184 namespace llvm {
    185 template <typename T> struct isPodLike;
    186 template <> struct isPodLike<Slice> { static const bool value = true; };
    187 }
    188 
    189 /// \brief Representation of the alloca slices.
    190 ///
    191 /// This class represents the slices of an alloca which are formed by its
    192 /// various uses. If a pointer escapes, we can't fully build a representation
    193 /// for the slices used and we reflect that in this structure. The uses are
    194 /// stored, sorted by increasing beginning offset and with unsplittable slices
    195 /// starting at a particular offset before splittable slices.
    196 class llvm::sroa::AllocaSlices {
    197 public:
    198   /// \brief Construct the slices of a particular alloca.
    199   AllocaSlices(const DataLayout &DL, AllocaInst &AI);
    200 
    201   /// \brief Test whether a pointer to the allocation escapes our analysis.
    202   ///
    203   /// If this is true, the slices are never fully built and should be
    204   /// ignored.
    205   bool isEscaped() const { return PointerEscapingInstr; }
    206 
    207   /// \brief Support for iterating over the slices.
    208   /// @{
    209   typedef SmallVectorImpl<Slice>::iterator iterator;
    210   typedef iterator_range<iterator> range;
    211   iterator begin() { return Slices.begin(); }
    212   iterator end() { return Slices.end(); }
    213 
    214   typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
    215   typedef iterator_range<const_iterator> const_range;
    216   const_iterator begin() const { return Slices.begin(); }
    217   const_iterator end() const { return Slices.end(); }
    218   /// @}
    219 
    220   /// \brief Erase a range of slices.
    221   void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
    222 
    223   /// \brief Insert new slices for this alloca.
    224   ///
    225   /// This moves the slices into the alloca's slices collection, and re-sorts
    226   /// everything so that the usual ordering properties of the alloca's slices
    227   /// hold.
    228   void insert(ArrayRef<Slice> NewSlices) {
    229     int OldSize = Slices.size();
    230     Slices.append(NewSlices.begin(), NewSlices.end());
    231     auto SliceI = Slices.begin() + OldSize;
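    // Sort only the newly appended tail, then merge the two now-sorted runs in
    // place; both runs respect the ordering defined by Slice::operator<, so
    // the merge restores the global invariant without re-sorting everything.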
    232     std::sort(SliceI, Slices.end());
    233     std::inplace_merge(Slices.begin(), SliceI, Slices.end());
    234   }
    235 
    236   // Forward declare the iterator and range accessor for walking the
    237   // partitions.
    238   class partition_iterator;
    239   iterator_range<partition_iterator> partitions();
    240 
    241   /// \brief Access the dead users for this alloca.
    242   ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
    243 
    244   /// \brief Access the dead operands referring to this alloca.
    245   ///
    246   /// These are operands which cannot actually be used to refer to the
    247   /// alloca as they are outside its range and the user doesn't correct for
    248   /// that. These mostly consist of PHI node inputs and the like which we just
    249   /// need to replace with undef.
    250   ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
    251 
    252 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    253   void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
    254   void printSlice(raw_ostream &OS, const_iterator I,
    255                   StringRef Indent = "  ") const;
    256   void printUse(raw_ostream &OS, const_iterator I,
    257                 StringRef Indent = "  ") const;
    258   void print(raw_ostream &OS) const;
    259   void dump(const_iterator I) const;
    260   void dump() const;
    261 #endif
    262 
    263 private:
    264   template <typename DerivedT, typename RetT = void> class BuilderBase;
    265   class SliceBuilder;
    266   friend class AllocaSlices::SliceBuilder;
    267 
    268 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    269   /// \brief Handle to alloca instruction to simplify method interfaces.
    270   AllocaInst &AI;
    271 #endif
    272 
    273   /// \brief The instruction responsible for this alloca not having a known set
    274   /// of slices.
    275   ///
    276   /// When an instruction (potentially) escapes the pointer to the alloca, we
    277   /// store a pointer to that here and abort trying to form slices of the
    278   /// alloca. This will be null if the alloca slices are analyzed successfully.
    279   Instruction *PointerEscapingInstr;
    280 
    281   /// \brief The slices of the alloca.
    282   ///
    283   /// We store a vector of the slices formed by uses of the alloca here. This
    284   /// vector is sorted by increasing begin offset, and then the unsplittable
    285   /// slices before the splittable ones. See the Slice inner class for more
    286   /// details.
    287   SmallVector<Slice, 8> Slices;
    288 
    289   /// \brief Instructions which will become dead if we rewrite the alloca.
    290   ///
    291   /// Note that these are not separated by slice. This is because we expect an
    292   /// alloca to be completely rewritten or not rewritten at all. If rewritten,
    293   /// all these instructions can simply be removed and replaced with undef as
    294   /// they come from outside of the allocated space.
    295   SmallVector<Instruction *, 8> DeadUsers;
    296 
    297   /// \brief Operands which will become dead if we rewrite the alloca.
    298   ///
    299   /// These are operands that in their particular use can be replaced with
    300   /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
    301   /// to PHI nodes and the like. They aren't entirely dead (there might be
    302   /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
    303   /// want to swap this particular input for undef to simplify the use lists of
    304   /// the alloca.
    305   SmallVector<Use *, 8> DeadOperands;
    306 };
    307 
    308 /// \brief A partition of the slices.
    309 ///
    310 /// An ephemeral representation for a range of slices which can be viewed as
    311 /// a partition of the alloca. This range represents a span of the alloca's
    312 /// memory which cannot be split, and provides access to all of the slices
    313 /// overlapping some part of the partition.
    314 ///
    315 /// Objects of this type are produced by traversing the alloca's slices, but
    316 /// are only ephemeral and not persistent.
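///
/// For illustration, with purely hypothetical slices: given an unsplittable
/// slice over [0,8), a splittable slice over [0,16), and a splittable slice
/// over [8,16), the first partition spans [0,8) and iterates the first two
/// slices, while the second spans [8,16) and sees the [0,16) slice's
/// remainder via splitSliceTails().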
    317 class llvm::sroa::Partition {
    318 private:
    319   friend class AllocaSlices;
    320   friend class AllocaSlices::partition_iterator;
    321 
    322   typedef AllocaSlices::iterator iterator;
    323 
    324   /// \brief The beginning and ending offsets of the alloca for this
    325   /// partition.
    326   uint64_t BeginOffset, EndOffset;
    327 
    328   /// \brief The start and end iterators of this partition.
    329   iterator SI, SJ;
    330 
    331   /// \brief A collection of split slice tails overlapping the partition.
    332   SmallVector<Slice *, 4> SplitTails;
    333 
    334   /// \brief Raw constructor builds an empty partition starting and ending at
    335   /// the given iterator.
    336   Partition(iterator SI) : SI(SI), SJ(SI) {}
    337 
    338 public:
    339   /// \brief The start offset of this partition.
    340   ///
    341   /// All of the contained slices start at or after this offset.
    342   uint64_t beginOffset() const { return BeginOffset; }
    343 
    344   /// \brief The end offset of this partition.
    345   ///
    346   /// All of the contained slices end at or before this offset.
    347   uint64_t endOffset() const { return EndOffset; }
    348 
    349   /// \brief The size of the partition.
    350   ///
    351   /// Note that this can never be zero.
    352   uint64_t size() const {
    353     assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
    354     return EndOffset - BeginOffset;
    355   }
    356 
    357   /// \brief Test whether this partition contains no slices, and merely spans
    358   /// a region occupied by split slices.
    359   bool empty() const { return SI == SJ; }
    360 
    361   /// \name Iterate slices that start within the partition.
    362   /// These may be splittable or unsplittable. They have a begin offset >= the
    363   /// partition begin offset.
    364   /// @{
    365   // FIXME: We should probably define a "concat_iterator" helper and use that
    366   // to stitch together pointee_iterators over the split tails and the
    367   // contiguous iterators of the partition. That would give a much nicer
    368   // interface here. We could then additionally expose filtered iterators for
    369   // split, unsplit, and unsplittable slices based on the usage patterns.
    370   iterator begin() const { return SI; }
    371   iterator end() const { return SJ; }
    372   /// @}
    373 
    374   /// \brief Get the sequence of split slice tails.
    375   ///
    376   /// These tails are of slices which start before this partition but are
    377   /// split and overlap into the partition. We accumulate these while forming
    378   /// partitions.
    379   ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
    380 };
    381 
    382 /// \brief An iterator over partitions of the alloca's slices.
    383 ///
    384 /// This iterator implements the core algorithm for partitioning the alloca's
    385 /// slices. It is a forward iterator as we don't support backtracking for
    386 /// efficiency reasons, and re-use a single storage area to maintain the
    387 /// current set of split slices.
    388 ///
    389 /// It walks the AllocaSlices::iterator positions directly and materializes
    390 /// each Partition lazily as it advances.
    391 class AllocaSlices::partition_iterator
    392     : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
    393                                   Partition> {
    394   friend class AllocaSlices;
    395 
    396   /// \brief Most of the state for walking the partitions is held in a class
    397   /// with a nice interface for examining them.
    398   Partition P;
    399 
    400   /// \brief We need to keep the end of the slices to know when to stop.
    401   AllocaSlices::iterator SE;
    402 
    403   /// \brief We also need to keep track of the maximum split end offset seen.
    404   /// FIXME: Do we really?
    405   uint64_t MaxSplitSliceEndOffset;
    406 
    407   /// \brief Sets the partition to be empty at the given iterator, and sets the
    408   /// end iterator.
    409   partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
    410       : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
    411     // If not already at the end, advance our state to form the initial
    412     // partition.
    413     if (SI != SE)
    414       advance();
    415   }
    416 
    417   /// \brief Advance the iterator to the next partition.
    418   ///
    419   /// Requires that the iterator not be at the end of the slices.
    420   void advance() {
    421     assert((P.SI != SE || !P.SplitTails.empty()) &&
    422            "Cannot advance past the end of the slices!");
    423 
    424     // Clear out any split uses which have ended.
    425     if (!P.SplitTails.empty()) {
    426       if (P.EndOffset >= MaxSplitSliceEndOffset) {
    427         // If we've finished all splits, this is easy.
    428         P.SplitTails.clear();
    429         MaxSplitSliceEndOffset = 0;
    430       } else {
    431         // Remove the uses which have ended in the prior partition. This
    432         // cannot change the max split slice end because we just checked that
    433         // the prior partition ended prior to that max.
    434         P.SplitTails.erase(
    435             std::remove_if(
    436                 P.SplitTails.begin(), P.SplitTails.end(),
    437                 [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
    438             P.SplitTails.end());
    439         assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
    440                            [&](Slice *S) {
    441                              return S->endOffset() == MaxSplitSliceEndOffset;
    442                            }) &&
    443                "Could not find the current max split slice offset!");
    444         assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
    445                            [&](Slice *S) {
    446                              return S->endOffset() <= MaxSplitSliceEndOffset;
    447                            }) &&
    448                "Max split slice end offset is not actually the max!");
    449       }
    450     }
    451 
    452     // If P.SI is already at the end, then we've cleared the split tail and
    453     // now have an end iterator.
    454     if (P.SI == SE) {
    455       assert(P.SplitTails.empty() && "Failed to clear the split slices!");
    456       return;
    457     }
    458 
    459     // If we had a non-empty partition previously, set up the state for
    460     // subsequent partitions.
    461     if (P.SI != P.SJ) {
    462       // Accumulate all the splittable slices which started in the old
    463       // partition into the split list.
    464       for (Slice &S : P)
    465         if (S.isSplittable() && S.endOffset() > P.EndOffset) {
    466           P.SplitTails.push_back(&S);
    467           MaxSplitSliceEndOffset =
    468               std::max(S.endOffset(), MaxSplitSliceEndOffset);
    469         }
    470 
    471       // Start from the end of the previous partition.
    472       P.SI = P.SJ;
    473 
    474       // If P.SI is now at the end, we at most have a tail of split slices.
    475       if (P.SI == SE) {
    476         P.BeginOffset = P.EndOffset;
    477         P.EndOffset = MaxSplitSliceEndOffset;
    478         return;
    479       }
    480 
    481       // If we have split slices and the next slice comes after a gap and is
    482       // not splittable, immediately form an empty partition for the split
    483       // slices up until the next slice begins.
    484       if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
    485           !P.SI->isSplittable()) {
    486         P.BeginOffset = P.EndOffset;
    487         P.EndOffset = P.SI->beginOffset();
    488         return;
    489       }
    490     }
    491 
    492     // OK, we need to consume new slices. Set the end offset based on the
    493     // current slice, and step SJ past it. The beginning offset of the
    494     // partition is the beginning offset of the next slice unless we have
    495     // pre-existing split slices that are continuing, in which case we begin
    496     // at the prior end offset.
    497     P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
    498     P.EndOffset = P.SI->endOffset();
    499     ++P.SJ;
    500 
    501     // There are two strategies to form a partition based on whether the
    502     // partition starts with an unsplittable slice or a splittable slice.
    503     if (!P.SI->isSplittable()) {
    504       // When we're forming an unsplittable region, it must always start at
    505       // the first slice and will extend through its end.
    506       assert(P.BeginOffset == P.SI->beginOffset());
    507 
    508       // Form a partition including all of the overlapping slices with this
    509       // unsplittable slice.
    510       while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
    511         if (!P.SJ->isSplittable())
    512           P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
    513         ++P.SJ;
    514       }
    515 
    516       // We have a partition across a set of overlapping unsplittable
    517       // slices.
    518       return;
    519     }
    520 
    521     // If we're starting with a splittable slice, then we need to form
    522     // a synthetic partition spanning it and any other overlapping splittable
    523     // slices.
    524     assert(P.SI->isSplittable() && "Forming a splittable partition!");
    525 
    526     // Collect all of the overlapping splittable slices.
    527     while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
    528            P.SJ->isSplittable()) {
    529       P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
    530       ++P.SJ;
    531     }
    532 
    533     // Back up P.EndOffset if we ended the span early when encountering an
    534     // unsplittable slice. This synthesizes the early end offset of
    535     // a partition spanning only splittable slices.
    536     if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
    537       assert(!P.SJ->isSplittable());
    538       P.EndOffset = P.SJ->beginOffset();
    539     }
    540   }
    541 
    542 public:
    543   bool operator==(const partition_iterator &RHS) const {
    544     assert(SE == RHS.SE &&
    545            "End iterators don't match between compared partition iterators!");
    546 
    547     // The observed positions of partitions are marked by the P.SI iterator and
    548     // the emptiness of the split slices. The latter is only relevant when
    549     // P.SI == SE, as the end iterator will additionally have an empty split
    550     // slices list, but the prior may have the same P.SI and a tail of split
    551     // slices.
    552     if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
    553       assert(P.SJ == RHS.P.SJ &&
    554              "Same set of slices formed two different sized partitions!");
    555       assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
    556              "Same slice position with differently sized non-empty split "
    557              "slice tails!");
    558       return true;
    559     }
    560     return false;
    561   }
    562 
    563   partition_iterator &operator++() {
    564     advance();
    565     return *this;
    566   }
    567 
    568   Partition &operator*() { return P; }
    569 };
    570 
    571 /// \brief A forward range over the partitions of the alloca's slices.
    572 ///
    573 /// This accesses an iterator range over the partitions of the alloca's
    574 /// slices. It computes these partitions on the fly based on the overlapping
    575 /// offsets of the slices and the ability to split them. It will visit "empty"
    576 /// partitions to cover regions of the alloca only accessed via split
    577 /// slices.
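///
/// For example (hypothetical slices): a splittable memcpy slice over [0,16)
/// plus an unsplittable store slice over [0,4) yields a partition at [0,4)
/// followed by an "empty" partition at [4,16) whose only content is the
/// memcpy's split tail.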
    578 iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
    579   return make_range(partition_iterator(begin(), end()),
    580                     partition_iterator(end(), end()));
    581 }
    582 
    583 static Value *foldSelectInst(SelectInst &SI) {
    584   // If the condition being selected on is a constant or the same value is
    585   // being selected between, fold the select. Yes this does (rarely) happen
    586   // early on.
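  // For example (hypothetical IR), "select i1 true, i32* %a, i32* %b" folds
  // to %a, and "select i1 %c, i32* %p, i32* %p" folds to %p; anything else
  // is left to the normal slice-building path.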
    587   if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
    588     return SI.getOperand(1 + CI->isZero());
    589   if (SI.getOperand(1) == SI.getOperand(2))
    590     return SI.getOperand(1);
    591 
    592   return nullptr;
    593 }
    594 
    595 /// \brief A helper that folds a PHI node or a select.
    596 static Value *foldPHINodeOrSelectInst(Instruction &I) {
    597   if (PHINode *PN = dyn_cast<PHINode>(&I)) {
    598     // If PN merges together the same value, return that value.
    599     return PN->hasConstantValue();
    600   }
    601   return foldSelectInst(cast<SelectInst>(I));
    602 }
    603 
    604 /// \brief Builder for the alloca slices.
    605 ///
    606 /// This class builds a set of alloca slices by recursively visiting the uses
    607 /// of an alloca and making a slice for each load and store at each offset.
    608 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
    609   friend class PtrUseVisitor<SliceBuilder>;
    610   friend class InstVisitor<SliceBuilder>;
    611   typedef PtrUseVisitor<SliceBuilder> Base;
    612 
    613   const uint64_t AllocSize;
    614   AllocaSlices &AS;
    615 
    616   SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
    617   SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
    618 
    619   /// \brief Set to de-duplicate dead instructions found in the use walk.
    620   SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
    621 
    622 public:
    623   SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
    624       : PtrUseVisitor<SliceBuilder>(DL),
    625         AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
    626 
    627 private:
    628   void markAsDead(Instruction &I) {
    629     if (VisitedDeadInsts.insert(&I).second)
    630       AS.DeadUsers.push_back(&I);
    631   }
    632 
    633   void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
    634                  bool IsSplittable = false) {
    635     // Completely skip uses which have a zero size or start either before or
    636     // past the end of the allocation.
    637     if (Size == 0 || Offset.uge(AllocSize)) {
    638       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
    639                    << " which has zero size or starts outside of the "
    640                    << AllocSize << " byte alloca:\n"
    641                    << "    alloca: " << AS.AI << "\n"
    642                    << "       use: " << I << "\n");
    643       return markAsDead(I);
    644     }
    645 
    646     uint64_t BeginOffset = Offset.getZExtValue();
    647     uint64_t EndOffset = BeginOffset + Size;
    648 
    649     // Clamp the end offset to the end of the allocation. Note that this is
    650     // formulated to handle even the case where "BeginOffset + Size" overflows.
    651     // This may appear superficially to be something we could ignore entirely,
    652     // but that is not so! There may be widened loads or PHI-node uses where
    653     // some instructions are dead but not others. We can't completely ignore
    654     // them, and so have to record at least the information here.
    655     assert(AllocSize >= BeginOffset); // Established above.
    656     if (Size > AllocSize - BeginOffset) {
    657       DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
    658                    << " to remain within the " << AllocSize << " byte alloca:\n"
    659                    << "    alloca: " << AS.AI << "\n"
    660                    << "       use: " << I << "\n");
    661       EndOffset = AllocSize;
    662     }
    663 
    664     AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
    665   }
    666 
    667   void visitBitCastInst(BitCastInst &BC) {
    668     if (BC.use_empty())
    669       return markAsDead(BC);
    670 
    671     return Base::visitBitCastInst(BC);
    672   }
    673 
    674   void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    675     if (GEPI.use_empty())
    676       return markAsDead(GEPI);
    677 
    678     if (SROAStrictInbounds && GEPI.isInBounds()) {
    679       // FIXME: This is a manually un-factored variant of the basic code inside
    680       // of GEPs with checking of the inbounds invariant specified in the
    681       // langref in a very strict sense. If we ever want to enable
    682       // SROAStrictInbounds, this code should be factored cleanly into
    683       // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
    684       // by writing out the code here where we have the underlying allocation
    685       // size readily available.
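      // As a hypothetical example with a typical 64-bit data layout: for
      // "getelementptr inbounds { i32, i64 }, { i32, i64 }* %p, i32 0, i32 1"
      // the struct index contributes SL->getElementOffset(1) == 8 bytes,
      // while an index of 3 into [4 x i32] contributes 3 * 4 == 12 bytes.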
    686       APInt GEPOffset = Offset;
    687       const DataLayout &DL = GEPI.getModule()->getDataLayout();
    688       for (gep_type_iterator GTI = gep_type_begin(GEPI),
    689                              GTE = gep_type_end(GEPI);
    690            GTI != GTE; ++GTI) {
    691         ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    692         if (!OpC)
    693           break;
    694 
    695         // Handle a struct index, which adds its field offset to the pointer.
    696         if (StructType *STy = dyn_cast<StructType>(*GTI)) {
    697           unsigned ElementIdx = OpC->getZExtValue();
    698           const StructLayout *SL = DL.getStructLayout(STy);
    699           GEPOffset +=
    700               APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
    701         } else {
    702           // For array or vector indices, scale the index by the size of the
    703           // type.
    704           APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
    705           GEPOffset += Index * APInt(Offset.getBitWidth(),
    706                                      DL.getTypeAllocSize(GTI.getIndexedType()));
    707         }
    708 
    709         // If this index has computed an intermediate pointer which is not
    710         // inbounds, then the result of the GEP is a poison value and we can
    711         // delete it and all uses.
    712         if (GEPOffset.ugt(AllocSize))
    713           return markAsDead(GEPI);
    714       }
    715     }
    716 
    717     return Base::visitGetElementPtrInst(GEPI);
    718   }
    719 
    720   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
    721                          uint64_t Size, bool IsVolatile) {
    722     // We allow splitting of non-volatile loads and stores where the type is an
    723     // integer type. These may be used to implement 'memcpy' or other "transfer
    724     // of bits" patterns.
    725     bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
    726 
    727     insertUse(I, Offset, Size, IsSplittable);
    728   }
    729 
    730   void visitLoadInst(LoadInst &LI) {
    731     assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
    732            "All simple FCA loads should have been pre-split");
    733 
    734     if (!IsOffsetKnown)
    735       return PI.setAborted(&LI);
    736 
    737     const DataLayout &DL = LI.getModule()->getDataLayout();
    738     uint64_t Size = DL.getTypeStoreSize(LI.getType());
    739     return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
    740   }
    741 
    742   void visitStoreInst(StoreInst &SI) {
    743     Value *ValOp = SI.getValueOperand();
    744     if (ValOp == *U)
    745       return PI.setEscapedAndAborted(&SI);
    746     if (!IsOffsetKnown)
    747       return PI.setAborted(&SI);
    748 
    749     const DataLayout &DL = SI.getModule()->getDataLayout();
    750     uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
    751 
    752     // If this memory access can be shown to *statically* extend outside the
    753     // bounds of the allocation, its behavior is undefined, so simply
    754     // ignore it. Note that this is more strict than the generic clamping
    755     // behavior of insertUse. We also try to handle cases which might run the
    756     // risk of overflow.
    757     // FIXME: We should instead consider the pointer to have escaped if this
    758     // function is being instrumented for addressing bugs or race conditions.
    759     if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
    760       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
    761                    << " which extends past the end of the " << AllocSize
    762                    << " byte alloca:\n"
    763                    << "    alloca: " << AS.AI << "\n"
    764                    << "       use: " << SI << "\n");
    765       return markAsDead(SI);
    766     }
    767 
    768     assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
    769            "All simple FCA stores should have been pre-split");
    770     handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
    771   }
    772 
    773   void visitMemSetInst(MemSetInst &II) {
    774     assert(II.getRawDest() == *U && "Pointer use is not the destination?");
    775     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    776     if ((Length && Length->getValue() == 0) ||
    777         (IsOffsetKnown && Offset.uge(AllocSize)))
    778       // Zero-length mem transfer intrinsics can be ignored entirely.
    779       return markAsDead(II);
    780 
    781     if (!IsOffsetKnown)
    782       return PI.setAborted(&II);
    783 
    784     insertUse(II, Offset, Length ? Length->getLimitedValue()
    785                                  : AllocSize - Offset.getLimitedValue(),
    786               (bool)Length);
    787   }
    788 
    789   void visitMemTransferInst(MemTransferInst &II) {
    790     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    791     if (Length && Length->getValue() == 0)
    792       // Zero-length mem transfer intrinsics can be ignored entirely.
    793       return markAsDead(II);
    794 
    795     // Because we can visit these intrinsics twice, also check to see if the
    796     // first time marked this instruction as dead. If so, skip it.
    797     if (VisitedDeadInsts.count(&II))
    798       return;
    799 
    800     if (!IsOffsetKnown)
    801       return PI.setAborted(&II);
    802 
    803     // This side of the transfer is completely out-of-bounds, and so we can
    804     // nuke the entire transfer. However, we also need to nuke the other side
    805     // if already added to our partitions.
    806     // FIXME: Yet another place we really should bypass this when
    807     // instrumenting for ASan.
    808     if (Offset.uge(AllocSize)) {
    809       SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
    810           MemTransferSliceMap.find(&II);
    811       if (MTPI != MemTransferSliceMap.end())
    812         AS.Slices[MTPI->second].kill();
    813       return markAsDead(II);
    814     }
    815 
    816     uint64_t RawOffset = Offset.getLimitedValue();
    817     uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
    818 
    819     // Check for the special case where the same exact value is used for both
    820     // source and dest.
    821     if (*U == II.getRawDest() && *U == II.getRawSource()) {
    822       // For non-volatile transfers this is a no-op.
    823       if (!II.isVolatile())
    824         return markAsDead(II);
    825 
    826       return insertUse(II, Offset, Size, /*IsSplittable=*/false);
    827     }
    828 
    829     // If we have seen both source and destination for a mem transfer, then
    830     // they both point to the same alloca.
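    // For instance (hypothetically), a memcpy within one alloca from offset 4
    // to offset 0 is visited once per pointer operand: the second visit finds
    // the slice recorded by the first, and because the begin offsets differ,
    // both slices end up unsplittable rather than the transfer being elided.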
    831     bool Inserted;
    832     SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
    833     std::tie(MTPI, Inserted) =
    834         MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
    835     unsigned PrevIdx = MTPI->second;
    836     if (!Inserted) {
    837       Slice &PrevP = AS.Slices[PrevIdx];
    838 
    839       // Check if the begin offsets match and this is a non-volatile transfer.
    840       // In that case, we can completely elide the transfer.
    841       if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
    842         PrevP.kill();
    843         return markAsDead(II);
    844       }
    845 
    846       // Otherwise we have an offset transfer within the same alloca. We can't
    847       // split those.
    848       PrevP.makeUnsplittable();
    849     }
    850 
    851     // Insert the use now that we've fixed up the splittable nature.
    852     insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
    853 
    854     // Check that we ended up with a valid index in the map.
    855     assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
    856            "Map index doesn't point back to a slice with this user.");
    857   }
    858 
    859   // Disable SRoA for any intrinsics except for lifetime invariants.
    860   // FIXME: What about debug intrinsics? This matches old behavior, but
    861   // doesn't make sense.
    862   void visitIntrinsicInst(IntrinsicInst &II) {
    863     if (!IsOffsetKnown)
    864       return PI.setAborted(&II);
    865 
    866     if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
    867         II.getIntrinsicID() == Intrinsic::lifetime_end) {
    868       ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
    869       uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
    870                                Length->getLimitedValue());
    871       insertUse(II, Offset, Size, true);
    872       return;
    873     }
    874 
    875     Base::visitIntrinsicInst(II);
    876   }
    877 
    878   Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    879     // We consider any PHI or select that results in a direct load or store of
    880     // the same offset to be a viable use for slicing purposes. These uses
    881     // are considered unsplittable and the size is the maximum loaded or stored
    882     // size.
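    // For instance (hypothetically), a select feeding an i32 load and an i64
    // store of some other value is usable and yields Size == 8, whereas a
    // select whose value is itself stored, or reached through a GEP with
    // non-zero indices, is reported back as unsafe.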
    883     SmallPtrSet<Instruction *, 4> Visited;
    884     SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
    885     Visited.insert(Root);
    886     Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    887     const DataLayout &DL = Root->getModule()->getDataLayout();
    888     // If there are no loads or stores, the access is dead. We mark that as
    889     // a size zero access.
    890     Size = 0;
    891     do {
    892       Instruction *I, *UsedI;
    893       std::tie(UsedI, I) = Uses.pop_back_val();
    894 
    895       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
    896         Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
    897         continue;
    898       }
    899       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
    900         Value *Op = SI->getOperand(0);
    901         if (Op == UsedI)
    902           return SI;
    903         Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
    904         continue;
    905       }
    906 
    907       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
    908         if (!GEP->hasAllZeroIndices())
    909           return GEP;
    910       } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
    911                  !isa<SelectInst>(I)) {
    912         return I;
    913       }
    914 
    915       for (User *U : I->users())
    916         if (Visited.insert(cast<Instruction>(U)).second)
    917           Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
    918     } while (!Uses.empty());
    919 
    920     return nullptr;
    921   }
    922 
    923   void visitPHINodeOrSelectInst(Instruction &I) {
    924     assert(isa<PHINode>(I) || isa<SelectInst>(I));
    925     if (I.use_empty())
    926       return markAsDead(I);
    927 
    928     // TODO: We could use SimplifyInstruction here to fold PHINodes and
    929     // SelectInsts. However, doing so requires changing the current
    930     // dead-operand-tracking mechanism. For instance, suppose neither loading
    931     // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
    932     // trap either.  However, if we simply replace %U with undef using the
    933     // current dead-operand-tracking mechanism, "load (select undef, undef,
    934     // %other)" may trap because the select may return the first operand
    935     // "undef".
    936     if (Value *Result = foldPHINodeOrSelectInst(I)) {
    937       if (Result == *U)
    938         // If the result of the constant fold will be the pointer, recurse
    939         // through the PHI/select as if we had RAUW'ed it.
    940         enqueueUsers(I);
    941       else
    942         // Otherwise the operand to the PHI/select is dead, and we can replace
    943         // it with undef.
    944         AS.DeadOperands.push_back(U);
    945 
    946       return;
    947     }
    948 
    949     if (!IsOffsetKnown)
    950       return PI.setAborted(&I);
    951 
    952     // See if we already have computed info on this node.
    953     uint64_t &Size = PHIOrSelectSizes[&I];
    954     if (!Size) {
    955       // This is a new PHI/Select, check for an unsafe use of it.
    956       if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
    957         return PI.setAborted(UnsafeI);
    958     }
    959 
    960     // For PHI and select operands outside the alloca, we can't nuke the entire
    961     // phi or select -- the other side might still be relevant, so we special
    962     // case them here and use a separate structure to track the operands
    963     // themselves which should be replaced with undef.
    964     // FIXME: This should instead be escaped in the event we're instrumenting
    965     // for address sanitization.
    966     if (Offset.uge(AllocSize)) {
    967       AS.DeadOperands.push_back(U);
    968       return;
    969     }
    970 
    971     insertUse(I, Offset, Size);
    972   }
    973 
    974   void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
    975 
    976   void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
    977 
    978   /// \brief Disable SROA entirely if there are unhandled users of the alloca.
    979   void visitInstruction(Instruction &I) { PI.setAborted(&I); }
    980 };
    981 
    982 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
    983     :
    984 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    985       AI(AI),
    986 #endif
    987       PointerEscapingInstr(nullptr) {
    988   SliceBuilder PB(DL, AI, *this);
    989   SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
    990   if (PtrI.isEscaped() || PtrI.isAborted()) {
    991     // FIXME: We should sink the escape vs. abort info into the caller nicely,
    992     // possibly by just storing the PtrInfo in the AllocaSlices.
    993     PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
    994                                                   : PtrI.getAbortingInst();
    995     assert(PointerEscapingInstr && "Did not track a bad instruction");
    996     return;
    997   }
    998 
    999   Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
   1000                               [](const Slice &S) {
   1001                                 return S.isDead();
   1002                               }),
   1003                Slices.end());
   1004 
   1005 #ifndef NDEBUG
   1006   if (SROARandomShuffleSlices) {
   1007     std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
   1008     std::shuffle(Slices.begin(), Slices.end(), MT);
   1009   }
   1010 #endif
   1011 
   1012   // Sort the uses. This arranges for the offsets to be in ascending order,
   1013   // and the sizes to be in descending order.
   1014   std::sort(Slices.begin(), Slices.end());
   1015 }
   1016 
   1017 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   1018 
   1019 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
   1020                          StringRef Indent) const {
   1021   printSlice(OS, I, Indent);
   1022   OS << "\n";
   1023   printUse(OS, I, Indent);
   1024 }
   1025 
   1026 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
   1027                               StringRef Indent) const {
   1028   OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
   1029      << " slice #" << (I - begin())
   1030      << (I->isSplittable() ? " (splittable)" : "");
   1031 }
   1032 
   1033 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
   1034                             StringRef Indent) const {
   1035   OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
   1036 }
   1037 
   1038 void AllocaSlices::print(raw_ostream &OS) const {
   1039   if (PointerEscapingInstr) {
   1040     OS << "Can't analyze slices for alloca: " << AI << "\n"
   1041        << "  A pointer to this alloca escaped by:\n"
   1042        << "  " << *PointerEscapingInstr << "\n";
   1043     return;
   1044   }
   1045 
   1046   OS << "Slices of alloca: " << AI << "\n";
   1047   for (const_iterator I = begin(), E = end(); I != E; ++I)
   1048     print(OS, I);
   1049 }
   1050 
   1051 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
   1052   print(dbgs(), I);
   1053 }
   1054 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
   1055 
   1056 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   1057 
   1058 /// Walk the range of a partitioning looking for a common type to cover this
   1059 /// sequence of slices.
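///
/// For example (hypothetical): if every slice covering the whole partition
/// loads or stores "float", float is the common type; if one such slice uses
/// float and another uses i32, there is no common type and the widest
/// byte-width integer type seen (i32 here) is returned instead, or null if
/// none was.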
   1060 static Type *findCommonType(AllocaSlices::const_iterator B,
   1061                             AllocaSlices::const_iterator E,
   1062                             uint64_t EndOffset) {
   1063   Type *Ty = nullptr;
   1064   bool TyIsCommon = true;
   1065   IntegerType *ITy = nullptr;
   1066 
   1067   // Note that we need to look at *every* alloca slice's Use to ensure we
   1068   // always get consistent results regardless of the order of slices.
   1069   for (AllocaSlices::const_iterator I = B; I != E; ++I) {
   1070     Use *U = I->getUse();
   1071     if (isa<IntrinsicInst>(*U->getUser()))
   1072       continue;
   1073     if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
   1074       continue;
   1075 
   1076     Type *UserTy = nullptr;
   1077     if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
   1078       UserTy = LI->getType();
   1079     } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
   1080       UserTy = SI->getValueOperand()->getType();
   1081     }
   1082 
   1083     if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
   1084       // If the type is larger than the partition, skip it. We only encounter
   1085       // this for split integer operations where we want to use the type of the
   1086       // entity causing the split. Also skip if the type is not a byte width
   1087       // multiple.
   1088       if (UserITy->getBitWidth() % 8 != 0 ||
   1089           UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
   1090         continue;
   1091 
   1092       // Track the largest bitwidth integer type used in this way in case there
   1093       // is no common type.
   1094       if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
   1095         ITy = UserITy;
   1096     }
   1097 
   1098     // To avoid depending on the order of slices, Ty and TyIsCommon must not
   1099     // depend on types skipped above.
   1100     if (!UserTy || (Ty && Ty != UserTy))
   1101       TyIsCommon = false; // Give up on anything but an iN type.
   1102     else
   1103       Ty = UserTy;
   1104   }
   1105 
   1106   return TyIsCommon ? Ty : ITy;
   1107 }
   1108 
   1109 /// PHI instructions that use an alloca and are subsequently loaded can be
   1110 /// rewritten to load both input pointers in the pred blocks and then PHI the
   1111 /// results, allowing the load of the alloca to be promoted.
   1112 /// From this:
   1113 ///   %P2 = phi [i32* %Alloca, i32* %Other]
   1114 ///   %V = load i32* %P2
   1115 /// to:
   1116 ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
   1117 ///   ...
   1118 ///   %V2 = load i32* %Other
   1119 ///   ...
   1120 ///   %V = phi [i32 %V1, i32 %V2]
   1121 ///
   1122 /// We can do this to a select if its only uses are loads and if the operands
   1123 /// to the select can be loaded unconditionally.
   1124 ///
   1125 /// FIXME: This should be hoisted into a generic utility, likely in
   1126 /// Transforms/Util/Local.h
   1127 static bool isSafePHIToSpeculate(PHINode &PN) {
   1128   // For now, we can only do this promotion if the load is in the same block
   1129   // as the PHI, and if there are no stores between the phi and load.
   1130   // TODO: Allow recursive phi users.
   1131   // TODO: Allow stores.
   1132   BasicBlock *BB = PN.getParent();
   1133   unsigned MaxAlign = 0;
   1134   bool HaveLoad = false;
   1135   for (User *U : PN.users()) {
   1136     LoadInst *LI = dyn_cast<LoadInst>(U);
   1137     if (!LI || !LI->isSimple())
   1138       return false;
   1139 
   1140     // For now we only allow loads in the same block as the PHI.  This is
   1141     // a common case that happens when instcombine merges two loads through
   1142     // a PHI.
   1143     if (LI->getParent() != BB)
   1144       return false;
   1145 
   1146     // Ensure that there are no instructions between the PHI and the load that
   1147     // could store.
   1148     for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
   1149       if (BBI->mayWriteToMemory())
   1150         return false;
   1151 
   1152     MaxAlign = std::max(MaxAlign, LI->getAlignment());
   1153     HaveLoad = true;
   1154   }
   1155 
   1156   if (!HaveLoad)
   1157     return false;
   1158 
   1159   const DataLayout &DL = PN.getModule()->getDataLayout();
   1160 
   1161   // We can only transform this if it is safe to push the loads into the
   1162   // predecessor blocks. The only thing to watch out for is that we can't put
   1163   // a possibly trapping load in the predecessor if it is a critical edge.
   1164   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
   1165     TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
   1166     Value *InVal = PN.getIncomingValue(Idx);
   1167 
   1168     // If the value is produced by the terminator of the predecessor (an
   1169     // invoke) or it has side-effects, there is no valid place to put a load
   1170     // in the predecessor.
   1171     if (TI == InVal || TI->mayHaveSideEffects())
   1172       return false;
   1173 
   1174     // If the predecessor has a single successor, then the edge isn't
   1175     // critical.
   1176     if (TI->getNumSuccessors() == 1)
   1177       continue;
   1178 
   1179     // If this pointer is always safe to load, or if we can prove that there
   1180     // is already a load in the block, then we can move the load to the pred
   1181     // block.
   1182     if (isSafeToLoadUnconditionally(InVal, MaxAlign, DL, TI))
   1183       continue;
   1184 
   1185     return false;
   1186   }
   1187 
   1188   return true;
   1189 }
   1190 
   1191 static void speculatePHINodeLoads(PHINode &PN) {
   1192   DEBUG(dbgs() << "    original: " << PN << "\n");
   1193 
   1194   Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
   1195   IRBuilderTy PHIBuilder(&PN);
   1196   PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
   1197                                         PN.getName() + ".sroa.speculated");
   1198 
   1199   // Get the AA tags and alignment to use from one of the loads.  It doesn't
   1200   // matter which one we get or whether any of them differ.
   1201   LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
   1202 
   1203   AAMDNodes AATags;
   1204   SomeLoad->getAAMetadata(AATags);
   1205   unsigned Align = SomeLoad->getAlignment();
   1206 
   1207   // Rewrite all loads of the PN to use the new PHI.
   1208   while (!PN.use_empty()) {
   1209     LoadInst *LI = cast<LoadInst>(PN.user_back());
   1210     LI->replaceAllUsesWith(NewPN);
   1211     LI->eraseFromParent();
   1212   }
   1213 
   1214   // Inject loads into all of the pred blocks.
   1215   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
   1216     BasicBlock *Pred = PN.getIncomingBlock(Idx);
   1217     TerminatorInst *TI = Pred->getTerminator();
   1218     Value *InVal = PN.getIncomingValue(Idx);
   1219     IRBuilderTy PredBuilder(TI);
   1220 
   1221     LoadInst *Load = PredBuilder.CreateLoad(
   1222         InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
   1223     ++NumLoadsSpeculated;
   1224     Load->setAlignment(Align);
   1225     if (AATags)
   1226       Load->setAAMetadata(AATags);
   1227     NewPN->addIncoming(Load, Pred);
   1228   }
   1229 
   1230   DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
   1231   PN.eraseFromParent();
   1232 }
   1233 
   1234 /// Select instructions that use an alloca and are subsequently loaded can be
    1235 /// rewritten to load both input pointers and then select between the results,
   1236 /// allowing the load of the alloca to be promoted.
   1237 /// From this:
   1238 ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
   1239 ///   %V = load i32* %P2
   1240 /// to:
   1241 ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
   1242 ///   %V2 = load i32* %Other
   1243 ///   %V = select i1 %cond, i32 %V1, i32 %V2
   1244 ///
   1245 /// We can do this to a select if its only uses are loads and if the operand
   1246 /// to the select can be loaded unconditionally.
   1247 static bool isSafeSelectToSpeculate(SelectInst &SI) {
   1248   Value *TValue = SI.getTrueValue();
   1249   Value *FValue = SI.getFalseValue();
   1250   const DataLayout &DL = SI.getModule()->getDataLayout();
   1251 
   1252   for (User *U : SI.users()) {
   1253     LoadInst *LI = dyn_cast<LoadInst>(U);
   1254     if (!LI || !LI->isSimple())
   1255       return false;
   1256 
    1257     // Both operands of the select need to be dereferenceable, either
    1258     // absolutely (e.g. allocas) or at this point because we can see other
    1259     // accesses to them.
   1260     if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI))
   1261       return false;
   1262     if (!isSafeToLoadUnconditionally(FValue, LI->getAlignment(), DL, LI))
   1263       return false;
   1264   }
   1265 
   1266   return true;
   1267 }
   1268 
   1269 static void speculateSelectInstLoads(SelectInst &SI) {
   1270   DEBUG(dbgs() << "    original: " << SI << "\n");
   1271 
   1272   IRBuilderTy IRB(&SI);
   1273   Value *TV = SI.getTrueValue();
   1274   Value *FV = SI.getFalseValue();
   1275   // Replace the loads of the select with a select of two loads.
   1276   while (!SI.use_empty()) {
   1277     LoadInst *LI = cast<LoadInst>(SI.user_back());
   1278     assert(LI->isSimple() && "We only speculate simple loads");
   1279 
   1280     IRB.SetInsertPoint(LI);
   1281     LoadInst *TL =
   1282         IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
   1283     LoadInst *FL =
   1284         IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
   1285     NumLoadsSpeculated += 2;
   1286 
   1287     // Transfer alignment and AA info if present.
   1288     TL->setAlignment(LI->getAlignment());
   1289     FL->setAlignment(LI->getAlignment());
   1290 
   1291     AAMDNodes Tags;
   1292     LI->getAAMetadata(Tags);
   1293     if (Tags) {
   1294       TL->setAAMetadata(Tags);
   1295       FL->setAAMetadata(Tags);
   1296     }
   1297 
   1298     Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
   1299                                 LI->getName() + ".sroa.speculated");
   1300 
   1301     DEBUG(dbgs() << "          speculated to: " << *V << "\n");
   1302     LI->replaceAllUsesWith(V);
   1303     LI->eraseFromParent();
   1304   }
   1305   SI.eraseFromParent();
   1306 }
   1307 
   1308 /// \brief Build a GEP out of a base pointer and indices.
   1309 ///
   1310 /// This will return the BasePtr if that is valid, or build a new GEP
   1311 /// instruction using the IRBuilder if GEP-ing is needed.
   1312 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
   1313                        SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
   1314   if (Indices.empty())
   1315     return BasePtr;
   1316 
   1317   // A single zero index is a no-op, so check for this and avoid building a GEP
   1318   // in that case.
   1319   if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
   1320     return BasePtr;
   1321 
   1322   return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices,
   1323                                NamePrefix + "sroa_idx");
   1324 }
   1325 
   1326 /// \brief Get a natural GEP off of the BasePtr walking through Ty toward
   1327 /// TargetTy without changing the offset of the pointer.
   1328 ///
   1329 /// This routine assumes we've already established a properly offset GEP with
   1330 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with
    1331 /// zero indices down through type layers until we find a type that matches
   1332 /// TargetTy. If we can't find one with the same type, we at least try to use
   1333 /// one with the same size. If none of that works, we just produce the GEP as
   1334 /// indicated by Indices to have the correct offset.
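         /// Purely illustrative example (hypothetical types, not from the source):
         /// with Ty = { [4 x float] } and TargetTy = float, this appends a zero
         /// struct-field index and a zero array index, yielding roughly:
         ///   %gep = getelementptr inbounds { [4 x float] }* %base, i64 0, i32 0, i64 0
         /// where the leading i64 0 comes from the Indices the caller already built.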
   1335 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
   1336                                     Value *BasePtr, Type *Ty, Type *TargetTy,
   1337                                     SmallVectorImpl<Value *> &Indices,
   1338                                     Twine NamePrefix) {
   1339   if (Ty == TargetTy)
   1340     return buildGEP(IRB, BasePtr, Indices, NamePrefix);
   1341 
   1342   // Pointer size to use for the indices.
   1343   unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());
   1344 
   1345   // See if we can descend into a struct and locate a field with the correct
   1346   // type.
   1347   unsigned NumLayers = 0;
   1348   Type *ElementTy = Ty;
   1349   do {
   1350     if (ElementTy->isPointerTy())
   1351       break;
   1352 
   1353     if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
   1354       ElementTy = ArrayTy->getElementType();
   1355       Indices.push_back(IRB.getIntN(PtrSize, 0));
   1356     } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
   1357       ElementTy = VectorTy->getElementType();
   1358       Indices.push_back(IRB.getInt32(0));
   1359     } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
   1360       if (STy->element_begin() == STy->element_end())
   1361         break; // Nothing left to descend into.
   1362       ElementTy = *STy->element_begin();
   1363       Indices.push_back(IRB.getInt32(0));
   1364     } else {
   1365       break;
   1366     }
   1367     ++NumLayers;
   1368   } while (ElementTy != TargetTy);
   1369   if (ElementTy != TargetTy)
   1370     Indices.erase(Indices.end() - NumLayers, Indices.end());
   1371 
   1372   return buildGEP(IRB, BasePtr, Indices, NamePrefix);
   1373 }
   1374 
   1375 /// \brief Recursively compute indices for a natural GEP.
   1376 ///
   1377 /// This is the recursive step for getNaturalGEPWithOffset that walks down the
   1378 /// element types adding appropriate indices for the GEP.
   1379 static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
   1380                                        Value *Ptr, Type *Ty, APInt &Offset,
   1381                                        Type *TargetTy,
   1382                                        SmallVectorImpl<Value *> &Indices,
   1383                                        Twine NamePrefix) {
   1384   if (Offset == 0)
   1385     return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
   1386                                  NamePrefix);
   1387 
   1388   // We can't recurse through pointer types.
   1389   if (Ty->isPointerTy())
   1390     return nullptr;
   1391 
   1392   // We try to analyze GEPs over vectors here, but note that these GEPs are
   1393   // extremely poorly defined currently. The long-term goal is to remove GEPing
   1394   // over a vector from the IR completely.
   1395   if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
   1396     unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
   1397     if (ElementSizeInBits % 8 != 0) {
    1398       // GEPs over vector elements whose size isn't a byte multiple are invalid.
   1399       return nullptr;
   1400     }
   1401     APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
   1402     APInt NumSkippedElements = Offset.sdiv(ElementSize);
   1403     if (NumSkippedElements.ugt(VecTy->getNumElements()))
   1404       return nullptr;
   1405     Offset -= NumSkippedElements * ElementSize;
   1406     Indices.push_back(IRB.getInt(NumSkippedElements));
   1407     return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
   1408                                     Offset, TargetTy, Indices, NamePrefix);
   1409   }
   1410 
   1411   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
   1412     Type *ElementTy = ArrTy->getElementType();
   1413     APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
   1414     APInt NumSkippedElements = Offset.sdiv(ElementSize);
   1415     if (NumSkippedElements.ugt(ArrTy->getNumElements()))
   1416       return nullptr;
   1417 
   1418     Offset -= NumSkippedElements * ElementSize;
   1419     Indices.push_back(IRB.getInt(NumSkippedElements));
   1420     return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
   1421                                     Indices, NamePrefix);
   1422   }
   1423 
   1424   StructType *STy = dyn_cast<StructType>(Ty);
   1425   if (!STy)
   1426     return nullptr;
   1427 
   1428   const StructLayout *SL = DL.getStructLayout(STy);
   1429   uint64_t StructOffset = Offset.getZExtValue();
   1430   if (StructOffset >= SL->getSizeInBytes())
   1431     return nullptr;
   1432   unsigned Index = SL->getElementContainingOffset(StructOffset);
   1433   Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
   1434   Type *ElementTy = STy->getElementType(Index);
   1435   if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
   1436     return nullptr; // The offset points into alignment padding.
   1437 
   1438   Indices.push_back(IRB.getInt32(Index));
   1439   return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
   1440                                   Indices, NamePrefix);
   1441 }
   1442 
   1443 /// \brief Get a natural GEP from a base pointer to a particular offset and
   1444 /// resulting in a particular type.
   1445 ///
   1446 /// The goal is to produce a "natural" looking GEP that works with the existing
   1447 /// composite types to arrive at the appropriate offset and element type for
   1448 /// a pointer. TargetTy is the element type the returned GEP should point-to if
   1449 /// possible. We recurse by decreasing Offset, adding the appropriate index to
   1450 /// Indices, and setting Ty to the result subtype.
   1451 ///
   1452 /// If no natural GEP can be constructed, this function returns null.
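         /// Worked example (hypothetical values): for a Ptr of type [8 x i32]*, an
         /// Offset of 20 bytes, and TargetTy == i32, the element size is 4 bytes, so
         /// the computed indices are (i64 0, i64 5), i.e. roughly:
         ///   %gep = getelementptr inbounds [8 x i32]* %ptr, i64 0, i64 5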
   1453 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
   1454                                       Value *Ptr, APInt Offset, Type *TargetTy,
   1455                                       SmallVectorImpl<Value *> &Indices,
   1456                                       Twine NamePrefix) {
   1457   PointerType *Ty = cast<PointerType>(Ptr->getType());
   1458 
    1459   // A GEP through an i8* toward an i8 target has no type structure to exploit,
    1460   // so don't treat it as natural; the raw byte-offset fallback handles it.
   1461   if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
   1462     return nullptr;
   1463 
   1464   Type *ElementTy = Ty->getElementType();
   1465   if (!ElementTy->isSized())
   1466     return nullptr; // We can't GEP through an unsized element.
   1467   APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
   1468   if (ElementSize == 0)
   1469     return nullptr; // Zero-length arrays can't help us build a natural GEP.
   1470   APInt NumSkippedElements = Offset.sdiv(ElementSize);
   1471 
   1472   Offset -= NumSkippedElements * ElementSize;
   1473   Indices.push_back(IRB.getInt(NumSkippedElements));
   1474   return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
   1475                                   Indices, NamePrefix);
   1476 }
   1477 
   1478 /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
   1479 /// resulting pointer has PointerTy.
   1480 ///
   1481 /// This tries very hard to compute a "natural" GEP which arrives at the offset
   1482 /// and produces the pointer type desired. Where it cannot, it will try to use
   1483 /// the natural GEP to arrive at the offset and bitcast to the type. Where that
   1484 /// fails, it will try to use an existing i8* and GEP to the byte offset and
   1485 /// bitcast to the type.
   1486 ///
   1487 /// The strategy for finding the more natural GEPs is to peel off layers of the
   1488 /// pointer, walking back through bit casts and GEPs, searching for a base
   1489 /// pointer from which we can compute a natural GEP with the desired
   1490 /// properties. The algorithm tries to fold as many constant indices into
   1491 /// a single GEP as possible, thus making each GEP more independent of the
   1492 /// surrounding code.
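         /// When every natural-GEP attempt fails, the raw fallback sketched below is
         /// emitted instead (hypothetical IR; the real names carry NamePrefix):
         ///   %raw = bitcast <src type>* %ptr to i8*                 ; ...sroa_raw_cast
         ///   %off = getelementptr inbounds i8* %raw, i64 <offset>   ; ...sroa_raw_idx
         ///   %res = bitcast i8* %off to <desired type>*             ; ...sroa_cast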
   1493 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
   1494                              APInt Offset, Type *PointerTy, Twine NamePrefix) {
   1495   // Even though we don't look through PHI nodes, we could be called on an
   1496   // instruction in an unreachable block, which may be on a cycle.
   1497   SmallPtrSet<Value *, 4> Visited;
   1498   Visited.insert(Ptr);
   1499   SmallVector<Value *, 4> Indices;
   1500 
    1501   // We may end up computing an offset pointer that has the wrong type. If we
    1502   // are never able to compute one directly with the correct type, we'll fall
    1503   // back to it, so keep it (and the base pointer it was computed from) around.
   1504   Value *OffsetPtr = nullptr;
   1505   Value *OffsetBasePtr;
   1506 
   1507   // Remember any i8 pointer we come across to re-use if we need to do a raw
   1508   // byte offset.
   1509   Value *Int8Ptr = nullptr;
   1510   APInt Int8PtrOffset(Offset.getBitWidth(), 0);
   1511 
   1512   Type *TargetTy = PointerTy->getPointerElementType();
   1513 
   1514   do {
   1515     // First fold any existing GEPs into the offset.
   1516     while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
   1517       APInt GEPOffset(Offset.getBitWidth(), 0);
   1518       if (!GEP->accumulateConstantOffset(DL, GEPOffset))
   1519         break;
   1520       Offset += GEPOffset;
   1521       Ptr = GEP->getPointerOperand();
   1522       if (!Visited.insert(Ptr).second)
   1523         break;
   1524     }
   1525 
   1526     // See if we can perform a natural GEP here.
   1527     Indices.clear();
   1528     if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
   1529                                            Indices, NamePrefix)) {
   1530       // If we have a new natural pointer at the offset, clear out any old
   1531       // offset pointer we computed. Unless it is the base pointer or
   1532       // a non-instruction, we built a GEP we don't need. Zap it.
   1533       if (OffsetPtr && OffsetPtr != OffsetBasePtr)
   1534         if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
    1535           assert(I->use_empty() && "Built a GEP with uses somehow!");
   1536           I->eraseFromParent();
   1537         }
   1538       OffsetPtr = P;
   1539       OffsetBasePtr = Ptr;
   1540       // If we also found a pointer of the right type, we're done.
   1541       if (P->getType() == PointerTy)
   1542         return P;
   1543     }
   1544 
   1545     // Stash this pointer if we've found an i8*.
   1546     if (Ptr->getType()->isIntegerTy(8)) {
   1547       Int8Ptr = Ptr;
   1548       Int8PtrOffset = Offset;
   1549     }
   1550 
   1551     // Peel off a layer of the pointer and update the offset appropriately.
   1552     if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
   1553       Ptr = cast<Operator>(Ptr)->getOperand(0);
   1554     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
   1555       if (GA->isInterposable())
   1556         break;
   1557       Ptr = GA->getAliasee();
   1558     } else {
   1559       break;
   1560     }
   1561     assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
   1562   } while (Visited.insert(Ptr).second);
   1563 
   1564   if (!OffsetPtr) {
   1565     if (!Int8Ptr) {
   1566       Int8Ptr = IRB.CreateBitCast(
   1567           Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
   1568           NamePrefix + "sroa_raw_cast");
   1569       Int8PtrOffset = Offset;
   1570     }
   1571 
   1572     OffsetPtr = Int8PtrOffset == 0
   1573                     ? Int8Ptr
   1574                     : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
   1575                                             IRB.getInt(Int8PtrOffset),
   1576                                             NamePrefix + "sroa_raw_idx");
   1577   }
   1578   Ptr = OffsetPtr;
   1579 
   1580   // On the off chance we were targeting i8*, guard the bitcast here.
   1581   if (Ptr->getType() != PointerTy)
   1582     Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");
   1583 
   1584   return Ptr;
   1585 }
   1586 
   1587 /// \brief Compute the adjusted alignment for a load or store from an offset.
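         /// For example (hypothetical numbers), an i32 load with alignment 4 that sits
         /// at byte offset 6 within its slice yields MinAlign(4, 6) == 2.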
   1588 static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
   1589                                      const DataLayout &DL) {
   1590   unsigned Alignment;
   1591   Type *Ty;
   1592   if (auto *LI = dyn_cast<LoadInst>(I)) {
   1593     Alignment = LI->getAlignment();
   1594     Ty = LI->getType();
   1595   } else if (auto *SI = dyn_cast<StoreInst>(I)) {
   1596     Alignment = SI->getAlignment();
   1597     Ty = SI->getValueOperand()->getType();
   1598   } else {
   1599     llvm_unreachable("Only loads and stores are allowed!");
   1600   }
   1601 
   1602   if (!Alignment)
   1603     Alignment = DL.getABITypeAlignment(Ty);
   1604 
   1605   return MinAlign(Alignment, Offset);
   1606 }
   1607 
   1608 /// \brief Test whether we can convert a value from the old to the new type.
   1609 ///
   1610 /// This predicate should be used to guard calls to convertValue in order to
   1611 /// ensure that we only try to convert viable values. The strategy is that we
   1612 /// will peel off single element struct and array wrappings to get to an
   1613 /// underlying value, and convert that value.
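         /// Illustrative examples (assuming 64-bit pointers): i64 <-> double and
         /// i8* <-> i64 are considered convertible; i32 -> i64 is not (the integer
         /// widths differ), nor are pointers in different address spaces.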
   1614 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
   1615   if (OldTy == NewTy)
   1616     return true;
   1617 
    1618   // For integer types, we can't handle any bit-width differences. Allowing
    1619   // them would break vector conversions that require extension and would
    1620   // introduce endianness issues when combined with loads and stores.
   1621   if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
   1622     assert(cast<IntegerType>(OldTy)->getBitWidth() !=
   1623                cast<IntegerType>(NewTy)->getBitWidth() &&
   1624            "We can't have the same bitwidth for different int types");
   1625     return false;
   1626   }
   1627 
   1628   if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
   1629     return false;
   1630   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
   1631     return false;
   1632 
   1633   // We can convert pointers to integers and vice-versa. Same for vectors
   1634   // of pointers and integers.
   1635   OldTy = OldTy->getScalarType();
   1636   NewTy = NewTy->getScalarType();
   1637   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
   1638     if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
   1639       return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
   1640         cast<PointerType>(OldTy)->getPointerAddressSpace();
   1641     }
   1642     if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
   1643       return true;
   1644     return false;
   1645   }
   1646 
   1647   return true;
   1648 }
   1649 
   1650 /// \brief Generic routine to convert an SSA value to a value of a different
   1651 /// type.
   1652 ///
   1653 /// This will try various different casting techniques, such as bitcasts,
   1654 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
   1655 /// two types for viability with this routine.
   1656 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
   1657                            Type *NewTy) {
   1658   Type *OldTy = V->getType();
   1659   assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
   1660 
   1661   if (OldTy == NewTy)
   1662     return V;
   1663 
   1664   assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
   1665          "Integer types must be the exact same to convert.");
   1666 
   1667   // See if we need inttoptr for this type pair. A cast involving both scalars
    1668   // and vectors requires an additional bitcast.
   1669   if (OldTy->getScalarType()->isIntegerTy() &&
   1670       NewTy->getScalarType()->isPointerTy()) {
   1671     // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
   1672     if (OldTy->isVectorTy() && !NewTy->isVectorTy())
   1673       return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
   1674                                 NewTy);
   1675 
   1676     // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
   1677     if (!OldTy->isVectorTy() && NewTy->isVectorTy())
   1678       return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
   1679                                 NewTy);
   1680 
   1681     return IRB.CreateIntToPtr(V, NewTy);
   1682   }
   1683 
   1684   // See if we need ptrtoint for this type pair. A cast involving both scalars
    1685   // and vectors requires an additional bitcast.
   1686   if (OldTy->getScalarType()->isPointerTy() &&
   1687       NewTy->getScalarType()->isIntegerTy()) {
   1688     // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
   1689     if (OldTy->isVectorTy() && !NewTy->isVectorTy())
   1690       return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
   1691                                NewTy);
   1692 
   1693     // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
   1694     if (!OldTy->isVectorTy() && NewTy->isVectorTy())
   1695       return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
   1696                                NewTy);
   1697 
   1698     return IRB.CreatePtrToInt(V, NewTy);
   1699   }
   1700 
   1701   return IRB.CreateBitCast(V, NewTy);
   1702 }
   1703 
   1704 /// \brief Test whether the given slice use can be promoted to a vector.
   1705 ///
   1706 /// This function is called to test each entry in a partition which is slated
   1707 /// for a single slice.
   1708 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
   1709                                             VectorType *Ty,
   1710                                             uint64_t ElementSize,
   1711                                             const DataLayout &DL) {
   1712   // First validate the slice offsets.
   1713   uint64_t BeginOffset =
   1714       std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
   1715   uint64_t BeginIndex = BeginOffset / ElementSize;
   1716   if (BeginIndex * ElementSize != BeginOffset ||
   1717       BeginIndex >= Ty->getNumElements())
   1718     return false;
   1719   uint64_t EndOffset =
   1720       std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
   1721   uint64_t EndIndex = EndOffset / ElementSize;
   1722   if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
   1723     return false;
   1724 
   1725   assert(EndIndex > BeginIndex && "Empty vector!");
   1726   uint64_t NumElements = EndIndex - BeginIndex;
   1727   Type *SliceTy = (NumElements == 1)
   1728                       ? Ty->getElementType()
   1729                       : VectorType::get(Ty->getElementType(), NumElements);
   1730 
   1731   Type *SplitIntTy =
   1732       Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
   1733 
   1734   Use *U = S.getUse();
   1735 
   1736   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
   1737     if (MI->isVolatile())
   1738       return false;
   1739     if (!S.isSplittable())
   1740       return false; // Skip any unsplittable intrinsics.
   1741   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
   1742     if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
   1743         II->getIntrinsicID() != Intrinsic::lifetime_end)
   1744       return false;
   1745   } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
   1746     // Disable vector promotion when there are loads or stores of an FCA.
   1747     return false;
   1748   } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
   1749     if (LI->isVolatile())
   1750       return false;
   1751     Type *LTy = LI->getType();
   1752     if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
   1753       assert(LTy->isIntegerTy());
   1754       LTy = SplitIntTy;
   1755     }
   1756     if (!canConvertValue(DL, SliceTy, LTy))
   1757       return false;
   1758   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
   1759     if (SI->isVolatile())
   1760       return false;
   1761     Type *STy = SI->getValueOperand()->getType();
   1762     if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
   1763       assert(STy->isIntegerTy());
   1764       STy = SplitIntTy;
   1765     }
   1766     if (!canConvertValue(DL, STy, SliceTy))
   1767       return false;
   1768   } else {
   1769     return false;
   1770   }
   1771 
   1772   return true;
   1773 }
   1774 
   1775 /// \brief Test whether the given alloca partitioning and range of slices can be
   1776 /// promoted to a vector.
   1777 ///
   1778 /// This is a quick test to check whether we can rewrite a particular alloca
   1779 /// partition (and its newly formed alloca) into a vector alloca with only
   1780 /// whole-vector loads and stores such that it could be promoted to a vector
   1781 /// SSA value. We only can ensure this for a limited set of operations, and we
   1782 /// don't want to do the rewrites unless we are confident that the result will
   1783 /// be promotable, so we have an early test here.
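         /// Rough illustration (hypothetical partition): a 16-byte partition covered
         /// by a whole-partition load of <4 x i32> plus an i32 store to bytes [4,8)
         /// is viable with candidate type <4 x i32>; the store maps exactly onto
         /// element index 1.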
   1784 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
   1785   // Collect the candidate types for vector-based promotion. Also track whether
   1786   // we have different element types.
   1787   SmallVector<VectorType *, 4> CandidateTys;
   1788   Type *CommonEltTy = nullptr;
   1789   bool HaveCommonEltTy = true;
   1790   auto CheckCandidateType = [&](Type *Ty) {
   1791     if (auto *VTy = dyn_cast<VectorType>(Ty)) {
   1792       CandidateTys.push_back(VTy);
   1793       if (!CommonEltTy)
   1794         CommonEltTy = VTy->getElementType();
   1795       else if (CommonEltTy != VTy->getElementType())
   1796         HaveCommonEltTy = false;
   1797     }
   1798   };
    1799   // Consider any loads or stores whose slice exactly covers the partition.
   1800   for (const Slice &S : P)
   1801     if (S.beginOffset() == P.beginOffset() &&
   1802         S.endOffset() == P.endOffset()) {
   1803       if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
   1804         CheckCandidateType(LI->getType());
   1805       else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
   1806         CheckCandidateType(SI->getValueOperand()->getType());
   1807     }
   1808 
   1809   // If we didn't find a vector type, nothing to do here.
   1810   if (CandidateTys.empty())
   1811     return nullptr;
   1812 
    1813   // Remove non-integer vector types if the candidates had differing element types.
   1814   // FIXME: It'd be nice to replace them with integer vector types, but we can't
   1815   // do that until all the backends are known to produce good code for all
   1816   // integer vector types.
   1817   if (!HaveCommonEltTy) {
   1818     CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
   1819                                       [](VectorType *VTy) {
   1820                          return !VTy->getElementType()->isIntegerTy();
   1821                        }),
   1822                        CandidateTys.end());
   1823 
   1824     // If there were no integer vector types, give up.
   1825     if (CandidateTys.empty())
   1826       return nullptr;
   1827 
   1828     // Rank the remaining candidate vector types. This is easy because we know
   1829     // they're all integer vectors. We sort by ascending number of elements.
   1830     auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
   1831       assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
   1832              "Cannot have vector types of different sizes!");
   1833       assert(RHSTy->getElementType()->isIntegerTy() &&
   1834              "All non-integer types eliminated!");
   1835       assert(LHSTy->getElementType()->isIntegerTy() &&
   1836              "All non-integer types eliminated!");
   1837       return RHSTy->getNumElements() < LHSTy->getNumElements();
   1838     };
   1839     std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
   1840     CandidateTys.erase(
   1841         std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
   1842         CandidateTys.end());
   1843   } else {
   1844 // The only way to have the same element type in every vector type is to
   1845 // have the same vector type. Check that and remove all but one.
   1846 #ifndef NDEBUG
   1847     for (VectorType *VTy : CandidateTys) {
   1848       assert(VTy->getElementType() == CommonEltTy &&
   1849              "Unaccounted for element type!");
   1850       assert(VTy == CandidateTys[0] &&
   1851              "Different vector types with the same element type!");
   1852     }
   1853 #endif
   1854     CandidateTys.resize(1);
   1855   }
   1856 
   1857   // Try each vector type, and return the one which works.
   1858   auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
   1859     uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
   1860 
    1861     // While LLVM vectors are defined to be bit-packed, we don't support element
    1862     // sizes that aren't byte-sized.
   1863     if (ElementSize % 8)
   1864       return false;
   1865     assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
   1866            "vector size not a multiple of element size?");
   1867     ElementSize /= 8;
   1868 
   1869     for (const Slice &S : P)
   1870       if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
   1871         return false;
   1872 
   1873     for (const Slice *S : P.splitSliceTails())
   1874       if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
   1875         return false;
   1876 
   1877     return true;
   1878   };
   1879   for (VectorType *VTy : CandidateTys)
   1880     if (CheckVectorTypeForPromotion(VTy))
   1881       return VTy;
   1882 
   1883   return nullptr;
   1884 }
   1885 
   1886 /// \brief Test whether a slice of an alloca is valid for integer widening.
   1887 ///
   1888 /// This implements the necessary checking for the \c isIntegerWideningViable
   1889 /// test below on a single slice of the alloca.
   1890 static bool isIntegerWideningViableForSlice(const Slice &S,
   1891                                             uint64_t AllocBeginOffset,
   1892                                             Type *AllocaTy,
   1893                                             const DataLayout &DL,
   1894                                             bool &WholeAllocaOp) {
   1895   uint64_t Size = DL.getTypeStoreSize(AllocaTy);
   1896 
   1897   uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
   1898   uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
   1899 
   1900   // We can't reasonably handle cases where the load or store extends past
   1901   // the end of the alloca's type and into its padding.
   1902   if (RelEnd > Size)
   1903     return false;
   1904 
   1905   Use *U = S.getUse();
   1906 
   1907   if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
   1908     if (LI->isVolatile())
   1909       return false;
   1910     // We can't handle loads that extend past the allocated memory.
   1911     if (DL.getTypeStoreSize(LI->getType()) > Size)
   1912       return false;
   1913     // Note that we don't count vector loads or stores as whole-alloca
   1914     // operations which enable integer widening because we would prefer to use
   1915     // vector widening instead.
   1916     if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
   1917       WholeAllocaOp = true;
   1918     if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
   1919       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
   1920         return false;
   1921     } else if (RelBegin != 0 || RelEnd != Size ||
   1922                !canConvertValue(DL, AllocaTy, LI->getType())) {
   1923       // Non-integer loads need to be convertible from the alloca type so that
   1924       // they are promotable.
   1925       return false;
   1926     }
   1927   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
   1928     Type *ValueTy = SI->getValueOperand()->getType();
   1929     if (SI->isVolatile())
   1930       return false;
   1931     // We can't handle stores that extend past the allocated memory.
   1932     if (DL.getTypeStoreSize(ValueTy) > Size)
   1933       return false;
   1934     // Note that we don't count vector loads or stores as whole-alloca
   1935     // operations which enable integer widening because we would prefer to use
   1936     // vector widening instead.
   1937     if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
   1938       WholeAllocaOp = true;
   1939     if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
   1940       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
   1941         return false;
   1942     } else if (RelBegin != 0 || RelEnd != Size ||
   1943                !canConvertValue(DL, ValueTy, AllocaTy)) {
   1944       // Non-integer stores need to be convertible to the alloca type so that
   1945       // they are promotable.
   1946       return false;
   1947     }
   1948   } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
   1949     if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
   1950       return false;
   1951     if (!S.isSplittable())
   1952       return false; // Skip any unsplittable intrinsics.
   1953   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
   1954     if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
   1955         II->getIntrinsicID() != Intrinsic::lifetime_end)
   1956       return false;
   1957   } else {
   1958     return false;
   1959   }
   1960 
   1961   return true;
   1962 }
   1963 
   1964 /// \brief Test whether the given alloca partition's integer operations can be
   1965 /// widened to promotable ones.
   1966 ///
   1967 /// This is a quick test to check whether we can rewrite the integer loads and
   1968 /// stores to a particular alloca into wider loads and stores and be able to
   1969 /// promote the resulting alloca.
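         /// Rough illustration (hypothetical alloca): an i64 alloca read by one whole
         /// i64 load and written by two i32 stores to bytes [0,4) and [4,8) passes
         /// this test; the narrow stores can later be rewritten as masked insertions
         /// into the widened i64 value.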
   1970 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
   1971                                     const DataLayout &DL) {
   1972   uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
   1973   // Don't create integer types larger than the maximum bitwidth.
   1974   if (SizeInBits > IntegerType::MAX_INT_BITS)
   1975     return false;
   1976 
   1977   // Don't try to handle allocas with bit-padding.
   1978   if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy))
   1979     return false;
   1980 
   1981   // We need to ensure that an integer type with the appropriate bitwidth can
   1982   // be converted to the alloca type, whatever that is. We don't want to force
   1983   // the alloca itself to have an integer type if there is a more suitable one.
   1984   Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
   1985   if (!canConvertValue(DL, AllocaTy, IntTy) ||
   1986       !canConvertValue(DL, IntTy, AllocaTy))
   1987     return false;
   1988 
   1989   // While examining uses, we ensure that the alloca has a covering load or
   1990   // store. We don't want to widen the integer operations only to fail to
   1991   // promote due to some other unsplittable entry (which we may make splittable
   1992   // later). However, if there are only splittable uses, go ahead and assume
   1993   // that we cover the alloca.
   1994   // FIXME: We shouldn't consider split slices that happen to start in the
   1995   // partition here...
   1996   bool WholeAllocaOp =
   1997       P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
   1998 
   1999   for (const Slice &S : P)
   2000     if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
   2001                                          WholeAllocaOp))
   2002       return false;
   2003 
   2004   for (const Slice *S : P.splitSliceTails())
   2005     if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
   2006                                          WholeAllocaOp))
   2007       return false;
   2008 
   2009   return WholeAllocaOp;
   2010 }
   2011 
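         // Illustrative sketch (hypothetical operands): extracting an i16 at byte
         // offset 2 from an i64 %val lowers to roughly
         //   %shift = lshr i64 %val, 16     ; shift amount would be 32 on big-endian
         //   %trunc = trunc i64 %shift to i16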
   2012 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
   2013                              IntegerType *Ty, uint64_t Offset,
   2014                              const Twine &Name) {
   2015   DEBUG(dbgs() << "       start: " << *V << "\n");
   2016   IntegerType *IntTy = cast<IntegerType>(V->getType());
   2017   assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
   2018          "Element extends past full value");
   2019   uint64_t ShAmt = 8 * Offset;
   2020   if (DL.isBigEndian())
   2021     ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
   2022   if (ShAmt) {
   2023     V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
   2024     DEBUG(dbgs() << "     shifted: " << *V << "\n");
   2025   }
   2026   assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
   2027          "Cannot extract to a larger integer!");
   2028   if (Ty != IntTy) {
   2029     V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
   2030     DEBUG(dbgs() << "     trunced: " << *V << "\n");
   2031   }
   2032   return V;
   2033 }
   2034 
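         // Illustrative sketch (hypothetical operands): inserting an i16 %x at byte
         // offset 2 into an i64 %old on a little-endian target lowers to roughly
         //   %ext  = zext i16 %x to i64
         //   %shl  = shl i64 %ext, 16
         //   %mask = and i64 %old, 0xFFFFFFFF0000FFFF   ; ~(0xFFFF << 16)
         //   %ins  = or i64 %mask, %shl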
   2035 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
   2036                             Value *V, uint64_t Offset, const Twine &Name) {
   2037   IntegerType *IntTy = cast<IntegerType>(Old->getType());
   2038   IntegerType *Ty = cast<IntegerType>(V->getType());
   2039   assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
   2040          "Cannot insert a larger integer!");
   2041   DEBUG(dbgs() << "       start: " << *V << "\n");
   2042   if (Ty != IntTy) {
   2043     V = IRB.CreateZExt(V, IntTy, Name + ".ext");
   2044     DEBUG(dbgs() << "    extended: " << *V << "\n");
   2045   }
   2046   assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
   2047          "Element store outside of alloca store");
   2048   uint64_t ShAmt = 8 * Offset;
   2049   if (DL.isBigEndian())
   2050     ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
   2051   if (ShAmt) {
   2052     V = IRB.CreateShl(V, ShAmt, Name + ".shift");
   2053     DEBUG(dbgs() << "     shifted: " << *V << "\n");
   2054   }
   2055 
   2056   if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
   2057     APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
   2058     Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
   2059     DEBUG(dbgs() << "      masked: " << *Old << "\n");
   2060     V = IRB.CreateOr(Old, V, Name + ".insert");
   2061     DEBUG(dbgs() << "    inserted: " << *V << "\n");
   2062   }
   2063   return V;
   2064 }
   2065 
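         // Illustrative sketch (hypothetical operands): extracting elements [1,3)
         // from a <4 x i32> %v becomes roughly
         //   %vec = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 1, i32 2>
         // while a single-element extraction uses extractelement instead.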
   2066 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
   2067                             unsigned EndIndex, const Twine &Name) {
   2068   VectorType *VecTy = cast<VectorType>(V->getType());
   2069   unsigned NumElements = EndIndex - BeginIndex;
   2070   assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
   2071 
   2072   if (NumElements == VecTy->getNumElements())
   2073     return V;
   2074 
   2075   if (NumElements == 1) {
   2076     V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
   2077                                  Name + ".extract");
   2078     DEBUG(dbgs() << "     extract: " << *V << "\n");
   2079     return V;
   2080   }
   2081 
   2082   SmallVector<Constant *, 8> Mask;
   2083   Mask.reserve(NumElements);
   2084   for (unsigned i = BeginIndex; i != EndIndex; ++i)
   2085     Mask.push_back(IRB.getInt32(i));
   2086   V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
   2087                               ConstantVector::get(Mask), Name + ".extract");
   2088   DEBUG(dbgs() << "     shuffle: " << *V << "\n");
   2089   return V;
   2090 }
   2091 
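         // Illustrative sketch (hypothetical operands): inserting a <2 x i32> %v at
         // element index 1 of a <4 x i32> %old becomes roughly
         //   %expand = shufflevector <2 x i32> %v, <2 x i32> undef,
         //                           <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
         //   %blend  = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>,
         //                    <4 x i32> %expand, <4 x i32> %old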
   2092 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
   2093                            unsigned BeginIndex, const Twine &Name) {
   2094   VectorType *VecTy = cast<VectorType>(Old->getType());
   2095   assert(VecTy && "Can only insert a vector into a vector");
   2096 
   2097   VectorType *Ty = dyn_cast<VectorType>(V->getType());
   2098   if (!Ty) {
   2099     // Single element to insert.
   2100     V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
   2101                                 Name + ".insert");
   2102     DEBUG(dbgs() << "     insert: " << *V << "\n");
   2103     return V;
   2104   }
   2105 
   2106   assert(Ty->getNumElements() <= VecTy->getNumElements() &&
   2107          "Too many elements!");
   2108   if (Ty->getNumElements() == VecTy->getNumElements()) {
   2109     assert(V->getType() == VecTy && "Vector type mismatch");
   2110     return V;
   2111   }
   2112   unsigned EndIndex = BeginIndex + Ty->getNumElements();
   2113 
    2114   // When inserting a smaller vector into the larger one to store, we first
    2115   // use a shuffle vector to widen it with undef elements, and then use a
    2116   // select to choose between that widened incoming vector and the loaded
    2117   // vector.
   2118   SmallVector<Constant *, 8> Mask;
   2119   Mask.reserve(VecTy->getNumElements());
   2120   for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
   2121     if (i >= BeginIndex && i < EndIndex)
   2122       Mask.push_back(IRB.getInt32(i - BeginIndex));
   2123     else
   2124       Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
   2125   V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
   2126                               ConstantVector::get(Mask), Name + ".expand");
   2127   DEBUG(dbgs() << "    shuffle: " << *V << "\n");
   2128 
   2129   Mask.clear();
   2130   for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
   2131     Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
   2132 
    2133   V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + ".blend");
   2134 
   2135   DEBUG(dbgs() << "    blend: " << *V << "\n");
   2136   return V;
   2137 }
   2138 
    2139 /// \brief Visitor to rewrite instructions using a particular slice of an alloca
   2140 /// to use a new alloca.
   2141 ///
   2142 /// Also implements the rewriting to vector-based accesses when the partition
   2143 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
   2144 /// lives here.
   2145 class llvm::sroa::AllocaSliceRewriter
   2146     : public InstVisitor<AllocaSliceRewriter, bool> {
   2147   // Befriend the base class so it can delegate to private visit methods.
   2148   friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
   2149   typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
   2150 
   2151   const DataLayout &DL;
   2152   AllocaSlices &AS;
   2153   SROA &Pass;
   2154   AllocaInst &OldAI, &NewAI;
   2155   const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
   2156   Type *NewAllocaTy;
   2157 
    2158   // This is both a flag and a convenience variable: it will be null unless the
    2159   // new alloca's integer operations should be widened to this integer type due
    2160   // to passing isIntegerWideningViable above. If it is non-null, the desired
   2161   // integer type will be stored here for easy access during rewriting.
   2162   IntegerType *IntTy;
   2163 
   2164   // If we are rewriting an alloca partition which can be written as pure
   2165   // vector operations, we stash extra information here. When VecTy is
   2166   // non-null, we have some strict guarantees about the rewritten alloca:
   2167   //   - The new alloca is exactly the size of the vector type here.
   2168   //   - The accesses all either map to the entire vector or to a single
   2169   //     element.
   2170   //   - The set of accessing instructions is only one of those handled above
   2171   //     in isVectorPromotionViable. Generally these are the same access kinds
   2172   //     which are promotable via mem2reg.
   2173   VectorType *VecTy;
   2174   Type *ElementTy;
   2175   uint64_t ElementSize;
   2176 
   2177   // The original offset of the slice currently being rewritten relative to
   2178   // the original alloca.
   2179   uint64_t BeginOffset, EndOffset;
   2180   // The new offsets of the slice currently being rewritten relative to the
   2181   // original alloca.
   2182   uint64_t NewBeginOffset, NewEndOffset;
   2183 
   2184   uint64_t SliceSize;
   2185   bool IsSplittable;
   2186   bool IsSplit;
   2187   Use *OldUse;
   2188   Instruction *OldPtr;
   2189 
   2190   // Track post-rewrite users which are PHI nodes and Selects.
   2191   SmallPtrSetImpl<PHINode *> &PHIUsers;
   2192   SmallPtrSetImpl<SelectInst *> &SelectUsers;
   2193 
    2194   // Utility IR builder whose name prefix is set up for each visited use, and
    2195   // whose insertion point is set to the user.
   2196   IRBuilderTy IRB;
   2197 
   2198 public:
   2199   AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
   2200                       AllocaInst &OldAI, AllocaInst &NewAI,
   2201                       uint64_t NewAllocaBeginOffset,
   2202                       uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
   2203                       VectorType *PromotableVecTy,
   2204                       SmallPtrSetImpl<PHINode *> &PHIUsers,
   2205                       SmallPtrSetImpl<SelectInst *> &SelectUsers)
   2206       : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
   2207         NewAllocaBeginOffset(NewAllocaBeginOffset),
   2208         NewAllocaEndOffset(NewAllocaEndOffset),
   2209         NewAllocaTy(NewAI.getAllocatedType()),
   2210         IntTy(IsIntegerPromotable
   2211                   ? Type::getIntNTy(
   2212                         NewAI.getContext(),
   2213                         DL.getTypeSizeInBits(NewAI.getAllocatedType()))
   2214                   : nullptr),
   2215         VecTy(PromotableVecTy),
   2216         ElementTy(VecTy ? VecTy->getElementType() : nullptr),
   2217         ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
   2218         BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
   2219         OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
   2220         IRB(NewAI.getContext(), ConstantFolder()) {
   2221     if (VecTy) {
   2222       assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
   2223              "Only multiple-of-8 sized vector elements are viable");
   2224       ++NumVectorized;
   2225     }
   2226     assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
   2227   }
   2228 
   2229   bool visit(AllocaSlices::const_iterator I) {
   2230     bool CanSROA = true;
   2231     BeginOffset = I->beginOffset();
   2232     EndOffset = I->endOffset();
   2233     IsSplittable = I->isSplittable();
   2234     IsSplit =
   2235         BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
   2236     DEBUG(dbgs() << "  rewriting " << (IsSplit ? "split " : ""));
   2237     DEBUG(AS.printSlice(dbgs(), I, ""));
   2238     DEBUG(dbgs() << "\n");
   2239 
   2240     // Compute the intersecting offset range.
   2241     assert(BeginOffset < NewAllocaEndOffset);
   2242     assert(EndOffset > NewAllocaBeginOffset);
   2243     NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
   2244     NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
   2245 
   2246     SliceSize = NewEndOffset - NewBeginOffset;
   2247 
   2248     OldUse = I->getUse();
   2249     OldPtr = cast<Instruction>(OldUse->get());
   2250 
   2251     Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
   2252     IRB.SetInsertPoint(OldUserI);
   2253     IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
   2254     IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
   2255 
   2256     CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
   2257     if (VecTy || IntTy)
   2258       assert(CanSROA);
   2259     return CanSROA;
   2260   }
   2261 
   2262 private:
   2263   // Make sure the other visit overloads are visible.
   2264   using Base::visit;
   2265 
   2266   // Every instruction which can end up as a user must have a rewrite rule.
   2267   bool visitInstruction(Instruction &I) {
   2268     DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
   2269     llvm_unreachable("No rewrite rule for this instruction!");
   2270   }
   2271 
   2272   Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
   2273     // Note that the offset computation can use BeginOffset or NewBeginOffset
   2274     // interchangeably for unsplit slices.
   2275     assert(IsSplit || BeginOffset == NewBeginOffset);
   2276     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
   2277 
   2278 #ifndef NDEBUG
   2279     StringRef OldName = OldPtr->getName();
   2280     // Skip through the last '.sroa.' component of the name.
   2281     size_t LastSROAPrefix = OldName.rfind(".sroa.");
   2282     if (LastSROAPrefix != StringRef::npos) {
   2283       OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
   2284       // Look for an SROA slice index.
   2285       size_t IndexEnd = OldName.find_first_not_of("0123456789");
   2286       if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
   2287         // Strip the index and look for the offset.
   2288         OldName = OldName.substr(IndexEnd + 1);
   2289         size_t OffsetEnd = OldName.find_first_not_of("0123456789");
   2290         if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
   2291           // Strip the offset.
   2292           OldName = OldName.substr(OffsetEnd + 1);
   2293       }
   2294     }
   2295     // Strip any SROA suffixes as well.
   2296     OldName = OldName.substr(0, OldName.find(".sroa_"));
   2297 #endif
   2298 
   2299     return getAdjustedPtr(IRB, DL, &NewAI,
   2300                           APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
   2301 #ifndef NDEBUG
   2302                           Twine(OldName) + "."
   2303 #else
   2304                           Twine()
   2305 #endif
   2306                           );
   2307   }
   2308 
   2309   /// \brief Compute suitable alignment to access this slice of the *new*
   2310   /// alloca.
   2311   ///
   2312   /// You can optionally pass a type to this routine and if that type's ABI
   2313   /// alignment is itself suitable, this will return zero.
   2314   unsigned getSliceAlign(Type *Ty = nullptr) {
   2315     unsigned NewAIAlign = NewAI.getAlignment();
   2316     if (!NewAIAlign)
   2317       NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
   2318     unsigned Align =
   2319         MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
   2320     return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
   2321   }
   2322 
   2323   unsigned getIndex(uint64_t Offset) {
   2324     assert(VecTy && "Can only call getIndex when rewriting a vector");
   2325     uint64_t RelOffset = Offset - NewAllocaBeginOffset;
   2326     assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
   2327     uint32_t Index = RelOffset / ElementSize;
   2328     assert(Index * ElementSize == RelOffset);
   2329     return Index;
   2330   }
   2331 
   2332   void deleteIfTriviallyDead(Value *V) {
   2333     Instruction *I = cast<Instruction>(V);
   2334     if (isInstructionTriviallyDead(I))
   2335       Pass.DeadInsts.insert(I);
   2336   }
   2337 
   2338   Value *rewriteVectorizedLoadInst() {
   2339     unsigned BeginIndex = getIndex(NewBeginOffset);
   2340     unsigned EndIndex = getIndex(NewEndOffset);
   2341     assert(EndIndex > BeginIndex && "Empty vector!");
   2342 
   2343     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
   2344     return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
   2345   }
   2346 
   2347   Value *rewriteIntegerLoad(LoadInst &LI) {
   2348     assert(IntTy && "We cannot insert an integer to the alloca");
   2349     assert(!LI.isVolatile());
   2350     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
   2351     V = convertValue(DL, IRB, V, IntTy);
   2352     assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
   2353     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
   2354     if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
   2355       IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
   2356       V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
   2357     }
   2358     // It is possible that the extracted type is not the load type. This
   2359     // happens if there is a load past the end of the alloca, and as
   2360     // a consequence the slice is narrower but still a candidate for integer
   2361     // lowering. To handle this case, we just zero extend the extracted
   2362     // integer.
   2363     assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
   2364            "Can only handle an extract for an overly wide load");
   2365     if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
   2366       V = IRB.CreateZExt(V, LI.getType());
   2367     return V;
   2368   }
   2369 
   2370   bool visitLoadInst(LoadInst &LI) {
   2371     DEBUG(dbgs() << "    original: " << LI << "\n");
   2372     Value *OldOp = LI.getOperand(0);
   2373     assert(OldOp == OldPtr);
   2374 
   2375     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
   2376                              : LI.getType();
   2377     const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
   2378     bool IsPtrAdjusted = false;
   2379     Value *V;
   2380     if (VecTy) {
   2381       V = rewriteVectorizedLoadInst();
   2382     } else if (IntTy && LI.getType()->isIntegerTy()) {
   2383       V = rewriteIntegerLoad(LI);
   2384     } else if (NewBeginOffset == NewAllocaBeginOffset &&
   2385                NewEndOffset == NewAllocaEndOffset &&
   2386                (canConvertValue(DL, NewAllocaTy, TargetTy) ||
   2387                 (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
   2388                  TargetTy->isIntegerTy()))) {
   2389       LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
   2390                                               LI.isVolatile(), LI.getName());
   2391       if (LI.isVolatile())
   2392         NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
   2393       V = NewLI;
   2394 
   2395       // If this is an integer load past the end of the slice (which means the
   2396       // bytes outside the slice are undef or this load is dead) just forcibly
   2397       // fix the integer size with correct handling of endianness.
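               // For example (illustrative): an i16 new alloca read by an i32
               // load is zext'ed to i32 and, on a big-endian target, shifted
               // left by 16 so the two defined bytes keep the byte positions
               // they have in memory within the wider value.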
   2398       if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
   2399         if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
   2400           if (AITy->getBitWidth() < TITy->getBitWidth()) {
   2401             V = IRB.CreateZExt(V, TITy, "load.ext");
   2402             if (DL.isBigEndian())
   2403               V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
   2404                                 "endian_shift");
   2405           }
   2406     } else {
   2407       Type *LTy = TargetTy->getPointerTo();
   2408       LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
   2409                                               getSliceAlign(TargetTy),
   2410                                               LI.isVolatile(), LI.getName());
   2411       if (LI.isVolatile())
   2412         NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
   2413 
   2414       V = NewLI;
   2415       IsPtrAdjusted = true;
   2416     }
   2417     V = convertValue(DL, IRB, V, TargetTy);
   2418 
   2419     if (IsSplit) {
   2420       assert(!LI.isVolatile());
   2421       assert(LI.getType()->isIntegerTy() &&
   2422              "Only integer type loads and stores are split");
   2423       assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
   2424              "Split load isn't smaller than original load");
   2425       assert(LI.getType()->getIntegerBitWidth() ==
   2426                  DL.getTypeStoreSizeInBits(LI.getType()) &&
   2427              "Non-byte-multiple bit width");
   2428       // Move the insertion point just past the load so that we can refer to it.
   2429       IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
   2430       // Create a placeholder value with the same type as LI to use as the
   2431       // basis for the new value. This allows us to replace the uses of LI with
   2432       // the computed value, and then replace the placeholder with LI, leaving
   2433       // LI only used for this computation.
   2434       Value *Placeholder =
   2435           new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
   2436       V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
   2437                         "insert");
   2438       LI.replaceAllUsesWith(V);
   2439       Placeholder->replaceAllUsesWith(&LI);
   2440       delete Placeholder;
   2441     } else {
   2442       LI.replaceAllUsesWith(V);
   2443     }
   2444 
   2445     Pass.DeadInsts.insert(&LI);
   2446     deleteIfTriviallyDead(OldOp);
   2447     DEBUG(dbgs() << "          to: " << *V << "\n");
   2448     return !LI.isVolatile() && !IsPtrAdjusted;
   2449   }
   2450 
   2451   bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
   2452     if (V->getType() != VecTy) {
   2453       unsigned BeginIndex = getIndex(NewBeginOffset);
   2454       unsigned EndIndex = getIndex(NewEndOffset);
   2455       assert(EndIndex > BeginIndex && "Empty vector!");
   2456       unsigned NumElements = EndIndex - BeginIndex;
   2457       assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
   2458       Type *SliceTy = (NumElements == 1)
   2459                           ? ElementTy
   2460                           : VectorType::get(ElementTy, NumElements);
   2461       if (V->getType() != SliceTy)
   2462         V = convertValue(DL, IRB, V, SliceTy);
   2463 
   2464       // Mix in the existing elements.
   2465       Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
   2466       V = insertVector(IRB, Old, V, BeginIndex, "vec");
   2467     }
   2468     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
   2469     Pass.DeadInsts.insert(&SI);
   2470 
   2471     (void)Store;
   2472     DEBUG(dbgs() << "          to: " << *Store << "\n");
   2473     return true;
   2474   }
   2475 
   2476   bool rewriteIntegerStore(Value *V, StoreInst &SI) {
   2477     assert(IntTy && "We cannot extract an integer from the alloca");
   2478     assert(!SI.isVolatile());
   2479     if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
   2480       Value *Old =
   2481           IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
   2482       Old = convertValue(DL, IRB, Old, IntTy);
   2483       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
   2484       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
   2485       V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
   2486     }
   2487     V = convertValue(DL, IRB, V, NewAllocaTy);
   2488     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
   2489     Pass.DeadInsts.insert(&SI);
   2490     (void)Store;
   2491     DEBUG(dbgs() << "          to: " << *Store << "\n");
   2492     return true;
   2493   }
   2494 
   2495   bool visitStoreInst(StoreInst &SI) {
   2496     DEBUG(dbgs() << "    original: " << SI << "\n");
   2497     Value *OldOp = SI.getOperand(1);
   2498     assert(OldOp == OldPtr);
   2499 
   2500     Value *V = SI.getValueOperand();
   2501 
   2502     // Strip all inbounds GEPs and pointer casts to try to dig out any root
   2503     // alloca that should be re-examined after promoting this alloca.
   2504     if (V->getType()->isPointerTy())
   2505       if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
   2506         Pass.PostPromotionWorklist.insert(AI);
   2507 
   2508     if (SliceSize < DL.getTypeStoreSize(V->getType())) {
   2509       assert(!SI.isVolatile());
   2510       assert(V->getType()->isIntegerTy() &&
   2511              "Only integer type loads and stores are split");
   2512       assert(V->getType()->getIntegerBitWidth() ==
   2513                  DL.getTypeStoreSizeInBits(V->getType()) &&
   2514              "Non-byte-multiple bit width");
   2515       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
   2516       V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
   2517                          "extract");
   2518     }
   2519 
   2520     if (VecTy)
   2521       return rewriteVectorizedStoreInst(V, SI, OldOp);
   2522     if (IntTy && V->getType()->isIntegerTy())
   2523       return rewriteIntegerStore(V, SI);
   2524 
   2525     const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
   2526     StoreInst *NewSI;
   2527     if (NewBeginOffset == NewAllocaBeginOffset &&
   2528         NewEndOffset == NewAllocaEndOffset &&
   2529         (canConvertValue(DL, V->getType(), NewAllocaTy) ||
   2530          (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
   2531           V->getType()->isIntegerTy()))) {
    2532       // If this is an integer store past the end of the slice (and thus the bytes
   2533       // past that point are irrelevant or this is unreachable), truncate the
   2534       // value prior to storing.
   2535       if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
   2536         if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
   2537           if (VITy->getBitWidth() > AITy->getBitWidth()) {
   2538             if (DL.isBigEndian())
   2539               V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
   2540                                  "endian_shift");
   2541             V = IRB.CreateTrunc(V, AITy, "load.trunc");
   2542           }
   2543 
   2544       V = convertValue(DL, IRB, V, NewAllocaTy);
   2545       NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
   2546                                      SI.isVolatile());
   2547     } else {
   2548       Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
   2549       NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
   2550                                      SI.isVolatile());
   2551     }
   2552     if (SI.isVolatile())
   2553       NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
   2554     Pass.DeadInsts.insert(&SI);
   2555     deleteIfTriviallyDead(OldOp);
   2556 
   2557     DEBUG(dbgs() << "          to: " << *NewSI << "\n");
   2558     return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
   2559   }
   2560 
   2561   /// \brief Compute an integer value from splatting an i8 across the given
   2562   /// number of bytes.
   2563   ///
   2564   /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
   2565   /// call this routine.
   2566   /// FIXME: Heed the advice above.
   2567   ///
   2568   /// \param V The i8 value to splat.
   2569   /// \param Size The number of bytes in the output (assuming i8 is one byte)
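           ///
           /// Worked example (illustrative): splatting V = i8 0xAB across
           /// Size = 4 zero-extends V to i32 and multiplies it by
           /// 0xFFFFFFFF / 0xFF = 0x01010101, producing 0xABABABAB.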
   2570   Value *getIntegerSplat(Value *V, unsigned Size) {
   2571     assert(Size > 0 && "Expected a positive number of bytes.");
   2572     IntegerType *VTy = cast<IntegerType>(V->getType());
   2573     assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
   2574     if (Size == 1)
   2575       return V;
   2576 
   2577     Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
   2578     V = IRB.CreateMul(
   2579         IRB.CreateZExt(V, SplatIntTy, "zext"),
   2580         ConstantExpr::getUDiv(
   2581             Constant::getAllOnesValue(SplatIntTy),
   2582             ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
   2583                                   SplatIntTy)),
   2584         "isplat");
   2585     return V;
   2586   }
   2587 
   2588   /// \brief Compute a vector splat for a given element value.
   2589   Value *getVectorSplat(Value *V, unsigned NumElements) {
   2590     V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
   2591     DEBUG(dbgs() << "       splat: " << *V << "\n");
   2592     return V;
   2593   }
   2594 
   2595   bool visitMemSetInst(MemSetInst &II) {
   2596     DEBUG(dbgs() << "    original: " << II << "\n");
   2597     assert(II.getRawDest() == OldPtr);
   2598 
    2599     // If the memset has a variable size, it cannot be split; just adjust the
   2600     // pointer to the new alloca.
   2601     if (!isa<Constant>(II.getLength())) {
   2602       assert(!IsSplit);
   2603       assert(NewBeginOffset == BeginOffset);
   2604       II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
   2605       Type *CstTy = II.getAlignmentCst()->getType();
   2606       II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
   2607 
   2608       deleteIfTriviallyDead(OldPtr);
   2609       return false;
   2610     }
   2611 
   2612     // Record this instruction for deletion.
   2613     Pass.DeadInsts.insert(&II);
   2614 
   2615     Type *AllocaTy = NewAI.getAllocatedType();
   2616     Type *ScalarTy = AllocaTy->getScalarType();
   2617 
   2618     // If this doesn't map cleanly onto the alloca type, and that type isn't
   2619     // a single value type, just emit a memset.
   2620     if (!VecTy && !IntTy &&
   2621         (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
   2622          SliceSize != DL.getTypeStoreSize(AllocaTy) ||
   2623          !AllocaTy->isSingleValueType() ||
   2624          !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
   2625          DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
   2626       Type *SizeTy = II.getLength()->getType();
   2627       Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
   2628       CallInst *New = IRB.CreateMemSet(
   2629           getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
   2630           getSliceAlign(), II.isVolatile());
   2631       (void)New;
   2632       DEBUG(dbgs() << "          to: " << *New << "\n");
   2633       return false;
   2634     }
   2635 
   2636     // If we can represent this as a simple value, we have to build the actual
   2637     // value to store, which requires expanding the byte present in memset to
   2638     // a sensible representation for the alloca type. This is essentially
   2639     // splatting the byte to a sufficiently wide integer, splatting it across
   2640     // any desired vector width, and bitcasting to the final type.
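             // For instance (a hedged illustration): a memset of the byte 0x01
             // over two elements of a <4 x float> alloca splats the byte to the
             // i32 0x01010101, bitcasts that to float, vector-splats it to
             // <2 x float>, and inserts the result into the reloaded old vector.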
   2641     Value *V;
   2642 
   2643     if (VecTy) {
   2644       // If this is a memset of a vectorized alloca, insert it.
   2645       assert(ElementTy == ScalarTy);
   2646 
   2647       unsigned BeginIndex = getIndex(NewBeginOffset);
   2648       unsigned EndIndex = getIndex(NewEndOffset);
   2649       assert(EndIndex > BeginIndex && "Empty vector!");
   2650       unsigned NumElements = EndIndex - BeginIndex;
   2651       assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
   2652 
   2653       Value *Splat =
   2654           getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
   2655       Splat = convertValue(DL, IRB, Splat, ElementTy);
   2656       if (NumElements > 1)
   2657         Splat = getVectorSplat(Splat, NumElements);
   2658 
   2659       Value *Old =
   2660           IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
   2661       V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
   2662     } else if (IntTy) {
   2663       // If this is a memset on an alloca where we can widen stores, insert the
   2664       // set integer.
   2665       assert(!II.isVolatile());
   2666 
   2667       uint64_t Size = NewEndOffset - NewBeginOffset;
   2668       V = getIntegerSplat(II.getValue(), Size);
   2669 
   2670       if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
    2671                     EndOffset != NewAllocaEndOffset)) {
   2672         Value *Old =
   2673             IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
   2674         Old = convertValue(DL, IRB, Old, IntTy);
   2675         uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
   2676         V = insertInteger(DL, IRB, Old, V, Offset, "insert");
   2677       } else {
   2678         assert(V->getType() == IntTy &&
   2679                "Wrong type for an alloca wide integer!");
   2680       }
   2681       V = convertValue(DL, IRB, V, AllocaTy);
   2682     } else {
   2683       // Established these invariants above.
   2684       assert(NewBeginOffset == NewAllocaBeginOffset);
   2685       assert(NewEndOffset == NewAllocaEndOffset);
   2686 
   2687       V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
   2688       if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
   2689         V = getVectorSplat(V, AllocaVecTy->getNumElements());
   2690 
   2691       V = convertValue(DL, IRB, V, AllocaTy);
   2692     }
   2693 
   2694     Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
   2695                                         II.isVolatile());
   2696     (void)New;
   2697     DEBUG(dbgs() << "          to: " << *New << "\n");
   2698     return !II.isVolatile();
   2699   }
   2700 
   2701   bool visitMemTransferInst(MemTransferInst &II) {
   2702     // Rewriting of memory transfer instructions can be a bit tricky. We break
   2703     // them into two categories: split intrinsics and unsplit intrinsics.
   2704 
   2705     DEBUG(dbgs() << "    original: " << II << "\n");
   2706 
   2707     bool IsDest = &II.getRawDestUse() == OldUse;
   2708     assert((IsDest && II.getRawDest() == OldPtr) ||
   2709            (!IsDest && II.getRawSource() == OldPtr));
   2710 
   2711     unsigned SliceAlign = getSliceAlign();
   2712 
   2713     // For unsplit intrinsics, we simply modify the source and destination
    2714     // pointers in place. This isn't just an optimization; it is a matter of
   2715     // correctness. With unsplit intrinsics we may be dealing with transfers
   2716     // within a single alloca before SROA ran, or with transfers that have
   2717     // a variable length. We may also be dealing with memmove instead of
    2718     // memcpy, and so simply updating the pointers is all that is necessary to
   2719     // update both source and dest of a single call.
   2720     if (!IsSplittable) {
   2721       Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
   2722       if (IsDest)
   2723         II.setDest(AdjustedPtr);
   2724       else
   2725         II.setSource(AdjustedPtr);
   2726 
   2727       if (II.getAlignment() > SliceAlign) {
   2728         Type *CstTy = II.getAlignmentCst()->getType();
   2729         II.setAlignment(
   2730             ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
   2731       }
   2732 
   2733       DEBUG(dbgs() << "          to: " << II << "\n");
   2734       deleteIfTriviallyDead(OldPtr);
   2735       return false;
   2736     }
   2737     // For split transfer intrinsics we have an incredibly useful assurance:
   2738     // the source and destination do not reside within the same alloca, and at
   2739     // least one of them does not escape. This means that we can replace
   2740     // memmove with memcpy, and we don't need to worry about all manner of
   2741     // downsides to splitting and transforming the operations.
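             // Illustrative example (not upstream text): a memcpy that exactly
             // covers an i64-sized slice is rewritten below into a single load
             // through the adjusted "other" pointer and a store into the new
             // alloca (or the reverse when this slice is the source), which is
             // what later allows the new alloca to be promoted.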
   2742 
   2743     // If this doesn't map cleanly onto the alloca type, and that type isn't
   2744     // a single value type, just emit a memcpy.
   2745     bool EmitMemCpy =
   2746         !VecTy && !IntTy &&
   2747         (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
   2748          SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
   2749          !NewAI.getAllocatedType()->isSingleValueType());
   2750 
   2751     // If we're just going to emit a memcpy, the alloca hasn't changed, and the
    2752     // size hasn't been shrunk based on analysis of the viable range, then this
    2753     // is a no-op.
   2754     if (EmitMemCpy && &OldAI == &NewAI) {
   2755       // Ensure the start lines up.
   2756       assert(NewBeginOffset == BeginOffset);
   2757 
   2758       // Rewrite the size as needed.
   2759       if (NewEndOffset != EndOffset)
   2760         II.setLength(ConstantInt::get(II.getLength()->getType(),
   2761                                       NewEndOffset - NewBeginOffset));
   2762       return false;
   2763     }
   2764     // Record this instruction for deletion.
   2765     Pass.DeadInsts.insert(&II);
   2766 
   2767     // Strip all inbounds GEPs and pointer casts to try to dig out any root
   2768     // alloca that should be re-examined after rewriting this instruction.
   2769     Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
   2770     if (AllocaInst *AI =
   2771             dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
   2772       assert(AI != &OldAI && AI != &NewAI &&
   2773              "Splittable transfers cannot reach the same alloca on both ends.");
   2774       Pass.Worklist.insert(AI);
   2775     }
   2776 
   2777     Type *OtherPtrTy = OtherPtr->getType();
   2778     unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
   2779 
   2780     // Compute the relative offset for the other pointer within the transfer.
   2781     unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
   2782     APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
   2783     unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
   2784                                    OtherOffset.zextOrTrunc(64).getZExtValue());
   2785 
   2786     if (EmitMemCpy) {
   2787       // Compute the other pointer, folding as much as possible to produce
   2788       // a single, simple GEP in most cases.
   2789       OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
   2790                                 OtherPtr->getName() + ".");
   2791 
   2792       Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
   2793       Type *SizeTy = II.getLength()->getType();
   2794       Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
   2795 
   2796       CallInst *New = IRB.CreateMemCpy(
   2797           IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
   2798           MinAlign(SliceAlign, OtherAlign), II.isVolatile());
   2799       (void)New;
   2800       DEBUG(dbgs() << "          to: " << *New << "\n");
   2801       return false;
   2802     }
   2803 
   2804     bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
   2805                          NewEndOffset == NewAllocaEndOffset;
   2806     uint64_t Size = NewEndOffset - NewBeginOffset;
   2807     unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
   2808     unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
   2809     unsigned NumElements = EndIndex - BeginIndex;
   2810     IntegerType *SubIntTy =
   2811         IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
   2812 
   2813     // Reset the other pointer type to match the register type we're going to
   2814     // use, but using the address space of the original other pointer.
   2815     if (VecTy && !IsWholeAlloca) {
   2816       if (NumElements == 1)
   2817         OtherPtrTy = VecTy->getElementType();
   2818       else
   2819         OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
   2820 
   2821       OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS);
   2822     } else if (IntTy && !IsWholeAlloca) {
   2823       OtherPtrTy = SubIntTy->getPointerTo(OtherAS);
   2824     } else {
   2825       OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS);
   2826     }
   2827 
   2828     Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
   2829                                    OtherPtr->getName() + ".");
   2830     unsigned SrcAlign = OtherAlign;
   2831     Value *DstPtr = &NewAI;
   2832     unsigned DstAlign = SliceAlign;
   2833     if (!IsDest) {
   2834       std::swap(SrcPtr, DstPtr);
   2835       std::swap(SrcAlign, DstAlign);
   2836     }
   2837 
   2838     Value *Src;
   2839     if (VecTy && !IsWholeAlloca && !IsDest) {
   2840       Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
   2841       Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
   2842     } else if (IntTy && !IsWholeAlloca && !IsDest) {
   2843       Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
   2844       Src = convertValue(DL, IRB, Src, IntTy);
   2845       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
   2846       Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
   2847     } else {
   2848       Src =
   2849           IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
   2850     }
   2851 
   2852     if (VecTy && !IsWholeAlloca && IsDest) {
   2853       Value *Old =
   2854           IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
   2855       Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
   2856     } else if (IntTy && !IsWholeAlloca && IsDest) {
   2857       Value *Old =
   2858           IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
   2859       Old = convertValue(DL, IRB, Old, IntTy);
   2860       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
   2861       Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
   2862       Src = convertValue(DL, IRB, Src, NewAllocaTy);
   2863     }
   2864 
   2865     StoreInst *Store = cast<StoreInst>(
   2866         IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
   2867     (void)Store;
   2868     DEBUG(dbgs() << "          to: " << *Store << "\n");
   2869     return !II.isVolatile();
   2870   }
   2871 
   2872   bool visitIntrinsicInst(IntrinsicInst &II) {
   2873     assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
   2874            II.getIntrinsicID() == Intrinsic::lifetime_end);
   2875     DEBUG(dbgs() << "    original: " << II << "\n");
   2876     assert(II.getArgOperand(1) == OldPtr);
   2877 
   2878     // Record this instruction for deletion.
   2879     Pass.DeadInsts.insert(&II);
   2880 
   2881     ConstantInt *Size =
   2882         ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
   2883                          NewEndOffset - NewBeginOffset);
   2884     Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
   2885     Value *New;
   2886     if (II.getIntrinsicID() == Intrinsic::lifetime_start)
   2887       New = IRB.CreateLifetimeStart(Ptr, Size);
   2888     else
   2889       New = IRB.CreateLifetimeEnd(Ptr, Size);
   2890 
   2891     (void)New;
   2892     DEBUG(dbgs() << "          to: " << *New << "\n");
   2893     return true;
   2894   }
   2895 
   2896   bool visitPHINode(PHINode &PN) {
   2897     DEBUG(dbgs() << "    original: " << PN << "\n");
   2898     assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
   2899     assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
   2900 
   2901     // We would like to compute a new pointer in only one place, but have it be
   2902     // as local as possible to the PHI. To do that, we re-use the location of
   2903     // the old pointer, which necessarily must be in the right position to
   2904     // dominate the PHI.
   2905     IRBuilderTy PtrBuilder(IRB);
   2906     if (isa<PHINode>(OldPtr))
   2907       PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
   2908     else
   2909       PtrBuilder.SetInsertPoint(OldPtr);
   2910     PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
   2911 
   2912     Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
   2913     // Replace the operands which were using the old pointer.
   2914     std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
   2915 
   2916     DEBUG(dbgs() << "          to: " << PN << "\n");
   2917     deleteIfTriviallyDead(OldPtr);
   2918 
   2919     // PHIs can't be promoted on their own, but often can be speculated. We
   2920     // check the speculation outside of the rewriter so that we see the
   2921     // fully-rewritten alloca.
   2922     PHIUsers.insert(&PN);
   2923     return true;
   2924   }
   2925 
   2926   bool visitSelectInst(SelectInst &SI) {
   2927     DEBUG(dbgs() << "    original: " << SI << "\n");
   2928     assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
   2929            "Pointer isn't an operand!");
   2930     assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
   2931     assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
   2932 
   2933     Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
   2934     // Replace the operands which were using the old pointer.
   2935     if (SI.getOperand(1) == OldPtr)
   2936       SI.setOperand(1, NewPtr);
   2937     if (SI.getOperand(2) == OldPtr)
   2938       SI.setOperand(2, NewPtr);
   2939 
   2940     DEBUG(dbgs() << "          to: " << SI << "\n");
   2941     deleteIfTriviallyDead(OldPtr);
   2942 
   2943     // Selects can't be promoted on their own, but often can be speculated. We
   2944     // check the speculation outside of the rewriter so that we see the
   2945     // fully-rewritten alloca.
   2946     SelectUsers.insert(&SI);
   2947     return true;
   2948   }
   2949 };
   2950 
   2951 namespace {
   2952 /// \brief Visitor to rewrite aggregate loads and stores as scalar.
   2953 ///
   2954 /// This pass aggressively rewrites all aggregate loads and stores on
   2955 /// a particular pointer (or any pointer derived from it which we can identify)
   2956 /// with scalar loads and stores.
   2957 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
   2958   // Befriend the base class so it can delegate to private visit methods.
   2959   friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
   2960 
   2961   /// Queue of pointer uses to analyze and potentially rewrite.
   2962   SmallVector<Use *, 8> Queue;
   2963 
   2964   /// Set to prevent us from cycling with phi nodes and loops.
   2965   SmallPtrSet<User *, 8> Visited;
   2966 
   2967   /// The current pointer use being rewritten. This is used to dig up the used
   2968   /// value (as opposed to the user).
   2969   Use *U;
   2970 
   2971 public:
   2972   /// Rewrite loads and stores through a pointer and all pointers derived from
   2973   /// it.
   2974   bool rewrite(Instruction &I) {
   2975     DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
   2976     enqueueUsers(I);
   2977     bool Changed = false;
   2978     while (!Queue.empty()) {
   2979       U = Queue.pop_back_val();
   2980       Changed |= visit(cast<Instruction>(U->getUser()));
   2981     }
   2982     return Changed;
   2983   }
   2984 
   2985 private:
   2986   /// Enqueue all the users of the given instruction for further processing.
   2987   /// This uses a set to de-duplicate users.
   2988   void enqueueUsers(Instruction &I) {
   2989     for (Use &U : I.uses())
   2990       if (Visited.insert(U.getUser()).second)
   2991         Queue.push_back(&U);
   2992   }
   2993 
   2994   // Conservative default is to not rewrite anything.
   2995   bool visitInstruction(Instruction &I) { return false; }
   2996 
   2997   /// \brief Generic recursive split emission class.
   2998   template <typename Derived> class OpSplitter {
   2999   protected:
   3000     /// The builder used to form new instructions.
   3001     IRBuilderTy IRB;
   3002     /// The indices which to be used with insert- or extractvalue to select the
   3003     /// appropriate value within the aggregate.
   3004     SmallVector<unsigned, 4> Indices;
   3005     /// The indices to a GEP instruction which will move Ptr to the correct slot
   3006     /// within the aggregate.
   3007     SmallVector<Value *, 4> GEPIndices;
   3008     /// The base pointer of the original op, used as a base for GEPing the
   3009     /// split operations.
   3010     Value *Ptr;
   3011 
   3012     /// Initialize the splitter with an insertion point, Ptr and start with a
   3013     /// single zero GEP index.
   3014     OpSplitter(Instruction *InsertionPoint, Value *Ptr)
   3015         : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
   3016 
   3017   public:
   3018     /// \brief Generic recursive split emission routine.
   3019     ///
   3020     /// This method recursively splits an aggregate op (load or store) into
   3021     /// scalar or vector ops. It splits recursively until it hits a single value
   3022     /// and emits that single value operation via the template argument.
   3023     ///
   3024     /// The logic of this routine relies on GEPs and insertvalue and
   3025     /// extractvalue all operating with the same fundamental index list, merely
   3026     /// formatted differently (GEPs need actual values).
   3027     ///
   3028     /// \param Ty  The type being split recursively into smaller ops.
   3029     /// \param Agg The aggregate value being built up or stored, depending on
   3030     /// whether this is splitting a load or a store respectively.
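             ///
             /// Illustrative walk (an example, not upstream text): splitting a
             /// load of { i32, [2 x float] } visits leaves with Indices {0},
             /// {1, 0} and {1, 1}, while GEPIndices carries the same path
             /// prefixed by the initial zero: {0, 0}, {0, 1, 0} and {0, 1, 1}.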
   3031     void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
   3032       if (Ty->isSingleValueType())
   3033         return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
   3034 
   3035       if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
   3036         unsigned OldSize = Indices.size();
   3037         (void)OldSize;
   3038         for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
   3039              ++Idx) {
   3040           assert(Indices.size() == OldSize && "Did not return to the old size");
   3041           Indices.push_back(Idx);
   3042           GEPIndices.push_back(IRB.getInt32(Idx));
   3043           emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
   3044           GEPIndices.pop_back();
   3045           Indices.pop_back();
   3046         }
   3047         return;
   3048       }
   3049 
   3050       if (StructType *STy = dyn_cast<StructType>(Ty)) {
   3051         unsigned OldSize = Indices.size();
   3052         (void)OldSize;
   3053         for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
   3054              ++Idx) {
   3055           assert(Indices.size() == OldSize && "Did not return to the old size");
   3056           Indices.push_back(Idx);
   3057           GEPIndices.push_back(IRB.getInt32(Idx));
   3058           emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
   3059           GEPIndices.pop_back();
   3060           Indices.pop_back();
   3061         }
   3062         return;
   3063       }
   3064 
   3065       llvm_unreachable("Only arrays and structs are aggregate loadable types");
   3066     }
   3067   };
   3068 
   3069   struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
   3070     LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
   3071         : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
   3072 
   3073     /// Emit a leaf load of a single value. This is called at the leaves of the
   3074     /// recursive emission to actually load values.
   3075     void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
   3076       assert(Ty->isSingleValueType());
   3077       // Load the single value and insert it using the indices.
   3078       Value *GEP =
   3079           IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
   3080       Value *Load = IRB.CreateLoad(GEP, Name + ".load");
   3081       Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
   3082       DEBUG(dbgs() << "          to: " << *Load << "\n");
   3083     }
   3084   };
   3085 
   3086   bool visitLoadInst(LoadInst &LI) {
   3087     assert(LI.getPointerOperand() == *U);
   3088     if (!LI.isSimple() || LI.getType()->isSingleValueType())
   3089       return false;
   3090 
   3091     // We have an aggregate being loaded, split it apart.
   3092     DEBUG(dbgs() << "    original: " << LI << "\n");
   3093     LoadOpSplitter Splitter(&LI, *U);
   3094     Value *V = UndefValue::get(LI.getType());
   3095     Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
   3096     LI.replaceAllUsesWith(V);
   3097     LI.eraseFromParent();
   3098     return true;
   3099   }
   3100 
   3101   struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
   3102     StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
   3103         : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
   3104 
   3105     /// Emit a leaf store of a single value. This is called at the leaves of the
   3106     /// recursive emission to actually produce stores.
   3107     void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
   3108       assert(Ty->isSingleValueType());
   3109       // Extract the single value and store it using the indices.
   3110       //
   3111       // The gep and extractvalue values are factored out of the CreateStore
   3112       // call to make the output independent of the argument evaluation order.
   3113       Value *ExtractValue =
   3114           IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
   3115       Value *InBoundsGEP =
   3116           IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
   3117       Value *Store = IRB.CreateStore(ExtractValue, InBoundsGEP);
   3118       (void)Store;
   3119       DEBUG(dbgs() << "          to: " << *Store << "\n");
   3120     }
   3121   };
   3122 
   3123   bool visitStoreInst(StoreInst &SI) {
   3124     if (!SI.isSimple() || SI.getPointerOperand() != *U)
   3125       return false;
   3126     Value *V = SI.getValueOperand();
   3127     if (V->getType()->isSingleValueType())
   3128       return false;
   3129 
   3130     // We have an aggregate being stored, split it apart.
   3131     DEBUG(dbgs() << "    original: " << SI << "\n");
   3132     StoreOpSplitter Splitter(&SI, *U);
   3133     Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
   3134     SI.eraseFromParent();
   3135     return true;
   3136   }
   3137 
   3138   bool visitBitCastInst(BitCastInst &BC) {
   3139     enqueueUsers(BC);
   3140     return false;
   3141   }
   3142 
   3143   bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
   3144     enqueueUsers(GEPI);
   3145     return false;
   3146   }
   3147 
   3148   bool visitPHINode(PHINode &PN) {
   3149     enqueueUsers(PN);
   3150     return false;
   3151   }
   3152 
   3153   bool visitSelectInst(SelectInst &SI) {
   3154     enqueueUsers(SI);
   3155     return false;
   3156   }
   3157 };
   3158 }
   3159 
   3160 /// \brief Strip aggregate type wrapping.
   3161 ///
   3162 /// This removes no-op aggregate types wrapping an underlying type. It will
   3163 /// strip as many layers of types as it can without changing either the type
   3164 /// size or the allocated size.
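         ///
         /// For example (illustrative): [1 x { i64 }] strips down to i64, since
         /// every layer has the same size and allocated size as the inner type,
         /// whereas { i32, i32 } is returned unchanged.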
   3165 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
   3166   if (Ty->isSingleValueType())
   3167     return Ty;
   3168 
   3169   uint64_t AllocSize = DL.getTypeAllocSize(Ty);
   3170   uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
   3171 
   3172   Type *InnerTy;
   3173   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
   3174     InnerTy = ArrTy->getElementType();
   3175   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
   3176     const StructLayout *SL = DL.getStructLayout(STy);
   3177     unsigned Index = SL->getElementContainingOffset(0);
   3178     InnerTy = STy->getElementType(Index);
   3179   } else {
   3180     return Ty;
   3181   }
   3182 
   3183   if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
   3184       TypeSize > DL.getTypeSizeInBits(InnerTy))
   3185     return Ty;
   3186 
   3187   return stripAggregateTypeWrapping(DL, InnerTy);
   3188 }
   3189 
   3190 /// \brief Try to find a partition of the aggregate type passed in for a given
   3191 /// offset and size.
   3192 ///
   3193 /// This recurses through the aggregate type and tries to compute a subtype
   3194 /// based on the offset and size. When the offset and size span a sub-section
   3195 /// of an array, it will even compute a new array type for that sub-section,
   3196 /// and the same for structs.
   3197 ///
   3198 /// Note that this routine is very strict and tries to find a partition of the
   3199 /// type which produces the *exact* right offset and size. It is not forgiving
    3200 /// when the size or offset causes either end of the type-based partition to be off.
   3201 /// Also, this is a best-effort routine. It is reasonable to give up and not
   3202 /// return a type if necessary.
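         ///
         /// Illustrative examples (not upstream text): for { i32, i32, i64 },
         /// an (Offset, Size) of (8, 8) yields i64, (0, 8) yields the
         /// sub-struct { i32, i32 }, and (2, 4) straddles elements and yields
         /// nullptr.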
   3203 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
   3204                               uint64_t Size) {
   3205   if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
   3206     return stripAggregateTypeWrapping(DL, Ty);
   3207   if (Offset > DL.getTypeAllocSize(Ty) ||
   3208       (DL.getTypeAllocSize(Ty) - Offset) < Size)
   3209     return nullptr;
   3210 
   3211   if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
   3212     // We can't partition pointers...
   3213     if (SeqTy->isPointerTy())
   3214       return nullptr;
   3215 
   3216     Type *ElementTy = SeqTy->getElementType();
   3217     uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
   3218     uint64_t NumSkippedElements = Offset / ElementSize;
   3219     if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
   3220       if (NumSkippedElements >= ArrTy->getNumElements())
   3221         return nullptr;
   3222     } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
   3223       if (NumSkippedElements >= VecTy->getNumElements())
   3224         return nullptr;
   3225     }
   3226     Offset -= NumSkippedElements * ElementSize;
   3227 
   3228     // First check if we need to recurse.
   3229     if (Offset > 0 || Size < ElementSize) {
   3230       // Bail if the partition ends in a different array element.
   3231       if ((Offset + Size) > ElementSize)
   3232         return nullptr;
   3233       // Recurse through the element type trying to peel off offset bytes.
   3234       return getTypePartition(DL, ElementTy, Offset, Size);
   3235     }
   3236     assert(Offset == 0);
   3237 
   3238     if (Size == ElementSize)
   3239       return stripAggregateTypeWrapping(DL, ElementTy);
   3240     assert(Size > ElementSize);
   3241     uint64_t NumElements = Size / ElementSize;
   3242     if (NumElements * ElementSize != Size)
   3243       return nullptr;
   3244     return ArrayType::get(ElementTy, NumElements);
   3245   }
   3246 
   3247   StructType *STy = dyn_cast<StructType>(Ty);
   3248   if (!STy)
   3249     return nullptr;
   3250 
   3251   const StructLayout *SL = DL.getStructLayout(STy);
   3252   if (Offset >= SL->getSizeInBytes())
   3253     return nullptr;
   3254   uint64_t EndOffset = Offset + Size;
   3255   if (EndOffset > SL->getSizeInBytes())
   3256     return nullptr;
   3257 
   3258   unsigned Index = SL->getElementContainingOffset(Offset);
   3259   Offset -= SL->getElementOffset(Index);
   3260 
   3261   Type *ElementTy = STy->getElementType(Index);
   3262   uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
   3263   if (Offset >= ElementSize)
   3264     return nullptr; // The offset points into alignment padding.
   3265 
   3266   // See if any partition must be contained by the element.
   3267   if (Offset > 0 || Size < ElementSize) {
   3268     if ((Offset + Size) > ElementSize)
   3269       return nullptr;
   3270     return getTypePartition(DL, ElementTy, Offset, Size);
   3271   }
   3272   assert(Offset == 0);
   3273 
   3274   if (Size == ElementSize)
   3275     return stripAggregateTypeWrapping(DL, ElementTy);
   3276 
   3277   StructType::element_iterator EI = STy->element_begin() + Index,
   3278                                EE = STy->element_end();
   3279   if (EndOffset < SL->getSizeInBytes()) {
   3280     unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
   3281     if (Index == EndIndex)
   3282       return nullptr; // Within a single element and its padding.
   3283 
   3284     // Don't try to form "natural" types if the elements don't line up with the
   3285     // expected size.
   3286     // FIXME: We could potentially recurse down through the last element in the
   3287     // sub-struct to find a natural end point.
   3288     if (SL->getElementOffset(EndIndex) != EndOffset)
   3289       return nullptr;
   3290 
   3291     assert(Index < EndIndex);
   3292     EE = STy->element_begin() + EndIndex;
   3293   }
   3294 
   3295   // Try to build up a sub-structure.
   3296   StructType *SubTy =
   3297       StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
   3298   const StructLayout *SubSL = DL.getStructLayout(SubTy);
   3299   if (Size != SubSL->getSizeInBytes())
   3300     return nullptr; // The sub-struct doesn't have quite the size needed.
   3301 
   3302   return SubTy;
   3303 }
   3304 
   3305 /// \brief Pre-split loads and stores to simplify rewriting.
   3306 ///
   3307 /// We want to break up the splittable load+store pairs as much as
   3308 /// possible. This is important to do as a preprocessing step, as once we
   3309 /// start rewriting the accesses to partitions of the alloca we lose the
   3310 /// necessary information to correctly split apart paired loads and stores
   3311 /// which both point into this alloca. The case to consider is something like
   3312 /// the following:
   3313 ///
   3314 ///   %a = alloca [12 x i8]
   3315 ///   %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
   3316 ///   %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
   3317 ///   %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
   3318 ///   %iptr1 = bitcast i8* %gep1 to i64*
   3319 ///   %iptr2 = bitcast i8* %gep2 to i64*
   3320 ///   %fptr1 = bitcast i8* %gep1 to float*
   3321 ///   %fptr2 = bitcast i8* %gep2 to float*
   3322 ///   %fptr3 = bitcast i8* %gep3 to float*
   3323 ///   store float 0.0, float* %fptr1
   3324 ///   store float 1.0, float* %fptr2
   3325 ///   %v = load i64* %iptr1
   3326 ///   store i64 %v, i64* %iptr2
   3327 ///   %f1 = load float* %fptr2
   3328 ///   %f2 = load float* %fptr3
   3329 ///
   3330 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
   3331 /// promote everything so we recover the 2 SSA values that should have been
   3332 /// there all along.
   3333 ///
   3334 /// \returns true if any changes are made.
   3335 bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
   3336   DEBUG(dbgs() << "Pre-splitting loads and stores\n");
   3337 
   3338   // Track the loads and stores which are candidates for pre-splitting here, in
   3339   // the order they first appear during the partition scan. These give stable
   3340   // iteration order and a basis for tracking which loads and stores we
   3341   // actually split.
   3342   SmallVector<LoadInst *, 4> Loads;
   3343   SmallVector<StoreInst *, 4> Stores;
   3344 
   3345   // We need to accumulate the splits required of each load or store where we
   3346   // can find them via a direct lookup. This is important to cross-check loads
   3347   // and stores against each other. We also track the slice so that we can kill
   3348   // all the slices that end up split.
   3349   struct SplitOffsets {
   3350     Slice *S;
   3351     std::vector<uint64_t> Splits;
   3352   };
   3353   SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
   3354 
   3355   // Track loads out of this alloca which cannot, for any reason, be pre-split.
   3356   // This is important as we also cannot pre-split stores of those loads!
   3357   // FIXME: This is all pretty gross. It means that we can be more aggressive
   3358   // in pre-splitting when the load feeding the store happens to come from
   3359   // a separate alloca. Put another way, the effectiveness of SROA would be
   3360   // decreased by a frontend which just concatenated all of its local allocas
   3361   // into one big flat alloca. But defeating such patterns is exactly the job
    3362   // SROA is tasked with! Sadly, to not have this discrepancy we would have to
   3363   // change store pre-splitting to actually force pre-splitting of the load
   3364   // that feeds it *and all stores*. That makes pre-splitting much harder, but
   3365   // maybe it would make it more principled?
   3366   SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
   3367 
   3368   DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
   3369   for (auto &P : AS.partitions()) {
   3370     for (Slice &S : P) {
   3371       Instruction *I = cast<Instruction>(S.getUse()->getUser());
   3372       if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
   3373         // If this is a load we have to track that it can't participate in any
   3374         // pre-splitting. If this is a store of a load we have to track that
   3375         // that load also can't participate in any pre-splitting.
   3376         if (auto *LI = dyn_cast<LoadInst>(I))
   3377           UnsplittableLoads.insert(LI);
   3378         else if (auto *SI = dyn_cast<StoreInst>(I))
   3379           if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
   3380             UnsplittableLoads.insert(LI);
   3381         continue;
   3382       }
   3383       assert(P.endOffset() > S.beginOffset() &&
   3384              "Empty or backwards partition!");
   3385 
   3386       // Determine if this is a pre-splittable slice.
   3387       if (auto *LI = dyn_cast<LoadInst>(I)) {
   3388         assert(!LI->isVolatile() && "Cannot split volatile loads!");
   3389 
   3390         // The load must be used exclusively to store into other pointers for
   3391         // us to be able to arbitrarily pre-split it. The stores must also be
   3392         // simple to avoid changing semantics.
   3393         auto IsLoadSimplyStored = [](LoadInst *LI) {
   3394           for (User *LU : LI->users()) {
   3395             auto *SI = dyn_cast<StoreInst>(LU);
   3396             if (!SI || !SI->isSimple())
   3397               return false;
   3398           }
   3399           return true;
   3400         };
   3401         if (!IsLoadSimplyStored(LI)) {
   3402           UnsplittableLoads.insert(LI);
   3403           continue;
   3404         }
   3405 
   3406         Loads.push_back(LI);
   3407       } else if (auto *SI = dyn_cast<StoreInst>(I)) {
   3408         if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
   3409           // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
   3410           continue;
   3411         auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
   3412         if (!StoredLoad || !StoredLoad->isSimple())
   3413           continue;
   3414         assert(!SI->isVolatile() && "Cannot split volatile stores!");
   3415 
   3416         Stores.push_back(SI);
   3417       } else {
   3418         // Other uses cannot be pre-split.
   3419         continue;
   3420       }
   3421 
   3422       // Record the initial split.
   3423       DEBUG(dbgs() << "    Candidate: " << *I << "\n");
   3424       auto &Offsets = SplitOffsetsMap[I];
   3425       assert(Offsets.Splits.empty() &&
   3426              "Should not have splits the first time we see an instruction!");
   3427       Offsets.S = &S;
   3428       Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
   3429     }
   3430 
   3431     // Now scan the already split slices, and add a split for any of them which
   3432     // we're going to pre-split.
   3433     for (Slice *S : P.splitSliceTails()) {
   3434       auto SplitOffsetsMapI =
   3435           SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
   3436       if (SplitOffsetsMapI == SplitOffsetsMap.end())
   3437         continue;
   3438       auto &Offsets = SplitOffsetsMapI->second;
   3439 
   3440       assert(Offsets.S == S && "Found a mismatched slice!");
   3441       assert(!Offsets.Splits.empty() &&
   3442              "Cannot have an empty set of splits on the second partition!");
   3443       assert(Offsets.Splits.back() ==
   3444                  P.beginOffset() - Offsets.S->beginOffset() &&
   3445              "Previous split does not end where this one begins!");
   3446 
   3447       // Record each split. The last partition's end isn't needed as the size
   3448       // of the slice dictates that.
   3449       if (S->endOffset() > P.endOffset())
   3450         Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
   3451     }
   3452   }
   3453 
   3454   // We may have split loads where some of their stores are split stores. For
   3455   // such loads and stores, we can only pre-split them if their splits exactly
   3456   // match relative to their starting offset. We have to verify this prior to
   3457   // any rewriting.
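           // For example (illustrative): if an i64 load is split at relative
           // offset {4} but the store of that value sits in partitions that
           // would split it at {2, 6}, the offset lists differ, so the store is
           // dropped here and the load is marked unsplittable.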
   3458   Stores.erase(
   3459       std::remove_if(Stores.begin(), Stores.end(),
   3460                      [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
   3461                        // Lookup the load we are storing in our map of split
   3462                        // offsets.
   3463                        auto *LI = cast<LoadInst>(SI->getValueOperand());
   3464                        // If it was completely unsplittable, then we're done,
   3465                        // and this store can't be pre-split.
   3466                        if (UnsplittableLoads.count(LI))
   3467                          return true;
   3468 
   3469                        auto LoadOffsetsI = SplitOffsetsMap.find(LI);
   3470                        if (LoadOffsetsI == SplitOffsetsMap.end())
   3471                          return false; // Unrelated loads are definitely safe.
   3472                        auto &LoadOffsets = LoadOffsetsI->second;
   3473 
   3474                        // Now lookup the store's offsets.
   3475                        auto &StoreOffsets = SplitOffsetsMap[SI];
   3476 
   3477                        // If the relative offsets of each split in the load and
   3478                        // store match exactly, then we can split them and we
   3479                        // don't need to remove them here.
   3480                        if (LoadOffsets.Splits == StoreOffsets.Splits)
   3481                          return false;
   3482 
   3483                        DEBUG(dbgs()
   3484                              << "    Mismatched splits for load and store:\n"
   3485                              << "      " << *LI << "\n"
   3486                              << "      " << *SI << "\n");
   3487 
   3488                        // We've found a store and load that we need to split
   3489                        // with mismatched relative splits. Just give up on them
   3490                        // and remove both instructions from our list of
   3491                        // candidates.
   3492                        UnsplittableLoads.insert(LI);
   3493                        return true;
   3494                      }),
   3495       Stores.end());
   3496   // Now we have to go *back* through all the stores, because a later store may
   3497   // have caused an earlier store's load to become unsplittable and if it is
   3498   // unsplittable for the later store, then we can't rely on it being split in
   3499   // the earlier store either.
   3500   Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
   3501                               [&UnsplittableLoads](StoreInst *SI) {
   3502                                 auto *LI =
   3503                                     cast<LoadInst>(SI->getValueOperand());
   3504                                 return UnsplittableLoads.count(LI);
   3505                               }),
   3506                Stores.end());
   3507   // Once we've established all the loads that can't be split for some reason,
    3508   // filter out any that made it into our list.
   3509   Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
   3510                              [&UnsplittableLoads](LoadInst *LI) {
   3511                                return UnsplittableLoads.count(LI);
   3512                              }),
   3513               Loads.end());
   3514 
   3515 
   3516   // If no loads or stores are left, there is no pre-splitting to be done for
   3517   // this alloca.
   3518   if (Loads.empty() && Stores.empty())
   3519     return false;
   3520 
   3521   // From here on, we can't fail and will be building new accesses, so rig up
   3522   // an IR builder.
   3523   IRBuilderTy IRB(&AI);
   3524 
   3525   // Collect the new slices which we will merge into the alloca slices.
   3526   SmallVector<Slice, 4> NewSlices;
   3527 
   3528   // Track any allocas we end up splitting loads and stores for so we iterate
   3529   // on them.
   3530   SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
   3531 
   3532   // At this point, we have collected all of the loads and stores we can
   3533   // pre-split, and the specific splits needed for them. We actually do the
   3534   // splitting in a specific order in order to handle when one of the loads in
   3535   // the value operand to one of the stores.
   3536   //
   3537   // First, we rewrite all of the split loads, and just accumulate each split
   3538   // load in a parallel structure. We also build the slices for them and append
   3539   // them to the alloca slices.
   3540   SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
   3541   std::vector<LoadInst *> SplitLoads;
   3542   const DataLayout &DL = AI.getModule()->getDataLayout();
   3543   for (LoadInst *LI : Loads) {
   3544     SplitLoads.clear();
   3545 
   3546     IntegerType *Ty = cast<IntegerType>(LI->getType());
   3547     uint64_t LoadSize = Ty->getBitWidth() / 8;
   3548     assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
   3549 
   3550     auto &Offsets = SplitOffsetsMap[LI];
   3551     assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
   3552            "Slice size should always match load size exactly!");
   3553     uint64_t BaseOffset = Offsets.S->beginOffset();
   3554     assert(BaseOffset + LoadSize > BaseOffset &&
   3555            "Cannot represent alloca access size using 64-bit integers!");
   3556 
   3557     Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
   3558     IRB.SetInsertPoint(LI);
   3559 
   3560     DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");
   3561 
   3562     uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
   3563     int Idx = 0, Size = Offsets.Splits.size();
   3564     for (;;) {
   3565       auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
   3566       auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
   3567       LoadInst *PLoad = IRB.CreateAlignedLoad(
   3568           getAdjustedPtr(IRB, DL, BasePtr,
   3569                          APInt(DL.getPointerSizeInBits(), PartOffset),
   3570                          PartPtrTy, BasePtr->getName() + "."),
   3571           getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
   3572           LI->getName());
   3573 
   3574       // Append this load onto the list of split loads so we can find it later
   3575       // to rewrite the stores.
   3576       SplitLoads.push_back(PLoad);
   3577 
   3578       // Now build a new slice for the alloca.
   3579       NewSlices.push_back(
   3580           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
   3581                 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
   3582                 /*IsSplittable*/ false));
   3583       DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
   3584                    << ", " << NewSlices.back().endOffset() << "): " << *PLoad
   3585                    << "\n");
   3586 
   3587       // See if we've handled all the splits.
   3588       if (Idx >= Size)
   3589         break;
   3590 
   3591       // Setup the next partition.
   3592       PartOffset = Offsets.Splits[Idx];
   3593       ++Idx;
   3594       PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
   3595     }
   3596 
   3597     // Now that we have the split loads, do the slow walk over all uses of the
   3598     // load and rewrite them as split stores, or save the split loads to use
   3599     // below if the store is going to be split there anyway.
   3600     bool DeferredStores = false;
   3601     for (User *LU : LI->users()) {
   3602       StoreInst *SI = cast<StoreInst>(LU);
   3603       if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
   3604         DeferredStores = true;
   3605         DEBUG(dbgs() << "    Deferred splitting of store: " << *SI << "\n");
   3606         continue;
   3607       }
   3608 
   3609       Value *StoreBasePtr = SI->getPointerOperand();
   3610       IRB.SetInsertPoint(SI);
   3611 
   3612       DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");
   3613 
   3614       for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
   3615         LoadInst *PLoad = SplitLoads[Idx];
   3616         uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
   3617         auto *PartPtrTy =
   3618             PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
   3619 
   3620         StoreInst *PStore = IRB.CreateAlignedStore(
   3621             PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
   3622                                   APInt(DL.getPointerSizeInBits(), PartOffset),
   3623                                   PartPtrTy, StoreBasePtr->getName() + "."),
   3624             getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
   3625         (void)PStore;
   3626         DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
   3627       }
   3628 
   3629       // We want to immediately iterate on any allocas impacted by splitting
   3630       // this store, and we have to track any promotable alloca (indicated by
   3631       // a direct store) as needing to be resplit because it is no longer
   3632       // promotable.
   3633       if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
   3634         ResplitPromotableAllocas.insert(OtherAI);
   3635         Worklist.insert(OtherAI);
   3636       } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
   3637                      StoreBasePtr->stripInBoundsOffsets())) {
   3638         Worklist.insert(OtherAI);
   3639       }
   3640 
   3641       // Mark the original store as dead.
   3642       DeadInsts.insert(SI);
   3643     }
   3644 
   3645     // Save the split loads if there are deferred stores among the users.
   3646     if (DeferredStores)
   3647       SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
   3648 
   3649     // Mark the original load as dead and kill the original slice.
   3650     DeadInsts.insert(LI);
   3651     Offsets.S->kill();
   3652   }
   3653 
   3654   // Second, we rewrite all of the split stores. At this point, we know that
   3655   // all loads from this alloca have been split already. For stores of such
   3656   // loads, we can simply look up the pre-existing split loads. For stores of
   3657   // other loads, we split those loads first and then write split stores of
   3658   // them.
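          // Continuing the hypothetical i64-split-at-byte-4 sketch from above
          // (names invented), a store of that loaded value,
          //
          //   store i64 %ld, i64* %q
          //
          // is rewritten into part stores of the corresponding part loads, roughly:
          //
          //   store i32 %ld.0, i32* %q.0
          //   store i32 %ld.4, i32* %q.4
          //
          // where %q.0 and %q.4 are adjusted pointers into %q at byte offsets 0 and
          // 4, and each part store contributes its own new unsplittable slice.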
   3659   for (StoreInst *SI : Stores) {
   3660     auto *LI = cast<LoadInst>(SI->getValueOperand());
   3661     IntegerType *Ty = cast<IntegerType>(LI->getType());
   3662     uint64_t StoreSize = Ty->getBitWidth() / 8;
   3663     assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
   3664 
   3665     auto &Offsets = SplitOffsetsMap[SI];
   3666     assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
   3667            "Slice size should always match load size exactly!");
   3668     uint64_t BaseOffset = Offsets.S->beginOffset();
   3669     assert(BaseOffset + StoreSize > BaseOffset &&
   3670            "Cannot represent alloca access size using 64-bit integers!");
   3671 
   3672     Value *LoadBasePtr = LI->getPointerOperand();
   3673     Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
   3674 
   3675     DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");
   3676 
   3677     // Check whether we have an already split load.
   3678     auto SplitLoadsMapI = SplitLoadsMap.find(LI);
   3679     std::vector<LoadInst *> *SplitLoads = nullptr;
   3680     if (SplitLoadsMapI != SplitLoadsMap.end()) {
   3681       SplitLoads = &SplitLoadsMapI->second;
   3682       assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
   3683              "Too few split loads for the number of splits in the store!");
   3684     } else {
   3685       DEBUG(dbgs() << "          of load: " << *LI << "\n");
   3686     }
   3687 
   3688     uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
   3689     int Idx = 0, Size = Offsets.Splits.size();
   3690     for (;;) {
   3691       auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
   3692       auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
   3693 
   3694       // Either lookup a split load or create one.
   3695       LoadInst *PLoad;
   3696       if (SplitLoads) {
   3697         PLoad = (*SplitLoads)[Idx];
   3698       } else {
   3699         IRB.SetInsertPoint(LI);
   3700         PLoad = IRB.CreateAlignedLoad(
   3701             getAdjustedPtr(IRB, DL, LoadBasePtr,
   3702                            APInt(DL.getPointerSizeInBits(), PartOffset),
   3703                            PartPtrTy, LoadBasePtr->getName() + "."),
   3704             getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
   3705             LI->getName());
   3706       }
   3707 
   3708       // And store this partition.
   3709       IRB.SetInsertPoint(SI);
   3710       StoreInst *PStore = IRB.CreateAlignedStore(
   3711           PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
   3712                                 APInt(DL.getPointerSizeInBits(), PartOffset),
   3713                                 PartPtrTy, StoreBasePtr->getName() + "."),
   3714           getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
   3715 
   3716       // Now build a new slice for the alloca.
   3717       NewSlices.push_back(
   3718           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
   3719                 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
   3720                 /*IsSplittable*/ false));
   3721       DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
   3722                    << ", " << NewSlices.back().endOffset() << "): " << *PStore
   3723                    << "\n");
   3724       if (!SplitLoads) {
   3725         DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");
   3726       }
   3727 
   3728       // See if we've finished all the splits.
   3729       if (Idx >= Size)
   3730         break;
   3731 
   3732       // Setup the next partition.
   3733       PartOffset = Offsets.Splits[Idx];
   3734       ++Idx;
   3735       PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
   3736     }
   3737 
   3738     // We want to immediately iterate on any allocas impacted by splitting
   3739     // this load, which is only relevant if it isn't a load of this alloca and
   3740     // thus we didn't already split the loads above. We also have to keep track
   3741     // of any promotable allocas we split loads on as they can no longer be
   3742     // promoted.
   3743     if (!SplitLoads) {
   3744       if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
   3745         assert(OtherAI != &AI && "We can't re-split our own alloca!");
   3746         ResplitPromotableAllocas.insert(OtherAI);
   3747         Worklist.insert(OtherAI);
   3748       } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
   3749                      LoadBasePtr->stripInBoundsOffsets())) {
   3750         assert(OtherAI != &AI && "We can't re-split our own alloca!");
   3751         Worklist.insert(OtherAI);
   3752       }
   3753     }
   3754 
   3755     // Mark the original store as dead now that we've split it up and kill its
   3756     // slice. Note that we leave the original load in place unless this store
   3757     // was its only use. It may in turn be split up if it is an alloca load
   3758     // for some other alloca, but it may be a normal load. This may introduce
   3759     // redundant loads, but where those can be merged the rest of the optimizer
   3760     // should handle the merging, and this uncovers SSA splits which is more
   3761     // important. In practice, the original loads will almost always be fully
   3762     // split and removed eventually, and the splits will be merged by any
   3763     // trivial CSE, including instcombine.
   3764     if (LI->hasOneUse()) {
   3765       assert(*LI->user_begin() == SI && "Single use isn't this store!");
   3766       DeadInsts.insert(LI);
   3767     }
   3768     DeadInsts.insert(SI);
   3769     Offsets.S->kill();
   3770   }
   3771 
   3772   // Remove the killed slices that have been pre-split.
   3773   AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
   3774     return S.isDead();
   3775   }), AS.end());
   3776 
   3777   // Insert our new slices. This will sort and merge them into the sorted
   3778   // sequence.
   3779   AS.insert(NewSlices);
   3780 
   3781   DEBUG(dbgs() << "  Pre-split slices:\n");
   3782 #ifndef NDEBUG
   3783   for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
   3784     DEBUG(AS.print(dbgs(), I, "    "));
   3785 #endif
   3786 
   3787   // Finally, don't try to promote any allocas that now require re-splitting.
   3788   // They have already been added to the worklist above.
   3789   PromotableAllocas.erase(
   3790       std::remove_if(
   3791           PromotableAllocas.begin(), PromotableAllocas.end(),
   3792           [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
   3793       PromotableAllocas.end());
   3794 
   3795   return true;
   3796 }
   3797 
   3798 /// \brief Rewrite an alloca partition's users.
   3799 ///
   3800 /// This routine drives both of the rewriting goals of the SROA pass. It tries
   3801 /// to rewrite uses of an alloca partition to be conducive for SSA value
   3802 /// promotion. If the partition needs a new, more refined alloca, this will
   3803 /// build that new alloca, preserving as much type information as possible, and
   3804 /// rewrite the uses of the old alloca to point at the new one and have the
   3805 /// appropriate new offsets. It also evaluates how successful the rewrite was
   3806 /// at enabling promotion and, if it was successful, queues the alloca to be
   3807 /// promoted.
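        ///
        /// For illustration only (hypothetical IR, names invented): given
        ///
        ///   %x = alloca { i32, i64 }
        ///
        /// a partition covering only the i64 field (bytes [8, 16) under a typical
        /// 64-bit DataLayout) would usually be rewritten to a fresh `alloca i64`
        /// named along the lines of `%x.sroa.1`, with the partition's loads and
        /// stores re-pointed at it.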
   3808 AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   3809                                    Partition &P) {
   3810   // Try to compute a friendly type for this partition of the alloca. This
   3811   // won't always succeed, in which case we fall back to a legal integer type
   3812   // or an i8 array of an appropriate size.
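          // For example (hypothetically, and depending on the target's DataLayout):
          // a 4-byte partition whose uses share no common type falls back to i32
          // where a 32-bit integer is legal, and to [4 x i8] otherwise.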
   3813   Type *SliceTy = nullptr;
   3814   const DataLayout &DL = AI.getModule()->getDataLayout();
   3815   if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
   3816     if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
   3817       SliceTy = CommonUseTy;
   3818   if (!SliceTy)
   3819     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
   3820                                                  P.beginOffset(), P.size()))
   3821       SliceTy = TypePartitionTy;
   3822   if ((!SliceTy || (SliceTy->isArrayTy() &&
   3823                     SliceTy->getArrayElementType()->isIntegerTy())) &&
   3824       DL.isLegalInteger(P.size() * 8))
   3825     SliceTy = Type::getIntNTy(*C, P.size() * 8);
   3826   if (!SliceTy)
   3827     SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
   3828   assert(DL.getTypeAllocSize(SliceTy) >= P.size());
   3829 
   3830   bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
   3831 
   3832   VectorType *VecTy =
   3833       IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
   3834   if (VecTy)
   3835     SliceTy = VecTy;
   3836 
   3837   // Check for the case where we're going to rewrite to a new alloca of the
   3838   // exact same type as the original, and with the same access offsets. In that
   3839   // case, re-use the existing alloca, but still run through the rewriter to
   3840   // perform phi and select speculation.
   3841   AllocaInst *NewAI;
   3842   if (SliceTy == AI.getAllocatedType()) {
   3843     assert(P.beginOffset() == 0 &&
   3844            "Non-zero begin offset but same alloca type");
   3845     NewAI = &AI;
   3846     // FIXME: We should be able to bail at this point with "nothing changed".
   3847     // FIXME: We might want to defer PHI speculation until after here.
   3848     // FIXME: return nullptr;
   3849   } else {
   3850     unsigned Alignment = AI.getAlignment();
   3851     if (!Alignment) {
   3852       // The minimum alignment which users can rely on when the explicit
   3853       // alignment is omitted or zero is that required by the ABI for this
   3854       // type.
   3855       Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
   3856     }
   3857     Alignment = MinAlign(Alignment, P.beginOffset());
   3858     // If we will get at least this much alignment from the type alone, leave
   3859     // the alloca's alignment unconstrained.
   3860     if (Alignment <= DL.getABITypeAlignment(SliceTy))
   3861       Alignment = 0;
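            // Worked example with made-up numbers: an alloca carrying an explicit
            // "align 8" and a partition starting at byte offset 4 gives
            // MinAlign(8, 4) == 4; if the slice type's ABI alignment is already at
            // least 4, the check above resets Alignment to 0 so the new alloca
            // simply uses its type's natural alignment.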
   3862     NewAI = new AllocaInst(
   3863         SliceTy, nullptr, Alignment,
   3864         AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
   3865     ++NumNewAllocas;
   3866   }
   3867 
   3868   DEBUG(dbgs() << "Rewriting alloca partition "
   3869                << "[" << P.beginOffset() << "," << P.endOffset()
   3870                << ") to: " << *NewAI << "\n");
   3871 
   3872   // Track the high watermark on the worklist as it is only relevant for
   3873   // promoted allocas. We will reset it to this point if the alloca is not in
   3874   // fact scheduled for promotion.
   3875   unsigned PPWOldSize = PostPromotionWorklist.size();
   3876   unsigned NumUses = 0;
   3877   SmallPtrSet<PHINode *, 8> PHIUsers;
   3878   SmallPtrSet<SelectInst *, 8> SelectUsers;
   3879 
   3880   AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
   3881                                P.endOffset(), IsIntegerPromotable, VecTy,
   3882                                PHIUsers, SelectUsers);
   3883   bool Promotable = true;
   3884   for (Slice *S : P.splitSliceTails()) {
   3885     Promotable &= Rewriter.visit(S);
   3886     ++NumUses;
   3887   }
   3888   for (Slice &S : P) {
   3889     Promotable &= Rewriter.visit(&S);
   3890     ++NumUses;
   3891   }
   3892 
   3893   NumAllocaPartitionUses += NumUses;
   3894   MaxUsesPerAllocaPartition =
   3895       std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
   3896 
   3897   // Now that we've processed all the slices in the new partition, check if any
   3898   // PHIs or Selects would block promotion.
   3899   for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
   3900                                             E = PHIUsers.end();
   3901        I != E; ++I)
   3902     if (!isSafePHIToSpeculate(**I)) {
   3903       Promotable = false;
   3904       PHIUsers.clear();
   3905       SelectUsers.clear();
   3906       break;
   3907     }
   3908   for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
   3909                                                E = SelectUsers.end();
   3910        I != E; ++I)
   3911     if (!isSafeSelectToSpeculate(**I)) {
   3912       Promotable = false;
   3913       PHIUsers.clear();
   3914       SelectUsers.clear();
   3915       break;
   3916     }
   3917 
   3918   if (Promotable) {
   3919     if (PHIUsers.empty() && SelectUsers.empty()) {
   3920       // Promote the alloca.
   3921       PromotableAllocas.push_back(NewAI);
   3922     } else {
   3923       // If we have either PHIs or Selects to speculate, add them to those
   3924       // worklists and re-queue the new alloca so that we promote it on the
   3925       // next iteration.
   3926       for (PHINode *PHIUser : PHIUsers)
   3927         SpeculatablePHIs.insert(PHIUser);
   3928       for (SelectInst *SelectUser : SelectUsers)
   3929         SpeculatableSelects.insert(SelectUser);
   3930       Worklist.insert(NewAI);
   3931     }
   3932   } else {
   3933     // Drop any post-promotion work items if promotion didn't happen.
   3934     while (PostPromotionWorklist.size() > PPWOldSize)
   3935       PostPromotionWorklist.pop_back();
   3936 
   3937     // We couldn't promote and we didn't create a new partition, so nothing
   3938     // happened.
   3939     if (NewAI == &AI)
   3940       return nullptr;
   3941 
   3942     // If we can't promote the alloca, iterate on it to check for new
   3943     // refinements exposed by splitting the current alloca. Don't iterate on an
   3944     // alloca which didn't actually change and didn't get promoted.
   3945     Worklist.insert(NewAI);
   3946   }
   3947 
   3948   return NewAI;
   3949 }
   3950 
   3951 /// \brief Walks the slices of an alloca and forms partitions based on them,
   3952 /// rewriting each of their uses.
   3953 bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   3954   if (AS.begin() == AS.end())
   3955     return false;
   3956 
   3957   unsigned NumPartitions = 0;
   3958   bool Changed = false;
   3959   const DataLayout &DL = AI.getModule()->getDataLayout();
   3960 
   3961   // First try to pre-split loads and stores.
   3962   Changed |= presplitLoadsAndStores(AI, AS);
   3963 
   3964   // Now that we have identified any pre-splitting opportunities, mark any
   3965   // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
   3966   // to split these during pre-splitting, we want to force them to be
   3967   // rewritten into a partition.
   3968   bool IsSorted = true;
   3969   for (Slice &S : AS) {
   3970     if (!S.isSplittable())
   3971       continue;
   3972     // FIXME: We currently leave whole-alloca splittable loads and stores. These
   3973     // used to be the only splittable loads and stores, and we need to be
   3974     // confident that the above handling of splittable loads and stores is
   3975     // completely sufficient before we forcibly disable the remaining handling.
   3976     if (S.beginOffset() == 0 &&
   3977         S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
   3978       continue;
   3979     if (isa<LoadInst>(S.getUse()->getUser()) ||
   3980         isa<StoreInst>(S.getUse()->getUser())) {
   3981       S.makeUnsplittable();
   3982       IsSorted = false;
   3983     }
   3984   }
   3985   if (!IsSorted)
   3986     std::sort(AS.begin(), AS.end());
   3987 
   3988   /// \brief Describes the allocas introduced by rewritePartition
   3989   /// in order to migrate the debug info.
   3990   struct Piece {
   3991     AllocaInst *Alloca;
   3992     uint64_t Offset;
   3993     uint64_t Size;
   3994     Piece(AllocaInst *AI, uint64_t O, uint64_t S)
   3995       : Alloca(AI), Offset(O), Size(S) {}
   3996   };
   3997   SmallVector<Piece, 4> Pieces;
   3998 
   3999   // Rewrite each partition.
   4000   for (auto &P : AS.partitions()) {
   4001     if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
   4002       Changed = true;
   4003       if (NewAI != &AI) {
   4004         uint64_t SizeOfByte = 8;
   4005         uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
   4006         // Don't include any padding.
   4007         uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
   4008         Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
   4009       }
   4010     }
   4011     ++NumPartitions;
   4012   }
   4013 
   4014   NumAllocaPartitions += NumPartitions;
   4015   MaxPartitionsPerAlloca =
   4016       std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
   4017 
   4018   // Migrate debug information from the old alloca to the new alloca(s)
   4019   // and the individual partitions.
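          // A hypothetical sketch of this migration (IR and metadata names
          // invented): a declare of an i64 variable backed by the original alloca,
          //
          //   call void @llvm.dbg.declare(metadata i64* %x, metadata !var,
          //                               metadata !DIExpression())
          //
          // would, after %x is split into two i32 allocas, be replaced by two
          // declares whose expressions carry DW_OP_bit_piece(0, 32) and
          // DW_OP_bit_piece(32, 32) to record which bits of the variable each new
          // alloca covers.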
   4020   if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
   4021     auto *Var = DbgDecl->getVariable();
   4022     auto *Expr = DbgDecl->getExpression();
   4023     DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
   4024     uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
   4025     for (auto Piece : Pieces) {
   4026       // Create a piece expression describing the new partition or reuse AI's
   4027       // expression if there is only one partition.
   4028       auto *PieceExpr = Expr;
   4029       if (Piece.Size < AllocaSize || Expr->isBitPiece()) {
   4030         // If this alloca is already a scalar replacement of a larger aggregate,
   4031         // Piece.Offset describes the offset inside the scalar.
   4032         uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
   4033         uint64_t Start = Offset + Piece.Offset;
   4034         uint64_t Size = Piece.Size;
   4035         if (Expr->isBitPiece()) {
   4036           uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
   4037           if (Start >= AbsEnd)
   4038             // No need to describe a SROAed padding.
   4039             continue;
   4040           Size = std::min(Size, AbsEnd - Start);
   4041         }
   4042         PieceExpr = DIB.createBitPieceExpression(Start, Size);
   4043       } else {
   4044         assert(Pieces.size() == 1 &&
   4045                "partition is as large as original alloca");
   4046       }
   4047 
   4048       // Remove any existing dbg.declare intrinsic describing the same alloca.
   4049       if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
   4050         OldDDI->eraseFromParent();
   4051 
   4052       DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
   4053                         &AI);
   4054     }
   4055   }
   4056   return Changed;
   4057 }
   4058 
   4059 /// \brief Clobber a use with undef, deleting the used value if it becomes dead.
   4060 void SROA::clobberUse(Use &U) {
   4061   Value *OldV = U;
   4062   // Replace the use with an undef value.
   4063   U = UndefValue::get(OldV->getType());
   4064 
   4065   // Check whether this made an instruction dead. We have to garbage collect
   4066   // all the dead instructions to ensure the uses of any alloca end up being
   4067   // minimal.
   4068   if (Instruction *OldI = dyn_cast<Instruction>(OldV))
   4069     if (isInstructionTriviallyDead(OldI)) {
   4070       DeadInsts.insert(OldI);
   4071     }
   4072 }
   4073 
   4074 /// \brief Analyze an alloca for SROA.
   4075 ///
   4076 /// This analyzes the alloca to ensure we can reason about it, builds
   4077 /// the slices of the alloca, and then hands it off to be split and
   4078 /// rewritten as needed.
   4079 bool SROA::runOnAlloca(AllocaInst &AI) {
   4080   DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
   4081   ++NumAllocasAnalyzed;
   4082 
   4083   // Special case dead allocas, as they're trivial.
   4084   if (AI.use_empty()) {
   4085     AI.eraseFromParent();
   4086     return true;
   4087   }
   4088   const DataLayout &DL = AI.getModule()->getDataLayout();
   4089 
   4090   // Skip alloca forms that this analysis can't handle.
   4091   if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
   4092       DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
   4093     return false;
   4094 
   4095   bool Changed = false;
   4096 
   4097   // First, split any FCA loads and stores touching this alloca to promote
   4098   // better splitting and promotion opportunities.
   4099   AggLoadStoreRewriter AggRewriter;
   4100   Changed |= AggRewriter.rewrite(AI);
   4101 
   4102   // Build the slices using a recursive instruction-visiting builder.
   4103   AllocaSlices AS(DL, AI);
   4104   DEBUG(AS.print(dbgs()));
   4105   if (AS.isEscaped())
   4106     return Changed;
   4107 
   4108   // Delete all the dead users of this alloca before splitting and rewriting it.
   4109   for (Instruction *DeadUser : AS.getDeadUsers()) {
   4110     // Free up everything used by this instruction.
   4111     for (Use &DeadOp : DeadUser->operands())
   4112       clobberUse(DeadOp);
   4113 
   4114     // Now replace the uses of this instruction.
   4115     DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
   4116 
   4117     // And mark it for deletion.
   4118     DeadInsts.insert(DeadUser);
   4119     Changed = true;
   4120   }
   4121   for (Use *DeadOp : AS.getDeadOperands()) {
   4122     clobberUse(*DeadOp);
   4123     Changed = true;
   4124   }
   4125 
   4126   // No slices to split. Leave the dead alloca for a later pass to clean up.
   4127   if (AS.begin() == AS.end())
   4128     return Changed;
   4129 
   4130   Changed |= splitAlloca(AI, AS);
   4131 
   4132   DEBUG(dbgs() << "  Speculating PHIs\n");
   4133   while (!SpeculatablePHIs.empty())
   4134     speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
   4135 
   4136   DEBUG(dbgs() << "  Speculating Selects\n");
   4137   while (!SpeculatableSelects.empty())
   4138     speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
   4139 
   4140   return Changed;
   4141 }
   4142 
   4143 /// \brief Delete the dead instructions accumulated in this run.
   4144 ///
   4145 /// Recursively deletes the dead instructions we've accumulated. This is done
   4146 /// at the very end to maximize locality of the recursive delete and to
   4147 /// minimize the problems of invalidated instruction pointers as such pointers
   4148 /// are used heavily in the intermediate stages of the algorithm.
   4149 ///
   4150 /// We also record the alloca instructions deleted here so that they aren't
   4151 /// subsequently handed to mem2reg to promote.
   4152 void SROA::deleteDeadInstructions(
   4153     SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
   4154   while (!DeadInsts.empty()) {
   4155     Instruction *I = DeadInsts.pop_back_val();
   4156     DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
   4157 
   4158     I->replaceAllUsesWith(UndefValue::get(I->getType()));
   4159 
   4160     for (Use &Operand : I->operands())
   4161       if (Instruction *U = dyn_cast<Instruction>(Operand)) {
   4162         // Zero out the operand and see if it becomes trivially dead.
   4163         Operand = nullptr;
   4164         if (isInstructionTriviallyDead(U))
   4165           DeadInsts.insert(U);
   4166       }
   4167 
   4168     if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
   4169       DeletedAllocas.insert(AI);
   4170       if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
   4171         DbgDecl->eraseFromParent();
   4172     }
   4173 
   4174     ++NumDeleted;
   4175     I->eraseFromParent();
   4176   }
   4177 }
   4178 
   4179 /// \brief Promote the allocas, using the best available technique.
   4180 ///
   4181 /// This attempts to promote whatever allocas have been identified as viable in
   4182 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
   4183 /// This function returns whether any promotion occurred.
   4184 bool SROA::promoteAllocas(Function &F) {
   4185   if (PromotableAllocas.empty())
   4186     return false;
   4187 
   4188   NumPromoted += PromotableAllocas.size();
   4189 
   4190   DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
   4191   PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
   4192   PromotableAllocas.clear();
   4193   return true;
   4194 }
   4195 
   4196 PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
   4197                                 AssumptionCache &RunAC) {
   4198   DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
   4199   C = &F.getContext();
   4200   DT = &RunDT;
   4201   AC = &RunAC;
   4202 
   4203   BasicBlock &EntryBB = F.getEntryBlock();
   4204   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
   4205        I != E; ++I) {
   4206     if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
   4207       Worklist.insert(AI);
   4208   }
   4209 
   4210   bool Changed = false;
   4211   // A set of deleted alloca instruction pointers which should be removed from
   4212   // the list of promotable allocas.
   4213   SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
   4214 
   4215   do {
   4216     while (!Worklist.empty()) {
   4217       Changed |= runOnAlloca(*Worklist.pop_back_val());
   4218       deleteDeadInstructions(DeletedAllocas);
   4219 
   4220       // Remove the deleted allocas from various lists so that we don't try to
   4221       // continue processing them.
   4222       if (!DeletedAllocas.empty()) {
   4223         auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
   4224         Worklist.remove_if(IsInSet);
   4225         PostPromotionWorklist.remove_if(IsInSet);
   4226         PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
   4227                                                PromotableAllocas.end(),
   4228                                                IsInSet),
   4229                                 PromotableAllocas.end());
   4230         DeletedAllocas.clear();
   4231       }
   4232     }
   4233 
   4234     Changed |= promoteAllocas(F);
   4235 
   4236     Worklist = PostPromotionWorklist;
   4237     PostPromotionWorklist.clear();
   4238   } while (!Worklist.empty());
   4239 
   4240   if (!Changed)
   4241     return PreservedAnalyses::all();
   4242 
   4243   // FIXME: Even when promoting allocas we should preserve some abstract set of
   4244   // CFG-specific analyses.
   4245   PreservedAnalyses PA;
   4246   PA.preserve<GlobalsAA>();
   4247   return PA;
   4248 }
   4249 
   4250 PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> &AM) {
   4251   return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
   4252                  AM.getResult<AssumptionAnalysis>(F));
   4253 }
   4254 
   4255 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
   4256 ///
   4257 /// This is in the llvm namespace purely to allow it to be a friend of the \c
   4258 /// SROA pass.
   4259 class llvm::sroa::SROALegacyPass : public FunctionPass {
   4260   /// The SROA implementation.
   4261   SROA Impl;
   4262 
   4263 public:
   4264   SROALegacyPass() : FunctionPass(ID) {
   4265     initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
   4266   }
   4267   bool runOnFunction(Function &F) override {
   4268     if (skipFunction(F))
   4269       return false;
   4270 
   4271     auto PA = Impl.runImpl(
   4272         F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
   4273         getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
   4274     return !PA.areAllPreserved();
   4275   }
   4276   void getAnalysisUsage(AnalysisUsage &AU) const override {
   4277     AU.addRequired<AssumptionCacheTracker>();
   4278     AU.addRequired<DominatorTreeWrapperPass>();
   4279     AU.addPreserved<GlobalsAAWrapperPass>();
   4280     AU.setPreservesCFG();
   4281   }
   4282 
   4283   const char *getPassName() const override { return "SROA"; }
   4284   static char ID;
   4285 };
   4286 
   4287 char SROALegacyPass::ID = 0;
   4288 
   4289 FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
   4290 
   4291 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
   4292                       "Scalar Replacement Of Aggregates", false, false)
   4293 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
   4294 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
   4295 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
   4296                     false, false)
   4297