      1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This pass merges loads/stores to/from sequential memory addresses into vector
     11 // loads/stores.  Although there's nothing GPU-specific in here, this pass is
     12 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
     13 //
     14 // (For simplicity below we talk about loads only, but everything also applies
     15 // to stores.)
     16 //
     17 // This pass is intended to be run late in the pipeline, after other
     18 // vectorization opportunities have been exploited.  So the assumption here is
     19 // that immediately following our new vector load we'll need to extract out the
     20 // individual elements of the load, so we can operate on them individually.
     21 //
     22 // On CPUs this transformation is usually not beneficial, because extracting the
     23 // elements of a vector register is expensive on most architectures.  It's
     24 // usually better just to load each element individually into its own scalar
     25 // register.
     26 //
     27 // However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
     28 // "vector load" loads directly into a series of scalar registers.  In effect,
     29 // extracting the elements of the vector is free.  It's therefore always
     30 // beneficial to vectorize a sequence of loads on these architectures.
     31 //
     32 // Vectorizing (perhaps a better name might be "coalescing") loads can have
     33 // large performance impacts on GPU kernels, and opportunities for vectorizing
     34 // are common in GPU code.  This pass tries very hard to find such
     35 // opportunities; its runtime is quadratic in the number of loads in a BB.
     36 //
     37 // Some CPU architectures, such as ARM, have instructions that load into
     38 // multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
     39 // could use this pass (with some modifications), but currently it implements
     40 // its own pass to do something similar to what we do here.
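        //
        // For illustration only (a sketch; the value names here are invented for this
        // comment rather than taken from any particular test), four consecutive i32
        // loads such as
        //
        //   %l0 = load i32, i32* %p0, align 16
        //   %l1 = load i32, i32* %p1, align 4    ; %p1 = %p0 plus 4 bytes
        //   %l2 = load i32, i32* %p2, align 8    ; %p2 = %p0 plus 8 bytes
        //   %l3 = load i32, i32* %p3, align 4    ; %p3 = %p0 plus 12 bytes
        //
        // would be rewritten to roughly
        //
        //   %vp = bitcast i32* %p0 to <4 x i32>*
        //   %v  = load <4 x i32>, <4 x i32>* %vp, align 16
        //   %l0 = extractelement <4 x i32> %v, i32 0
        //   %l1 = extractelement <4 x i32> %v, i32 1
        //   %l2 = extractelement <4 x i32> %v, i32 2
        //   %l3 = extractelement <4 x i32> %v, i32 3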
     41 
     42 #include "llvm/ADT/APInt.h"
     43 #include "llvm/ADT/ArrayRef.h"
     44 #include "llvm/ADT/MapVector.h"
     45 #include "llvm/ADT/PostOrderIterator.h"
     46 #include "llvm/ADT/STLExtras.h"
     47 #include "llvm/ADT/SmallPtrSet.h"
     48 #include "llvm/ADT/SmallVector.h"
     49 #include "llvm/ADT/Statistic.h"
     50 #include "llvm/ADT/iterator_range.h"
     51 #include "llvm/Analysis/AliasAnalysis.h"
     52 #include "llvm/Analysis/MemoryLocation.h"
     53 #include "llvm/Analysis/OrderedBasicBlock.h"
     54 #include "llvm/Analysis/ScalarEvolution.h"
     55 #include "llvm/Analysis/TargetTransformInfo.h"
     56 #include "llvm/Transforms/Utils/Local.h"
     57 #include "llvm/Analysis/ValueTracking.h"
     58 #include "llvm/Analysis/VectorUtils.h"
     59 #include "llvm/IR/Attributes.h"
     60 #include "llvm/IR/BasicBlock.h"
     61 #include "llvm/IR/Constants.h"
     62 #include "llvm/IR/DataLayout.h"
     63 #include "llvm/IR/DerivedTypes.h"
     64 #include "llvm/IR/Dominators.h"
     65 #include "llvm/IR/Function.h"
     66 #include "llvm/IR/IRBuilder.h"
     67 #include "llvm/IR/InstrTypes.h"
     68 #include "llvm/IR/Instruction.h"
     69 #include "llvm/IR/Instructions.h"
     70 #include "llvm/IR/IntrinsicInst.h"
     71 #include "llvm/IR/Module.h"
     72 #include "llvm/IR/Type.h"
     73 #include "llvm/IR/User.h"
     74 #include "llvm/IR/Value.h"
     75 #include "llvm/Pass.h"
     76 #include "llvm/Support/Casting.h"
     77 #include "llvm/Support/Debug.h"
     78 #include "llvm/Support/KnownBits.h"
     79 #include "llvm/Support/MathExtras.h"
     80 #include "llvm/Support/raw_ostream.h"
     81 #include "llvm/Transforms/Vectorize.h"
     82 #include <algorithm>
     83 #include <cassert>
     84 #include <cstdlib>
     85 #include <tuple>
     86 #include <utility>
     87 
     88 using namespace llvm;
     89 
     90 #define DEBUG_TYPE "load-store-vectorizer"
     91 
     92 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
     93 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
     94 
     95 // FIXME: Assuming stack alignment of 4 is always good enough
     96 static const unsigned StackAdjustedAlignment = 4;
     97 
     98 namespace {
     99 
    100 /// ChainID is an arbitrary token that is allowed to be different only for the
    101 /// accesses that are guaranteed to be considered non-consecutive by
    102 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
    103 /// together and reducing the number of instructions the main search operates on
    104 /// at a time, i.e. this is to reduce compile time and nothing else as the main
    105 /// search has O(n^2) time complexity. The underlying type of ChainID should not
    106 /// be relied upon.
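        ///
        /// For illustration (a rough sketch, not an exhaustive rule): accesses whose
        /// pointers are GEPs off the same underlying object typically get that object
        /// as their ChainID and therefore share a list, while accesses based on
        /// unrelated objects land in different lists and are never compared against
        /// each other.  See getChainID() below.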
    107 using ChainID = const Value *;
    108 using InstrList = SmallVector<Instruction *, 8>;
    109 using InstrListMap = MapVector<ChainID, InstrList>;
    110 
    111 class Vectorizer {
    112   Function &F;
    113   AliasAnalysis &AA;
    114   DominatorTree &DT;
    115   ScalarEvolution &SE;
    116   TargetTransformInfo &TTI;
    117   const DataLayout &DL;
    118   IRBuilder<> Builder;
    119 
    120 public:
    121   Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
    122              ScalarEvolution &SE, TargetTransformInfo &TTI)
    123       : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
    124         DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
    125 
    126   bool run();
    127 
    128 private:
    129   unsigned getPointerAddressSpace(Value *I);
    130 
    131   unsigned getAlignment(LoadInst *LI) const {
    132     unsigned Align = LI->getAlignment();
    133     if (Align != 0)
    134       return Align;
    135 
    136     return DL.getABITypeAlignment(LI->getType());
    137   }
    138 
    139   unsigned getAlignment(StoreInst *SI) const {
    140     unsigned Align = SI->getAlignment();
    141     if (Align != 0)
    142       return Align;
    143 
    144     return DL.getABITypeAlignment(SI->getValueOperand()->getType());
    145   }
    146 
    147   static const unsigned MaxDepth = 3;
    148 
    149   bool isConsecutiveAccess(Value *A, Value *B);
    150   bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
    151                               unsigned Depth = 0) const;
    152   bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
    153                                    unsigned Depth) const;
    154   bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
    155                           unsigned Depth) const;
    156 
    157   /// After vectorization, reorder the instructions that I depends on
    158   /// (the instructions defining its operands), to ensure they dominate I.
    159   void reorder(Instruction *I);
    160 
    161   /// Returns iterators to the first and one past the last instruction in Chain.
    162   std::pair<BasicBlock::iterator, BasicBlock::iterator>
    163   getBoundaryInstrs(ArrayRef<Instruction *> Chain);
    164 
    165   /// Erases the original instructions after vectorizing.
    166   void eraseInstructions(ArrayRef<Instruction *> Chain);
    167 
    168   /// "Legalize" the vector type that would be produced by combining \p
    169   /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
    170   /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
    171   /// expected to have more than 4 elements.
    172   std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
    173   splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
    174 
    175   /// Finds the largest prefix of Chain that's vectorizable, checking for
    176   /// intervening instructions which may affect the memory accessed by the
    177   /// instructions within Chain.
    178   ///
    179   /// The elements of \p Chain must be all loads or all stores and must be in
    180   /// address order.
    181   ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
    182 
    183   /// Collects load and store instructions to vectorize.
    184   std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
    185 
    186   /// Processes the collected instructions in \p Map. The values of \p Map
    187   /// must be either all loads or all stores.
    188   bool vectorizeChains(InstrListMap &Map);
    189 
    190   /// Finds loads/stores to consecutive memory addresses and vectorizes them.
    191   bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
    192 
    193   /// Vectorizes the load instructions in Chain.
    194   bool
    195   vectorizeLoadChain(ArrayRef<Instruction *> Chain,
    196                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
    197 
    198   /// Vectorizes the store instructions in Chain.
    199   bool
    200   vectorizeStoreChain(ArrayRef<Instruction *> Chain,
    201                       SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
    202 
    203   /// Checks whether this load/store access is misaligned.
    204   bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
    205                           unsigned Alignment);
    206 };
    207 
    208 class LoadStoreVectorizer : public FunctionPass {
    209 public:
    210   static char ID;
    211 
    212   LoadStoreVectorizer() : FunctionPass(ID) {
    213     initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
    214   }
    215 
    216   bool runOnFunction(Function &F) override;
    217 
    218   StringRef getPassName() const override {
    219     return "GPU Load and Store Vectorizer";
    220   }
    221 
    222   void getAnalysisUsage(AnalysisUsage &AU) const override {
    223     AU.addRequired<AAResultsWrapperPass>();
    224     AU.addRequired<ScalarEvolutionWrapperPass>();
    225     AU.addRequired<DominatorTreeWrapperPass>();
    226     AU.addRequired<TargetTransformInfoWrapperPass>();
    227     AU.setPreservesCFG();
    228   }
    229 };
    230 
    231 } // end anonymous namespace
    232 
    233 char LoadStoreVectorizer::ID = 0;
    234 
    235 INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
    236                       "Vectorize load and store instructions", false, false)
    237 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
    238 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
    239 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
    240 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
    241 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
    242 INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
    243                     "Vectorize load and store instructions", false, false)
    244 
    245 Pass *llvm::createLoadStoreVectorizerPass() {
    246   return new LoadStoreVectorizer();
    247 }
    248 
    249 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
    250 // vectors of Instructions.
    251 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
    252   SmallVector<Value *, 8> VL(IL.begin(), IL.end());
    253   propagateMetadata(I, VL);
    254 }
    255 
    256 bool LoadStoreVectorizer::runOnFunction(Function &F) {
    257   // Don't vectorize when the attribute NoImplicitFloat is used.
    258   if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    259     return false;
    260 
    261   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
    262   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    263   ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    264   TargetTransformInfo &TTI =
    265       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    266 
    267   Vectorizer V(F, AA, DT, SE, TTI);
    268   return V.run();
    269 }
    270 
    271 // Vectorizer Implementation
    272 bool Vectorizer::run() {
    273   bool Changed = false;
    274 
    275   // Scan the blocks in the function in post order.
    276   for (BasicBlock *BB : post_order(&F)) {
    277     InstrListMap LoadRefs, StoreRefs;
    278     std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    279     Changed |= vectorizeChains(LoadRefs);
    280     Changed |= vectorizeChains(StoreRefs);
    281   }
    282 
    283   return Changed;
    284 }
    285 
    286 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
    287   if (LoadInst *L = dyn_cast<LoadInst>(I))
    288     return L->getPointerAddressSpace();
    289   if (StoreInst *S = dyn_cast<StoreInst>(I))
    290     return S->getPointerAddressSpace();
    291   return -1;
    292 }
    293 
    294 // FIXME: Merge with llvm::isConsecutiveAccess
    295 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
    296   Value *PtrA = getLoadStorePointerOperand(A);
    297   Value *PtrB = getLoadStorePointerOperand(B);
    298   unsigned ASA = getPointerAddressSpace(A);
    299   unsigned ASB = getPointerAddressSpace(B);
    300 
    301   // Check that the address spaces match and that the pointers are valid.
    302   if (!PtrA || !PtrB || (ASA != ASB))
    303     return false;
    304 
    305   // Make sure that A and B are different pointers to types of the same size.
    306   Type *PtrATy = PtrA->getType()->getPointerElementType();
    307   Type *PtrBTy = PtrB->getType()->getPointerElementType();
    308   if (PtrA == PtrB ||
    309       PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
    310       DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
    311       DL.getTypeStoreSize(PtrATy->getScalarType()) !=
    312           DL.getTypeStoreSize(PtrBTy->getScalarType()))
    313     return false;
    314 
    315   unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
    316   APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
    317 
    318   return areConsecutivePointers(PtrA, PtrB, Size);
    319 }
    320 
    321 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
    322                                         const APInt &PtrDelta,
    323                                         unsigned Depth) const {
    324   unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
    325   APInt OffsetA(PtrBitWidth, 0);
    326   APInt OffsetB(PtrBitWidth, 0);
    327   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
    328   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
    329 
    330   APInt OffsetDelta = OffsetB - OffsetA;
    331 
    332   // Check if they are based on the same pointer. That makes the offsets
    333   // sufficient.
    334   if (PtrA == PtrB)
    335     return OffsetDelta == PtrDelta;
    336 
    337   // Compute the base pointer delta needed for the final delta to equal the
    338   // pointer delta requested.
    339   APInt BaseDelta = PtrDelta - OffsetDelta;
    340 
    341   // Compute the distance with SCEV between the base pointers.
    342   const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
    343   const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
    344   const SCEV *C = SE.getConstant(BaseDelta);
    345   const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
    346   if (X == PtrSCEVB)
    347     return true;
    348 
    349   // The above check will not catch cases where one of the pointers is
    350   // factorized but the other one is not, such as (C + (S * (A + B))) vs
    351   // (AS + BS). Get the minus SCEV instead; that allows re-combining the
    352   // expressions and getting the simplified difference.
    353   const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
    354   if (C == Dist)
    355     return true;
    356 
    357   // Sometimes even this doesn't work, because SCEV can't always see through
    358   // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
    359   // things the hard way.
    360   return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
    361 }
    362 
    363 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
    364                                              APInt PtrDelta,
    365                                              unsigned Depth) const {
    366   auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
    367   auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
    368   if (!GEPA || !GEPB)
    369     return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
    370 
    371   // Look through GEPs after checking they're the same except for the last
    372   // index.
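          // For example (an illustrative sketch, not taken from a real test case),
          // this is meant to handle pairs like
          //   %gepA = getelementptr inbounds i32, i32* %base, i64 %x.ext
          //   %gepB = getelementptr inbounds i32, i32* %base, i64 %x.plus1.ext
          // where %x.ext and %x.plus1.ext are sext/zext of %x and of (add nsw %x, 1)
          // respectively, and the requested PtrDelta is one stride (4 bytes here).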
    373   if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
    374       GEPA->getPointerOperand() != GEPB->getPointerOperand())
    375     return false;
    376   gep_type_iterator GTIA = gep_type_begin(GEPA);
    377   gep_type_iterator GTIB = gep_type_begin(GEPB);
    378   for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    379     if (GTIA.getOperand() != GTIB.getOperand())
    380       return false;
    381     ++GTIA;
    382     ++GTIB;
    383   }
    384 
    385   Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
    386   Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
    387   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
    388       OpA->getType() != OpB->getType())
    389     return false;
    390 
    391   if (PtrDelta.isNegative()) {
    392     if (PtrDelta.isMinSignedValue())
    393       return false;
    394     PtrDelta.negate();
    395     std::swap(OpA, OpB);
    396   }
    397   uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
    398   if (PtrDelta.urem(Stride) != 0)
    399     return false;
    400   unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
    401   APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
    402 
    403   // Only look through a ZExt/SExt.
    404   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    405     return false;
    406 
    407   bool Signed = isa<SExtInst>(OpA);
    408 
    409   // At this point A could be a function parameter, i.e. not an instruction.
    410   Value *ValA = OpA->getOperand(0);
    411   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
    412   if (!OpB || ValA->getType() != OpB->getType())
    413     return false;
    414 
    415   // Now we need to prove that adding IdxDiff to ValA won't overflow.
    416   bool Safe = false;
    417   // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
    418   // ValA, we're okay.
    419   if (OpB->getOpcode() == Instruction::Add &&
    420       isa<ConstantInt>(OpB->getOperand(1)) &&
    421       IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
    422     if (Signed)
    423       Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    424     else
    425       Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
    426   }
    427 
    428   unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
    429 
    430   // Second attempt:
    431   // If every bit that is set in IdxDiff, and every higher-order bit other than
    432   // the sign bit, is known to be zero in ValA, then we can add IdxDiff to ValA
    433   // while guaranteeing no overflow of any sort.
    434   if (!Safe) {
    435     OpA = dyn_cast<Instruction>(ValA);
    436     if (!OpA)
    437       return false;
    438     KnownBits Known(BitWidth);
    439     computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
    440     APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    441     if (Signed)
    442       BitsAllowedToBeSet.clearBit(BitWidth - 1);
    443     if (BitsAllowedToBeSet.ult(IdxDiff))
    444       return false;
    445   }
    446 
    447   const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
    448   const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
    449   const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
    450   const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
    451   return X == OffsetSCEVB;
    452 }
    453 
    454 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
    455                                     const APInt &PtrDelta,
    456                                     unsigned Depth) const {
    457   if (Depth++ == MaxDepth)
    458     return false;
    459 
    460   if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    461     if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
    462       return SelectA->getCondition() == SelectB->getCondition() &&
    463              areConsecutivePointers(SelectA->getTrueValue(),
    464                                     SelectB->getTrueValue(), PtrDelta, Depth) &&
    465              areConsecutivePointers(SelectA->getFalseValue(),
    466                                     SelectB->getFalseValue(), PtrDelta, Depth);
    467     }
    468   }
    469   return false;
    470 }
    471 
    472 void Vectorizer::reorder(Instruction *I) {
    473   OrderedBasicBlock OBB(I->getParent());
    474   SmallPtrSet<Instruction *, 16> InstructionsToMove;
    475   SmallVector<Instruction *, 16> Worklist;
    476 
    477   Worklist.push_back(I);
    478   while (!Worklist.empty()) {
    479     Instruction *IW = Worklist.pop_back_val();
    480     int NumOperands = IW->getNumOperands();
    481     for (int i = 0; i < NumOperands; i++) {
    482       Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
    483       if (!IM || IM->getOpcode() == Instruction::PHI)
    484         continue;
    485 
    486       // If IM is in another BB, no need to move it, because this pass only
    487       // vectorizes instructions within one BB.
    488       if (IM->getParent() != I->getParent())
    489         continue;
    490 
    491       if (!OBB.dominates(IM, I)) {
    492         InstructionsToMove.insert(IM);
    493         Worklist.push_back(IM);
    494       }
    495     }
    496   }
    497 
    498   // All instructions to move should follow I. Start from I, not from begin().
    499   for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
    500        ++BBI) {
    501     if (!InstructionsToMove.count(&*BBI))
    502       continue;
    503     Instruction *IM = &*BBI;
    504     --BBI;
    505     IM->removeFromParent();
    506     IM->insertBefore(I);
    507   }
    508 }
    509 
    510 std::pair<BasicBlock::iterator, BasicBlock::iterator>
    511 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
    512   Instruction *C0 = Chain[0];
    513   BasicBlock::iterator FirstInstr = C0->getIterator();
    514   BasicBlock::iterator LastInstr = C0->getIterator();
    515 
    516   BasicBlock *BB = C0->getParent();
    517   unsigned NumFound = 0;
    518   for (Instruction &I : *BB) {
    519     if (!is_contained(Chain, &I))
    520       continue;
    521 
    522     ++NumFound;
    523     if (NumFound == 1) {
    524       FirstInstr = I.getIterator();
    525     }
    526     if (NumFound == Chain.size()) {
    527       LastInstr = I.getIterator();
    528       break;
    529     }
    530   }
    531 
    532   // Range is [first, last).
    533   return std::make_pair(FirstInstr, ++LastInstr);
    534 }
    535 
    536 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
    537   SmallVector<Instruction *, 16> Instrs;
    538   for (Instruction *I : Chain) {
    539     Value *PtrOperand = getLoadStorePointerOperand(I);
    540     assert(PtrOperand && "Instruction must have a pointer operand.");
    541     Instrs.push_back(I);
    542     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
    543       Instrs.push_back(GEP);
    544   }
    545 
    546   // Erase instructions.
    547   for (Instruction *I : Instrs)
    548     if (I->use_empty())
    549       I->eraseFromParent();
    550 }
    551 
    552 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
    553 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
    554                                unsigned ElementSizeBits) {
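          // A worked example (illustrative numbers only): for a chain of 5 i16
          // accesses, ElementSizeBytes = 2 and SizeBytes = 10, so NumLeft becomes
          // (10 - 10 % 4) / 2 = 4 and the chain splits into an 8-byte piece and a
          // 2-byte piece.  For 3 i32 accesses, NumLeft = 3 == Chain.size() and is
          // odd, so the last element is split off, giving pieces of 8 and 4 bytes.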
    555   unsigned ElementSizeBytes = ElementSizeBits / 8;
    556   unsigned SizeBytes = ElementSizeBytes * Chain.size();
    557   unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
    558   if (NumLeft == Chain.size()) {
    559     if ((NumLeft & 1) == 0)
    560       NumLeft /= 2; // Split even in half
    561     else
    562       --NumLeft;    // Split off last element
    563   } else if (NumLeft == 0)
    564     NumLeft = 1;
    565   return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
    566 }
    567 
    568 ArrayRef<Instruction *>
    569 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
    570   // These are in BB order, unlike Chain, which is in address order.
    571   SmallVector<Instruction *, 16> MemoryInstrs;
    572   SmallVector<Instruction *, 16> ChainInstrs;
    573 
    574   bool IsLoadChain = isa<LoadInst>(Chain[0]);
    575   LLVM_DEBUG({
    576     for (Instruction *I : Chain) {
    577       if (IsLoadChain)
    578         assert(isa<LoadInst>(I) &&
    579                "All elements of Chain must be loads, or all must be stores.");
    580       else
    581         assert(isa<StoreInst>(I) &&
    582                "All elements of Chain must be loads, or all must be stores.");
    583     }
    584   });
    585 
    586   for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    587     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
    588       if (!is_contained(Chain, &I))
    589         MemoryInstrs.push_back(&I);
    590       else
    591         ChainInstrs.push_back(&I);
    592     } else if (isa<IntrinsicInst>(&I) &&
    593                cast<IntrinsicInst>(&I)->getIntrinsicID() ==
    594                    Intrinsic::sideeffect) {
    595       // Ignore llvm.sideeffect calls.
    596     } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
    597       LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
    598                         << '\n');
    599       break;
    600     } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
    601       LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
    602                         << '\n');
    603       break;
    604     }
    605   }
    606 
    607   OrderedBasicBlock OBB(Chain[0]->getParent());
    608 
    609   // Loop until we find an instruction in ChainInstrs that we can't vectorize.
    610   unsigned ChainInstrIdx = 0;
    611   Instruction *BarrierMemoryInstr = nullptr;
    612 
    613   for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    614     Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
    615 
    616     // If a barrier memory instruction was found, chain instructions that follow
    617     // will not be added to the valid prefix.
    618     if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
    619       break;
    620 
    621     // Check (in BB order) if any instruction prevents ChainInstr from being
    622     // vectorized. Find and store the first such "conflicting" instruction.
    623     for (Instruction *MemInstr : MemoryInstrs) {
    624       // If a barrier memory instruction was found, do not check past it.
    625       if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
    626         break;
    627 
    628       auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
    629       auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
    630       if (MemLoad && ChainLoad)
    631         continue;
    632 
    633       // We can ignore the alias if we have a load/store pair and the load
    634       // is known to be invariant. The load cannot be clobbered by the store.
    635       auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
    636         return LI->getMetadata(LLVMContext::MD_invariant_load);
    637       };
    638 
    639       // We can ignore the alias as long as the load comes before the store,
    640       // because that means we won't be moving the load past the store to
    641       // vectorize it (the vectorized load is inserted at the location of the
    642       // first load in the chain).
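              // For example (illustrative ordering only): with chain loads %a and %b
              // at consecutive addresses and a store %s between them in BB order, %s
              // is not a barrier for %a (which already precedes it), but it is a
              // barrier for %b if the two may alias, since the vectorized load would
              // hoist %b above %s.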
    643       if (isa<StoreInst>(MemInstr) && ChainLoad &&
    644           (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
    645         continue;
    646 
    647       // Same case, but in reverse.
    648       if (MemLoad && isa<StoreInst>(ChainInstr) &&
    649           (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
    650         continue;
    651 
    652       if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
    653                         MemoryLocation::get(ChainInstr))) {
    654         LLVM_DEBUG({
    655           dbgs() << "LSV: Found alias:\n"
    656                     "  Aliasing instruction and pointer:\n"
    657                  << "  " << *MemInstr << '\n'
    658                  << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
    659                  << "  Aliased instruction and pointer:\n"
    660                  << "  " << *ChainInstr << '\n'
    661                  << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
    662         });
    663         // Save this aliasing memory instruction as a barrier, but allow other
    664         // instructions that precede the barrier to be vectorized with this one.
    665         BarrierMemoryInstr = MemInstr;
    666         break;
    667       }
    668     }
    669     // Continue the search only for store chains, since vectorizing stores that
    670     // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    671     // up to an aliasing store, but should not pull loads from further down in
    672     // the basic block.
    673     if (IsLoadChain && BarrierMemoryInstr) {
    674       // The BarrierMemoryInstr is a store that precedes ChainInstr.
    675       assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
    676       break;
    677     }
    678   }
    679 
    680   // Find the largest prefix of Chain whose elements are all in
    681   // ChainInstrs[0, ChainInstrIdx).  This is the largest vectorizable prefix of
    682   // Chain.  (Recall that Chain is in address order, but ChainInstrs is in BB
    683   // order.)
    684   SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
    685       ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
    686   unsigned ChainIdx = 0;
    687   for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    688     if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
    689       break;
    690   }
    691   return Chain.slice(0, ChainIdx);
    692 }
    693 
    694 static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
    695   const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
    696   if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
    697     // The selects themselves are distinct instructions even if they share the
    698     // same condition and evaluate to consecutive pointers for the true and
    699     // false values of the condition. Therefore using the selects themselves
    700     // for grouping instructions would put consecutive accesses into different
    701     // lists; they would never even be checked for being consecutive, and so
    702     // would not be vectorized.
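            // For example (hypothetical IR, purely for illustration): given
            //   %selA = select i1 %c, i32* %p, i32* %q
            //   %selB = select i1 %c, i32* %p1, i32* %q1
            // where %p1/%q1 are one element past %p/%q, the two selects are distinct
            // Values, but keying both accesses on %c keeps them in the same list so
            // they can still be recognized as consecutive.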
    703     return Sel->getCondition();
    704   }
    705   return ObjPtr;
    706 }
    707 
    708 std::pair<InstrListMap, InstrListMap>
    709 Vectorizer::collectInstructions(BasicBlock *BB) {
    710   InstrListMap LoadRefs;
    711   InstrListMap StoreRefs;
    712 
    713   for (Instruction &I : *BB) {
    714     if (!I.mayReadOrWriteMemory())
    715       continue;
    716 
    717     if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
    718       if (!LI->isSimple())
    719         continue;
    720 
    721       // Skip if it's not legal.
    722       if (!TTI.isLegalToVectorizeLoad(LI))
    723         continue;
    724 
    725       Type *Ty = LI->getType();
    726       if (!VectorType::isValidElementType(Ty->getScalarType()))
    727         continue;
    728 
    729       // Skip weird non-byte sizes. They probably aren't worth the effort of
    730       // handling correctly.
    731       unsigned TySize = DL.getTypeSizeInBits(Ty);
    732       if ((TySize % 8) != 0)
    733         continue;
    734 
    735       // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    736       // functions currently use an integer type for the vectorized load/store,
    737       // and do not support casting between the integer type and a vector of
    738       // pointers (e.g. i64 to <2 x i16*>).
    739       if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
    740         continue;
    741 
    742       Value *Ptr = LI->getPointerOperand();
    743       unsigned AS = Ptr->getType()->getPointerAddressSpace();
    744       unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
    745 
    746       unsigned VF = VecRegSize / TySize;
    747       VectorType *VecTy = dyn_cast<VectorType>(Ty);
    748 
    749       // No point in looking at these if they're too big to vectorize.
    750       if (TySize > VecRegSize / 2 ||
    751           (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
    752         continue;
    753 
    754       // Make sure all the users of a vector are constant-index extracts.
    755       if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
    756             const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
    757             return EEI && isa<ConstantInt>(EEI->getOperand(1));
    758           }))
    759         continue;
    760 
    761       // Save the load locations.
    762       const ChainID ID = getChainID(Ptr, DL);
    763       LoadRefs[ID].push_back(LI);
    764     } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
    765       if (!SI->isSimple())
    766         continue;
    767 
    768       // Skip if it's not legal.
    769       if (!TTI.isLegalToVectorizeStore(SI))
    770         continue;
    771 
    772       Type *Ty = SI->getValueOperand()->getType();
    773       if (!VectorType::isValidElementType(Ty->getScalarType()))
    774         continue;
    775 
    776       // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    777       // functions currently use an integer type for the vectorized load/store,
    778       // and do not support casting between the integer type and a vector of
    779       // pointers (e.g. i64 to <2 x i16*>).
    780       if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
    781         continue;
    782 
    783       // Skip weird non-byte sizes. They probably aren't worth the effort of
    784       // handling correctly.
    785       unsigned TySize = DL.getTypeSizeInBits(Ty);
    786       if ((TySize % 8) != 0)
    787         continue;
    788 
    789       Value *Ptr = SI->getPointerOperand();
    790       unsigned AS = Ptr->getType()->getPointerAddressSpace();
    791       unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
    792 
    793       unsigned VF = VecRegSize / TySize;
    794       VectorType *VecTy = dyn_cast<VectorType>(Ty);
    795 
    796       // No point in looking at these if they're too big to vectorize.
    797       if (TySize > VecRegSize / 2 ||
    798           (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
    799         continue;
    800 
    801       if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
    802             const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
    803             return EEI && isa<ConstantInt>(EEI->getOperand(1));
    804           }))
    805         continue;
    806 
    807       // Save store location.
    808       const ChainID ID = getChainID(Ptr, DL);
    809       StoreRefs[ID].push_back(SI);
    810     }
    811   }
    812 
    813   return {LoadRefs, StoreRefs};
    814 }
    815 
    816 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
    817   bool Changed = false;
    818 
    819   for (const std::pair<ChainID, InstrList> &Chain : Map) {
    820     unsigned Size = Chain.second.size();
    821     if (Size < 2)
    822       continue;
    823 
    824     LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
    825 
    826     // Process the instructions in chunks of 64.
    827     for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
    828       unsigned Len = std::min<unsigned>(CE - CI, 64);
    829       ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
    830       Changed |= vectorizeInstructions(Chunk);
    831     }
    832   }
    833 
    834   return Changed;
    835 }
    836 
    837 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
    838   LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
    839                     << " instructions.\n");
    840   SmallVector<int, 16> Heads, Tails;
    841   int ConsecutiveChain[64];
    842 
    843   // Do a quadratic search on all of the given loads/stores and find all of the
    844   // pairs of loads/stores that follow each other.
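          // For example (an illustrative sketch): if Instrs[0], Instrs[1] and
          // Instrs[2] access addresses p, p+4 and p+8 with 4-byte elements, this loop
          // records ConsecutiveChain[0] = 1 and ConsecutiveChain[1] = 2, and the
          // Heads/Tails lists let the code below walk 0 -> 1 -> 2 as one chain.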
    845   for (int i = 0, e = Instrs.size(); i < e; ++i) {
    846     ConsecutiveChain[i] = -1;
    847     for (int j = e - 1; j >= 0; --j) {
    848       if (i == j)
    849         continue;
    850 
    851       if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
    852         if (ConsecutiveChain[i] != -1) {
    853           int CurDistance = std::abs(ConsecutiveChain[i] - i);
    854           int NewDistance = std::abs(ConsecutiveChain[i] - j);
    855           if (j < i || NewDistance > CurDistance)
    856             continue; // Should not insert.
    857         }
    858 
    859         Tails.push_back(j);
    860         Heads.push_back(i);
    861         ConsecutiveChain[i] = j;
    862       }
    863     }
    864   }
    865 
    866   bool Changed = false;
    867   SmallPtrSet<Instruction *, 16> InstructionsProcessed;
    868 
    869   for (int Head : Heads) {
    870     if (InstructionsProcessed.count(Instrs[Head]))
    871       continue;
    872     bool LongerChainExists = false;
    873     for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
    874       if (Head == Tails[TIt] &&
    875           !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
    876         LongerChainExists = true;
    877         break;
    878       }
    879     if (LongerChainExists)
    880       continue;
    881 
    882     // We found an instr that starts a chain. Now follow the chain and try to
    883     // vectorize it.
    884     SmallVector<Instruction *, 16> Operands;
    885     int I = Head;
    886     while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
    887       if (InstructionsProcessed.count(Instrs[I]))
    888         break;
    889 
    890       Operands.push_back(Instrs[I]);
    891       I = ConsecutiveChain[I];
    892     }
    893 
    894     bool Vectorized = false;
    895     if (isa<LoadInst>(*Operands.begin()))
    896       Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    897     else
    898       Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
    899 
    900     Changed |= Vectorized;
    901   }
    902 
    903   return Changed;
    904 }
    905 
    906 bool Vectorizer::vectorizeStoreChain(
    907     ArrayRef<Instruction *> Chain,
    908     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
    909   StoreInst *S0 = cast<StoreInst>(Chain[0]);
    910 
    911   // If the vector has an int element, default to int for the whole store.
    912   Type *StoreTy;
    913   for (Instruction *I : Chain) {
    914     StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    915     if (StoreTy->isIntOrIntVectorTy())
    916       break;
    917 
    918     if (StoreTy->isPtrOrPtrVectorTy()) {
    919       StoreTy = Type::getIntNTy(F.getParent()->getContext(),
    920                                 DL.getTypeSizeInBits(StoreTy));
    921       break;
    922     }
    923   }
    924 
    925   unsigned Sz = DL.getTypeSizeInBits(StoreTy);
    926   unsigned AS = S0->getPointerAddressSpace();
    927   unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
    928   unsigned VF = VecRegSize / Sz;
    929   unsigned ChainSize = Chain.size();
    930   unsigned Alignment = getAlignment(S0);
    931 
    932   if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    933     InstructionsProcessed->insert(Chain.begin(), Chain.end());
    934     return false;
    935   }
    936 
    937   ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
    938   if (NewChain.empty()) {
    939     // No vectorization possible.
    940     InstructionsProcessed->insert(Chain.begin(), Chain.end());
    941     return false;
    942   }
    943   if (NewChain.size() == 1) {
    944     // Failed after the first instruction. Discard it and try the smaller chain.
    945     InstructionsProcessed->insert(NewChain.front());
    946     return false;
    947   }
    948 
    949   // Update Chain to the valid vectorizable subchain.
    950   Chain = NewChain;
    951   ChainSize = Chain.size();
    952 
    953   // Check if it's legal to vectorize this chain. If not, split the chain and
    954   // try again.
    955   unsigned EltSzInBytes = Sz / 8;
    956   unsigned SzInBytes = EltSzInBytes * ChainSize;
    957   if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    958     auto Chains = splitOddVectorElts(Chain, Sz);
    959     return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
    960            vectorizeStoreChain(Chains.second, InstructionsProcessed);
    961   }
    962 
    963   VectorType *VecTy;
    964   VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
    965   if (VecStoreTy)
    966     VecTy = VectorType::get(StoreTy->getScalarType(),
    967                             Chain.size() * VecStoreTy->getNumElements());
    968   else
    969     VecTy = VectorType::get(StoreTy, Chain.size());
    970 
    971   // If it's more than the max vector size or the target has a better
    972   // vector factor, break it into two pieces.
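          // For example (a sketch, assuming a target with 128-bit vector registers
          // that reports a store vector factor of 4 for i32): a chain of 8 i32 stores
          // has VF = TargetVF = 4, so it is split into two chains of 4, each of which
          // is then vectorized as a <4 x i32> store.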
    973   unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
    974   if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    975     LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
    976                          " Creating two separate arrays.\n");
    977     return vectorizeStoreChain(Chain.slice(0, TargetVF),
    978                                InstructionsProcessed) |
    979            vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
    980   }
    981 
    982   LLVM_DEBUG({
    983     dbgs() << "LSV: Stores to vectorize:\n";
    984     for (Instruction *I : Chain)
    985       dbgs() << "  " << *I << "\n";
    986   });
    987 
    988   // We won't try again to vectorize the elements of the chain, regardless of
    989   // whether we succeed below.
    990   InstructionsProcessed->insert(Chain.begin(), Chain.end());
    991 
    992   // If the store is going to be misaligned, don't vectorize it.
    993   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    994     if (S0->getPointerAddressSpace() != 0)
    995       return false;
    996 
    997     unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
    998                                                    StackAdjustedAlignment,
    999                                                    DL, S0, nullptr, &DT);
   1000     if (NewAlign < StackAdjustedAlignment)
   1001       return false;
   1002   }
   1003 
   1004   BasicBlock::iterator First, Last;
   1005   std::tie(First, Last) = getBoundaryInstrs(Chain);
   1006   Builder.SetInsertPoint(&*Last);
   1007 
   1008   Value *Vec = UndefValue::get(VecTy);
   1009 
   1010   if (VecStoreTy) {
   1011     unsigned VecWidth = VecStoreTy->getNumElements();
   1012     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
   1013       StoreInst *Store = cast<StoreInst>(Chain[I]);
   1014       for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
   1015         unsigned NewIdx = J + I * VecWidth;
   1016         Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
   1017                                                       Builder.getInt32(J));
   1018         if (Extract->getType() != StoreTy->getScalarType())
   1019           Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
   1020 
   1021         Value *Insert =
   1022             Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
   1023         Vec = Insert;
   1024       }
   1025     }
   1026   } else {
   1027     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
   1028       StoreInst *Store = cast<StoreInst>(Chain[I]);
   1029       Value *Extract = Store->getValueOperand();
   1030       if (Extract->getType() != StoreTy->getScalarType())
   1031         Extract =
   1032             Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
   1033 
   1034       Value *Insert =
   1035           Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
   1036       Vec = Insert;
   1037     }
   1038   }
   1039 
   1040   // This cast is safe because Builder.CreateStore() always creates a bona fide
   1041   // StoreInst.
   1042   StoreInst *SI = cast<StoreInst>(
   1043       Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
   1044                                                      VecTy->getPointerTo(AS))));
   1045   propagateMetadata(SI, Chain);
   1046   SI->setAlignment(Alignment);
   1047 
   1048   eraseInstructions(Chain);
   1049   ++NumVectorInstructions;
   1050   NumScalarsVectorized += Chain.size();
   1051   return true;
   1052 }
   1053 
   1054 bool Vectorizer::vectorizeLoadChain(
   1055     ArrayRef<Instruction *> Chain,
   1056     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
   1057   LoadInst *L0 = cast<LoadInst>(Chain[0]);
   1058 
   1059   // If the vector has an int element, default to int for the whole load.
   1060   Type *LoadTy;
   1061   for (const auto &V : Chain) {
   1062     LoadTy = cast<LoadInst>(V)->getType();
   1063     if (LoadTy->isIntOrIntVectorTy())
   1064       break;
   1065 
   1066     if (LoadTy->isPtrOrPtrVectorTy()) {
   1067       LoadTy = Type::getIntNTy(F.getParent()->getContext(),
   1068                                DL.getTypeSizeInBits(LoadTy));
   1069       break;
   1070     }
   1071   }
   1072 
   1073   unsigned Sz = DL.getTypeSizeInBits(LoadTy);
   1074   unsigned AS = L0->getPointerAddressSpace();
   1075   unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
   1076   unsigned VF = VecRegSize / Sz;
   1077   unsigned ChainSize = Chain.size();
   1078   unsigned Alignment = getAlignment(L0);
   1079 
   1080   if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
   1081     InstructionsProcessed->insert(Chain.begin(), Chain.end());
   1082     return false;
   1083   }
   1084 
   1085   ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
   1086   if (NewChain.empty()) {
   1087     // No vectorization possible.
   1088     InstructionsProcessed->insert(Chain.begin(), Chain.end());
   1089     return false;
   1090   }
   1091   if (NewChain.size() == 1) {
   1092     // Failed after the first instruction. Discard it and try the smaller chain.
   1093     InstructionsProcessed->insert(NewChain.front());
   1094     return false;
   1095   }
   1096 
   1097   // Update Chain to the valid vectorizable subchain.
   1098   Chain = NewChain;
   1099   ChainSize = Chain.size();
   1100 
   1101   // Check if it's legal to vectorize this chain. If not, split the chain and
   1102   // try again.
   1103   unsigned EltSzInBytes = Sz / 8;
   1104   unsigned SzInBytes = EltSzInBytes * ChainSize;
   1105   if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
   1106     auto Chains = splitOddVectorElts(Chain, Sz);
   1107     return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
   1108            vectorizeLoadChain(Chains.second, InstructionsProcessed);
   1109   }
   1110 
   1111   VectorType *VecTy;
   1112   VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
   1113   if (VecLoadTy)
   1114     VecTy = VectorType::get(LoadTy->getScalarType(),
   1115                             Chain.size() * VecLoadTy->getNumElements());
   1116   else
   1117     VecTy = VectorType::get(LoadTy, Chain.size());
   1118 
   1119   // If it's more than the max vector size or the target has a better
   1120   // vector factor, break it into two pieces.
   1121   unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
   1122   if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
   1123     LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
   1124                          " Creating two separate arrays.\n");
   1125     return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
   1126            vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
   1127   }
   1128 
   1129   // We won't try again to vectorize the elements of the chain, regardless of
   1130   // whether we succeed below.
   1131   InstructionsProcessed->insert(Chain.begin(), Chain.end());
   1132 
   1133   // If the load is going to be misaligned, don't vectorize it.
   1134   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
   1135     if (L0->getPointerAddressSpace() != 0)
   1136       return false;
   1137 
   1138     unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
   1139                                                    StackAdjustedAlignment,
   1140                                                    DL, L0, nullptr, &DT);
   1141     if (NewAlign < StackAdjustedAlignment)
   1142       return false;
   1143 
   1144     Alignment = NewAlign;
   1145   }
   1146 
   1147   LLVM_DEBUG({
   1148     dbgs() << "LSV: Loads to vectorize:\n";
   1149     for (Instruction *I : Chain)
   1150       I->dump();
   1151   });
   1152 
   1153   // getVectorizablePrefix already computed getBoundaryInstrs.  The value of
   1154   // Last may have changed since then, but the value of First won't have.  If it
   1155   // matters, we could compute getBoundaryInstrs only once and reuse it here.
   1156   BasicBlock::iterator First, Last;
   1157   std::tie(First, Last) = getBoundaryInstrs(Chain);
   1158   Builder.SetInsertPoint(&*First);
   1159 
   1160   Value *Bitcast =
   1161       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
   1162   // This cast is safe because Builder.CreateLoad always creates a bona fide
   1163   // LoadInst.
   1164   LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
   1165   propagateMetadata(LI, Chain);
   1166   LI->setAlignment(Alignment);
   1167 
   1168   if (VecLoadTy) {
   1169     SmallVector<Instruction *, 16> InstrsToErase;
   1170 
   1171     unsigned VecWidth = VecLoadTy->getNumElements();
   1172     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
   1173       for (auto Use : Chain[I]->users()) {
   1174         // All users of vector loads are ExtractElement instructions with
   1175         // constant indices, otherwise we would have bailed before now.
   1176         Instruction *UI = cast<Instruction>(Use);
   1177         unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
   1178         unsigned NewIdx = Idx + I * VecWidth;
   1179         Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
   1180                                                 UI->getName());
   1181         if (V->getType() != UI->getType())
   1182           V = Builder.CreateBitCast(V, UI->getType());
   1183 
   1184         // Replace the old instruction.
   1185         UI->replaceAllUsesWith(V);
   1186         InstrsToErase.push_back(UI);
   1187       }
   1188     }
   1189 
   1190     // Bitcast might not be an Instruction, if the value being loaded is a
   1191     // constant.  In that case, no need to reorder anything.
   1192     if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
   1193       reorder(BitcastInst);
   1194 
   1195     for (auto I : InstrsToErase)
   1196       I->eraseFromParent();
   1197   } else {
   1198     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
   1199       Value *CV = Chain[I];
   1200       Value *V =
   1201           Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
   1202       if (V->getType() != CV->getType()) {
   1203         V = Builder.CreateBitOrPointerCast(V, CV->getType());
   1204       }
   1205 
   1206       // Replace the old instruction.
   1207       CV->replaceAllUsesWith(V);
   1208     }
   1209 
   1210     if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
   1211       reorder(BitcastInst);
   1212   }
   1213 
   1214   eraseInstructions(Chain);
   1215 
   1216   ++NumVectorInstructions;
   1217   NumScalarsVectorized += Chain.size();
   1218   return true;
   1219 }
   1220 
   1221 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
   1222                                     unsigned Alignment) {
   1223   if (Alignment % SzInBytes == 0)
   1224     return false;
   1225 
   1226   bool Fast = false;
   1227   bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
   1228                                                    SzInBytes * 8, AddressSpace,
   1229                                                    Alignment, &Fast);
   1230   LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
   1231                     << " and fast? " << Fast << "\n";);
   1232   return !Allows || !Fast;
   1233 }
   1234