//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
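// As an illustrative sketch (not from this file), four adjacent scalar
// stores such as
//
//   a[0] = x0; a[1] = x1; a[2] = x2; a[3] = x3;
//
// can be replaced by a single store of a <4 x float> vector built from
// x0..x3, whenever the cost model finds the wide form profitable.
//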
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>

using namespace llvm;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number"));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

namespace {

static const unsigned MinVecRegSize = 128;

static const unsigned RecursionMaxDepth = 12;

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the LLVM benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return nullptr;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return nullptr;

    if (BB != I->getParent())
      return nullptr;
  }
  return BB;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (!isa<Constant>(VL[i]))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

///\returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}

///\returns bool representing if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
      Op == Instruction::Sub || Op == Instruction::Add)
    return true;
  return false;
}

/// \returns Instruction::ShuffleVector if the instructions in \p VL form an
/// alternating fadd/fsub, fsub/fadd, add/sub, or sub/add sequence
/// (e.g. the opcodes fadd, fsub, fadd, fsub, ...), and 0 otherwise.
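/// For example (an illustrative IR sketch), the bundle
///   %a0 = fadd float %x0, %y0
///   %a1 = fsub float %x1, %y1
///   %a2 = fadd float %x2, %y2
///   %a3 = fsub float %x3, %y3
/// alternates fadd/fsub, so it can later be emitted as one vector fadd, one
/// vector fsub, and a shufflevector that picks the even and odd lanes.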
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}

/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
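/// For example (illustrative), if one scalar add carries 'nsw nuw' and
/// another carries only 'nsw', the resulting vector add is tagged just with
/// 'nsw', the flag common to every lane.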
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      // Intersection is initialized to the 0th scalar,
      // so start counting from index '1'.
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      VecOp->copyIRFlags(Intersection);
    }
  }
}

/// \returns \p I after propagating metadata from \p VL.
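/// For example (illustrative), !tbaa tags are merged into the most generic
/// tag common to all scalars, while metadata kinds this function does not
/// know about are dropped from the vectorized instruction.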
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  Instruction *I0 = cast<Instruction>(VL[0]);
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  I0->getAllMetadataOtherThanDebugLoc(Metadata);

  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
    unsigned Kind = Metadata[i].first;
    MDNode *MD = Metadata[i].second;

    for (int j = 1, e = VL.size(); MD && j != e; j++) {
      Instruction *IJ = cast<Instruction>(VL[j]);
      MDNode *IMD = IJ->getMetadata(Kind);

      switch (Kind) {
      default:
        MD = nullptr; // Remove unknown metadata
        break;
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
        MD = MDNode::getMostGenericAliasScope(MD, IMD);
        break;
      case LLVMContext::MD_noalias:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      }
    }
    I->setMetadata(Kind, MD);
  }
  return I;
}

/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type *getSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return nullptr;

  return Ty;
}

/// \returns True if the ExtractElement instructions in VL can be vectorized
/// to use the original vector.
static bool CanReuseExtract(ArrayRef<Value *> VL) {
  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *VL0 = VL[0];
  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  Value *Vec = E0->getOperand(0);

  // We have to extract from the same vector type.
  unsigned NElts = Vec->getType()->getVectorNumElements();

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  if (!CI || CI->getZExtValue())
    return false;

  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));

    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
      return false;
  }

  return true;
}

/// \returns True if an in-tree use also needs an extract. This refers to a
/// possible scalar operand in a vectorized instruction.
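/// For example (illustrative), the pointer operand of a store stays scalar
/// even when the stored values are vectorized; if that pointer is itself one
/// of the vectorized scalars, an extractelement is needed to materialize it.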
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
    return false;
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return AA->getLocation(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return AA->getLocation(LI);
  return AliasAnalysis::Location();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
        Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  }

  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
  }

  /// \returns true if the memory operations A and B are consecutive.
  bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
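  /// For example (illustrative), a two-element load bundle of (a[1], a[0])
  /// is consecutive only in reversed order, which increments
  /// NumLoadsWantToChangeOrder and makes reversing worthwhile.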
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. Such repeated queries can happen when the tree
  /// contains cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \brief Take the pointer operand from the Load/Store instruction.
  /// \returns NULL if this is not a valid Load/Store instruction.
  static Value *getPointerOperand(Value *I);

  /// \brief Take the address space operand from the Load/Store instruction.
  /// \returns -1 if this is not a valid Load/Store instruction.
  static unsigned getAddressSpaceOperand(Value *I);

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
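  /// For example (illustrative), building a <4 x i32> from four scalars is
  /// modeled roughly as the cost of four insertelement instructions.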
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even though the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// \brief Reorder commutative operands in an alt shuffle if they result in
  /// vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);
  /// \brief Reorder commutative operands to get a better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);
  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr),
    NeedToGather(false) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;
  };

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.push_back(TreeEntry());
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}
    // Which scalar in our function.
    Value *Scalar;
    // Which user that uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue()) {
      return result.getValue();
    }
    AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }

  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// This is required to ensure that there are no incorrect collisions in the
  /// AliasCache, which can happen if a new instruction is allocated at the
  /// same address as a previously deleted instruction.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  }

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;
  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {

    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled dependent instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Singly linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Singly linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;

    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled;
  };
    701 
    702 #ifndef NDEBUG
    703   friend raw_ostream &operator<<(raw_ostream &os,
    704                                  const BoUpSLP::ScheduleData &SD);
    705 #endif
    706 
    707   /// Contains all scheduling data for a basic block.
    708   ///
    709   struct BlockScheduling {
    710 
    711     BlockScheduling(BasicBlock *BB)
    712         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
    713           ScheduleStart(nullptr), ScheduleEnd(nullptr),
    714           FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
    715           // Make sure that the initial SchedulingRegionID is greater than the
    716           // initial SchedulingRegionID in ScheduleData (which is 0).
    717           SchedulingRegionID(1) {}
    718 
    719     void clear() {
    720       ReadyInsts.clear();
    721       ScheduleStart = nullptr;
    722       ScheduleEnd = nullptr;
    723       FirstLoadStoreInRegion = nullptr;
    724       LastLoadStoreInRegion = nullptr;
    725 
    726       // Make a new scheduling region, i.e. all existing ScheduleData is not
    727       // in the new region yet.
    728       ++SchedulingRegionID;
    729     }
    730 
    731     ScheduleData *getScheduleData(Value *V) {
    732       ScheduleData *SD = ScheduleDataMap[V];
    733       if (SD && SD->SchedulingRegionID == SchedulingRegionID)
    734         return SD;
    735       return nullptr;
    736     }
    737 
    738     bool isInSchedulingRegion(ScheduleData *SD) {
    739       return SD->SchedulingRegionID == SchedulingRegionID;
    740     }
    741 
    742     /// Marks an instruction as scheduled and puts all dependent ready
    743     /// instructions into the ready-list.
    744     template <typename ReadyListType>
    745     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
    746       SD->IsScheduled = true;
    747       DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
    748 
    749       ScheduleData *BundleMember = SD;
    750       while (BundleMember) {
    751         // Handle the def-use chain dependencies.
    752         for (Use &U : BundleMember->Inst->operands()) {
    753           ScheduleData *OpDef = getScheduleData(U.get());
    754           if (OpDef && OpDef->hasValidDependencies() &&
    755               OpDef->incrementUnscheduledDeps(-1) == 0) {
    756             // There are no more unscheduled dependencies after decrementing,
    757             // so we can put the dependent instruction into the ready list.
    758             ScheduleData *DepBundle = OpDef->FirstInBundle;
    759             assert(!DepBundle->IsScheduled &&
    760                    "already scheduled bundle gets ready");
    761             ReadyList.insert(DepBundle);
    762             DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
    763           }
    764         }
    765         // Handle the memory dependencies.
    766         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
    767           if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
    768             // There are no more unscheduled dependencies after decrementing,
    769             // so we can put the dependent instruction into the ready list.
    770             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
    771             assert(!DepBundle->IsScheduled &&
    772                    "already scheduled bundle gets ready");
    773             ReadyList.insert(DepBundle);
    774             DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
    775           }
    776         }
    777         BundleMember = BundleMember->NextInBundle;
    778       }
    779     }
    780 
    781     /// Put all instructions into the ReadyList which are ready for scheduling.
    782     template <typename ReadyListType>
    783     void initialFillReadyList(ReadyListType &ReadyList) {
    784       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    785         ScheduleData *SD = getScheduleData(I);
    786         if (SD->isSchedulingEntity() && SD->isReady()) {
    787           ReadyList.insert(SD);
    788           DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
    789         }
    790       }
    791     }
    792 
    793     /// Checks if a bundle of instructions can be scheduled, i.e. has no
    794     /// cyclic dependencies. This is only a dry-run, no instructions are
    795     /// actually moved at this stage.
    796     bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
    797 
    798     /// Un-bundles a group of instructions.
    799     void cancelScheduling(ArrayRef<Value *> VL);
    800 
    801     /// Extends the scheduling region so that V is inside the region.
    802     void extendSchedulingRegion(Value *V);
    803 
    804     /// Initialize the ScheduleData structures for new instructions in the
    805     /// scheduling region.
    806     void initScheduleData(Instruction *FromI, Instruction *ToI,
    807                           ScheduleData *PrevLoadStore,
    808                           ScheduleData *NextLoadStore);
    809 
    810     /// Updates the dependency information of a bundle and of all instructions/
    811     /// bundles which depend on the original bundle.
    812     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
    813                                BoUpSLP *SLP);
    814 
    815     /// Sets all instruction in the scheduling region to un-scheduled.
    816     void resetSchedule();
    817 
    818     BasicBlock *BB;
    819 
    820     /// Simple memory allocation for ScheduleData.
    821     std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
    822 
    823     /// The size of a ScheduleData array in ScheduleDataChunks.
    824     int ChunkSize;
    825 
    826     /// The allocator position in the current chunk, which is the last entry
    827     /// of ScheduleDataChunks.
    828     int ChunkPos;
    829 
    830     /// Attaches ScheduleData to Instruction.
    831     /// Note that the mapping survives during all vectorization iterations, i.e.
    832     /// ScheduleData structures are recycled.
    833     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
    834 
    835     struct ReadyList : SmallVector<ScheduleData *, 8> {
    836       void insert(ScheduleData *SD) { push_back(SD); }
    837     };
    838 
    839     /// The ready-list for scheduling (only used for the dry-run).
    840     ReadyList ReadyInsts;
    841 
    842     /// The first instruction of the scheduling region.
    843     Instruction *ScheduleStart;
    844 
    845     /// The first instruction _after_ the scheduling region.
    846     Instruction *ScheduleEnd;
    847 
    848     /// The first memory accessing instruction in the scheduling region
    849     /// (can be null).
    850     ScheduleData *FirstLoadStoreInRegion;
    851 
    852     /// The last memory accessing instruction in the scheduling region
    853     /// (can be null).
    854     ScheduleData *LastLoadStoreInRegion;
    855 
    856     /// The ID of the scheduling region. For a new vectorization iteration this
    857     /// is incremented which "removes" all ScheduleData from the region.
    858     int SchedulingRegionID;
    859   };
    860 
    861   /// Attaches the BlockScheduling structures to basic blocks.
    862   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
    863 
    864   /// Performs the "real" scheduling. Done before vectorization is actually
    865   /// performed in a basic block.
    866   void scheduleBlock(BlockScheduling *BS);
    867 
    868   /// List of users to ignore during scheduling and that don't need extracting.
    869   ArrayRef<Value *> UserIgnoreList;
    870 
    871   // Number of load-bundles, which contain consecutive loads.
    872   int NumLoadsWantToKeepOrder;
    873 
    874   // Number of load-bundles of size 2, which are consecutive loads if reversed.
    875   int NumLoadsWantToChangeOrder;
    876 
    877   // Analysis and block reference.
    878   Function *F;
    879   ScalarEvolution *SE;
    880   TargetTransformInfo *TTI;
    881   TargetLibraryInfo *TLI;
    882   AliasAnalysis *AA;
    883   LoopInfo *LI;
    884   DominatorTree *DT;
    885   /// Instruction builder to construct the vectorized tree.
    886   IRBuilder<> Builder;
    887 };
    888 
    889 #ifndef NDEBUG
    890 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
    891   SD.dump(os);
    892   return os;
    893 }
    894 #endif
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!getSameType(Roots))
    return;
  buildTree_rec(Roots, 0);

  // Collect the values that we need to extract from the tree.
  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
    TreeEntry *Entry = &VectorizableTree[EIdx];

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vectors.
        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
            UserIgnoreList.end())
          continue;

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
      }
    }
  }
}

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  bool SameTy = getSameType(VL); (void)SameTy;
  bool isAltShuffle = false;
  assert(SameTy && "Invalid types!");

  if (Depth == RecursionMaxDepth) {
    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false);
      return;
    }
  unsigned Opcode = getSameOpcode(VL);

  // Check that this shuffle vector refers to the alternate
  // sequence of opcodes.
  if (Opcode == Instruction::ShuffleVector) {
    Instruction *I0 = cast<Instruction>(VL[0]);
    unsigned Op = I0->getOpcode();
    if (Op != Instruction::ShuffleVector)
      isAltShuffle = true;
  }

  // If all of the operands are identical or constant we have a simple solution.
  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Don't vectorize ephemeral values.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (EphValues.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is ephemeral.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check if this is a duplicate of another entry.
  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
      if (E->Scalars[i] != VL[i]) {
        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        newTreeEntry(VL, false);
        return;
      }
    }
    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
    return;
  }

  // Check that none of the instructions in the bundle are already in the tree.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is already in tree.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // If any of the scalars is marked as a value that needs to stay scalar then
  // we need to gather the scalars.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (MustGather.count(VL[i])) {
      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = cast<Instruction>(VL[0]);
  BasicBlock *BB = VL0->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Check that every instruction appears once in this bundle.
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i+1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, this)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  switch (Opcode) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);

      // Check for terminator values (e.g. invoke).
      for (unsigned j = 0; j < VL.size(); ++j)
        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
          TerminatorInst *Term = dyn_cast<TerminatorInst>(
              cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
          if (Term) {
            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
            BS.cancelScheduling(VL);
            newTreeEntry(VL, false);
            return;
          }
        }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
              PH->getIncomingBlock(i)));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::ExtractElement: {
      bool Reuse = CanReuseExtract(VL);
      if (Reuse) {
        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
      } else {
        BS.cancelScheduling(VL);
      }
      newTreeEntry(VL, Reuse);
      return;
    }
    case Instruction::Load: {
      // Check if the loads are consecutive or if we need to swizzle them.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
        LoadInst *L = cast<LoadInst>(VL[i]);
        if (!L->isSimple()) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
          return;
        }
        const DataLayout &DL = F->getParent()->getDataLayout();
        if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
          if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
            ++NumLoadsWantToChangeOrder;
          }
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
          return;
        }
      }
      ++NumLoadsWantToKeepOrder;
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of loads.\n");
      return;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();
      for (unsigned i = 0; i < VL.size(); ++i) {
        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
        if (Ty != SrcTy || !isValidElementType(Ty)) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
          return;
        }
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
      Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
        CmpInst *Cmp = cast<CmpInst>(VL[i]);
        if (Cmp->getPredicate() != P0 ||
            Cmp->getOperand(0)->getType() != ComparedTy) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::Select:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

      // Sort operands of the instructions so that each side is more likely to
      // have the same opcode.
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right);
        buildTree_rec(Left, Depth + 1);
        buildTree_rec(Right, Depth + 1);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::GetElementPtr: {
      // We don't combine GEPs with complicated (nested) indexing.
      for (unsigned j = 0; j < VL.size(); ++j) {
        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      // We can't combine several GEPs into one vector if they operate on
      // different types.
      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
      for (unsigned j = 0; j < VL.size(); ++j) {
        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
        if (Ty0 != CurTy) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      // We don't combine GEPs with non-constant indexes.
      for (unsigned j = 0; j < VL.size(); ++j) {
        auto Op = cast<Instruction>(VL[j])->getOperand(1);
        if (!isa<ConstantInt>(Op)) {
          DEBUG(
              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      for (unsigned i = 0, e = 2; i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
   1302     case Instruction::Store: {
   1303       const DataLayout &DL = F->getParent()->getDataLayout();
   1304       // Check if the stores are consecutive or of we need to swizzle them.
   1305       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
   1306         if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
   1307           BS.cancelScheduling(VL);
   1308           newTreeEntry(VL, false);
   1309           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
   1310           return;
   1311         }
   1312 
   1313       newTreeEntry(VL, true);
   1314       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
   1315 
   1316       ValueList Operands;
   1317       for (unsigned j = 0; j < VL.size(); ++j)
   1318         Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
   1319 
   1320       buildTree_rec(Operands, Depth + 1);
   1321       return;
   1322     }
   1323     case Instruction::Call: {
   1324       // Check if the calls are all to the same vectorizable intrinsic.
   1325       CallInst *CI = cast<CallInst>(VL[0]);
      // Check if this is an intrinsic call or something that can be
      // represented by an intrinsic call.
   1328       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
   1329       if (!isTriviallyVectorizable(ID)) {
   1330         BS.cancelScheduling(VL);
   1331         newTreeEntry(VL, false);
   1332         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
   1333         return;
   1334       }
   1335       Function *Int = CI->getCalledFunction();
   1336       Value *A1I = nullptr;
   1337       if (hasVectorInstrinsicScalarOpd(ID, 1))
   1338         A1I = CI->getArgOperand(1);
   1339       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
   1340         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
   1341         if (!CI2 || CI2->getCalledFunction() != Int ||
   1342             getIntrinsicIDForCall(CI2, TLI) != ID) {
   1343           BS.cancelScheduling(VL);
   1344           newTreeEntry(VL, false);
   1345           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
   1346                        << "\n");
   1347           return;
   1348         }
        // ctlz, cttz and powi are special intrinsics whose second argument
        // must be the same for them to be vectorized.
   1351         if (hasVectorInstrinsicScalarOpd(ID, 1)) {
   1352           Value *A1J = CI2->getArgOperand(1);
   1353           if (A1I != A1J) {
   1354             BS.cancelScheduling(VL);
   1355             newTreeEntry(VL, false);
            DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                         << " argument " << A1I << "!=" << A1J << "\n");
   1359             return;
   1360           }
   1361         }
   1362       }
   1363 
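      // Illustrative example (example IR, not from the original source): a
      // bundle of calls like
      //   %a0 = call float @llvm.fabs.f32(float %x0)
      //   %a1 = call float @llvm.fabs.f32(float %x1)
      // survives the checks above, while @llvm.powi calls must additionally
      // agree on their scalar second argument.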
   1364       newTreeEntry(VL, true);
   1365       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
   1366         ValueList Operands;
   1367         // Prepare the operand vector.
   1368         for (unsigned j = 0; j < VL.size(); ++j) {
   1369           CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
   1370           Operands.push_back(CI2->getArgOperand(i));
   1371         }
   1372         buildTree_rec(Operands, Depth + 1);
   1373       }
   1374       return;
   1375     }
   1376     case Instruction::ShuffleVector: {
      // If this is not an alternating sequence of opcodes like add-sub,
      // then do not vectorize this instruction.
   1379       if (!isAltShuffle) {
   1380         BS.cancelScheduling(VL);
   1381         newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: ShuffleVectors are not vectorized.\n");
   1383         return;
   1384       }
   1385       newTreeEntry(VL, true);
   1386       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
   1387 
   1388       // Reorder operands if reordering would enable vectorization.
   1389       if (isa<BinaryOperator>(VL0)) {
   1390         ValueList Left, Right;
   1391         reorderAltShuffleOperands(VL, Left, Right);
   1392         buildTree_rec(Left, Depth + 1);
   1393         buildTree_rec(Right, Depth + 1);
   1394         return;
   1395       }
   1396 
   1397       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
   1398         ValueList Operands;
   1399         // Prepare the operand vector.
   1400         for (unsigned j = 0; j < VL.size(); ++j)
   1401           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
   1402 
   1403         buildTree_rec(Operands, Depth + 1);
   1404       }
   1405       return;
   1406     }
   1407     default:
   1408       BS.cancelScheduling(VL);
   1409       newTreeEntry(VL, false);
   1410       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
   1411       return;
   1412   }
   1413 }
   1414 
   1415 int BoUpSLP::getEntryCost(TreeEntry *E) {
   1416   ArrayRef<Value*> VL = E->Scalars;
   1417 
   1418   Type *ScalarTy = VL[0]->getType();
   1419   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
   1420     ScalarTy = SI->getValueOperand()->getType();
   1421   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
   1422 
   1423   if (E->NeedToGather) {
   1424     if (allConstant(VL))
   1425       return 0;
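    // A splat gather (all lanes the same value) is modeled as a single
    // broadcast shuffle, e.g. {%x, %x, %x, %x} becomes one SK_Broadcast
    // rather than one insertelement per lane (illustrative example).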
   1426     if (isSplat(VL)) {
   1427       return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
   1428     }
   1429     return getGatherCost(E->Scalars);
   1430   }
   1431   unsigned Opcode = getSameOpcode(VL);
   1432   assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
   1433   Instruction *VL0 = cast<Instruction>(VL[0]);
   1434   switch (Opcode) {
   1435     case Instruction::PHI: {
   1436       return 0;
   1437     }
   1438     case Instruction::ExtractElement: {
   1439       if (CanReuseExtract(VL)) {
   1440         int DeadCost = 0;
   1441         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
   1442           ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
   1443           if (E->hasOneUse())
            // Take credit for the instruction that will become dead.
   1445             DeadCost +=
   1446                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
   1447         }
   1448         return -DeadCost;
   1449       }
   1450       return getGatherCost(VecTy);
   1451     }
   1452     case Instruction::ZExt:
   1453     case Instruction::SExt:
   1454     case Instruction::FPToUI:
   1455     case Instruction::FPToSI:
   1456     case Instruction::FPExt:
   1457     case Instruction::PtrToInt:
   1458     case Instruction::IntToPtr:
   1459     case Instruction::SIToFP:
   1460     case Instruction::UIToFP:
   1461     case Instruction::Trunc:
   1462     case Instruction::FPTrunc:
   1463     case Instruction::BitCast: {
   1464       Type *SrcTy = VL0->getOperand(0)->getType();
   1465 
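      // The entry cost is VecCost - ScalarCost, so a negative value means
      // vectorization is profitable. Illustrative numbers only: for a bundle
      // of four sext i16 -> i32, if each scalar sext costs 1 and the vector
      // sext also costs 1, the result is 1 - 4 = -3.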
   1466       // Calculate the cost of this instruction.
   1467       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
   1468                                                          VL0->getType(), SrcTy);
   1469 
   1470       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
   1471       int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
   1472       return VecCost - ScalarCost;
   1473     }
   1474     case Instruction::FCmp:
   1475     case Instruction::ICmp:
   1476     case Instruction::Select:
   1477     case Instruction::Add:
   1478     case Instruction::FAdd:
   1479     case Instruction::Sub:
   1480     case Instruction::FSub:
   1481     case Instruction::Mul:
   1482     case Instruction::FMul:
   1483     case Instruction::UDiv:
   1484     case Instruction::SDiv:
   1485     case Instruction::FDiv:
   1486     case Instruction::URem:
   1487     case Instruction::SRem:
   1488     case Instruction::FRem:
   1489     case Instruction::Shl:
   1490     case Instruction::LShr:
   1491     case Instruction::AShr:
   1492     case Instruction::And:
   1493     case Instruction::Or:
   1494     case Instruction::Xor: {
   1495       // Calculate the cost of this instruction.
   1496       int ScalarCost = 0;
   1497       int VecCost = 0;
   1498       if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
   1499           Opcode == Instruction::Select) {
   1500         VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
        ScalarCost =
            VecTy->getNumElements() *
            TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
   1503         VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
   1504       } else {
   1505         // Certain instructions can be cheaper to vectorize if they have a
   1506         // constant second vector operand.
   1507         TargetTransformInfo::OperandValueKind Op1VK =
   1508             TargetTransformInfo::OK_AnyValue;
   1509         TargetTransformInfo::OperandValueKind Op2VK =
   1510             TargetTransformInfo::OK_UniformConstantValue;
   1511         TargetTransformInfo::OperandValueProperties Op1VP =
   1512             TargetTransformInfo::OP_None;
   1513         TargetTransformInfo::OperandValueProperties Op2VP =
   1514             TargetTransformInfo::OP_None;
   1515 
        // If all second operands are the same ConstantInt, set the operand
        // kind to OK_UniformConstantValue. If they are all constants but not
        // identical, use OK_NonUniformConstantValue. If any of them is not a
        // constant, fall back to OK_AnyValue.
   1521         ConstantInt *CInt = nullptr;
   1522         for (unsigned i = 0; i < VL.size(); ++i) {
   1523           const Instruction *I = cast<Instruction>(VL[i]);
   1524           if (!isa<ConstantInt>(I->getOperand(1))) {
   1525             Op2VK = TargetTransformInfo::OK_AnyValue;
   1526             break;
   1527           }
   1528           if (i == 0) {
   1529             CInt = cast<ConstantInt>(I->getOperand(1));
   1530             continue;
   1531           }
   1532           if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
   1533               CInt != cast<ConstantInt>(I->getOperand(1)))
   1534             Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
   1535         }
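        // Illustrative example: for shifts {a0 >> 3, a1 >> 3, a2 >> 3} the
        // second operand is the uniform constant 3; for shifts by {1, 2, 3}
        // it is a non-uniform constant; a shift by a variable amount leaves
        // Op2VK downgraded to OK_AnyValue.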
        // FIXME: Currently the cost model modification for division by a
        // power of 2 is handled only for X86. Add support for other targets.
   1538         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
   1539             CInt->getValue().isPowerOf2())
   1540           Op2VP = TargetTransformInfo::OP_PowerOf2;
   1541 
   1542         ScalarCost = VecTy->getNumElements() *
   1543                      TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
   1544                                                  Op1VP, Op2VP);
   1545         VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
   1546                                               Op1VP, Op2VP);
   1547       }
   1548       return VecCost - ScalarCost;
   1549     }
   1550     case Instruction::GetElementPtr: {
   1551       TargetTransformInfo::OperandValueKind Op1VK =
   1552           TargetTransformInfo::OK_AnyValue;
   1553       TargetTransformInfo::OperandValueKind Op2VK =
   1554           TargetTransformInfo::OK_UniformConstantValue;
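      // The cost of a vector GEP is approximated here by the cost of a
      // vector add with a uniform constant operand, e.g. four scalar pointer
      // adds vs. one <4 x i64> add (illustrative widths).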
   1555 
   1556       int ScalarCost =
   1557           VecTy->getNumElements() *
   1558           TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
   1559       int VecCost =
   1560           TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
   1561 
   1562       return VecCost - ScalarCost;
   1563     }
   1564     case Instruction::Load: {
   1565       // Cost of wide load - cost of scalar loads.
      int ScalarLdCost =
          VecTy->getNumElements() *
          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
   1568       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
   1569       return VecLdCost - ScalarLdCost;
   1570     }
   1571     case Instruction::Store: {
   1572       // We know that we can merge the stores. Calculate the cost.
      int ScalarStCost =
          VecTy->getNumElements() *
          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
   1575       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
   1576       return VecStCost - ScalarStCost;
   1577     }
   1578     case Instruction::Call: {
   1579       CallInst *CI = cast<CallInst>(VL0);
   1580       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
   1581 
   1582       // Calculate the cost of the scalar and vector calls.
   1583       SmallVector<Type*, 4> ScalarTys, VecTys;
      for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
   1585         ScalarTys.push_back(CI->getArgOperand(op)->getType());
   1586         VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
   1587                                          VecTy->getNumElements()));
   1588       }
   1589 
   1590       int ScalarCallCost = VecTy->getNumElements() *
   1591           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
   1592 
   1593       int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
   1594 
      DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
                   << " (" << VecCallCost << "-" << ScalarCallCost << ")"
                   << " for " << *CI << "\n");
   1598 
   1599       return VecCallCost - ScalarCallCost;
   1600     }
   1601     case Instruction::ShuffleVector: {
   1602       TargetTransformInfo::OperandValueKind Op1VK =
   1603           TargetTransformInfo::OK_AnyValue;
   1604       TargetTransformInfo::OperandValueKind Op2VK =
   1605           TargetTransformInfo::OK_AnyValue;
   1606       int ScalarCost = 0;
   1607       int VecCost = 0;
   1608       for (unsigned i = 0; i < VL.size(); ++i) {
        Instruction *I = dyn_cast<Instruction>(VL[i]);
        if (!I)
          break;
   1612         ScalarCost +=
   1613             TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
   1614       }
   1615       // VecCost is equal to sum of the cost of creating 2 vectors
   1616       // and the cost of creating shuffle.
   1617       Instruction *I0 = cast<Instruction>(VL[0]);
   1618       VecCost =
   1619           TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
   1620       Instruction *I1 = cast<Instruction>(VL[1]);
   1621       VecCost +=
   1622           TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
   1623       VecCost +=
   1624           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
   1625       return VecCost - ScalarCost;
   1626     }
   1627     default:
   1628       llvm_unreachable("Unknown instruction");
   1629   }
   1630 }
   1631 
   1632 bool BoUpSLP::isFullyVectorizableTinyTree() {
   1633   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
        VectorizableTree.size() << " is fully vectorizable.\n");
   1635 
   1636   // We only handle trees of height 2.
   1637   if (VectorizableTree.size() != 2)
   1638     return false;
   1639 
   1640   // Handle splat stores.
   1641   if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
   1642     return true;
   1643 
   1644   // Gathering cost would be too much for tiny trees.
   1645   if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
   1646     return false;
   1647 
   1648   return true;
   1649 }
   1650 
   1651 int BoUpSLP::getSpillCost() {
   1652   // Walk from the bottom of the tree to the top, tracking which values are
   1653   // live. When we see a call instruction that is not part of our tree,
   1654   // query TTI to see if there is a cost to keeping values live over it
   1655   // (for example, if spills and fills are required).
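  // Illustrative example (not from the original source): if a <4 x float>
  // value is live across
  //   call void @foo()
  // and the target reports a cost of 4 for keeping it live over the call,
  // this adds 4 to the total tree cost.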
   1656   unsigned BundleWidth = VectorizableTree.front().Scalars.size();
   1657   int Cost = 0;
   1658 
   1659   SmallPtrSet<Instruction*, 4> LiveValues;
   1660   Instruction *PrevInst = nullptr;
   1661 
   1662   for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
   1663     Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
   1664     if (!Inst)
   1665       continue;
   1666 
   1667     if (!PrevInst) {
   1668       PrevInst = Inst;
   1669       continue;
   1670     }
   1671 
   1672     DEBUG(
   1673       dbgs() << "SLP: #LV: " << LiveValues.size();
   1674       for (auto *X : LiveValues)
   1675         dbgs() << " " << X->getName();
   1676       dbgs() << ", Looking at ";
   1677       Inst->dump();
   1678       );
   1679 
   1680     // Update LiveValues.
   1681     LiveValues.erase(PrevInst);
   1682     for (auto &J : PrevInst->operands()) {
   1683       if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
   1684         LiveValues.insert(cast<Instruction>(&*J));
   1685     }
   1686 
   1687     // Now find the sequence of instructions between PrevInst and Inst.
   1688     BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
   1689     --PrevInstIt;
   1690     while (InstIt != PrevInstIt) {
   1691       if (PrevInstIt == PrevInst->getParent()->rend()) {
   1692         PrevInstIt = Inst->getParent()->rbegin();
   1693         continue;
   1694       }
   1695 
   1696       if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
   1697         SmallVector<Type*, 4> V;
   1698         for (auto *II : LiveValues)
   1699           V.push_back(VectorType::get(II->getType(), BundleWidth));
   1700         Cost += TTI->getCostOfKeepingLiveOverCall(V);
   1701       }
   1702 
   1703       ++PrevInstIt;
   1704     }
   1705 
   1706     PrevInst = Inst;
   1707   }
   1708 
   1709   DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
   1710   return Cost;
   1711 }
   1712 
   1713 int BoUpSLP::getTreeCost() {
   1714   int Cost = 0;
   1715   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
   1716         VectorizableTree.size() << ".\n");
   1717 
  // We only vectorize tiny trees if they are fully vectorizable.
   1719   if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
   1720     if (VectorizableTree.empty()) {
   1721       assert(!ExternalUses.size() && "We should not have any external users");
   1722     }
   1723     return INT_MAX;
   1724   }
   1725 
   1726   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
   1727 
   1728   for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
   1729     int C = getEntryCost(&VectorizableTree[i]);
   1730     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
   1731           << *VectorizableTree[i].Scalars[0] << " .\n");
   1732     Cost += C;
   1733   }
   1734 
   1735   SmallSet<Value *, 16> ExtractCostCalculated;
   1736   int ExtractCost = 0;
   1737   for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
   1738        I != E; ++I) {
    // We only add the extract cost once for the same scalar.
   1740     if (!ExtractCostCalculated.insert(I->Scalar).second)
   1741       continue;
   1742 
   1743     // Uses by ephemeral values are free (because the ephemeral value will be
   1744     // removed prior to code generation, and so the extraction will be
   1745     // removed as well).
   1746     if (EphValues.count(I->User))
   1747       continue;
   1748 
   1749     VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
   1750     ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
   1751                                            I->Lane);
   1752   }
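  // Illustratively, if two scalars of a 4-wide bundle have users outside the
  // tree, two extractelement costs have been accumulated above.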
   1753 
   1754   Cost += getSpillCost();
   1755 
  DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost << ".\n");
  return Cost + ExtractCost;
   1758 }
   1759 
   1760 int BoUpSLP::getGatherCost(Type *Ty) {
   1761   int Cost = 0;
   1762   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
   1763     Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
   1764   return Cost;
   1765 }
   1766 
   1767 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
   1768   // Find the type of the operands in VL.
   1769   Type *ScalarTy = VL[0]->getType();
   1770   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
   1771     ScalarTy = SI->getValueOperand()->getType();
   1772   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
   1773   // Find the cost of inserting/extracting values from the vector.
   1774   return getGatherCost(VecTy);
   1775 }
   1776 
   1777 Value *BoUpSLP::getPointerOperand(Value *I) {
   1778   if (LoadInst *LI = dyn_cast<LoadInst>(I))
   1779     return LI->getPointerOperand();
   1780   if (StoreInst *SI = dyn_cast<StoreInst>(I))
   1781     return SI->getPointerOperand();
   1782   return nullptr;
   1783 }
   1784 
   1785 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
   1786   if (LoadInst *L = dyn_cast<LoadInst>(I))
   1787     return L->getPointerAddressSpace();
   1788   if (StoreInst *S = dyn_cast<StoreInst>(I))
   1789     return S->getPointerAddressSpace();
   1790   return -1;
   1791 }
   1792 
   1793 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
   1794   Value *PtrA = getPointerOperand(A);
   1795   Value *PtrB = getPointerOperand(B);
   1796   unsigned ASA = getAddressSpaceOperand(A);
   1797   unsigned ASB = getAddressSpaceOperand(B);
   1798 
   1799   // Check that the address spaces match and that the pointers are valid.
   1800   if (!PtrA || !PtrB || (ASA != ASB))
   1801     return false;
   1802 
   1803   // Make sure that A and B are different pointers of the same type.
   1804   if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
   1805     return false;
   1806 
   1807   unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   1808   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
   1809   APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
   1810 
   1811   APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
   1812   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
   1813   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
   1814 
   1815   APInt OffsetDelta = OffsetB - OffsetA;
   1816 
   1817   // Check if they are based on the same pointer. That makes the offsets
   1818   // sufficient.
   1819   if (PtrA == PtrB)
   1820     return OffsetDelta == Size;
   1821 
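  // Illustrative example: for two i32 stores to p+4 and p+8, Size == 4 and
  // OffsetDelta == 4, so same-base accesses are consecutive exactly when the
  // offset delta equals the store size.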
  // Compute the base pointer delta needed to make the final delta equal to
  // the size.
   1824   APInt BaseDelta = Size - OffsetDelta;
   1825 
   1826   // Otherwise compute the distance with SCEV between the base pointers.
   1827   const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
   1828   const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
   1829   const SCEV *C = SE->getConstant(BaseDelta);
   1830   const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
   1831   return X == PtrSCEVB;
   1832 }
   1833 
// Reorder commutative operations in an alternate shuffle if the resulting
// vectors are consecutive loads. This would allow us to vectorize the tree.
// If we have something like:
//   load a[0] - load b[0]
//   load b[1] + load a[1]
//   load a[2] - load b[2]
//   load a[3] + load b[3]
// reordering the operands of the second lane, "load b[1] + load a[1]",
// would allow us to vectorize this code.
   1843 void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
   1844                                         SmallVectorImpl<Value *> &Left,
   1845                                         SmallVectorImpl<Value *> &Right) {
   1846   const DataLayout &DL = F->getParent()->getDataLayout();
   1847 
  // Push the left and right operands of the binary operations into Left and
  // Right.
   1849   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
   1850     Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
   1851     Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
   1852   }
   1853 
  // Reorder if we have a commutative operation and consecutive accesses
  // are on either side of the alternating instructions.
   1856   for (unsigned j = 0; j < VL.size() - 1; ++j) {
   1857     if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
   1858       if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
   1859         Instruction *VL1 = cast<Instruction>(VL[j]);
   1860         Instruction *VL2 = cast<Instruction>(VL[j + 1]);
   1861         if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
   1862           std::swap(Left[j], Right[j]);
   1863           continue;
   1864         } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
   1865           std::swap(Left[j + 1], Right[j + 1]);
   1866           continue;
   1867         }
   1868         // else unchanged
   1869       }
   1870     }
   1871     if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
   1872       if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
   1873         Instruction *VL1 = cast<Instruction>(VL[j]);
   1874         Instruction *VL2 = cast<Instruction>(VL[j + 1]);
   1875         if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
   1876           std::swap(Left[j], Right[j]);
   1877           continue;
   1878         } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
   1879           std::swap(Left[j + 1], Right[j + 1]);
   1880           continue;
   1881         }
   1882         // else unchanged
   1883       }
   1884     }
   1885   }
   1886 }
   1887 
   1888 void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
   1889                                              SmallVectorImpl<Value *> &Left,
   1890                                              SmallVectorImpl<Value *> &Right) {
   1891 
   1892   SmallVector<Value *, 16> OrigLeft, OrigRight;
   1893 
   1894   bool AllSameOpcodeLeft = true;
   1895   bool AllSameOpcodeRight = true;
   1896   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
   1897     Instruction *I = cast<Instruction>(VL[i]);
   1898     Value *VLeft = I->getOperand(0);
   1899     Value *VRight = I->getOperand(1);
   1900 
   1901     OrigLeft.push_back(VLeft);
   1902     OrigRight.push_back(VRight);
   1903 
   1904     Instruction *ILeft = dyn_cast<Instruction>(VLeft);
   1905     Instruction *IRight = dyn_cast<Instruction>(VRight);
   1906 
   1907     // Check whether all operands on one side have the same opcode. In this case
   1908     // we want to preserve the original order and not make things worse by
   1909     // reordering.
   1910     if (i && AllSameOpcodeLeft && ILeft) {
   1911       if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
   1912         if (PLeft->getOpcode() != ILeft->getOpcode())
   1913           AllSameOpcodeLeft = false;
   1914       } else
   1915         AllSameOpcodeLeft = false;
   1916     }
   1917     if (i && AllSameOpcodeRight && IRight) {
   1918       if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
   1919         if (PRight->getOpcode() != IRight->getOpcode())
   1920           AllSameOpcodeRight = false;
   1921       } else
   1922         AllSameOpcodeRight = false;
   1923     }
   1924 
    // Sort two opcodes. In the code below we try to preserve the ability to
    // use a broadcast of values instead of individual inserts.
    // vl1 = load
    // vl2 = phi
    // vr1 = load
    // vr2 = vr1
    //    = vl1 x vr1
    //    = vl2 x vr2
    // If we just sorted according to opcode we would leave the first line
    // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
    //    = vl1 x vr1
    //    = vr2 x vl2
    // Because vr2 and vr1 are from the same load we lose the opportunity of a
    // broadcast for the packed right side in the backend: we have [vr1, vl2]
    // instead of [vr1, vr2=vr1].
   1940     if (ILeft && IRight) {
   1941       if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
   1942         Left.push_back(IRight);
   1943         Right.push_back(ILeft);
   1944       } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
   1945                  Right[i - 1] != IRight) {
        // Try not to destroy a broadcast for no apparent benefit.
   1947         Left.push_back(IRight);
   1948         Right.push_back(ILeft);
   1949       } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
   1950                  Right[i - 1] == ILeft) {
        // Try to preserve broadcasts.
   1952         Left.push_back(IRight);
   1953         Right.push_back(ILeft);
   1954       } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
   1955                  Left[i - 1] == IRight) {
        // Try to preserve broadcasts.
   1957         Left.push_back(IRight);
   1958         Right.push_back(ILeft);
   1959       } else {
   1960         Left.push_back(ILeft);
   1961         Right.push_back(IRight);
   1962       }
   1963       continue;
   1964     }
    // Only one side is an instruction; put it on the right.
   1966     if (ILeft) {
   1967       Left.push_back(VRight);
   1968       Right.push_back(ILeft);
   1969       continue;
   1970     }
   1971     Left.push_back(VLeft);
   1972     Right.push_back(VRight);
   1973   }
   1974 
   1975   bool LeftBroadcast = isSplat(Left);
   1976   bool RightBroadcast = isSplat(Right);
   1977 
  // If either operand list ends up being a broadcast, keep this operand order.
   1979   if (LeftBroadcast || RightBroadcast)
   1980     return;
   1981 
  // Don't reorder if the operands were good to begin with.
   1983   if (AllSameOpcodeRight || AllSameOpcodeLeft) {
   1984     Left = OrigLeft;
   1985     Right = OrigRight;
   1986   }
   1987 
   1988   const DataLayout &DL = F->getParent()->getDataLayout();
   1989 
  // Finally check if we can get a longer vectorizable chain by reordering
  // without breaking the good operand order detected above.
  // E.g., if we have something like:
  //   load a[0]  load b[0]
  //   load b[1]  load a[1]
  //   load a[2]  load b[2]
  //   load a[3]  load b[3]
  // reordering the second lane, "load b[1]  load a[1]", would allow us to
  // vectorize this code while still retaining the AllSameOpcode property.
  // FIXME: This load reordering might break AllSameOpcode in some rare cases
  // such as:
  //   add a[0],c[0]  load b[0]
  //   add a[1],c[2]  load b[1]
  //   b[2]           load b[2]
  //   add a[3],c[3]  load b[3]
   2005   for (unsigned j = 0; j < VL.size() - 1; ++j) {
   2006     if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
   2007       if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
   2008         if (isConsecutiveAccess(L, L1, DL)) {
   2009           std::swap(Left[j + 1], Right[j + 1]);
   2010           continue;
   2011         }
   2012       }
   2013     }
   2014     if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
   2015       if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
   2016         if (isConsecutiveAccess(L, L1, DL)) {
   2017           std::swap(Left[j + 1], Right[j + 1]);
   2018           continue;
   2019         }
   2020       }
   2021     }
   2022     // else unchanged
   2023   }
   2024 }
   2025 
   2026 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
   2027   Instruction *VL0 = cast<Instruction>(VL[0]);
   2028   BasicBlock::iterator NextInst = VL0;
   2029   ++NextInst;
   2030   Builder.SetInsertPoint(VL0->getParent(), NextInst);
   2031   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
   2032 }
   2033 
   2034 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
   2035   Value *Vec = UndefValue::get(Ty);
   2036   // Generate the 'InsertElement' instruction.
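  // Illustratively (example IR, not from the original source), for
  // VL = {%a, %b} and Ty = <2 x i32> this builds:
  //   %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
  //   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1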
   2037   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
   2038     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
   2039     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
   2040       GatherSeq.insert(Insrt);
   2041       CSEBlocks.insert(Insrt->getParent());
   2042 
   2043       // Add to our 'need-to-extract' list.
   2044       if (ScalarToTreeEntry.count(VL[i])) {
   2045         int Idx = ScalarToTreeEntry[VL[i]];
   2046         TreeEntry *E = &VectorizableTree[Idx];
   2047         // Find which lane we need to extract.
   2048         int FoundLane = -1;
   2049         for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
          // Is this the lane of the scalar that we are looking for?
   2051           if (E->Scalars[Lane] == VL[i]) {
   2052             FoundLane = Lane;
   2053             break;
   2054           }
   2055         }
   2056         assert(FoundLane >= 0 && "Could not find the correct lane");
   2057         ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
   2058       }
   2059     }
   2060   }
   2061 
   2062   return Vec;
   2063 }
   2064 
   2065 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
   2066   SmallDenseMap<Value*, int>::const_iterator Entry
   2067     = ScalarToTreeEntry.find(VL[0]);
   2068   if (Entry != ScalarToTreeEntry.end()) {
   2069     int Idx = Entry->second;
   2070     const TreeEntry *En = &VectorizableTree[Idx];
   2071     if (En->isSame(VL) && En->VectorizedValue)
   2072       return En->VectorizedValue;
   2073   }
   2074   return nullptr;
   2075 }
   2076 
   2077 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
   2078   if (ScalarToTreeEntry.count(VL[0])) {
   2079     int Idx = ScalarToTreeEntry[VL[0]];
   2080     TreeEntry *E = &VectorizableTree[Idx];
   2081     if (E->isSame(VL))
   2082       return vectorizeTree(E);
   2083   }
   2084 
   2085   Type *ScalarTy = VL[0]->getType();
   2086   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
   2087     ScalarTy = SI->getValueOperand()->getType();
   2088   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
   2089 
   2090   return Gather(VL, VecTy);
   2091 }
   2092 
   2093 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   2094   IRBuilder<>::InsertPointGuard Guard(Builder);
   2095 
   2096   if (E->VectorizedValue) {
   2097     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
   2098     return E->VectorizedValue;
   2099   }
   2100 
   2101   Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
   2102   Type *ScalarTy = VL0->getType();
   2103   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
   2104     ScalarTy = SI->getValueOperand()->getType();
   2105   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
   2106 
   2107   if (E->NeedToGather) {
   2108     setInsertPointAfterBundle(E->Scalars);
   2109     return Gather(E->Scalars, VecTy);
   2110   }
   2111 
   2112   const DataLayout &DL = F->getParent()->getDataLayout();
   2113   unsigned Opcode = getSameOpcode(E->Scalars);
   2114 
   2115   switch (Opcode) {
   2116     case Instruction::PHI: {
   2117       PHINode *PH = dyn_cast<PHINode>(VL0);
   2118       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
   2119       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
   2120       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
   2121       E->VectorizedValue = NewPhi;
   2122 
   2123       // PHINodes may have multiple entries from the same block. We want to
   2124       // visit every block once.
   2125       SmallSet<BasicBlock*, 4> VisitedBBs;
   2126 
   2127       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
   2128         ValueList Operands;
   2129         BasicBlock *IBB = PH->getIncomingBlock(i);
   2130 
   2131         if (!VisitedBBs.insert(IBB).second) {
   2132           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
   2133           continue;
   2134         }
   2135 
   2136         // Prepare the operand vector.
   2137         for (unsigned j = 0; j < E->Scalars.size(); ++j)
   2138           Operands.push_back(cast<PHINode>(E->Scalars[j])->
   2139                              getIncomingValueForBlock(IBB));
   2140 
   2141         Builder.SetInsertPoint(IBB->getTerminator());
   2142         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
   2143         Value *Vec = vectorizeTree(Operands);
   2144         NewPhi->addIncoming(Vec, IBB);
   2145       }
   2146 
   2147       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
   2148              "Invalid number of incoming values");
   2149       return NewPhi;
   2150     }
   2151 
   2152     case Instruction::ExtractElement: {
   2153       if (CanReuseExtract(E->Scalars)) {
   2154         Value *V = VL0->getOperand(0);
   2155         E->VectorizedValue = V;
   2156         return V;
   2157       }
   2158       return Gather(E->Scalars, VecTy);
   2159     }
   2160     case Instruction::ZExt:
   2161     case Instruction::SExt:
   2162     case Instruction::FPToUI:
   2163     case Instruction::FPToSI:
   2164     case Instruction::FPExt:
   2165     case Instruction::PtrToInt:
   2166     case Instruction::IntToPtr:
   2167     case Instruction::SIToFP:
   2168     case Instruction::UIToFP:
   2169     case Instruction::Trunc:
   2170     case Instruction::FPTrunc:
   2171     case Instruction::BitCast: {
   2172       ValueList INVL;
   2173       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
   2174         INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
   2175 
   2176       setInsertPointAfterBundle(E->Scalars);
   2177 
   2178       Value *InVec = vectorizeTree(INVL);
   2179 
   2180       if (Value *V = alreadyVectorized(E->Scalars))
   2181         return V;
   2182 
   2183       CastInst *CI = dyn_cast<CastInst>(VL0);
   2184       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
   2185       E->VectorizedValue = V;
   2186       ++NumVectorInstructions;
   2187       return V;
   2188     }
   2189     case Instruction::FCmp:
   2190     case Instruction::ICmp: {
   2191       ValueList LHSV, RHSV;
   2192       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
   2193         LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
   2194         RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
   2195       }
   2196 
   2197       setInsertPointAfterBundle(E->Scalars);
   2198 
   2199       Value *L = vectorizeTree(LHSV);
   2200       Value *R = vectorizeTree(RHSV);
   2201 
   2202       if (Value *V = alreadyVectorized(E->Scalars))
   2203         return V;
   2204 
   2205       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
   2206       Value *V;
   2207       if (Opcode == Instruction::FCmp)
   2208         V = Builder.CreateFCmp(P0, L, R);
   2209       else
   2210         V = Builder.CreateICmp(P0, L, R);
   2211 
   2212       E->VectorizedValue = V;
   2213       ++NumVectorInstructions;
   2214       return V;
   2215     }
   2216     case Instruction::Select: {
   2217       ValueList TrueVec, FalseVec, CondVec;
   2218       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
   2219         CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
   2220         TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
   2221         FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
   2222       }
   2223 
   2224       setInsertPointAfterBundle(E->Scalars);
   2225 
   2226       Value *Cond = vectorizeTree(CondVec);
   2227       Value *True = vectorizeTree(TrueVec);
   2228       Value *False = vectorizeTree(FalseVec);
   2229 
   2230       if (Value *V = alreadyVectorized(E->Scalars))
   2231         return V;
   2232 
   2233       Value *V = Builder.CreateSelect(Cond, True, False);
   2234       E->VectorizedValue = V;
   2235       ++NumVectorInstructions;
   2236       return V;
   2237     }
   2238     case Instruction::Add:
   2239     case Instruction::FAdd:
   2240     case Instruction::Sub:
   2241     case Instruction::FSub:
   2242     case Instruction::Mul:
   2243     case Instruction::FMul:
   2244     case Instruction::UDiv:
   2245     case Instruction::SDiv:
   2246     case Instruction::FDiv:
   2247     case Instruction::URem:
   2248     case Instruction::SRem:
   2249     case Instruction::FRem:
   2250     case Instruction::Shl:
   2251     case Instruction::LShr:
   2252     case Instruction::AShr:
   2253     case Instruction::And:
   2254     case Instruction::Or:
   2255     case Instruction::Xor: {
   2256       ValueList LHSVL, RHSVL;
   2257       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
   2258         reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
   2259       else
   2260         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
   2261           LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
   2262           RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
   2263         }
   2264 
   2265       setInsertPointAfterBundle(E->Scalars);
   2266 
   2267       Value *LHS = vectorizeTree(LHSVL);
   2268       Value *RHS = vectorizeTree(RHSVL);
   2269 
   2270       if (LHS == RHS && isa<Instruction>(LHS)) {
   2271         assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
   2272       }
   2273 
   2274       if (Value *V = alreadyVectorized(E->Scalars))
   2275         return V;
   2276 
   2277       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
   2278       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
   2279       E->VectorizedValue = V;
   2280       propagateIRFlags(E->VectorizedValue, E->Scalars);
   2281       ++NumVectorInstructions;
   2282 
   2283       if (Instruction *I = dyn_cast<Instruction>(V))
   2284         return propagateMetadata(I, E->Scalars);
   2285 
   2286       return V;
   2287     }
   2288     case Instruction::Load: {
   2289       // Loads are inserted at the head of the tree because we don't want to
   2290       // sink them all the way down past store instructions.
   2291       setInsertPointAfterBundle(E->Scalars);
   2292 
   2293       LoadInst *LI = cast<LoadInst>(VL0);
   2294       Type *ScalarLoadTy = LI->getType();
   2295       unsigned AS = LI->getPointerAddressSpace();
   2296 
   2297       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
   2298                                             VecTy->getPointerTo(AS));
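      // Illustratively, for a bundle of four consecutive i32 loads this
      // emits (example IR):
      //   %vp   = bitcast i32* %ptr to <4 x i32>*
      //   %wide = load <4 x i32>, <4 x i32>* %vp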
   2299 
      // The pointer operand uses an in-tree scalar, so we add the new BitCast
      // to the ExternalUses list to make sure that an extract will be
      // generated in the future.
   2303       if (ScalarToTreeEntry.count(LI->getPointerOperand()))
   2304         ExternalUses.push_back(
   2305             ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
   2306 
   2307       unsigned Alignment = LI->getAlignment();
   2308       LI = Builder.CreateLoad(VecPtr);
   2309       if (!Alignment) {
   2310         Alignment = DL.getABITypeAlignment(ScalarLoadTy);
   2311       }
   2312       LI->setAlignment(Alignment);
   2313       E->VectorizedValue = LI;
   2314       ++NumVectorInstructions;
   2315       return propagateMetadata(LI, E->Scalars);
   2316     }
   2317     case Instruction::Store: {
   2318       StoreInst *SI = cast<StoreInst>(VL0);
   2319       unsigned Alignment = SI->getAlignment();
   2320       unsigned AS = SI->getPointerAddressSpace();
   2321 
   2322       ValueList ValueOp;
   2323       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
   2324         ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
   2325 
   2326       setInsertPointAfterBundle(E->Scalars);
   2327 
   2328       Value *VecValue = vectorizeTree(ValueOp);
   2329       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
   2330                                             VecTy->getPointerTo(AS));
   2331       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
   2332 
      // The pointer operand uses an in-tree scalar, so we add the new BitCast
      // to the ExternalUses list to make sure that an extract will be
      // generated in the future.
   2336       if (ScalarToTreeEntry.count(SI->getPointerOperand()))
   2337         ExternalUses.push_back(
   2338             ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
   2339 
   2340       if (!Alignment) {
   2341         Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
   2342       }
   2343       S->setAlignment(Alignment);
   2344       E->VectorizedValue = S;
   2345       ++NumVectorInstructions;
   2346       return propagateMetadata(S, E->Scalars);
   2347     }
   2348     case Instruction::GetElementPtr: {
   2349       setInsertPointAfterBundle(E->Scalars);
   2350 
   2351       ValueList Op0VL;
   2352       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
   2353         Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
   2354 
   2355       Value *Op0 = vectorizeTree(Op0VL);
   2356 
   2357       std::vector<Value *> OpVecs;
   2358       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
   2359            ++j) {
   2360         ValueList OpVL;
   2361         for (int i = 0, e = E->Scalars.size(); i < e; ++i)
   2362           OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
   2363 
   2364         Value *OpVec = vectorizeTree(OpVL);
   2365         OpVecs.push_back(OpVec);
   2366       }
   2367 
   2368       Value *V = Builder.CreateGEP(
   2369           cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
   2370       E->VectorizedValue = V;
   2371       ++NumVectorInstructions;
   2372 
   2373       if (Instruction *I = dyn_cast<Instruction>(V))
   2374         return propagateMetadata(I, E->Scalars);
   2375 
   2376       return V;
   2377     }
   2378     case Instruction::Call: {
   2379       CallInst *CI = cast<CallInst>(VL0);
   2380       setInsertPointAfterBundle(E->Scalars);
   2381       Function *FI;
   2382       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
   2383       Value *ScalarArg = nullptr;
   2384       if (CI && (FI = CI->getCalledFunction())) {
   2385         IID = (Intrinsic::ID) FI->getIntrinsicID();
   2386       }
   2387       std::vector<Value *> OpVecs;
   2388       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
   2389         ValueList OpVL;
        // ctlz, cttz and powi are special intrinsics whose second argument is
        // a scalar. This argument should not be vectorized.
   2392         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
   2393           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
   2394           ScalarArg = CEI->getArgOperand(j);
   2395           OpVecs.push_back(CEI->getArgOperand(j));
   2396           continue;
   2397         }
   2398         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
   2399           CallInst *CEI = cast<CallInst>(E->Scalars[i]);
   2400           OpVL.push_back(CEI->getArgOperand(j));
   2401         }
   2402 
   2403         Value *OpVec = vectorizeTree(OpVL);
   2404         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
   2405         OpVecs.push_back(OpVec);
   2406       }
   2407 
   2408       Module *M = F->getParent();
   2409       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
   2410       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
   2411       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
   2412       Value *V = Builder.CreateCall(CF, OpVecs);
   2413 
      // The scalar argument uses an in-tree scalar, so we add the new
      // vectorized call to the ExternalUses list to make sure that an extract
      // will be generated in the future.
   2417       if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
   2418         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
   2419 
   2420       E->VectorizedValue = V;
   2421       ++NumVectorInstructions;
   2422       return V;
   2423     }
   2424     case Instruction::ShuffleVector: {
   2425       ValueList LHSVL, RHSVL;
   2426       assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
   2427       reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
   2428       setInsertPointAfterBundle(E->Scalars);
   2429 
   2430       Value *LHS = vectorizeTree(LHSVL);
   2431       Value *RHS = vectorizeTree(RHSVL);
   2432 
   2433       if (Value *V = alreadyVectorized(E->Scalars))
   2434         return V;
   2435 
   2436       // Create a vector of LHS op1 RHS
   2437       BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
   2438       Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
   2439 
   2440       // Create a vector of LHS op2 RHS
   2441       Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
   2442       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
   2443       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
   2444 
      // Create a shuffle to take the alternating operations from the two
      // vectors.
   2446       // Also, gather up odd and even scalar ops to propagate IR flags to
   2447       // each vector operation.
   2448       ValueList OddScalars, EvenScalars;
   2449       unsigned e = E->Scalars.size();
   2450       SmallVector<Constant *, 8> Mask(e);
   2451       for (unsigned i = 0; i < e; ++i) {
   2452         if (i & 1) {
   2453           Mask[i] = Builder.getInt32(e + i);
   2454           OddScalars.push_back(E->Scalars[i]);
   2455         } else {
   2456           Mask[i] = Builder.getInt32(i);
   2457           EvenScalars.push_back(E->Scalars[i]);
   2458         }
   2459       }
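      // Illustratively, for a 4-wide bundle the mask built above is
      // <0, 5, 2, 7>: even lanes come from V0 and odd lanes from V1.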
   2460 
   2461       Value *ShuffleMask = ConstantVector::get(Mask);
   2462       propagateIRFlags(V0, EvenScalars);
   2463       propagateIRFlags(V1, OddScalars);
   2464 
   2465       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
   2466       E->VectorizedValue = V;
   2467       ++NumVectorInstructions;
   2468       if (Instruction *I = dyn_cast<Instruction>(V))
   2469         return propagateMetadata(I, E->Scalars);
   2470 
   2471       return V;
   2472     }
    default:
      llvm_unreachable("unknown inst");
   2475   }
   2476   return nullptr;
   2477 }
   2478 
   2479 Value *BoUpSLP::vectorizeTree() {
   2480 
   2481   // All blocks must be scheduled before any instructions are inserted.
   2482   for (auto &BSIter : BlocksSchedules) {
   2483     scheduleBlock(BSIter.second.get());
   2484   }
   2485 
   2486   Builder.SetInsertPoint(F->getEntryBlock().begin());
   2487   vectorizeTree(&VectorizableTree[0]);
   2488 
  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values.\n");
   2490 
   2491   // Extract all of the elements with the external uses.
   2492   for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
   2493        it != e; ++it) {
   2494     Value *Scalar = it->Scalar;
   2495     llvm::User *User = it->User;
   2496 
    // Skip users that we have already replaced (RAUW). This happens when one
    // instruction has multiple uses of the same value.
   2499     if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
   2500         Scalar->user_end())
   2501       continue;
   2502     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
   2503 
   2504     int Idx = ScalarToTreeEntry[Scalar];
   2505     TreeEntry *E = &VectorizableTree[Idx];
   2506     assert(!E->NeedToGather && "Extracting from a gather list");
   2507 
   2508     Value *Vec = E->VectorizedValue;
   2509     assert(Vec && "Can't find vectorizable value");
   2510 
   2511     Value *Lane = Builder.getInt32(it->Lane);
   2512     // Generate extracts for out-of-tree users.
   2513     // Find the insertion point for the extractelement lane.
    if (isa<Instruction>(Vec)) {
   2515       if (PHINode *PH = dyn_cast<PHINode>(User)) {
   2516         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
   2517           if (PH->getIncomingValue(i) == Scalar) {
   2518             Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
   2519             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
   2520             CSEBlocks.insert(PH->getIncomingBlock(i));
   2521             PH->setOperand(i, Ex);
   2522           }
   2523         }
   2524       } else {
   2525         Builder.SetInsertPoint(cast<Instruction>(User));
   2526         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
   2527         CSEBlocks.insert(cast<Instruction>(User)->getParent());
   2528         User->replaceUsesOfWith(Scalar, Ex);
      }
   2530     } else {
   2531       Builder.SetInsertPoint(F->getEntryBlock().begin());
   2532       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
   2533       CSEBlocks.insert(&F->getEntryBlock());
   2534       User->replaceUsesOfWith(Scalar, Ex);
   2535     }
   2536 
   2537     DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
   2538   }
   2539 
   2540   // For each vectorized value:
   2541   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
   2542     TreeEntry *Entry = &VectorizableTree[EIdx];
   2543 
   2544     // For each lane:
   2545     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
   2546       Value *Scalar = Entry->Scalars[Lane];
   2547       // No need to handle users of gathered values.
   2548       if (Entry->NeedToGather)
   2549         continue;
   2550 
   2551       assert(Entry->VectorizedValue && "Can't find vectorizable value");
   2552 
   2553       Type *Ty = Scalar->getType();
   2554       if (!Ty->isVoidTy()) {
   2555 #ifndef NDEBUG
   2556         for (User *U : Scalar->users()) {
   2557           DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
   2558 
   2559           assert((ScalarToTreeEntry.count(U) ||
   2560                   // It is legal to replace users in the ignorelist by undef.
   2561                   (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
   2562                    UserIgnoreList.end())) &&
   2563                  "Replacing out-of-tree value with undef");
   2564         }
   2565 #endif
   2566         Value *Undef = UndefValue::get(Ty);
   2567         Scalar->replaceAllUsesWith(Undef);
   2568       }
   2569       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
   2570       eraseInstruction(cast<Instruction>(Scalar));
   2571     }
   2572   }
   2573 
   2574   Builder.ClearInsertionPoint();
   2575 
   2576   return VectorizableTree[0].VectorizedValue;
   2577 }
   2578 
   2579 void BoUpSLP::optimizeGatherSequence() {
   2580   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
        << " gather sequence instructions.\n");
   2582   // LICM InsertElementInst sequences.
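  // Illustratively, an insertelement chain built from loop-invariant values
  // inside a loop body is moved to the loop preheader below, so it executes
  // once instead of on every iteration.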
   2583   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
   2584        e = GatherSeq.end(); it != e; ++it) {
   2585     InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
   2586 
   2587     if (!Insert)
   2588       continue;
   2589 
   2590     // Check if this block is inside a loop.
   2591     Loop *L = LI->getLoopFor(Insert->getParent());
   2592     if (!L)
   2593       continue;
   2594 
   2595     // Check if it has a preheader.
   2596     BasicBlock *PreHeader = L->getLoopPreheader();
   2597     if (!PreHeader)
   2598       continue;
   2599 
    // If the vector or the element that we insert into it is itself an
    // instruction defined inside the loop, then we can't hoist the insert.
   2603     Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
   2604     Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
   2605     if (CurrVec && L->contains(CurrVec))
   2606       continue;
   2607     if (NewElem && L->contains(NewElem))
   2608       continue;
   2609 
   2610     // We can hoist this instruction. Move it to the pre-header.
   2611     Insert->moveBefore(PreHeader->getTerminator());
   2612   }
   2613 
   2614   // Make a list of all reachable blocks in our CSE queue.
   2615   SmallVector<const DomTreeNode *, 8> CSEWorkList;
   2616   CSEWorkList.reserve(CSEBlocks.size());
   2617   for (BasicBlock *BB : CSEBlocks)
   2618     if (DomTreeNode *N = DT->getNode(BB)) {
   2619       assert(DT->isReachableFromEntry(N));
   2620       CSEWorkList.push_back(N);
   2621     }
   2622 
   2623   // Sort blocks by domination. This ensures we visit a block after all blocks
   2624   // dominating it are visited.
   2625   std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
   2626                    [this](const DomTreeNode *A, const DomTreeNode *B) {
   2627     return DT->properlyDominates(A, B);
   2628   });
   2629 
   2630   // Perform O(N^2) search over the gather sequences and merge identical
   2631   // instructions. TODO: We can further optimize this scan if we split the
   2632   // instructions into different buckets based on the insert lane.
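           // As a hypothetical illustration, two identical extracts where one
           // block dominates the other:
           //   %e1 = extractelement <4 x i32> %v, i32 0  ; dominating block
           //   %e2 = extractelement <4 x i32> %v, i32 0  ; dominated block
           // let %e2 be replaced by %e1 and erased.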
   2633   SmallVector<Instruction *, 16> Visited;
   2634   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
   2635     assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
   2636            "Worklist not sorted properly!");
   2637     BasicBlock *BB = (*I)->getBlock();
   2638     // For all instructions in blocks containing gather sequences:
   2639     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
   2640       Instruction *In = it++;
   2641       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
   2642         continue;
   2643 
   2644       // Check if we can replace this instruction with any of the
   2645       // visited instructions.
   2646       for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
   2647                                                     ve = Visited.end();
   2648            v != ve; ++v) {
   2649         if (In->isIdenticalTo(*v) &&
   2650             DT->dominates((*v)->getParent(), In->getParent())) {
   2651           In->replaceAllUsesWith(*v);
   2652           eraseInstruction(In);
   2653           In = nullptr;
   2654           break;
   2655         }
   2656       }
   2657       if (In) {
   2658         assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
   2659         Visited.push_back(In);
   2660       }
   2661     }
   2662   }
   2663   CSEBlocks.clear();
   2664   GatherSeq.clear();
   2665 }
   2666 
    2667 // Groups the instructions into a bundle (which is then a single scheduling
    2668 // entity) and schedules instructions until the bundle is ready.
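         // For example, for a bundle of two loads VL = {L0, L1} the loop below
         // links the ScheduleData nodes so that L0->NextInBundle == L1 and both
         // nodes' FirstInBundle point at L0; the pair is then scheduled as one
         // unit. (Illustration only; the names L0 and L1 are hypothetical.)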
   2669 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
   2670                                                  BoUpSLP *SLP) {
   2671   if (isa<PHINode>(VL[0]))
   2672     return true;
   2673 
   2674   // Initialize the instruction bundle.
   2675   Instruction *OldScheduleEnd = ScheduleEnd;
   2676   ScheduleData *PrevInBundle = nullptr;
   2677   ScheduleData *Bundle = nullptr;
   2678   bool ReSchedule = false;
   2679   DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
   2680   for (Value *V : VL) {
   2681     extendSchedulingRegion(V);
   2682     ScheduleData *BundleMember = getScheduleData(V);
   2683     assert(BundleMember &&
   2684            "no ScheduleData for bundle member (maybe not in same basic block)");
   2685     if (BundleMember->IsScheduled) {
   2686       // A bundle member was scheduled as single instruction before and now
   2687       // needs to be scheduled as part of the bundle. We just get rid of the
   2688       // existing schedule.
   2689       DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
   2690                    << " was already scheduled\n");
   2691       ReSchedule = true;
   2692     }
   2693     assert(BundleMember->isSchedulingEntity() &&
   2694            "bundle member already part of other bundle");
   2695     if (PrevInBundle) {
   2696       PrevInBundle->NextInBundle = BundleMember;
   2697     } else {
   2698       Bundle = BundleMember;
   2699     }
   2700     BundleMember->UnscheduledDepsInBundle = 0;
   2701     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
   2702 
   2703     // Group the instructions to a bundle.
   2704     BundleMember->FirstInBundle = Bundle;
   2705     PrevInBundle = BundleMember;
   2706   }
   2707   if (ScheduleEnd != OldScheduleEnd) {
   2708     // The scheduling region got new instructions at the lower end (or it is a
   2709     // new region for the first bundle). This makes it necessary to
   2710     // recalculate all dependencies.
   2711     // It is seldom that this needs to be done a second time after adding the
   2712     // initial bundle to the region.
   2713     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
   2714       ScheduleData *SD = getScheduleData(I);
   2715       SD->clearDependencies();
   2716     }
   2717     ReSchedule = true;
   2718   }
   2719   if (ReSchedule) {
   2720     resetSchedule();
   2721     initialFillReadyList(ReadyInsts);
   2722   }
   2723 
   2724   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
   2725                << BB->getName() << "\n");
   2726 
   2727   calculateDependencies(Bundle, true, SLP);
   2728 
   2729   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
   2730   // means that there are no cyclic dependencies and we can schedule it.
    2731   // Note that it's important that we don't "schedule" the bundle yet (see
    2732   // cancelScheduling).
   2733   while (!Bundle->isReady() && !ReadyInsts.empty()) {
   2734 
   2735     ScheduleData *pickedSD = ReadyInsts.back();
   2736     ReadyInsts.pop_back();
   2737 
   2738     if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
   2739       schedule(pickedSD, ReadyInsts);
   2740     }
   2741   }
   2742   return Bundle->isReady();
   2743 }
   2744 
   2745 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
   2746   if (isa<PHINode>(VL[0]))
   2747     return;
   2748 
   2749   ScheduleData *Bundle = getScheduleData(VL[0]);
   2750   DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
   2751   assert(!Bundle->IsScheduled &&
   2752          "Can't cancel bundle which is already scheduled");
   2753   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
   2754          "tried to unbundle something which is not a bundle");
   2755 
   2756   // Un-bundle: make single instructions out of the bundle.
   2757   ScheduleData *BundleMember = Bundle;
   2758   while (BundleMember) {
   2759     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
   2760     BundleMember->FirstInBundle = BundleMember;
   2761     ScheduleData *Next = BundleMember->NextInBundle;
   2762     BundleMember->NextInBundle = nullptr;
   2763     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
   2764     if (BundleMember->UnscheduledDepsInBundle == 0) {
   2765       ReadyInsts.insert(BundleMember);
   2766     }
   2767     BundleMember = Next;
   2768   }
   2769 }
   2770 
   2771 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
   2772   if (getScheduleData(V))
   2773     return;
   2774   Instruction *I = dyn_cast<Instruction>(V);
   2775   assert(I && "bundle member must be an instruction");
   2776   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
   2777   if (!ScheduleStart) {
   2778     // It's the first instruction in the new region.
   2779     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
   2780     ScheduleStart = I;
   2781     ScheduleEnd = I->getNextNode();
   2782     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
   2783     DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
   2784     return;
   2785   }
   2786   // Search up and down at the same time, because we don't know if the new
   2787   // instruction is above or below the existing scheduling region.
   2788   BasicBlock::reverse_iterator UpIter(ScheduleStart);
   2789   BasicBlock::reverse_iterator UpperEnd = BB->rend();
   2790   BasicBlock::iterator DownIter(ScheduleEnd);
   2791   BasicBlock::iterator LowerEnd = BB->end();
   2792   for (;;) {
   2793     if (UpIter != UpperEnd) {
   2794       if (&*UpIter == I) {
   2795         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
   2796         ScheduleStart = I;
   2797         DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
   2798         return;
   2799       }
   2800       UpIter++;
   2801     }
   2802     if (DownIter != LowerEnd) {
   2803       if (&*DownIter == I) {
   2804         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
   2805                          nullptr);
   2806         ScheduleEnd = I->getNextNode();
   2807         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
   2808         DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
   2809         return;
   2810       }
   2811       DownIter++;
   2812     }
   2813     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
   2814            "instruction not found in block");
   2815   }
   2816 }
   2817 
   2818 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
   2819                                                 Instruction *ToI,
   2820                                                 ScheduleData *PrevLoadStore,
   2821                                                 ScheduleData *NextLoadStore) {
   2822   ScheduleData *CurrentLoadStore = PrevLoadStore;
   2823   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
   2824     ScheduleData *SD = ScheduleDataMap[I];
   2825     if (!SD) {
   2826       // Allocate a new ScheduleData for the instruction.
   2827       if (ChunkPos >= ChunkSize) {
   2828         ScheduleDataChunks.push_back(
   2829             llvm::make_unique<ScheduleData[]>(ChunkSize));
   2830         ChunkPos = 0;
   2831       }
   2832       SD = &(ScheduleDataChunks.back()[ChunkPos++]);
   2833       ScheduleDataMap[I] = SD;
   2834       SD->Inst = I;
   2835     }
   2836     assert(!isInSchedulingRegion(SD) &&
   2837            "new ScheduleData already in scheduling region");
   2838     SD->init(SchedulingRegionID);
   2839 
   2840     if (I->mayReadOrWriteMemory()) {
   2841       // Update the linked list of memory accessing instructions.
   2842       if (CurrentLoadStore) {
   2843         CurrentLoadStore->NextLoadStore = SD;
   2844       } else {
   2845         FirstLoadStoreInRegion = SD;
   2846       }
   2847       CurrentLoadStore = SD;
   2848     }
   2849   }
   2850   if (NextLoadStore) {
   2851     if (CurrentLoadStore)
   2852       CurrentLoadStore->NextLoadStore = NextLoadStore;
   2853   } else {
   2854     LastLoadStoreInRegion = CurrentLoadStore;
   2855   }
   2856 }
   2857 
   2858 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
   2859                                                      bool InsertInReadyList,
   2860                                                      BoUpSLP *SLP) {
   2861   assert(SD->isSchedulingEntity());
   2862 
   2863   SmallVector<ScheduleData *, 10> WorkList;
   2864   WorkList.push_back(SD);
   2865 
   2866   while (!WorkList.empty()) {
   2867     ScheduleData *SD = WorkList.back();
   2868     WorkList.pop_back();
   2869 
   2870     ScheduleData *BundleMember = SD;
   2871     while (BundleMember) {
   2872       assert(isInSchedulingRegion(BundleMember));
   2873       if (!BundleMember->hasValidDependencies()) {
   2874 
   2875         DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
   2876         BundleMember->Dependencies = 0;
   2877         BundleMember->resetUnscheduledDeps();
   2878 
   2879         // Handle def-use chain dependencies.
   2880         for (User *U : BundleMember->Inst->users()) {
   2881           if (isa<Instruction>(U)) {
   2882             ScheduleData *UseSD = getScheduleData(U);
   2883             if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
   2884               BundleMember->Dependencies++;
   2885               ScheduleData *DestBundle = UseSD->FirstInBundle;
   2886               if (!DestBundle->IsScheduled) {
   2887                 BundleMember->incrementUnscheduledDeps(1);
   2888               }
   2889               if (!DestBundle->hasValidDependencies()) {
   2890                 WorkList.push_back(DestBundle);
   2891               }
   2892             }
   2893           } else {
    2894             // I'm not sure if this can ever happen. But we need to be safe.
    2895             // This prevents the instruction/bundle from ever being scheduled,
    2896             // which eventually disables vectorization.
   2897             BundleMember->Dependencies++;
   2898             BundleMember->incrementUnscheduledDeps(1);
   2899           }
   2900         }
   2901 
   2902         // Handle the memory dependencies.
   2903         ScheduleData *DepDest = BundleMember->NextLoadStore;
   2904         if (DepDest) {
   2905           Instruction *SrcInst = BundleMember->Inst;
   2906           AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
   2907           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
   2908           unsigned numAliased = 0;
   2909           unsigned DistToSrc = 1;
   2910 
   2911           while (DepDest) {
   2912             assert(isInSchedulingRegion(DepDest));
   2913 
   2914             // We have two limits to reduce the complexity:
   2915             // 1) AliasedCheckLimit: It's a small limit to reduce calls to
   2916             //    SLP->isAliased (which is the expensive part in this loop).
   2917             // 2) MaxMemDepDistance: It's for very large blocks and it aborts
   2918             //    the whole loop (even if the loop is fast, it's quadratic).
   2919             //    It's important for the loop break condition (see below) to
   2920             //    check this limit even between two read-only instructions.
   2921             if (DistToSrc >= MaxMemDepDistance ||
   2922                     ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
   2923                      (numAliased >= AliasedCheckLimit ||
   2924                       SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
   2925 
   2926               // We increment the counter only if the locations are aliased
   2927               // (instead of counting all alias checks). This gives a better
   2928               // balance between reduced runtime and accurate dependencies.
   2929               numAliased++;
   2930 
   2931               DepDest->MemoryDependencies.push_back(BundleMember);
   2932               BundleMember->Dependencies++;
   2933               ScheduleData *DestBundle = DepDest->FirstInBundle;
   2934               if (!DestBundle->IsScheduled) {
   2935                 BundleMember->incrementUnscheduledDeps(1);
   2936               }
   2937               if (!DestBundle->hasValidDependencies()) {
   2938                 WorkList.push_back(DestBundle);
   2939               }
   2940             }
   2941             DepDest = DepDest->NextLoadStore;
   2942 
   2943             // Example, explaining the loop break condition: Let's assume our
   2944             // starting instruction is i0 and MaxMemDepDistance = 3.
   2945             //
   2946             //                      +--------v--v--v
   2947             //             i0,i1,i2,i3,i4,i5,i6,i7,i8
   2948             //             +--------^--^--^
   2949             //
   2950             // MaxMemDepDistance let us stop alias-checking at i3 and we add
   2951             // dependencies from i0 to i3,i4,.. (even if they are not aliased).
   2952             // Previously we already added dependencies from i3 to i6,i7,i8
   2953             // (because of MaxMemDepDistance). As we added a dependency from
   2954             // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
   2955             // and we can abort this loop at i6.
   2956             if (DistToSrc >= 2 * MaxMemDepDistance)
   2957                 break;
   2958             DistToSrc++;
   2959           }
   2960         }
   2961       }
   2962       BundleMember = BundleMember->NextInBundle;
   2963     }
   2964     if (InsertInReadyList && SD->isReady()) {
   2965       ReadyInsts.push_back(SD);
   2966       DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
   2967     }
   2968   }
   2969 }
   2970 
   2971 void BoUpSLP::BlockScheduling::resetSchedule() {
   2972   assert(ScheduleStart &&
   2973          "tried to reset schedule on block which has not been scheduled");
   2974   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
   2975     ScheduleData *SD = getScheduleData(I);
   2976     assert(isInSchedulingRegion(SD));
   2977     SD->IsScheduled = false;
   2978     SD->resetUnscheduledDeps();
   2979   }
   2980   ReadyInsts.clear();
   2981 }
   2982 
   2983 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   2984 
   2985   if (!BS->ScheduleStart)
   2986     return;
   2987 
   2988   DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
   2989 
   2990   BS->resetSchedule();
   2991 
   2992   // For the real scheduling we use a more sophisticated ready-list: it is
   2993   // sorted by the original instruction location. This lets the final schedule
    2994   // be as close as possible to the original instruction order.
   2995   struct ScheduleDataCompare {
   2996     bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
   2997       return SD2->SchedulingPriority < SD1->SchedulingPriority;
   2998     }
   2999   };
   3000   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
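           // The set keeps the ready bundles ordered by SchedulingPriority, so
           // the scheduling loop below can repeatedly pick from one end of the
           // ready-list without re-sorting it.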
   3001 
    3002   // Ensure that all dependency data is updated and fill the ready-list with
   3003   // initial instructions.
   3004   int Idx = 0;
   3005   int NumToSchedule = 0;
   3006   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
   3007        I = I->getNextNode()) {
   3008     ScheduleData *SD = BS->getScheduleData(I);
   3009     assert(
   3010         SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
   3011         "scheduler and vectorizer have different opinion on what is a bundle");
   3012     SD->FirstInBundle->SchedulingPriority = Idx++;
   3013     if (SD->isSchedulingEntity()) {
   3014       BS->calculateDependencies(SD, false, this);
   3015       NumToSchedule++;
   3016     }
   3017   }
   3018   BS->initialFillReadyList(ReadyInsts);
   3019 
   3020   Instruction *LastScheduledInst = BS->ScheduleEnd;
   3021 
   3022   // Do the "real" scheduling.
   3023   while (!ReadyInsts.empty()) {
   3024     ScheduleData *picked = *ReadyInsts.begin();
   3025     ReadyInsts.erase(ReadyInsts.begin());
   3026 
   3027     // Move the scheduled instruction(s) to their dedicated places, if not
   3028     // there yet.
   3029     ScheduleData *BundleMember = picked;
   3030     while (BundleMember) {
   3031       Instruction *pickedInst = BundleMember->Inst;
   3032       if (LastScheduledInst->getNextNode() != pickedInst) {
   3033         BS->BB->getInstList().remove(pickedInst);
   3034         BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
   3035       }
   3036       LastScheduledInst = pickedInst;
   3037       BundleMember = BundleMember->NextInBundle;
   3038     }
   3039 
   3040     BS->schedule(picked, ReadyInsts);
   3041     NumToSchedule--;
   3042   }
   3043   assert(NumToSchedule == 0 && "could not schedule all instructions");
   3044 
   3045   // Avoid duplicate scheduling of the block.
   3046   BS->ScheduleStart = nullptr;
   3047 }
   3048 
   3049 /// The SLPVectorizer Pass.
   3050 struct SLPVectorizer : public FunctionPass {
   3051   typedef SmallVector<StoreInst *, 8> StoreList;
   3052   typedef MapVector<Value *, StoreList> StoreListMap;
   3053 
   3054   /// Pass identification, replacement for typeid
   3055   static char ID;
   3056 
   3057   explicit SLPVectorizer() : FunctionPass(ID) {
   3058     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
   3059   }
   3060 
   3061   ScalarEvolution *SE;
   3062   TargetTransformInfo *TTI;
   3063   TargetLibraryInfo *TLI;
   3064   AliasAnalysis *AA;
   3065   LoopInfo *LI;
   3066   DominatorTree *DT;
   3067   AssumptionCache *AC;
   3068 
   3069   bool runOnFunction(Function &F) override {
   3070     if (skipOptnoneFunction(F))
   3071       return false;
   3072 
   3073     SE = &getAnalysis<ScalarEvolution>();
   3074     TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   3075     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   3076     TLI = TLIP ? &TLIP->getTLI() : nullptr;
   3077     AA = &getAnalysis<AliasAnalysis>();
   3078     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   3079     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   3080     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   3081 
   3082     StoreRefs.clear();
   3083     bool Changed = false;
   3084 
   3085     // If the target claims to have no vector registers don't attempt
   3086     // vectorization.
   3087     if (!TTI->getNumberOfRegisters(true))
   3088       return false;
   3089 
   3090     // Don't vectorize when the attribute NoImplicitFloat is used.
   3091     if (F.hasFnAttribute(Attribute::NoImplicitFloat))
   3092       return false;
   3093 
   3094     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
   3095 
    3096     // Use the bottom-up SLP vectorizer to construct chains that start with
   3097     // store instructions.
   3098     BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
   3099 
   3100     // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
   3101     // delete instructions.
   3102 
   3103     // Scan the blocks in the function in post order.
   3104     for (auto BB : post_order(&F.getEntryBlock())) {
   3105       // Vectorize trees that end at stores.
   3106       if (unsigned count = collectStores(BB, R)) {
   3107         (void)count;
   3108         DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
   3109         Changed |= vectorizeStoreChains(R);
   3110       }
   3111 
   3112       // Vectorize trees that end at reductions.
   3113       Changed |= vectorizeChainsInBlock(BB, R);
   3114     }
   3115 
   3116     if (Changed) {
   3117       R.optimizeGatherSequence();
   3118       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
   3119       DEBUG(verifyFunction(F));
   3120     }
   3121     return Changed;
   3122   }
   3123 
   3124   void getAnalysisUsage(AnalysisUsage &AU) const override {
   3125     FunctionPass::getAnalysisUsage(AU);
   3126     AU.addRequired<AssumptionCacheTracker>();
   3127     AU.addRequired<ScalarEvolution>();
   3128     AU.addRequired<AliasAnalysis>();
   3129     AU.addRequired<TargetTransformInfoWrapperPass>();
   3130     AU.addRequired<LoopInfoWrapperPass>();
   3131     AU.addRequired<DominatorTreeWrapperPass>();
   3132     AU.addPreserved<LoopInfoWrapperPass>();
   3133     AU.addPreserved<DominatorTreeWrapperPass>();
   3134     AU.setPreservesCFG();
   3135   }
   3136 
   3137 private:
   3138 
    3139   /// \brief Collect memory references and group them according to their base
    3140   /// object. We group the stores by their base objects to reduce the cost of
    3141   /// the quadratic search on the stores. TODO: We can further reduce this cost
    3142   /// if we flush the chain creation every time we run into a memory barrier.
   3143   unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
   3144 
   3145   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
   3146   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
   3147 
   3148   /// \brief Try to vectorize a list of operands.
    3149   /// \param BuildVector A list of users to ignore for the purpose of
   3150   ///                     scheduling and that don't need extracting.
   3151   /// \returns true if a value was vectorized.
   3152   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   3153                           ArrayRef<Value *> BuildVector = None,
   3154                           bool allowReorder = false);
   3155 
    3156   /// \brief Try to vectorize a chain that may start at the operands of \p V.
   3157   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
   3158 
   3159   /// \brief Vectorize the stores that were collected in StoreRefs.
   3160   bool vectorizeStoreChains(BoUpSLP &R);
   3161 
   3162   /// \brief Scan the basic block and look for patterns that are likely to start
   3163   /// a vectorization chain.
   3164   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
   3165 
   3166   bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
   3167                            BoUpSLP &R);
   3168 
   3169   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
   3170                        BoUpSLP &R);
   3171 private:
   3172   StoreListMap StoreRefs;
   3173 };
   3174 
   3175 /// \brief Check that the Values in the slice in VL array are still existent in
   3176 /// the WeakVH array.
   3177 /// Vectorization of part of the VL array may cause later values in the VL array
   3178 /// to become invalid. We track when this has happened in the WeakVH array.
   3179 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
   3180                                unsigned SliceBegin, unsigned SliceSize) {
   3181   VL = VL.slice(SliceBegin, SliceSize);
   3182   VH = VH.slice(SliceBegin, SliceSize);
   3183   return !std::equal(VL.begin(), VL.end(), VH.begin());
   3184 }
   3185 
   3186 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
   3187                                           int CostThreshold, BoUpSLP &R) {
   3188   unsigned ChainLen = Chain.size();
   3189   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
   3190         << "\n");
   3191   Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
   3192   auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
   3193   unsigned Sz = DL.getTypeSizeInBits(StoreTy);
   3194   unsigned VF = MinVecRegSize / Sz;
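           // For example, with MinVecRegSize == 128 a chain of i32 stores
           // (Sz == 32) is examined with VF == 4, and a chain of i64 stores
           // (Sz == 64) with VF == 2.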
   3195 
   3196   if (!isPowerOf2_32(Sz) || VF < 2)
   3197     return false;
   3198 
   3199   // Keep track of values that were deleted by vectorizing in the loop below.
   3200   SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
   3201 
   3202   bool Changed = false;
   3203   // Look for profitable vectorizable trees at all offsets, starting at zero.
   3204   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
   3205     if (i + VF > e)
   3206       break;
   3207 
   3208     // Check that a previous iteration of this loop did not delete the Value.
   3209     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
   3210       continue;
   3211 
   3212     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
   3213           << "\n");
   3214     ArrayRef<Value *> Operands = Chain.slice(i, VF);
   3215 
   3216     R.buildTree(Operands);
   3217 
   3218     int Cost = R.getTreeCost();
   3219 
   3220     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
   3221     if (Cost < CostThreshold) {
   3222       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
   3223       R.vectorizeTree();
   3224 
   3225       // Move to the next bundle.
   3226       i += VF - 1;
   3227       Changed = true;
   3228     }
   3229   }
   3230 
   3231   return Changed;
   3232 }
   3233 
   3234 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
   3235                                     int costThreshold, BoUpSLP &R) {
   3236   SetVector<StoreInst *> Heads, Tails;
   3237   SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
   3238 
   3239   // We may run into multiple chains that merge into a single chain. We mark the
   3240   // stores that we vectorized so that we don't visit the same store twice.
   3241   BoUpSLP::ValueSet VectorizedStores;
   3242   bool Changed = false;
   3243 
   3244   // Do a quadratic search on all of the given stores and find
   3245   // all of the pairs of stores that follow each other.
   3246   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
   3247     for (unsigned j = 0; j < e; ++j) {
   3248       if (i == j)
   3249         continue;
   3250       const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
   3251       if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
   3252         Tails.insert(Stores[j]);
   3253         Heads.insert(Stores[i]);
   3254         ConsecutiveChain[Stores[i]] = Stores[j];
   3255       }
   3256     }
   3257   }
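           // As a hypothetical illustration, for consecutive stores to A[0],
           // A[1] and A[2] the loops above produce the links A[0]->A[1] and
           // A[1]->A[2]; A[0] ends up in Heads but not in Tails, so the walk
           // below starts there and collects the whole chain.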
   3258 
   3259   // For stores that start but don't end a link in the chain:
   3260   for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
   3261        it != e; ++it) {
   3262     if (Tails.count(*it))
   3263       continue;
   3264 
   3265     // We found a store instr that starts a chain. Now follow the chain and try
   3266     // to vectorize it.
   3267     BoUpSLP::ValueList Operands;
   3268     StoreInst *I = *it;
   3269     // Collect the chain into a list.
   3270     while (Tails.count(I) || Heads.count(I)) {
   3271       if (VectorizedStores.count(I))
   3272         break;
   3273       Operands.push_back(I);
   3274       // Move to the next value in the chain.
   3275       I = ConsecutiveChain[I];
   3276     }
   3277 
   3278     bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
   3279 
   3280     // Mark the vectorized stores so that we don't vectorize them again.
   3281     if (Vectorized)
   3282       VectorizedStores.insert(Operands.begin(), Operands.end());
   3283     Changed |= Vectorized;
   3284   }
   3285 
   3286   return Changed;
   3287 }
   3288 
   3289 
   3290 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
   3291   unsigned count = 0;
   3292   StoreRefs.clear();
   3293   const DataLayout &DL = BB->getModule()->getDataLayout();
   3294   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
   3295     StoreInst *SI = dyn_cast<StoreInst>(it);
   3296     if (!SI)
   3297       continue;
   3298 
   3299     // Don't touch volatile stores.
   3300     if (!SI->isSimple())
   3301       continue;
   3302 
   3303     // Check that the pointer points to scalars.
   3304     Type *Ty = SI->getValueOperand()->getType();
   3305     if (!isValidElementType(Ty))
   3306       continue;
   3307 
   3308     // Find the base pointer.
   3309     Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
   3310 
   3311     // Save the store locations.
   3312     StoreRefs[Ptr].push_back(SI);
   3313     count++;
   3314   }
   3315   return count;
   3316 }
   3317 
   3318 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
   3319   if (!A || !B)
   3320     return false;
   3321   Value *VL[] = { A, B };
   3322   return tryToVectorizeList(VL, R, None, true);
   3323 }
   3324 
   3325 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
   3326                                        ArrayRef<Value *> BuildVector,
   3327                                        bool allowReorder) {
   3328   if (VL.size() < 2)
   3329     return false;
   3330 
   3331   DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
   3332 
   3333   // Check that all of the parts are scalar instructions of the same type.
   3334   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
   3335   if (!I0)
   3336     return false;
   3337 
   3338   unsigned Opcode0 = I0->getOpcode();
   3339   const DataLayout &DL = I0->getModule()->getDataLayout();
   3340 
   3341   Type *Ty0 = I0->getType();
   3342   unsigned Sz = DL.getTypeSizeInBits(Ty0);
   3343   unsigned VF = MinVecRegSize / Sz;
   3344 
   3345   for (int i = 0, e = VL.size(); i < e; ++i) {
   3346     Type *Ty = VL[i]->getType();
   3347     if (!isValidElementType(Ty))
   3348       return false;
   3349     Instruction *Inst = dyn_cast<Instruction>(VL[i]);
   3350     if (!Inst || Inst->getOpcode() != Opcode0)
   3351       return false;
   3352   }
   3353 
   3354   bool Changed = false;
   3355 
   3356   // Keep track of values that were deleted by vectorizing in the loop below.
   3357   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
   3358 
   3359   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
   3360     unsigned OpsWidth = 0;
   3361 
   3362     if (i + VF > e)
   3363       OpsWidth = e - i;
   3364     else
   3365       OpsWidth = VF;
   3366 
   3367     if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
   3368       break;
   3369 
   3370     // Check that a previous iteration of this loop did not delete the Value.
   3371     if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
   3372       continue;
   3373 
   3374     DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
   3375                  << "\n");
   3376     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
   3377 
   3378     ArrayRef<Value *> BuildVectorSlice;
   3379     if (!BuildVector.empty())
   3380       BuildVectorSlice = BuildVector.slice(i, OpsWidth);
   3381 
   3382     R.buildTree(Ops, BuildVectorSlice);
    3383     // TODO: check if we can also allow reordering for cases other than
    3384     // tryToVectorizePair().
   3385     if (allowReorder && R.shouldReorder()) {
   3386       assert(Ops.size() == 2);
   3387       assert(BuildVectorSlice.empty());
   3388       Value *ReorderedOps[] = { Ops[1], Ops[0] };
   3389       R.buildTree(ReorderedOps, None);
   3390     }
   3391     int Cost = R.getTreeCost();
   3392 
   3393     if (Cost < -SLPCostThreshold) {
   3394       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
   3395       Value *VectorizedRoot = R.vectorizeTree();
   3396 
   3397       // Reconstruct the build vector by extracting the vectorized root. This
   3398       // way we handle the case where some elements of the vector are undefined.
    3399       //  (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
   3400       if (!BuildVectorSlice.empty()) {
   3401         // The insert point is the last build vector instruction. The vectorized
   3402         // root will precede it. This guarantees that we get an instruction. The
   3403         // vectorized tree could have been constant folded.
   3404         Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
   3405         unsigned VecIdx = 0;
   3406         for (auto &V : BuildVectorSlice) {
   3407           IRBuilder<true, NoFolder> Builder(
   3408               ++BasicBlock::iterator(InsertAfter));
   3409           InsertElementInst *IE = cast<InsertElementInst>(V);
   3410           Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
   3411               VectorizedRoot, Builder.getInt32(VecIdx++)));
   3412           IE->setOperand(1, Extract);
   3413           IE->removeFromParent();
   3414           IE->insertAfter(Extract);
   3415           InsertAfter = IE;
   3416         }
   3417       }
   3418       // Move to the next bundle.
   3419       i += VF - 1;
   3420       Changed = true;
   3421     }
   3422   }
   3423 
   3424   return Changed;
   3425 }
   3426 
   3427 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
   3428   if (!V)
   3429     return false;
   3430 
   3431   // Try to vectorize V.
   3432   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
   3433     return true;
   3434 
   3435   BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
   3436   BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
   3437   // Try to skip B.
   3438   if (B && B->hasOneUse()) {
   3439     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
   3440     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
   3441     if (tryToVectorizePair(A, B0, R)) {
   3442       return true;
   3443     }
   3444     if (tryToVectorizePair(A, B1, R)) {
   3445       return true;
   3446     }
   3447   }
   3448 
   3449   // Try to skip A.
   3450   if (A && A->hasOneUse()) {
   3451     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
   3452     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
   3453     if (tryToVectorizePair(A0, B, R)) {
   3454       return true;
   3455     }
   3456     if (tryToVectorizePair(A1, B, R)) {
   3457       return true;
   3458     }
   3459   }
    3460   return false;
   3461 }
   3462 
   3463 /// \brief Generate a shuffle mask to be used in a reduction tree.
   3464 ///
   3465 /// \param VecLen The length of the vector to be reduced.
   3466 /// \param NumEltsToRdx The number of elements that should be reduced in the
   3467 ///        vector.
   3468 /// \param IsPairwise Whether the reduction is a pairwise or splitting
   3469 ///        reduction. A pairwise reduction will generate a mask of
   3470 ///        <0,2,...> or <1,3,..> while a splitting reduction will generate
   3471 ///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
   3472 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
   3473 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
   3474                                    bool IsPairwise, bool IsLeft,
   3475                                    IRBuilder<> &Builder) {
   3476   assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
   3477 
   3478   SmallVector<Constant *, 32> ShuffleMask(
   3479       VecLen, UndefValue::get(Builder.getInt32Ty()));
   3480 
   3481   if (IsPairwise)
   3482     // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
   3483     for (unsigned i = 0; i != NumEltsToRdx; ++i)
   3484       ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
   3485   else
   3486     // Move the upper half of the vector to the lower half.
   3487     for (unsigned i = 0; i != NumEltsToRdx; ++i)
   3488       ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
   3489 
   3490   return ConstantVector::get(ShuffleMask);
   3491 }
   3492 
   3493 
   3494 /// Model horizontal reductions.
   3495 ///
    3496 /// A horizontal reduction is a tree of reduction operations (currently add and
    3497 /// fadd) whose leaves are operations that can be put into a vector.
   3498 /// For example, this tree:
   3499 ///
   3500 /// mul mul mul mul
   3501 ///  \  /    \  /
   3502 ///   +       +
   3503 ///    \     /
   3504 ///       +
   3505 /// This tree has "mul" as its reduced values and "+" as its reduction
   3506 /// operations. A reduction might be feeding into a store or a binary operation
   3507 /// feeding a phi.
   3508 ///    ...
   3509 ///    \  /
   3510 ///     +
   3511 ///     |
   3512 ///  phi +=
   3513 ///
   3514 ///  Or:
   3515 ///    ...
   3516 ///    \  /
   3517 ///     +
   3518 ///     |
   3519 ///   *p =
   3520 ///
   3521 class HorizontalReduction {
   3522   SmallVector<Value *, 16> ReductionOps;
   3523   SmallVector<Value *, 32> ReducedVals;
   3524 
   3525   BinaryOperator *ReductionRoot;
   3526   PHINode *ReductionPHI;
   3527 
   3528   /// The opcode of the reduction.
   3529   unsigned ReductionOpcode;
   3530   /// The opcode of the values we perform a reduction on.
   3531   unsigned ReducedValueOpcode;
   3532   /// The width of one full horizontal reduction operation.
   3533   unsigned ReduxWidth;
   3534   /// Should we model this reduction as a pairwise reduction tree or a tree that
   3535   /// splits the vector in halves and adds those halves.
   3536   bool IsPairwiseReduction;
   3537 
   3538 public:
   3539   HorizontalReduction()
   3540     : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
   3541     ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
   3542 
   3543   /// \brief Try to find a reduction tree.
   3544   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
   3545     assert((!Phi ||
   3546             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
    3547            "This phi needs to use the binary operator");
   3548 
    3549     // We could have an initial reduction that is not an add.
   3550     //  r *= v1 + v2 + v3 + v4
   3551     // In such a case start looking for a tree rooted in the first '+'.
   3552     if (Phi) {
   3553       if (B->getOperand(0) == Phi) {
   3554         Phi = nullptr;
   3555         B = dyn_cast<BinaryOperator>(B->getOperand(1));
   3556       } else if (B->getOperand(1) == Phi) {
   3557         Phi = nullptr;
   3558         B = dyn_cast<BinaryOperator>(B->getOperand(0));
   3559       }
   3560     }
   3561 
   3562     if (!B)
   3563       return false;
   3564 
   3565     Type *Ty = B->getType();
   3566     if (!isValidElementType(Ty))
   3567       return false;
   3568 
   3569     const DataLayout &DL = B->getModule()->getDataLayout();
   3570     ReductionOpcode = B->getOpcode();
   3571     ReducedValueOpcode = 0;
   3572     ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
   3573     ReductionRoot = B;
   3574     ReductionPHI = Phi;
   3575 
   3576     if (ReduxWidth < 4)
   3577       return false;
   3578 
   3579     // We currently only support adds.
   3580     if (ReductionOpcode != Instruction::Add &&
   3581         ReductionOpcode != Instruction::FAdd)
   3582       return false;
   3583 
   3584     // Post order traverse the reduction tree starting at B. We only handle true
   3585     // trees containing only binary operators.
   3586     SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
   3587     Stack.push_back(std::make_pair(B, 0));
   3588     while (!Stack.empty()) {
   3589       BinaryOperator *TreeN = Stack.back().first;
    3590       unsigned EdgeToVisit = Stack.back().second++;
   3591       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
   3592 
   3593       // Only handle trees in the current basic block.
   3594       if (TreeN->getParent() != B->getParent())
   3595         return false;
   3596 
   3597       // Each tree node needs to have one user except for the ultimate
   3598       // reduction.
   3599       if (!TreeN->hasOneUse() && TreeN != B)
   3600         return false;
   3601 
    3602       // Postorder visit.
    3603       if (EdgeToVisit == 2 || IsReducedValue) {
   3604         if (IsReducedValue) {
   3605           // Make sure that the opcodes of the operations that we are going to
   3606           // reduce match.
   3607           if (!ReducedValueOpcode)
   3608             ReducedValueOpcode = TreeN->getOpcode();
   3609           else if (ReducedValueOpcode != TreeN->getOpcode())
   3610             return false;
   3611           ReducedVals.push_back(TreeN);
   3612         } else {
   3613           // We need to be able to reassociate the adds.
   3614           if (!TreeN->isAssociative())
   3615             return false;
   3616           ReductionOps.push_back(TreeN);
   3617         }
   3618         // Retract.
   3619         Stack.pop_back();
   3620         continue;
   3621       }
   3622 
   3623       // Visit left or right.
    3624       Value *NextV = TreeN->getOperand(EdgeToVisit);
   3625       BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
   3626       if (Next)
   3627         Stack.push_back(std::make_pair(Next, 0));
   3628       else if (NextV != Phi)
   3629         return false;
   3630     }
   3631     return true;
   3632   }
   3633 
   3634   /// \brief Attempt to vectorize the tree found by
   3635   /// matchAssociativeReduction.
   3636   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
   3637     if (ReducedVals.empty())
   3638       return false;
   3639 
   3640     unsigned NumReducedVals = ReducedVals.size();
   3641     if (NumReducedVals < ReduxWidth)
   3642       return false;
   3643 
   3644     Value *VectorizedTree = nullptr;
   3645     IRBuilder<> Builder(ReductionRoot);
   3646     FastMathFlags Unsafe;
   3647     Unsafe.setUnsafeAlgebra();
   3648     Builder.SetFastMathFlags(Unsafe);
   3649     unsigned i = 0;
   3650 
   3651     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
   3652       V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
   3653 
   3654       // Estimate cost.
   3655       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
   3656       if (Cost >= -SLPCostThreshold)
   3657         break;
   3658 
   3659       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
   3660                    << ". (HorRdx)\n");
   3661 
   3662       // Vectorize a tree.
   3663       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
   3664       Value *VectorizedRoot = V.vectorizeTree();
   3665 
   3666       // Emit a reduction.
   3667       Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
   3668       if (VectorizedTree) {
   3669         Builder.SetCurrentDebugLocation(Loc);
   3670         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
   3671                                      ReducedSubTree, "bin.rdx");
   3672       } else
   3673         VectorizedTree = ReducedSubTree;
   3674     }
   3675 
   3676     if (VectorizedTree) {
   3677       // Finish the reduction.
   3678       for (; i < NumReducedVals; ++i) {
   3679         Builder.SetCurrentDebugLocation(
   3680           cast<Instruction>(ReducedVals[i])->getDebugLoc());
   3681         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
   3682                                      ReducedVals[i]);
   3683       }
   3684       // Update users.
   3685       if (ReductionPHI) {
   3686         assert(ReductionRoot && "Need a reduction operation");
   3687         ReductionRoot->setOperand(0, VectorizedTree);
   3688         ReductionRoot->setOperand(1, ReductionPHI);
   3689       } else
   3690         ReductionRoot->replaceAllUsesWith(VectorizedTree);
   3691     }
   3692     return VectorizedTree != nullptr;
   3693   }
   3694 
   3695 private:
   3696 
    3697   /// \brief Calculate the cost of a reduction.
   3698   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
   3699     Type *ScalarTy = FirstReducedVal->getType();
   3700     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
   3701 
   3702     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
   3703     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
   3704 
   3705     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
   3706     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
   3707 
   3708     int ScalarReduxCost =
   3709         ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
   3710 
   3711     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
   3712                  << " for reduction that starts with " << *FirstReducedVal
   3713                  << " (It is a "
   3714                  << (IsPairwiseReduction ? "pairwise" : "splitting")
   3715                  << " reduction)\n");
   3716 
   3717     return VecReduxCost - ScalarReduxCost;
   3718   }
   3719 
   3720   static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
   3721                             Value *R, const Twine &Name = "") {
   3722     if (Opcode == Instruction::FAdd)
   3723       return Builder.CreateFAdd(L, R, Name);
   3724     return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
   3725   }
   3726 
   3727   /// \brief Emit a horizontal reduction of the vectorized value.
   3728   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
   3729     assert(VectorizedValue && "Need to have a vectorized tree node");
   3730     assert(isPowerOf2_32(ReduxWidth) &&
   3731            "We only handle power-of-two reductions for now");
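             // Sketch of a splitting reduction for ReduxWidth == 4
             // (hypothetical IR):
             //   %s  = shufflevector <4 x i32> %v, <4 x i32> undef,
             //                       <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
             //   %r1 = add <4 x i32> %v, %s  ; lanes 0 and 1 hold partial sums
             //   ... one more shuffle/add step halves the width again, giving
             //   ... %r2 ...
             //   %res = extractelement <4 x i32> %r2, i32 0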
   3732 
   3733     Value *TmpVec = VectorizedValue;
   3734     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
   3735       if (IsPairwiseReduction) {
   3736         Value *LeftMask =
   3737           createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
   3738         Value *RightMask =
   3739           createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
   3740 
   3741         Value *LeftShuf = Builder.CreateShuffleVector(
   3742           TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
   3743         Value *RightShuf = Builder.CreateShuffleVector(
    3744           TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
   3745           "rdx.shuf.r");
   3746         TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
   3747                              "bin.rdx");
   3748       } else {
   3749         Value *UpperHalf =
   3750           createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
   3751         Value *Shuf = Builder.CreateShuffleVector(
   3752           TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
   3753         TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
   3754       }
   3755     }
   3756 
   3757     // The result is in the first element of the vector.
   3758     return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
   3759   }
   3760 };
   3761 
   3762 /// \brief Recognize construction of vectors like
   3763 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
   3764 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
   3765 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
   3766 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
   3767 ///
    3768 /// Returns true if it matches.
   3769 ///
   3770 static bool findBuildVector(InsertElementInst *FirstInsertElem,
   3771                             SmallVectorImpl<Value *> &BuildVector,
   3772                             SmallVectorImpl<Value *> &BuildVectorOpds) {
   3773   if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
   3774     return false;
   3775 
   3776   InsertElementInst *IE = FirstInsertElem;
   3777   while (true) {
   3778     BuildVector.push_back(IE);
   3779     BuildVectorOpds.push_back(IE->getOperand(1));
   3780 
   3781     if (IE->use_empty())
   3782       return false;
   3783 
   3784     InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
   3785     if (!NextUse)
   3786       return true;
   3787 
   3788     // If this isn't the final use, make sure the next insertelement is the only
   3789     // use. It's OK if the final constructed vector is used multiple times
   3790     if (!IE->hasOneUse())
   3791       return false;
   3792 
   3793     IE = NextUse;
   3794   }
   3795 
   3796   return false;
   3797 }
   3798 
   3799 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
   3800   return V->getType() < V2->getType();
   3801 }
   3802 
   3803 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   3804   bool Changed = false;
   3805   SmallVector<Value *, 4> Incoming;
   3806   SmallSet<Value *, 16> VisitedInstrs;
   3807 
   3808   bool HaveVectorizedPhiNodes = true;
   3809   while (HaveVectorizedPhiNodes) {
   3810     HaveVectorizedPhiNodes = false;
   3811 
   3812     // Collect the incoming values from the PHIs.
   3813     Incoming.clear();
   3814     for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
   3815          ++instr) {
   3816       PHINode *P = dyn_cast<PHINode>(instr);
   3817       if (!P)
   3818         break;
   3819 
   3820       if (!VisitedInstrs.count(P))
   3821         Incoming.push_back(P);
   3822     }
   3823 
   3824     // Sort by type.
   3825     std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
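             // After sorting, PHIs of the same type form contiguous runs, which
             // the loop below slices into candidate lists for vectorization.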
   3826 
    3827     // Try to vectorize elements based on their type.
   3828     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
   3829                                            E = Incoming.end();
   3830          IncIt != E;) {
   3831 
   3832       // Look for the next elements with the same type.
   3833       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
   3834       while (SameTypeIt != E &&
   3835              (*SameTypeIt)->getType() == (*IncIt)->getType()) {
   3836         VisitedInstrs.insert(*SameTypeIt);
   3837         ++SameTypeIt;
   3838       }
   3839 
   3840       // Try to vectorize them.
   3841       unsigned NumElts = (SameTypeIt - IncIt);
    3842       DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
   3843       if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
    3844         // Success. Start over because instructions might have been changed.
   3845         HaveVectorizedPhiNodes = true;
   3846         Changed = true;
   3847         break;
   3848       }
   3849 
   3850       // Start over at the next instruction of a different type (or the end).
   3851       IncIt = SameTypeIt;
   3852     }
   3853   }
   3854 
   3855   VisitedInstrs.clear();
   3856 
   3857   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    3858     // We may go through BB multiple times so skip the ones we already checked.
   3859     if (!VisitedInstrs.insert(it).second)
   3860       continue;
   3861 
   3862     if (isa<DbgInfoIntrinsic>(it))
   3863       continue;
   3864 
   3865     // Try to vectorize reductions that use PHINodes.
   3866     if (PHINode *P = dyn_cast<PHINode>(it)) {
   3867       // Check that the PHI is a reduction PHI.
   3868       if (P->getNumIncomingValues() != 2)
   3869         return Changed;
   3870       Value *Rdx =
   3871           (P->getIncomingBlock(0) == BB
   3872                ? (P->getIncomingValue(0))
   3873                : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
   3874                                                : nullptr));
   3875       // Check if this is a Binary Operator.
   3876       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
   3877       if (!BI)
   3878         continue;
   3879 
   3880       // Try to match and vectorize a horizontal reduction.
   3881       HorizontalReduction HorRdx;
   3882       if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
   3883           HorRdx.tryToReduce(R, TTI)) {
   3884         Changed = true;
   3885         it = BB->begin();
   3886         e = BB->end();
   3887         continue;
   3888       }
   3889 
    3890       Value *Inst = BI->getOperand(0);
   3891       if (Inst == P)
   3892         Inst = BI->getOperand(1);
   3893 
   3894       if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
   3895         // We would like to start over since some instructions are deleted
    3896         // and the iterator may become invalid.
   3897         Changed = true;
   3898         it = BB->begin();
   3899         e = BB->end();
   3900         continue;
   3901       }
   3902 
   3903       continue;
   3904     }
   3905 
   3906     // Try to vectorize horizontal reductions feeding into a store.
   3907     if (ShouldStartVectorizeHorAtStore)
   3908       if (StoreInst *SI = dyn_cast<StoreInst>(it))
   3909         if (BinaryOperator *BinOp =
   3910                 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
   3911           HorizontalReduction HorRdx;
   3912           if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
   3913                 HorRdx.tryToReduce(R, TTI)) ||
   3914                tryToVectorize(BinOp, R))) {
   3915             Changed = true;
   3916             it = BB->begin();
   3917             e = BB->end();
   3918             continue;
   3919           }
   3920         }
   3921 
   3922     // Try to vectorize horizontal reductions feeding into a return.
   3923     if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
   3924       if (RI->getNumOperands() != 0)
   3925         if (BinaryOperator *BinOp =
   3926                 dyn_cast<BinaryOperator>(RI->getOperand(0))) {
   3927           DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
   3928           if (tryToVectorizePair(BinOp->getOperand(0),
   3929                                  BinOp->getOperand(1), R)) {
   3930             Changed = true;
   3931             it = BB->begin();
   3932             e = BB->end();
   3933             continue;
   3934           }
   3935         }
   3936 
   3937     // Try to vectorize trees that start at compare instructions.
   3938     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
   3939       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
   3940         Changed = true;
   3941         // We would like to start over since some instructions are deleted
    3942         // and the iterator may become invalid.
   3943         it = BB->begin();
   3944         e = BB->end();
   3945         continue;
   3946       }
   3947 
   3948       for (int i = 0; i < 2; ++i) {
   3949         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
   3950           if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
   3951             Changed = true;
   3952             // We would like to start over since some instructions are deleted
    3953             // and the iterator may become invalid.
   3954             it = BB->begin();
   3955             e = BB->end();
   3956             break;
   3957           }
   3958         }
   3959       }
   3960       continue;
   3961     }
   3962 
   3963     // Try to vectorize trees that start at insertelement instructions.
   3964     if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
   3965       SmallVector<Value *, 16> BuildVector;
   3966       SmallVector<Value *, 16> BuildVectorOpds;
   3967       if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
   3968         continue;
   3969 
   3970       // Vectorize starting with the build vector operands ignoring the
   3971       // BuildVector instructions for the purpose of scheduling and user
   3972       // extraction.
   3973       if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
   3974         Changed = true;
   3975         it = BB->begin();
   3976         e = BB->end();
   3977       }
   3978 
   3979       continue;
   3980     }
   3981   }
   3982 
   3983   return Changed;
   3984 }
   3985 
   3986 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
   3987   bool Changed = false;
   3988   // Attempt to sort and vectorize each of the store-groups.
   3989   for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
   3990        it != e; ++it) {
   3991     if (it->second.size() < 2)
   3992       continue;
   3993 
   3994     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
   3995           << it->second.size() << ".\n");
   3996 
   3997     // Process the stores in chunks of 16.
   3998     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
   3999       unsigned Len = std::min<unsigned>(CE - CI, 16);
   4000       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
   4001                                  -SLPCostThreshold, R);
   4002     }
   4003   }
   4004   return Changed;
   4005 }
   4006 
   4007 } // end anonymous namespace
   4008 
   4009 char SLPVectorizer::ID = 0;
   4010 static const char lv_name[] = "SLP Vectorizer";
   4011 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
   4012 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
   4013 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
   4014 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
   4015 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
   4016 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
   4017 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
   4018 
   4019 namespace llvm {
   4020 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
   4021 }
   4022