      1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
     11 // and generates target-independent LLVM-IR.
     12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
     13 // of instructions in order to estimate the profitability of vectorization.
     14 //
     15 // The loop vectorizer combines consecutive loop iterations into a single
     16 // 'wide' iteration. After this transformation the index is incremented
     17 // by the SIMD vector width, and not by one.
     18 //
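        // For example, given the scalar loop
        //   for (i = 0; i < n; ++i) A[i] = B[i] + 1;
        // vectorizing with VF = 4 conceptually produces
        //   for (i = 0; i < n; i += 4) A[i:i+3] = B[i:i+3] + 1;
        // with a scalar epilogue loop handling any leftover iterations.
        //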
     19 // This pass has four parts:
     20 // 1. The main loop pass that drives the different parts.
     21 // 2. LoopVectorizationLegality - A unit that checks for the legality
     22 //    of the vectorization.
     23 // 3. InnerLoopVectorizer - A unit that performs the actual
     24 //    widening of instructions.
     25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
     26 //    of vectorization. It decides on the optimal vector width, which
     27 //    can be one, if vectorization is not profitable.
     28 //
     29 //===----------------------------------------------------------------------===//
     30 //
     31 // The reduction-variable vectorization is based on the paper:
     32 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
     33 //
     34 // Variable uniformity checks are inspired by:
     35 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
     36 //
     37 // The interleaved access vectorization is based on the paper:
     38 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
     39 //  Data for SIMD
     40 //
     41 // Other ideas/concepts are from:
     42 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
     43 //
     44 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
     45 //  Vectorizing Compilers.
     46 //
     47 //===----------------------------------------------------------------------===//
     48 
     49 #include "llvm/Transforms/Vectorize.h"
     50 #include "llvm/ADT/DenseMap.h"
     51 #include "llvm/ADT/Hashing.h"
     52 #include "llvm/ADT/MapVector.h"
     53 #include "llvm/ADT/SetVector.h"
     54 #include "llvm/ADT/SmallPtrSet.h"
     55 #include "llvm/ADT/SmallSet.h"
     56 #include "llvm/ADT/SmallVector.h"
     57 #include "llvm/ADT/Statistic.h"
     58 #include "llvm/ADT/StringExtras.h"
     59 #include "llvm/Analysis/AliasAnalysis.h"
     60 #include "llvm/Analysis/BasicAliasAnalysis.h"
     61 #include "llvm/Analysis/AliasSetTracker.h"
     62 #include "llvm/Analysis/AssumptionCache.h"
     63 #include "llvm/Analysis/BlockFrequencyInfo.h"
     64 #include "llvm/Analysis/CodeMetrics.h"
     65 #include "llvm/Analysis/DemandedBits.h"
     66 #include "llvm/Analysis/GlobalsModRef.h"
     67 #include "llvm/Analysis/LoopAccessAnalysis.h"
     68 #include "llvm/Analysis/LoopInfo.h"
     69 #include "llvm/Analysis/LoopIterator.h"
     70 #include "llvm/Analysis/LoopPass.h"
     71 #include "llvm/Analysis/ScalarEvolution.h"
     72 #include "llvm/Analysis/ScalarEvolutionExpander.h"
     73 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
     74 #include "llvm/Analysis/TargetTransformInfo.h"
     75 #include "llvm/Analysis/ValueTracking.h"
     76 #include "llvm/IR/Constants.h"
     77 #include "llvm/IR/DataLayout.h"
     78 #include "llvm/IR/DebugInfo.h"
     79 #include "llvm/IR/DerivedTypes.h"
     80 #include "llvm/IR/DiagnosticInfo.h"
     81 #include "llvm/IR/Dominators.h"
     82 #include "llvm/IR/Function.h"
     83 #include "llvm/IR/IRBuilder.h"
     84 #include "llvm/IR/Instructions.h"
     85 #include "llvm/IR/IntrinsicInst.h"
     86 #include "llvm/IR/LLVMContext.h"
     87 #include "llvm/IR/Module.h"
     88 #include "llvm/IR/PatternMatch.h"
     89 #include "llvm/IR/Type.h"
     90 #include "llvm/IR/Value.h"
     91 #include "llvm/IR/ValueHandle.h"
     92 #include "llvm/IR/Verifier.h"
     93 #include "llvm/Pass.h"
     94 #include "llvm/Support/BranchProbability.h"
     95 #include "llvm/Support/CommandLine.h"
     96 #include "llvm/Support/Debug.h"
     97 #include "llvm/Support/raw_ostream.h"
     98 #include "llvm/Transforms/Scalar.h"
     99 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
    100 #include "llvm/Transforms/Utils/Local.h"
    101 #include "llvm/Analysis/VectorUtils.h"
    102 #include "llvm/Transforms/Utils/LoopUtils.h"
    103 #include <algorithm>
    104 #include <functional>
    105 #include <map>
    106 #include <tuple>
    107 
    108 using namespace llvm;
    109 using namespace llvm::PatternMatch;
    110 
    111 #define LV_NAME "loop-vectorize"
    112 #define DEBUG_TYPE LV_NAME
    113 
    114 STATISTIC(LoopsVectorized, "Number of loops vectorized");
    115 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
    116 
    117 static cl::opt<bool>
    118 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
    119                    cl::desc("Enable if-conversion during vectorization."));
    120 
    121 /// We don't vectorize loops with a known constant trip count below this number.
    122 static cl::opt<unsigned>
    123 TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
    124                              cl::Hidden,
    125                              cl::desc("Don't vectorize loops with a constant "
    126                                       "trip count that is smaller than this "
    127                                       "value."));
    128 
    129 static cl::opt<bool> MaximizeBandwidth(
    130     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    131     cl::desc("Maximize bandwidth when selecting vectorization factor which "
    132              "will be determined by the smallest type in loop."));
    133 
    134 /// This enables versioning on the strides of symbolically striding memory
    135 /// accesses in code like the following.
    136 ///   for (i = 0; i < N; ++i)
    137 ///     A[i * Stride1] += B[i * Stride2] ...
    138 ///
    139 /// Will be roughly translated to
    140 ///    if (Stride1 == 1 && Stride2 == 1) {
    141 ///      for (i = 0; i < N; i+=4)
    142 ///       A[i:i+3] += ...
    143 ///    } else
    144 ///      ...
    145 static cl::opt<bool> EnableMemAccessVersioning(
    146     "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    147     cl::desc("Enable symbolic stride memory access versioning"));
    148 
    149 static cl::opt<bool> EnableInterleavedMemAccesses(
    150     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    151     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
    152 
    153 /// Maximum factor for an interleaved memory access.
    154 static cl::opt<unsigned> MaxInterleaveGroupFactor(
    155     "max-interleave-group-factor", cl::Hidden,
    156     cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    157     cl::init(8));
    158 
    159 /// We don't interleave loops with a known constant trip count below this
    160 /// number.
    161 static const unsigned TinyTripCountInterleaveThreshold = 128;
    162 
    163 static cl::opt<unsigned> ForceTargetNumScalarRegs(
    164     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    165     cl::desc("A flag that overrides the target's number of scalar registers."));
    166 
    167 static cl::opt<unsigned> ForceTargetNumVectorRegs(
    168     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    169     cl::desc("A flag that overrides the target's number of vector registers."));
    170 
    171 /// Maximum vectorization interleave count.
    172 static const unsigned MaxInterleaveFactor = 16;
    173 
    174 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    175     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    176     cl::desc("A flag that overrides the target's max interleave factor for "
    177              "scalar loops."));
    178 
    179 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    180     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    181     cl::desc("A flag that overrides the target's max interleave factor for "
    182              "vectorized loops."));
    183 
    184 static cl::opt<unsigned> ForceTargetInstructionCost(
    185     "force-target-instruction-cost", cl::init(0), cl::Hidden,
    186     cl::desc("A flag that overrides the target's expected cost for "
    187              "an instruction to a single constant value. Mostly "
    188              "useful for getting consistent testing."));
    189 
    190 static cl::opt<unsigned> SmallLoopCost(
    191     "small-loop-cost", cl::init(20), cl::Hidden,
    192     cl::desc(
    193         "The cost of a loop that is considered 'small' by the interleaver."));
    194 
    195 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    196     "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
    197     cl::desc("Enable the use of the block frequency analysis to access PGO "
    198              "heuristics minimizing code growth in cold regions and being more "
    199              "aggressive in hot regions."));
    200 
    201 // Runtime interleave loops for load/store throughput.
    202 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    203     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    204     cl::desc(
    205         "Enable runtime interleaving until load/store ports are saturated"));
    206 
    207 /// The number of stores in a loop that are allowed to need predication.
    208 static cl::opt<unsigned> NumberOfStoresToPredicate(
    209     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    210     cl::desc("Max number of stores to be predicated behind an if."));
    211 
    212 static cl::opt<bool> EnableIndVarRegisterHeur(
    213     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    214     cl::desc("Count the induction variable only once when interleaving"));
    215 
    216 static cl::opt<bool> EnableCondStoresVectorization(
    217     "enable-cond-stores-vec", cl::init(false), cl::Hidden,
    218     cl::desc("Enable if predication of stores during vectorization."));
    219 
    220 static cl::opt<unsigned> MaxNestedScalarReductionIC(
    221     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    222     cl::desc("The maximum interleave count to use when interleaving a scalar "
    223              "reduction in a nested loop."));
    224 
    225 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    226     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    227     cl::desc("The maximum allowed number of runtime memory checks with a "
    228              "vectorize(enable) pragma."));
    229 
    230 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    231     "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    232     cl::desc("The maximum number of SCEV checks allowed."));
    233 
    234 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    235     "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    236     cl::desc("The maximum number of SCEV checks allowed with a "
    237              "vectorize(enable) pragma"));
    238 
    239 namespace {
    240 
    241 // Forward declarations.
    242 class LoopVectorizeHints;
    243 class LoopVectorizationLegality;
    244 class LoopVectorizationCostModel;
    245 class LoopVectorizationRequirements;
    246 
    247 /// \brief This modifies LoopAccessReport to initialize message with
    248 /// loop-vectorizer-specific part.
    249 class VectorizationReport : public LoopAccessReport {
    250 public:
    251   VectorizationReport(Instruction *I = nullptr)
    252       : LoopAccessReport("loop not vectorized: ", I) {}
    253 
    254   /// \brief This allows promotion of the loop-access analysis report into the
    255   /// loop-vectorizer report.  It modifies the message to add the
    256   /// loop-vectorizer-specific part of the message.
    257   explicit VectorizationReport(const LoopAccessReport &R)
    258       : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
    259                          R.getInstr()) {}
    260 };
    261 
    262 /// A helper function for converting Scalar types to vector types.
    263 /// If the incoming type is void, we return void. If the VF is 1, we return
    264 /// the scalar type.
    265 static Type* ToVectorTy(Type *Scalar, unsigned VF) {
    266   if (Scalar->isVoidTy() || VF == 1)
    267     return Scalar;
    268   return VectorType::get(Scalar, VF);
    269 }
    270 
    271 /// A helper function that returns the GEP instruction and knows to skip a
    272 /// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
    273 /// pointee types of the 'bitcast' have the same size.
    274 /// For example:
    275 ///   bitcast double** %var to i64* - can be skipped
    276 ///   bitcast double** %var to i8*  - can not
    277 static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
    278 
    279   if (isa<GetElementPtrInst>(Ptr))
    280     return cast<GetElementPtrInst>(Ptr);
    281 
    282   if (isa<BitCastInst>(Ptr) &&
    283       isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
    284     Type *BitcastTy = Ptr->getType();
    285     Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
    286     if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
    287       return nullptr;
    288     Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
    289     Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
    290     const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
    291     if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
    292       return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
    293   }
    294   return nullptr;
    295 }
    296 
    297 /// InnerLoopVectorizer vectorizes loops which contain only one basic
    298 /// block to a specified vectorization factor (VF).
    299 /// This class performs the widening of scalars into vectors, or multiple
    300 /// scalars. This class also implements the following features:
    301 /// * It inserts an epilogue loop for handling loops that don't have iteration
    302 ///   counts that are known to be a multiple of the vectorization factor.
    303 /// * It handles the code generation for reduction variables.
    304 /// * Scalarization (implementation using scalars) of un-vectorizable
    305 ///   instructions.
    306 /// InnerLoopVectorizer does not perform any vectorization-legality
    307 /// checks, and relies on the caller to check for the different legality
    308 /// aspects. The InnerLoopVectorizer relies on the
    309 /// LoopVectorizationLegality class to provide information about the induction
    310 /// and reduction variables that were found for a given vectorization factor.
    311 class InnerLoopVectorizer {
    312 public:
    313   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
    314                       LoopInfo *LI, DominatorTree *DT,
    315                       const TargetLibraryInfo *TLI,
    316                       const TargetTransformInfo *TTI, unsigned VecWidth,
    317                       unsigned UnrollFactor)
    318       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
    319         VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
    320         Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
    321         TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
    322         AddedSafetyChecks(false) {}
    323 
    324   // Perform the actual loop widening (vectorization).
    325   // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
    326   // can be validly truncated to. The cost model has assumed this truncation
    327   // will happen when vectorizing.
    328   void vectorize(LoopVectorizationLegality *L,
    329                  MapVector<Instruction*,uint64_t> MinimumBitWidths) {
    330     MinBWs = MinimumBitWidths;
    331     Legal = L;
    332     // Create a new empty loop. Unlink the old loop and connect the new one.
    333     createEmptyLoop();
    334     // Widen each instruction in the old loop to a new one in the new loop.
    335     // Use the Legality module to find the induction and reduction variables.
    336     vectorizeLoop();
    337   }
    338 
    339   // Return true if any runtime check is added.
    340   bool IsSafetyChecksAdded() {
    341     return AddedSafetyChecks;
    342   }
    343 
    344   virtual ~InnerLoopVectorizer() {}
    345 
    346 protected:
    347   /// A small list of PHINodes.
    348   typedef SmallVector<PHINode*, 4> PhiVector;
    349   /// When we unroll loops we have multiple vector values for each scalar.
    350   /// This data structure holds the unrolled and vectorized values that
    351   /// originated from one scalar instruction.
    352   typedef SmallVector<Value*, 2> VectorParts;
    353 
    354   // When we if-convert we need to create edge masks. We have to cache values
    355   // so that we don't end up with exponential recursion/IR.
    356   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
    357                    VectorParts> EdgeMaskCache;
    358 
    359   /// Create an empty loop, based on the loop ranges of the old loop.
    360   void createEmptyLoop();
    361   /// Create a new induction variable inside L.
    362   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
    363                                    Value *Step, Instruction *DL);
    364   /// Copy and widen the instructions from the old loop.
    365   virtual void vectorizeLoop();
    366 
    367   /// \brief The Loop exit block may have single value PHI nodes where the
    368   /// incoming value is 'Undef'. While vectorizing we only handled real values
    369   /// that were defined inside the loop. Here we fix the 'undef case'.
    370   /// See PR14725.
    371   void fixLCSSAPHIs();
    372 
    373   /// Shrinks vector element sizes based on information in "MinBWs".
    374   void truncateToMinimalBitwidths();
    375 
    376   /// A helper function that computes the predicate of the block BB, assuming
    377   /// that the header block of the loop is set to True. It returns the *entry*
    378   /// mask for the block BB.
    379   VectorParts createBlockInMask(BasicBlock *BB);
    380   /// A helper function that computes the predicate of the edge between SRC
    381   /// and DST.
    382   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
    383 
    384   /// A helper function to vectorize a single BB within the innermost loop.
    385   void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
    386 
    387   /// Vectorize a single PHINode in a block. This method handles the induction
    388   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
    389   /// arbitrary length vectors.
    390   void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
    391                            unsigned UF, unsigned VF, PhiVector *PV);
    392 
    393   /// Insert the new loop into the loop hierarchy and pass manager
    394   /// and update the analysis passes.
    395   void updateAnalysis();
    396 
    397   /// This instruction is un-vectorizable. Implement it as a sequence
    398   /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
    399   /// scalarized instruction behind an if block predicated on the control
    400   /// dependence of the instruction.
    401   virtual void scalarizeInstruction(Instruction *Instr,
    402                                     bool IfPredicateStore=false);
    403 
    404   /// Vectorize Load and Store instructions.
    405   virtual void vectorizeMemoryInstruction(Instruction *Instr);
    406 
    407   /// Create a broadcast instruction. This method generates a broadcast
    408   /// instruction (shuffle) for loop invariant values and for the induction
    409   /// value. If this is the induction variable then we extend it to N, N+1, ...
    410   /// this is needed because each iteration in the loop corresponds to a SIMD
    411   /// element.
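          /// For example, broadcasting a loop-invariant value %x with VF = 4
          /// conceptually yields the vector <%x, %x, %x, %x>.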
    412   virtual Value *getBroadcastInstrs(Value *V);
    413 
    414   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
    415   /// to each vector element of Val. The sequence starts at StartIdx.
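          /// For example, with a <4 x i32> Val of all zeros, StartIdx = 0 and
          /// Step = 2, the resulting vector is <0, 2, 4, 6>.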
    416   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
    417 
    418   /// When we go over instructions in the basic block we rely on previous
    419   /// values within the current basic block or on loop invariant values.
    420   /// When we widen (vectorize) values we place them in the map. If the values
    421   /// are not within the map, they have to be loop invariant, so we simply
    422   /// broadcast them into a vector.
    423   VectorParts &getVectorValue(Value *V);
    424 
    425   /// Try to vectorize the interleaved access group that \p Instr belongs to.
    426   void vectorizeInterleaveGroup(Instruction *Instr);
    427 
    428   /// Generate a shuffle sequence that will reverse the vector Vec.
    429   virtual Value *reverseVector(Value *Vec);
    430 
    431   /// Returns (and creates if needed) the original loop trip count.
    432   Value *getOrCreateTripCount(Loop *NewLoop);
    433 
    434   /// Returns (and creates if needed) the trip count of the widened loop.
    435   Value *getOrCreateVectorTripCount(Loop *NewLoop);
    436 
    437   /// Emit a bypass check to see if the trip count would overflow, or we
    438   /// wouldn't have enough iterations to execute one vector loop.
    439   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
    440   /// Emit a bypass check to see if the vector trip count is nonzero.
    441   void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
    442   /// Emit a bypass check to see if all of the SCEV assumptions we've
    443   /// had to make are correct.
    444   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
    445   /// Emit bypass checks to check any memory assumptions we may have made.
    446   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
    447 
    448   /// This is a helper class that holds the vectorizer state. It maps scalar
    449   /// instructions to vector instructions. When the code is 'unrolled' then
    450   /// a single scalar value is mapped to multiple vector parts. The parts
    451   /// are stored in the VectorParts type.
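          /// For example, with UF = 2 each scalar value is associated with two
          /// vector values, one for each unrolled copy of the loop body.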
    452   struct ValueMap {
    453     /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
    454     /// are mapped.
    455     ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
    456 
    457     /// \return True if 'Key' is saved in the Value Map.
    458     bool has(Value *Key) const { return MapStorage.count(Key); }
    459 
    460     /// Initializes a new entry in the map. Sets all of the vector parts to the
    461     /// same value in 'Val'.
    462     /// \return A reference to a vector with splat values.
    463     VectorParts &splat(Value *Key, Value *Val) {
    464       VectorParts &Entry = MapStorage[Key];
    465       Entry.assign(UF, Val);
    466       return Entry;
    467     }
    468 
    469     /// \return A reference to the value that is stored at 'Key'.
    470     VectorParts &get(Value *Key) {
    471       VectorParts &Entry = MapStorage[Key];
    472       if (Entry.empty())
    473         Entry.resize(UF);
    474       assert(Entry.size() == UF);
    475       return Entry;
    476     }
    477 
    478   private:
    479     /// The unroll factor. Each entry in the map stores this number of vector
    480     /// parts.
    481     unsigned UF;
    482 
    483     /// Map storage. We use std::map and not DenseMap because insertions to a
    484     /// dense map invalidate its iterators.
    485     std::map<Value *, VectorParts> MapStorage;
    486   };
    487 
    488   /// The original loop.
    489   Loop *OrigLoop;
    490   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
    491   /// dynamic knowledge to simplify SCEV expressions and converts them to a
    492   /// more usable form.
    493   PredicatedScalarEvolution &PSE;
    494   /// Loop Info.
    495   LoopInfo *LI;
    496   /// Dominator Tree.
    497   DominatorTree *DT;
    498   /// Alias Analysis.
    499   AliasAnalysis *AA;
    500   /// Target Library Info.
    501   const TargetLibraryInfo *TLI;
    502   /// Target Transform Info.
    503   const TargetTransformInfo *TTI;
    504 
    505   /// The vectorization SIMD factor to use. Each vector will have this many
    506   /// vector elements.
    507   unsigned VF;
    508 
    509 protected:
    510   /// The vectorization unroll factor to use. Each scalar is vectorized to this
    511   /// many different vector instructions.
    512   unsigned UF;
    513 
    514   /// The builder that we use
    515   IRBuilder<> Builder;
    516 
    517   // --- Vectorization state ---
    518 
    519   /// The vector-loop preheader.
    520   BasicBlock *LoopVectorPreHeader;
    521   /// The scalar-loop preheader.
    522   BasicBlock *LoopScalarPreHeader;
    523   /// Middle Block between the vector and the scalar.
    524   BasicBlock *LoopMiddleBlock;
    525   /// The ExitBlock of the scalar loop.
    526   BasicBlock *LoopExitBlock;
    527   /// The vector loop body.
    528   SmallVector<BasicBlock *, 4> LoopVectorBody;
    529   /// The scalar loop body.
    530   BasicBlock *LoopScalarBody;
    531   /// A list of all bypass blocks. The first block is the entry of the loop.
    532   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
    533 
    534   /// The new Induction variable which was added to the new block.
    535   PHINode *Induction;
    536   /// The induction variable of the old basic block.
    537   PHINode *OldInduction;
    538   /// Maps scalars to widened vectors.
    539   ValueMap WidenMap;
    540   /// Store instructions that should be predicated, as a pair
    541   ///   <StoreInst, Predicate>
    542   SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
    543   EdgeMaskCache MaskCache;
    544   /// Trip count of the original loop.
    545   Value *TripCount;
    546   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
    547   Value *VectorTripCount;
    548 
    549   /// Map of scalar integer values to the smallest bitwidth they can be legally
    550   /// represented as. The vector equivalents of these values should be truncated
    551   /// to this type.
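          /// For example, an i32 value whose users only ever inspect its low 8
          /// bits may be recorded here with a bitwidth of 8.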
    552   MapVector<Instruction*,uint64_t> MinBWs;
    553   LoopVectorizationLegality *Legal;
    554 
    555   // Record whether runtime check is added.
    556   bool AddedSafetyChecks;
    557 };
    558 
    559 class InnerLoopUnroller : public InnerLoopVectorizer {
    560 public:
    561   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
    562                     LoopInfo *LI, DominatorTree *DT,
    563                     const TargetLibraryInfo *TLI,
    564                     const TargetTransformInfo *TTI, unsigned UnrollFactor)
    565       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
    566 
    567 private:
    568   void scalarizeInstruction(Instruction *Instr,
    569                             bool IfPredicateStore = false) override;
    570   void vectorizeMemoryInstruction(Instruction *Instr) override;
    571   Value *getBroadcastInstrs(Value *V) override;
    572   Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
    573   Value *reverseVector(Value *Vec) override;
    574 };
    575 
    576 /// \brief Look for a meaningful debug location on the instruction or its
    577 /// operands.
    578 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
    579   if (!I)
    580     return I;
    581 
    582   DebugLoc Empty;
    583   if (I->getDebugLoc() != Empty)
    584     return I;
    585 
    586   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    587     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
    588       if (OpInst->getDebugLoc() != Empty)
    589         return OpInst;
    590   }
    591 
    592   return I;
    593 }
    594 
    595 /// \brief Set the debug location in the builder using the debug location in the
    596 /// instruction.
    597 static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
    598   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
    599     B.SetCurrentDebugLocation(Inst->getDebugLoc());
    600   else
    601     B.SetCurrentDebugLocation(DebugLoc());
    602 }
    603 
    604 #ifndef NDEBUG
    605 /// \return string containing a file name and a line # for the given loop.
    606 static std::string getDebugLocString(const Loop *L) {
    607   std::string Result;
    608   if (L) {
    609     raw_string_ostream OS(Result);
    610     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
    611       LoopDbgLoc.print(OS);
    612     else
    613       // Just print the module name.
    614       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    615     OS.flush();
    616   }
    617   return Result;
    618 }
    619 #endif
    620 
    621 /// \brief Propagate known metadata from one instruction to another.
    622 static void propagateMetadata(Instruction *To, const Instruction *From) {
    623   SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
    624   From->getAllMetadataOtherThanDebugLoc(Metadata);
    625 
    626   for (auto M : Metadata) {
    627     unsigned Kind = M.first;
    628 
    629     // These are safe to transfer (this is safe for TBAA, even when we
    630     // if-convert, because should that metadata have had a control dependency
    631     // on the condition, and thus actually aliased with some other
    632     // non-speculated memory access when the condition was false, this would be
    633     // caught by the runtime overlap checks).
    634     if (Kind != LLVMContext::MD_tbaa &&
    635         Kind != LLVMContext::MD_alias_scope &&
    636         Kind != LLVMContext::MD_noalias &&
    637         Kind != LLVMContext::MD_fpmath &&
    638         Kind != LLVMContext::MD_nontemporal)
    639       continue;
    640 
    641     To->setMetadata(Kind, M.second);
    642   }
    643 }
    644 
    645 /// \brief Propagate known metadata from one instruction to a vector of others.
    646 static void propagateMetadata(SmallVectorImpl<Value *> &To,
    647                               const Instruction *From) {
    648   for (Value *V : To)
    649     if (Instruction *I = dyn_cast<Instruction>(V))
    650       propagateMetadata(I, From);
    651 }
    652 
    653 /// \brief A group of interleaved loads/stores sharing the same stride and
    654 /// located close to each other.
    655 ///
    656 /// Each member in this group has an index starting from 0, and the largest
    657 /// index should be less than the interleave factor, which is equal to the absolute
    658 /// value of the access's stride.
    659 ///
    660 /// E.g. An interleaved load group of factor 4:
    661 ///        for (unsigned i = 0; i < 1024; i+=4) {
    662 ///          a = A[i];                           // Member of index 0
    663 ///          b = A[i+1];                         // Member of index 1
    664 ///          d = A[i+3];                         // Member of index 3
    665 ///          ...
    666 ///        }
    667 ///
    668 ///      An interleaved store group of factor 4:
    669 ///        for (unsigned i = 0; i < 1024; i+=4) {
    670 ///          ...
    671 ///          A[i]   = a;                         // Member of index 0
    672 ///          A[i+1] = b;                         // Member of index 1
    673 ///          A[i+2] = c;                         // Member of index 2
    674 ///          A[i+3] = d;                         // Member of index 3
    675 ///        }
    676 ///
    677 /// Note: the interleaved load group could have gaps (missing members), but
    678 /// the interleaved store group doesn't allow gaps.
    679 class InterleaveGroup {
    680 public:
    681   InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
    682       : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
    683     assert(Align && "The alignment should be non-zero");
    684 
    685     Factor = std::abs(Stride);
    686     assert(Factor > 1 && "Invalid interleave factor");
    687 
    688     Reverse = Stride < 0;
    689     Members[0] = Instr;
    690   }
    691 
    692   bool isReverse() const { return Reverse; }
    693   unsigned getFactor() const { return Factor; }
    694   unsigned getAlignment() const { return Align; }
    695   unsigned getNumMembers() const { return Members.size(); }
    696 
    697   /// \brief Try to insert a new member \p Instr with index \p Index and
    698   /// alignment \p NewAlign. The index is relative to the leader and may be
    699   /// negative if the new member becomes the new leader.
    700   ///
    701   /// \returns false if the instruction doesn't belong to the group.
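          /// For example, if the current leader accesses A[i+1] and \p Instr
          /// accesses A[i], then \p Index is -1 and \p Instr becomes the new
          /// leader.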
    702   bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
    703     assert(NewAlign && "The new member's alignment should be non-zero");
    704 
    705     int Key = Index + SmallestKey;
    706 
    707     // Skip if there is already a member with the same index.
    708     if (Members.count(Key))
    709       return false;
    710 
    711     if (Key > LargestKey) {
    712       // The largest index is always less than the interleave factor.
    713       if (Index >= static_cast<int>(Factor))
    714         return false;
    715 
    716       LargestKey = Key;
    717     } else if (Key < SmallestKey) {
    718       // The distance between the largest and smallest keys must be less than the interleave factor.
    719       if (LargestKey - Key >= static_cast<int>(Factor))
    720         return false;
    721 
    722       SmallestKey = Key;
    723     }
    724 
    725     // It's always safe to select the minimum alignment.
    726     Align = std::min(Align, NewAlign);
    727     Members[Key] = Instr;
    728     return true;
    729   }
    730 
    731   /// \brief Get the member with the given index \p Index
    732   ///
    733   /// \returns nullptr if the group contains no such member.
    734   Instruction *getMember(unsigned Index) const {
    735     int Key = SmallestKey + Index;
    736     if (!Members.count(Key))
    737       return nullptr;
    738 
    739     return Members.find(Key)->second;
    740   }
    741 
    742   /// \brief Get the index for the given member. Unlike the key in the member
    743   /// map, the index starts from 0.
    744   unsigned getIndex(Instruction *Instr) const {
    745     for (auto I : Members)
    746       if (I.second == Instr)
    747         return I.first - SmallestKey;
    748 
    749     llvm_unreachable("InterleaveGroup contains no such member");
    750   }
    751 
    752   Instruction *getInsertPos() const { return InsertPos; }
    753   void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
    754 
    755 private:
    756   unsigned Factor; // Interleave Factor.
    757   bool Reverse;
    758   unsigned Align;
    759   DenseMap<int, Instruction *> Members;
    760   int SmallestKey;
    761   int LargestKey;
    762 
    763   // To avoid breaking dependences, vectorized instructions of an interleave
    764   // group should be inserted at either the first load or the last store in
    765   // program order.
    766   //
    767   // E.g. %even = load i32             // Insert Position
    768   //      %add = add i32 %even         // Use of %even
    769   //      %odd = load i32
    770   //
    771   //      store i32 %even
    772   //      %odd = add i32               // Def of %odd
    773   //      store i32 %odd               // Insert Position
    774   Instruction *InsertPos;
    775 };
    776 
    777 /// \brief Drive the analysis of interleaved memory accesses in the loop.
    778 ///
    779 /// Use this class to analyze interleaved accesses only when we can vectorize
    780 /// a loop. Otherwise there is no point in the analysis, as vectorizing
    781 /// interleaved accesses would be unsafe.
    782 ///
    783 /// The analysis collects interleave groups and records the relationships
    784 /// between the member and the group in a map.
    785 class InterleavedAccessInfo {
    786 public:
    787   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
    788                         DominatorTree *DT)
    789       : PSE(PSE), TheLoop(L), DT(DT) {}
    790 
    791   ~InterleavedAccessInfo() {
    792     SmallSet<InterleaveGroup *, 4> DelSet;
    793     // Avoid releasing a pointer twice.
    794     for (auto &I : InterleaveGroupMap)
    795       DelSet.insert(I.second);
    796     for (auto *Ptr : DelSet)
    797       delete Ptr;
    798   }
    799 
    800   /// \brief Analyze the interleaved accesses and collect them in interleave
    801   /// groups. Substitute symbolic strides using \p Strides.
    802   void analyzeInterleaving(const ValueToValueMap &Strides);
    803 
    804   /// \brief Check if \p Instr belongs to any interleave group.
    805   bool isInterleaved(Instruction *Instr) const {
    806     return InterleaveGroupMap.count(Instr);
    807   }
    808 
    809   /// \brief Get the interleave group that \p Instr belongs to.
    810   ///
    811   /// \returns nullptr if \p Instr does not belong to any interleave group.
    812   InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
    813     if (InterleaveGroupMap.count(Instr))
    814       return InterleaveGroupMap.find(Instr)->second;
    815     return nullptr;
    816   }
    817 
    818 private:
    819   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
    820   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
    821   /// The interleaved access analysis can also add new predicates (for example
    822   /// by versioning strides of pointers).
    823   PredicatedScalarEvolution &PSE;
    824   Loop *TheLoop;
    825   DominatorTree *DT;
    826 
    827   /// Holds the relationships between the members and the interleave group.
    828   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
    829 
    830   /// \brief The descriptor for a strided memory access.
    831   struct StrideDescriptor {
    832     StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
    833                      unsigned Align)
    834         : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
    835 
    836     StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
    837 
    838     int Stride; // The access's stride. It is negative for a reverse access.
    839     const SCEV *Scev; // The scalar expression of this access
    840     unsigned Size;    // The size of the memory object.
    841     unsigned Align;   // The alignment of this access.
    842   };
    843 
    844   /// \brief Create a new interleave group with the given instruction \p Instr,
    845   /// stride \p Stride and alignment \p Align.
    846   ///
    847   /// \returns the newly created interleave group.
    848   InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
    849                                          unsigned Align) {
    850     assert(!InterleaveGroupMap.count(Instr) &&
    851            "Already in an interleaved access group");
    852     InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
    853     return InterleaveGroupMap[Instr];
    854   }
    855 
    856   /// \brief Release the group and remove all the relationships.
    857   void releaseGroup(InterleaveGroup *Group) {
    858     for (unsigned i = 0; i < Group->getFactor(); i++)
    859       if (Instruction *Member = Group->getMember(i))
    860         InterleaveGroupMap.erase(Member);
    861 
    862     delete Group;
    863   }
    864 
    865   /// \brief Collect all the accesses with a constant stride in program order.
    866   void collectConstStridedAccesses(
    867       MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
    868       const ValueToValueMap &Strides);
    869 };
    870 
    871 /// Utility class for getting and setting loop vectorizer hints in the form
    872 /// of loop metadata.
    873 /// This class keeps a number of loop annotations locally (as member variables)
    874 /// and can, upon request, write them back as metadata on the loop. It will
    875 /// initially scan the loop for existing metadata, and will update the local
    876 /// values based on information in the loop.
    877 /// We cannot write all values to metadata, as the mere presence of some info,
    878 /// for example 'force', means a decision has been made. So, we need to be
    879 /// careful NOT to add them if the user hasn't specifically asked for them.
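        /// For example, a loop annotated with
        /// "#pragma clang loop vectorize_width(4) interleave_count(2)" carries the
        /// loop metadata !{!"llvm.loop.vectorize.width", i32 4} and
        /// !{!"llvm.loop.interleave.count", i32 2}, which this class reads and can
        /// later rewrite (e.g. when marking the loop as already vectorized).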
    880 class LoopVectorizeHints {
    881   enum HintKind {
    882     HK_WIDTH,
    883     HK_UNROLL,
    884     HK_FORCE
    885   };
    886 
    887   /// Hint - associates name and validation with the hint value.
    888   struct Hint {
    889     const char * Name;
    890     unsigned Value; // This may have to change for non-numeric values.
    891     HintKind Kind;
    892 
    893     Hint(const char * Name, unsigned Value, HintKind Kind)
    894       : Name(Name), Value(Value), Kind(Kind) { }
    895 
    896     bool validate(unsigned Val) {
    897       switch (Kind) {
    898       case HK_WIDTH:
    899         return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
    900       case HK_UNROLL:
    901         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
    902       case HK_FORCE:
    903         return (Val <= 1);
    904       }
    905       return false;
    906     }
    907   };
    908 
    909   /// Vectorization width.
    910   Hint Width;
    911   /// Vectorization interleave factor.
    912   Hint Interleave;
    913   /// Vectorization forced on or off.
    914   Hint Force;
    915 
    916   /// Return the loop metadata prefix.
    917   static StringRef Prefix() { return "llvm.loop."; }
    918 
    919 public:
    920   enum ForceKind {
    921     FK_Undefined = -1, ///< Not selected.
    922     FK_Disabled = 0,   ///< Forcing disabled.
    923     FK_Enabled = 1,    ///< Forcing enabled.
    924   };
    925 
    926   LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
    927       : Width("vectorize.width", VectorizerParams::VectorizationFactor,
    928               HK_WIDTH),
    929         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
    930         Force("vectorize.enable", FK_Undefined, HK_FORCE),
    931         TheLoop(L) {
    932     // Populate values with existing loop metadata.
    933     getHintsFromMetadata();
    934 
    935     // force-vector-interleave overrides DisableInterleaving.
    936     if (VectorizerParams::isInterleaveForced())
    937       Interleave.Value = VectorizerParams::VectorizationInterleave;
    938 
    939     DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
    940           << "LV: Interleaving disabled by the pass manager\n");
    941   }
    942 
    943   /// Mark the loop L as already vectorized by setting the width to 1.
    944   void setAlreadyVectorized() {
    945     Width.Value = Interleave.Value = 1;
    946     Hint Hints[] = {Width, Interleave};
    947     writeHintsToMetadata(Hints);
    948   }
    949 
    950   bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
    951     if (getForce() == LoopVectorizeHints::FK_Disabled) {
    952       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    953       emitOptimizationRemarkAnalysis(F->getContext(),
    954                                      vectorizeAnalysisPassName(), *F,
    955                                      L->getStartLoc(), emitRemark());
    956       return false;
    957     }
    958 
    959     if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
    960       DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    961       emitOptimizationRemarkAnalysis(F->getContext(),
    962                                      vectorizeAnalysisPassName(), *F,
    963                                      L->getStartLoc(), emitRemark());
    964       return false;
    965     }
    966 
    967     if (getWidth() == 1 && getInterleave() == 1) {
    968       // FIXME: Add a separate metadata to indicate when the loop has already
    969       // been vectorized instead of setting width and count to 1.
    970       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    971       // FIXME: Add interleave.disable metadata. This will allow
    972       // vectorize.disable to be used without disabling the pass and errors
    973       // to differentiate between disabled vectorization and a width of 1.
    974       emitOptimizationRemarkAnalysis(
    975           F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
    976           "loop not vectorized: vectorization and interleaving are explicitly "
    977           "disabled, or vectorize width and interleave count are both set to "
    978           "1");
    979       return false;
    980     }
    981 
    982     return true;
    983   }
    984 
    985   /// Dumps all the hint information.
    986   std::string emitRemark() const {
    987     VectorizationReport R;
    988     if (Force.Value == LoopVectorizeHints::FK_Disabled)
    989       R << "vectorization is explicitly disabled";
    990     else {
    991       R << "use -Rpass-analysis=loop-vectorize for more info";
    992       if (Force.Value == LoopVectorizeHints::FK_Enabled) {
    993         R << " (Force=true";
    994         if (Width.Value != 0)
    995           R << ", Vector Width=" << Width.Value;
    996         if (Interleave.Value != 0)
    997           R << ", Interleave Count=" << Interleave.Value;
    998         R << ")";
    999       }
   1000     }
   1001 
   1002     return R.str();
   1003   }
   1004 
   1005   unsigned getWidth() const { return Width.Value; }
   1006   unsigned getInterleave() const { return Interleave.Value; }
   1007   enum ForceKind getForce() const { return (ForceKind)Force.Value; }
   1008   const char *vectorizeAnalysisPassName() const {
   1009     // If hints are provided that don't disable vectorization use the
   1010     // AlwaysPrint pass name to force the frontend to print the diagnostic.
   1011     if (getWidth() == 1)
   1012       return LV_NAME;
   1013     if (getForce() == LoopVectorizeHints::FK_Disabled)
   1014       return LV_NAME;
   1015     if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
   1016       return LV_NAME;
   1017     return DiagnosticInfo::AlwaysPrint;
   1018   }
   1019 
   1020   bool allowReordering() const {
   1021     // When loop hints that enable vectorization are provided, we allow the
   1022     // vectorizer to change the order of operations given by the scalar loop.
   1023     // This is not enabled by default because it can be unsafe or inefficient. For example,
   1024     // reordering floating-point operations will change the way round-off
   1025     // error accumulates in the loop.
   1026     return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
   1027   }
   1028 
   1029 private:
   1030   /// Find hints specified in the loop metadata and update local values.
   1031   void getHintsFromMetadata() {
   1032     MDNode *LoopID = TheLoop->getLoopID();
   1033     if (!LoopID)
   1034       return;
   1035 
   1036     // First operand should refer to the loop id itself.
   1037     assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
   1038     assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
   1039 
   1040     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
   1041       const MDString *S = nullptr;
   1042       SmallVector<Metadata *, 4> Args;
   1043 
   1044       // The expected hint is either a MDString or a MDNode with the first
   1045       // operand a MDString.
   1046       if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
   1047         if (!MD || MD->getNumOperands() == 0)
   1048           continue;
   1049         S = dyn_cast<MDString>(MD->getOperand(0));
   1050         for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
   1051           Args.push_back(MD->getOperand(i));
   1052       } else {
   1053         S = dyn_cast<MDString>(LoopID->getOperand(i));
   1054         assert(Args.size() == 0 && "too many arguments for MDString");
   1055       }
   1056 
   1057       if (!S)
   1058         continue;
   1059 
   1060       // Check if the hint starts with the loop metadata prefix.
   1061       StringRef Name = S->getString();
   1062       if (Args.size() == 1)
   1063         setHint(Name, Args[0]);
   1064     }
   1065   }
   1066 
   1067   /// Checks string hint with one operand and set value if valid.
   1068   void setHint(StringRef Name, Metadata *Arg) {
   1069     if (!Name.startswith(Prefix()))
   1070       return;
   1071     Name = Name.substr(Prefix().size(), StringRef::npos);
   1072 
   1073     const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
   1074     if (!C) return;
   1075     unsigned Val = C->getZExtValue();
   1076 
   1077     Hint *Hints[] = {&Width, &Interleave, &Force};
   1078     for (auto H : Hints) {
   1079       if (Name == H->Name) {
   1080         if (H->validate(Val))
   1081           H->Value = Val;
   1082         else
   1083           DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
   1084         break;
   1085       }
   1086     }
   1087   }
   1088 
   1089   /// Create a new hint from name / value pair.
   1090   MDNode *createHintMetadata(StringRef Name, unsigned V) const {
   1091     LLVMContext &Context = TheLoop->getHeader()->getContext();
   1092     Metadata *MDs[] = {MDString::get(Context, Name),
   1093                        ConstantAsMetadata::get(
   1094                            ConstantInt::get(Type::getInt32Ty(Context), V))};
   1095     return MDNode::get(Context, MDs);
   1096   }
   1097 
   1098   /// Matches metadata with hint name.
   1099   bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
   1100     MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
   1101     if (!Name)
   1102       return false;
   1103 
   1104     for (auto H : HintTypes)
   1105       if (Name->getString().endswith(H.Name))
   1106         return true;
   1107     return false;
   1108   }
   1109 
   1110   /// Sets current hints into loop metadata, keeping other values intact.
   1111   void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
   1112     if (HintTypes.size() == 0)
   1113       return;
   1114 
   1115     // Reserve the first element to LoopID (see below).
   1116     SmallVector<Metadata *, 4> MDs(1);
   1117     // If the loop already has metadata, then ignore the existing operands.
   1118     MDNode *LoopID = TheLoop->getLoopID();
   1119     if (LoopID) {
   1120       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
   1121         MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
   1122         // If node in update list, ignore old value.
   1123         if (!matchesHintMetadataName(Node, HintTypes))
   1124           MDs.push_back(Node);
   1125       }
   1126     }
   1127 
   1128     // Now, add the missing hints.
   1129     for (auto H : HintTypes)
   1130       MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
   1131 
   1132     // Replace current metadata node with new one.
   1133     LLVMContext &Context = TheLoop->getHeader()->getContext();
   1134     MDNode *NewLoopID = MDNode::get(Context, MDs);
   1135     // Set operand 0 to refer to the loop id itself.
   1136     NewLoopID->replaceOperandWith(0, NewLoopID);
   1137 
   1138     TheLoop->setLoopID(NewLoopID);
   1139   }
   1140 
   1141   /// The loop these hints belong to.
   1142   const Loop *TheLoop;
   1143 };
   1144 
   1145 static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop,
   1146                              const LoopVectorizeHints &Hints,
   1147                              const LoopAccessReport &Message) {
   1148   const char *Name = Hints.vectorizeAnalysisPassName();
   1149   LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name);
   1150 }
   1151 
   1152 static void emitMissedWarning(Function *F, Loop *L,
   1153                               const LoopVectorizeHints &LH) {
   1154   emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(),
   1155                                LH.emitRemark());
   1156 
   1157   if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
   1158     if (LH.getWidth() != 1)
   1159       emitLoopVectorizeWarning(
   1160           F->getContext(), *F, L->getStartLoc(),
   1161           "failed explicitly specified loop vectorization");
   1162     else if (LH.getInterleave() != 1)
   1163       emitLoopInterleaveWarning(
   1164           F->getContext(), *F, L->getStartLoc(),
   1165           "failed explicitly specified loop interleaving");
   1166   }
   1167 }
   1168 
   1169 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
   1170 /// to what vectorization factor.
   1171 /// This class does not look at the profitability of vectorization, only the
   1172 /// legality. This class has two main kinds of checks:
   1173 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
   1174 ///   will change the order of memory accesses in a way that will change the
   1175 ///   correctness of the program.
   1176 /// * Scalar checks - The code in canVectorizeInstrs and canVectorizeMemory
   1177 ///   checks for a number of different conditions, such as the availability of a
   1178 ///   single induction variable, that all types are supported and vectorizable,
   1179 ///   etc. This code reflects the capabilities of InnerLoopVectorizer.
   1180 /// This class is also used by InnerLoopVectorizer for identifying
   1181 /// the induction variable and the different reduction variables.
   1182 class LoopVectorizationLegality {
   1183 public:
   1184   LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE,
   1185                             DominatorTree *DT, TargetLibraryInfo *TLI,
   1186                             AliasAnalysis *AA, Function *F,
   1187                             const TargetTransformInfo *TTI,
   1188                             LoopAccessAnalysis *LAA,
   1189                             LoopVectorizationRequirements *R,
   1190                             const LoopVectorizeHints *H)
   1191       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
   1192         TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
   1193         Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
   1194         Requirements(R), Hints(H) {}
   1195 
   1196   /// ReductionList contains the reduction descriptors for all
   1197   /// of the reductions that were found in the loop.
   1198   typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
   1199 
   1200   /// InductionList saves induction variables and maps them to the
   1201   /// induction descriptor.
   1202   typedef MapVector<PHINode*, InductionDescriptor> InductionList;
   1203 
   1204   /// Returns true if it is legal to vectorize this loop.
   1205   /// This does not mean that it is profitable to vectorize this
   1206   /// loop, only that it is legal to do so.
   1207   bool canVectorize();
   1208 
   1209   /// Returns the Induction variable.
   1210   PHINode *getInduction() { return Induction; }
   1211 
   1212   /// Returns the reduction variables found in the loop.
   1213   ReductionList *getReductionVars() { return &Reductions; }
   1214 
   1215   /// Returns the induction variables found in the loop.
   1216   InductionList *getInductionVars() { return &Inductions; }
   1217 
   1218   /// Returns the widest induction type.
   1219   Type *getWidestInductionType() { return WidestIndTy; }
   1220 
   1221   /// Returns True if V is an induction variable in this loop.
   1222   bool isInductionVariable(const Value *V);
   1223 
   1224   /// Returns True if PN is a reduction variable in this loop.
   1225   bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
   1226 
   1227   /// Return true if the block BB needs to be predicated in order for the loop
   1228   /// to be vectorized.
   1229   bool blockNeedsPredication(BasicBlock *BB);
   1230 
   1231   /// Check if this pointer is consecutive when vectorizing. This happens
   1232   /// when the last index of the GEP is the induction variable, or when the
   1233   /// pointer itself is an induction variable.
   1234   /// This check allows us to vectorize A[idx] into a wide load/store.
   1235   /// Returns:
   1236   /// 0 - Stride is unknown or non-consecutive.
   1237   /// 1 - Address is consecutive.
   1238   /// -1 - Address is consecutive and decreasing.
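          /// For example, assuming 'i' is the loop induction variable:
          ///   A[i]      returns  1 (consecutive)
          ///   A[N - i]  returns -1 (consecutive and decreasing)
          ///   A[2 * i]  returns  0 (strided, not consecutive)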
   1239   int isConsecutivePtr(Value *Ptr);
   1240 
   1241   /// Returns true if the value V is uniform within the loop.
   1242   bool isUniform(Value *V);
   1243 
   1244   /// Returns true if this instruction will remain scalar after vectorization.
   1245   bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
   1246 
   1247   /// Returns the information that we collected about runtime memory check.
   1248   const RuntimePointerChecking *getRuntimePointerChecking() const {
   1249     return LAI->getRuntimePointerChecking();
   1250   }
   1251 
   1252   const LoopAccessInfo *getLAI() const {
   1253     return LAI;
   1254   }
   1255 
   1256   /// \brief Check if \p Instr belongs to any interleaved access group.
   1257   bool isAccessInterleaved(Instruction *Instr) {
   1258     return InterleaveInfo.isInterleaved(Instr);
   1259   }
   1260 
   1261   /// \brief Get the interleaved access group that \p Instr belongs to.
   1262   const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
   1263     return InterleaveInfo.getInterleaveGroup(Instr);
   1264   }
   1265 
   1266   unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
   1267 
   1268   bool hasStride(Value *V) { return StrideSet.count(V); }
   1269   bool mustCheckStrides() { return !StrideSet.empty(); }
   1270   SmallPtrSet<Value *, 8>::iterator strides_begin() {
   1271     return StrideSet.begin();
   1272   }
   1273   SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
   1274 
   1275   /// Returns true if the target machine supports masked store operation
   1276   /// for the given \p DataType and kind of access to \p Ptr.
   1277   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
   1278     return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
   1279   }
   1280   /// Returns true if the target machine supports masked load operation
   1281   /// for the given \p DataType and kind of access to \p Ptr.
   1282   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
   1283     return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
   1284   }
   1285   /// Returns true if vector representation of the instruction \p I
   1286   /// requires mask.
   1287   bool isMaskRequired(const Instruction* I) {
   1288     return (MaskedOp.count(I) != 0);
   1289   }
   1290   unsigned getNumStores() const {
   1291     return LAI->getNumStores();
   1292   }
   1293   unsigned getNumLoads() const {
   1294     return LAI->getNumLoads();
   1295   }
   1296   unsigned getNumPredStores() const {
   1297     return NumPredStores;
   1298   }
   1299 private:
   1300   /// Check if a single basic block loop is vectorizable.
   1301   /// At this point we know that this is a loop with a constant trip count
   1302   /// and we only need to check individual instructions.
   1303   bool canVectorizeInstrs();
   1304 
   1305   /// When we vectorize loops we may change the order in which
   1306   /// we read and write from memory. This method checks if it is
   1307   /// legal to vectorize the code, considering only memory constraints.
   1308   /// Returns true if the loop is vectorizable.
   1309   bool canVectorizeMemory();
   1310 
   1311   /// Return true if we can vectorize this loop using the IF-conversion
   1312   /// transformation.
   1313   bool canVectorizeWithIfConvert();
   1314 
   1315   /// Collect the variables that need to stay uniform after vectorization.
   1316   void collectLoopUniforms();
   1317 
   1318   /// Return true if all of the instructions in the block can be speculatively
   1319   /// executed. \p SafePtrs is a list of addresses that are known to be legal,
   1320   /// i.e. we know that we can read from them without segfaulting.
   1321   bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
   1322 
   1323   /// \brief Collect memory access with loop invariant strides.
   1324   ///
   1325   /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
   1326   /// invariant.
   1327   void collectStridedAccess(Value *LoadOrStoreInst);
   1328 
   1329   /// Report an analysis message to assist the user in diagnosing loops that are
   1330   /// not vectorized.  These are handled as LoopAccessReport rather than
   1331   /// VectorizationReport because the << operator of VectorizationReport returns
   1332   /// LoopAccessReport.
   1333   void emitAnalysis(const LoopAccessReport &Message) const {
   1334     emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
   1335   }
   1336 
   1337   unsigned NumPredStores;
   1338 
   1339   /// The loop that we evaluate.
   1340   Loop *TheLoop;
   1341   /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
   1342   /// Applies dynamic knowledge to simplify SCEV expressions in the context
   1343   /// of existing SCEV assumptions. The analysis will also add a minimal set
   1344   /// of new predicates if this is required to enable vectorization and
   1345   /// unrolling.
   1346   PredicatedScalarEvolution &PSE;
   1347   /// Target Library Info.
   1348   TargetLibraryInfo *TLI;
   1349   /// Parent function
   1350   Function *TheFunction;
   1351   /// Target Transform Info
   1352   const TargetTransformInfo *TTI;
   1353   /// Dominator Tree.
   1354   DominatorTree *DT;
   1355   // LoopAccess analysis.
   1356   LoopAccessAnalysis *LAA;
   1357   // And the loop-accesses info corresponding to this loop.  This pointer is
   1358   // null until canVectorizeMemory sets it up.
   1359   const LoopAccessInfo *LAI;
   1360 
   1361   /// The interleaved access information contains groups of interleaved
   1362   /// accesses with the same stride that are close to each other.
   1363   InterleavedAccessInfo InterleaveInfo;
   1364 
   1365   //  ---  vectorization state --- //
   1366 
   1367   /// Holds the integer induction variable. This is the counter of the
   1368   /// loop.
   1369   PHINode *Induction;
   1370   /// Holds the reduction variables.
   1371   ReductionList Reductions;
   1372   /// Holds all of the induction variables that we found in the loop.
   1373   /// Notice that inductions don't need to start at zero and that induction
   1374   /// variables can be pointers.
   1375   InductionList Inductions;
   1376   /// Holds the widest induction type encountered.
   1377   Type *WidestIndTy;
   1378 
   1379   /// Allowed outside users. This holds the reduction
   1380   /// vars which can be accessed from outside the loop.
   1381   SmallPtrSet<Value*, 4> AllowedExit;
   1382   /// This set holds the variables which are known to be uniform after
   1383   /// vectorization.
   1384   SmallPtrSet<Instruction*, 4> Uniforms;
   1385 
   1386   /// Can we assume the absence of NaNs.
   1387   bool HasFunNoNaNAttr;
   1388 
   1389   /// Vectorization requirements that will go through late-evaluation.
   1390   LoopVectorizationRequirements *Requirements;
   1391 
   1392   /// Used to emit an analysis of any legality issues.
   1393   const LoopVectorizeHints *Hints;
   1394 
   1395   ValueToValueMap Strides;
   1396   SmallPtrSet<Value *, 8> StrideSet;
   1397 
   1398   /// While vectorizing these instructions we have to generate a
   1399   /// call to the appropriate masked intrinsic.
   1400   SmallPtrSet<const Instruction *, 8> MaskedOp;
   1401 };
   1402 
   1403 /// LoopVectorizationCostModel - estimates the expected speedups due to
   1404 /// vectorization.
   1405 /// In many cases vectorization is not profitable. This can happen for a
   1406 /// number of reasons. In this class we mainly attempt to predict the
   1407 /// expected speedup/slowdowns due to the supported instruction set. We use the
   1408 /// TargetTransformInfo to query the different backends for the cost of
   1409 /// different operations.
   1410 class LoopVectorizationCostModel {
   1411 public:
   1412   LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
   1413                              LoopInfo *LI, LoopVectorizationLegality *Legal,
   1414                              const TargetTransformInfo &TTI,
   1415                              const TargetLibraryInfo *TLI, DemandedBits *DB,
   1416                              AssumptionCache *AC, const Function *F,
   1417                              const LoopVectorizeHints *Hints)
   1418       : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
   1419         AC(AC), TheFunction(F), Hints(Hints) {}
   1420 
   1421   /// Information about vectorization costs
   1422   struct VectorizationFactor {
   1423     unsigned Width; // Vector width with best cost
   1424     unsigned Cost; // Cost of the loop with that width
   1425   };
   1426   /// \return The most profitable vectorization factor and the cost of that VF.
   1427   /// This method checks every power of two up to the maximum legal VF. If the
   1428   /// user has specified a width in the loop hints and vectorization is
   1429   /// possible, that vectorization factor is selected instead.
   1430   VectorizationFactor selectVectorizationFactor(bool OptForSize);
   1431 
   1432   /// \return The size (in bits) of the smallest and widest types in the code
   1433   /// that needs to be vectorized. We ignore values that remain scalar such as
   1434   /// 64 bit loop indices.
   1435   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
   1436 
   1437   /// \return The desired interleave count.
   1438   /// If interleave count has been specified by metadata it will be returned.
   1439   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
   1440   /// are the selected vectorization factor and the cost of the selected VF.
   1441   unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
   1442                                  unsigned LoopCost);
   1443 
   1444   /// \return The most profitable interleave (unroll) count.
   1445   /// This method finds the best interleave count based on register pressure
   1446   /// and other parameters. VF and LoopCost are the selected vectorization
   1447   /// factor and the cost of the selected VF.
   1448   unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
   1449                                   unsigned LoopCost);
   1450 
   1451   /// \brief A struct that represents some properties of the register usage
   1452   /// of a loop.
   1453   struct RegisterUsage {
   1454     /// Holds the number of loop invariant values that are used in the loop.
   1455     unsigned LoopInvariantRegs;
   1456     /// Holds the maximum number of concurrent live intervals in the loop.
   1457     unsigned MaxLocalUsers;
   1458     /// Holds the number of instructions in the loop.
   1459     unsigned NumInstructions;
   1460   };
   1461 
   1462   /// \return Returns information about the register usages of the loop for the
   1463   /// given vectorization factors.
   1464   SmallVector<RegisterUsage, 8>
   1465   calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
   1466 
   1467   /// Collect values we want to ignore in the cost model.
   1468   void collectValuesToIgnore();
   1469 
   1470 private:
   1471   /// Returns the expected execution cost. The unit of the cost does
   1472   /// not matter because we use the 'cost' units to compare different
   1473   /// vector widths. The cost that is returned is *not* normalized by
   1474   /// the factor width.
   1475   unsigned expectedCost(unsigned VF);
   1476 
   1477   /// Returns the execution time cost of an instruction for a given vector
   1478   /// width. Vector width of one means scalar.
   1479   unsigned getInstructionCost(Instruction *I, unsigned VF);
   1480 
   1481   /// Returns whether the instruction is a load or store and will be emitted
   1482   /// as a vector operation.
   1483   bool isConsecutiveLoadOrStore(Instruction *I);
   1484 
   1485   /// Report an analysis message to assist the user in diagnosing loops that are
   1486   /// not vectorized.  These are handled as LoopAccessReport rather than
   1487   /// VectorizationReport because the << operator of VectorizationReport returns
   1488   /// LoopAccessReport.
   1489   void emitAnalysis(const LoopAccessReport &Message) const {
   1490     emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
   1491   }
   1492 
   1493 public:
   1494   /// Map of scalar integer values to the smallest bitwidth they can be legally
   1495   /// represented as. The vector equivalents of these values should be truncated
   1496   /// to this type.
   1497   MapVector<Instruction*,uint64_t> MinBWs;
   1498 
   1499   /// The loop that we evaluate.
   1500   Loop *TheLoop;
   1501   /// Predicated scalar evolution analysis.
   1502   PredicatedScalarEvolution &PSE;
   1503   /// Loop Info analysis.
   1504   LoopInfo *LI;
   1505   /// Vectorization legality.
   1506   LoopVectorizationLegality *Legal;
   1507   /// Vector target information.
   1508   const TargetTransformInfo &TTI;
   1509   /// Target Library Info.
   1510   const TargetLibraryInfo *TLI;
   1511   /// Demanded bits analysis.
   1512   DemandedBits *DB;
   1513   /// Assumption cache.
   1514   AssumptionCache *AC;
   1515   const Function *TheFunction;
   1516   /// Loop Vectorize Hint.
   1517   const LoopVectorizeHints *Hints;
   1518   /// Values to ignore in the cost model.
   1519   SmallPtrSet<const Value *, 16> ValuesToIgnore;
   1520   /// Values to ignore in the cost model when VF > 1.
   1521   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
   1522 };
   1523 
   1524 /// \brief This holds vectorization requirements that must be verified late in
   1525 /// the process. The requirements are set by the legality analysis and the cost
   1526 /// model. Once vectorization has been determined to be possible and profitable,
   1527 /// the requirements can be verified by looking for metadata or compiler options.
   1528 /// For example, some loops require FP commutativity, which is only allowed if
   1529 /// vectorization is explicitly specified or if the fast-math compiler option
   1530 /// has been provided.
   1531 /// Late evaluation of these requirements allows helpful diagnostics to be
   1532 /// composed that tell the user what needs to be done to vectorize the loop, for
   1533 /// example by specifying #pragma clang loop vectorize or -ffast-math. Late
   1534 /// evaluation should be used only when diagnostics can be generated that can be
   1535 /// followed by a non-expert user.
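        /// E.g. a loop with a floating-point reduction such as 'sum += A[i]' can only
        /// be vectorized if reordering the FP operations is allowed, either via the
        /// fast-math option or by explicitly requesting vectorization on the loop
        /// (e.g. '#pragma clang loop vectorize(enable)').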
   1536 class LoopVectorizationRequirements {
   1537 public:
   1538   LoopVectorizationRequirements()
   1539       : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {}
   1540 
   1541   void addUnsafeAlgebraInst(Instruction *I) {
   1542     // First unsafe algebra instruction.
   1543     if (!UnsafeAlgebraInst)
   1544       UnsafeAlgebraInst = I;
   1545   }
   1546 
   1547   void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
   1548 
   1549   bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
   1550     const char *Name = Hints.vectorizeAnalysisPassName();
   1551     bool Failed = false;
   1552     if (UnsafeAlgebraInst && !Hints.allowReordering()) {
   1553       emitOptimizationRemarkAnalysisFPCommute(
   1554           F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
   1555           VectorizationReport() << "cannot prove it is safe to reorder "
   1556                                    "floating-point operations");
   1557       Failed = true;
   1558     }
   1559 
   1560     // Test if runtime memcheck thresholds are exceeded.
   1561     bool PragmaThresholdReached =
   1562         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
   1563     bool ThresholdReached =
   1564         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
   1565     if ((ThresholdReached && !Hints.allowReordering()) ||
   1566         PragmaThresholdReached) {
   1567       emitOptimizationRemarkAnalysisAliasing(
   1568           F->getContext(), Name, *F, L->getStartLoc(),
   1569           VectorizationReport()
   1570               << "cannot prove it is safe to reorder memory operations");
   1571       DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
   1572       Failed = true;
   1573     }
   1574 
   1575     return Failed;
   1576   }
   1577 
   1578 private:
   1579   unsigned NumRuntimePointerChecks;
   1580   Instruction *UnsafeAlgebraInst;
   1581 };
   1582 
   1583 static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
   1584   if (L.empty())
   1585     return V.push_back(&L);
   1586 
   1587   for (Loop *InnerL : L)
   1588     addInnerLoop(*InnerL, V);
   1589 }
   1590 
   1591 /// The LoopVectorize Pass.
   1592 struct LoopVectorize : public FunctionPass {
   1593   /// Pass identification, replacement for typeid
   1594   static char ID;
   1595 
   1596   explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
   1597     : FunctionPass(ID),
   1598       DisableUnrolling(NoUnrolling),
   1599       AlwaysVectorize(AlwaysVectorize) {
   1600     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   1601   }
   1602 
   1603   ScalarEvolution *SE;
   1604   LoopInfo *LI;
   1605   TargetTransformInfo *TTI;
   1606   DominatorTree *DT;
   1607   BlockFrequencyInfo *BFI;
   1608   TargetLibraryInfo *TLI;
   1609   DemandedBits *DB;
   1610   AliasAnalysis *AA;
   1611   AssumptionCache *AC;
   1612   LoopAccessAnalysis *LAA;
   1613   bool DisableUnrolling;
   1614   bool AlwaysVectorize;
   1615 
   1616   BlockFrequency ColdEntryFreq;
   1617 
   1618   bool runOnFunction(Function &F) override {
   1619     SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   1620     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   1621     TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   1622     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   1623     BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
   1624     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   1625     TLI = TLIP ? &TLIP->getTLI() : nullptr;
   1626     AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   1627     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   1628     LAA = &getAnalysis<LoopAccessAnalysis>();
   1629     DB = &getAnalysis<DemandedBits>();
   1630 
   1631     // Compute some weights before iterating over the loops below. Compute
   1632     // this using a BranchProbability to re-use its scaling math.
   1633     const BranchProbability ColdProb(1, 5); // 20%
   1634     ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
   1635 
   1636     // Don't attempt if
   1637     // 1. the target claims to have no vector registers, and
   1638     // 2. interleaving won't help ILP.
   1639     //
   1640     // The second condition is necessary because, even if the target has no
   1641     // vector registers, loop vectorization may still enable scalar
   1642     // interleaving.
   1643     if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
   1644       return false;
   1645 
   1646     // Build up a worklist of inner-loops to vectorize. This is necessary as
   1647     // the act of vectorizing or partially unrolling a loop creates new loops
   1648     // and can invalidate iterators across the loops.
   1649     SmallVector<Loop *, 8> Worklist;
   1650 
   1651     for (Loop *L : *LI)
   1652       addInnerLoop(*L, Worklist);
   1653 
   1654     LoopsAnalyzed += Worklist.size();
   1655 
   1656     // Now walk the identified inner loops.
   1657     bool Changed = false;
   1658     while (!Worklist.empty())
   1659       Changed |= processLoop(Worklist.pop_back_val());
   1660 
   1661     // Process each loop nest in the function.
   1662     return Changed;
   1663   }
   1664 
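          // When no unroll-disable metadata is already present, this attaches
          // "llvm.loop.unroll.runtime.disable" to the loop ID, roughly (IR sketch):
          //   br i1 %exitcond, label %exit, label %loop, !llvm.loop !0
          //   !0 = distinct !{!0, !1}
          //   !1 = !{!"llvm.loop.unroll.runtime.disable"}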
   1665   static void AddRuntimeUnrollDisableMetaData(Loop *L) {
   1666     SmallVector<Metadata *, 4> MDs;
   1667     // Reserve first location for self reference to the LoopID metadata node.
   1668     MDs.push_back(nullptr);
   1669     bool IsUnrollMetadata = false;
   1670     MDNode *LoopID = L->getLoopID();
   1671     if (LoopID) {
   1672       // First find existing loop unrolling disable metadata.
   1673       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
   1674         MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
   1675         if (MD) {
   1676           const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
   1677           IsUnrollMetadata =
   1678               S && S->getString().startswith("llvm.loop.unroll.disable");
   1679         }
   1680         MDs.push_back(LoopID->getOperand(i));
   1681       }
   1682     }
   1683 
   1684     if (!IsUnrollMetadata) {
   1685       // Add runtime unroll disable metadata.
   1686       LLVMContext &Context = L->getHeader()->getContext();
   1687       SmallVector<Metadata *, 1> DisableOperands;
   1688       DisableOperands.push_back(
   1689           MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
   1690       MDNode *DisableNode = MDNode::get(Context, DisableOperands);
   1691       MDs.push_back(DisableNode);
   1692       MDNode *NewLoopID = MDNode::get(Context, MDs);
   1693       // Set operand 0 to refer to the loop id itself.
   1694       NewLoopID->replaceOperandWith(0, NewLoopID);
   1695       L->setLoopID(NewLoopID);
   1696     }
   1697   }
   1698 
   1699   bool processLoop(Loop *L) {
   1700     assert(L->empty() && "Only process inner loops.");
   1701 
   1702 #ifndef NDEBUG
   1703     const std::string DebugLocStr = getDebugLocString(L);
   1704 #endif /* NDEBUG */
   1705 
   1706     DEBUG(dbgs() << "\nLV: Checking a loop in \""
   1707                  << L->getHeader()->getParent()->getName() << "\" from "
   1708                  << DebugLocStr << "\n");
   1709 
   1710     LoopVectorizeHints Hints(L, DisableUnrolling);
   1711 
   1712     DEBUG(dbgs() << "LV: Loop hints:"
   1713                  << " force="
   1714                  << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
   1715                          ? "disabled"
   1716                          : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
   1717                                 ? "enabled"
   1718                                 : "?")) << " width=" << Hints.getWidth()
   1719                  << " unroll=" << Hints.getInterleave() << "\n");
   1720 
   1721     // Function containing loop
   1722     Function *F = L->getHeader()->getParent();
   1723 
   1724     // Looking at the diagnostic output is the only way to determine if a loop
   1725     // was vectorized (other than looking at the IR or machine code), so it
   1726     // is important to generate an optimization remark for each loop. Most of
   1727     // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
   1728     // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
   1729     // less verbose, reporting vectorized loops and unvectorized loops that may
   1730     // benefit from vectorization, respectively.
   1731 
   1732     if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
   1733       DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
   1734       return false;
   1735     }
   1736 
   1737     // Check the loop for a trip count threshold:
   1738     // do not vectorize loops with a tiny trip count.
   1739     const unsigned TC = SE->getSmallConstantTripCount(L);
   1740     if (TC > 0u && TC < TinyTripCountVectorThreshold) {
   1741       DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
   1742                    << "This loop is not worth vectorizing.");
   1743       if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
   1744         DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
   1745       else {
   1746         DEBUG(dbgs() << "\n");
   1747         emitAnalysisDiag(F, L, Hints, VectorizationReport()
   1748                                           << "vectorization is not beneficial "
   1749                                              "and is not explicitly forced");
   1750         return false;
   1751       }
   1752     }
   1753 
   1754     PredicatedScalarEvolution PSE(*SE);
   1755 
   1756     // Check if it is legal to vectorize the loop.
   1757     LoopVectorizationRequirements Requirements;
   1758     LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
   1759                                   &Requirements, &Hints);
   1760     if (!LVL.canVectorize()) {
   1761       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
   1762       emitMissedWarning(F, L, Hints);
   1763       return false;
   1764     }
   1765 
   1766     // Use the cost model.
   1767     LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
   1768                                   &Hints);
   1769     CM.collectValuesToIgnore();
   1770 
   1771     // Check the function attributes to find out if this function should be
   1772     // optimized for size.
   1773     bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
   1774                       F->optForSize();
   1775 
   1776     // Compute the weighted frequency of this loop being executed and see if it
   1777     // is less than 20% of the function entry baseline frequency. Note that we
   1778     // always have a canonical loop here because we think we *can* vectorize.
   1779     // FIXME: This is hidden behind a flag due to pervasive problems with
   1780     // exactly what block frequency models.
   1781     if (LoopVectorizeWithBlockFrequency) {
   1782       BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
   1783       if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
   1784           LoopEntryFreq < ColdEntryFreq)
   1785         OptForSize = true;
   1786     }
   1787 
   1788     // Check the function attributes to see if implicit floats are allowed.
   1789     // FIXME: This check doesn't seem possibly correct -- what if the loop is
   1790     // an integer loop and the vector instructions selected are purely integer
   1791     // vector instructions?
   1792     if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
   1793       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
   1794             "attribute is used.\n");
   1795       emitAnalysisDiag(
   1796           F, L, Hints,
   1797           VectorizationReport()
   1798               << "loop not vectorized due to NoImplicitFloat attribute");
   1799       emitMissedWarning(F, L, Hints);
   1800       return false;
   1801     }
   1802 
   1803     // Select the optimal vectorization factor.
   1804     const LoopVectorizationCostModel::VectorizationFactor VF =
   1805         CM.selectVectorizationFactor(OptForSize);
   1806 
   1807     // Select the interleave count.
   1808     unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
   1809 
   1810     // Get user interleave count.
   1811     unsigned UserIC = Hints.getInterleave();
   1812 
   1813     // Identify the diagnostic messages that should be produced.
   1814     std::string VecDiagMsg, IntDiagMsg;
   1815     bool VectorizeLoop = true, InterleaveLoop = true;
   1816 
   1817     if (Requirements.doesNotMeet(F, L, Hints)) {
   1818       DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
   1819                       "requirements.\n");
   1820       emitMissedWarning(F, L, Hints);
   1821       return false;
   1822     }
   1823 
   1824     if (VF.Width == 1) {
   1825       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
   1826       VecDiagMsg =
   1827           "the cost-model indicates that vectorization is not beneficial";
   1828       VectorizeLoop = false;
   1829     }
   1830 
   1831     if (IC == 1 && UserIC <= 1) {
   1832       // Tell the user interleaving is not beneficial.
   1833       DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
   1834       IntDiagMsg =
   1835           "the cost-model indicates that interleaving is not beneficial";
   1836       InterleaveLoop = false;
   1837       if (UserIC == 1)
   1838         IntDiagMsg +=
   1839             " and is explicitly disabled or interleave count is set to 1";
   1840     } else if (IC > 1 && UserIC == 1) {
   1841       // Tell the user interleaving is beneficial, but it is explicitly disabled.
   1842       DEBUG(dbgs()
   1843             << "LV: Interleaving is beneficial but is explicitly disabled.");
   1844       IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
   1845                    "but is explicitly disabled or interleave count is set to 1";
   1846       InterleaveLoop = false;
   1847     }
   1848 
   1849     // Override IC if user provided an interleave count.
   1850     IC = UserIC > 0 ? UserIC : IC;
   1851 
   1852     // Emit diagnostic messages, if any.
   1853     const char *VAPassName = Hints.vectorizeAnalysisPassName();
   1854     if (!VectorizeLoop && !InterleaveLoop) {
   1855       // Do not vectorize or interleave the loop.
   1856       emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
   1857                                      L->getStartLoc(), VecDiagMsg);
   1858       emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
   1859                                      L->getStartLoc(), IntDiagMsg);
   1860       return false;
   1861     } else if (!VectorizeLoop && InterleaveLoop) {
   1862       DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
   1863       emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
   1864                                      L->getStartLoc(), VecDiagMsg);
   1865     } else if (VectorizeLoop && !InterleaveLoop) {
   1866       DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
   1867                    << DebugLocStr << '\n');
   1868       emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
   1869                                      L->getStartLoc(), IntDiagMsg);
   1870     } else if (VectorizeLoop && InterleaveLoop) {
   1871       DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
   1872                    << DebugLocStr << '\n');
   1873       DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
   1874     }
   1875 
   1876     if (!VectorizeLoop) {
   1877       assert(IC > 1 && "interleave count should not be 1 or 0");
   1878       // If we decided that it is not legal to vectorize the loop then
   1879       // interleave it.
   1880       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC);
   1881       Unroller.vectorize(&LVL, CM.MinBWs);
   1882 
   1883       emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
   1884                              Twine("interleaved loop (interleaved count: ") +
   1885                                  Twine(IC) + ")");
   1886     } else {
   1887       // If we decided that it is *legal* to vectorize the loop then do it.
   1888       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC);
   1889       LB.vectorize(&LVL, CM.MinBWs);
   1890       ++LoopsVectorized;
   1891 
   1892       // Add metadata to disable runtime unrolling of the scalar loop when there
   1893       // are no runtime checks for strides and memory. In that situation the
   1894       // scalar loop is rarely executed and is not worth unrolling.
   1895       if (!LB.IsSafetyChecksAdded())
   1896         AddRuntimeUnrollDisableMetaData(L);
   1897 
   1898       // Report the vectorization decision.
   1899       emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
   1900                              Twine("vectorized loop (vectorization width: ") +
   1901                                  Twine(VF.Width) + ", interleaved count: " +
   1902                                  Twine(IC) + ")");
   1903     }
   1904 
   1905     // Mark the loop as already vectorized to avoid vectorizing again.
   1906     Hints.setAlreadyVectorized();
   1907 
   1908     DEBUG(verifyFunction(*L->getHeader()->getParent()));
   1909     return true;
   1910   }
   1911 
   1912   void getAnalysisUsage(AnalysisUsage &AU) const override {
   1913     AU.addRequired<AssumptionCacheTracker>();
   1914     AU.addRequiredID(LoopSimplifyID);
   1915     AU.addRequiredID(LCSSAID);
   1916     AU.addRequired<BlockFrequencyInfoWrapperPass>();
   1917     AU.addRequired<DominatorTreeWrapperPass>();
   1918     AU.addRequired<LoopInfoWrapperPass>();
   1919     AU.addRequired<ScalarEvolutionWrapperPass>();
   1920     AU.addRequired<TargetTransformInfoWrapperPass>();
   1921     AU.addRequired<AAResultsWrapperPass>();
   1922     AU.addRequired<LoopAccessAnalysis>();
   1923     AU.addRequired<DemandedBits>();
   1924     AU.addPreserved<LoopInfoWrapperPass>();
   1925     AU.addPreserved<DominatorTreeWrapperPass>();
   1926     AU.addPreserved<BasicAAWrapperPass>();
   1927     AU.addPreserved<AAResultsWrapperPass>();
   1928     AU.addPreserved<GlobalsAAWrapperPass>();
   1929   }
   1930 
   1931 };
   1932 
   1933 } // end anonymous namespace
   1934 
   1935 //===----------------------------------------------------------------------===//
   1936 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
   1937 // LoopVectorizationCostModel.
   1938 //===----------------------------------------------------------------------===//
   1939 
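        // E.g. for VF = 4, a loop-invariant scalar %x is broadcast into the vector
        // <%x, %x, %x, %x>, and the splat is emitted in the new vector preheader.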
   1940 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   1941   // We need to place the broadcast of invariant variables outside the loop.
   1942   Instruction *Instr = dyn_cast<Instruction>(V);
   1943   bool NewInstr =
   1944       (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
   1945                           Instr->getParent()) != LoopVectorBody.end());
   1946   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
   1947 
   1948   // Place the code for broadcasting invariant variables in the new preheader.
   1949   IRBuilder<>::InsertPointGuard Guard(Builder);
   1950   if (Invariant)
   1951     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
   1952 
   1953   // Broadcast the scalar into all locations in the vector.
   1954   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
   1955 
   1956   return Shuf;
   1957 }
   1958 
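        // E.g. for VF = 4, StartIdx = 0, a splatted value %val and a step %step,
        // this produces the vector
        //   <%val + 0 * %step, %val + 1 * %step, %val + 2 * %step, %val + 3 * %step>.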
   1959 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
   1960                                           Value *Step) {
   1961   assert(Val->getType()->isVectorTy() && "Must be a vector");
   1962   assert(Val->getType()->getScalarType()->isIntegerTy() &&
   1963          "Elem must be an integer");
   1964   assert(Step->getType() == Val->getType()->getScalarType() &&
   1965          "Step has wrong type");
   1966   // Create the types.
   1967   Type *ITy = Val->getType()->getScalarType();
   1968   VectorType *Ty = cast<VectorType>(Val->getType());
   1969   int VLen = Ty->getNumElements();
   1970   SmallVector<Constant*, 8> Indices;
   1971 
   1972   // Create a vector of consecutive numbers from zero to VF.
   1973   for (int i = 0; i < VLen; ++i)
   1974     Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
   1975 
   1976   // Add the consecutive indices to the vector value.
   1977   Constant *Cv = ConstantVector::get(Indices);
   1978   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
   1979   Step = Builder.CreateVectorSplat(VLen, Step);
   1980   assert(Step->getType() == Val->getType() && "Invalid step vec");
   1981   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
   1982   // which can be found from the original scalar operations.
   1983   Step = Builder.CreateMul(Cv, Step);
   1984   return Builder.CreateAdd(Val, Step, "induction");
   1985 }
   1986 
   1987 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   1988   assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
   1989   auto *SE = PSE.getSE();
   1990   // Make sure that the pointer does not point to structs.
   1991   if (Ptr->getType()->getPointerElementType()->isAggregateType())
   1992     return 0;
   1993 
   1994   // If this value is a pointer induction variable we know it is consecutive.
   1995   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
   1996   if (Phi && Inductions.count(Phi)) {
   1997     InductionDescriptor II = Inductions[Phi];
   1998     return II.getConsecutiveDirection();
   1999   }
   2000 
   2001   GetElementPtrInst *Gep = getGEPInstruction(Ptr);
   2002   if (!Gep)
   2003     return 0;
   2004 
   2005   unsigned NumOperands = Gep->getNumOperands();
   2006   Value *GpPtr = Gep->getPointerOperand();
   2007   // If this GEP value is a consecutive pointer induction variable and all of
   2008   // the indices are loop invariant, then we know it is consecutive.
   2009   Phi = dyn_cast<PHINode>(GpPtr);
   2010   if (Phi && Inductions.count(Phi)) {
   2011 
   2012     // Make sure that the pointer does not point to structs.
   2013     PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
   2014     if (GepPtrType->getElementType()->isAggregateType())
   2015       return 0;
   2016 
   2017     // Make sure that all of the index operands are loop invariant.
   2018     for (unsigned i = 1; i < NumOperands; ++i)
   2019       if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
   2020         return 0;
   2021 
   2022     InductionDescriptor II = Inductions[Phi];
   2023     return II.getConsecutiveDirection();
   2024   }
   2025 
   2026   unsigned InductionOperand = getGEPInductionOperand(Gep);
   2027 
   2028   // Check that all of the gep indices are uniform except for our induction
   2029   // operand.
   2030   for (unsigned i = 0; i != NumOperands; ++i)
   2031     if (i != InductionOperand &&
   2032         !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
   2033       return 0;
   2034 
   2035   // We can emit wide load/stores only if the last non-zero index is the
   2036   // induction variable.
   2037   const SCEV *Last = nullptr;
   2038   if (!Strides.count(Gep))
   2039     Last = PSE.getSCEV(Gep->getOperand(InductionOperand));
   2040   else {
   2041     // Because of the multiplication by a stride we can have a s/zext cast.
   2042     // We are going to replace this stride by 1 so the cast is safe to ignore.
   2043     //
   2044     //  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   2045     //  %0 = trunc i64 %indvars.iv to i32
   2046     //  %mul = mul i32 %0, %Stride1
   2047     //  %idxprom = zext i32 %mul to i64  << Safe cast.
   2048     //  %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
   2049     //
   2050     Last = replaceSymbolicStrideSCEV(PSE, Strides,
   2051                                      Gep->getOperand(InductionOperand), Gep);
   2052     if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
   2053       Last =
   2054           (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
   2055               ? C->getOperand()
   2056               : Last;
   2057   }
   2058   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
   2059     const SCEV *Step = AR->getStepRecurrence(*SE);
   2060 
   2061     // The memory is consecutive because the last index is consecutive
   2062     // and all other indices are loop invariant.
   2063     if (Step->isOne())
   2064       return 1;
   2065     if (Step->isAllOnesValue())
   2066       return -1;
   2067   }
   2068 
   2069   return 0;
   2070 }
   2071 
   2072 bool LoopVectorizationLegality::isUniform(Value *V) {
   2073   return LAI->isUniform(V);
   2074 }
   2075 
   2076 InnerLoopVectorizer::VectorParts&
   2077 InnerLoopVectorizer::getVectorValue(Value *V) {
   2078   assert(V != Induction && "The new induction variable should not be used.");
   2079   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   2080 
   2081   // If we have a stride that is replaced by one, do it here.
   2082   if (Legal->hasStride(V))
   2083     V = ConstantInt::get(V->getType(), 1);
   2084 
   2085   // If we have this scalar in the map, return it.
   2086   if (WidenMap.has(V))
   2087     return WidenMap.get(V);
   2088 
   2089   // If this scalar is unknown, assume that it is a constant or that it is
   2090   // loop invariant. Broadcast V and save the value for future uses.
   2091   Value *B = getBroadcastInstrs(V);
   2092   return WidenMap.splat(V, B);
   2093 }
   2094 
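        // E.g. for VF = 4 this emits a shufflevector with mask <3, 2, 1, 0>, so
        // <a0, a1, a2, a3> becomes <a3, a2, a1, a0>.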
   2095 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
   2096   assert(Vec->getType()->isVectorTy() && "Invalid type");
   2097   SmallVector<Constant*, 8> ShuffleMask;
   2098   for (unsigned i = 0; i < VF; ++i)
   2099     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
   2100 
   2101   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
   2102                                      ConstantVector::get(ShuffleMask),
   2103                                      "reverse");
   2104 }
   2105 
   2106 // Get a mask to interleave \p NumVec vectors into a wide vector.
   2107 // I.e.  <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
   2108 // E.g. For 2 interleaved vectors, if VF is 4, the mask is:
   2109 //      <0, 4, 1, 5, 2, 6, 3, 7>
   2110 static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
   2111                                     unsigned NumVec) {
   2112   SmallVector<Constant *, 16> Mask;
   2113   for (unsigned i = 0; i < VF; i++)
   2114     for (unsigned j = 0; j < NumVec; j++)
   2115       Mask.push_back(Builder.getInt32(j * VF + i));
   2116 
   2117   return ConstantVector::get(Mask);
   2118 }
   2119 
   2120 // Get the strided mask starting from index \p Start.
   2121 // I.e.  <Start, Start + Stride, ..., Start + Stride*(VF-1)>
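        // E.g. for Start = 1, Stride = 3 and VF = 4 the mask is <1, 4, 7, 10>.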
   2122 static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
   2123                                 unsigned Stride, unsigned VF) {
   2124   SmallVector<Constant *, 16> Mask;
   2125   for (unsigned i = 0; i < VF; i++)
   2126     Mask.push_back(Builder.getInt32(Start + i * Stride));
   2127 
   2128   return ConstantVector::get(Mask);
   2129 }
   2130 
   2131 // Get a mask of two parts: The first part consists of sequential integers
   2132 // starting from 0, The second part consists of UNDEFs.
   2133 // I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
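        // E.g. for NumInt = 4 and NumUndef = 2 the mask is
        //      <0, 1, 2, 3, undef, undef>.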
   2134 static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
   2135                                    unsigned NumUndef) {
   2136   SmallVector<Constant *, 16> Mask;
   2137   for (unsigned i = 0; i < NumInt; i++)
   2138     Mask.push_back(Builder.getInt32(i));
   2139 
   2140   Constant *Undef = UndefValue::get(Builder.getInt32Ty());
   2141   for (unsigned i = 0; i < NumUndef; i++)
   2142     Mask.push_back(Undef);
   2143 
   2144   return ConstantVector::get(Mask);
   2145 }
   2146 
   2147 // Concatenate two vectors with the same element type. The 2nd vector should
   2148 // not have more elements than the 1st vector. If the 2nd vector has fewer
   2149 // elements, extend it with UNDEFs.
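        // E.g. concatenating <4 x i32> %a and <2 x i32> %b first widens %b to
        // <4 x i32> with two trailing undef elements and then shuffles %a and the
        // widened %b with mask <0, 1, 2, 3, 4, 5> into a <6 x i32> result.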
   2150 static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
   2151                                     Value *V2) {
   2152   VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
   2153   VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
   2154   assert(VecTy1 && VecTy2 &&
   2155          VecTy1->getScalarType() == VecTy2->getScalarType() &&
   2156          "Expect two vectors with the same element type");
   2157 
   2158   unsigned NumElts1 = VecTy1->getNumElements();
   2159   unsigned NumElts2 = VecTy2->getNumElements();
   2160   assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
   2161 
   2162   if (NumElts1 > NumElts2) {
   2163     // Extend with UNDEFs.
   2164     Constant *ExtMask =
   2165         getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
   2166     V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
   2167   }
   2168 
   2169   Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
   2170   return Builder.CreateShuffleVector(V1, V2, Mask);
   2171 }
   2172 
   2173 // Concatenate vectors in the given list. All vectors have the same type.
   2174 static Value *ConcatenateVectors(IRBuilder<> &Builder,
   2175                                  ArrayRef<Value *> InputList) {
   2176   unsigned NumVec = InputList.size();
   2177   assert(NumVec > 1 && "Should be at least two vectors");
   2178 
   2179   SmallVector<Value *, 8> ResList;
   2180   ResList.append(InputList.begin(), InputList.end());
   2181   do {
   2182     SmallVector<Value *, 8> TmpList;
   2183     for (unsigned i = 0; i < NumVec - 1; i += 2) {
   2184       Value *V0 = ResList[i], *V1 = ResList[i + 1];
   2185       assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
   2186              "Only the last vector may have a different type");
   2187 
   2188       TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
   2189     }
   2190 
   2191     // Push the last vector if the total number of vectors is odd.
   2192     if (NumVec % 2 != 0)
   2193       TmpList.push_back(ResList[NumVec - 1]);
   2194 
   2195     ResList = TmpList;
   2196     NumVec = ResList.size();
   2197   } while (NumVec > 1);
   2198 
   2199   return ResList[0];
   2200 }
   2201 
   2202 // Try to vectorize the interleave group that \p Instr belongs to.
   2203 //
   2204 // E.g. Translate following interleaved load group (factor = 3):
   2205 //   for (i = 0; i < N; i+=3) {
   2206 //     R = Pic[i];             // Member of index 0
   2207 //     G = Pic[i+1];           // Member of index 1
   2208 //     B = Pic[i+2];           // Member of index 2
   2209 //     ... // do something to R, G, B
   2210 //   }
   2211 // To:
   2212 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
   2213 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
   2214 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
   2215 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
   2216 //
   2217 // Or translate following interleaved store group (factor = 3):
   2218 //   for (i = 0; i < N; i+=3) {
   2219 //     ... do something to R, G, B
   2220 //     Pic[i]   = R;           // Member of index 0
   2221 //     Pic[i+1] = G;           // Member of index 1
   2222 //     Pic[i+2] = B;           // Member of index 2
   2223 //   }
   2224 // To:
   2225 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
   2226 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
   2227 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
   2228 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
   2229 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
   2230 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
   2231   const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
   2232   assert(Group && "Fail to get an interleaved access group.");
   2233 
   2234   // Skip if current instruction is not the insert position.
   2235   if (Instr != Group->getInsertPos())
   2236     return;
   2237 
   2238   LoadInst *LI = dyn_cast<LoadInst>(Instr);
   2239   StoreInst *SI = dyn_cast<StoreInst>(Instr);
   2240   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   2241 
   2242   // Prepare for the vector type of the interleaved load/store.
   2243   Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
   2244   unsigned InterleaveFactor = Group->getFactor();
   2245   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
   2246   Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
   2247 
   2248   // Prepare for the new pointers.
   2249   setDebugLocFromInst(Builder, Ptr);
   2250   VectorParts &PtrParts = getVectorValue(Ptr);
   2251   SmallVector<Value *, 2> NewPtrs;
   2252   unsigned Index = Group->getIndex(Instr);
   2253   for (unsigned Part = 0; Part < UF; Part++) {
   2254     // Extract the pointer for current instruction from the pointer vector. A
   2255     // reverse access uses the pointer in the last lane.
   2256     Value *NewPtr = Builder.CreateExtractElement(
   2257         PtrParts[Part],
   2258         Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
   2259 
   2260     // Note that the current instruction may be at any index in the group, so
   2261     // the address must be adjusted to the member at index 0.
   2262     //
   2263     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
   2264     //       b = A[i];       // Member of index 0
   2265     // The current pointer points to A[i+1]; adjust it to A[i].
   2266     //
   2267     // E.g.  A[i+1] = a;     // Member of index 1
   2268     //       A[i]   = b;     // Member of index 0
   2269     //       A[i+2] = c;     // Member of index 2 (Current instruction)
   2270     // The current pointer points to A[i+2]; adjust it to A[i].
   2271     NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
   2272 
   2273     // Cast to the vector pointer type.
   2274     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
   2275   }
   2276 
   2277   setDebugLocFromInst(Builder, Instr);
   2278   Value *UndefVec = UndefValue::get(VecTy);
   2279 
   2280   // Vectorize the interleaved load group.
   2281   if (LI) {
   2282     for (unsigned Part = 0; Part < UF; Part++) {
   2283       Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
   2284           NewPtrs[Part], Group->getAlignment(), "wide.vec");
   2285 
   2286       for (unsigned i = 0; i < InterleaveFactor; i++) {
   2287         Instruction *Member = Group->getMember(i);
   2288 
   2289         // Skip the gaps in the group.
   2290         if (!Member)
   2291           continue;
   2292 
   2293         Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);
   2294         Value *StridedVec = Builder.CreateShuffleVector(
   2295             NewLoadInstr, UndefVec, StrideMask, "strided.vec");
   2296 
   2297         // If this member has a different type, cast the result to that type.
   2298         if (Member->getType() != ScalarTy) {
   2299           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
   2300           StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
   2301         }
   2302 
   2303         VectorParts &Entry = WidenMap.get(Member);
   2304         Entry[Part] =
   2305             Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
   2306       }
   2307 
   2308       propagateMetadata(NewLoadInstr, Instr);
   2309     }
   2310     return;
   2311   }
   2312 
   2313   // The sub-vector type for the current instruction.
   2314   VectorType *SubVT = VectorType::get(ScalarTy, VF);
   2315 
   2316   // Vectorize the interleaved store group.
   2317   for (unsigned Part = 0; Part < UF; Part++) {
   2318     // Collect the stored vector from each member.
   2319     SmallVector<Value *, 4> StoredVecs;
   2320     for (unsigned i = 0; i < InterleaveFactor; i++) {
   2321       // An interleaved store group doesn't allow gaps, so each index has a member.
   2322       Instruction *Member = Group->getMember(i);
   2323       assert(Member && "Fail to get a member from an interleaved store group");
   2324 
   2325       Value *StoredVec =
   2326           getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];
   2327       if (Group->isReverse())
   2328         StoredVec = reverseVector(StoredVec);
   2329 
   2330       // If this member has a different type, cast it to a unified type.
   2331       if (StoredVec->getType() != SubVT)
   2332         StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
   2333 
   2334       StoredVecs.push_back(StoredVec);
   2335     }
   2336 
   2337     // Concatenate all vectors into a wide vector.
   2338     Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
   2339 
   2340     // Interleave the elements in the wide vector.
   2341     Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
   2342     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
   2343                                               "interleaved.vec");
   2344 
   2345     Instruction *NewStoreInstr =
   2346         Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
   2347     propagateMetadata(NewStoreInstr, Instr);
   2348   }
   2349 }
   2350 
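        // E.g. with VF = 4, a consecutive access such as 'A[i]' becomes a single
        // wide <4 x i32> load/store per unrolled part, a reversed access such as
        // 'A[N - i]' additionally gets a reverse shuffle, and a non-consecutive
        // access such as 'A[3 * i]' is scalarized.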
   2351 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   2352   // Attempt to issue a wide load.
   2353   LoadInst *LI = dyn_cast<LoadInst>(Instr);
   2354   StoreInst *SI = dyn_cast<StoreInst>(Instr);
   2355 
   2356   assert((LI || SI) && "Invalid Load/Store instruction");
   2357 
   2358   // Try to vectorize the interleave group if this access is interleaved.
   2359   if (Legal->isAccessInterleaved(Instr))
   2360     return vectorizeInterleaveGroup(Instr);
   2361 
   2362   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
   2363   Type *DataTy = VectorType::get(ScalarDataTy, VF);
   2364   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   2365   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
   2366   // An alignment of 0 means target ABI alignment. We need to use the scalar's
   2367   // target ABI alignment in such a case.
   2368   const DataLayout &DL = Instr->getModule()->getDataLayout();
   2369   if (!Alignment)
   2370     Alignment = DL.getABITypeAlignment(ScalarDataTy);
   2371   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
   2372   unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
   2373   unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
   2374 
   2375   if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
   2376       !Legal->isMaskRequired(SI))
   2377     return scalarizeInstruction(Instr, true);
   2378 
   2379   if (ScalarAllocatedSize != VectorElementSize)
   2380     return scalarizeInstruction(Instr);
   2381 
   2382   // If the pointer is loop invariant or if it is non-consecutive,
   2383   // scalarize the load.
   2384   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
   2385   bool Reverse = ConsecutiveStride < 0;
   2386   bool UniformLoad = LI && Legal->isUniform(Ptr);
   2387   if (!ConsecutiveStride || UniformLoad)
   2388     return scalarizeInstruction(Instr);
   2389 
   2390   Constant *Zero = Builder.getInt32(0);
   2391   VectorParts &Entry = WidenMap.get(Instr);
   2392 
   2393   // Handle consecutive loads/stores.
   2394   GetElementPtrInst *Gep = getGEPInstruction(Ptr);
   2395   if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
   2396     setDebugLocFromInst(Builder, Gep);
   2397     Value *PtrOperand = Gep->getPointerOperand();
   2398     Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
   2399     FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
   2400 
   2401     // Create the new GEP with the new induction variable.
   2402     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
   2403     Gep2->setOperand(0, FirstBasePtr);
   2404     Gep2->setName("gep.indvar.base");
   2405     Ptr = Builder.Insert(Gep2);
   2406   } else if (Gep) {
   2407     setDebugLocFromInst(Builder, Gep);
   2408     assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
   2409                                         OrigLoop) &&
   2410            "Base ptr must be invariant");
   2411 
    2412     // The last index does not have to be the induction variable. It can be
    2413     // consecutive and a function of the index. For example, A[I+1].
   2414     unsigned NumOperands = Gep->getNumOperands();
   2415     unsigned InductionOperand = getGEPInductionOperand(Gep);
   2416     // Create the new GEP with the new induction variable.
   2417     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
   2418 
   2419     for (unsigned i = 0; i < NumOperands; ++i) {
   2420       Value *GepOperand = Gep->getOperand(i);
   2421       Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
   2422 
    2423       // Update the last index or a loop-invariant instruction anchored in the loop.
   2424       if (i == InductionOperand ||
   2425           (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
   2426         assert((i == InductionOperand ||
   2427                 PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst),
   2428                                              OrigLoop)) &&
   2429                "Must be last index or loop invariant");
   2430 
   2431         VectorParts &GEPParts = getVectorValue(GepOperand);
   2432         Value *Index = GEPParts[0];
   2433         Index = Builder.CreateExtractElement(Index, Zero);
   2434         Gep2->setOperand(i, Index);
   2435         Gep2->setName("gep.indvar.idx");
   2436       }
   2437     }
   2438     Ptr = Builder.Insert(Gep2);
   2439   } else {
   2440     // Use the induction element ptr.
   2441     assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
   2442     setDebugLocFromInst(Builder, Ptr);
   2443     VectorParts &PtrVal = getVectorValue(Ptr);
   2444     Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
   2445   }
   2446 
   2447   VectorParts Mask = createBlockInMask(Instr->getParent());
   2448   // Handle Stores:
   2449   if (SI) {
   2450     assert(!Legal->isUniform(SI->getPointerOperand()) &&
   2451            "We do not allow storing to uniform addresses");
   2452     setDebugLocFromInst(Builder, SI);
   2453     // We don't want to update the value in the map as it might be used in
   2454     // another expression. So don't use a reference type for "StoredVal".
   2455     VectorParts StoredVal = getVectorValue(SI->getValueOperand());
   2456 
   2457     for (unsigned Part = 0; Part < UF; ++Part) {
   2458       // Calculate the pointer for the specific unroll-part.
   2459       Value *PartPtr =
   2460           Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
   2461 
   2462       if (Reverse) {
   2463         // If we store to reverse consecutive memory locations, then we need
   2464         // to reverse the order of elements in the stored value.
   2465         StoredVal[Part] = reverseVector(StoredVal[Part]);
   2466         // If the address is consecutive but reversed, then the
   2467         // wide store needs to start at the last vector element.
   2468         PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
   2469         PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
   2470         Mask[Part] = reverseVector(Mask[Part]);
   2471       }
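               // Worked example (illustration only): with VF = 4 and Part = 1, the
               // forward pointer above would be Ptr + 4, while the two reverse GEPs
               // compute Ptr + (-1 * 4) + (1 - 4) = Ptr - 7, so the wide store covers
               // the four elements Ptr[-7..-4] in reversed element order.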
   2472 
   2473       Value *VecPtr = Builder.CreateBitCast(PartPtr,
   2474                                             DataTy->getPointerTo(AddressSpace));
   2475 
   2476       Instruction *NewSI;
   2477       if (Legal->isMaskRequired(SI))
   2478         NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
   2479                                           Mask[Part]);
   2480       else
   2481         NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
   2482       propagateMetadata(NewSI, SI);
   2483     }
   2484     return;
   2485   }
   2486 
   2487   // Handle loads.
   2488   assert(LI && "Must have a load instruction");
   2489   setDebugLocFromInst(Builder, LI);
   2490   for (unsigned Part = 0; Part < UF; ++Part) {
   2491     // Calculate the pointer for the specific unroll-part.
   2492     Value *PartPtr =
   2493         Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
   2494 
   2495     if (Reverse) {
   2496       // If the address is consecutive but reversed, then the
   2497       // wide load needs to start at the last vector element.
   2498       PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
   2499       PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
   2500       Mask[Part] = reverseVector(Mask[Part]);
   2501     }
   2502 
   2503     Instruction* NewLI;
   2504     Value *VecPtr = Builder.CreateBitCast(PartPtr,
   2505                                           DataTy->getPointerTo(AddressSpace));
   2506     if (Legal->isMaskRequired(LI))
   2507       NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
   2508                                        UndefValue::get(DataTy),
   2509                                        "wide.masked.load");
   2510     else
   2511       NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
   2512     propagateMetadata(NewLI, LI);
    2513     Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
   2514   }
   2515 }
   2516 
   2517 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
   2518                                                bool IfPredicateStore) {
   2519   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   2520   // Holds vector parameters or scalars, in case of uniform vals.
   2521   SmallVector<VectorParts, 4> Params;
   2522 
   2523   setDebugLocFromInst(Builder, Instr);
   2524 
   2525   // Find all of the vectorized parameters.
   2526   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
   2527     Value *SrcOp = Instr->getOperand(op);
   2528 
   2529     // If we are accessing the old induction variable, use the new one.
   2530     if (SrcOp == OldInduction) {
   2531       Params.push_back(getVectorValue(SrcOp));
   2532       continue;
   2533     }
   2534 
   2535     // Try using previously calculated values.
   2536     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
   2537 
   2538     // If the src is an instruction that appeared earlier in the basic block,
   2539     // then it should already be vectorized.
   2540     if (SrcInst && OrigLoop->contains(SrcInst)) {
   2541       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
   2542       // The parameter is a vector value from earlier.
   2543       Params.push_back(WidenMap.get(SrcInst));
   2544     } else {
   2545       // The parameter is a scalar from outside the loop. Maybe even a constant.
   2546       VectorParts Scalars;
   2547       Scalars.append(UF, SrcOp);
   2548       Params.push_back(Scalars);
   2549     }
   2550   }
   2551 
   2552   assert(Params.size() == Instr->getNumOperands() &&
   2553          "Invalid number of operands");
   2554 
    2555   // Does this instruction return a value?
   2556   bool IsVoidRetTy = Instr->getType()->isVoidTy();
   2557 
   2558   Value *UndefVec = IsVoidRetTy ? nullptr :
   2559     UndefValue::get(VectorType::get(Instr->getType(), VF));
   2560   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   2561   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
   2562 
   2563   VectorParts Cond;
   2564   if (IfPredicateStore) {
   2565     assert(Instr->getParent()->getSinglePredecessor() &&
   2566            "Only support single predecessor blocks");
   2567     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
   2568                           Instr->getParent());
   2569   }
   2570 
   2571   // For each vector unroll 'part':
   2572   for (unsigned Part = 0; Part < UF; ++Part) {
   2573     // For each scalar that we create:
   2574     for (unsigned Width = 0; Width < VF; ++Width) {
   2575 
   2576       // Start if-block.
   2577       Value *Cmp = nullptr;
   2578       if (IfPredicateStore) {
   2579         Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
   2580         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
   2581                                  ConstantInt::get(Cmp->getType(), 1));
   2582       }
   2583 
   2584       Instruction *Cloned = Instr->clone();
   2585       if (!IsVoidRetTy)
   2586         Cloned->setName(Instr->getName() + ".cloned");
   2587       // Replace the operands of the cloned instructions with extracted scalars.
   2588       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
   2589         Value *Op = Params[op][Part];
   2590         // Param is a vector. Need to extract the right lane.
   2591         if (Op->getType()->isVectorTy())
   2592           Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
   2593         Cloned->setOperand(op, Op);
   2594       }
   2595 
   2596       // Place the cloned scalar in the new loop.
   2597       Builder.Insert(Cloned);
   2598 
   2599       // If the original scalar returns a value we need to place it in a vector
   2600       // so that future users will be able to use it.
   2601       if (!IsVoidRetTy)
   2602         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
   2603                                                        Builder.getInt32(Width));
   2604       // End if-block.
   2605       if (IfPredicateStore)
   2606         PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
   2607                                                   Cmp));
   2608     }
   2609   }
   2610 }
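         // Illustrative sketch of the scalarization above (all names hypothetical):
         // with VF = 2 and UF = 1, a scalarized instruction such as
         //   %r = sdiv i32 %a, %b
         // becomes, for lane 0,
         //   %a0 = extractelement <2 x i32> %va, i32 0
         //   %b0 = extractelement <2 x i32> %vb, i32 0
         //   %r0 = sdiv i32 %a0, %b0
         //   %v0 = insertelement <2 x i32> undef, i32 %r0, i32 0
         // and similarly for lane 1, inserting into %v0 to complete the widened value.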
   2611 
   2612 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
   2613                                                       Value *End, Value *Step,
   2614                                                       Instruction *DL) {
   2615   BasicBlock *Header = L->getHeader();
   2616   BasicBlock *Latch = L->getLoopLatch();
   2617   // As we're just creating this loop, it's possible no latch exists
   2618   // yet. If so, use the header as this will be a single block loop.
   2619   if (!Latch)
   2620     Latch = Header;
   2621 
   2622   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
   2623   setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
   2624   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
   2625 
   2626   Builder.SetInsertPoint(Latch->getTerminator());
   2627 
   2628   // Create i+1 and fill the PHINode.
   2629   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
   2630   Induction->addIncoming(Start, L->getLoopPreheader());
   2631   Induction->addIncoming(Next, Latch);
   2632   // Create the compare.
   2633   Value *ICmp = Builder.CreateICmpEQ(Next, End);
   2634   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
   2635 
   2636   // Now we have two terminators. Remove the old one from the block.
   2637   Latch->getTerminator()->eraseFromParent();
   2638 
   2639   return Induction;
   2640 }
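         // For reference, the induction created above looks roughly like this,
         // assuming a 64-bit induction type, VF = 4 and UF = 2 (block and value
         // names follow the ones used elsewhere in this file):
         //   vector.body:
         //     %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
         //     ...
         //     %index.next = add i64 %index, 8
         //     %cmp = icmp eq i64 %index.next, %n.vec
         //     br i1 %cmp, label %middle.block, label %vector.body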
   2641 
   2642 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
   2643   if (TripCount)
   2644     return TripCount;
   2645 
   2646   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   2647   // Find the loop boundaries.
   2648   ScalarEvolution *SE = PSE.getSE();
   2649   const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop);
   2650   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
   2651          "Invalid loop count");
   2652 
   2653   Type *IdxTy = Legal->getWidestInductionType();
   2654 
    2655   // The exit count might have type i64 while the phi is i32. This can happen
    2656   // if we have an induction variable that is sign-extended before the compare.
    2657   // The only way we can get a backedge-taken count in that situation is if the
    2658   // induction variable was signed and therefore will not overflow, so the
    2659   // truncation is legal.
   2660   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
   2661       IdxTy->getPrimitiveSizeInBits())
   2662     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
   2663   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
   2664 
   2665   // Get the total trip count from the count by adding 1.
   2666   const SCEV *ExitCount = SE->getAddExpr(
   2667       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
   2668 
   2669   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
   2670 
   2671   // Expand the trip count and place the new instructions in the preheader.
   2672   // Notice that the pre-header does not change, only the loop body.
   2673   SCEVExpander Exp(*SE, DL, "induction");
   2674 
   2675   // Count holds the overall loop count (N).
   2676   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
   2677                                 L->getLoopPreheader()->getTerminator());
   2678 
   2679   if (TripCount->getType()->isPointerTy())
   2680     TripCount =
   2681       CastInst::CreatePointerCast(TripCount, IdxTy,
   2682                                   "exitcount.ptrcnt.to.int",
   2683                                   L->getLoopPreheader()->getTerminator());
   2684 
   2685   return TripCount;
   2686 }
   2687 
   2688 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
   2689   if (VectorTripCount)
   2690     return VectorTripCount;
   2691 
   2692   Value *TC = getOrCreateTripCount(L);
   2693   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   2694 
   2695   // Now we need to generate the expression for N - (N % VF), which is
   2696   // the part that the vectorized body will execute.
   2697   // The loop step is equal to the vectorization factor (num of SIMD elements)
   2698   // times the unroll factor (num of SIMD instructions).
   2699   Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
   2700   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
   2701   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
   2702 
   2703   return VectorTripCount;
   2704 }
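         // Worked example (illustration only): for a trip count of 103 with VF = 4
         // and UF = 2, Step is 8, n.mod.vf = 103 % 8 = 7 and n.vec = 103 - 7 = 96,
         // so the vector body covers 96 iterations and the scalar remainder loop
         // handles the final 7.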
   2705 
   2706 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
   2707                                                          BasicBlock *Bypass) {
   2708   Value *Count = getOrCreateTripCount(L);
   2709   BasicBlock *BB = L->getLoopPreheader();
   2710   IRBuilder<> Builder(BB->getTerminator());
   2711 
    2712   // Generate code to check that the loop's trip count, which we computed by
    2713   // adding one to the backedge-taken count, will not overflow.
   2714   Value *CheckMinIters =
   2715     Builder.CreateICmpULT(Count,
   2716                           ConstantInt::get(Count->getType(), VF * UF),
   2717                           "min.iters.check");
   2718 
   2719   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
   2720                                           "min.iters.checked");
   2721   if (L->getParentLoop())
   2722     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
   2723   ReplaceInstWithInst(BB->getTerminator(),
   2724                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
   2725   LoopBypassBlocks.push_back(BB);
   2726 }
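         // Note on the check above: Count is the backedge-taken count plus one, so it
         // wraps to 0 when the backedge-taken count is the maximum value of its type.
         // Since 0 is always less than VF * UF, the overflow case also takes the
         // bypass to the scalar loop, which is exactly what we need.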
   2727 
   2728 void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
   2729                                                      BasicBlock *Bypass) {
   2730   Value *TC = getOrCreateVectorTripCount(L);
   2731   BasicBlock *BB = L->getLoopPreheader();
   2732   IRBuilder<> Builder(BB->getTerminator());
   2733 
   2734   // Now, compare the new count to zero. If it is zero skip the vector loop and
   2735   // jump to the scalar loop.
   2736   Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
   2737                                     "cmp.zero");
   2738 
    2739   // Split the preheader and branch to the scalar loop, bypassing the vector
    2740   // loop, if the vector trip count is zero.
   2741   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
   2742                                           "vector.ph");
   2743   if (L->getParentLoop())
   2744     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
   2745   ReplaceInstWithInst(BB->getTerminator(),
   2746                       BranchInst::Create(Bypass, NewBB, Cmp));
   2747   LoopBypassBlocks.push_back(BB);
   2748 }
   2749 
   2750 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
   2751   BasicBlock *BB = L->getLoopPreheader();
   2752 
    2753   // Generate the code to check the SCEV assumptions that we made.
   2754   // We want the new basic block to start at the first instruction in a
   2755   // sequence of instructions that form a check.
   2756   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
   2757                    "scev.check");
   2758   Value *SCEVCheck =
   2759       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
   2760 
   2761   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
   2762     if (C->isZero())
   2763       return;
   2764 
    2765   // Create a new block containing the SCEV check.
   2766   BB->setName("vector.scevcheck");
   2767   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
   2768   if (L->getParentLoop())
   2769     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
   2770   ReplaceInstWithInst(BB->getTerminator(),
   2771                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
   2772   LoopBypassBlocks.push_back(BB);
   2773   AddedSafetyChecks = true;
   2774 }
   2775 
   2776 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
   2777                                                BasicBlock *Bypass) {
   2778   BasicBlock *BB = L->getLoopPreheader();
   2779 
    2780   // Generate the code that checks at runtime whether arrays overlap. We put
    2781   // the checks into a separate block to make the more common case of few
    2782   // elements faster.
   2783   Instruction *FirstCheckInst;
   2784   Instruction *MemRuntimeCheck;
   2785   std::tie(FirstCheckInst, MemRuntimeCheck) =
   2786       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
   2787   if (!MemRuntimeCheck)
   2788     return;
   2789 
   2790   // Create a new block containing the memory check.
   2791   BB->setName("vector.memcheck");
   2792   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
   2793   if (L->getParentLoop())
   2794     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
   2795   ReplaceInstWithInst(BB->getTerminator(),
   2796                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
   2797   LoopBypassBlocks.push_back(BB);
   2798   AddedSafetyChecks = true;
   2799 }
   2800 
   2801 
   2802 void InnerLoopVectorizer::createEmptyLoop() {
   2803   /*
   2804    In this function we generate a new loop. The new loop will contain
   2805    the vectorized instructions while the old loop will continue to run the
   2806    scalar remainder.
   2807 
   2808        [ ] <-- loop iteration number check.
   2809     /   |
   2810    /    v
   2811   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
   2812   |  /  |
   2813   | /   v
   2814   ||   [ ]     <-- vector pre header.
   2815   |/    |
   2816   |     v
   2817   |    [  ] \
   2818   |    [  ]_|   <-- vector loop.
   2819   |     |
   2820   |     v
   2821   |   -[ ]   <--- middle-block.
   2822   |  /  |
   2823   | /   v
   2824   -|- >[ ]     <--- new preheader.
   2825    |    |
   2826    |    v
   2827    |   [ ] \
   2828    |   [ ]_|   <-- old scalar loop to handle remainder.
   2829     \   |
   2830      \  v
   2831       >[ ]     <-- exit block.
   2832    ...
   2833    */
   2834 
   2835   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
   2836   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
   2837   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   2838   assert(VectorPH && "Invalid loop structure");
   2839   assert(ExitBlock && "Must have an exit block");
   2840 
   2841   // Some loops have a single integer induction variable, while other loops
    2842   // don't. One example is C++ iterators, which often have multiple pointer
    2843   // induction variables. In the code below we also support the case where we
    2844   // don't have a single induction variable.
   2845   //
    2846   // We try as hard as possible to obtain an induction variable from the
    2847   // original loop. However, if we don't find one that:
   2848   //   - is an integer
   2849   //   - counts from zero, stepping by one
   2850   //   - is the size of the widest induction variable type
   2851   // then we create a new one.
   2852   OldInduction = Legal->getInduction();
   2853   Type *IdxTy = Legal->getWidestInductionType();
   2854 
   2855   // Split the single block loop into the two loop structure described above.
   2856   BasicBlock *VecBody =
   2857       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
    2858   BasicBlock *MiddleBlock =
    2859       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
    2860   BasicBlock *ScalarPH =
    2861       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
   2862 
   2863   // Create and register the new vector loop.
   2864   Loop* Lp = new Loop();
   2865   Loop *ParentLoop = OrigLoop->getParentLoop();
   2866 
   2867   // Insert the new loop into the loop nest and register the new basic blocks
   2868   // before calling any utilities such as SCEV that require valid LoopInfo.
   2869   if (ParentLoop) {
   2870     ParentLoop->addChildLoop(Lp);
   2871     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
   2872     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
   2873   } else {
   2874     LI->addTopLevelLoop(Lp);
   2875   }
   2876   Lp->addBasicBlockToLoop(VecBody, *LI);
   2877 
   2878   // Find the loop boundaries.
   2879   Value *Count = getOrCreateTripCount(Lp);
   2880 
   2881   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   2882 
    2883   // We need to test whether the backedge-taken count is the maximum value of
    2884   // its unsigned type, since adding one to it would overflow and produce an
    2885   // incorrect trip count in the vector body. In case of overflow we want to
    2886   // jump directly to the scalar remainder loop.
   2887   emitMinimumIterationCountCheck(Lp, ScalarPH);
   2888   // Now, compare the new count to zero. If it is zero skip the vector loop and
   2889   // jump to the scalar loop.
   2890   emitVectorLoopEnteredCheck(Lp, ScalarPH);
   2891   // Generate the code to check any assumptions that we've made for SCEV
   2892   // expressions.
   2893   emitSCEVChecks(Lp, ScalarPH);
   2894 
    2895   // Generate the code that checks at runtime whether arrays overlap. We put
    2896   // the checks into a separate block to make the more common case of few
    2897   // elements faster.
   2898   emitMemRuntimeChecks(Lp, ScalarPH);
   2899 
   2900   // Generate the induction variable.
   2901   // The loop step is equal to the vectorization factor (num of SIMD elements)
   2902   // times the unroll factor (num of SIMD instructions).
   2903   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   2904   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
   2905   Induction =
   2906     createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
   2907                             getDebugLocFromInstOrOperands(OldInduction));
   2908 
   2909   // We are going to resume the execution of the scalar loop.
   2910   // Go over all of the induction variables that we found and fix the
   2911   // PHIs that are left in the scalar version of the loop.
   2912   // The starting values of PHI nodes depend on the counter of the last
   2913   // iteration in the vectorized loop.
   2914   // If we come from a bypass edge then we need to start from the original
   2915   // start value.
   2916 
    2917   // The resume values created below save the new starting index for the
    2918   // scalar loop. They are used to test whether any tail iterations are left
    2919   // once the vector loop has completed.
   2920   LoopVectorizationLegality::InductionList::iterator I, E;
   2921   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   2922   for (I = List->begin(), E = List->end(); I != E; ++I) {
   2923     PHINode *OrigPhi = I->first;
   2924     InductionDescriptor II = I->second;
   2925 
    2926     // Create phi nodes to merge from the backedge-taken check block.
   2927     PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
   2928                                            "bc.resume.val",
   2929                                            ScalarPH->getTerminator());
   2930     Value *EndValue;
   2931     if (OrigPhi == OldInduction) {
   2932       // We know what the end value is.
   2933       EndValue = CountRoundDown;
   2934     } else {
   2935       IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
   2936       Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
   2937                                        II.getStepValue()->getType(),
   2938                                        "cast.crd");
   2939       EndValue = II.transform(B, CRD);
   2940       EndValue->setName("ind.end");
   2941     }
   2942 
   2943     // The new PHI merges the original incoming value, in case of a bypass,
   2944     // or the value at the end of the vectorized loop.
   2945     BCResumeVal->addIncoming(EndValue, MiddleBlock);
   2946 
   2947     // Fix the scalar body counter (PHI node).
   2948     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
   2949 
   2950     // The old induction's phi node in the scalar body needs the truncated
   2951     // value.
   2952     for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
   2953       BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
   2954     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
   2955   }
   2956 
   2957   // Add a check in the middle block to see if we have completed
   2958   // all of the iterations in the first vector loop.
   2959   // If (N - N%VF) == N, then we *don't* need to run the remainder.
   2960   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
   2961                                 CountRoundDown, "cmp.n",
   2962                                 MiddleBlock->getTerminator());
   2963   ReplaceInstWithInst(MiddleBlock->getTerminator(),
   2964                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
   2965 
   2966   // Get ready to start creating new instructions into the vectorized body.
   2967   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
   2968 
   2969   // Save the state.
   2970   LoopVectorPreHeader = Lp->getLoopPreheader();
   2971   LoopScalarPreHeader = ScalarPH;
   2972   LoopMiddleBlock = MiddleBlock;
   2973   LoopExitBlock = ExitBlock;
   2974   LoopVectorBody.push_back(VecBody);
   2975   LoopScalarBody = OldBasicBlock;
   2976 
   2977   LoopVectorizeHints Hints(Lp, true);
   2978   Hints.setAlreadyVectorized();
   2979 }
   2980 
   2981 namespace {
   2982 struct CSEDenseMapInfo {
   2983   static bool canHandle(Instruction *I) {
   2984     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
   2985            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
   2986   }
   2987   static inline Instruction *getEmptyKey() {
   2988     return DenseMapInfo<Instruction *>::getEmptyKey();
   2989   }
   2990   static inline Instruction *getTombstoneKey() {
   2991     return DenseMapInfo<Instruction *>::getTombstoneKey();
   2992   }
   2993   static unsigned getHashValue(Instruction *I) {
   2994     assert(canHandle(I) && "Unknown instruction!");
   2995     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
   2996                                                            I->value_op_end()));
   2997   }
   2998   static bool isEqual(Instruction *LHS, Instruction *RHS) {
   2999     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
   3000         LHS == getTombstoneKey() || RHS == getTombstoneKey())
   3001       return LHS == RHS;
   3002     return LHS->isIdenticalTo(RHS);
   3003   }
   3004 };
   3005 }
   3006 
   3007 /// \brief Check whether this block is a predicated block.
    3008 /// Due to if-predication of stores we might create a sequence of
    3009 /// "if (pred) a[i] = ...;" blocks. We start with one vectorized basic block.
    3010 /// For every conditional block we split this vectorized block. Therefore, every
    3011 /// second block will be a predicated one.
   3012 static bool isPredicatedBlock(unsigned BlockNum) {
   3013   return BlockNum % 2;
   3014 }
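         // For example, with two predicated stores the vectorized blocks end up as
         // [vector.body, pred.store.if, pred.store.continue, pred.store.if,
         // pred.store.continue], numbered 0 through 4; the odd-numbered blocks
         // (1 and 3) are the predicated ones.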
   3015 
    3016 /// \brief Perform CSE of induction variable instructions.
    3017 static void cse(SmallVector<BasicBlock *, 4> &BBs) {
    3018   // Perform simple CSE.
   3019   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
   3020   for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
   3021     BasicBlock *BB = BBs[i];
   3022     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
   3023       Instruction *In = &*I++;
   3024 
   3025       if (!CSEDenseMapInfo::canHandle(In))
   3026         continue;
   3027 
   3028       // Check if we can replace this instruction with any of the
   3029       // visited instructions.
   3030       if (Instruction *V = CSEMap.lookup(In)) {
   3031         In->replaceAllUsesWith(V);
   3032         In->eraseFromParent();
   3033         continue;
   3034       }
   3035       // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
   3036       // ...;" blocks for predicated stores. Every second block is a predicated
   3037       // block.
   3038       if (isPredicatedBlock(i))
   3039         continue;
   3040 
   3041       CSEMap[In] = In;
   3042     }
   3043   }
   3044 }
   3045 
   3046 /// \brief Adds a 'fast' flag to floating point operations.
   3047 static Value *addFastMathFlag(Value *V) {
   3048   if (isa<FPMathOperator>(V)){
   3049     FastMathFlags Flags;
   3050     Flags.setUnsafeAlgebra();
   3051     cast<Instruction>(V)->setFastMathFlags(Flags);
   3052   }
   3053   return V;
   3054 }
   3055 
   3056 /// Estimate the overhead of scalarizing a value. Insert and Extract are set if
   3057 /// the result needs to be inserted and/or extracted from vectors.
   3058 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
   3059                                          const TargetTransformInfo &TTI) {
   3060   if (Ty->isVoidTy())
   3061     return 0;
   3062 
   3063   assert(Ty->isVectorTy() && "Can only scalarize vectors");
   3064   unsigned Cost = 0;
   3065 
   3066   for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
   3067     if (Insert)
   3068       Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
   3069     if (Extract)
   3070       Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
   3071   }
   3072 
   3073   return Cost;
   3074 }
   3075 
   3076 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
    3077 // Return the cost of the instruction, including scalarization overhead if it's
    3078 // needed. The flag NeedToScalarize shows whether the call needs to be scalarized,
    3079 // i.e. either a vector version isn't available or it is too expensive.
   3080 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
   3081                                   const TargetTransformInfo &TTI,
   3082                                   const TargetLibraryInfo *TLI,
   3083                                   bool &NeedToScalarize) {
   3084   Function *F = CI->getCalledFunction();
   3085   StringRef FnName = CI->getCalledFunction()->getName();
   3086   Type *ScalarRetTy = CI->getType();
   3087   SmallVector<Type *, 4> Tys, ScalarTys;
   3088   for (auto &ArgOp : CI->arg_operands())
   3089     ScalarTys.push_back(ArgOp->getType());
   3090 
   3091   // Estimate cost of scalarized vector call. The source operands are assumed
   3092   // to be vectors, so we need to extract individual elements from there,
   3093   // execute VF scalar calls, and then gather the result into the vector return
   3094   // value.
   3095   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
   3096   if (VF == 1)
   3097     return ScalarCallCost;
   3098 
   3099   // Compute corresponding vector type for return value and arguments.
   3100   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
   3101   for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
   3102     Tys.push_back(ToVectorTy(ScalarTys[i], VF));
   3103 
   3104   // Compute costs of unpacking argument values for the scalar calls and
   3105   // packing the return values to a vector.
   3106   unsigned ScalarizationCost =
   3107       getScalarizationOverhead(RetTy, true, false, TTI);
   3108   for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
   3109     ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
   3110 
   3111   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
   3112 
   3113   // If we can't emit a vector call for this function, then the currently found
   3114   // cost is the cost we need to return.
   3115   NeedToScalarize = true;
   3116   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
   3117     return Cost;
   3118 
   3119   // If the corresponding vector cost is cheaper, return its cost.
   3120   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
   3121   if (VectorCallCost < Cost) {
   3122     NeedToScalarize = false;
   3123     return VectorCallCost;
   3124   }
   3125   return Cost;
   3126 }
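         // Worked example with made-up costs: for a one-argument call at VF = 4 where
         // the scalar call costs 10, each insert/extract costs 1, and a vector library
         // version costs 12, the scalarized estimate is 10 * 4 + (4 inserts for the
         // result + 4 extracts for the argument) = 48; the vector call is cheaper, so
         // NeedToScalarize is set to false and 12 is returned.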
   3127 
   3128 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
   3129 // factor VF.  Return the cost of the instruction, including scalarization
   3130 // overhead if it's needed.
   3131 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
   3132                                        const TargetTransformInfo &TTI,
   3133                                        const TargetLibraryInfo *TLI) {
   3134   Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
   3135   assert(ID && "Expected intrinsic call!");
   3136 
   3137   Type *RetTy = ToVectorTy(CI->getType(), VF);
   3138   SmallVector<Type *, 4> Tys;
   3139   for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
   3140     Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
   3141 
   3142   return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
   3143 }
   3144 
   3145 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
   3146   IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
   3147   IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
   3148   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
   3149 }
   3150 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
   3151   IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
   3152   IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
   3153   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
   3154 }
   3155 
   3156 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
   3157   // For every instruction `I` in MinBWs, truncate the operands, create a
    3158   // truncated version of `I`, and re-extend its result. InstCombine runs
   3159   // later and will remove any ext/trunc pairs.
   3160   //
   3161   for (auto &KV : MinBWs) {
   3162     VectorParts &Parts = WidenMap.get(KV.first);
   3163     for (Value *&I : Parts) {
   3164       if (I->use_empty())
   3165         continue;
   3166       Type *OriginalTy = I->getType();
   3167       Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(),
   3168                                                  KV.second);
   3169       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
   3170                                           OriginalTy->getVectorNumElements());
   3171       if (TruncatedTy == OriginalTy)
   3172         continue;
   3173 
   3174       IRBuilder<> B(cast<Instruction>(I));
   3175       auto ShrinkOperand = [&](Value *V) -> Value* {
   3176         if (auto *ZI = dyn_cast<ZExtInst>(V))
   3177           if (ZI->getSrcTy() == TruncatedTy)
   3178             return ZI->getOperand(0);
   3179         return B.CreateZExtOrTrunc(V, TruncatedTy);
   3180       };
   3181 
   3182       // The actual instruction modification depends on the instruction type,
   3183       // unfortunately.
   3184       Value *NewI = nullptr;
   3185       if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
   3186         NewI = B.CreateBinOp(BO->getOpcode(),
   3187                              ShrinkOperand(BO->getOperand(0)),
   3188                              ShrinkOperand(BO->getOperand(1)));
   3189         cast<BinaryOperator>(NewI)->copyIRFlags(I);
   3190       } else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) {
   3191         NewI = B.CreateICmp(CI->getPredicate(),
   3192                             ShrinkOperand(CI->getOperand(0)),
   3193                             ShrinkOperand(CI->getOperand(1)));
   3194       } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
   3195         NewI = B.CreateSelect(SI->getCondition(),
   3196                               ShrinkOperand(SI->getTrueValue()),
   3197                               ShrinkOperand(SI->getFalseValue()));
   3198       } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
   3199         switch (CI->getOpcode()) {
   3200         default: llvm_unreachable("Unhandled cast!");
   3201         case Instruction::Trunc:
   3202           NewI = ShrinkOperand(CI->getOperand(0));
   3203           break;
   3204         case Instruction::SExt:
   3205           NewI = B.CreateSExtOrTrunc(CI->getOperand(0),
   3206                                      smallestIntegerVectorType(OriginalTy,
   3207                                                                TruncatedTy));
   3208           break;
   3209         case Instruction::ZExt:
   3210           NewI = B.CreateZExtOrTrunc(CI->getOperand(0),
   3211                                      smallestIntegerVectorType(OriginalTy,
   3212                                                                TruncatedTy));
   3213           break;
   3214         }
   3215       } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
   3216         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
   3217         auto *O0 =
   3218           B.CreateZExtOrTrunc(SI->getOperand(0),
   3219                               VectorType::get(ScalarTruncatedTy, Elements0));
   3220         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
   3221         auto *O1 =
   3222           B.CreateZExtOrTrunc(SI->getOperand(1),
   3223                               VectorType::get(ScalarTruncatedTy, Elements1));
   3224 
   3225         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
   3226       } else if (isa<LoadInst>(I)) {
   3227         // Don't do anything with the operands, just extend the result.
   3228         continue;
   3229       } else {
   3230         llvm_unreachable("Unhandled instruction type!");
   3231       }
   3232 
   3233       // Lastly, extend the result.
   3234       NewI->takeName(cast<Instruction>(I));
   3235       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
   3236       I->replaceAllUsesWith(Res);
   3237       cast<Instruction>(I)->eraseFromParent();
   3238       I = Res;
   3239     }
   3240   }
   3241 
   3242   // We'll have created a bunch of ZExts that are now parentless. Clean up.
   3243   for (auto &KV : MinBWs) {
   3244     VectorParts &Parts = WidenMap.get(KV.first);
   3245     for (Value *&I : Parts) {
   3246       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
   3247       if (Inst && Inst->use_empty()) {
   3248         Value *NewI = Inst->getOperand(0);
   3249         Inst->eraseFromParent();
   3250         I = NewI;
   3251       }
   3252     }
   3253   }
   3254 }
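         // Illustrative sketch (assuming MinBWs records that only 8 bits are needed;
         // names are hypothetical): a widened operation such as
         //   %add = add <4 x i32> %a, %b
         // is rewritten above into
         //   %a.tr   = trunc <4 x i32> %a to <4 x i8>
         //   %b.tr   = trunc <4 x i32> %b to <4 x i8>
         //   %add.tr = add <4 x i8> %a.tr, %b.tr
         //   %add.ze = zext <4 x i8> %add.tr to <4 x i32>
         // with %add.ze replacing all uses of %add; InstCombine later removes any
         // redundant trunc/zext pairs.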
   3255 
   3256 void InnerLoopVectorizer::vectorizeLoop() {
   3257   //===------------------------------------------------===//
   3258   //
    3259   // Notice: any optimization or new instruction that goes
    3260   // into the code below should also be implemented in
    3261   // the cost-model.
   3262   //
   3263   //===------------------------------------------------===//
   3264   Constant *Zero = Builder.getInt32(0);
   3265 
   3266   // In order to support reduction variables we need to be able to vectorize
   3267   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
   3268   // stages. First, we create a new vector PHI node with no incoming edges.
   3269   // We use this value when we vectorize all of the instructions that use the
   3270   // PHI. Next, after all of the instructions in the block are complete we
   3271   // add the new incoming edges to the PHI. At this point all of the
   3272   // instructions in the basic block are vectorized, so we can use them to
   3273   // construct the PHI.
   3274   PhiVector RdxPHIsToFix;
   3275 
   3276   // Scan the loop in a topological order to ensure that defs are vectorized
   3277   // before users.
   3278   LoopBlocksDFS DFS(OrigLoop);
   3279   DFS.perform(LI);
   3280 
   3281   // Vectorize all of the blocks in the original loop.
   3282   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
   3283        be = DFS.endRPO(); bb != be; ++bb)
   3284     vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
   3285 
   3286   // Insert truncates and extends for any truncated instructions as hints to
   3287   // InstCombine.
   3288   if (VF > 1)
   3289     truncateToMinimalBitwidths();
   3290 
   3291   // At this point every instruction in the original loop is widened to
   3292   // a vector form. We are almost done. Now, we need to fix the PHI nodes
   3293   // that we vectorized. The PHI nodes are currently empty because we did
   3294   // not want to introduce cycles. Notice that the remaining PHI nodes
   3295   // that we need to fix are reduction variables.
   3296 
   3297   // Create the 'reduced' values for each of the induction vars.
   3298   // The reduced values are the vector values that we scalarize and combine
   3299   // after the loop is finished.
   3300   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
   3301        it != e; ++it) {
   3302     PHINode *RdxPhi = *it;
   3303     assert(RdxPhi && "Unable to recover vectorized PHI");
   3304 
   3305     // Find the reduction variable descriptor.
   3306     assert(Legal->isReductionVariable(RdxPhi) &&
   3307            "Unable to find the reduction variable");
   3308     RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
   3309 
   3310     RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
   3311     TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
   3312     Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
   3313     RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
   3314         RdxDesc.getMinMaxRecurrenceKind();
   3315     setDebugLocFromInst(Builder, ReductionStartValue);
   3316 
   3317     // We need to generate a reduction vector from the incoming scalar.
   3318     // To do so, we need to generate the 'identity' vector and override
   3319     // one of the elements with the incoming scalar reduction. We need
   3320     // to do it in the vector-loop preheader.
   3321     Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
   3322 
   3323     // This is the vector-clone of the value that leaves the loop.
   3324     VectorParts &VectorExit = getVectorValue(LoopExitInst);
   3325     Type *VecTy = VectorExit[0]->getType();
   3326 
    3327     // Find the reduction identity value: zero for addition, or and xor;
    3328     // one for multiplication; -1 for and.
   3329     Value *Identity;
   3330     Value *VectorStart;
   3331     if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
   3332         RK == RecurrenceDescriptor::RK_FloatMinMax) {
    3333       // MinMax reductions have the start value as their identity.
   3334       if (VF == 1) {
   3335         VectorStart = Identity = ReductionStartValue;
   3336       } else {
   3337         VectorStart = Identity =
   3338             Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
   3339       }
   3340     } else {
   3341       // Handle other reduction kinds:
   3342       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
   3343           RK, VecTy->getScalarType());
   3344       if (VF == 1) {
   3345         Identity = Iden;
    3346         // With VF == 1 there is no identity vector to build; start directly
    3347         // from the incoming scalar reduction value.
   3348         VectorStart = ReductionStartValue;
   3349       } else {
   3350         Identity = ConstantVector::getSplat(VF, Iden);
   3351 
   3352         // This vector is the Identity vector where the first element is the
   3353         // incoming scalar reduction.
   3354         VectorStart =
   3355             Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
   3356       }
   3357     }
   3358 
   3359     // Fix the vector-loop phi.
   3360 
    3361     // Reductions do not have to start at zero. They can start with
    3362     // any loop-invariant value.
   3363     VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
   3364     BasicBlock *Latch = OrigLoop->getLoopLatch();
   3365     Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
   3366     VectorParts &Val = getVectorValue(LoopVal);
   3367     for (unsigned part = 0; part < UF; ++part) {
    3368       // Make sure to add the reduction start value only to the
    3369       // first unroll part.
   3370       Value *StartVal = (part == 0) ? VectorStart : Identity;
   3371       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
   3372                                                   LoopVectorPreHeader);
   3373       cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
   3374                                                   LoopVectorBody.back());
   3375     }
   3376 
   3377     // Before each round, move the insertion point right between
   3378     // the PHIs and the values we are going to write.
   3379     // This allows us to write both PHINodes and the extractelement
   3380     // instructions.
   3381     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
   3382 
   3383     VectorParts RdxParts = getVectorValue(LoopExitInst);
   3384     setDebugLocFromInst(Builder, LoopExitInst);
   3385 
   3386     // If the vector reduction can be performed in a smaller type, we truncate
   3387     // then extend the loop exit value to enable InstCombine to evaluate the
   3388     // entire expression in the smaller type.
   3389     if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) {
   3390       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
   3391       Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator());
   3392       for (unsigned part = 0; part < UF; ++part) {
   3393         Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
   3394         Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
   3395                                           : Builder.CreateZExt(Trunc, VecTy);
   3396         for (Value::user_iterator UI = RdxParts[part]->user_begin();
   3397              UI != RdxParts[part]->user_end();)
   3398           if (*UI != Trunc) {
   3399             (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
   3400             RdxParts[part] = Extnd;
   3401           } else {
   3402             ++UI;
   3403           }
   3404       }
   3405       Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
   3406       for (unsigned part = 0; part < UF; ++part)
   3407         RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
   3408     }
   3409 
   3410     // Reduce all of the unrolled parts into a single vector.
   3411     Value *ReducedPartRdx = RdxParts[0];
   3412     unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
   3413     setDebugLocFromInst(Builder, ReducedPartRdx);
   3414     for (unsigned part = 1; part < UF; ++part) {
   3415       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
   3416         // Floating point operations had to be 'fast' to enable the reduction.
   3417         ReducedPartRdx = addFastMathFlag(
   3418             Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
   3419                                 ReducedPartRdx, "bin.rdx"));
   3420       else
   3421         ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
   3422             Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
   3423     }
   3424 
   3425     if (VF > 1) {
   3426       // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
   3427       // and vector ops, reducing the set of values being computed by half each
   3428       // round.
   3429       assert(isPowerOf2_32(VF) &&
   3430              "Reduction emission only supported for pow2 vectors!");
   3431       Value *TmpVec = ReducedPartRdx;
   3432       SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
   3433       for (unsigned i = VF; i != 1; i >>= 1) {
   3434         // Move the upper half of the vector to the lower half.
   3435         for (unsigned j = 0; j != i/2; ++j)
   3436           ShuffleMask[j] = Builder.getInt32(i/2 + j);
   3437 
   3438         // Fill the rest of the mask with undef.
   3439         std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
   3440                   UndefValue::get(Builder.getInt32Ty()));
   3441 
   3442         Value *Shuf =
   3443         Builder.CreateShuffleVector(TmpVec,
   3444                                     UndefValue::get(TmpVec->getType()),
   3445                                     ConstantVector::get(ShuffleMask),
   3446                                     "rdx.shuf");
   3447 
   3448         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
   3449           // Floating point operations had to be 'fast' to enable the reduction.
   3450           TmpVec = addFastMathFlag(Builder.CreateBinOp(
   3451               (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
   3452         else
   3453           TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
   3454                                                         TmpVec, Shuf);
   3455       }
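               // Worked example (illustration only): for VF = 4 the loop above runs
               // twice. The first shuffle mask is <2, 3, undef, undef>, which folds
               // the upper half of the vector into the lower half; the second mask is
               // <1, undef, undef, undef>, which folds element 1 into element 0,
               // leaving the final reduced value in lane 0 for the extract below.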
   3456 
   3457       // The result is in the first element of the vector.
   3458       ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
   3459                                                     Builder.getInt32(0));
   3460 
   3461       // If the reduction can be performed in a smaller type, we need to extend
   3462       // the reduction to the wider type before we branch to the original loop.
   3463       if (RdxPhi->getType() != RdxDesc.getRecurrenceType())
   3464         ReducedPartRdx =
   3465             RdxDesc.isSigned()
   3466                 ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType())
   3467                 : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType());
   3468     }
   3469 
   3470     // Create a phi node that merges control-flow from the backedge-taken check
   3471     // block and the middle block.
   3472     PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
   3473                                           LoopScalarPreHeader->getTerminator());
   3474     for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
   3475       BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
   3476     BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
   3477 
   3478     // Now, we need to fix the users of the reduction variable
   3479     // inside and outside of the scalar remainder loop.
   3480     // We know that the loop is in LCSSA form. We need to update the
   3481     // PHI nodes in the exit blocks.
   3482     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
   3483          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
   3484       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
   3485       if (!LCSSAPhi) break;
   3486 
   3487       // All PHINodes need to have a single entry edge, or two if
   3488       // we already fixed them.
   3489       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
   3490 
   3491       // We found our reduction value exit-PHI. Update it with the
   3492       // incoming bypass edge.
   3493       if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
   3494         // Add an edge coming from the bypass.
   3495         LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
   3496         break;
   3497       }
   3498     }// end of the LCSSA phi scan.
   3499 
   3500     // Fix the scalar loop reduction variable with the incoming reduction sum
   3501     // from the vector body and from the backedge value.
   3502     int IncomingEdgeBlockIdx =
   3503     (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
   3504     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
   3505     // Pick the other block.
   3506     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
   3507     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
   3508     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
   3509   }// end of for each redux variable.
   3510 
   3511   fixLCSSAPHIs();
   3512 
   3513   // Make sure DomTree is updated.
   3514   updateAnalysis();
   3515 
   3516   // Predicate any stores.
   3517   for (auto KV : PredicatedStores) {
   3518     BasicBlock::iterator I(KV.first);
   3519     auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
   3520     auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
   3521                                         /*BranchWeights=*/nullptr, DT);
   3522     I->moveBefore(T);
   3523     I->getParent()->setName("pred.store.if");
   3524     BB->setName("pred.store.continue");
   3525   }
   3526   DEBUG(DT->verifyDomTree());
   3527   // Remove redundant induction instructions.
   3528   cse(LoopVectorBody);
   3529 }
   3530 
   3531 void InnerLoopVectorizer::fixLCSSAPHIs() {
   3532   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
   3533        LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
   3534     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
   3535     if (!LCSSAPhi) break;
   3536     if (LCSSAPhi->getNumIncomingValues() == 1)
   3537       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
   3538                             LoopMiddleBlock);
   3539   }
   3540 }
   3541 
   3542 InnerLoopVectorizer::VectorParts
   3543 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   3544   assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
   3545          "Invalid edge");
   3546 
   3547   // Look for cached value.
   3548   std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
   3549   EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
   3550   if (ECEntryIt != MaskCache.end())
   3551     return ECEntryIt->second;
   3552 
   3553   VectorParts SrcMask = createBlockInMask(Src);
   3554 
   3555   // The terminator has to be a branch inst!
   3556   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
   3557   assert(BI && "Unexpected terminator found");
   3558 
   3559   if (BI->isConditional()) {
   3560     VectorParts EdgeMask = getVectorValue(BI->getCondition());
   3561 
   3562     if (BI->getSuccessor(0) != Dst)
   3563       for (unsigned part = 0; part < UF; ++part)
   3564         EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
   3565 
   3566     for (unsigned part = 0; part < UF; ++part)
   3567       EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
   3568 
   3569     MaskCache[Edge] = EdgeMask;
   3570     return EdgeMask;
   3571   }
   3572 
   3573   MaskCache[Edge] = SrcMask;
   3574   return SrcMask;
   3575 }
   3576 
   3577 InnerLoopVectorizer::VectorParts
   3578 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
   3579   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
   3580 
   3581   // Loop incoming mask is all-one.
   3582   if (OrigLoop->getHeader() == BB) {
   3583     Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
   3584     return getVectorValue(C);
   3585   }
   3586 
    3587   // This is the block mask. We OR the masks of all incoming edges, starting with zero.
   3588   Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
   3589   VectorParts BlockMask = getVectorValue(Zero);
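           // As a rough sketch of what is built below: for a block D reachable from
           // predecessors B and C under branch conditions %c.b and %c.c, the block
           // mask is effectively
           //   mask(D) = (mask(B) & %c.b) | (mask(C) & %c.c)
           // computed once per unroll part, with the edge masks coming from
           // createEdgeMask above.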
   3590 
   3591   // For each pred:
   3592   for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
   3593     VectorParts EM = createEdgeMask(*it, BB);
   3594     for (unsigned part = 0; part < UF; ++part)
   3595       BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
   3596   }
   3597 
   3598   return BlockMask;
   3599 }
   3600 
   3601 void InnerLoopVectorizer::widenPHIInstruction(
   3602     Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF,
   3603     unsigned VF, PhiVector *PV) {
   3604   PHINode* P = cast<PHINode>(PN);
   3605   // Handle reduction variables:
   3606   if (Legal->isReductionVariable(P)) {
   3607     for (unsigned part = 0; part < UF; ++part) {
   3608       // This is phase one of vectorizing PHIs.
   3609       Type *VecTy = (VF == 1) ? PN->getType() :
   3610       VectorType::get(PN->getType(), VF);
   3611       Entry[part] = PHINode::Create(
   3612           VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt());
   3613     }
   3614     PV->push_back(P);
   3615     return;
   3616   }
   3617 
   3618   setDebugLocFromInst(Builder, P);
   3619   // Check for PHI nodes that are lowered to vector selects.
   3620   if (P->getParent() != OrigLoop->getHeader()) {
   3621     // We know that all PHIs in non-header blocks are converted into
   3622     // selects, so we don't have to worry about the insertion order and we
   3623     // can just use the builder.
   3624     // At this point we generate the predication tree. There may be
   3625     // duplications since this is a simple recursive scan, but future
   3626     // optimizations will clean it up.
   3627 
   3628     unsigned NumIncoming = P->getNumIncomingValues();
   3629 
   3630     // Generate a sequence of selects of the form:
   3631     // SELECT(Mask3, In3,
   3632     //      SELECT(Mask2, In2,
   3633     //                   ( ...)))
   3634     for (unsigned In = 0; In < NumIncoming; In++) {
   3635       VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
   3636                                         P->getParent());
   3637       VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
   3638 
   3639       for (unsigned part = 0; part < UF; ++part) {
   3640         // We might have single edge PHIs (blocks) - use an identity
   3641         // 'select' for the first PHI operand.
   3642         if (In == 0)
   3643           Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
   3644                                              In0[part]);
   3645         else
   3646           // Select between the current value and the previous incoming edge
   3647           // based on the incoming mask.
   3648           Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
   3649                                              Entry[part], "predphi");
   3650       }
   3651     }
   3652     return;
   3653   }
   3654 
   3655   // This PHINode must be an induction variable.
   3656   // Make sure that we know about it.
   3657   assert(Legal->getInductionVars()->count(P) &&
   3658          "Not an induction variable");
   3659 
   3660   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
   3661 
   3662   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
   3663   // which can be found from the original scalar operations.
   3664   switch (II.getKind()) {
   3665     case InductionDescriptor::IK_NoInduction:
   3666       llvm_unreachable("Unknown induction");
   3667     case InductionDescriptor::IK_IntInduction: {
   3668       assert(P->getType() == II.getStartValue()->getType() &&
   3669              "Types must match");
   3670       // Handle other induction variables that are now based on the
   3671       // canonical one.
   3672       Value *V = Induction;
   3673       if (P != OldInduction) {
   3674         V = Builder.CreateSExtOrTrunc(Induction, P->getType());
   3675         V = II.transform(Builder, V);
   3676         V->setName("offset.idx");
   3677       }
   3678       Value *Broadcasted = getBroadcastInstrs(V);
   3679       // After broadcasting the induction variable we need to make the vector
   3680       // consecutive by adding 0, 1, 2, etc.
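               // For example (a sketch, unit step, VF = 4, UF = 2): part 0 becomes
               // <i, i+1, i+2, i+3> and part 1 becomes <i+4, i+5, i+6, i+7>; in
               // general each lane gets Step * (VF * part + lane) added to the
               // broadcast value.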
   3681       for (unsigned part = 0; part < UF; ++part)
   3682         Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue());
   3683       return;
   3684     }
   3685     case InductionDescriptor::IK_PtrInduction:
   3686       // Handle the pointer induction variable case.
   3687       assert(P->getType()->isPointerTy() && "Unexpected type.");
   3688       // This is the normalized GEP that starts counting at zero.
   3689       Value *PtrInd = Induction;
   3690       PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType());
   3691       // This is the vector of results. Notice that we don't generate
   3692       // vector geps because scalar geps result in better code.
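               // Roughly, for VF = 4 and part 0 this emits four scalar GEPs at
               // offsets 0..3 from the normalized pointer induction and inserts them
               // into a 4-wide vector of pointers; part 1 uses offsets 4..7, etc.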
   3693       for (unsigned part = 0; part < UF; ++part) {
   3694         if (VF == 1) {
   3695           int EltIndex = part;
   3696           Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
   3697           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
   3698           Value *SclrGep = II.transform(Builder, GlobalIdx);
   3699           SclrGep->setName("next.gep");
   3700           Entry[part] = SclrGep;
   3701           continue;
   3702         }
   3703 
   3704         Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
   3705         for (unsigned int i = 0; i < VF; ++i) {
   3706           int EltIndex = i + part * VF;
   3707           Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
   3708           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
   3709           Value *SclrGep = II.transform(Builder, GlobalIdx);
   3710           SclrGep->setName("next.gep");
   3711           VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
   3712                                                Builder.getInt32(i),
   3713                                                "insert.gep");
   3714         }
   3715         Entry[part] = VecVal;
   3716       }
   3717       return;
   3718   }
   3719 }
   3720 
   3721 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
   3722   // For each instruction in the old loop.
   3723   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
   3724     VectorParts &Entry = WidenMap.get(&*it);
   3725 
   3726     switch (it->getOpcode()) {
   3727     case Instruction::Br:
   3728       // Nothing to do for PHIs and BR, since we already took care of the
   3729       // loop control flow instructions.
   3730       continue;
   3731     case Instruction::PHI: {
   3732       // Vectorize PHINodes.
   3733       widenPHIInstruction(&*it, Entry, UF, VF, PV);
   3734       continue;
   3735     }// End of PHI.
   3736 
   3737     case Instruction::Add:
   3738     case Instruction::FAdd:
   3739     case Instruction::Sub:
   3740     case Instruction::FSub:
   3741     case Instruction::Mul:
   3742     case Instruction::FMul:
   3743     case Instruction::UDiv:
   3744     case Instruction::SDiv:
   3745     case Instruction::FDiv:
   3746     case Instruction::URem:
   3747     case Instruction::SRem:
   3748     case Instruction::FRem:
   3749     case Instruction::Shl:
   3750     case Instruction::LShr:
   3751     case Instruction::AShr:
   3752     case Instruction::And:
   3753     case Instruction::Or:
   3754     case Instruction::Xor: {
   3755       // Just widen binops.
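               // For example (a sketch), a scalar 'add i32 %x, %y' becomes one
               // 'add <VF x i32> %x.vec, %y.vec' per unroll part, with the scalar
               // IR flags (nsw/nuw, fast-math) copied onto the widened op below.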
   3756       BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
   3757       setDebugLocFromInst(Builder, BinOp);
   3758       VectorParts &A = getVectorValue(it->getOperand(0));
   3759       VectorParts &B = getVectorValue(it->getOperand(1));
   3760 
   3761       // Use this vector value for all users of the original instruction.
   3762       for (unsigned Part = 0; Part < UF; ++Part) {
   3763         Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
   3764 
   3765         if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
   3766           VecOp->copyIRFlags(BinOp);
   3767 
   3768         Entry[Part] = V;
   3769       }
   3770 
   3771       propagateMetadata(Entry, &*it);
   3772       break;
   3773     }
   3774     case Instruction::Select: {
   3775       // Widen selects.
   3776       // If the selector is loop invariant we can create a select
   3777       // instruction with a scalar condition. Otherwise, use vector-select.
   3778       auto *SE = PSE.getSE();
   3779       bool InvariantCond =
   3780           SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop);
   3781       setDebugLocFromInst(Builder, &*it);
   3782 
    3783       // The condition can be loop invariant but still defined inside the
    3784       // loop. This means that we can't just use the original 'cond' value.
    3785       // We have to take the 'vectorized' value and pick the first lane.
    3786       // InstCombine will make this a no-op.
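               // Sketch: for an invariant condition with VF > 1 we use
               // 'extractelement %cond.vec, i32 0' as the scalar selector, producing
               // 'select i1 %c, <VF x T> %op0, <VF x T> %op1' rather than a
               // per-lane vector select.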
   3787       VectorParts &Cond = getVectorValue(it->getOperand(0));
   3788       VectorParts &Op0  = getVectorValue(it->getOperand(1));
   3789       VectorParts &Op1  = getVectorValue(it->getOperand(2));
   3790 
   3791       Value *ScalarCond = (VF == 1) ? Cond[0] :
   3792         Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
   3793 
   3794       for (unsigned Part = 0; Part < UF; ++Part) {
   3795         Entry[Part] = Builder.CreateSelect(
   3796           InvariantCond ? ScalarCond : Cond[Part],
   3797           Op0[Part],
   3798           Op1[Part]);
   3799       }
   3800 
   3801       propagateMetadata(Entry, &*it);
   3802       break;
   3803     }
   3804 
   3805     case Instruction::ICmp:
   3806     case Instruction::FCmp: {
   3807       // Widen compares. Generate vector compares.
   3808       bool FCmp = (it->getOpcode() == Instruction::FCmp);
   3809       CmpInst *Cmp = dyn_cast<CmpInst>(it);
   3810       setDebugLocFromInst(Builder, &*it);
   3811       VectorParts &A = getVectorValue(it->getOperand(0));
   3812       VectorParts &B = getVectorValue(it->getOperand(1));
   3813       for (unsigned Part = 0; Part < UF; ++Part) {
   3814         Value *C = nullptr;
   3815         if (FCmp) {
   3816           C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
   3817           cast<FCmpInst>(C)->copyFastMathFlags(&*it);
   3818         } else {
   3819           C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
   3820         }
   3821         Entry[Part] = C;
   3822       }
   3823 
   3824       propagateMetadata(Entry, &*it);
   3825       break;
   3826     }
   3827 
   3828     case Instruction::Store:
   3829     case Instruction::Load:
   3830       vectorizeMemoryInstruction(&*it);
    3831       break;
   3832     case Instruction::ZExt:
   3833     case Instruction::SExt:
   3834     case Instruction::FPToUI:
   3835     case Instruction::FPToSI:
   3836     case Instruction::FPExt:
   3837     case Instruction::PtrToInt:
   3838     case Instruction::IntToPtr:
   3839     case Instruction::SIToFP:
   3840     case Instruction::UIToFP:
   3841     case Instruction::Trunc:
   3842     case Instruction::FPTrunc:
   3843     case Instruction::BitCast: {
   3844       CastInst *CI = dyn_cast<CastInst>(it);
   3845       setDebugLocFromInst(Builder, &*it);
   3846       /// Optimize the special case where the source is the induction
   3847       /// variable. Notice that we can only optimize the 'trunc' case
   3848       /// because: a. FP conversions lose precision, b. sext/zext may wrap,
   3849       /// c. other casts depend on pointer size.
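               /// Sketch of the optimization: for 'trunc i64 %iv to i32' we truncate
               /// the scalar induction once, broadcast it, and add the step vector in
               /// the narrow type instead of truncating a wide vector lane by lane.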
   3850       if (CI->getOperand(0) == OldInduction &&
   3851           it->getOpcode() == Instruction::Trunc) {
   3852         Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
   3853                                                CI->getType());
   3854         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
   3855         InductionDescriptor II =
   3856             Legal->getInductionVars()->lookup(OldInduction);
   3857         Constant *Step = ConstantInt::getSigned(
   3858             CI->getType(), II.getStepValue()->getSExtValue());
   3859         for (unsigned Part = 0; Part < UF; ++Part)
   3860           Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
   3861         propagateMetadata(Entry, &*it);
   3862         break;
   3863       }
   3864       /// Vectorize casts.
   3865       Type *DestTy = (VF == 1) ? CI->getType() :
   3866                                  VectorType::get(CI->getType(), VF);
   3867 
   3868       VectorParts &A = getVectorValue(it->getOperand(0));
   3869       for (unsigned Part = 0; Part < UF; ++Part)
   3870         Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
   3871       propagateMetadata(Entry, &*it);
   3872       break;
   3873     }
   3874 
   3875     case Instruction::Call: {
   3876       // Ignore dbg intrinsics.
   3877       if (isa<DbgInfoIntrinsic>(it))
   3878         break;
   3879       setDebugLocFromInst(Builder, &*it);
   3880 
   3881       Module *M = BB->getParent()->getParent();
   3882       CallInst *CI = cast<CallInst>(it);
   3883 
   3884       StringRef FnName = CI->getCalledFunction()->getName();
   3885       Function *F = CI->getCalledFunction();
   3886       Type *RetTy = ToVectorTy(CI->getType(), VF);
   3887       SmallVector<Type *, 4> Tys;
   3888       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
   3889         Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
   3890 
   3891       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
   3892       if (ID &&
   3893           (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
   3894            ID == Intrinsic::lifetime_start)) {
   3895         scalarizeInstruction(&*it);
   3896         break;
   3897       }
    3898       // Decide whether to use an intrinsic or a plain call for the vectorized
    3899       // version of the instruction, i.e. whether an intrinsic call is more
    3900       // beneficial than a library call.
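               // For example (a sketch): a call to @llvm.sqrt.f32 may be widened to
               // @llvm.sqrt.v4f32 when the intrinsic cost wins, or replaced by a
               // vector library routine reported by TargetLibraryInfo when one is
               // available; if neither is profitable the call is scalarized.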
   3901       bool NeedToScalarize;
   3902       unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
   3903       bool UseVectorIntrinsic =
   3904           ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
   3905       if (!UseVectorIntrinsic && NeedToScalarize) {
   3906         scalarizeInstruction(&*it);
   3907         break;
   3908       }
   3909 
   3910       for (unsigned Part = 0; Part < UF; ++Part) {
   3911         SmallVector<Value *, 4> Args;
   3912         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
   3913           Value *Arg = CI->getArgOperand(i);
   3914           // Some intrinsics have a scalar argument - don't replace it with a
   3915           // vector.
   3916           if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
   3917             VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
   3918             Arg = VectorArg[Part];
   3919           }
   3920           Args.push_back(Arg);
   3921         }
   3922 
   3923         Function *VectorF;
   3924         if (UseVectorIntrinsic) {
   3925           // Use vector version of the intrinsic.
   3926           Type *TysForDecl[] = {CI->getType()};
   3927           if (VF > 1)
   3928             TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
   3929           VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
   3930         } else {
   3931           // Use vector version of the library call.
   3932           StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
   3933           assert(!VFnName.empty() && "Vector function name is empty.");
   3934           VectorF = M->getFunction(VFnName);
   3935           if (!VectorF) {
   3936             // Generate a declaration
   3937             FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
   3938             VectorF =
   3939                 Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
   3940             VectorF->copyAttributesFrom(F);
   3941           }
   3942         }
   3943         assert(VectorF && "Can't create vector function.");
   3944         Entry[Part] = Builder.CreateCall(VectorF, Args);
   3945       }
   3946 
   3947       propagateMetadata(Entry, &*it);
   3948       break;
   3949     }
   3950 
   3951     default:
   3952       // All other instructions are unsupported. Scalarize them.
   3953       scalarizeInstruction(&*it);
   3954       break;
   3955     }// end of switch.
   3956   }// end of for_each instr.
   3957 }
   3958 
   3959 void InnerLoopVectorizer::updateAnalysis() {
   3960   // Forget the original basic block.
   3961   PSE.getSE()->forgetLoop(OrigLoop);
   3962 
   3963   // Update the dominator tree information.
   3964   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
   3965          "Entry does not dominate exit.");
   3966 
   3967   for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
   3968     DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
   3969   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
   3970 
   3971   // We don't predicate stores by this point, so the vector body should be a
   3972   // single loop.
   3973   assert(LoopVectorBody.size() == 1 && "Expected single block loop!");
   3974   DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
   3975 
   3976   DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
   3977   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   3978   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
   3979   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
   3980 
   3981   DEBUG(DT->verifyDomTree());
   3982 }
   3983 
   3984 /// \brief Check whether it is safe to if-convert this phi node.
   3985 ///
   3986 /// Phi nodes with constant expressions that can trap are not safe to if
   3987 /// convert.
   3988 static bool canIfConvertPHINodes(BasicBlock *BB) {
   3989   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
   3990     PHINode *Phi = dyn_cast<PHINode>(I);
   3991     if (!Phi)
   3992       return true;
   3993     for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
   3994       if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
   3995         if (C->canTrap())
   3996           return false;
   3997   }
   3998   return true;
   3999 }
   4000 
   4001 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   4002   if (!EnableIfConversion) {
   4003     emitAnalysis(VectorizationReport() << "if-conversion is disabled");
   4004     return false;
   4005   }
   4006 
   4007   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
   4008 
   4009   // A list of pointers that we can safely read and write to.
    4010   SmallPtrSet<Value *, 8> SafePointers;
   4011 
   4012   // Collect safe addresses.
   4013   for (Loop::block_iterator BI = TheLoop->block_begin(),
   4014          BE = TheLoop->block_end(); BI != BE; ++BI) {
   4015     BasicBlock *BB = *BI;
   4016 
   4017     if (blockNeedsPredication(BB))
   4018       continue;
   4019 
   4020     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
   4021       if (LoadInst *LI = dyn_cast<LoadInst>(I))
    4022         SafePointers.insert(LI->getPointerOperand());
    4023       else if (StoreInst *SI = dyn_cast<StoreInst>(I))
    4024         SafePointers.insert(SI->getPointerOperand());
   4025     }
   4026   }
   4027 
   4028   // Collect the blocks that need predication.
   4029   BasicBlock *Header = TheLoop->getHeader();
   4030   for (Loop::block_iterator BI = TheLoop->block_begin(),
   4031          BE = TheLoop->block_end(); BI != BE; ++BI) {
   4032     BasicBlock *BB = *BI;
   4033 
   4034     // We don't support switch statements inside loops.
   4035     if (!isa<BranchInst>(BB->getTerminator())) {
   4036       emitAnalysis(VectorizationReport(BB->getTerminator())
   4037                    << "loop contains a switch statement");
   4038       return false;
   4039     }
   4040 
   4041     // We must be able to predicate all blocks that need to be predicated.
   4042     if (blockNeedsPredication(BB)) {
    4043       if (!blockCanBePredicated(BB, SafePointers)) {
   4044         emitAnalysis(VectorizationReport(BB->getTerminator())
   4045                      << "control flow cannot be substituted for a select");
   4046         return false;
   4047       }
   4048     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
   4049       emitAnalysis(VectorizationReport(BB->getTerminator())
   4050                    << "control flow cannot be substituted for a select");
   4051       return false;
   4052     }
   4053   }
   4054 
   4055   // We can if-convert this loop.
   4056   return true;
   4057 }
   4058 
   4059 bool LoopVectorizationLegality::canVectorize() {
   4060   // We must have a loop in canonical form. Loops with indirectbr in them cannot
   4061   // be canonicalized.
   4062   if (!TheLoop->getLoopPreheader()) {
   4063     emitAnalysis(
   4064         VectorizationReport() <<
   4065         "loop control flow is not understood by vectorizer");
   4066     return false;
   4067   }
   4068 
   4069   // We can only vectorize innermost loops.
   4070   if (!TheLoop->empty()) {
   4071     emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
   4072     return false;
   4073   }
   4074 
   4075   // We must have a single backedge.
   4076   if (TheLoop->getNumBackEdges() != 1) {
   4077     emitAnalysis(
   4078         VectorizationReport() <<
   4079         "loop control flow is not understood by vectorizer");
   4080     return false;
   4081   }
   4082 
   4083   // We must have a single exiting block.
   4084   if (!TheLoop->getExitingBlock()) {
   4085     emitAnalysis(
   4086         VectorizationReport() <<
   4087         "loop control flow is not understood by vectorizer");
   4088     return false;
   4089   }
   4090 
    4091   // We only handle bottom-tested loops, i.e. loops in which the condition is
   4092   // checked at the end of each iteration. With that we can assume that all
   4093   // instructions in the loop are executed the same number of times.
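           // In C terms this is roughly:  do { body; ++i; } while (i < n);  -- the
           // latch block both steps the induction variable and tests the exit
           // condition.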
   4094   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
   4095     emitAnalysis(
   4096         VectorizationReport() <<
   4097         "loop control flow is not understood by vectorizer");
   4098     return false;
   4099   }
   4100 
   4101   // We need to have a loop header.
   4102   DEBUG(dbgs() << "LV: Found a loop: " <<
   4103         TheLoop->getHeader()->getName() << '\n');
   4104 
   4105   // Check if we can if-convert non-single-bb loops.
   4106   unsigned NumBlocks = TheLoop->getNumBlocks();
   4107   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
   4108     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
   4109     return false;
   4110   }
   4111 
   4112   // ScalarEvolution needs to be able to find the exit count.
   4113   const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop);
   4114   if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
   4115     emitAnalysis(VectorizationReport()
   4116                  << "could not determine number of loop iterations");
   4117     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
   4118     return false;
   4119   }
   4120 
   4121   // Check if we can vectorize the instructions and CFG in this loop.
   4122   if (!canVectorizeInstrs()) {
   4123     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
   4124     return false;
   4125   }
   4126 
   4127   // Go over each instruction and look at memory deps.
   4128   if (!canVectorizeMemory()) {
   4129     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
   4130     return false;
   4131   }
   4132 
   4133   // Collect all of the variables that remain uniform after vectorization.
   4134   collectLoopUniforms();
   4135 
   4136   DEBUG(dbgs() << "LV: We can vectorize this loop"
   4137                << (LAI->getRuntimePointerChecking()->Need
   4138                        ? " (with a runtime bound check)"
   4139                        : "")
   4140                << "!\n");
   4141 
   4142   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
   4143 
   4144   // If an override option has been passed in for interleaved accesses, use it.
   4145   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
   4146     UseInterleaved = EnableInterleavedMemAccesses;
   4147 
   4148   // Analyze interleaved memory accesses.
   4149   if (UseInterleaved)
   4150     InterleaveInfo.analyzeInterleaving(Strides);
   4151 
   4152   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
   4153   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
   4154     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
   4155 
   4156   if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
   4157     emitAnalysis(VectorizationReport()
   4158                  << "Too many SCEV assumptions need to be made and checked "
   4159                  << "at runtime");
   4160     DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
   4161     return false;
   4162   }
   4163 
   4164   // Okay! We can vectorize. At this point we don't have any other mem analysis
   4165   // which may limit our maximum vectorization factor, so just return true with
   4166   // no restrictions.
   4167   return true;
   4168 }
   4169 
   4170 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
   4171   if (Ty->isPointerTy())
   4172     return DL.getIntPtrType(Ty);
   4173 
    4174   // It is possible that chars or shorts overflow when we ask for the loop's
    4175   // trip count; work around this by changing the type size.
   4176   if (Ty->getScalarSizeInBits() < 32)
   4177     return Type::getInt32Ty(Ty->getContext());
   4178 
   4179   return Ty;
   4180 }
   4181 
   4182 static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
   4183   Ty0 = convertPointerToIntegerType(DL, Ty0);
   4184   Ty1 = convertPointerToIntegerType(DL, Ty1);
   4185   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
   4186     return Ty0;
   4187   return Ty1;
   4188 }
   4189 
   4190 /// \brief Check that the instruction has outside loop users and is not an
   4191 /// identified reduction variable.
   4192 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
   4193                                SmallPtrSetImpl<Value *> &Reductions) {
   4194   // Reduction instructions are allowed to have exit users. All other
   4195   // instructions must not have external users.
   4196   if (!Reductions.count(Inst))
    4197     // Check that all of the users of the instruction are inside the loop.
   4198     for (User *U : Inst->users()) {
   4199       Instruction *UI = cast<Instruction>(U);
   4200       // This user may be a reduction exit value.
   4201       if (!TheLoop->contains(UI)) {
   4202         DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
   4203         return true;
   4204       }
   4205     }
   4206   return false;
   4207 }
   4208 
   4209 bool LoopVectorizationLegality::canVectorizeInstrs() {
   4210   BasicBlock *Header = TheLoop->getHeader();
   4211 
   4212   // Look for the attribute signaling the absence of NaNs.
   4213   Function &F = *Header->getParent();
   4214   const DataLayout &DL = F.getParent()->getDataLayout();
   4215   if (F.hasFnAttribute("no-nans-fp-math"))
   4216     HasFunNoNaNAttr =
   4217         F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
   4218 
   4219   // For each block in the loop.
   4220   for (Loop::block_iterator bb = TheLoop->block_begin(),
   4221        be = TheLoop->block_end(); bb != be; ++bb) {
   4222 
   4223     // Scan the instructions in the block and look for hazards.
   4224     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
   4225          ++it) {
   4226 
   4227       if (PHINode *Phi = dyn_cast<PHINode>(it)) {
   4228         Type *PhiTy = Phi->getType();
   4229         // Check that this PHI type is allowed.
   4230         if (!PhiTy->isIntegerTy() &&
   4231             !PhiTy->isFloatingPointTy() &&
   4232             !PhiTy->isPointerTy()) {
   4233           emitAnalysis(VectorizationReport(&*it)
   4234                        << "loop control flow is not understood by vectorizer");
    4235           DEBUG(dbgs() << "LV: Found a non-int, non-FP, non-pointer PHI.\n");
   4236           return false;
   4237         }
   4238 
   4239         // If this PHINode is not in the header block, then we know that we
   4240         // can convert it to select during if-conversion. No need to check if
   4241         // the PHIs in this block are induction or reduction variables.
   4242         if (*bb != Header) {
   4243           // Check that this instruction has no outside users or is an
   4244           // identified reduction value with an outside user.
   4245           if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit))
   4246             continue;
   4247           emitAnalysis(VectorizationReport(&*it) <<
   4248                        "value could not be identified as "
   4249                        "an induction or reduction variable");
   4250           return false;
   4251         }
   4252 
   4253         // We only allow if-converted PHIs with exactly two incoming values.
   4254         if (Phi->getNumIncomingValues() != 2) {
   4255           emitAnalysis(VectorizationReport(&*it)
   4256                        << "control flow not understood by vectorizer");
   4257           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
   4258           return false;
   4259         }
   4260 
   4261         InductionDescriptor ID;
   4262         if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) {
   4263           Inductions[Phi] = ID;
   4264           // Get the widest type.
   4265           if (!WidestIndTy)
   4266             WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
   4267           else
   4268             WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
   4269 
   4270           // Int inductions are special because we only allow one IV.
   4271           if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
   4272               ID.getStepValue()->isOne() &&
   4273               isa<Constant>(ID.getStartValue()) &&
   4274                 cast<Constant>(ID.getStartValue())->isNullValue()) {
   4275             // Use the phi node with the widest type as induction. Use the last
   4276             // one if there are multiple (no good reason for doing this other
   4277             // than it is expedient). We've checked that it begins at zero and
   4278             // steps by one, so this is a canonical induction variable.
   4279             if (!Induction || PhiTy == WidestIndTy)
   4280               Induction = Phi;
   4281           }
   4282 
   4283           DEBUG(dbgs() << "LV: Found an induction variable.\n");
   4284 
   4285           // Until we explicitly handle the case of an induction variable with
   4286           // an outside loop user we have to give up vectorizing this loop.
   4287           if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
   4288             emitAnalysis(VectorizationReport(&*it) <<
   4289                          "use of induction value outside of the "
   4290                          "loop is not handled by vectorizer");
   4291             return false;
   4292           }
   4293 
   4294           continue;
   4295         }
   4296 
   4297         if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop,
   4298                                                  Reductions[Phi])) {
   4299           if (Reductions[Phi].hasUnsafeAlgebra())
   4300             Requirements->addUnsafeAlgebraInst(
   4301                 Reductions[Phi].getUnsafeAlgebraInst());
   4302           AllowedExit.insert(Reductions[Phi].getLoopExitInstr());
   4303           continue;
   4304         }
   4305 
   4306         emitAnalysis(VectorizationReport(&*it) <<
   4307                      "value that could not be identified as "
   4308                      "reduction is used outside the loop");
   4309         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
   4310         return false;
   4311       }// end of PHI handling
   4312 
   4313       // We handle calls that:
   4314       //   * Are debug info intrinsics.
   4315       //   * Have a mapping to an IR intrinsic.
   4316       //   * Have a vector version available.
   4317       CallInst *CI = dyn_cast<CallInst>(it);
   4318       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
   4319           !(CI->getCalledFunction() && TLI &&
   4320             TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
   4321         emitAnalysis(VectorizationReport(&*it)
   4322                      << "call instruction cannot be vectorized");
   4323         DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
   4324         return false;
   4325       }
   4326 
    4327       // Intrinsics such as powi, cttz and ctlz are legal to vectorize only if
    4328       // their second (scalar) argument is loop invariant.
   4329       if (CI &&
   4330           hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
   4331         auto *SE = PSE.getSE();
   4332         if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
   4333           emitAnalysis(VectorizationReport(&*it)
   4334                        << "intrinsic instruction cannot be vectorized");
   4335           DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
   4336           return false;
   4337         }
   4338       }
   4339 
   4340       // Check that the instruction return type is vectorizable.
   4341       // Also, we can't vectorize extractelement instructions.
   4342       if ((!VectorType::isValidElementType(it->getType()) &&
   4343            !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
   4344         emitAnalysis(VectorizationReport(&*it)
   4345                      << "instruction return type cannot be vectorized");
   4346         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
   4347         return false;
   4348       }
   4349 
   4350       // Check that the stored type is vectorizable.
   4351       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
   4352         Type *T = ST->getValueOperand()->getType();
   4353         if (!VectorType::isValidElementType(T)) {
   4354           emitAnalysis(VectorizationReport(ST) <<
   4355                        "store instruction cannot be vectorized");
   4356           return false;
   4357         }
   4358         if (EnableMemAccessVersioning)
   4359           collectStridedAccess(ST);
   4360       }
   4361 
   4362       if (EnableMemAccessVersioning)
   4363         if (LoadInst *LI = dyn_cast<LoadInst>(it))
   4364           collectStridedAccess(LI);
   4365 
   4366       // Reduction instructions are allowed to have exit users.
   4367       // All other instructions must not have external users.
   4368       if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
   4369         emitAnalysis(VectorizationReport(&*it) <<
   4370                      "value cannot be used outside the loop");
   4371         return false;
   4372       }
   4373 
   4374     } // next instr.
   4375 
   4376   }
   4377 
   4378   if (!Induction) {
   4379     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
   4380     if (Inductions.empty()) {
   4381       emitAnalysis(VectorizationReport()
   4382                    << "loop induction variable could not be identified");
   4383       return false;
   4384     }
   4385   }
   4386 
   4387   // Now we know the widest induction type, check if our found induction
   4388   // is the same size. If it's not, unset it here and InnerLoopVectorizer
   4389   // will create another.
   4390   if (Induction && WidestIndTy != Induction->getType())
   4391     Induction = nullptr;
   4392 
   4393   return true;
   4394 }
   4395 
   4396 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
   4397   Value *Ptr = nullptr;
   4398   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
   4399     Ptr = LI->getPointerOperand();
   4400   else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
   4401     Ptr = SI->getPointerOperand();
   4402   else
   4403     return;
   4404 
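           // A typical candidate (a sketch) is an access like A[i * Stride] with a
           // loop-invariant Stride: recording the symbolic stride here lets the loop
           // later be versioned under the assumption Stride == 1, guarded by a
           // runtime check.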
   4405   Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop);
   4406   if (!Stride)
   4407     return;
   4408 
   4409   DEBUG(dbgs() << "LV: Found a strided access that we can version");
   4410   DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
   4411   Strides[Ptr] = Stride;
   4412   StrideSet.insert(Stride);
   4413 }
   4414 
   4415 void LoopVectorizationLegality::collectLoopUniforms() {
   4416   // We now know that the loop is vectorizable!
   4417   // Collect variables that will remain uniform after vectorization.
   4418   std::vector<Value*> Worklist;
   4419   BasicBlock *Latch = TheLoop->getLoopLatch();
   4420 
   4421   // Start with the conditional branch and walk up the block.
   4422   Worklist.push_back(Latch->getTerminator()->getOperand(0));
   4423 
   4424   // Also add all consecutive pointer values; these values will be uniform
   4425   // after vectorization (and subsequent cleanup) and, until revectorization is
   4426   // supported, all dependencies must also be uniform.
   4427   for (Loop::block_iterator B = TheLoop->block_begin(),
   4428        BE = TheLoop->block_end(); B != BE; ++B)
   4429     for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
   4430          I != IE; ++I)
   4431       if (I->getType()->isPointerTy() && isConsecutivePtr(&*I))
   4432         Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
   4433 
   4434   while (!Worklist.empty()) {
   4435     Instruction *I = dyn_cast<Instruction>(Worklist.back());
   4436     Worklist.pop_back();
   4437 
   4438     // Look at instructions inside this loop.
   4439     // Stop when reaching PHI nodes.
   4440     // TODO: we need to follow values all over the loop, not only in this block.
   4441     if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
   4442       continue;
   4443 
   4444     // This is a known uniform.
   4445     Uniforms.insert(I);
   4446 
   4447     // Insert all operands.
   4448     Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
   4449   }
   4450 }
   4451 
   4452 bool LoopVectorizationLegality::canVectorizeMemory() {
   4453   LAI = &LAA->getInfo(TheLoop, Strides);
   4454   auto &OptionalReport = LAI->getReport();
   4455   if (OptionalReport)
   4456     emitAnalysis(VectorizationReport(*OptionalReport));
   4457   if (!LAI->canVectorizeMemory())
   4458     return false;
   4459 
   4460   if (LAI->hasStoreToLoopInvariantAddress()) {
   4461     emitAnalysis(
   4462         VectorizationReport()
   4463         << "write to a loop invariant address could not be vectorized");
   4464     DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
   4465     return false;
   4466   }
   4467 
   4468   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
   4469   PSE.addPredicate(LAI->PSE.getUnionPredicate());
   4470 
   4471   return true;
   4472 }
   4473 
   4474 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
   4475   Value *In0 = const_cast<Value*>(V);
   4476   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
   4477   if (!PN)
   4478     return false;
   4479 
   4480   return Inductions.count(PN);
   4481 }
   4482 
   4483 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
   4484   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
   4485 }
   4486 
   4487 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
   4488                                            SmallPtrSetImpl<Value *> &SafePtrs) {
   4489 
   4490   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
   4491     // Check that we don't have a constant expression that can trap as operand.
   4492     for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
   4493          OI != OE; ++OI) {
   4494       if (Constant *C = dyn_cast<Constant>(*OI))
   4495         if (C->canTrap())
   4496           return false;
   4497     }
   4498     // We might be able to hoist the load.
   4499     if (it->mayReadFromMemory()) {
   4500       LoadInst *LI = dyn_cast<LoadInst>(it);
   4501       if (!LI)
   4502         return false;
   4503       if (!SafePtrs.count(LI->getPointerOperand())) {
   4504         if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
   4505           MaskedOp.insert(LI);
   4506           continue;
   4507         }
   4508         return false;
   4509       }
   4510     }
   4511 
    4512     // Stores can only be predicated in limited cases (handled below).
   4513     if (it->mayWriteToMemory()) {
   4514       StoreInst *SI = dyn_cast<StoreInst>(it);
   4515       // We only support predication of stores in basic blocks with one
   4516       // predecessor.
   4517       if (!SI)
   4518         return false;
   4519 
   4520       bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
   4521       bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
   4522 
   4523       if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
   4524           !isSinglePredecessor) {
   4525         // Build a masked store if it is legal for the target, otherwise
   4526         // scalarize the block.
   4527         bool isLegalMaskedOp =
   4528           isLegalMaskedStore(SI->getValueOperand()->getType(),
   4529                              SI->getPointerOperand());
   4530         if (isLegalMaskedOp) {
   4531           --NumPredStores;
   4532           MaskedOp.insert(SI);
   4533           continue;
   4534         }
   4535         return false;
   4536       }
   4537     }
   4538     if (it->mayThrow())
   4539       return false;
   4540 
   4541     // The instructions below can trap.
   4542     switch (it->getOpcode()) {
   4543     default: continue;
   4544     case Instruction::UDiv:
   4545     case Instruction::SDiv:
   4546     case Instruction::URem:
   4547     case Instruction::SRem:
   4548       return false;
   4549     }
   4550   }
   4551 
   4552   return true;
   4553 }
   4554 
   4555 void InterleavedAccessInfo::collectConstStridedAccesses(
   4556     MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
   4557     const ValueToValueMap &Strides) {
   4558   // Holds load/store instructions in program order.
   4559   SmallVector<Instruction *, 16> AccessList;
   4560 
   4561   for (auto *BB : TheLoop->getBlocks()) {
   4562     bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
   4563 
   4564     for (auto &I : *BB) {
   4565       if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
   4566         continue;
   4567       // FIXME: Currently we can't handle mixed accesses and predicated accesses
   4568       if (IsPred)
   4569         return;
   4570 
   4571       AccessList.push_back(&I);
   4572     }
   4573   }
   4574 
   4575   if (AccessList.empty())
   4576     return;
   4577 
   4578   auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
   4579   for (auto I : AccessList) {
   4580     LoadInst *LI = dyn_cast<LoadInst>(I);
   4581     StoreInst *SI = dyn_cast<StoreInst>(I);
   4582 
   4583     Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   4584     int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides);
   4585 
   4586     // The factor of the corresponding interleave group.
   4587     unsigned Factor = std::abs(Stride);
   4588 
   4589     // Ignore the access if the factor is too small or too large.
   4590     if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
   4591       continue;
   4592 
   4593     const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
   4594     PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
   4595     unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
   4596 
   4597     // An alignment of 0 means target ABI alignment.
   4598     unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
   4599     if (!Align)
   4600       Align = DL.getABITypeAlignment(PtrTy->getElementType());
   4601 
   4602     StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
   4603   }
   4604 }
   4605 
   4606 // Analyze interleaved accesses and collect them into interleave groups.
   4607 //
   4608 // Notice that the vectorization on interleaved groups will change instruction
   4609 // orders and may break dependences. But the memory dependence check guarantees
   4610 // that there is no overlap between two pointers of different strides, element
   4611 // sizes or underlying bases.
   4612 //
   4613 // For pointers sharing the same stride, element size and underlying base, no
   4614 // need to worry about Read-After-Write dependences and Write-After-Read
   4615 // dependences.
   4616 //
   4617 // E.g. The RAW dependence:  A[i] = a;
   4618 //                           b = A[i];
   4619 // This won't exist as it is a store-load forwarding conflict, which has
   4620 // already been checked and forbidden in the dependence check.
   4621 //
   4622 // E.g. The WAR dependence:  a = A[i];  // (1)
   4623 //                           A[i] = b;  // (2)
   4624 // The store group of (2) is always inserted at or below (2), and the load group
   4625 // of (1) is always inserted at or above (1). The dependence is safe.
   4626 void InterleavedAccessInfo::analyzeInterleaving(
   4627     const ValueToValueMap &Strides) {
   4628   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
   4629 
   4630   // Holds all the stride accesses.
   4631   MapVector<Instruction *, StrideDescriptor> StrideAccesses;
   4632   collectConstStridedAccesses(StrideAccesses, Strides);
   4633 
   4634   if (StrideAccesses.empty())
   4635     return;
   4636 
   4637   // Holds all interleaved store groups temporarily.
   4638   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
   4639 
    4640   // Search load-load/store-store pairs B-A in bottom-up order and try to
   4641   // insert B into the interleave group of A according to 3 rules:
   4642   //   1. A and B have the same stride.
   4643   //   2. A and B have the same memory object size.
   4644   //   3. B belongs to the group according to the distance.
   4645   //
   4646   // The bottom-up order can avoid breaking the Write-After-Write dependences
   4647   // between two pointers of the same base.
   4648   // E.g.  A[i]   = a;   (1)
   4649   //       A[i]   = b;   (2)
   4650   //       A[i+1] = c    (3)
   4651   // We form the group (2)+(3) in front, so (1) has to form groups with accesses
   4652   // above (1), which guarantees that (1) is always above (2).
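           // For example (a sketch): accesses to A[2*i] and A[2*i+1] both have
           // stride 2 and equal element size, and their SCEV difference is one
           // element, so they end up in a single interleave group of factor 2 with
           // indices 0 and 1.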
   4653   for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
   4654        ++I) {
   4655     Instruction *A = I->first;
   4656     StrideDescriptor DesA = I->second;
   4657 
   4658     InterleaveGroup *Group = getInterleaveGroup(A);
   4659     if (!Group) {
   4660       DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
   4661       Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
   4662     }
   4663 
   4664     if (A->mayWriteToMemory())
   4665       StoreGroups.insert(Group);
   4666 
   4667     for (auto II = std::next(I); II != E; ++II) {
   4668       Instruction *B = II->first;
   4669       StrideDescriptor DesB = II->second;
   4670 
   4671       // Ignore if B is already in a group or B is a different memory operation.
   4672       if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
   4673         continue;
   4674 
    4675       // Check rules 1 and 2.
   4676       if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
   4677         continue;
   4678 
    4679       // Calculate the distance and prepare for rule 3.
   4680       const SCEVConstant *DistToA = dyn_cast<SCEVConstant>(
   4681           PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev));
   4682       if (!DistToA)
   4683         continue;
   4684 
   4685       int DistanceToA = DistToA->getAPInt().getSExtValue();
   4686 
    4687       // Skip if the distance is not a multiple of the size, as then the two
    4688       // accesses are not in the same group.
   4689       if (DistanceToA % static_cast<int>(DesA.Size))
   4690         continue;
   4691 
    4692       // The index of B is the index of A plus the distance to A in elements.
   4693       int IndexB =
   4694           Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
   4695 
   4696       // Try to insert B into the group.
   4697       if (Group->insertMember(B, IndexB, DesB.Align)) {
   4698         DEBUG(dbgs() << "LV: Inserted:" << *B << '\n'
   4699                      << "    into the interleave group with" << *A << '\n');
   4700         InterleaveGroupMap[B] = Group;
   4701 
   4702         // Set the first load in program order as the insert position.
   4703         if (B->mayReadFromMemory())
   4704           Group->setInsertPos(B);
   4705       }
   4706     } // Iteration on instruction B
   4707   }   // Iteration on instruction A
   4708 
   4709   // Remove interleaved store groups with gaps.
   4710   for (InterleaveGroup *Group : StoreGroups)
   4711     if (Group->getNumMembers() != Group->getFactor())
   4712       releaseGroup(Group);
   4713 }
   4714 
   4715 LoopVectorizationCostModel::VectorizationFactor
   4716 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
    4717   // Width 1 means no vectorization.
   4718   VectorizationFactor Factor = { 1U, 0U };
   4719   if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
   4720     emitAnalysis(VectorizationReport() <<
   4721                  "runtime pointer checks needed. Enable vectorization of this "
   4722                  "loop with '#pragma clang loop vectorize(enable)' when "
   4723                  "compiling with -Os/-Oz");
   4724     DEBUG(dbgs() <<
   4725           "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
   4726     return Factor;
   4727   }
   4728 
   4729   if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
   4730     emitAnalysis(VectorizationReport() <<
   4731                  "store that is conditionally executed prevents vectorization");
   4732     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
   4733     return Factor;
   4734   }
   4735 
   4736   // Find the trip count.
   4737   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   4738   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
   4739 
   4740   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   4741   unsigned SmallestType, WidestType;
   4742   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
   4743   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
   4744   unsigned MaxSafeDepDist = -1U;
   4745   if (Legal->getMaxSafeDepDistBytes() != -1U)
   4746     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
   4747   WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
   4748                     WidestRegister : MaxSafeDepDist);
   4749   unsigned MaxVectorSize = WidestRegister / WidestType;
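           // For example, with 256-bit vector registers and a widest element type of
           // 32 bits, MaxVectorSize is 256 / 32 = 8 lanes.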
   4750 
   4751   DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
   4752                << WidestType << " bits.\n");
   4753   DEBUG(dbgs() << "LV: The Widest register is: "
   4754           << WidestRegister << " bits.\n");
   4755 
   4756   if (MaxVectorSize == 0) {
   4757     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
   4758     MaxVectorSize = 1;
   4759   }
   4760 
   4761   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
   4762          " into one vector!");
   4763 
   4764   unsigned VF = MaxVectorSize;
   4765   if (MaximizeBandwidth && !OptForSize) {
   4766     // Collect all viable vectorization factors.
   4767     SmallVector<unsigned, 8> VFs;
   4768     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
   4769     for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
   4770       VFs.push_back(VS);
   4771 
   4772     // For each VF calculate its register usage.
   4773     auto RUs = calculateRegisterUsage(VFs);
   4774 
   4775     // Select the largest VF which doesn't require more registers than existing
   4776     // ones.
   4777     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
   4778     for (int i = RUs.size() - 1; i >= 0; --i) {
   4779       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
   4780         VF = VFs[i];
   4781         break;
   4782       }
   4783     }
   4784   }
   4785 
   4786   // If we optimize the program for size, avoid creating the tail loop.
   4787   if (OptForSize) {
    4788     // If we cannot calculate the trip count (or it is 1), don't try to vectorize.
   4789     if (TC < 2) {
   4790       emitAnalysis
   4791         (VectorizationReport() <<
   4792          "unable to calculate the loop count due to complex control flow");
   4793       DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
   4794       return Factor;
   4795     }
   4796 
   4797     // Find the maximum SIMD width that can fit within the trip count.
   4798     VF = TC % MaxVectorSize;
   4799 
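             // For example, with MaxVectorSize == 8: a known trip count of 32 leaves
             // no remainder, so VF becomes 8 and no tail loop is needed; a trip count
             // of 30 leaves a remainder of 6, which would require a scalar tail, so
             // we give up under -Os/-Oz.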
   4800     if (VF == 0)
   4801       VF = MaxVectorSize;
   4802     else {
   4803       // If the trip count that we found modulo the vectorization factor is not
   4804       // zero then we require a tail.
   4805       emitAnalysis(VectorizationReport() <<
   4806                    "cannot optimize for size and vectorize at the "
   4807                    "same time. Enable vectorization of this loop "
   4808                    "with '#pragma clang loop vectorize(enable)' "
   4809                    "when compiling with -Os/-Oz");
   4810       DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
   4811       return Factor;
   4812     }
   4813   }
   4814 
   4815   int UserVF = Hints->getWidth();
   4816   if (UserVF != 0) {
   4817     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
   4818     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
   4819 
   4820     Factor.Width = UserVF;
   4821     return Factor;
   4822   }
   4823 
   4824   float Cost = expectedCost(1);
   4825 #ifndef NDEBUG
   4826   const float ScalarCost = Cost;
   4827 #endif /* NDEBUG */
   4828   unsigned Width = 1;
   4829   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
   4830 
   4831   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
   4832   // Ignore scalar width, because the user explicitly wants vectorization.
   4833   if (ForceVectorization && VF > 1) {
   4834     Width = 2;
   4835     Cost = expectedCost(Width) / (float)Width;
   4836   }
   4837 
   4838   for (unsigned i = 2; i <= VF; i *= 2) {
   4839     // Notice that the vector loop needs to be executed fewer times, so
   4840     // we need to divide the cost of the vector loops by the width of
   4841     // the vector elements.
   4842     float VectorCost = expectedCost(i) / (float)i;
   4843     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
   4844           (int)VectorCost << ".\n");
   4845     if (VectorCost < Cost) {
   4846       Cost = VectorCost;
   4847       Width = i;
   4848     }
   4849   }
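          // For illustration (hypothetical costs): if the scalar iteration costs 8,
          // VF = 2 costs 10 (5 per lane) and VF = 4 costs 12 (3 per lane), the loop
          // above selects Width = 4.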
   4850 
   4851   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
   4852         << "LV: Vectorization seems to be not beneficial, "
   4853         << "but was forced by a user.\n");
   4854   DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
   4855   Factor.Width = Width;
   4856   Factor.Cost = Width * Cost;
   4857   return Factor;
   4858 }
   4859 
   4860 std::pair<unsigned, unsigned>
   4861 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
   4862   unsigned MinWidth = -1U;
   4863   unsigned MaxWidth = 8;
   4864   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
   4865 
   4866   // For each block.
   4867   for (Loop::block_iterator bb = TheLoop->block_begin(),
   4868        be = TheLoop->block_end(); bb != be; ++bb) {
   4869     BasicBlock *BB = *bb;
   4870 
   4871     // For each instruction in the loop.
   4872     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
   4873       Type *T = it->getType();
   4874 
   4875       // Skip ignored values.
   4876       if (ValuesToIgnore.count(&*it))
   4877         continue;
   4878 
   4879       // Only examine Loads, Stores and PHINodes.
   4880       if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
   4881         continue;
   4882 
   4883       // Examine PHI nodes that are reduction variables. Update the type to
   4884       // account for the recurrence type.
   4885       if (PHINode *PN = dyn_cast<PHINode>(it)) {
   4886         if (!Legal->isReductionVariable(PN))
   4887           continue;
   4888         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
   4889         T = RdxDesc.getRecurrenceType();
   4890       }
   4891 
   4892       // Examine the stored values.
   4893       if (StoreInst *ST = dyn_cast<StoreInst>(it))
   4894         T = ST->getValueOperand()->getType();
   4895 
   4896       // Ignore loaded pointer types and stored pointer types that are not
   4897       // consecutive. However, we do want to take consecutive stores/loads of
   4898       // pointer vectors into account.
   4899       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
   4900         continue;
   4901 
   4902       MinWidth = std::min(MinWidth,
   4903                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
   4904       MaxWidth = std::max(MaxWidth,
   4905                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
   4906     }
   4907   }
   4908 
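          // For illustration: a loop that loads i8 values and stores i32 results
          // would report {8, 32} here.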
   4909   return {MinWidth, MaxWidth};
   4910 }
   4911 
   4912 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
   4913                                                            unsigned VF,
   4914                                                            unsigned LoopCost) {
   4915 
   4916   // -- The interleave heuristics --
   4917   // We interleave the loop in order to expose ILP and reduce the loop overhead.
   4918   // There are many micro-architectural considerations that we can't predict
   4919   // at this level. For example, frontend pressure (on decode or fetch) due to
   4920   // code size, or the number and capabilities of the execution ports.
   4921   //
   4922   // We use the following heuristics to select the interleave count:
   4923   // 1. If the code has reductions, then we interleave to break the cross
   4924   // iteration dependency.
   4925   // 2. If the loop is really small, then we interleave to reduce the loop
   4926   // overhead.
   4927   // 3. We don't interleave if we think that we will spill registers to memory
   4928   // due to the increased register pressure.
   4929 
   4930   // When we optimize for size, we don't interleave.
   4931   if (OptForSize)
   4932     return 1;
   4933 
   4934   // Do not interleave if there is a maximum safe dependence distance.
   4935   if (Legal->getMaxSafeDepDistBytes() != -1U)
   4936     return 1;
   4937 
   4938   // Do not interleave loops with a relatively small trip count.
   4939   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   4940   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
   4941     return 1;
   4942 
   4943   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
   4944   DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
   4945         " registers\n");
   4946 
   4947   if (VF == 1) {
   4948     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
   4949       TargetNumRegisters = ForceTargetNumScalarRegs;
   4950   } else {
   4951     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
   4952       TargetNumRegisters = ForceTargetNumVectorRegs;
   4953   }
   4954 
   4955   RegisterUsage R = calculateRegisterUsage({VF})[0];
   4956   // We divide by these constants so assume that we have at least one
   4957   // instruction that uses at least one register.
   4958   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
   4959   R.NumInstructions = std::max(R.NumInstructions, 1U);
   4960 
   4961   // We calculate the interleave count using the following formula.
   4962   // Subtract the number of loop invariants from the number of available
   4963   // registers. These registers are used by all of the interleaved instances.
   4964   // Next, divide the remaining registers by the number of registers that is
   4965   // required by the loop, in order to estimate how many parallel instances
   4966   // fit without causing spills. All of this is rounded down if necessary to be
   4967   // a power of two. We want a power-of-two interleave count to simplify any
   4968   // addressing operations or alignment considerations.
   4969   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
   4970                               R.MaxLocalUsers);
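          // For illustration (hypothetical register file): with 16 vector registers,
          // 2 loop-invariant values, and 5 maximum local users, IC starts at
          // PowerOf2Floor((16 - 2) / 5) = 2.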
   4971 
   4972   // Don't count the induction variable as interleaved.
   4973   if (EnableIndVarRegisterHeur)
   4974     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
   4975                        std::max(1U, (R.MaxLocalUsers - 1)));
   4976 
   4977   // Clamp the interleave ranges to reasonable counts.
   4978   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
   4979 
   4980   // Check if the user has overridden the max.
   4981   if (VF == 1) {
   4982     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
   4983       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
   4984   } else {
   4985     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
   4986       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
   4987   }
   4988 
   4989   // If we did not calculate the cost for VF (because the user selected the VF)
   4990   // then we calculate the cost of VF here.
   4991   if (LoopCost == 0)
   4992     LoopCost = expectedCost(VF);
   4993 
   4994   // Clamp the calculated IC to be between 1 and the max interleave count
   4995   // that the target allows.
   4996   if (IC > MaxInterleaveCount)
   4997     IC = MaxInterleaveCount;
   4998   else if (IC < 1)
   4999     IC = 1;
   5000 
   5001   // Interleave if we vectorized this loop and there is a reduction that could
   5002   // benefit from interleaving.
   5003   if (VF > 1 && Legal->getReductionVars()->size()) {
   5004     DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
   5005     return IC;
   5006   }
   5007 
   5008   // Note that if we've already vectorized the loop we will have done the
   5009   // runtime check and so interleaving won't require further checks.
   5010   bool InterleavingRequiresRuntimePointerCheck =
   5011       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
   5012 
   5013   // We want to interleave small loops in order to reduce the loop overhead and
   5014   // potentially expose ILP opportunities.
   5015   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
   5016   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
   5017     // We assume that the loop overhead costs about 1, and we use the cost model
   5018     // to estimate the cost of the loop body; we interleave until the loop
   5019     // overhead is roughly 5% of the total cost of the loop.
   5020     unsigned SmallIC =
   5021         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
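            // For illustration (hypothetical values): with SmallLoopCost = 20 and
            // LoopCost = 7, SmallIC = std::min(IC, PowerOf2Floor(20 / 7)) = std::min(IC, 2).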
   5022 
   5023     // Interleave until store/load ports (estimated by max interleave count) are
   5024     // saturated.
   5025     unsigned NumStores = Legal->getNumStores();
   5026     unsigned NumLoads = Legal->getNumLoads();
   5027     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
   5028     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
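            // For illustration: IC = 8 with 2 stores and 1 load gives StoresIC = 4 and
            // LoadsIC = 8.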
   5029 
   5030     // If we have a scalar reduction (vector reductions are already dealt with
   5031     // by this point), we can increase the critical path length if the loop
   5032     // we're interleaving is inside another loop. Limit, by default to 2, so the
   5033     // we're interleaving is inside another loop. Limit, by default, to 2 so the
   5034     if (Legal->getReductionVars()->size() &&
   5035         TheLoop->getLoopDepth() > 1) {
   5036       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
   5037       SmallIC = std::min(SmallIC, F);
   5038       StoresIC = std::min(StoresIC, F);
   5039       LoadsIC = std::min(LoadsIC, F);
   5040     }
   5041 
   5042     if (EnableLoadStoreRuntimeInterleave &&
   5043         std::max(StoresIC, LoadsIC) > SmallIC) {
   5044       DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
   5045       return std::max(StoresIC, LoadsIC);
   5046     }
   5047 
   5048     DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
   5049     return SmallIC;
   5050   }
   5051 
   5052   // Interleave if this is a large loop (small loops are already dealt with by
   5053   // this point) that could benefit from interleaving.
   5054   bool HasReductions = (Legal->getReductionVars()->size() > 0);
   5055   if (TTI.enableAggressiveInterleaving(HasReductions)) {
   5056     DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
   5057     return IC;
   5058   }
   5059 
   5060   DEBUG(dbgs() << "LV: Not Interleaving.\n");
   5061   return 1;
   5062 }
   5063 
   5064 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
   5065 LoopVectorizationCostModel::calculateRegisterUsage(
   5066     const SmallVector<unsigned, 8> &VFs) {
   5067   // This function calculates the register usage by measuring the highest number
   5068   // of values that are alive at a single location. Obviously, this is a very
   5069   // rough estimation. We scan the loop in topological order and
   5070   // assign a number to each instruction. We use RPO to ensure that defs are
   5071   // met before their users. We assume that each instruction that has in-loop
   5072   // users starts an interval. We record every time that an in-loop value is
   5073   // used, so we have a list of the first and last occurrences of each
   5074   // instruction. Next, we transpose this data structure into a multi map that
   5075   // holds the list of intervals that *end* at a specific location. This multi
   5076   // map allows us to perform a linear search. We scan the instructions linearly
   5077   // and record each time that a new interval starts, by placing it in a set.
   5078   // If we find this value in the multi-map then we remove it from the set.
   5079   // The max register usage is the maximum size of the set.
   5080   // We also search for instructions that are defined outside the loop, but are
   5081   // used inside the loop. We need this number separately from the max-interval
   5082   // usage number because when we unroll, loop-invariant values do not take
   5083   // more registers.
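          // Roughly speaking, a value defined at index 3 whose last in-loop use is at
          // index 10 is treated as live at every index in between, and contributes to
          // the open-interval count over that range.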
   5084   LoopBlocksDFS DFS(TheLoop);
   5085   DFS.perform(LI);
   5086 
   5087   RegisterUsage RU;
   5088   RU.NumInstructions = 0;
   5089 
   5090   // Each 'key' in the map opens a new interval. The values
   5091   // of the map are the index of the 'last seen' usage of the
   5092   // instruction that is the key.
   5093   typedef DenseMap<Instruction*, unsigned> IntervalMap;
   5094   // Maps an index to the instruction at that index.
   5095   DenseMap<unsigned, Instruction*> IdxToInstr;
   5096   // Marks the end of each interval.
   5097   IntervalMap EndPoint;
   5098   // Saves the set of instructions that are used inside the loop.
   5099   SmallSet<Instruction*, 8> Ends;
   5100   // Saves the list of values that are used in the loop but are
   5101   // defined outside the loop, such as arguments and constants.
   5102   SmallPtrSet<Value*, 8> LoopInvariants;
   5103 
   5104   unsigned Index = 0;
   5105   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
   5106        be = DFS.endRPO(); bb != be; ++bb) {
   5107     RU.NumInstructions += (*bb)->size();
   5108     for (Instruction &I : **bb) {
   5109       IdxToInstr[Index++] = &I;
   5110 
   5111       // Save the end location of each USE.
   5112       for (unsigned i = 0; i < I.getNumOperands(); ++i) {
   5113         Value *U = I.getOperand(i);
   5114         Instruction *Instr = dyn_cast<Instruction>(U);
   5115 
   5116         // Ignore non-instruction values such as arguments, constants, etc.
   5117         if (!Instr) continue;
   5118 
   5119         // If this instruction is outside the loop then record it and continue.
   5120         if (!TheLoop->contains(Instr)) {
   5121           LoopInvariants.insert(Instr);
   5122           continue;
   5123         }
   5124 
   5125         // Overwrite previous end points.
   5126         EndPoint[Instr] = Index;
   5127         Ends.insert(Instr);
   5128       }
   5129     }
   5130   }
   5131 
   5132   // Saves the list of intervals that end with the index in 'key'.
   5133   typedef SmallVector<Instruction*, 2> InstrList;
   5134   DenseMap<unsigned, InstrList> TransposeEnds;
   5135 
   5136   // Transpose the EndPoints to a list of values that end at each index.
   5137   for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
   5138        it != e; ++it)
   5139     TransposeEnds[it->second].push_back(it->first);
   5140 
   5141   SmallSet<Instruction*, 8> OpenIntervals;
   5142 
   5143   // Get the size of the widest register.
   5144   unsigned MaxSafeDepDist = -1U;
   5145   if (Legal->getMaxSafeDepDistBytes() != -1U)
   5146     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
   5147   unsigned WidestRegister =
   5148       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
   5149   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
   5150 
   5151   SmallVector<RegisterUsage, 8> RUs(VFs.size());
   5152   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
   5153 
   5154   DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
   5155 
   5156   // A lambda that gets the register usage for the given type and VF.
   5157   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
   5158     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
   5159     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
   5160   };
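          // For illustration (hypothetical target): an i32 value at VF = 8 with
          // 128-bit registers occupies 8 * 32 / 128 = 2 registers.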
   5161 
   5162   for (unsigned int i = 0; i < Index; ++i) {
   5163     Instruction *I = IdxToInstr[i];
   5164     // Ignore instructions that are never used within the loop.
   5165     if (!Ends.count(I)) continue;
   5166 
   5167     // Remove all of the instructions that end at this location.
   5168     InstrList &List = TransposeEnds[i];
   5169     for (unsigned int j = 0, e = List.size(); j < e; ++j)
   5170       OpenIntervals.erase(List[j]);
   5171 
   5172     // Skip ignored values.
   5173     if (ValuesToIgnore.count(I))
   5174       continue;
   5175 
   5176     // For each VF find the maximum usage of registers.
   5177     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
   5178       if (VFs[j] == 1) {
   5179         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
   5180         continue;
   5181       }
   5182 
   5183       // Count the number of live intervals.
   5184       unsigned RegUsage = 0;
   5185       for (auto Inst : OpenIntervals) {
   5186         // Skip ignored values for VF > 1.
   5187         if (VecValuesToIgnore.count(Inst))
   5188           continue;
   5189         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
   5190       }
   5191       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
   5192     }
   5193 
   5194     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
   5195                  << OpenIntervals.size() << '\n');
   5196 
   5197     // Add the current instruction to the list of open intervals.
   5198     OpenIntervals.insert(I);
   5199   }
   5200 
   5201   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
   5202     unsigned Invariant = 0;
   5203     if (VFs[i] == 1)
   5204       Invariant = LoopInvariants.size();
   5205     else {
   5206       for (auto Inst : LoopInvariants)
   5207         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
   5208     }
   5209 
   5210     DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] <<  '\n');
   5211     DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
   5212     DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
   5213     DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
   5214 
   5215     RU.LoopInvariantRegs = Invariant;
   5216     RU.MaxLocalUsers = MaxUsages[i];
   5217     RUs[i] = RU;
   5218   }
   5219 
   5220   return RUs;
   5221 }
   5222 
   5223 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   5224   unsigned Cost = 0;
   5225 
   5226   // For each block.
   5227   for (Loop::block_iterator bb = TheLoop->block_begin(),
   5228        be = TheLoop->block_end(); bb != be; ++bb) {
   5229     unsigned BlockCost = 0;
   5230     BasicBlock *BB = *bb;
   5231 
   5232     // For each instruction in the old loop.
   5233     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
   5234       // Skip dbg intrinsics.
   5235       if (isa<DbgInfoIntrinsic>(it))
   5236         continue;
   5237 
   5238       // Skip ignored values.
   5239       if (ValuesToIgnore.count(&*it))
   5240         continue;
   5241 
   5242       unsigned C = getInstructionCost(&*it, VF);
   5243 
   5244       // Check if we should override the cost.
   5245       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
   5246         C = ForceTargetInstructionCost;
   5247 
   5248       BlockCost += C;
   5249       DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
   5250             VF << " For instruction: " << *it << '\n');
   5251     }
   5252 
   5253     // We assume that if-converted blocks have a 50% chance of being executed.
   5254     // When the code is scalar, some of the blocks are avoided due to control flow.
   5255     // When the code is vectorized we execute all code paths.
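            // For illustration: an if-converted block whose instructions cost 4
            // contributes only 2 to the VF == 1 estimate.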
   5256     if (VF == 1 && Legal->blockNeedsPredication(*bb))
   5257       BlockCost /= 2;
   5258 
   5259     Cost += BlockCost;
   5260   }
   5261 
   5262   return Cost;
   5263 }
   5264 
   5265 /// \brief Check whether the address computation for a non-consecutive memory
   5266 /// access looks like an unlikely candidate for being merged into the indexing
   5267 /// mode.
   5268 ///
   5269 /// We look for a GEP which has one index that is an induction variable and all
   5270 /// other indices are loop invariant. If the stride of this access is also
   5271 /// within a small bound we decide that this address computation can likely be
   5272 /// merged into the addressing mode.
   5273 /// In all other cases, we identify the address computation as complex.
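        /// For illustration, a GEP of the form 'getelementptr %A, %inv, %iv', where %iv
        /// is an induction variable, %inv is loop invariant, and the step is a small
        /// constant, is treated as cheap; anything else is flagged as complex.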
   5274 static bool isLikelyComplexAddressComputation(Value *Ptr,
   5275                                               LoopVectorizationLegality *Legal,
   5276                                               ScalarEvolution *SE,
   5277                                               const Loop *TheLoop) {
   5278   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
   5279   if (!Gep)
   5280     return true;
   5281 
   5282   // We are looking for a gep with all loop invariant indices except for one
   5283   // which should be an induction variable.
   5284   unsigned NumOperands = Gep->getNumOperands();
   5285   for (unsigned i = 1; i < NumOperands; ++i) {
   5286     Value *Opd = Gep->getOperand(i);
   5287     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
   5288         !Legal->isInductionVariable(Opd))
   5289       return true;
   5290   }
   5291 
   5292   // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
   5293   // can likely be merged into the address computation.
   5294   unsigned MaxMergeDistance = 64;
   5295 
   5296   const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
   5297   if (!AddRec)
   5298     return true;
   5299 
   5300   // Check that the step is constant.
   5301   const SCEV *Step = AddRec->getStepRecurrence(*SE);
   5302   // Calculate the pointer stride and check if it is consecutive.
   5303   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
   5304   if (!C)
   5305     return true;
   5306 
   5307   const APInt &APStepVal = C->getAPInt();
   5308 
   5309   // Huge step value - give up.
   5310   if (APStepVal.getBitWidth() > 64)
   5311     return true;
   5312 
   5313   int64_t StepVal = APStepVal.getSExtValue();
   5314 
   5315   return StepVal > MaxMergeDistance;
   5316 }
   5317 
   5318 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
   5319   return Legal->hasStride(I->getOperand(0)) ||
   5320          Legal->hasStride(I->getOperand(1));
   5321 }
   5322 
   5323 unsigned
   5324 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   5325   // If we know that this instruction will remain uniform, check the cost of
   5326   // the scalar version.
   5327   if (Legal->isUniformAfterVectorization(I))
   5328     VF = 1;
   5329 
   5330   Type *RetTy = I->getType();
   5331   if (VF > 1 && MinBWs.count(I))
   5332     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
   5333   Type *VectorTy = ToVectorTy(RetTy, VF);
   5334   auto SE = PSE.getSE();
   5335 
   5336   // TODO: We need to estimate the cost of intrinsic calls.
   5337   switch (I->getOpcode()) {
   5338   case Instruction::GetElementPtr:
   5339     // We mark this instruction as zero-cost because the cost of GEPs in
   5340     // vectorized code depends on whether the corresponding memory instruction
   5341     // is scalarized or not. Therefore, we handle GEPs with the memory
   5342     // instruction cost.
   5343     return 0;
   5344   case Instruction::Br: {
   5345     return TTI.getCFInstrCost(I->getOpcode());
   5346   }
   5347   case Instruction::PHI:
   5348     //TODO: IF-converted IFs become selects.
   5349     return 0;
   5350   case Instruction::Add:
   5351   case Instruction::FAdd:
   5352   case Instruction::Sub:
   5353   case Instruction::FSub:
   5354   case Instruction::Mul:
   5355   case Instruction::FMul:
   5356   case Instruction::UDiv:
   5357   case Instruction::SDiv:
   5358   case Instruction::FDiv:
   5359   case Instruction::URem:
   5360   case Instruction::SRem:
   5361   case Instruction::FRem:
   5362   case Instruction::Shl:
   5363   case Instruction::LShr:
   5364   case Instruction::AShr:
   5365   case Instruction::And:
   5366   case Instruction::Or:
   5367   case Instruction::Xor: {
   5368     // Since we will replace the stride by 1 the multiplication should go away.
   5369     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
   5370       return 0;
   5371     // Certain instructions can be cheaper to vectorize if they have a constant
   5372     // second vector operand. One example of this is shifts on x86.
   5373     TargetTransformInfo::OperandValueKind Op1VK =
   5374       TargetTransformInfo::OK_AnyValue;
   5375     TargetTransformInfo::OperandValueKind Op2VK =
   5376       TargetTransformInfo::OK_AnyValue;
   5377     TargetTransformInfo::OperandValueProperties Op1VP =
   5378         TargetTransformInfo::OP_None;
   5379     TargetTransformInfo::OperandValueProperties Op2VP =
   5380         TargetTransformInfo::OP_None;
   5381     Value *Op2 = I->getOperand(1);
   5382 
   5383     // Check for a splat of a constant or for a non-uniform vector of constants.
   5384     if (isa<ConstantInt>(Op2)) {
   5385       ConstantInt *CInt = cast<ConstantInt>(Op2);
   5386       if (CInt && CInt->getValue().isPowerOf2())
   5387         Op2VP = TargetTransformInfo::OP_PowerOf2;
   5388       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
   5389     } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
   5390       Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
   5391       Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
   5392       if (SplatValue) {
   5393         ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
   5394         if (CInt && CInt->getValue().isPowerOf2())
   5395           Op2VP = TargetTransformInfo::OP_PowerOf2;
   5396         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
   5397       }
   5398     }
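            // For illustration: a shift such as 'shl i32 %x, 4' is reported with a
            // uniform, power-of-two constant second operand, which targets like x86
            // can often vectorize more cheaply.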
   5399 
   5400     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
   5401                                       Op1VP, Op2VP);
   5402   }
   5403   case Instruction::Select: {
   5404     SelectInst *SI = cast<SelectInst>(I);
   5405     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
   5406     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
   5407     Type *CondTy = SI->getCondition()->getType();
   5408     if (!ScalarCond)
   5409       CondTy = VectorType::get(CondTy, VF);
   5410 
   5411     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
   5412   }
   5413   case Instruction::ICmp:
   5414   case Instruction::FCmp: {
   5415     Type *ValTy = I->getOperand(0)->getType();
   5416     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
   5417     auto It = MinBWs.find(Op0AsInstruction);
   5418     if (VF > 1 && It != MinBWs.end())
   5419       ValTy = IntegerType::get(ValTy->getContext(), It->second);
   5420     VectorTy = ToVectorTy(ValTy, VF);
   5421     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
   5422   }
   5423   case Instruction::Store:
   5424   case Instruction::Load: {
   5425     StoreInst *SI = dyn_cast<StoreInst>(I);
   5426     LoadInst *LI = dyn_cast<LoadInst>(I);
   5427     Type *ValTy = (SI ? SI->getValueOperand()->getType() :
   5428                    LI->getType());
   5429     VectorTy = ToVectorTy(ValTy, VF);
   5430 
   5431     unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
   5432     unsigned AS = SI ? SI->getPointerAddressSpace() :
   5433       LI->getPointerAddressSpace();
   5434     Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
   5435     // We add the cost of address computation here instead of with the gep
   5436     // instruction because only here do we know whether the operation is
   5437     // scalarized.
   5438     if (VF == 1)
   5439       return TTI.getAddressComputationCost(VectorTy) +
   5440         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
   5441 
   5442     // For an interleaved access, calculate the total cost of the whole
   5443     // interleave group.
   5444     if (Legal->isAccessInterleaved(I)) {
   5445       auto Group = Legal->getInterleavedAccessGroup(I);
   5446       assert(Group && "Failed to get an interleaved access group.");
   5447 
   5448       // Only calculate the cost once at the insert position.
   5449       if (Group->getInsertPos() != I)
   5450         return 0;
   5451 
   5452       unsigned InterleaveFactor = Group->getFactor();
   5453       Type *WideVecTy =
   5454           VectorType::get(VectorTy->getVectorElementType(),
   5455                           VectorTy->getVectorNumElements() * InterleaveFactor);
   5456 
   5457       // Holds the indices of existing members in an interleaved load group.
   5458       // An interleaved store group doesn't need this, as it doesn't allow gaps.
   5459       SmallVector<unsigned, 4> Indices;
   5460       if (LI) {
   5461         for (unsigned i = 0; i < InterleaveFactor; i++)
   5462           if (Group->getMember(i))
   5463             Indices.push_back(i);
   5464       }
   5465 
   5466       // Calculate the cost of the whole interleaved group.
   5467       unsigned Cost = TTI.getInterleavedMemoryOpCost(
   5468           I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
   5469           Group->getAlignment(), AS);
   5470 
   5471       if (Group->isReverse())
   5472         Cost +=
   5473             Group->getNumMembers() *
   5474             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
   5475 
   5476       // FIXME: The interleaved load group with a huge gap could be even more
   5477       // expensive than scalar operations. Then we could ignore such a group and
   5478       // use scalar operations instead.
   5479       return Cost;
   5480     }
   5481 
   5482     // Scalarized loads/stores.
   5483     int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
   5484     bool Reverse = ConsecutiveStride < 0;
   5485     const DataLayout &DL = I->getModule()->getDataLayout();
   5486     unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
   5487     unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
   5488     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
   5489       bool IsComplexComputation =
   5490         isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
   5491       unsigned Cost = 0;
   5492       // The cost of extracting from the value vector and pointer vector.
   5493       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
   5494       for (unsigned i = 0; i < VF; ++i) {
   5495         //  The cost of extracting the pointer operand.
   5496         Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
   5497         // In case of STORE, the cost of ExtractElement from the vector.
   5498         // In case of LOAD, the cost of InsertElement into the returned
   5499         // vector.
   5500         Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
   5501                                             Instruction::InsertElement,
   5502                                             VectorTy, i);
   5503       }
   5504 
   5505       // The cost of the scalar loads/stores.
   5506       Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
   5507       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
   5508                                        Alignment, AS);
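              // For illustration: at VF = 4, a scalarized load is charged 4 pointer
              // extracts, 4 result inserts, 4 address computations, and 4 scalar loads.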
   5509       return Cost;
   5510     }
   5511 
   5512     // Wide loads/stores.
   5513     unsigned Cost = TTI.getAddressComputationCost(VectorTy);
   5514     if (Legal->isMaskRequired(I))
   5515       Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
   5516                                         AS);
   5517     else
   5518       Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
   5519 
   5520     if (Reverse)
   5521       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
   5522                                   VectorTy, 0);
   5523     return Cost;
   5524   }
   5525   case Instruction::ZExt:
   5526   case Instruction::SExt:
   5527   case Instruction::FPToUI:
   5528   case Instruction::FPToSI:
   5529   case Instruction::FPExt:
   5530   case Instruction::PtrToInt:
   5531   case Instruction::IntToPtr:
   5532   case Instruction::SIToFP:
   5533   case Instruction::UIToFP:
   5534   case Instruction::Trunc:
   5535   case Instruction::FPTrunc:
   5536   case Instruction::BitCast: {
   5537     // We optimize the truncation of induction variables.
   5538     // The cost of these is the same as the scalar operation.
   5539     if (I->getOpcode() == Instruction::Trunc &&
   5540         Legal->isInductionVariable(I->getOperand(0)))
   5541       return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
   5542                                   I->getOperand(0)->getType());
   5543 
   5544     Type *SrcScalarTy = I->getOperand(0)->getType();
   5545     Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
   5546     if (VF > 1 && MinBWs.count(I)) {
   5547       // This cast is going to be shrunk. This may remove the cast or it might
   5548       // turn it into a slightly different cast. For example, if MinBW == 16,
   5549       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
   5550       //
   5551       // Calculate the modified src and dest types.
   5552       Type *MinVecTy = VectorTy;
   5553       if (I->getOpcode() == Instruction::Trunc) {
   5554         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
   5555         VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF),
   5556                                             MinVecTy);
   5557       } else if (I->getOpcode() == Instruction::ZExt ||
   5558                  I->getOpcode() == Instruction::SExt) {
   5559         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
   5560         VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF),
   5561                                              MinVecTy);
   5562       }
   5563     }
   5564 
   5565     return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
   5566   }
   5567   case Instruction::Call: {
   5568     bool NeedToScalarize;
   5569     CallInst *CI = cast<CallInst>(I);
   5570     unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
   5571     if (getIntrinsicIDForCall(CI, TLI))
   5572       return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
   5573     return CallCost;
   5574   }
   5575   default: {
   5576     // We are scalarizing the instruction. Return the cost of the scalar
   5577     // instruction, plus the cost of insert and extract into vector
   5578     // elements, times the vector width.
   5579     unsigned Cost = 0;
   5580 
   5581     if (!RetTy->isVoidTy() && VF != 1) {
   5582       unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
   5583                                                 VectorTy);
   5584       unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
   5585                                                 VectorTy);
   5586 
   5587       // The cost of inserting the results plus extracting each one of the
   5588       // operands.
   5589       Cost += VF * (InsCost + ExtCost * I->getNumOperands());
   5590     }
   5591 
   5592     // The cost of executing VF copies of the scalar instruction. This opcode
   5593     // is unknown. Assume that it is the same as 'mul'.
   5594     Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
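            // For illustration: an unknown two-operand instruction at VF = 4 is charged
            // 4 * (1 insert + 2 extracts) vector-element operations, plus 4 times the
            // cost of a 'mul' on the vector type as a stand-in for the scalarized work.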
   5595     return Cost;
   5596   }
   5597   }// end of switch.
   5598 }
   5599 
   5600 char LoopVectorize::ID = 0;
   5601 static const char lv_name[] = "Loop Vectorization";
   5602 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
   5603 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
   5604 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
   5605 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
   5606 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
   5607 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
   5608 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
   5609 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
   5610 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
   5611 INITIALIZE_PASS_DEPENDENCY(LCSSA)
   5612 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
   5613 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
   5614 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
   5615 INITIALIZE_PASS_DEPENDENCY(DemandedBits)
   5616 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
   5617 
   5618 namespace llvm {
   5619   Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
   5620     return new LoopVectorize(NoUnrolling, AlwaysVectorize);
   5621   }
   5622 }
   5623 
   5624 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
   5625   // Check for a store.
   5626   if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
   5627     return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
   5628 
   5629   // Check for a load.
   5630   if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
   5631     return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
   5632 
   5633   return false;
   5634 }
   5635 
   5636 void LoopVectorizationCostModel::collectValuesToIgnore() {
   5637   // Ignore ephemeral values.
   5638   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
   5639 
   5640   // Ignore type-promoting instructions we identified during reduction
   5641   // detection.
   5642   for (auto &Reduction : *Legal->getReductionVars()) {
   5643     RecurrenceDescriptor &RedDes = Reduction.second;
   5644     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
   5645     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   5646   }
   5647 
   5648   // Ignore induction phis that are only used in GetElementPtr instructions or
   5649   // in an ICmp instruction that exits the loop. Induction variables usually
   5650   // have large types and can have a big impact when estimating register usage.
   5651   // This is for when VF > 1.
   5652   for (auto &Induction : *Legal->getInductionVars()) {
   5653     auto *PN = Induction.first;
   5654     auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
   5655 
   5656     // Check that the PHI is only used by the induction increment (UpdateV) or
   5657     // by GEPs. Then check that UpdateV is only used by a compare instruction or
   5658     // the loop header PHI.
   5659     // FIXME: Need precise def-use analysis to determine if this induction
   5660     // variable will be vectorized.
   5661     if (std::all_of(PN->user_begin(), PN->user_end(),
   5662                     [&](const User *U) -> bool {
   5663                       return U == UpdateV || isa<GetElementPtrInst>(U);
   5664                     }) &&
   5665         std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
   5666                     [&](const User *U) -> bool {
   5667                       return U == PN || isa<ICmpInst>(U);
   5668                     })) {
   5669       VecValuesToIgnore.insert(PN);
   5670       VecValuesToIgnore.insert(UpdateV);
   5671     }
   5672   }
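          // For example, the phi of a canonical induction variable that is used only by
          // its own increment and by address GEPs, where the increment feeds only the
          // exit compare, is excluded from the VF > 1 register estimate.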
   5673 
   5674   // Ignore instructions that will not be vectorized.
   5675   // This is for when VF > 1.
   5676   for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be;
   5677        ++bb) {
   5678     for (auto &Inst : **bb) {
   5679       switch (Inst.getOpcode()) {
   5680       case Instruction::GetElementPtr: {
   5681         // Ignore the GEP if its induction operand is the only loop-variant index,
   5682         // so that it is a consecutive load/store and won't be vectorized as a
   5683         // scatter/gather pattern.
   5684 
   5685         GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
   5686         unsigned NumOperands = Gep->getNumOperands();
   5687         unsigned InductionOperand = getGEPInductionOperand(Gep);
   5688         bool GepToIgnore = true;
   5689 
   5690         // Check that all of the gep indices are uniform except for the
   5691         // induction operand.
   5692         for (unsigned i = 0; i != NumOperands; ++i) {
   5693           if (i != InductionOperand &&
   5694               !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
   5695                                             TheLoop)) {
   5696             GepToIgnore = false;
   5697             break;
   5698           }
   5699         }
   5700 
   5701         if (GepToIgnore)
   5702           VecValuesToIgnore.insert(&Inst);
   5703         break;
   5704       }
   5705       }
   5706     }
   5707   }
   5708 }
   5709 
   5710 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
   5711                                              bool IfPredicateStore) {
   5712   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   5713   // Holds vector parameters or scalars, in case of uniform vals.
   5714   SmallVector<VectorParts, 4> Params;
   5715 
   5716   setDebugLocFromInst(Builder, Instr);
   5717 
   5718   // Find all of the vectorized parameters.
   5719   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
   5720     Value *SrcOp = Instr->getOperand(op);
   5721 
   5722     // If we are accessing the old induction variable, use the new one.
   5723     if (SrcOp == OldInduction) {
   5724       Params.push_back(getVectorValue(SrcOp));
   5725       continue;
   5726     }
   5727 
   5728     // Try using previously calculated values.
   5729     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
   5730 
   5731     // If the src is an instruction that appeared earlier in the basic block
   5732     // then it should already be vectorized.
   5733     if (SrcInst && OrigLoop->contains(SrcInst)) {
   5734       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
   5735       // The parameter is a vector value from earlier.
   5736       Params.push_back(WidenMap.get(SrcInst));
   5737     } else {
   5738       // The parameter is a scalar from outside the loop. Maybe even a constant.
   5739       VectorParts Scalars;
   5740       Scalars.append(UF, SrcOp);
   5741       Params.push_back(Scalars);
   5742     }
   5743   }
   5744 
   5745   assert(Params.size() == Instr->getNumOperands() &&
   5746          "Invalid number of operands");
   5747 
   5748   // Does this instruction return a value?
   5749   bool IsVoidRetTy = Instr->getType()->isVoidTy();
   5750 
   5751   Value *UndefVec = IsVoidRetTy ? nullptr :
   5752                     UndefValue::get(Instr->getType());
   5753   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   5754   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
   5755 
   5756   VectorParts Cond;
   5757   if (IfPredicateStore) {
   5758     assert(Instr->getParent()->getSinglePredecessor() &&
   5759            "Only support single predecessor blocks");
   5760     Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
   5761                           Instr->getParent());
   5762   }
   5763 
   5764   // For each vector unroll 'part':
   5765   for (unsigned Part = 0; Part < UF; ++Part) {
   5766     // For each scalar that we create:
   5767 
   5768     // Start an "if (pred) a[i] = ..." block.
   5769     Value *Cmp = nullptr;
   5770     if (IfPredicateStore) {
   5771       if (Cond[Part]->getType()->isVectorTy())
   5772         Cond[Part] =
   5773             Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
   5774       Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
   5775                                ConstantInt::get(Cond[Part]->getType(), 1));
   5776     }
   5777 
   5778     Instruction *Cloned = Instr->clone();
   5779     if (!IsVoidRetTy)
   5780       Cloned->setName(Instr->getName() + ".cloned");
   5781     // Replace the operands of the cloned instruction with extracted scalars.
   5782     for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
   5783       Value *Op = Params[op][Part];
   5784       Cloned->setOperand(op, Op);
   5785     }
   5786 
   5787     // Place the cloned scalar in the new loop.
   5788     Builder.Insert(Cloned);
   5789 
   5790     // If the original scalar returns a value we need to place it in a vector
   5791     // so that future users will be able to use it.
   5792     if (!IsVoidRetTy)
   5793       VecResults[Part] = Cloned;
   5794 
   5795     // End if-block.
   5796     if (IfPredicateStore)
   5797       PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
   5798                                                 Cmp));
   5799   }
   5800 }
   5801 
   5802 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
   5803   StoreInst *SI = dyn_cast<StoreInst>(Instr);
   5804   bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
   5805 
   5806   return scalarizeInstruction(Instr, IfPredicateStore);
   5807 }
   5808 
   5809 Value *InnerLoopUnroller::reverseVector(Value *Vec) {
   5810   return Vec;
   5811 }
   5812 
   5813 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
   5814   return V;
   5815 }
   5816 
   5817 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
   5818   // When unrolling and the VF is 1, we only need to add a simple scalar.
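          // E.g., with StartIdx = 2 and Step = %s, the result is Val + 2 * %s.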
   5819   Type *ITy = Val->getType();
   5820   assert(!ITy->isVectorTy() && "Val must be a scalar");
   5821   Constant *C = ConstantInt::get(ITy, StartIdx);
   5822   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
   5823 }
   5824