1 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a basic-block vectorization pass. The algorithm was 11 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 12 // et al. It works by looking for chains of pairable operations and then 13 // pairing them. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #define BBV_NAME "bb-vectorize" 18 #define DEBUG_TYPE BBV_NAME 19 #include "llvm/Constants.h" 20 #include "llvm/DerivedTypes.h" 21 #include "llvm/Function.h" 22 #include "llvm/Instructions.h" 23 #include "llvm/IntrinsicInst.h" 24 #include "llvm/Intrinsics.h" 25 #include "llvm/LLVMContext.h" 26 #include "llvm/Pass.h" 27 #include "llvm/Type.h" 28 #include "llvm/ADT/DenseMap.h" 29 #include "llvm/ADT/DenseSet.h" 30 #include "llvm/ADT/SmallVector.h" 31 #include "llvm/ADT/Statistic.h" 32 #include "llvm/ADT/STLExtras.h" 33 #include "llvm/ADT/StringExtras.h" 34 #include "llvm/Analysis/AliasAnalysis.h" 35 #include "llvm/Analysis/AliasSetTracker.h" 36 #include "llvm/Analysis/ScalarEvolution.h" 37 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 38 #include "llvm/Analysis/ValueTracking.h" 39 #include "llvm/Support/CommandLine.h" 40 #include "llvm/Support/Debug.h" 41 #include "llvm/Support/raw_ostream.h" 42 #include "llvm/Support/ValueHandle.h" 43 #include "llvm/Target/TargetData.h" 44 #include "llvm/Transforms/Vectorize.h" 45 #include <algorithm> 46 #include <map> 47 using namespace llvm; 48 49 static cl::opt<unsigned> 50 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 51 cl::desc("The required chain depth for vectorization")); 52 53 static cl::opt<unsigned> 54 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 55 cl::desc("The maximum search distance for instruction pairs")); 56 57 static cl::opt<bool> 58 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 59 cl::desc("Replicating one element to a pair breaks the chain")); 60 61 static cl::opt<unsigned> 62 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 63 cl::desc("The size of the native vector registers")); 64 65 static cl::opt<unsigned> 66 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 67 cl::desc("The maximum number of pairing iterations")); 68 69 static cl::opt<unsigned> 70 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 71 cl::desc("The maximum number of pairable instructions per group")); 72 73 static cl::opt<unsigned> 74 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 75 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 76 " a full cycle check")); 77 78 static cl::opt<bool> 79 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 80 cl::desc("Don't try to vectorize integer values")); 81 82 static cl::opt<bool> 83 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 84 cl::desc("Don't try to vectorize floating-point values")); 85 86 static cl::opt<bool> 87 NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden, 88 cl::desc("Don't try to vectorize pointer values")); 89 90 static cl::opt<bool> 91 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 92 cl::desc("Don't try to vectorize casting (conversion) operations")); 93 94 static cl::opt<bool> 95 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 96 cl::desc("Don't try to vectorize floating-point math intrinsics")); 97 98 static cl::opt<bool> 99 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 100 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 101 102 static cl::opt<bool> 103 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, 104 cl::desc("Don't try to vectorize select instructions")); 105 106 static cl::opt<bool> 107 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, 108 cl::desc("Don't try to vectorize getelementptr instructions")); 109 110 static cl::opt<bool> 111 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 112 cl::desc("Don't try to vectorize loads and stores")); 113 114 static cl::opt<bool> 115 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 116 cl::desc("Only generate aligned loads and stores")); 117 118 static cl::opt<bool> 119 NoMemOpBoost("bb-vectorize-no-mem-op-boost", 120 cl::init(false), cl::Hidden, 121 cl::desc("Don't boost the chain-depth contribution of loads and stores")); 122 123 static cl::opt<bool> 124 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 125 cl::desc("Use a fast instruction dependency analysis")); 126 127 #ifndef NDEBUG 128 static cl::opt<bool> 129 DebugInstructionExamination("bb-vectorize-debug-instruction-examination", 130 cl::init(false), cl::Hidden, 131 cl::desc("When debugging is enabled, output information on the" 132 " instruction-examination process")); 133 static cl::opt<bool> 134 DebugCandidateSelection("bb-vectorize-debug-candidate-selection", 135 cl::init(false), cl::Hidden, 136 cl::desc("When debugging is enabled, output information on the" 137 " candidate-selection process")); 138 static cl::opt<bool> 139 DebugPairSelection("bb-vectorize-debug-pair-selection", 140 cl::init(false), cl::Hidden, 141 cl::desc("When debugging is enabled, output information on the" 142 " pair-selection process")); 143 static cl::opt<bool> 144 DebugCycleCheck("bb-vectorize-debug-cycle-check", 145 cl::init(false), cl::Hidden, 146 cl::desc("When debugging is enabled, output information on the" 147 " cycle-checking process")); 148 #endif 149 150 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 151 152 namespace { 153 struct BBVectorize : public BasicBlockPass { 154 static char ID; // Pass identification, replacement for typeid 155 156 const VectorizeConfig Config; 157 158 BBVectorize(const VectorizeConfig &C = VectorizeConfig()) 159 : BasicBlockPass(ID), Config(C) { 160 initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 161 } 162 163 BBVectorize(Pass *P, const VectorizeConfig &C) 164 : BasicBlockPass(ID), Config(C) { 165 AA = &P->getAnalysis<AliasAnalysis>(); 166 SE = &P->getAnalysis<ScalarEvolution>(); 167 TD = P->getAnalysisIfAvailable<TargetData>(); 168 } 169 170 typedef std::pair<Value *, Value *> ValuePair; 171 typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 172 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 173 typedef std::pair<std::multimap<Value *, Value *>::iterator, 174 std::multimap<Value *, Value *>::iterator> VPIteratorPair; 175 typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, 176 std::multimap<ValuePair, ValuePair>::iterator> 177 VPPIteratorPair; 178 179 AliasAnalysis *AA; 180 ScalarEvolution *SE; 181 TargetData *TD; 182 183 // FIXME: const correct? 184 185 bool vectorizePairs(BasicBlock &BB); 186 187 bool getCandidatePairs(BasicBlock &BB, 188 BasicBlock::iterator &Start, 189 std::multimap<Value *, Value *> &CandidatePairs, 190 std::vector<Value *> &PairableInsts); 191 192 void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, 193 std::vector<Value *> &PairableInsts, 194 std::multimap<ValuePair, ValuePair> &ConnectedPairs); 195 196 void buildDepMap(BasicBlock &BB, 197 std::multimap<Value *, Value *> &CandidatePairs, 198 std::vector<Value *> &PairableInsts, 199 DenseSet<ValuePair> &PairableInstUsers); 200 201 void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, 202 std::vector<Value *> &PairableInsts, 203 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 204 DenseSet<ValuePair> &PairableInstUsers, 205 DenseMap<Value *, Value *>& ChosenPairs); 206 207 void fuseChosenPairs(BasicBlock &BB, 208 std::vector<Value *> &PairableInsts, 209 DenseMap<Value *, Value *>& ChosenPairs); 210 211 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 212 213 bool areInstsCompatible(Instruction *I, Instruction *J, 214 bool IsSimpleLoadStore); 215 216 bool trackUsesOfI(DenseSet<Value *> &Users, 217 AliasSetTracker &WriteSet, Instruction *I, 218 Instruction *J, bool UpdateUsers = true, 219 std::multimap<Value *, Value *> *LoadMoveSet = 0); 220 221 void computePairsConnectedTo( 222 std::multimap<Value *, Value *> &CandidatePairs, 223 std::vector<Value *> &PairableInsts, 224 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 225 ValuePair P); 226 227 bool pairsConflict(ValuePair P, ValuePair Q, 228 DenseSet<ValuePair> &PairableInstUsers, 229 std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); 230 231 bool pairWillFormCycle(ValuePair P, 232 std::multimap<ValuePair, ValuePair> &PairableInstUsers, 233 DenseSet<ValuePair> &CurrentPairs); 234 235 void pruneTreeFor( 236 std::multimap<Value *, Value *> &CandidatePairs, 237 std::vector<Value *> &PairableInsts, 238 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 239 DenseSet<ValuePair> &PairableInstUsers, 240 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 241 DenseMap<Value *, Value *> &ChosenPairs, 242 DenseMap<ValuePair, size_t> &Tree, 243 DenseSet<ValuePair> &PrunedTree, ValuePair J, 244 bool UseCycleCheck); 245 246 void buildInitialTreeFor( 247 std::multimap<Value *, Value *> &CandidatePairs, 248 std::vector<Value *> &PairableInsts, 249 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 250 DenseSet<ValuePair> &PairableInstUsers, 251 DenseMap<Value *, Value *> &ChosenPairs, 252 DenseMap<ValuePair, size_t> &Tree, ValuePair J); 253 254 void findBestTreeFor( 255 std::multimap<Value *, Value *> &CandidatePairs, 256 std::vector<Value *> &PairableInsts, 257 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 258 DenseSet<ValuePair> &PairableInstUsers, 259 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 260 DenseMap<Value *, Value *> &ChosenPairs, 261 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 262 size_t &BestEffSize, VPIteratorPair ChoiceRange, 263 bool UseCycleCheck); 264 265 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 266 Instruction *J, unsigned o, bool &FlipMemInputs); 267 268 void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 269 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, 270 unsigned IdxOffset, std::vector<Constant*> &Mask); 271 272 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 273 Instruction *J); 274 275 Value *getReplacementInput(LLVMContext& Context, Instruction *I, 276 Instruction *J, unsigned o, bool FlipMemInputs); 277 278 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 279 Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, 280 bool &FlipMemInputs); 281 282 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 283 Instruction *J, Instruction *K, 284 Instruction *&InsertionPt, Instruction *&K1, 285 Instruction *&K2, bool &FlipMemInputs); 286 287 void collectPairLoadMoveSet(BasicBlock &BB, 288 DenseMap<Value *, Value *> &ChosenPairs, 289 std::multimap<Value *, Value *> &LoadMoveSet, 290 Instruction *I); 291 292 void collectLoadMoveSet(BasicBlock &BB, 293 std::vector<Value *> &PairableInsts, 294 DenseMap<Value *, Value *> &ChosenPairs, 295 std::multimap<Value *, Value *> &LoadMoveSet); 296 297 bool canMoveUsesOfIAfterJ(BasicBlock &BB, 298 std::multimap<Value *, Value *> &LoadMoveSet, 299 Instruction *I, Instruction *J); 300 301 void moveUsesOfIAfterJ(BasicBlock &BB, 302 std::multimap<Value *, Value *> &LoadMoveSet, 303 Instruction *&InsertionPt, 304 Instruction *I, Instruction *J); 305 306 bool vectorizeBB(BasicBlock &BB) { 307 bool changed = false; 308 // Iterate a sufficient number of times to merge types of size 1 bit, 309 // then 2 bits, then 4, etc. up to half of the target vector width of the 310 // target vector register. 311 for (unsigned v = 2, n = 1; 312 v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); 313 v *= 2, ++n) { 314 DEBUG(dbgs() << "BBV: fusing loop #" << n << 315 " for " << BB.getName() << " in " << 316 BB.getParent()->getName() << "...\n"); 317 if (vectorizePairs(BB)) 318 changed = true; 319 else 320 break; 321 } 322 323 DEBUG(dbgs() << "BBV: done!\n"); 324 return changed; 325 } 326 327 virtual bool runOnBasicBlock(BasicBlock &BB) { 328 AA = &getAnalysis<AliasAnalysis>(); 329 SE = &getAnalysis<ScalarEvolution>(); 330 TD = getAnalysisIfAvailable<TargetData>(); 331 332 return vectorizeBB(BB); 333 } 334 335 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 336 BasicBlockPass::getAnalysisUsage(AU); 337 AU.addRequired<AliasAnalysis>(); 338 AU.addRequired<ScalarEvolution>(); 339 AU.addPreserved<AliasAnalysis>(); 340 AU.addPreserved<ScalarEvolution>(); 341 AU.setPreservesCFG(); 342 } 343 344 // This returns the vector type that holds a pair of the provided type. 345 // If the provided type is already a vector, then its length is doubled. 346 static inline VectorType *getVecTypeForPair(Type *ElemTy) { 347 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 348 unsigned numElem = VTy->getNumElements(); 349 return VectorType::get(ElemTy->getScalarType(), numElem*2); 350 } 351 352 return VectorType::get(ElemTy, 2); 353 } 354 355 // Returns the weight associated with the provided value. A chain of 356 // candidate pairs has a length given by the sum of the weights of its 357 // members (one weight per pair; the weight of each member of the pair 358 // is assumed to be the same). This length is then compared to the 359 // chain-length threshold to determine if a given chain is significant 360 // enough to be vectorized. The length is also used in comparing 361 // candidate chains where longer chains are considered to be better. 362 // Note: when this function returns 0, the resulting instructions are 363 // not actually fused. 364 inline size_t getDepthFactor(Value *V) { 365 // InsertElement and ExtractElement have a depth factor of zero. This is 366 // for two reasons: First, they cannot be usefully fused. Second, because 367 // the pass generates a lot of these, they can confuse the simple metric 368 // used to compare the trees in the next iteration. Thus, giving them a 369 // weight of zero allows the pass to essentially ignore them in 370 // subsequent iterations when looking for vectorization opportunities 371 // while still tracking dependency chains that flow through those 372 // instructions. 373 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 374 return 0; 375 376 // Give a load or store half of the required depth so that load/store 377 // pairs will vectorize. 378 if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 379 return Config.ReqChainDepth/2; 380 381 return 1; 382 } 383 384 // This determines the relative offset of two loads or stores, returning 385 // true if the offset could be determined to be some constant value. 386 // For example, if OffsetInElmts == 1, then J accesses the memory directly 387 // after I; if OffsetInElmts == -1 then I accesses the memory 388 // directly after J. This function assumes that both instructions 389 // have the same type. 390 bool getPairPtrInfo(Instruction *I, Instruction *J, 391 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 392 int64_t &OffsetInElmts) { 393 OffsetInElmts = 0; 394 if (isa<LoadInst>(I)) { 395 IPtr = cast<LoadInst>(I)->getPointerOperand(); 396 JPtr = cast<LoadInst>(J)->getPointerOperand(); 397 IAlignment = cast<LoadInst>(I)->getAlignment(); 398 JAlignment = cast<LoadInst>(J)->getAlignment(); 399 } else { 400 IPtr = cast<StoreInst>(I)->getPointerOperand(); 401 JPtr = cast<StoreInst>(J)->getPointerOperand(); 402 IAlignment = cast<StoreInst>(I)->getAlignment(); 403 JAlignment = cast<StoreInst>(J)->getAlignment(); 404 } 405 406 const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 407 const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 408 409 // If this is a trivial offset, then we'll get something like 410 // 1*sizeof(type). With target data, which we need anyway, this will get 411 // constant folded into a number. 412 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 413 if (const SCEVConstant *ConstOffSCEV = 414 dyn_cast<SCEVConstant>(OffsetSCEV)) { 415 ConstantInt *IntOff = ConstOffSCEV->getValue(); 416 int64_t Offset = IntOff->getSExtValue(); 417 418 Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); 419 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); 420 421 assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); 422 423 OffsetInElmts = Offset/VTyTSS; 424 return (abs64(Offset) % VTyTSS) == 0; 425 } 426 427 return false; 428 } 429 430 // Returns true if the provided CallInst represents an intrinsic that can 431 // be vectorized. 432 bool isVectorizableIntrinsic(CallInst* I) { 433 Function *F = I->getCalledFunction(); 434 if (!F) return false; 435 436 unsigned IID = F->getIntrinsicID(); 437 if (!IID) return false; 438 439 switch(IID) { 440 default: 441 return false; 442 case Intrinsic::sqrt: 443 case Intrinsic::powi: 444 case Intrinsic::sin: 445 case Intrinsic::cos: 446 case Intrinsic::log: 447 case Intrinsic::log2: 448 case Intrinsic::log10: 449 case Intrinsic::exp: 450 case Intrinsic::exp2: 451 case Intrinsic::pow: 452 return Config.VectorizeMath; 453 case Intrinsic::fma: 454 return Config.VectorizeFMA; 455 } 456 } 457 458 // Returns true if J is the second element in some pair referenced by 459 // some multimap pair iterator pair. 460 template <typename V> 461 bool isSecondInIteratorPair(V J, std::pair< 462 typename std::multimap<V, V>::iterator, 463 typename std::multimap<V, V>::iterator> PairRange) { 464 for (typename std::multimap<V, V>::iterator K = PairRange.first; 465 K != PairRange.second; ++K) 466 if (K->second == J) return true; 467 468 return false; 469 } 470 }; 471 472 // This function implements one vectorization iteration on the provided 473 // basic block. It returns true if the block is changed. 474 bool BBVectorize::vectorizePairs(BasicBlock &BB) { 475 bool ShouldContinue; 476 BasicBlock::iterator Start = BB.getFirstInsertionPt(); 477 478 std::vector<Value *> AllPairableInsts; 479 DenseMap<Value *, Value *> AllChosenPairs; 480 481 do { 482 std::vector<Value *> PairableInsts; 483 std::multimap<Value *, Value *> CandidatePairs; 484 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 485 PairableInsts); 486 if (PairableInsts.empty()) continue; 487 488 // Now we have a map of all of the pairable instructions and we need to 489 // select the best possible pairing. A good pairing is one such that the 490 // users of the pair are also paired. This defines a (directed) forest 491 // over the pairs such that two pairs are connected iff the second pair 492 // uses the first. 493 494 // Note that it only matters that both members of the second pair use some 495 // element of the first pair (to allow for splatting). 496 497 std::multimap<ValuePair, ValuePair> ConnectedPairs; 498 computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); 499 if (ConnectedPairs.empty()) continue; 500 501 // Build the pairable-instruction dependency map 502 DenseSet<ValuePair> PairableInstUsers; 503 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 504 505 // There is now a graph of the connected pairs. For each variable, pick 506 // the pairing with the largest tree meeting the depth requirement on at 507 // least one branch. Then select all pairings that are part of that tree 508 // and remove them from the list of available pairings and pairable 509 // variables. 510 511 DenseMap<Value *, Value *> ChosenPairs; 512 choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, 513 PairableInstUsers, ChosenPairs); 514 515 if (ChosenPairs.empty()) continue; 516 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 517 PairableInsts.end()); 518 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 519 } while (ShouldContinue); 520 521 if (AllChosenPairs.empty()) return false; 522 NumFusedOps += AllChosenPairs.size(); 523 524 // A set of pairs has now been selected. It is now necessary to replace the 525 // paired instructions with vector instructions. For this procedure each 526 // operand must be replaced with a vector operand. This vector is formed 527 // by using build_vector on the old operands. The replaced values are then 528 // replaced with a vector_extract on the result. Subsequent optimization 529 // passes should coalesce the build/extract combinations. 530 531 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); 532 return true; 533 } 534 535 // This function returns true if the provided instruction is capable of being 536 // fused into a vector instruction. This determination is based only on the 537 // type and other attributes of the instruction. 538 bool BBVectorize::isInstVectorizable(Instruction *I, 539 bool &IsSimpleLoadStore) { 540 IsSimpleLoadStore = false; 541 542 if (CallInst *C = dyn_cast<CallInst>(I)) { 543 if (!isVectorizableIntrinsic(C)) 544 return false; 545 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 546 // Vectorize simple loads if possbile: 547 IsSimpleLoadStore = L->isSimple(); 548 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 549 return false; 550 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 551 // Vectorize simple stores if possbile: 552 IsSimpleLoadStore = S->isSimple(); 553 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 554 return false; 555 } else if (CastInst *C = dyn_cast<CastInst>(I)) { 556 // We can vectorize casts, but not casts of pointer types, etc. 557 if (!Config.VectorizeCasts) 558 return false; 559 560 Type *SrcTy = C->getSrcTy(); 561 if (!SrcTy->isSingleValueType()) 562 return false; 563 564 Type *DestTy = C->getDestTy(); 565 if (!DestTy->isSingleValueType()) 566 return false; 567 } else if (isa<SelectInst>(I)) { 568 if (!Config.VectorizeSelect) 569 return false; 570 } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { 571 if (!Config.VectorizeGEP) 572 return false; 573 574 // Currently, vector GEPs exist only with one index. 575 if (G->getNumIndices() != 1) 576 return false; 577 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 578 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 579 return false; 580 } 581 582 // We can't vectorize memory operations without target data 583 if (TD == 0 && IsSimpleLoadStore) 584 return false; 585 586 Type *T1, *T2; 587 if (isa<StoreInst>(I)) { 588 // For stores, it is the value type, not the pointer type that matters 589 // because the value is what will come from a vector register. 590 591 Value *IVal = cast<StoreInst>(I)->getValueOperand(); 592 T1 = IVal->getType(); 593 } else { 594 T1 = I->getType(); 595 } 596 597 if (I->isCast()) 598 T2 = cast<CastInst>(I)->getSrcTy(); 599 else 600 T2 = T1; 601 602 // Not every type can be vectorized... 603 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 604 !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 605 return false; 606 607 if (!Config.VectorizeInts 608 && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) 609 return false; 610 611 if (!Config.VectorizeFloats 612 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 613 return false; 614 615 if ((!Config.VectorizePointers || TD == 0) && 616 (T1->getScalarType()->isPointerTy() || 617 T2->getScalarType()->isPointerTy())) 618 return false; 619 620 if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || 621 T2->getPrimitiveSizeInBits() > Config.VectorBits/2) 622 return false; 623 624 return true; 625 } 626 627 // This function returns true if the two provided instructions are compatible 628 // (meaning that they can be fused into a vector instruction). This assumes 629 // that I has already been determined to be vectorizable and that J is not 630 // in the use tree of I. 631 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 632 bool IsSimpleLoadStore) { 633 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 634 " <-> " << *J << "\n"); 635 636 // Loads and stores can be merged if they have different alignments, 637 // but are otherwise the same. 638 LoadInst *LI, *LJ; 639 StoreInst *SI, *SJ; 640 if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) { 641 if (I->getType() != J->getType()) 642 return false; 643 644 if (LI->getPointerOperand()->getType() != 645 LJ->getPointerOperand()->getType() || 646 LI->isVolatile() != LJ->isVolatile() || 647 LI->getOrdering() != LJ->getOrdering() || 648 LI->getSynchScope() != LJ->getSynchScope()) 649 return false; 650 } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { 651 if (SI->getValueOperand()->getType() != 652 SJ->getValueOperand()->getType() || 653 SI->getPointerOperand()->getType() != 654 SJ->getPointerOperand()->getType() || 655 SI->isVolatile() != SJ->isVolatile() || 656 SI->getOrdering() != SJ->getOrdering() || 657 SI->getSynchScope() != SJ->getSynchScope()) 658 return false; 659 } else if (!J->isSameOperationAs(I)) { 660 return false; 661 } 662 // FIXME: handle addsub-type operations! 663 664 if (IsSimpleLoadStore) { 665 Value *IPtr, *JPtr; 666 unsigned IAlignment, JAlignment; 667 int64_t OffsetInElmts = 0; 668 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 669 OffsetInElmts) && abs64(OffsetInElmts) == 1) { 670 if (Config.AlignedOnly) { 671 Type *aType = isa<StoreInst>(I) ? 672 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 673 // An aligned load or store is possible only if the instruction 674 // with the lower offset has an alignment suitable for the 675 // vector type. 676 677 unsigned BottomAlignment = IAlignment; 678 if (OffsetInElmts < 0) BottomAlignment = JAlignment; 679 680 Type *VType = getVecTypeForPair(aType); 681 unsigned VecAlignment = TD->getPrefTypeAlignment(VType); 682 if (BottomAlignment < VecAlignment) 683 return false; 684 } 685 } else { 686 return false; 687 } 688 } else if (isa<ShuffleVectorInst>(I)) { 689 // Only merge two shuffles if they're both constant 690 return isa<Constant>(I->getOperand(2)) && 691 isa<Constant>(J->getOperand(2)); 692 // FIXME: We may want to vectorize non-constant shuffles also. 693 } 694 695 // The powi intrinsic is special because only the first argument is 696 // vectorized, the second arguments must be equal. 697 CallInst *CI = dyn_cast<CallInst>(I); 698 Function *FI; 699 if (CI && (FI = CI->getCalledFunction()) && 700 FI->getIntrinsicID() == Intrinsic::powi) { 701 702 Value *A1I = CI->getArgOperand(1), 703 *A1J = cast<CallInst>(J)->getArgOperand(1); 704 const SCEV *A1ISCEV = SE->getSCEV(A1I), 705 *A1JSCEV = SE->getSCEV(A1J); 706 return (A1ISCEV == A1JSCEV); 707 } 708 709 return true; 710 } 711 712 // Figure out whether or not J uses I and update the users and write-set 713 // structures associated with I. Specifically, Users represents the set of 714 // instructions that depend on I. WriteSet represents the set 715 // of memory locations that are dependent on I. If UpdateUsers is true, 716 // and J uses I, then Users is updated to contain J and WriteSet is updated 717 // to contain any memory locations to which J writes. The function returns 718 // true if J uses I. By default, alias analysis is used to determine 719 // whether J reads from memory that overlaps with a location in WriteSet. 720 // If LoadMoveSet is not null, then it is a previously-computed multimap 721 // where the key is the memory-based user instruction and the value is 722 // the instruction to be compared with I. So, if LoadMoveSet is provided, 723 // then the alias analysis is not used. This is necessary because this 724 // function is called during the process of moving instructions during 725 // vectorization and the results of the alias analysis are not stable during 726 // that process. 727 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 728 AliasSetTracker &WriteSet, Instruction *I, 729 Instruction *J, bool UpdateUsers, 730 std::multimap<Value *, Value *> *LoadMoveSet) { 731 bool UsesI = false; 732 733 // This instruction may already be marked as a user due, for example, to 734 // being a member of a selected pair. 735 if (Users.count(J)) 736 UsesI = true; 737 738 if (!UsesI) 739 for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 740 JU != JE; ++JU) { 741 Value *V = *JU; 742 if (I == V || Users.count(V)) { 743 UsesI = true; 744 break; 745 } 746 } 747 if (!UsesI && J->mayReadFromMemory()) { 748 if (LoadMoveSet) { 749 VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); 750 UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); 751 } else { 752 for (AliasSetTracker::iterator W = WriteSet.begin(), 753 WE = WriteSet.end(); W != WE; ++W) { 754 if (W->aliasesUnknownInst(J, *AA)) { 755 UsesI = true; 756 break; 757 } 758 } 759 } 760 } 761 762 if (UsesI && UpdateUsers) { 763 if (J->mayWriteToMemory()) WriteSet.add(J); 764 Users.insert(J); 765 } 766 767 return UsesI; 768 } 769 770 // This function iterates over all instruction pairs in the provided 771 // basic block and collects all candidate pairs for vectorization. 772 bool BBVectorize::getCandidatePairs(BasicBlock &BB, 773 BasicBlock::iterator &Start, 774 std::multimap<Value *, Value *> &CandidatePairs, 775 std::vector<Value *> &PairableInsts) { 776 BasicBlock::iterator E = BB.end(); 777 if (Start == E) return false; 778 779 bool ShouldContinue = false, IAfterStart = false; 780 for (BasicBlock::iterator I = Start++; I != E; ++I) { 781 if (I == Start) IAfterStart = true; 782 783 bool IsSimpleLoadStore; 784 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; 785 786 // Look for an instruction with which to pair instruction *I... 787 DenseSet<Value *> Users; 788 AliasSetTracker WriteSet(*AA); 789 bool JAfterStart = IAfterStart; 790 BasicBlock::iterator J = llvm::next(I); 791 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { 792 if (J == Start) JAfterStart = true; 793 794 // Determine if J uses I, if so, exit the loop. 795 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); 796 if (Config.FastDep) { 797 // Note: For this heuristic to be effective, independent operations 798 // must tend to be intermixed. This is likely to be true from some 799 // kinds of grouped loop unrolling (but not the generic LLVM pass), 800 // but otherwise may require some kind of reordering pass. 801 802 // When using fast dependency analysis, 803 // stop searching after first use: 804 if (UsesI) break; 805 } else { 806 if (UsesI) continue; 807 } 808 809 // J does not use I, and comes before the first use of I, so it can be 810 // merged with I if the instructions are compatible. 811 if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; 812 813 // J is a candidate for merging with I. 814 if (!PairableInsts.size() || 815 PairableInsts[PairableInsts.size()-1] != I) { 816 PairableInsts.push_back(I); 817 } 818 819 CandidatePairs.insert(ValuePair(I, J)); 820 821 // The next call to this function must start after the last instruction 822 // selected during this invocation. 823 if (JAfterStart) { 824 Start = llvm::next(J); 825 IAfterStart = JAfterStart = false; 826 } 827 828 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 829 << *I << " <-> " << *J << "\n"); 830 831 // If we have already found too many pairs, break here and this function 832 // will be called again starting after the last instruction selected 833 // during this invocation. 834 if (PairableInsts.size() >= Config.MaxInsts) { 835 ShouldContinue = true; 836 break; 837 } 838 } 839 840 if (ShouldContinue) 841 break; 842 } 843 844 DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 845 << " instructions with candidate pairs\n"); 846 847 return ShouldContinue; 848 } 849 850 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 851 // it looks for pairs such that both members have an input which is an 852 // output of PI or PJ. 853 void BBVectorize::computePairsConnectedTo( 854 std::multimap<Value *, Value *> &CandidatePairs, 855 std::vector<Value *> &PairableInsts, 856 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 857 ValuePair P) { 858 StoreInst *SI, *SJ; 859 860 // For each possible pairing for this variable, look at the uses of 861 // the first value... 862 for (Value::use_iterator I = P.first->use_begin(), 863 E = P.first->use_end(); I != E; ++I) { 864 if (isa<LoadInst>(*I)) { 865 // A pair cannot be connected to a load because the load only takes one 866 // operand (the address) and it is a scalar even after vectorization. 867 continue; 868 } else if ((SI = dyn_cast<StoreInst>(*I)) && 869 P.first == SI->getPointerOperand()) { 870 // Similarly, a pair cannot be connected to a store through its 871 // pointer operand. 872 continue; 873 } 874 875 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 876 877 // For each use of the first variable, look for uses of the second 878 // variable... 879 for (Value::use_iterator J = P.second->use_begin(), 880 E2 = P.second->use_end(); J != E2; ++J) { 881 if ((SJ = dyn_cast<StoreInst>(*J)) && 882 P.second == SJ->getPointerOperand()) 883 continue; 884 885 VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); 886 887 // Look for <I, J>: 888 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 889 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 890 891 // Look for <J, I>: 892 if (isSecondInIteratorPair<Value*>(*I, JPairRange)) 893 ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); 894 } 895 896 if (Config.SplatBreaksChain) continue; 897 // Look for cases where just the first value in the pair is used by 898 // both members of another pair (splatting). 899 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { 900 if ((SJ = dyn_cast<StoreInst>(*J)) && 901 P.first == SJ->getPointerOperand()) 902 continue; 903 904 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 905 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 906 } 907 } 908 909 if (Config.SplatBreaksChain) return; 910 // Look for cases where just the second value in the pair is used by 911 // both members of another pair (splatting). 912 for (Value::use_iterator I = P.second->use_begin(), 913 E = P.second->use_end(); I != E; ++I) { 914 if (isa<LoadInst>(*I)) 915 continue; 916 else if ((SI = dyn_cast<StoreInst>(*I)) && 917 P.second == SI->getPointerOperand()) 918 continue; 919 920 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); 921 922 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { 923 if ((SJ = dyn_cast<StoreInst>(*J)) && 924 P.second == SJ->getPointerOperand()) 925 continue; 926 927 if (isSecondInIteratorPair<Value*>(*J, IPairRange)) 928 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); 929 } 930 } 931 } 932 933 // This function figures out which pairs are connected. Two pairs are 934 // connected if some output of the first pair forms an input to both members 935 // of the second pair. 936 void BBVectorize::computeConnectedPairs( 937 std::multimap<Value *, Value *> &CandidatePairs, 938 std::vector<Value *> &PairableInsts, 939 std::multimap<ValuePair, ValuePair> &ConnectedPairs) { 940 941 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 942 PE = PairableInsts.end(); PI != PE; ++PI) { 943 VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); 944 945 for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; 946 P != choiceRange.second; ++P) 947 computePairsConnectedTo(CandidatePairs, PairableInsts, 948 ConnectedPairs, *P); 949 } 950 951 DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() 952 << " pair connections.\n"); 953 } 954 955 // This function builds a set of use tuples such that <A, B> is in the set 956 // if B is in the use tree of A. If B is in the use tree of A, then B 957 // depends on the output of A. 958 void BBVectorize::buildDepMap( 959 BasicBlock &BB, 960 std::multimap<Value *, Value *> &CandidatePairs, 961 std::vector<Value *> &PairableInsts, 962 DenseSet<ValuePair> &PairableInstUsers) { 963 DenseSet<Value *> IsInPair; 964 for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), 965 E = CandidatePairs.end(); C != E; ++C) { 966 IsInPair.insert(C->first); 967 IsInPair.insert(C->second); 968 } 969 970 // Iterate through the basic block, recording all Users of each 971 // pairable instruction. 972 973 BasicBlock::iterator E = BB.end(); 974 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 975 if (IsInPair.find(I) == IsInPair.end()) continue; 976 977 DenseSet<Value *> Users; 978 AliasSetTracker WriteSet(*AA); 979 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) 980 (void) trackUsesOfI(Users, WriteSet, I, J); 981 982 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 983 U != E; ++U) 984 PairableInstUsers.insert(ValuePair(I, *U)); 985 } 986 } 987 988 // Returns true if an input to pair P is an output of pair Q and also an 989 // input of pair Q is an output of pair P. If this is the case, then these 990 // two pairs cannot be simultaneously fused. 991 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 992 DenseSet<ValuePair> &PairableInstUsers, 993 std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { 994 // Two pairs are in conflict if they are mutual Users of eachother. 995 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 996 PairableInstUsers.count(ValuePair(P.first, Q.second)) || 997 PairableInstUsers.count(ValuePair(P.second, Q.first)) || 998 PairableInstUsers.count(ValuePair(P.second, Q.second)); 999 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 1000 PairableInstUsers.count(ValuePair(Q.first, P.second)) || 1001 PairableInstUsers.count(ValuePair(Q.second, P.first)) || 1002 PairableInstUsers.count(ValuePair(Q.second, P.second)); 1003 if (PairableInstUserMap) { 1004 // FIXME: The expensive part of the cycle check is not so much the cycle 1005 // check itself but this edge insertion procedure. This needs some 1006 // profiling and probably a different data structure (same is true of 1007 // most uses of std::multimap). 1008 if (PUsesQ) { 1009 VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); 1010 if (!isSecondInIteratorPair(P, QPairRange)) 1011 PairableInstUserMap->insert(VPPair(Q, P)); 1012 } 1013 if (QUsesP) { 1014 VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); 1015 if (!isSecondInIteratorPair(Q, PPairRange)) 1016 PairableInstUserMap->insert(VPPair(P, Q)); 1017 } 1018 } 1019 1020 return (QUsesP && PUsesQ); 1021 } 1022 1023 // This function walks the use graph of current pairs to see if, starting 1024 // from P, the walk returns to P. 1025 bool BBVectorize::pairWillFormCycle(ValuePair P, 1026 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1027 DenseSet<ValuePair> &CurrentPairs) { 1028 DEBUG(if (DebugCycleCheck) 1029 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 1030 << *P.second << "\n"); 1031 // A lookup table of visisted pairs is kept because the PairableInstUserMap 1032 // contains non-direct associations. 1033 DenseSet<ValuePair> Visited; 1034 SmallVector<ValuePair, 32> Q; 1035 // General depth-first post-order traversal: 1036 Q.push_back(P); 1037 do { 1038 ValuePair QTop = Q.pop_back_val(); 1039 Visited.insert(QTop); 1040 1041 DEBUG(if (DebugCycleCheck) 1042 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 1043 << *QTop.second << "\n"); 1044 VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); 1045 for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first; 1046 C != QPairRange.second; ++C) { 1047 if (C->second == P) { 1048 DEBUG(dbgs() 1049 << "BBV: rejected to prevent non-trivial cycle formation: " 1050 << *C->first.first << " <-> " << *C->first.second << "\n"); 1051 return true; 1052 } 1053 1054 if (CurrentPairs.count(C->second) && !Visited.count(C->second)) 1055 Q.push_back(C->second); 1056 } 1057 } while (!Q.empty()); 1058 1059 return false; 1060 } 1061 1062 // This function builds the initial tree of connected pairs with the 1063 // pair J at the root. 1064 void BBVectorize::buildInitialTreeFor( 1065 std::multimap<Value *, Value *> &CandidatePairs, 1066 std::vector<Value *> &PairableInsts, 1067 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1068 DenseSet<ValuePair> &PairableInstUsers, 1069 DenseMap<Value *, Value *> &ChosenPairs, 1070 DenseMap<ValuePair, size_t> &Tree, ValuePair J) { 1071 // Each of these pairs is viewed as the root node of a Tree. The Tree 1072 // is then walked (depth-first). As this happens, we keep track of 1073 // the pairs that compose the Tree and the maximum depth of the Tree. 1074 SmallVector<ValuePairWithDepth, 32> Q; 1075 // General depth-first post-order traversal: 1076 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1077 do { 1078 ValuePairWithDepth QTop = Q.back(); 1079 1080 // Push each child onto the queue: 1081 bool MoreChildren = false; 1082 size_t MaxChildDepth = QTop.second; 1083 VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); 1084 for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; 1085 k != qtRange.second; ++k) { 1086 // Make sure that this child pair is still a candidate: 1087 bool IsStillCand = false; 1088 VPIteratorPair checkRange = 1089 CandidatePairs.equal_range(k->second.first); 1090 for (std::multimap<Value *, Value *>::iterator m = checkRange.first; 1091 m != checkRange.second; ++m) { 1092 if (m->second == k->second.second) { 1093 IsStillCand = true; 1094 break; 1095 } 1096 } 1097 1098 if (IsStillCand) { 1099 DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); 1100 if (C == Tree.end()) { 1101 size_t d = getDepthFactor(k->second.first); 1102 Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); 1103 MoreChildren = true; 1104 } else { 1105 MaxChildDepth = std::max(MaxChildDepth, C->second); 1106 } 1107 } 1108 } 1109 1110 if (!MoreChildren) { 1111 // Record the current pair as part of the Tree: 1112 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 1113 Q.pop_back(); 1114 } 1115 } while (!Q.empty()); 1116 } 1117 1118 // Given some initial tree, prune it by removing conflicting pairs (pairs 1119 // that cannot be simultaneously chosen for vectorization). 1120 void BBVectorize::pruneTreeFor( 1121 std::multimap<Value *, Value *> &CandidatePairs, 1122 std::vector<Value *> &PairableInsts, 1123 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1124 DenseSet<ValuePair> &PairableInstUsers, 1125 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1126 DenseMap<Value *, Value *> &ChosenPairs, 1127 DenseMap<ValuePair, size_t> &Tree, 1128 DenseSet<ValuePair> &PrunedTree, ValuePair J, 1129 bool UseCycleCheck) { 1130 SmallVector<ValuePairWithDepth, 32> Q; 1131 // General depth-first post-order traversal: 1132 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1133 do { 1134 ValuePairWithDepth QTop = Q.pop_back_val(); 1135 PrunedTree.insert(QTop.first); 1136 1137 // Visit each child, pruning as necessary... 1138 DenseMap<ValuePair, size_t> BestChildren; 1139 VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); 1140 for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; 1141 K != QTopRange.second; ++K) { 1142 DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second); 1143 if (C == Tree.end()) continue; 1144 1145 // This child is in the Tree, now we need to make sure it is the 1146 // best of any conflicting children. There could be multiple 1147 // conflicting children, so first, determine if we're keeping 1148 // this child, then delete conflicting children as necessary. 1149 1150 // It is also necessary to guard against pairing-induced 1151 // dependencies. Consider instructions a .. x .. y .. b 1152 // such that (a,b) are to be fused and (x,y) are to be fused 1153 // but a is an input to x and b is an output from y. This 1154 // means that y cannot be moved after b but x must be moved 1155 // after b for (a,b) to be fused. In other words, after 1156 // fusing (a,b) we have y .. a/b .. x where y is an input 1157 // to a/b and x is an output to a/b: x and y can no longer 1158 // be legally fused. To prevent this condition, we must 1159 // make sure that a child pair added to the Tree is not 1160 // both an input and output of an already-selected pair. 1161 1162 // Pairing-induced dependencies can also form from more complicated 1163 // cycles. The pair vs. pair conflicts are easy to check, and so 1164 // that is done explicitly for "fast rejection", and because for 1165 // child vs. child conflicts, we may prefer to keep the current 1166 // pair in preference to the already-selected child. 1167 DenseSet<ValuePair> CurrentPairs; 1168 1169 bool CanAdd = true; 1170 for (DenseMap<ValuePair, size_t>::iterator C2 1171 = BestChildren.begin(), E2 = BestChildren.end(); 1172 C2 != E2; ++C2) { 1173 if (C2->first.first == C->first.first || 1174 C2->first.first == C->first.second || 1175 C2->first.second == C->first.first || 1176 C2->first.second == C->first.second || 1177 pairsConflict(C2->first, C->first, PairableInstUsers, 1178 UseCycleCheck ? &PairableInstUserMap : 0)) { 1179 if (C2->second >= C->second) { 1180 CanAdd = false; 1181 break; 1182 } 1183 1184 CurrentPairs.insert(C2->first); 1185 } 1186 } 1187 if (!CanAdd) continue; 1188 1189 // Even worse, this child could conflict with another node already 1190 // selected for the Tree. If that is the case, ignore this child. 1191 for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), 1192 E2 = PrunedTree.end(); T != E2; ++T) { 1193 if (T->first == C->first.first || 1194 T->first == C->first.second || 1195 T->second == C->first.first || 1196 T->second == C->first.second || 1197 pairsConflict(*T, C->first, PairableInstUsers, 1198 UseCycleCheck ? &PairableInstUserMap : 0)) { 1199 CanAdd = false; 1200 break; 1201 } 1202 1203 CurrentPairs.insert(*T); 1204 } 1205 if (!CanAdd) continue; 1206 1207 // And check the queue too... 1208 for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(), 1209 E2 = Q.end(); C2 != E2; ++C2) { 1210 if (C2->first.first == C->first.first || 1211 C2->first.first == C->first.second || 1212 C2->first.second == C->first.first || 1213 C2->first.second == C->first.second || 1214 pairsConflict(C2->first, C->first, PairableInstUsers, 1215 UseCycleCheck ? &PairableInstUserMap : 0)) { 1216 CanAdd = false; 1217 break; 1218 } 1219 1220 CurrentPairs.insert(C2->first); 1221 } 1222 if (!CanAdd) continue; 1223 1224 // Last but not least, check for a conflict with any of the 1225 // already-chosen pairs. 1226 for (DenseMap<Value *, Value *>::iterator C2 = 1227 ChosenPairs.begin(), E2 = ChosenPairs.end(); 1228 C2 != E2; ++C2) { 1229 if (pairsConflict(*C2, C->first, PairableInstUsers, 1230 UseCycleCheck ? &PairableInstUserMap : 0)) { 1231 CanAdd = false; 1232 break; 1233 } 1234 1235 CurrentPairs.insert(*C2); 1236 } 1237 if (!CanAdd) continue; 1238 1239 // To check for non-trivial cycles formed by the addition of the 1240 // current pair we've formed a list of all relevant pairs, now use a 1241 // graph walk to check for a cycle. We start from the current pair and 1242 // walk the use tree to see if we again reach the current pair. If we 1243 // do, then the current pair is rejected. 1244 1245 // FIXME: It may be more efficient to use a topological-ordering 1246 // algorithm to improve the cycle check. This should be investigated. 1247 if (UseCycleCheck && 1248 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 1249 continue; 1250 1251 // This child can be added, but we may have chosen it in preference 1252 // to an already-selected child. Check for this here, and if a 1253 // conflict is found, then remove the previously-selected child 1254 // before adding this one in its place. 1255 for (DenseMap<ValuePair, size_t>::iterator C2 1256 = BestChildren.begin(); C2 != BestChildren.end();) { 1257 if (C2->first.first == C->first.first || 1258 C2->first.first == C->first.second || 1259 C2->first.second == C->first.first || 1260 C2->first.second == C->first.second || 1261 pairsConflict(C2->first, C->first, PairableInstUsers)) 1262 BestChildren.erase(C2++); 1263 else 1264 ++C2; 1265 } 1266 1267 BestChildren.insert(ValuePairWithDepth(C->first, C->second)); 1268 } 1269 1270 for (DenseMap<ValuePair, size_t>::iterator C 1271 = BestChildren.begin(), E2 = BestChildren.end(); 1272 C != E2; ++C) { 1273 size_t DepthF = getDepthFactor(C->first.first); 1274 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 1275 } 1276 } while (!Q.empty()); 1277 } 1278 1279 // This function finds the best tree of mututally-compatible connected 1280 // pairs, given the choice of root pairs as an iterator range. 1281 void BBVectorize::findBestTreeFor( 1282 std::multimap<Value *, Value *> &CandidatePairs, 1283 std::vector<Value *> &PairableInsts, 1284 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1285 DenseSet<ValuePair> &PairableInstUsers, 1286 std::multimap<ValuePair, ValuePair> &PairableInstUserMap, 1287 DenseMap<Value *, Value *> &ChosenPairs, 1288 DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, 1289 size_t &BestEffSize, VPIteratorPair ChoiceRange, 1290 bool UseCycleCheck) { 1291 for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; 1292 J != ChoiceRange.second; ++J) { 1293 1294 // Before going any further, make sure that this pair does not 1295 // conflict with any already-selected pairs (see comment below 1296 // near the Tree pruning for more details). 1297 DenseSet<ValuePair> ChosenPairSet; 1298 bool DoesConflict = false; 1299 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 1300 E = ChosenPairs.end(); C != E; ++C) { 1301 if (pairsConflict(*C, *J, PairableInstUsers, 1302 UseCycleCheck ? &PairableInstUserMap : 0)) { 1303 DoesConflict = true; 1304 break; 1305 } 1306 1307 ChosenPairSet.insert(*C); 1308 } 1309 if (DoesConflict) continue; 1310 1311 if (UseCycleCheck && 1312 pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet)) 1313 continue; 1314 1315 DenseMap<ValuePair, size_t> Tree; 1316 buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1317 PairableInstUsers, ChosenPairs, Tree, *J); 1318 1319 // Because we'll keep the child with the largest depth, the largest 1320 // depth is still the same in the unpruned Tree. 1321 size_t MaxDepth = Tree.lookup(*J); 1322 1323 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" 1324 << *J->first << " <-> " << *J->second << "} of depth " << 1325 MaxDepth << " and size " << Tree.size() << "\n"); 1326 1327 // At this point the Tree has been constructed, but, may contain 1328 // contradictory children (meaning that different children of 1329 // some tree node may be attempting to fuse the same instruction). 1330 // So now we walk the tree again, in the case of a conflict, 1331 // keep only the child with the largest depth. To break a tie, 1332 // favor the first child. 1333 1334 DenseSet<ValuePair> PrunedTree; 1335 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1336 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, 1337 PrunedTree, *J, UseCycleCheck); 1338 1339 size_t EffSize = 0; 1340 for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), 1341 E = PrunedTree.end(); S != E; ++S) 1342 EffSize += getDepthFactor(S->first); 1343 1344 DEBUG(if (DebugPairSelection) 1345 dbgs() << "BBV: found pruned Tree for pair {" 1346 << *J->first << " <-> " << *J->second << "} of depth " << 1347 MaxDepth << " and size " << PrunedTree.size() << 1348 " (effective size: " << EffSize << ")\n"); 1349 if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { 1350 BestMaxDepth = MaxDepth; 1351 BestEffSize = EffSize; 1352 BestTree = PrunedTree; 1353 } 1354 } 1355 } 1356 1357 // Given the list of candidate pairs, this function selects those 1358 // that will be fused into vector instructions. 1359 void BBVectorize::choosePairs( 1360 std::multimap<Value *, Value *> &CandidatePairs, 1361 std::vector<Value *> &PairableInsts, 1362 std::multimap<ValuePair, ValuePair> &ConnectedPairs, 1363 DenseSet<ValuePair> &PairableInstUsers, 1364 DenseMap<Value *, Value *>& ChosenPairs) { 1365 bool UseCycleCheck = 1366 CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; 1367 std::multimap<ValuePair, ValuePair> PairableInstUserMap; 1368 for (std::vector<Value *>::iterator I = PairableInsts.begin(), 1369 E = PairableInsts.end(); I != E; ++I) { 1370 // The number of possible pairings for this variable: 1371 size_t NumChoices = CandidatePairs.count(*I); 1372 if (!NumChoices) continue; 1373 1374 VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); 1375 1376 // The best pair to choose and its tree: 1377 size_t BestMaxDepth = 0, BestEffSize = 0; 1378 DenseSet<ValuePair> BestTree; 1379 findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, 1380 PairableInstUsers, PairableInstUserMap, ChosenPairs, 1381 BestTree, BestMaxDepth, BestEffSize, ChoiceRange, 1382 UseCycleCheck); 1383 1384 // A tree has been chosen (or not) at this point. If no tree was 1385 // chosen, then this instruction, I, cannot be paired (and is no longer 1386 // considered). 1387 1388 DEBUG(if (BestTree.size() > 0) 1389 dbgs() << "BBV: selected pairs in the best tree for: " 1390 << *cast<Instruction>(*I) << "\n"); 1391 1392 for (DenseSet<ValuePair>::iterator S = BestTree.begin(), 1393 SE2 = BestTree.end(); S != SE2; ++S) { 1394 // Insert the members of this tree into the list of chosen pairs. 1395 ChosenPairs.insert(ValuePair(S->first, S->second)); 1396 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 1397 *S->second << "\n"); 1398 1399 // Remove all candidate pairs that have values in the chosen tree. 1400 for (std::multimap<Value *, Value *>::iterator K = 1401 CandidatePairs.begin(); K != CandidatePairs.end();) { 1402 if (K->first == S->first || K->second == S->first || 1403 K->second == S->second || K->first == S->second) { 1404 // Don't remove the actual pair chosen so that it can be used 1405 // in subsequent tree selections. 1406 if (!(K->first == S->first && K->second == S->second)) 1407 CandidatePairs.erase(K++); 1408 else 1409 ++K; 1410 } else { 1411 ++K; 1412 } 1413 } 1414 } 1415 } 1416 1417 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 1418 } 1419 1420 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 1421 unsigned n = 0) { 1422 if (!I->hasName()) 1423 return ""; 1424 1425 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 1426 (n > 0 ? "." + utostr(n) : "")).str(); 1427 } 1428 1429 // Returns the value that is to be used as the pointer input to the vector 1430 // instruction that fuses I with J. 1431 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 1432 Instruction *I, Instruction *J, unsigned o, 1433 bool &FlipMemInputs) { 1434 Value *IPtr, *JPtr; 1435 unsigned IAlignment, JAlignment; 1436 int64_t OffsetInElmts; 1437 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 1438 OffsetInElmts); 1439 1440 // The pointer value is taken to be the one with the lowest offset. 1441 Value *VPtr; 1442 if (OffsetInElmts > 0) { 1443 VPtr = IPtr; 1444 } else { 1445 FlipMemInputs = true; 1446 VPtr = JPtr; 1447 } 1448 1449 Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType(); 1450 Type *VArgType = getVecTypeForPair(ArgType); 1451 Type *VArgPtrType = PointerType::get(VArgType, 1452 cast<PointerType>(IPtr->getType())->getAddressSpace()); 1453 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 1454 /* insert before */ FlipMemInputs ? J : I); 1455 } 1456 1457 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 1458 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, 1459 unsigned IdxOffset, std::vector<Constant*> &Mask) { 1460 for (unsigned v = 0; v < NumElem/2; ++v) { 1461 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 1462 if (m < 0) { 1463 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 1464 } else { 1465 unsigned mm = m + (int) IdxOffset; 1466 if (m >= (int) NumInElem) 1467 mm += (int) NumInElem; 1468 1469 Mask[v+MaskOffset] = 1470 ConstantInt::get(Type::getInt32Ty(Context), mm); 1471 } 1472 } 1473 } 1474 1475 // Returns the value that is to be used as the vector-shuffle mask to the 1476 // vector instruction that fuses I with J. 1477 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 1478 Instruction *I, Instruction *J) { 1479 // This is the shuffle mask. We need to append the second 1480 // mask to the first, and the numbers need to be adjusted. 1481 1482 Type *ArgType = I->getType(); 1483 Type *VArgType = getVecTypeForPair(ArgType); 1484 1485 // Get the total number of elements in the fused vector type. 1486 // By definition, this must equal the number of elements in 1487 // the final mask. 1488 unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); 1489 std::vector<Constant*> Mask(NumElem); 1490 1491 Type *OpType = I->getOperand(0)->getType(); 1492 unsigned NumInElem = cast<VectorType>(OpType)->getNumElements(); 1493 1494 // For the mask from the first pair... 1495 fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); 1496 1497 // For the mask from the second pair... 1498 fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, 1499 Mask); 1500 1501 return ConstantVector::get(Mask); 1502 } 1503 1504 // Returns the value to be used as the specified operand of the vector 1505 // instruction that fuses I with J. 1506 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 1507 Instruction *J, unsigned o, bool FlipMemInputs) { 1508 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 1509 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 1510 1511 // Compute the fused vector type for this operand 1512 Type *ArgType = I->getOperand(o)->getType(); 1513 VectorType *VArgType = getVecTypeForPair(ArgType); 1514 1515 Instruction *L = I, *H = J; 1516 if (FlipMemInputs) { 1517 L = J; 1518 H = I; 1519 } 1520 1521 if (ArgType->isVectorTy()) { 1522 unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); 1523 std::vector<Constant*> Mask(numElem); 1524 for (unsigned v = 0; v < numElem; ++v) 1525 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 1526 1527 Instruction *BV = new ShuffleVectorInst(L->getOperand(o), 1528 H->getOperand(o), 1529 ConstantVector::get(Mask), 1530 getReplacementName(I, true, o)); 1531 BV->insertBefore(J); 1532 return BV; 1533 } 1534 1535 // If these two inputs are the output of another vector instruction, 1536 // then we should use that output directly. It might be necessary to 1537 // permute it first. [When pairings are fused recursively, you can 1538 // end up with cases where a large vector is decomposed into scalars 1539 // using extractelement instructions, then built into size-2 1540 // vectors using insertelement and the into larger vectors using 1541 // shuffles. InstCombine does not simplify all of these cases well, 1542 // and so we make sure that shuffles are generated here when possible. 1543 ExtractElementInst *LEE 1544 = dyn_cast<ExtractElementInst>(L->getOperand(o)); 1545 ExtractElementInst *HEE 1546 = dyn_cast<ExtractElementInst>(H->getOperand(o)); 1547 1548 if (LEE && HEE && 1549 LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { 1550 VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType()); 1551 unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue(); 1552 unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue(); 1553 if (LEE->getOperand(0) == HEE->getOperand(0)) { 1554 if (LowIndx == 0 && HighIndx == 1) 1555 return LEE->getOperand(0); 1556 1557 std::vector<Constant*> Mask(2); 1558 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); 1559 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); 1560 1561 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), 1562 UndefValue::get(EEType), 1563 ConstantVector::get(Mask), 1564 getReplacementName(I, true, o)); 1565 BV->insertBefore(J); 1566 return BV; 1567 } 1568 1569 std::vector<Constant*> Mask(2); 1570 HighIndx += EEType->getNumElements(); 1571 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); 1572 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); 1573 1574 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), 1575 HEE->getOperand(0), 1576 ConstantVector::get(Mask), 1577 getReplacementName(I, true, o)); 1578 BV->insertBefore(J); 1579 return BV; 1580 } 1581 1582 Instruction *BV1 = InsertElementInst::Create( 1583 UndefValue::get(VArgType), 1584 L->getOperand(o), CV0, 1585 getReplacementName(I, true, o, 1)); 1586 BV1->insertBefore(I); 1587 Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), 1588 CV1, 1589 getReplacementName(I, true, o, 2)); 1590 BV2->insertBefore(J); 1591 return BV2; 1592 } 1593 1594 // This function creates an array of values that will be used as the inputs 1595 // to the vector instruction that fuses I with J. 1596 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 1597 Instruction *I, Instruction *J, 1598 SmallVector<Value *, 3> &ReplacedOperands, 1599 bool &FlipMemInputs) { 1600 FlipMemInputs = false; 1601 unsigned NumOperands = I->getNumOperands(); 1602 1603 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 1604 // Iterate backward so that we look at the store pointer 1605 // first and know whether or not we need to flip the inputs. 1606 1607 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 1608 // This is the pointer for a load/store instruction. 1609 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, 1610 FlipMemInputs); 1611 continue; 1612 } else if (isa<CallInst>(I)) { 1613 Function *F = cast<CallInst>(I)->getCalledFunction(); 1614 unsigned IID = F->getIntrinsicID(); 1615 if (o == NumOperands-1) { 1616 BasicBlock &BB = *I->getParent(); 1617 1618 Module *M = BB.getParent()->getParent(); 1619 Type *ArgType = I->getType(); 1620 Type *VArgType = getVecTypeForPair(ArgType); 1621 1622 // FIXME: is it safe to do this here? 1623 ReplacedOperands[o] = Intrinsic::getDeclaration(M, 1624 (Intrinsic::ID) IID, VArgType); 1625 continue; 1626 } else if (IID == Intrinsic::powi && o == 1) { 1627 // The second argument of powi is a single integer and we've already 1628 // checked that both arguments are equal. As a result, we just keep 1629 // I's second argument. 1630 ReplacedOperands[o] = I->getOperand(o); 1631 continue; 1632 } 1633 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 1634 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 1635 continue; 1636 } 1637 1638 ReplacedOperands[o] = 1639 getReplacementInput(Context, I, J, o, FlipMemInputs); 1640 } 1641 } 1642 1643 // This function creates two values that represent the outputs of the 1644 // original I and J instructions. These are generally vector shuffles 1645 // or extracts. In many cases, these will end up being unused and, thus, 1646 // eliminated by later passes. 1647 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 1648 Instruction *J, Instruction *K, 1649 Instruction *&InsertionPt, 1650 Instruction *&K1, Instruction *&K2, 1651 bool &FlipMemInputs) { 1652 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 1653 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 1654 1655 if (isa<StoreInst>(I)) { 1656 AA->replaceWithNewValue(I, K); 1657 AA->replaceWithNewValue(J, K); 1658 } else { 1659 Type *IType = I->getType(); 1660 Type *VType = getVecTypeForPair(IType); 1661 1662 if (IType->isVectorTy()) { 1663 unsigned numElem = cast<VectorType>(IType)->getNumElements(); 1664 std::vector<Constant*> Mask1(numElem), Mask2(numElem); 1665 for (unsigned v = 0; v < numElem; ++v) { 1666 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 1667 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); 1668 } 1669 1670 K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 1671 ConstantVector::get( 1672 FlipMemInputs ? Mask2 : Mask1), 1673 getReplacementName(K, false, 1)); 1674 K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 1675 ConstantVector::get( 1676 FlipMemInputs ? Mask1 : Mask2), 1677 getReplacementName(K, false, 2)); 1678 } else { 1679 K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, 1680 getReplacementName(K, false, 1)); 1681 K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, 1682 getReplacementName(K, false, 2)); 1683 } 1684 1685 K1->insertAfter(K); 1686 K2->insertAfter(K1); 1687 InsertionPt = K2; 1688 } 1689 } 1690 1691 // Move all uses of the function I (including pairing-induced uses) after J. 1692 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 1693 std::multimap<Value *, Value *> &LoadMoveSet, 1694 Instruction *I, Instruction *J) { 1695 // Skip to the first instruction past I. 1696 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1697 1698 DenseSet<Value *> Users; 1699 AliasSetTracker WriteSet(*AA); 1700 for (; cast<Instruction>(L) != J; ++L) 1701 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet); 1702 1703 assert(cast<Instruction>(L) == J && 1704 "Tracking has not proceeded far enough to check for dependencies"); 1705 // If J is now in the use set of I, then trackUsesOfI will return true 1706 // and we have a dependency cycle (and the fusing operation must abort). 1707 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet); 1708 } 1709 1710 // Move all uses of the function I (including pairing-induced uses) after J. 1711 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 1712 std::multimap<Value *, Value *> &LoadMoveSet, 1713 Instruction *&InsertionPt, 1714 Instruction *I, Instruction *J) { 1715 // Skip to the first instruction past I. 1716 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1717 1718 DenseSet<Value *> Users; 1719 AliasSetTracker WriteSet(*AA); 1720 for (; cast<Instruction>(L) != J;) { 1721 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) { 1722 // Move this instruction 1723 Instruction *InstToMove = L; ++L; 1724 1725 DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 1726 " to after " << *InsertionPt << "\n"); 1727 InstToMove->removeFromParent(); 1728 InstToMove->insertAfter(InsertionPt); 1729 InsertionPt = InstToMove; 1730 } else { 1731 ++L; 1732 } 1733 } 1734 } 1735 1736 // Collect all load instruction that are in the move set of a given first 1737 // pair member. These loads depend on the first instruction, I, and so need 1738 // to be moved after J (the second instruction) when the pair is fused. 1739 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 1740 DenseMap<Value *, Value *> &ChosenPairs, 1741 std::multimap<Value *, Value *> &LoadMoveSet, 1742 Instruction *I) { 1743 // Skip to the first instruction past I. 1744 BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); 1745 1746 DenseSet<Value *> Users; 1747 AliasSetTracker WriteSet(*AA); 1748 1749 // Note: We cannot end the loop when we reach J because J could be moved 1750 // farther down the use chain by another instruction pairing. Also, J 1751 // could be before I if this is an inverted input. 1752 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { 1753 if (trackUsesOfI(Users, WriteSet, I, L)) { 1754 if (L->mayReadFromMemory()) 1755 LoadMoveSet.insert(ValuePair(L, I)); 1756 } 1757 } 1758 } 1759 1760 // In cases where both load/stores and the computation of their pointers 1761 // are chosen for vectorization, we can end up in a situation where the 1762 // aliasing analysis starts returning different query results as the 1763 // process of fusing instruction pairs continues. Because the algorithm 1764 // relies on finding the same use trees here as were found earlier, we'll 1765 // need to precompute the necessary aliasing information here and then 1766 // manually update it during the fusion process. 1767 void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 1768 std::vector<Value *> &PairableInsts, 1769 DenseMap<Value *, Value *> &ChosenPairs, 1770 std::multimap<Value *, Value *> &LoadMoveSet) { 1771 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 1772 PIE = PairableInsts.end(); PI != PIE; ++PI) { 1773 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 1774 if (P == ChosenPairs.end()) continue; 1775 1776 Instruction *I = cast<Instruction>(P->first); 1777 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); 1778 } 1779 } 1780 1781 // This function fuses the chosen instruction pairs into vector instructions, 1782 // taking care preserve any needed scalar outputs and, then, it reorders the 1783 // remaining instructions as needed (users of the first member of the pair 1784 // need to be moved to after the location of the second member of the pair 1785 // because the vector instruction is inserted in the location of the pair's 1786 // second member). 1787 void BBVectorize::fuseChosenPairs(BasicBlock &BB, 1788 std::vector<Value *> &PairableInsts, 1789 DenseMap<Value *, Value *> &ChosenPairs) { 1790 LLVMContext& Context = BB.getContext(); 1791 1792 // During the vectorization process, the order of the pairs to be fused 1793 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 1794 // list. After a pair is fused, the flipped pair is removed from the list. 1795 std::vector<ValuePair> FlippedPairs; 1796 FlippedPairs.reserve(ChosenPairs.size()); 1797 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 1798 E = ChosenPairs.end(); P != E; ++P) 1799 FlippedPairs.push_back(ValuePair(P->second, P->first)); 1800 for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(), 1801 E = FlippedPairs.end(); P != E; ++P) 1802 ChosenPairs.insert(*P); 1803 1804 std::multimap<Value *, Value *> LoadMoveSet; 1805 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); 1806 1807 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 1808 1809 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 1810 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); 1811 if (P == ChosenPairs.end()) { 1812 ++PI; 1813 continue; 1814 } 1815 1816 if (getDepthFactor(P->first) == 0) { 1817 // These instructions are not really fused, but are tracked as though 1818 // they are. Any case in which it would be interesting to fuse them 1819 // will be taken care of by InstCombine. 1820 --NumFusedOps; 1821 ++PI; 1822 continue; 1823 } 1824 1825 Instruction *I = cast<Instruction>(P->first), 1826 *J = cast<Instruction>(P->second); 1827 1828 DEBUG(dbgs() << "BBV: fusing: " << *I << 1829 " <-> " << *J << "\n"); 1830 1831 // Remove the pair and flipped pair from the list. 1832 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 1833 assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 1834 ChosenPairs.erase(FP); 1835 ChosenPairs.erase(P); 1836 1837 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) { 1838 DEBUG(dbgs() << "BBV: fusion of: " << *I << 1839 " <-> " << *J << 1840 " aborted because of non-trivial dependency cycle\n"); 1841 --NumFusedOps; 1842 ++PI; 1843 continue; 1844 } 1845 1846 bool FlipMemInputs; 1847 unsigned NumOperands = I->getNumOperands(); 1848 SmallVector<Value *, 3> ReplacedOperands(NumOperands); 1849 getReplacementInputsForPair(Context, I, J, ReplacedOperands, 1850 FlipMemInputs); 1851 1852 // Make a copy of the original operation, change its type to the vector 1853 // type and replace its operands with the vector operands. 1854 Instruction *K = I->clone(); 1855 if (I->hasName()) K->takeName(I); 1856 1857 if (!isa<StoreInst>(K)) 1858 K->mutateType(getVecTypeForPair(I->getType())); 1859 1860 for (unsigned o = 0; o < NumOperands; ++o) 1861 K->setOperand(o, ReplacedOperands[o]); 1862 1863 // If we've flipped the memory inputs, make sure that we take the correct 1864 // alignment. 1865 if (FlipMemInputs) { 1866 if (isa<StoreInst>(K)) 1867 cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment()); 1868 else 1869 cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment()); 1870 } 1871 1872 K->insertAfter(J); 1873 1874 // Instruction insertion point: 1875 Instruction *InsertionPt = K; 1876 Instruction *K1 = 0, *K2 = 0; 1877 replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, 1878 FlipMemInputs); 1879 1880 // The use tree of the first original instruction must be moved to after 1881 // the location of the second instruction. The entire use tree of the 1882 // first instruction is disjoint from the input tree of the second 1883 // (by definition), and so commutes with it. 1884 1885 moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); 1886 1887 if (!isa<StoreInst>(I)) { 1888 I->replaceAllUsesWith(K1); 1889 J->replaceAllUsesWith(K2); 1890 AA->replaceWithNewValue(I, K1); 1891 AA->replaceWithNewValue(J, K2); 1892 } 1893 1894 // Instructions that may read from memory may be in the load move set. 1895 // Once an instruction is fused, we no longer need its move set, and so 1896 // the values of the map never need to be updated. However, when a load 1897 // is fused, we need to merge the entries from both instructions in the 1898 // pair in case those instructions were in the move set of some other 1899 // yet-to-be-fused pair. The loads in question are the keys of the map. 1900 if (I->mayReadFromMemory()) { 1901 std::vector<ValuePair> NewSetMembers; 1902 VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); 1903 VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); 1904 for (std::multimap<Value *, Value *>::iterator N = IPairRange.first; 1905 N != IPairRange.second; ++N) 1906 NewSetMembers.push_back(ValuePair(K, N->second)); 1907 for (std::multimap<Value *, Value *>::iterator N = JPairRange.first; 1908 N != JPairRange.second; ++N) 1909 NewSetMembers.push_back(ValuePair(K, N->second)); 1910 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 1911 AE = NewSetMembers.end(); A != AE; ++A) 1912 LoadMoveSet.insert(*A); 1913 } 1914 1915 // Before removing I, set the iterator to the next instruction. 1916 PI = llvm::next(BasicBlock::iterator(I)); 1917 if (cast<Instruction>(PI) == J) 1918 ++PI; 1919 1920 SE->forgetValue(I); 1921 SE->forgetValue(J); 1922 I->eraseFromParent(); 1923 J->eraseFromParent(); 1924 } 1925 1926 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 1927 } 1928 } 1929 1930 char BBVectorize::ID = 0; 1931 static const char bb_vectorize_name[] = "Basic-Block Vectorization"; 1932 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 1933 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 1934 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 1935 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 1936 1937 BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { 1938 return new BBVectorize(C); 1939 } 1940 1941 bool 1942 llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { 1943 BBVectorize BBVectorizer(P, C); 1944 return BBVectorizer.vectorizeBB(BB); 1945 } 1946 1947 //===----------------------------------------------------------------------===// 1948 VectorizeConfig::VectorizeConfig() { 1949 VectorBits = ::VectorBits; 1950 VectorizeInts = !::NoInts; 1951 VectorizeFloats = !::NoFloats; 1952 VectorizePointers = !::NoPointers; 1953 VectorizeCasts = !::NoCasts; 1954 VectorizeMath = !::NoMath; 1955 VectorizeFMA = !::NoFMA; 1956 VectorizeSelect = !::NoSelect; 1957 VectorizeGEP = !::NoGEP; 1958 VectorizeMemOps = !::NoMemOps; 1959 AlignedOnly = ::AlignedOnly; 1960 ReqChainDepth= ::ReqChainDepth; 1961 SearchLimit = ::SearchLimit; 1962 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; 1963 SplatBreaksChain = ::SplatBreaksChain; 1964 MaxInsts = ::MaxInsts; 1965 MaxIter = ::MaxIter; 1966 NoMemOpBoost = ::NoMemOpBoost; 1967 FastDep = ::FastDep; 1968 } 1969