1 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a basic-block vectorization pass. The algorithm was 11 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 12 // et al. It works by looking for chains of pairable operations and then 13 // pairing them. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #define BBV_NAME "bb-vectorize" 18 #include "llvm/Transforms/Vectorize.h" 19 #include "llvm/ADT/DenseMap.h" 20 #include "llvm/ADT/DenseSet.h" 21 #include "llvm/ADT/STLExtras.h" 22 #include "llvm/ADT/SmallSet.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/ADT/Statistic.h" 25 #include "llvm/ADT/StringExtras.h" 26 #include "llvm/Analysis/AliasAnalysis.h" 27 #include "llvm/Analysis/AliasSetTracker.h" 28 #include "llvm/Analysis/GlobalsModRef.h" 29 #include "llvm/Analysis/ScalarEvolution.h" 30 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" 31 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 32 #include "llvm/Analysis/TargetLibraryInfo.h" 33 #include "llvm/Analysis/TargetTransformInfo.h" 34 #include "llvm/Analysis/ValueTracking.h" 35 #include "llvm/IR/Constants.h" 36 #include "llvm/IR/DataLayout.h" 37 #include "llvm/IR/DerivedTypes.h" 38 #include "llvm/IR/Dominators.h" 39 #include "llvm/IR/Function.h" 40 #include "llvm/IR/Instructions.h" 41 #include "llvm/IR/IntrinsicInst.h" 42 #include "llvm/IR/Intrinsics.h" 43 #include "llvm/IR/LLVMContext.h" 44 #include "llvm/IR/Metadata.h" 45 #include "llvm/IR/Module.h" 46 #include "llvm/IR/Type.h" 47 #include "llvm/IR/ValueHandle.h" 48 #include "llvm/Pass.h" 49 #include "llvm/Support/CommandLine.h" 50 #include "llvm/Support/Debug.h" 51 #include "llvm/Support/raw_ostream.h" 52 #include "llvm/Transforms/Utils/Local.h" 53 #include <algorithm> 54 using namespace llvm; 55 56 #define DEBUG_TYPE BBV_NAME 57 58 static cl::opt<bool> 59 IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), 60 cl::Hidden, cl::desc("Ignore target information")); 61 62 static cl::opt<unsigned> 63 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 64 cl::desc("The required chain depth for vectorization")); 65 66 static cl::opt<bool> 67 UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), 68 cl::Hidden, cl::desc("Use the chain depth requirement with" 69 " target information")); 70 71 static cl::opt<unsigned> 72 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 73 cl::desc("The maximum search distance for instruction pairs")); 74 75 static cl::opt<bool> 76 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 77 cl::desc("Replicating one element to a pair breaks the chain")); 78 79 static cl::opt<unsigned> 80 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 81 cl::desc("The size of the native vector registers")); 82 83 static cl::opt<unsigned> 84 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 85 cl::desc("The maximum number of pairing iterations")); 86 87 static cl::opt<bool> 88 Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, 89 cl::desc("Don't try to form non-2^n-length vectors")); 90 91 static cl::opt<unsigned> 92 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 93 cl::desc("The maximum number of pairable instructions per group")); 94 95 static cl::opt<unsigned> 96 MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, 97 cl::desc("The maximum number of candidate instruction pairs per group")); 98 99 static cl::opt<unsigned> 100 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 101 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 102 " a full cycle check")); 103 104 static cl::opt<bool> 105 NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, 106 cl::desc("Don't try to vectorize boolean (i1) values")); 107 108 static cl::opt<bool> 109 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 110 cl::desc("Don't try to vectorize integer values")); 111 112 static cl::opt<bool> 113 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 114 cl::desc("Don't try to vectorize floating-point values")); 115 116 // FIXME: This should default to false once pointer vector support works. 117 static cl::opt<bool> 118 NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, 119 cl::desc("Don't try to vectorize pointer values")); 120 121 static cl::opt<bool> 122 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 123 cl::desc("Don't try to vectorize casting (conversion) operations")); 124 125 static cl::opt<bool> 126 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 127 cl::desc("Don't try to vectorize floating-point math intrinsics")); 128 129 static cl::opt<bool> 130 NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, 131 cl::desc("Don't try to vectorize BitManipulation intrinsics")); 132 133 static cl::opt<bool> 134 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 135 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 136 137 static cl::opt<bool> 138 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, 139 cl::desc("Don't try to vectorize select instructions")); 140 141 static cl::opt<bool> 142 NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, 143 cl::desc("Don't try to vectorize comparison instructions")); 144 145 static cl::opt<bool> 146 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, 147 cl::desc("Don't try to vectorize getelementptr instructions")); 148 149 static cl::opt<bool> 150 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 151 cl::desc("Don't try to vectorize loads and stores")); 152 153 static cl::opt<bool> 154 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 155 cl::desc("Only generate aligned loads and stores")); 156 157 static cl::opt<bool> 158 NoMemOpBoost("bb-vectorize-no-mem-op-boost", 159 cl::init(false), cl::Hidden, 160 cl::desc("Don't boost the chain-depth contribution of loads and stores")); 161 162 static cl::opt<bool> 163 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 164 cl::desc("Use a fast instruction dependency analysis")); 165 166 #ifndef NDEBUG 167 static cl::opt<bool> 168 DebugInstructionExamination("bb-vectorize-debug-instruction-examination", 169 cl::init(false), cl::Hidden, 170 cl::desc("When debugging is enabled, output information on the" 171 " instruction-examination process")); 172 static cl::opt<bool> 173 DebugCandidateSelection("bb-vectorize-debug-candidate-selection", 174 cl::init(false), cl::Hidden, 175 cl::desc("When debugging is enabled, output information on the" 176 " candidate-selection process")); 177 static cl::opt<bool> 178 DebugPairSelection("bb-vectorize-debug-pair-selection", 179 cl::init(false), cl::Hidden, 180 cl::desc("When debugging is enabled, output information on the" 181 " pair-selection process")); 182 static cl::opt<bool> 183 DebugCycleCheck("bb-vectorize-debug-cycle-check", 184 cl::init(false), cl::Hidden, 185 cl::desc("When debugging is enabled, output information on the" 186 " cycle-checking process")); 187 188 static cl::opt<bool> 189 PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", 190 cl::init(false), cl::Hidden, 191 cl::desc("When debugging is enabled, dump the basic block after" 192 " every pair is fused")); 193 #endif 194 195 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 196 197 namespace { 198 struct BBVectorize : public BasicBlockPass { 199 static char ID; // Pass identification, replacement for typeid 200 201 const VectorizeConfig Config; 202 203 BBVectorize(const VectorizeConfig &C = VectorizeConfig()) 204 : BasicBlockPass(ID), Config(C) { 205 initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 206 } 207 208 BBVectorize(Pass *P, Function &F, const VectorizeConfig &C) 209 : BasicBlockPass(ID), Config(C) { 210 AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults(); 211 DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 212 SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 213 TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); 214 TTI = IgnoreTargetInfo 215 ? nullptr 216 : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 217 } 218 219 typedef std::pair<Value *, Value *> ValuePair; 220 typedef std::pair<ValuePair, int> ValuePairWithCost; 221 typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 222 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 223 typedef std::pair<VPPair, unsigned> VPPairWithType; 224 225 AliasAnalysis *AA; 226 DominatorTree *DT; 227 ScalarEvolution *SE; 228 const TargetLibraryInfo *TLI; 229 const TargetTransformInfo *TTI; 230 231 // FIXME: const correct? 232 233 bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); 234 235 bool getCandidatePairs(BasicBlock &BB, 236 BasicBlock::iterator &Start, 237 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 238 DenseSet<ValuePair> &FixedOrderPairs, 239 DenseMap<ValuePair, int> &CandidatePairCostSavings, 240 std::vector<Value *> &PairableInsts, bool NonPow2Len); 241 242 // FIXME: The current implementation does not account for pairs that 243 // are connected in multiple ways. For example: 244 // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) 245 enum PairConnectionType { 246 PairConnectionDirect, 247 PairConnectionSwap, 248 PairConnectionSplat 249 }; 250 251 void computeConnectedPairs( 252 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 253 DenseSet<ValuePair> &CandidatePairsSet, 254 std::vector<Value *> &PairableInsts, 255 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 256 DenseMap<VPPair, unsigned> &PairConnectionTypes); 257 258 void buildDepMap(BasicBlock &BB, 259 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 260 std::vector<Value *> &PairableInsts, 261 DenseSet<ValuePair> &PairableInstUsers); 262 263 void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 264 DenseSet<ValuePair> &CandidatePairsSet, 265 DenseMap<ValuePair, int> &CandidatePairCostSavings, 266 std::vector<Value *> &PairableInsts, 267 DenseSet<ValuePair> &FixedOrderPairs, 268 DenseMap<VPPair, unsigned> &PairConnectionTypes, 269 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 270 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 271 DenseSet<ValuePair> &PairableInstUsers, 272 DenseMap<Value *, Value *>& ChosenPairs); 273 274 void fuseChosenPairs(BasicBlock &BB, 275 std::vector<Value *> &PairableInsts, 276 DenseMap<Value *, Value *>& ChosenPairs, 277 DenseSet<ValuePair> &FixedOrderPairs, 278 DenseMap<VPPair, unsigned> &PairConnectionTypes, 279 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 280 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); 281 282 283 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 284 285 bool areInstsCompatible(Instruction *I, Instruction *J, 286 bool IsSimpleLoadStore, bool NonPow2Len, 287 int &CostSavings, int &FixedOrder); 288 289 bool trackUsesOfI(DenseSet<Value *> &Users, 290 AliasSetTracker &WriteSet, Instruction *I, 291 Instruction *J, bool UpdateUsers = true, 292 DenseSet<ValuePair> *LoadMoveSetPairs = nullptr); 293 294 void computePairsConnectedTo( 295 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 296 DenseSet<ValuePair> &CandidatePairsSet, 297 std::vector<Value *> &PairableInsts, 298 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 299 DenseMap<VPPair, unsigned> &PairConnectionTypes, 300 ValuePair P); 301 302 bool pairsConflict(ValuePair P, ValuePair Q, 303 DenseSet<ValuePair> &PairableInstUsers, 304 DenseMap<ValuePair, std::vector<ValuePair> > 305 *PairableInstUserMap = nullptr, 306 DenseSet<VPPair> *PairableInstUserPairSet = nullptr); 307 308 bool pairWillFormCycle(ValuePair P, 309 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, 310 DenseSet<ValuePair> &CurrentPairs); 311 312 void pruneDAGFor( 313 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 314 std::vector<Value *> &PairableInsts, 315 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 316 DenseSet<ValuePair> &PairableInstUsers, 317 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 318 DenseSet<VPPair> &PairableInstUserPairSet, 319 DenseMap<Value *, Value *> &ChosenPairs, 320 DenseMap<ValuePair, size_t> &DAG, 321 DenseSet<ValuePair> &PrunedDAG, ValuePair J, 322 bool UseCycleCheck); 323 324 void buildInitialDAGFor( 325 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 326 DenseSet<ValuePair> &CandidatePairsSet, 327 std::vector<Value *> &PairableInsts, 328 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 329 DenseSet<ValuePair> &PairableInstUsers, 330 DenseMap<Value *, Value *> &ChosenPairs, 331 DenseMap<ValuePair, size_t> &DAG, ValuePair J); 332 333 void findBestDAGFor( 334 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 335 DenseSet<ValuePair> &CandidatePairsSet, 336 DenseMap<ValuePair, int> &CandidatePairCostSavings, 337 std::vector<Value *> &PairableInsts, 338 DenseSet<ValuePair> &FixedOrderPairs, 339 DenseMap<VPPair, unsigned> &PairConnectionTypes, 340 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 341 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 342 DenseSet<ValuePair> &PairableInstUsers, 343 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 344 DenseSet<VPPair> &PairableInstUserPairSet, 345 DenseMap<Value *, Value *> &ChosenPairs, 346 DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, 347 int &BestEffSize, Value *II, std::vector<Value *>&JJ, 348 bool UseCycleCheck); 349 350 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 351 Instruction *J, unsigned o); 352 353 void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 354 unsigned MaskOffset, unsigned NumInElem, 355 unsigned NumInElem1, unsigned IdxOffset, 356 std::vector<Constant*> &Mask); 357 358 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 359 Instruction *J); 360 361 bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, 362 unsigned o, Value *&LOp, unsigned numElemL, 363 Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, 364 unsigned IdxOff = 0); 365 366 Value *getReplacementInput(LLVMContext& Context, Instruction *I, 367 Instruction *J, unsigned o, bool IBeforeJ); 368 369 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 370 Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, 371 bool IBeforeJ); 372 373 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 374 Instruction *J, Instruction *K, 375 Instruction *&InsertionPt, Instruction *&K1, 376 Instruction *&K2); 377 378 void collectPairLoadMoveSet(BasicBlock &BB, 379 DenseMap<Value *, Value *> &ChosenPairs, 380 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 381 DenseSet<ValuePair> &LoadMoveSetPairs, 382 Instruction *I); 383 384 void collectLoadMoveSet(BasicBlock &BB, 385 std::vector<Value *> &PairableInsts, 386 DenseMap<Value *, Value *> &ChosenPairs, 387 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 388 DenseSet<ValuePair> &LoadMoveSetPairs); 389 390 bool canMoveUsesOfIAfterJ(BasicBlock &BB, 391 DenseSet<ValuePair> &LoadMoveSetPairs, 392 Instruction *I, Instruction *J); 393 394 void moveUsesOfIAfterJ(BasicBlock &BB, 395 DenseSet<ValuePair> &LoadMoveSetPairs, 396 Instruction *&InsertionPt, 397 Instruction *I, Instruction *J); 398 399 bool vectorizeBB(BasicBlock &BB) { 400 if (skipOptnoneFunction(BB)) 401 return false; 402 if (!DT->isReachableFromEntry(&BB)) { 403 DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << 404 " in " << BB.getParent()->getName() << "\n"); 405 return false; 406 } 407 408 DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); 409 410 bool changed = false; 411 // Iterate a sufficient number of times to merge types of size 1 bit, 412 // then 2 bits, then 4, etc. up to half of the target vector width of the 413 // target vector register. 414 unsigned n = 1; 415 for (unsigned v = 2; 416 (TTI || v <= Config.VectorBits) && 417 (!Config.MaxIter || n <= Config.MaxIter); 418 v *= 2, ++n) { 419 DEBUG(dbgs() << "BBV: fusing loop #" << n << 420 " for " << BB.getName() << " in " << 421 BB.getParent()->getName() << "...\n"); 422 if (vectorizePairs(BB)) 423 changed = true; 424 else 425 break; 426 } 427 428 if (changed && !Pow2LenOnly) { 429 ++n; 430 for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { 431 DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << 432 n << " for " << BB.getName() << " in " << 433 BB.getParent()->getName() << "...\n"); 434 if (!vectorizePairs(BB, true)) break; 435 } 436 } 437 438 DEBUG(dbgs() << "BBV: done!\n"); 439 return changed; 440 } 441 442 bool runOnBasicBlock(BasicBlock &BB) override { 443 // OptimizeNone check deferred to vectorizeBB(). 444 445 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 446 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 447 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 448 TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); 449 TTI = IgnoreTargetInfo 450 ? nullptr 451 : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( 452 *BB.getParent()); 453 454 return vectorizeBB(BB); 455 } 456 457 void getAnalysisUsage(AnalysisUsage &AU) const override { 458 BasicBlockPass::getAnalysisUsage(AU); 459 AU.addRequired<AAResultsWrapperPass>(); 460 AU.addRequired<DominatorTreeWrapperPass>(); 461 AU.addRequired<ScalarEvolutionWrapperPass>(); 462 AU.addRequired<TargetLibraryInfoWrapperPass>(); 463 AU.addRequired<TargetTransformInfoWrapperPass>(); 464 AU.addPreserved<DominatorTreeWrapperPass>(); 465 AU.addPreserved<GlobalsAAWrapperPass>(); 466 AU.addPreserved<ScalarEvolutionWrapperPass>(); 467 AU.addPreserved<SCEVAAWrapperPass>(); 468 AU.setPreservesCFG(); 469 } 470 471 static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { 472 assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && 473 "Cannot form vector from incompatible scalar types"); 474 Type *STy = ElemTy->getScalarType(); 475 476 unsigned numElem; 477 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 478 numElem = VTy->getNumElements(); 479 } else { 480 numElem = 1; 481 } 482 483 if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { 484 numElem += VTy->getNumElements(); 485 } else { 486 numElem += 1; 487 } 488 489 return VectorType::get(STy, numElem); 490 } 491 492 static inline void getInstructionTypes(Instruction *I, 493 Type *&T1, Type *&T2) { 494 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 495 // For stores, it is the value type, not the pointer type that matters 496 // because the value is what will come from a vector register. 497 498 Value *IVal = SI->getValueOperand(); 499 T1 = IVal->getType(); 500 } else { 501 T1 = I->getType(); 502 } 503 504 if (CastInst *CI = dyn_cast<CastInst>(I)) 505 T2 = CI->getSrcTy(); 506 else 507 T2 = T1; 508 509 if (SelectInst *SI = dyn_cast<SelectInst>(I)) { 510 T2 = SI->getCondition()->getType(); 511 } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { 512 T2 = SI->getOperand(0)->getType(); 513 } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { 514 T2 = CI->getOperand(0)->getType(); 515 } 516 } 517 518 // Returns the weight associated with the provided value. A chain of 519 // candidate pairs has a length given by the sum of the weights of its 520 // members (one weight per pair; the weight of each member of the pair 521 // is assumed to be the same). This length is then compared to the 522 // chain-length threshold to determine if a given chain is significant 523 // enough to be vectorized. The length is also used in comparing 524 // candidate chains where longer chains are considered to be better. 525 // Note: when this function returns 0, the resulting instructions are 526 // not actually fused. 527 inline size_t getDepthFactor(Value *V) { 528 // InsertElement and ExtractElement have a depth factor of zero. This is 529 // for two reasons: First, they cannot be usefully fused. Second, because 530 // the pass generates a lot of these, they can confuse the simple metric 531 // used to compare the dags in the next iteration. Thus, giving them a 532 // weight of zero allows the pass to essentially ignore them in 533 // subsequent iterations when looking for vectorization opportunities 534 // while still tracking dependency chains that flow through those 535 // instructions. 536 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 537 return 0; 538 539 // Give a load or store half of the required depth so that load/store 540 // pairs will vectorize. 541 if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 542 return Config.ReqChainDepth/2; 543 544 return 1; 545 } 546 547 // Returns the cost of the provided instruction using TTI. 548 // This does not handle loads and stores. 549 unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2, 550 TargetTransformInfo::OperandValueKind Op1VK = 551 TargetTransformInfo::OK_AnyValue, 552 TargetTransformInfo::OperandValueKind Op2VK = 553 TargetTransformInfo::OK_AnyValue) { 554 switch (Opcode) { 555 default: break; 556 case Instruction::GetElementPtr: 557 // We mark this instruction as zero-cost because scalar GEPs are usually 558 // lowered to the instruction addressing mode. At the moment we don't 559 // generate vector GEPs. 560 return 0; 561 case Instruction::Br: 562 return TTI->getCFInstrCost(Opcode); 563 case Instruction::PHI: 564 return 0; 565 case Instruction::Add: 566 case Instruction::FAdd: 567 case Instruction::Sub: 568 case Instruction::FSub: 569 case Instruction::Mul: 570 case Instruction::FMul: 571 case Instruction::UDiv: 572 case Instruction::SDiv: 573 case Instruction::FDiv: 574 case Instruction::URem: 575 case Instruction::SRem: 576 case Instruction::FRem: 577 case Instruction::Shl: 578 case Instruction::LShr: 579 case Instruction::AShr: 580 case Instruction::And: 581 case Instruction::Or: 582 case Instruction::Xor: 583 return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK); 584 case Instruction::Select: 585 case Instruction::ICmp: 586 case Instruction::FCmp: 587 return TTI->getCmpSelInstrCost(Opcode, T1, T2); 588 case Instruction::ZExt: 589 case Instruction::SExt: 590 case Instruction::FPToUI: 591 case Instruction::FPToSI: 592 case Instruction::FPExt: 593 case Instruction::PtrToInt: 594 case Instruction::IntToPtr: 595 case Instruction::SIToFP: 596 case Instruction::UIToFP: 597 case Instruction::Trunc: 598 case Instruction::FPTrunc: 599 case Instruction::BitCast: 600 case Instruction::ShuffleVector: 601 return TTI->getCastInstrCost(Opcode, T1, T2); 602 } 603 604 return 1; 605 } 606 607 // This determines the relative offset of two loads or stores, returning 608 // true if the offset could be determined to be some constant value. 609 // For example, if OffsetInElmts == 1, then J accesses the memory directly 610 // after I; if OffsetInElmts == -1 then I accesses the memory 611 // directly after J. 612 bool getPairPtrInfo(Instruction *I, Instruction *J, 613 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 614 unsigned &IAddressSpace, unsigned &JAddressSpace, 615 int64_t &OffsetInElmts, bool ComputeOffset = true) { 616 OffsetInElmts = 0; 617 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 618 LoadInst *LJ = cast<LoadInst>(J); 619 IPtr = LI->getPointerOperand(); 620 JPtr = LJ->getPointerOperand(); 621 IAlignment = LI->getAlignment(); 622 JAlignment = LJ->getAlignment(); 623 IAddressSpace = LI->getPointerAddressSpace(); 624 JAddressSpace = LJ->getPointerAddressSpace(); 625 } else { 626 StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); 627 IPtr = SI->getPointerOperand(); 628 JPtr = SJ->getPointerOperand(); 629 IAlignment = SI->getAlignment(); 630 JAlignment = SJ->getAlignment(); 631 IAddressSpace = SI->getPointerAddressSpace(); 632 JAddressSpace = SJ->getPointerAddressSpace(); 633 } 634 635 if (!ComputeOffset) 636 return true; 637 638 const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 639 const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 640 641 // If this is a trivial offset, then we'll get something like 642 // 1*sizeof(type). With target data, which we need anyway, this will get 643 // constant folded into a number. 644 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 645 if (const SCEVConstant *ConstOffSCEV = 646 dyn_cast<SCEVConstant>(OffsetSCEV)) { 647 ConstantInt *IntOff = ConstOffSCEV->getValue(); 648 int64_t Offset = IntOff->getSExtValue(); 649 const DataLayout &DL = I->getModule()->getDataLayout(); 650 Type *VTy = IPtr->getType()->getPointerElementType(); 651 int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy); 652 653 Type *VTy2 = JPtr->getType()->getPointerElementType(); 654 if (VTy != VTy2 && Offset < 0) { 655 int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2); 656 OffsetInElmts = Offset/VTy2TSS; 657 return (std::abs(Offset) % VTy2TSS) == 0; 658 } 659 660 OffsetInElmts = Offset/VTyTSS; 661 return (std::abs(Offset) % VTyTSS) == 0; 662 } 663 664 return false; 665 } 666 667 // Returns true if the provided CallInst represents an intrinsic that can 668 // be vectorized. 669 bool isVectorizableIntrinsic(CallInst* I) { 670 Function *F = I->getCalledFunction(); 671 if (!F) return false; 672 673 Intrinsic::ID IID = F->getIntrinsicID(); 674 if (!IID) return false; 675 676 switch(IID) { 677 default: 678 return false; 679 case Intrinsic::sqrt: 680 case Intrinsic::powi: 681 case Intrinsic::sin: 682 case Intrinsic::cos: 683 case Intrinsic::log: 684 case Intrinsic::log2: 685 case Intrinsic::log10: 686 case Intrinsic::exp: 687 case Intrinsic::exp2: 688 case Intrinsic::pow: 689 case Intrinsic::round: 690 case Intrinsic::copysign: 691 case Intrinsic::ceil: 692 case Intrinsic::nearbyint: 693 case Intrinsic::rint: 694 case Intrinsic::trunc: 695 case Intrinsic::floor: 696 case Intrinsic::fabs: 697 case Intrinsic::minnum: 698 case Intrinsic::maxnum: 699 return Config.VectorizeMath; 700 case Intrinsic::bswap: 701 case Intrinsic::ctpop: 702 case Intrinsic::ctlz: 703 case Intrinsic::cttz: 704 return Config.VectorizeBitManipulations; 705 case Intrinsic::fma: 706 case Intrinsic::fmuladd: 707 return Config.VectorizeFMA; 708 } 709 } 710 711 bool isPureIEChain(InsertElementInst *IE) { 712 InsertElementInst *IENext = IE; 713 do { 714 if (!isa<UndefValue>(IENext->getOperand(0)) && 715 !isa<InsertElementInst>(IENext->getOperand(0))) { 716 return false; 717 } 718 } while ((IENext = 719 dyn_cast<InsertElementInst>(IENext->getOperand(0)))); 720 721 return true; 722 } 723 }; 724 725 // This function implements one vectorization iteration on the provided 726 // basic block. It returns true if the block is changed. 727 bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { 728 bool ShouldContinue; 729 BasicBlock::iterator Start = BB.getFirstInsertionPt(); 730 731 std::vector<Value *> AllPairableInsts; 732 DenseMap<Value *, Value *> AllChosenPairs; 733 DenseSet<ValuePair> AllFixedOrderPairs; 734 DenseMap<VPPair, unsigned> AllPairConnectionTypes; 735 DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, 736 AllConnectedPairDeps; 737 738 do { 739 std::vector<Value *> PairableInsts; 740 DenseMap<Value *, std::vector<Value *> > CandidatePairs; 741 DenseSet<ValuePair> FixedOrderPairs; 742 DenseMap<ValuePair, int> CandidatePairCostSavings; 743 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 744 FixedOrderPairs, 745 CandidatePairCostSavings, 746 PairableInsts, NonPow2Len); 747 if (PairableInsts.empty()) continue; 748 749 // Build the candidate pair set for faster lookups. 750 DenseSet<ValuePair> CandidatePairsSet; 751 for (DenseMap<Value *, std::vector<Value *> >::iterator I = 752 CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) 753 for (std::vector<Value *>::iterator J = I->second.begin(), 754 JE = I->second.end(); J != JE; ++J) 755 CandidatePairsSet.insert(ValuePair(I->first, *J)); 756 757 // Now we have a map of all of the pairable instructions and we need to 758 // select the best possible pairing. A good pairing is one such that the 759 // users of the pair are also paired. This defines a (directed) forest 760 // over the pairs such that two pairs are connected iff the second pair 761 // uses the first. 762 763 // Note that it only matters that both members of the second pair use some 764 // element of the first pair (to allow for splatting). 765 766 DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, 767 ConnectedPairDeps; 768 DenseMap<VPPair, unsigned> PairConnectionTypes; 769 computeConnectedPairs(CandidatePairs, CandidatePairsSet, 770 PairableInsts, ConnectedPairs, PairConnectionTypes); 771 if (ConnectedPairs.empty()) continue; 772 773 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator 774 I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 775 I != IE; ++I) 776 for (std::vector<ValuePair>::iterator J = I->second.begin(), 777 JE = I->second.end(); J != JE; ++J) 778 ConnectedPairDeps[*J].push_back(I->first); 779 780 // Build the pairable-instruction dependency map 781 DenseSet<ValuePair> PairableInstUsers; 782 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 783 784 // There is now a graph of the connected pairs. For each variable, pick 785 // the pairing with the largest dag meeting the depth requirement on at 786 // least one branch. Then select all pairings that are part of that dag 787 // and remove them from the list of available pairings and pairable 788 // variables. 789 790 DenseMap<Value *, Value *> ChosenPairs; 791 choosePairs(CandidatePairs, CandidatePairsSet, 792 CandidatePairCostSavings, 793 PairableInsts, FixedOrderPairs, PairConnectionTypes, 794 ConnectedPairs, ConnectedPairDeps, 795 PairableInstUsers, ChosenPairs); 796 797 if (ChosenPairs.empty()) continue; 798 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 799 PairableInsts.end()); 800 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 801 802 // Only for the chosen pairs, propagate information on fixed-order pairs, 803 // pair connections, and their types to the data structures used by the 804 // pair fusion procedures. 805 for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), 806 IE = ChosenPairs.end(); I != IE; ++I) { 807 if (FixedOrderPairs.count(*I)) 808 AllFixedOrderPairs.insert(*I); 809 else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) 810 AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); 811 812 for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); 813 J != IE; ++J) { 814 DenseMap<VPPair, unsigned>::iterator K = 815 PairConnectionTypes.find(VPPair(*I, *J)); 816 if (K != PairConnectionTypes.end()) { 817 AllPairConnectionTypes.insert(*K); 818 } else { 819 K = PairConnectionTypes.find(VPPair(*J, *I)); 820 if (K != PairConnectionTypes.end()) 821 AllPairConnectionTypes.insert(*K); 822 } 823 } 824 } 825 826 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator 827 I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 828 I != IE; ++I) 829 for (std::vector<ValuePair>::iterator J = I->second.begin(), 830 JE = I->second.end(); J != JE; ++J) 831 if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { 832 AllConnectedPairs[I->first].push_back(*J); 833 AllConnectedPairDeps[*J].push_back(I->first); 834 } 835 } while (ShouldContinue); 836 837 if (AllChosenPairs.empty()) return false; 838 NumFusedOps += AllChosenPairs.size(); 839 840 // A set of pairs has now been selected. It is now necessary to replace the 841 // paired instructions with vector instructions. For this procedure each 842 // operand must be replaced with a vector operand. This vector is formed 843 // by using build_vector on the old operands. The replaced values are then 844 // replaced with a vector_extract on the result. Subsequent optimization 845 // passes should coalesce the build/extract combinations. 846 847 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, 848 AllPairConnectionTypes, 849 AllConnectedPairs, AllConnectedPairDeps); 850 851 // It is important to cleanup here so that future iterations of this 852 // function have less work to do. 853 (void)SimplifyInstructionsInBlock(&BB, TLI); 854 return true; 855 } 856 857 // This function returns true if the provided instruction is capable of being 858 // fused into a vector instruction. This determination is based only on the 859 // type and other attributes of the instruction. 860 bool BBVectorize::isInstVectorizable(Instruction *I, 861 bool &IsSimpleLoadStore) { 862 IsSimpleLoadStore = false; 863 864 if (CallInst *C = dyn_cast<CallInst>(I)) { 865 if (!isVectorizableIntrinsic(C)) 866 return false; 867 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 868 // Vectorize simple loads if possbile: 869 IsSimpleLoadStore = L->isSimple(); 870 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 871 return false; 872 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 873 // Vectorize simple stores if possbile: 874 IsSimpleLoadStore = S->isSimple(); 875 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 876 return false; 877 } else if (CastInst *C = dyn_cast<CastInst>(I)) { 878 // We can vectorize casts, but not casts of pointer types, etc. 879 if (!Config.VectorizeCasts) 880 return false; 881 882 Type *SrcTy = C->getSrcTy(); 883 if (!SrcTy->isSingleValueType()) 884 return false; 885 886 Type *DestTy = C->getDestTy(); 887 if (!DestTy->isSingleValueType()) 888 return false; 889 } else if (isa<SelectInst>(I)) { 890 if (!Config.VectorizeSelect) 891 return false; 892 } else if (isa<CmpInst>(I)) { 893 if (!Config.VectorizeCmp) 894 return false; 895 } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { 896 if (!Config.VectorizeGEP) 897 return false; 898 899 // Currently, vector GEPs exist only with one index. 900 if (G->getNumIndices() != 1) 901 return false; 902 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 903 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 904 return false; 905 } 906 907 Type *T1, *T2; 908 getInstructionTypes(I, T1, T2); 909 910 // Not every type can be vectorized... 911 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 912 !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 913 return false; 914 915 if (T1->getScalarSizeInBits() == 1) { 916 if (!Config.VectorizeBools) 917 return false; 918 } else { 919 if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) 920 return false; 921 } 922 923 if (T2->getScalarSizeInBits() == 1) { 924 if (!Config.VectorizeBools) 925 return false; 926 } else { 927 if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) 928 return false; 929 } 930 931 if (!Config.VectorizeFloats 932 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 933 return false; 934 935 // Don't vectorize target-specific types. 936 if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) 937 return false; 938 if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) 939 return false; 940 941 if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() || 942 T2->getScalarType()->isPointerTy())) 943 return false; 944 945 if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || 946 T2->getPrimitiveSizeInBits() >= Config.VectorBits)) 947 return false; 948 949 return true; 950 } 951 952 // This function returns true if the two provided instructions are compatible 953 // (meaning that they can be fused into a vector instruction). This assumes 954 // that I has already been determined to be vectorizable and that J is not 955 // in the use dag of I. 956 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 957 bool IsSimpleLoadStore, bool NonPow2Len, 958 int &CostSavings, int &FixedOrder) { 959 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 960 " <-> " << *J << "\n"); 961 962 CostSavings = 0; 963 FixedOrder = 0; 964 965 // Loads and stores can be merged if they have different alignments, 966 // but are otherwise the same. 967 if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | 968 (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) 969 return false; 970 971 Type *IT1, *IT2, *JT1, *JT2; 972 getInstructionTypes(I, IT1, IT2); 973 getInstructionTypes(J, JT1, JT2); 974 unsigned MaxTypeBits = std::max( 975 IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), 976 IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); 977 if (!TTI && MaxTypeBits > Config.VectorBits) 978 return false; 979 980 // FIXME: handle addsub-type operations! 981 982 if (IsSimpleLoadStore) { 983 Value *IPtr, *JPtr; 984 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 985 int64_t OffsetInElmts = 0; 986 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 987 IAddressSpace, JAddressSpace, OffsetInElmts) && 988 std::abs(OffsetInElmts) == 1) { 989 FixedOrder = (int) OffsetInElmts; 990 unsigned BottomAlignment = IAlignment; 991 if (OffsetInElmts < 0) BottomAlignment = JAlignment; 992 993 Type *aTypeI = isa<StoreInst>(I) ? 994 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 995 Type *aTypeJ = isa<StoreInst>(J) ? 996 cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); 997 Type *VType = getVecTypeForPair(aTypeI, aTypeJ); 998 999 if (Config.AlignedOnly) { 1000 // An aligned load or store is possible only if the instruction 1001 // with the lower offset has an alignment suitable for the 1002 // vector type. 1003 const DataLayout &DL = I->getModule()->getDataLayout(); 1004 unsigned VecAlignment = DL.getPrefTypeAlignment(VType); 1005 if (BottomAlignment < VecAlignment) 1006 return false; 1007 } 1008 1009 if (TTI) { 1010 unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, 1011 IAlignment, IAddressSpace); 1012 unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, 1013 JAlignment, JAddressSpace); 1014 unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, 1015 BottomAlignment, 1016 IAddressSpace); 1017 1018 ICost += TTI->getAddressComputationCost(aTypeI); 1019 JCost += TTI->getAddressComputationCost(aTypeJ); 1020 VCost += TTI->getAddressComputationCost(VType); 1021 1022 if (VCost > ICost + JCost) 1023 return false; 1024 1025 // We don't want to fuse to a type that will be split, even 1026 // if the two input types will also be split and there is no other 1027 // associated cost. 1028 unsigned VParts = TTI->getNumberOfParts(VType); 1029 if (VParts > 1) 1030 return false; 1031 else if (!VParts && VCost == ICost + JCost) 1032 return false; 1033 1034 CostSavings = ICost + JCost - VCost; 1035 } 1036 } else { 1037 return false; 1038 } 1039 } else if (TTI) { 1040 unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); 1041 unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); 1042 Type *VT1 = getVecTypeForPair(IT1, JT1), 1043 *VT2 = getVecTypeForPair(IT2, JT2); 1044 TargetTransformInfo::OperandValueKind Op1VK = 1045 TargetTransformInfo::OK_AnyValue; 1046 TargetTransformInfo::OperandValueKind Op2VK = 1047 TargetTransformInfo::OK_AnyValue; 1048 1049 // On some targets (example X86) the cost of a vector shift may vary 1050 // depending on whether the second operand is a Uniform or 1051 // NonUniform Constant. 1052 switch (I->getOpcode()) { 1053 default : break; 1054 case Instruction::Shl: 1055 case Instruction::LShr: 1056 case Instruction::AShr: 1057 1058 // If both I and J are scalar shifts by constant, then the 1059 // merged vector shift count would be either a constant splat value 1060 // or a non-uniform vector of constants. 1061 if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) { 1062 if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1))) 1063 Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue : 1064 TargetTransformInfo::OK_NonUniformConstantValue; 1065 } else { 1066 // Check for a splat of a constant or for a non uniform vector 1067 // of constants. 1068 Value *IOp = I->getOperand(1); 1069 Value *JOp = J->getOperand(1); 1070 if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) && 1071 (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) { 1072 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 1073 Constant *SplatValue = cast<Constant>(IOp)->getSplatValue(); 1074 if (SplatValue != nullptr && 1075 SplatValue == cast<Constant>(JOp)->getSplatValue()) 1076 Op2VK = TargetTransformInfo::OK_UniformConstantValue; 1077 } 1078 } 1079 } 1080 1081 // Note that this procedure is incorrect for insert and extract element 1082 // instructions (because combining these often results in a shuffle), 1083 // but this cost is ignored (because insert and extract element 1084 // instructions are assigned a zero depth factor and are not really 1085 // fused in general). 1086 unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK); 1087 1088 if (VCost > ICost + JCost) 1089 return false; 1090 1091 // We don't want to fuse to a type that will be split, even 1092 // if the two input types will also be split and there is no other 1093 // associated cost. 1094 unsigned VParts1 = TTI->getNumberOfParts(VT1), 1095 VParts2 = TTI->getNumberOfParts(VT2); 1096 if (VParts1 > 1 || VParts2 > 1) 1097 return false; 1098 else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) 1099 return false; 1100 1101 CostSavings = ICost + JCost - VCost; 1102 } 1103 1104 // The powi,ctlz,cttz intrinsics are special because only the first 1105 // argument is vectorized, the second arguments must be equal. 1106 CallInst *CI = dyn_cast<CallInst>(I); 1107 Function *FI; 1108 if (CI && (FI = CI->getCalledFunction())) { 1109 Intrinsic::ID IID = FI->getIntrinsicID(); 1110 if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || 1111 IID == Intrinsic::cttz) { 1112 Value *A1I = CI->getArgOperand(1), 1113 *A1J = cast<CallInst>(J)->getArgOperand(1); 1114 const SCEV *A1ISCEV = SE->getSCEV(A1I), 1115 *A1JSCEV = SE->getSCEV(A1J); 1116 return (A1ISCEV == A1JSCEV); 1117 } 1118 1119 if (IID && TTI) { 1120 SmallVector<Type*, 4> Tys; 1121 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 1122 Tys.push_back(CI->getArgOperand(i)->getType()); 1123 unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys); 1124 1125 Tys.clear(); 1126 CallInst *CJ = cast<CallInst>(J); 1127 for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) 1128 Tys.push_back(CJ->getArgOperand(i)->getType()); 1129 unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys); 1130 1131 Tys.clear(); 1132 assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && 1133 "Intrinsic argument counts differ"); 1134 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 1135 if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || 1136 IID == Intrinsic::cttz) && i == 1) 1137 Tys.push_back(CI->getArgOperand(i)->getType()); 1138 else 1139 Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), 1140 CJ->getArgOperand(i)->getType())); 1141 } 1142 1143 Type *RetTy = getVecTypeForPair(IT1, JT1); 1144 unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys); 1145 1146 if (VCost > ICost + JCost) 1147 return false; 1148 1149 // We don't want to fuse to a type that will be split, even 1150 // if the two input types will also be split and there is no other 1151 // associated cost. 1152 unsigned RetParts = TTI->getNumberOfParts(RetTy); 1153 if (RetParts > 1) 1154 return false; 1155 else if (!RetParts && VCost == ICost + JCost) 1156 return false; 1157 1158 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 1159 if (!Tys[i]->isVectorTy()) 1160 continue; 1161 1162 unsigned NumParts = TTI->getNumberOfParts(Tys[i]); 1163 if (NumParts > 1) 1164 return false; 1165 else if (!NumParts && VCost == ICost + JCost) 1166 return false; 1167 } 1168 1169 CostSavings = ICost + JCost - VCost; 1170 } 1171 } 1172 1173 return true; 1174 } 1175 1176 // Figure out whether or not J uses I and update the users and write-set 1177 // structures associated with I. Specifically, Users represents the set of 1178 // instructions that depend on I. WriteSet represents the set 1179 // of memory locations that are dependent on I. If UpdateUsers is true, 1180 // and J uses I, then Users is updated to contain J and WriteSet is updated 1181 // to contain any memory locations to which J writes. The function returns 1182 // true if J uses I. By default, alias analysis is used to determine 1183 // whether J reads from memory that overlaps with a location in WriteSet. 1184 // If LoadMoveSet is not null, then it is a previously-computed map 1185 // where the key is the memory-based user instruction and the value is 1186 // the instruction to be compared with I. So, if LoadMoveSet is provided, 1187 // then the alias analysis is not used. This is necessary because this 1188 // function is called during the process of moving instructions during 1189 // vectorization and the results of the alias analysis are not stable during 1190 // that process. 1191 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 1192 AliasSetTracker &WriteSet, Instruction *I, 1193 Instruction *J, bool UpdateUsers, 1194 DenseSet<ValuePair> *LoadMoveSetPairs) { 1195 bool UsesI = false; 1196 1197 // This instruction may already be marked as a user due, for example, to 1198 // being a member of a selected pair. 1199 if (Users.count(J)) 1200 UsesI = true; 1201 1202 if (!UsesI) 1203 for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 1204 JU != JE; ++JU) { 1205 Value *V = *JU; 1206 if (I == V || Users.count(V)) { 1207 UsesI = true; 1208 break; 1209 } 1210 } 1211 if (!UsesI && J->mayReadFromMemory()) { 1212 if (LoadMoveSetPairs) { 1213 UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); 1214 } else { 1215 for (AliasSetTracker::iterator W = WriteSet.begin(), 1216 WE = WriteSet.end(); W != WE; ++W) { 1217 if (W->aliasesUnknownInst(J, *AA)) { 1218 UsesI = true; 1219 break; 1220 } 1221 } 1222 } 1223 } 1224 1225 if (UsesI && UpdateUsers) { 1226 if (J->mayWriteToMemory()) WriteSet.add(J); 1227 Users.insert(J); 1228 } 1229 1230 return UsesI; 1231 } 1232 1233 // This function iterates over all instruction pairs in the provided 1234 // basic block and collects all candidate pairs for vectorization. 1235 bool BBVectorize::getCandidatePairs(BasicBlock &BB, 1236 BasicBlock::iterator &Start, 1237 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1238 DenseSet<ValuePair> &FixedOrderPairs, 1239 DenseMap<ValuePair, int> &CandidatePairCostSavings, 1240 std::vector<Value *> &PairableInsts, bool NonPow2Len) { 1241 size_t TotalPairs = 0; 1242 BasicBlock::iterator E = BB.end(); 1243 if (Start == E) return false; 1244 1245 bool ShouldContinue = false, IAfterStart = false; 1246 for (BasicBlock::iterator I = Start++; I != E; ++I) { 1247 if (I == Start) IAfterStart = true; 1248 1249 bool IsSimpleLoadStore; 1250 if (!isInstVectorizable(&*I, IsSimpleLoadStore)) 1251 continue; 1252 1253 // Look for an instruction with which to pair instruction *I... 1254 DenseSet<Value *> Users; 1255 AliasSetTracker WriteSet(*AA); 1256 if (I->mayWriteToMemory()) 1257 WriteSet.add(&*I); 1258 1259 bool JAfterStart = IAfterStart; 1260 BasicBlock::iterator J = std::next(I); 1261 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { 1262 if (&*J == Start) 1263 JAfterStart = true; 1264 1265 // Determine if J uses I, if so, exit the loop. 1266 bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep); 1267 if (Config.FastDep) { 1268 // Note: For this heuristic to be effective, independent operations 1269 // must tend to be intermixed. This is likely to be true from some 1270 // kinds of grouped loop unrolling (but not the generic LLVM pass), 1271 // but otherwise may require some kind of reordering pass. 1272 1273 // When using fast dependency analysis, 1274 // stop searching after first use: 1275 if (UsesI) break; 1276 } else { 1277 if (UsesI) continue; 1278 } 1279 1280 // J does not use I, and comes before the first use of I, so it can be 1281 // merged with I if the instructions are compatible. 1282 int CostSavings, FixedOrder; 1283 if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len, 1284 CostSavings, FixedOrder)) 1285 continue; 1286 1287 // J is a candidate for merging with I. 1288 if (PairableInsts.empty() || 1289 PairableInsts[PairableInsts.size() - 1] != &*I) { 1290 PairableInsts.push_back(&*I); 1291 } 1292 1293 CandidatePairs[&*I].push_back(&*J); 1294 ++TotalPairs; 1295 if (TTI) 1296 CandidatePairCostSavings.insert( 1297 ValuePairWithCost(ValuePair(&*I, &*J), CostSavings)); 1298 1299 if (FixedOrder == 1) 1300 FixedOrderPairs.insert(ValuePair(&*I, &*J)); 1301 else if (FixedOrder == -1) 1302 FixedOrderPairs.insert(ValuePair(&*J, &*I)); 1303 1304 // The next call to this function must start after the last instruction 1305 // selected during this invocation. 1306 if (JAfterStart) { 1307 Start = std::next(J); 1308 IAfterStart = JAfterStart = false; 1309 } 1310 1311 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 1312 << *I << " <-> " << *J << " (cost savings: " << 1313 CostSavings << ")\n"); 1314 1315 // If we have already found too many pairs, break here and this function 1316 // will be called again starting after the last instruction selected 1317 // during this invocation. 1318 if (PairableInsts.size() >= Config.MaxInsts || 1319 TotalPairs >= Config.MaxPairs) { 1320 ShouldContinue = true; 1321 break; 1322 } 1323 } 1324 1325 if (ShouldContinue) 1326 break; 1327 } 1328 1329 DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 1330 << " instructions with candidate pairs\n"); 1331 1332 return ShouldContinue; 1333 } 1334 1335 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 1336 // it looks for pairs such that both members have an input which is an 1337 // output of PI or PJ. 1338 void BBVectorize::computePairsConnectedTo( 1339 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1340 DenseSet<ValuePair> &CandidatePairsSet, 1341 std::vector<Value *> &PairableInsts, 1342 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1343 DenseMap<VPPair, unsigned> &PairConnectionTypes, 1344 ValuePair P) { 1345 StoreInst *SI, *SJ; 1346 1347 // For each possible pairing for this variable, look at the uses of 1348 // the first value... 1349 for (Value::user_iterator I = P.first->user_begin(), 1350 E = P.first->user_end(); 1351 I != E; ++I) { 1352 User *UI = *I; 1353 if (isa<LoadInst>(UI)) { 1354 // A pair cannot be connected to a load because the load only takes one 1355 // operand (the address) and it is a scalar even after vectorization. 1356 continue; 1357 } else if ((SI = dyn_cast<StoreInst>(UI)) && 1358 P.first == SI->getPointerOperand()) { 1359 // Similarly, a pair cannot be connected to a store through its 1360 // pointer operand. 1361 continue; 1362 } 1363 1364 // For each use of the first variable, look for uses of the second 1365 // variable... 1366 for (User *UJ : P.second->users()) { 1367 if ((SJ = dyn_cast<StoreInst>(UJ)) && 1368 P.second == SJ->getPointerOperand()) 1369 continue; 1370 1371 // Look for <I, J>: 1372 if (CandidatePairsSet.count(ValuePair(UI, UJ))) { 1373 VPPair VP(P, ValuePair(UI, UJ)); 1374 ConnectedPairs[VP.first].push_back(VP.second); 1375 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); 1376 } 1377 1378 // Look for <J, I>: 1379 if (CandidatePairsSet.count(ValuePair(UJ, UI))) { 1380 VPPair VP(P, ValuePair(UJ, UI)); 1381 ConnectedPairs[VP.first].push_back(VP.second); 1382 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); 1383 } 1384 } 1385 1386 if (Config.SplatBreaksChain) continue; 1387 // Look for cases where just the first value in the pair is used by 1388 // both members of another pair (splatting). 1389 for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) { 1390 User *UJ = *J; 1391 if ((SJ = dyn_cast<StoreInst>(UJ)) && 1392 P.first == SJ->getPointerOperand()) 1393 continue; 1394 1395 if (CandidatePairsSet.count(ValuePair(UI, UJ))) { 1396 VPPair VP(P, ValuePair(UI, UJ)); 1397 ConnectedPairs[VP.first].push_back(VP.second); 1398 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 1399 } 1400 } 1401 } 1402 1403 if (Config.SplatBreaksChain) return; 1404 // Look for cases where just the second value in the pair is used by 1405 // both members of another pair (splatting). 1406 for (Value::user_iterator I = P.second->user_begin(), 1407 E = P.second->user_end(); 1408 I != E; ++I) { 1409 User *UI = *I; 1410 if (isa<LoadInst>(UI)) 1411 continue; 1412 else if ((SI = dyn_cast<StoreInst>(UI)) && 1413 P.second == SI->getPointerOperand()) 1414 continue; 1415 1416 for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) { 1417 User *UJ = *J; 1418 if ((SJ = dyn_cast<StoreInst>(UJ)) && 1419 P.second == SJ->getPointerOperand()) 1420 continue; 1421 1422 if (CandidatePairsSet.count(ValuePair(UI, UJ))) { 1423 VPPair VP(P, ValuePair(UI, UJ)); 1424 ConnectedPairs[VP.first].push_back(VP.second); 1425 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 1426 } 1427 } 1428 } 1429 } 1430 1431 // This function figures out which pairs are connected. Two pairs are 1432 // connected if some output of the first pair forms an input to both members 1433 // of the second pair. 1434 void BBVectorize::computeConnectedPairs( 1435 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1436 DenseSet<ValuePair> &CandidatePairsSet, 1437 std::vector<Value *> &PairableInsts, 1438 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1439 DenseMap<VPPair, unsigned> &PairConnectionTypes) { 1440 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 1441 PE = PairableInsts.end(); PI != PE; ++PI) { 1442 DenseMap<Value *, std::vector<Value *> >::iterator PP = 1443 CandidatePairs.find(*PI); 1444 if (PP == CandidatePairs.end()) 1445 continue; 1446 1447 for (std::vector<Value *>::iterator P = PP->second.begin(), 1448 E = PP->second.end(); P != E; ++P) 1449 computePairsConnectedTo(CandidatePairs, CandidatePairsSet, 1450 PairableInsts, ConnectedPairs, 1451 PairConnectionTypes, ValuePair(*PI, *P)); 1452 } 1453 1454 DEBUG(size_t TotalPairs = 0; 1455 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = 1456 ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) 1457 TotalPairs += I->second.size(); 1458 dbgs() << "BBV: found " << TotalPairs 1459 << " pair connections.\n"); 1460 } 1461 1462 // This function builds a set of use tuples such that <A, B> is in the set 1463 // if B is in the use dag of A. If B is in the use dag of A, then B 1464 // depends on the output of A. 1465 void BBVectorize::buildDepMap( 1466 BasicBlock &BB, 1467 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1468 std::vector<Value *> &PairableInsts, 1469 DenseSet<ValuePair> &PairableInstUsers) { 1470 DenseSet<Value *> IsInPair; 1471 for (DenseMap<Value *, std::vector<Value *> >::iterator C = 1472 CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { 1473 IsInPair.insert(C->first); 1474 IsInPair.insert(C->second.begin(), C->second.end()); 1475 } 1476 1477 // Iterate through the basic block, recording all users of each 1478 // pairable instruction. 1479 1480 BasicBlock::iterator E = BB.end(), EL = 1481 BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); 1482 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 1483 if (IsInPair.find(&*I) == IsInPair.end()) 1484 continue; 1485 1486 DenseSet<Value *> Users; 1487 AliasSetTracker WriteSet(*AA); 1488 if (I->mayWriteToMemory()) 1489 WriteSet.add(&*I); 1490 1491 for (BasicBlock::iterator J = std::next(I); J != E; ++J) { 1492 (void)trackUsesOfI(Users, WriteSet, &*I, &*J); 1493 1494 if (J == EL) 1495 break; 1496 } 1497 1498 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 1499 U != E; ++U) { 1500 if (IsInPair.find(*U) == IsInPair.end()) continue; 1501 PairableInstUsers.insert(ValuePair(&*I, *U)); 1502 } 1503 1504 if (I == EL) 1505 break; 1506 } 1507 } 1508 1509 // Returns true if an input to pair P is an output of pair Q and also an 1510 // input of pair Q is an output of pair P. If this is the case, then these 1511 // two pairs cannot be simultaneously fused. 1512 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 1513 DenseSet<ValuePair> &PairableInstUsers, 1514 DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, 1515 DenseSet<VPPair> *PairableInstUserPairSet) { 1516 // Two pairs are in conflict if they are mutual Users of eachother. 1517 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 1518 PairableInstUsers.count(ValuePair(P.first, Q.second)) || 1519 PairableInstUsers.count(ValuePair(P.second, Q.first)) || 1520 PairableInstUsers.count(ValuePair(P.second, Q.second)); 1521 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 1522 PairableInstUsers.count(ValuePair(Q.first, P.second)) || 1523 PairableInstUsers.count(ValuePair(Q.second, P.first)) || 1524 PairableInstUsers.count(ValuePair(Q.second, P.second)); 1525 if (PairableInstUserMap) { 1526 // FIXME: The expensive part of the cycle check is not so much the cycle 1527 // check itself but this edge insertion procedure. This needs some 1528 // profiling and probably a different data structure. 1529 if (PUsesQ) { 1530 if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) 1531 (*PairableInstUserMap)[Q].push_back(P); 1532 } 1533 if (QUsesP) { 1534 if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) 1535 (*PairableInstUserMap)[P].push_back(Q); 1536 } 1537 } 1538 1539 return (QUsesP && PUsesQ); 1540 } 1541 1542 // This function walks the use graph of current pairs to see if, starting 1543 // from P, the walk returns to P. 1544 bool BBVectorize::pairWillFormCycle(ValuePair P, 1545 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1546 DenseSet<ValuePair> &CurrentPairs) { 1547 DEBUG(if (DebugCycleCheck) 1548 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 1549 << *P.second << "\n"); 1550 // A lookup table of visisted pairs is kept because the PairableInstUserMap 1551 // contains non-direct associations. 1552 DenseSet<ValuePair> Visited; 1553 SmallVector<ValuePair, 32> Q; 1554 // General depth-first post-order traversal: 1555 Q.push_back(P); 1556 do { 1557 ValuePair QTop = Q.pop_back_val(); 1558 Visited.insert(QTop); 1559 1560 DEBUG(if (DebugCycleCheck) 1561 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 1562 << *QTop.second << "\n"); 1563 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1564 PairableInstUserMap.find(QTop); 1565 if (QQ == PairableInstUserMap.end()) 1566 continue; 1567 1568 for (std::vector<ValuePair>::iterator C = QQ->second.begin(), 1569 CE = QQ->second.end(); C != CE; ++C) { 1570 if (*C == P) { 1571 DEBUG(dbgs() 1572 << "BBV: rejected to prevent non-trivial cycle formation: " 1573 << QTop.first << " <-> " << C->second << "\n"); 1574 return true; 1575 } 1576 1577 if (CurrentPairs.count(*C) && !Visited.count(*C)) 1578 Q.push_back(*C); 1579 } 1580 } while (!Q.empty()); 1581 1582 return false; 1583 } 1584 1585 // This function builds the initial dag of connected pairs with the 1586 // pair J at the root. 1587 void BBVectorize::buildInitialDAGFor( 1588 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1589 DenseSet<ValuePair> &CandidatePairsSet, 1590 std::vector<Value *> &PairableInsts, 1591 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1592 DenseSet<ValuePair> &PairableInstUsers, 1593 DenseMap<Value *, Value *> &ChosenPairs, 1594 DenseMap<ValuePair, size_t> &DAG, ValuePair J) { 1595 // Each of these pairs is viewed as the root node of a DAG. The DAG 1596 // is then walked (depth-first). As this happens, we keep track of 1597 // the pairs that compose the DAG and the maximum depth of the DAG. 1598 SmallVector<ValuePairWithDepth, 32> Q; 1599 // General depth-first post-order traversal: 1600 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1601 do { 1602 ValuePairWithDepth QTop = Q.back(); 1603 1604 // Push each child onto the queue: 1605 bool MoreChildren = false; 1606 size_t MaxChildDepth = QTop.second; 1607 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1608 ConnectedPairs.find(QTop.first); 1609 if (QQ != ConnectedPairs.end()) 1610 for (std::vector<ValuePair>::iterator k = QQ->second.begin(), 1611 ke = QQ->second.end(); k != ke; ++k) { 1612 // Make sure that this child pair is still a candidate: 1613 if (CandidatePairsSet.count(*k)) { 1614 DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); 1615 if (C == DAG.end()) { 1616 size_t d = getDepthFactor(k->first); 1617 Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); 1618 MoreChildren = true; 1619 } else { 1620 MaxChildDepth = std::max(MaxChildDepth, C->second); 1621 } 1622 } 1623 } 1624 1625 if (!MoreChildren) { 1626 // Record the current pair as part of the DAG: 1627 DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 1628 Q.pop_back(); 1629 } 1630 } while (!Q.empty()); 1631 } 1632 1633 // Given some initial dag, prune it by removing conflicting pairs (pairs 1634 // that cannot be simultaneously chosen for vectorization). 1635 void BBVectorize::pruneDAGFor( 1636 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1637 std::vector<Value *> &PairableInsts, 1638 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1639 DenseSet<ValuePair> &PairableInstUsers, 1640 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1641 DenseSet<VPPair> &PairableInstUserPairSet, 1642 DenseMap<Value *, Value *> &ChosenPairs, 1643 DenseMap<ValuePair, size_t> &DAG, 1644 DenseSet<ValuePair> &PrunedDAG, ValuePair J, 1645 bool UseCycleCheck) { 1646 SmallVector<ValuePairWithDepth, 32> Q; 1647 // General depth-first post-order traversal: 1648 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 1649 do { 1650 ValuePairWithDepth QTop = Q.pop_back_val(); 1651 PrunedDAG.insert(QTop.first); 1652 1653 // Visit each child, pruning as necessary... 1654 SmallVector<ValuePairWithDepth, 8> BestChildren; 1655 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 1656 ConnectedPairs.find(QTop.first); 1657 if (QQ == ConnectedPairs.end()) 1658 continue; 1659 1660 for (std::vector<ValuePair>::iterator K = QQ->second.begin(), 1661 KE = QQ->second.end(); K != KE; ++K) { 1662 DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); 1663 if (C == DAG.end()) continue; 1664 1665 // This child is in the DAG, now we need to make sure it is the 1666 // best of any conflicting children. There could be multiple 1667 // conflicting children, so first, determine if we're keeping 1668 // this child, then delete conflicting children as necessary. 1669 1670 // It is also necessary to guard against pairing-induced 1671 // dependencies. Consider instructions a .. x .. y .. b 1672 // such that (a,b) are to be fused and (x,y) are to be fused 1673 // but a is an input to x and b is an output from y. This 1674 // means that y cannot be moved after b but x must be moved 1675 // after b for (a,b) to be fused. In other words, after 1676 // fusing (a,b) we have y .. a/b .. x where y is an input 1677 // to a/b and x is an output to a/b: x and y can no longer 1678 // be legally fused. To prevent this condition, we must 1679 // make sure that a child pair added to the DAG is not 1680 // both an input and output of an already-selected pair. 1681 1682 // Pairing-induced dependencies can also form from more complicated 1683 // cycles. The pair vs. pair conflicts are easy to check, and so 1684 // that is done explicitly for "fast rejection", and because for 1685 // child vs. child conflicts, we may prefer to keep the current 1686 // pair in preference to the already-selected child. 1687 DenseSet<ValuePair> CurrentPairs; 1688 1689 bool CanAdd = true; 1690 for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 1691 = BestChildren.begin(), E2 = BestChildren.end(); 1692 C2 != E2; ++C2) { 1693 if (C2->first.first == C->first.first || 1694 C2->first.first == C->first.second || 1695 C2->first.second == C->first.first || 1696 C2->first.second == C->first.second || 1697 pairsConflict(C2->first, C->first, PairableInstUsers, 1698 UseCycleCheck ? &PairableInstUserMap : nullptr, 1699 UseCycleCheck ? &PairableInstUserPairSet 1700 : nullptr)) { 1701 if (C2->second >= C->second) { 1702 CanAdd = false; 1703 break; 1704 } 1705 1706 CurrentPairs.insert(C2->first); 1707 } 1708 } 1709 if (!CanAdd) continue; 1710 1711 // Even worse, this child could conflict with another node already 1712 // selected for the DAG. If that is the case, ignore this child. 1713 for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), 1714 E2 = PrunedDAG.end(); T != E2; ++T) { 1715 if (T->first == C->first.first || 1716 T->first == C->first.second || 1717 T->second == C->first.first || 1718 T->second == C->first.second || 1719 pairsConflict(*T, C->first, PairableInstUsers, 1720 UseCycleCheck ? &PairableInstUserMap : nullptr, 1721 UseCycleCheck ? &PairableInstUserPairSet 1722 : nullptr)) { 1723 CanAdd = false; 1724 break; 1725 } 1726 1727 CurrentPairs.insert(*T); 1728 } 1729 if (!CanAdd) continue; 1730 1731 // And check the queue too... 1732 for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), 1733 E2 = Q.end(); C2 != E2; ++C2) { 1734 if (C2->first.first == C->first.first || 1735 C2->first.first == C->first.second || 1736 C2->first.second == C->first.first || 1737 C2->first.second == C->first.second || 1738 pairsConflict(C2->first, C->first, PairableInstUsers, 1739 UseCycleCheck ? &PairableInstUserMap : nullptr, 1740 UseCycleCheck ? &PairableInstUserPairSet 1741 : nullptr)) { 1742 CanAdd = false; 1743 break; 1744 } 1745 1746 CurrentPairs.insert(C2->first); 1747 } 1748 if (!CanAdd) continue; 1749 1750 // Last but not least, check for a conflict with any of the 1751 // already-chosen pairs. 1752 for (DenseMap<Value *, Value *>::iterator C2 = 1753 ChosenPairs.begin(), E2 = ChosenPairs.end(); 1754 C2 != E2; ++C2) { 1755 if (pairsConflict(*C2, C->first, PairableInstUsers, 1756 UseCycleCheck ? &PairableInstUserMap : nullptr, 1757 UseCycleCheck ? &PairableInstUserPairSet 1758 : nullptr)) { 1759 CanAdd = false; 1760 break; 1761 } 1762 1763 CurrentPairs.insert(*C2); 1764 } 1765 if (!CanAdd) continue; 1766 1767 // To check for non-trivial cycles formed by the addition of the 1768 // current pair we've formed a list of all relevant pairs, now use a 1769 // graph walk to check for a cycle. We start from the current pair and 1770 // walk the use dag to see if we again reach the current pair. If we 1771 // do, then the current pair is rejected. 1772 1773 // FIXME: It may be more efficient to use a topological-ordering 1774 // algorithm to improve the cycle check. This should be investigated. 1775 if (UseCycleCheck && 1776 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 1777 continue; 1778 1779 // This child can be added, but we may have chosen it in preference 1780 // to an already-selected child. Check for this here, and if a 1781 // conflict is found, then remove the previously-selected child 1782 // before adding this one in its place. 1783 for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 1784 = BestChildren.begin(); C2 != BestChildren.end();) { 1785 if (C2->first.first == C->first.first || 1786 C2->first.first == C->first.second || 1787 C2->first.second == C->first.first || 1788 C2->first.second == C->first.second || 1789 pairsConflict(C2->first, C->first, PairableInstUsers)) 1790 C2 = BestChildren.erase(C2); 1791 else 1792 ++C2; 1793 } 1794 1795 BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); 1796 } 1797 1798 for (SmallVectorImpl<ValuePairWithDepth>::iterator C 1799 = BestChildren.begin(), E2 = BestChildren.end(); 1800 C != E2; ++C) { 1801 size_t DepthF = getDepthFactor(C->first.first); 1802 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 1803 } 1804 } while (!Q.empty()); 1805 } 1806 1807 // This function finds the best dag of mututally-compatible connected 1808 // pairs, given the choice of root pairs as an iterator range. 1809 void BBVectorize::findBestDAGFor( 1810 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 1811 DenseSet<ValuePair> &CandidatePairsSet, 1812 DenseMap<ValuePair, int> &CandidatePairCostSavings, 1813 std::vector<Value *> &PairableInsts, 1814 DenseSet<ValuePair> &FixedOrderPairs, 1815 DenseMap<VPPair, unsigned> &PairConnectionTypes, 1816 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 1817 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 1818 DenseSet<ValuePair> &PairableInstUsers, 1819 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 1820 DenseSet<VPPair> &PairableInstUserPairSet, 1821 DenseMap<Value *, Value *> &ChosenPairs, 1822 DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, 1823 int &BestEffSize, Value *II, std::vector<Value *>&JJ, 1824 bool UseCycleCheck) { 1825 for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); 1826 J != JE; ++J) { 1827 ValuePair IJ(II, *J); 1828 if (!CandidatePairsSet.count(IJ)) 1829 continue; 1830 1831 // Before going any further, make sure that this pair does not 1832 // conflict with any already-selected pairs (see comment below 1833 // near the DAG pruning for more details). 1834 DenseSet<ValuePair> ChosenPairSet; 1835 bool DoesConflict = false; 1836 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 1837 E = ChosenPairs.end(); C != E; ++C) { 1838 if (pairsConflict(*C, IJ, PairableInstUsers, 1839 UseCycleCheck ? &PairableInstUserMap : nullptr, 1840 UseCycleCheck ? &PairableInstUserPairSet : nullptr)) { 1841 DoesConflict = true; 1842 break; 1843 } 1844 1845 ChosenPairSet.insert(*C); 1846 } 1847 if (DoesConflict) continue; 1848 1849 if (UseCycleCheck && 1850 pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) 1851 continue; 1852 1853 DenseMap<ValuePair, size_t> DAG; 1854 buildInitialDAGFor(CandidatePairs, CandidatePairsSet, 1855 PairableInsts, ConnectedPairs, 1856 PairableInstUsers, ChosenPairs, DAG, IJ); 1857 1858 // Because we'll keep the child with the largest depth, the largest 1859 // depth is still the same in the unpruned DAG. 1860 size_t MaxDepth = DAG.lookup(IJ); 1861 1862 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" 1863 << *IJ.first << " <-> " << *IJ.second << "} of depth " << 1864 MaxDepth << " and size " << DAG.size() << "\n"); 1865 1866 // At this point the DAG has been constructed, but, may contain 1867 // contradictory children (meaning that different children of 1868 // some dag node may be attempting to fuse the same instruction). 1869 // So now we walk the dag again, in the case of a conflict, 1870 // keep only the child with the largest depth. To break a tie, 1871 // favor the first child. 1872 1873 DenseSet<ValuePair> PrunedDAG; 1874 pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, 1875 PairableInstUsers, PairableInstUserMap, 1876 PairableInstUserPairSet, 1877 ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); 1878 1879 int EffSize = 0; 1880 if (TTI) { 1881 DenseSet<Value *> PrunedDAGInstrs; 1882 for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 1883 E = PrunedDAG.end(); S != E; ++S) { 1884 PrunedDAGInstrs.insert(S->first); 1885 PrunedDAGInstrs.insert(S->second); 1886 } 1887 1888 // The set of pairs that have already contributed to the total cost. 1889 DenseSet<ValuePair> IncomingPairs; 1890 1891 // If the cost model were perfect, this might not be necessary; but we 1892 // need to make sure that we don't get stuck vectorizing our own 1893 // shuffle chains. 1894 bool HasNontrivialInsts = false; 1895 1896 // The node weights represent the cost savings associated with 1897 // fusing the pair of instructions. 1898 for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 1899 E = PrunedDAG.end(); S != E; ++S) { 1900 if (!isa<ShuffleVectorInst>(S->first) && 1901 !isa<InsertElementInst>(S->first) && 1902 !isa<ExtractElementInst>(S->first)) 1903 HasNontrivialInsts = true; 1904 1905 bool FlipOrder = false; 1906 1907 if (getDepthFactor(S->first)) { 1908 int ESContrib = CandidatePairCostSavings.find(*S)->second; 1909 DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" 1910 << *S->first << " <-> " << *S->second << "} = " << 1911 ESContrib << "\n"); 1912 EffSize += ESContrib; 1913 } 1914 1915 // The edge weights contribute in a negative sense: they represent 1916 // the cost of shuffles. 1917 DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = 1918 ConnectedPairDeps.find(*S); 1919 if (SS != ConnectedPairDeps.end()) { 1920 unsigned NumDepsDirect = 0, NumDepsSwap = 0; 1921 for (std::vector<ValuePair>::iterator T = SS->second.begin(), 1922 TE = SS->second.end(); T != TE; ++T) { 1923 VPPair Q(*S, *T); 1924 if (!PrunedDAG.count(Q.second)) 1925 continue; 1926 DenseMap<VPPair, unsigned>::iterator R = 1927 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 1928 assert(R != PairConnectionTypes.end() && 1929 "Cannot find pair connection type"); 1930 if (R->second == PairConnectionDirect) 1931 ++NumDepsDirect; 1932 else if (R->second == PairConnectionSwap) 1933 ++NumDepsSwap; 1934 } 1935 1936 // If there are more swaps than direct connections, then 1937 // the pair order will be flipped during fusion. So the real 1938 // number of swaps is the minimum number. 1939 FlipOrder = !FixedOrderPairs.count(*S) && 1940 ((NumDepsSwap > NumDepsDirect) || 1941 FixedOrderPairs.count(ValuePair(S->second, S->first))); 1942 1943 for (std::vector<ValuePair>::iterator T = SS->second.begin(), 1944 TE = SS->second.end(); T != TE; ++T) { 1945 VPPair Q(*S, *T); 1946 if (!PrunedDAG.count(Q.second)) 1947 continue; 1948 DenseMap<VPPair, unsigned>::iterator R = 1949 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 1950 assert(R != PairConnectionTypes.end() && 1951 "Cannot find pair connection type"); 1952 Type *Ty1 = Q.second.first->getType(), 1953 *Ty2 = Q.second.second->getType(); 1954 Type *VTy = getVecTypeForPair(Ty1, Ty2); 1955 if ((R->second == PairConnectionDirect && FlipOrder) || 1956 (R->second == PairConnectionSwap && !FlipOrder) || 1957 R->second == PairConnectionSplat) { 1958 int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 1959 VTy, VTy); 1960 1961 if (VTy->getVectorNumElements() == 2) { 1962 if (R->second == PairConnectionSplat) 1963 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1964 TargetTransformInfo::SK_Broadcast, VTy)); 1965 else 1966 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 1967 TargetTransformInfo::SK_Reverse, VTy)); 1968 } 1969 1970 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 1971 *Q.second.first << " <-> " << *Q.second.second << 1972 "} -> {" << 1973 *S->first << " <-> " << *S->second << "} = " << 1974 ESContrib << "\n"); 1975 EffSize -= ESContrib; 1976 } 1977 } 1978 } 1979 1980 // Compute the cost of outgoing edges. We assume that edges outgoing 1981 // to shuffles, inserts or extracts can be merged, and so contribute 1982 // no additional cost. 1983 if (!S->first->getType()->isVoidTy()) { 1984 Type *Ty1 = S->first->getType(), 1985 *Ty2 = S->second->getType(); 1986 Type *VTy = getVecTypeForPair(Ty1, Ty2); 1987 1988 bool NeedsExtraction = false; 1989 for (User *U : S->first->users()) { 1990 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { 1991 // Shuffle can be folded if it has no other input 1992 if (isa<UndefValue>(SI->getOperand(1))) 1993 continue; 1994 } 1995 if (isa<ExtractElementInst>(U)) 1996 continue; 1997 if (PrunedDAGInstrs.count(U)) 1998 continue; 1999 NeedsExtraction = true; 2000 break; 2001 } 2002 2003 if (NeedsExtraction) { 2004 int ESContrib; 2005 if (Ty1->isVectorTy()) { 2006 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 2007 Ty1, VTy); 2008 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 2009 TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); 2010 } else 2011 ESContrib = (int) TTI->getVectorInstrCost( 2012 Instruction::ExtractElement, VTy, 0); 2013 2014 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 2015 *S->first << "} = " << ESContrib << "\n"); 2016 EffSize -= ESContrib; 2017 } 2018 2019 NeedsExtraction = false; 2020 for (User *U : S->second->users()) { 2021 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { 2022 // Shuffle can be folded if it has no other input 2023 if (isa<UndefValue>(SI->getOperand(1))) 2024 continue; 2025 } 2026 if (isa<ExtractElementInst>(U)) 2027 continue; 2028 if (PrunedDAGInstrs.count(U)) 2029 continue; 2030 NeedsExtraction = true; 2031 break; 2032 } 2033 2034 if (NeedsExtraction) { 2035 int ESContrib; 2036 if (Ty2->isVectorTy()) { 2037 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 2038 Ty2, VTy); 2039 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 2040 TargetTransformInfo::SK_ExtractSubvector, VTy, 2041 Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); 2042 } else 2043 ESContrib = (int) TTI->getVectorInstrCost( 2044 Instruction::ExtractElement, VTy, 1); 2045 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 2046 *S->second << "} = " << ESContrib << "\n"); 2047 EffSize -= ESContrib; 2048 } 2049 } 2050 2051 // Compute the cost of incoming edges. 2052 if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { 2053 Instruction *S1 = cast<Instruction>(S->first), 2054 *S2 = cast<Instruction>(S->second); 2055 for (unsigned o = 0; o < S1->getNumOperands(); ++o) { 2056 Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); 2057 2058 // Combining constants into vector constants (or small vector 2059 // constants into larger ones are assumed free). 2060 if (isa<Constant>(O1) && isa<Constant>(O2)) 2061 continue; 2062 2063 if (FlipOrder) 2064 std::swap(O1, O2); 2065 2066 ValuePair VP = ValuePair(O1, O2); 2067 ValuePair VPR = ValuePair(O2, O1); 2068 2069 // Internal edges are not handled here. 2070 if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) 2071 continue; 2072 2073 Type *Ty1 = O1->getType(), 2074 *Ty2 = O2->getType(); 2075 Type *VTy = getVecTypeForPair(Ty1, Ty2); 2076 2077 // Combining vector operations of the same type is also assumed 2078 // folded with other operations. 2079 if (Ty1 == Ty2) { 2080 // If both are insert elements, then both can be widened. 2081 InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), 2082 *IEO2 = dyn_cast<InsertElementInst>(O2); 2083 if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) 2084 continue; 2085 // If both are extract elements, and both have the same input 2086 // type, then they can be replaced with a shuffle 2087 ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), 2088 *EIO2 = dyn_cast<ExtractElementInst>(O2); 2089 if (EIO1 && EIO2 && 2090 EIO1->getOperand(0)->getType() == 2091 EIO2->getOperand(0)->getType()) 2092 continue; 2093 // If both are a shuffle with equal operand types and only two 2094 // unqiue operands, then they can be replaced with a single 2095 // shuffle 2096 ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), 2097 *SIO2 = dyn_cast<ShuffleVectorInst>(O2); 2098 if (SIO1 && SIO2 && 2099 SIO1->getOperand(0)->getType() == 2100 SIO2->getOperand(0)->getType()) { 2101 SmallSet<Value *, 4> SIOps; 2102 SIOps.insert(SIO1->getOperand(0)); 2103 SIOps.insert(SIO1->getOperand(1)); 2104 SIOps.insert(SIO2->getOperand(0)); 2105 SIOps.insert(SIO2->getOperand(1)); 2106 if (SIOps.size() <= 2) 2107 continue; 2108 } 2109 } 2110 2111 int ESContrib; 2112 // This pair has already been formed. 2113 if (IncomingPairs.count(VP)) { 2114 continue; 2115 } else if (IncomingPairs.count(VPR)) { 2116 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 2117 VTy, VTy); 2118 2119 if (VTy->getVectorNumElements() == 2) 2120 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 2121 TargetTransformInfo::SK_Reverse, VTy)); 2122 } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { 2123 ESContrib = (int) TTI->getVectorInstrCost( 2124 Instruction::InsertElement, VTy, 0); 2125 ESContrib += (int) TTI->getVectorInstrCost( 2126 Instruction::InsertElement, VTy, 1); 2127 } else if (!Ty1->isVectorTy()) { 2128 // O1 needs to be inserted into a vector of size O2, and then 2129 // both need to be shuffled together. 2130 ESContrib = (int) TTI->getVectorInstrCost( 2131 Instruction::InsertElement, Ty2, 0); 2132 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2133 VTy, Ty2); 2134 } else if (!Ty2->isVectorTy()) { 2135 // O2 needs to be inserted into a vector of size O1, and then 2136 // both need to be shuffled together. 2137 ESContrib = (int) TTI->getVectorInstrCost( 2138 Instruction::InsertElement, Ty1, 0); 2139 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2140 VTy, Ty1); 2141 } else { 2142 Type *TyBig = Ty1, *TySmall = Ty2; 2143 if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) 2144 std::swap(TyBig, TySmall); 2145 2146 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 2147 VTy, TyBig); 2148 if (TyBig != TySmall) 2149 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 2150 TyBig, TySmall); 2151 } 2152 2153 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" 2154 << *O1 << " <-> " << *O2 << "} = " << 2155 ESContrib << "\n"); 2156 EffSize -= ESContrib; 2157 IncomingPairs.insert(VP); 2158 } 2159 } 2160 } 2161 2162 if (!HasNontrivialInsts) { 2163 DEBUG(if (DebugPairSelection) dbgs() << 2164 "\tNo non-trivial instructions in DAG;" 2165 " override to zero effective size\n"); 2166 EffSize = 0; 2167 } 2168 } else { 2169 for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 2170 E = PrunedDAG.end(); S != E; ++S) 2171 EffSize += (int) getDepthFactor(S->first); 2172 } 2173 2174 DEBUG(if (DebugPairSelection) 2175 dbgs() << "BBV: found pruned DAG for pair {" 2176 << *IJ.first << " <-> " << *IJ.second << "} of depth " << 2177 MaxDepth << " and size " << PrunedDAG.size() << 2178 " (effective size: " << EffSize << ")\n"); 2179 if (((TTI && !UseChainDepthWithTI) || 2180 MaxDepth >= Config.ReqChainDepth) && 2181 EffSize > 0 && EffSize > BestEffSize) { 2182 BestMaxDepth = MaxDepth; 2183 BestEffSize = EffSize; 2184 BestDAG = PrunedDAG; 2185 } 2186 } 2187 } 2188 2189 // Given the list of candidate pairs, this function selects those 2190 // that will be fused into vector instructions. 2191 void BBVectorize::choosePairs( 2192 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 2193 DenseSet<ValuePair> &CandidatePairsSet, 2194 DenseMap<ValuePair, int> &CandidatePairCostSavings, 2195 std::vector<Value *> &PairableInsts, 2196 DenseSet<ValuePair> &FixedOrderPairs, 2197 DenseMap<VPPair, unsigned> &PairConnectionTypes, 2198 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 2199 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 2200 DenseSet<ValuePair> &PairableInstUsers, 2201 DenseMap<Value *, Value *>& ChosenPairs) { 2202 bool UseCycleCheck = 2203 CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; 2204 2205 DenseMap<Value *, std::vector<Value *> > CandidatePairs2; 2206 for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), 2207 E = CandidatePairsSet.end(); I != E; ++I) { 2208 std::vector<Value *> &JJ = CandidatePairs2[I->second]; 2209 if (JJ.empty()) JJ.reserve(32); 2210 JJ.push_back(I->first); 2211 } 2212 2213 DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; 2214 DenseSet<VPPair> PairableInstUserPairSet; 2215 for (std::vector<Value *>::iterator I = PairableInsts.begin(), 2216 E = PairableInsts.end(); I != E; ++I) { 2217 // The number of possible pairings for this variable: 2218 size_t NumChoices = CandidatePairs.lookup(*I).size(); 2219 if (!NumChoices) continue; 2220 2221 std::vector<Value *> &JJ = CandidatePairs[*I]; 2222 2223 // The best pair to choose and its dag: 2224 size_t BestMaxDepth = 0; 2225 int BestEffSize = 0; 2226 DenseSet<ValuePair> BestDAG; 2227 findBestDAGFor(CandidatePairs, CandidatePairsSet, 2228 CandidatePairCostSavings, 2229 PairableInsts, FixedOrderPairs, PairConnectionTypes, 2230 ConnectedPairs, ConnectedPairDeps, 2231 PairableInstUsers, PairableInstUserMap, 2232 PairableInstUserPairSet, ChosenPairs, 2233 BestDAG, BestMaxDepth, BestEffSize, *I, JJ, 2234 UseCycleCheck); 2235 2236 if (BestDAG.empty()) 2237 continue; 2238 2239 // A dag has been chosen (or not) at this point. If no dag was 2240 // chosen, then this instruction, I, cannot be paired (and is no longer 2241 // considered). 2242 2243 DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: " 2244 << *cast<Instruction>(*I) << "\n"); 2245 2246 for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), 2247 SE2 = BestDAG.end(); S != SE2; ++S) { 2248 // Insert the members of this dag into the list of chosen pairs. 2249 ChosenPairs.insert(ValuePair(S->first, S->second)); 2250 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 2251 *S->second << "\n"); 2252 2253 // Remove all candidate pairs that have values in the chosen dag. 2254 std::vector<Value *> &KK = CandidatePairs[S->first]; 2255 for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); 2256 K != KE; ++K) { 2257 if (*K == S->second) 2258 continue; 2259 2260 CandidatePairsSet.erase(ValuePair(S->first, *K)); 2261 } 2262 2263 std::vector<Value *> &LL = CandidatePairs2[S->second]; 2264 for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); 2265 L != LE; ++L) { 2266 if (*L == S->first) 2267 continue; 2268 2269 CandidatePairsSet.erase(ValuePair(*L, S->second)); 2270 } 2271 2272 std::vector<Value *> &MM = CandidatePairs[S->second]; 2273 for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); 2274 M != ME; ++M) { 2275 assert(*M != S->first && "Flipped pair in candidate list?"); 2276 CandidatePairsSet.erase(ValuePair(S->second, *M)); 2277 } 2278 2279 std::vector<Value *> &NN = CandidatePairs2[S->first]; 2280 for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); 2281 N != NE; ++N) { 2282 assert(*N != S->second && "Flipped pair in candidate list?"); 2283 CandidatePairsSet.erase(ValuePair(*N, S->first)); 2284 } 2285 } 2286 } 2287 2288 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 2289 } 2290 2291 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 2292 unsigned n = 0) { 2293 if (!I->hasName()) 2294 return ""; 2295 2296 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 2297 (n > 0 ? "." + utostr(n) : "")).str(); 2298 } 2299 2300 // Returns the value that is to be used as the pointer input to the vector 2301 // instruction that fuses I with J. 2302 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 2303 Instruction *I, Instruction *J, unsigned o) { 2304 Value *IPtr, *JPtr; 2305 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 2306 int64_t OffsetInElmts; 2307 2308 // Note: the analysis might fail here, that is why the pair order has 2309 // been precomputed (OffsetInElmts must be unused here). 2310 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 2311 IAddressSpace, JAddressSpace, 2312 OffsetInElmts, false); 2313 2314 // The pointer value is taken to be the one with the lowest offset. 2315 Value *VPtr = IPtr; 2316 2317 Type *ArgTypeI = IPtr->getType()->getPointerElementType(); 2318 Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); 2319 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2320 Type *VArgPtrType 2321 = PointerType::get(VArgType, 2322 IPtr->getType()->getPointerAddressSpace()); 2323 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 2324 /* insert before */ I); 2325 } 2326 2327 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 2328 unsigned MaskOffset, unsigned NumInElem, 2329 unsigned NumInElem1, unsigned IdxOffset, 2330 std::vector<Constant*> &Mask) { 2331 unsigned NumElem1 = J->getType()->getVectorNumElements(); 2332 for (unsigned v = 0; v < NumElem1; ++v) { 2333 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 2334 if (m < 0) { 2335 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 2336 } else { 2337 unsigned mm = m + (int) IdxOffset; 2338 if (m >= (int) NumInElem1) 2339 mm += (int) NumInElem; 2340 2341 Mask[v+MaskOffset] = 2342 ConstantInt::get(Type::getInt32Ty(Context), mm); 2343 } 2344 } 2345 } 2346 2347 // Returns the value that is to be used as the vector-shuffle mask to the 2348 // vector instruction that fuses I with J. 2349 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 2350 Instruction *I, Instruction *J) { 2351 // This is the shuffle mask. We need to append the second 2352 // mask to the first, and the numbers need to be adjusted. 2353 2354 Type *ArgTypeI = I->getType(); 2355 Type *ArgTypeJ = J->getType(); 2356 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2357 2358 unsigned NumElemI = ArgTypeI->getVectorNumElements(); 2359 2360 // Get the total number of elements in the fused vector type. 2361 // By definition, this must equal the number of elements in 2362 // the final mask. 2363 unsigned NumElem = VArgType->getVectorNumElements(); 2364 std::vector<Constant*> Mask(NumElem); 2365 2366 Type *OpTypeI = I->getOperand(0)->getType(); 2367 unsigned NumInElemI = OpTypeI->getVectorNumElements(); 2368 Type *OpTypeJ = J->getOperand(0)->getType(); 2369 unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); 2370 2371 // The fused vector will be: 2372 // ----------------------------------------------------- 2373 // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | 2374 // ----------------------------------------------------- 2375 // from which we'll extract NumElem total elements (where the first NumElemI 2376 // of them come from the mask in I and the remainder come from the mask 2377 // in J. 2378 2379 // For the mask from the first pair... 2380 fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, 2381 0, Mask); 2382 2383 // For the mask from the second pair... 2384 fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, 2385 NumInElemI, Mask); 2386 2387 return ConstantVector::get(Mask); 2388 } 2389 2390 bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, 2391 Instruction *J, unsigned o, Value *&LOp, 2392 unsigned numElemL, 2393 Type *ArgTypeL, Type *ArgTypeH, 2394 bool IBeforeJ, unsigned IdxOff) { 2395 bool ExpandedIEChain = false; 2396 if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { 2397 // If we have a pure insertelement chain, then this can be rewritten 2398 // into a chain that directly builds the larger type. 2399 if (isPureIEChain(LIE)) { 2400 SmallVector<Value *, 8> VectElemts(numElemL, 2401 UndefValue::get(ArgTypeL->getScalarType())); 2402 InsertElementInst *LIENext = LIE; 2403 do { 2404 unsigned Idx = 2405 cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); 2406 VectElemts[Idx] = LIENext->getOperand(1); 2407 } while ((LIENext = 2408 dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); 2409 2410 LIENext = nullptr; 2411 Value *LIEPrev = UndefValue::get(ArgTypeH); 2412 for (unsigned i = 0; i < numElemL; ++i) { 2413 if (isa<UndefValue>(VectElemts[i])) continue; 2414 LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], 2415 ConstantInt::get(Type::getInt32Ty(Context), 2416 i + IdxOff), 2417 getReplacementName(IBeforeJ ? I : J, 2418 true, o, i+1)); 2419 LIENext->insertBefore(IBeforeJ ? J : I); 2420 LIEPrev = LIENext; 2421 } 2422 2423 LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); 2424 ExpandedIEChain = true; 2425 } 2426 } 2427 2428 return ExpandedIEChain; 2429 } 2430 2431 static unsigned getNumScalarElements(Type *Ty) { 2432 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) 2433 return VecTy->getNumElements(); 2434 return 1; 2435 } 2436 2437 // Returns the value to be used as the specified operand of the vector 2438 // instruction that fuses I with J. 2439 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 2440 Instruction *J, unsigned o, bool IBeforeJ) { 2441 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 2442 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 2443 2444 // Compute the fused vector type for this operand 2445 Type *ArgTypeI = I->getOperand(o)->getType(); 2446 Type *ArgTypeJ = J->getOperand(o)->getType(); 2447 VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2448 2449 Instruction *L = I, *H = J; 2450 Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; 2451 2452 unsigned numElemL = getNumScalarElements(ArgTypeL); 2453 unsigned numElemH = getNumScalarElements(ArgTypeH); 2454 2455 Value *LOp = L->getOperand(o); 2456 Value *HOp = H->getOperand(o); 2457 unsigned numElem = VArgType->getNumElements(); 2458 2459 // First, we check if we can reuse the "original" vector outputs (if these 2460 // exist). We might need a shuffle. 2461 ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); 2462 ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); 2463 ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); 2464 ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); 2465 2466 // FIXME: If we're fusing shuffle instructions, then we can't apply this 2467 // optimization. The input vectors to the shuffle might be a different 2468 // length from the shuffle outputs. Unfortunately, the replacement 2469 // shuffle mask has already been formed, and the mask entries are sensitive 2470 // to the sizes of the inputs. 2471 bool IsSizeChangeShuffle = 2472 isa<ShuffleVectorInst>(L) && 2473 (LOp->getType() != L->getType() || HOp->getType() != H->getType()); 2474 2475 if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { 2476 // We can have at most two unique vector inputs. 2477 bool CanUseInputs = true; 2478 Value *I1, *I2 = nullptr; 2479 if (LEE) { 2480 I1 = LEE->getOperand(0); 2481 } else { 2482 I1 = LSV->getOperand(0); 2483 I2 = LSV->getOperand(1); 2484 if (I2 == I1 || isa<UndefValue>(I2)) 2485 I2 = nullptr; 2486 } 2487 2488 if (HEE) { 2489 Value *I3 = HEE->getOperand(0); 2490 if (!I2 && I3 != I1) 2491 I2 = I3; 2492 else if (I3 != I1 && I3 != I2) 2493 CanUseInputs = false; 2494 } else { 2495 Value *I3 = HSV->getOperand(0); 2496 if (!I2 && I3 != I1) 2497 I2 = I3; 2498 else if (I3 != I1 && I3 != I2) 2499 CanUseInputs = false; 2500 2501 if (CanUseInputs) { 2502 Value *I4 = HSV->getOperand(1); 2503 if (!isa<UndefValue>(I4)) { 2504 if (!I2 && I4 != I1) 2505 I2 = I4; 2506 else if (I4 != I1 && I4 != I2) 2507 CanUseInputs = false; 2508 } 2509 } 2510 } 2511 2512 if (CanUseInputs) { 2513 unsigned LOpElem = 2514 cast<Instruction>(LOp)->getOperand(0)->getType() 2515 ->getVectorNumElements(); 2516 2517 unsigned HOpElem = 2518 cast<Instruction>(HOp)->getOperand(0)->getType() 2519 ->getVectorNumElements(); 2520 2521 // We have one or two input vectors. We need to map each index of the 2522 // operands to the index of the original vector. 2523 SmallVector<std::pair<int, int>, 8> II(numElem); 2524 for (unsigned i = 0; i < numElemL; ++i) { 2525 int Idx, INum; 2526 if (LEE) { 2527 Idx = 2528 cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); 2529 INum = LEE->getOperand(0) == I1 ? 0 : 1; 2530 } else { 2531 Idx = LSV->getMaskValue(i); 2532 if (Idx < (int) LOpElem) { 2533 INum = LSV->getOperand(0) == I1 ? 0 : 1; 2534 } else { 2535 Idx -= LOpElem; 2536 INum = LSV->getOperand(1) == I1 ? 0 : 1; 2537 } 2538 } 2539 2540 II[i] = std::pair<int, int>(Idx, INum); 2541 } 2542 for (unsigned i = 0; i < numElemH; ++i) { 2543 int Idx, INum; 2544 if (HEE) { 2545 Idx = 2546 cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); 2547 INum = HEE->getOperand(0) == I1 ? 0 : 1; 2548 } else { 2549 Idx = HSV->getMaskValue(i); 2550 if (Idx < (int) HOpElem) { 2551 INum = HSV->getOperand(0) == I1 ? 0 : 1; 2552 } else { 2553 Idx -= HOpElem; 2554 INum = HSV->getOperand(1) == I1 ? 0 : 1; 2555 } 2556 } 2557 2558 II[i + numElemL] = std::pair<int, int>(Idx, INum); 2559 } 2560 2561 // We now have an array which tells us from which index of which 2562 // input vector each element of the operand comes. 2563 VectorType *I1T = cast<VectorType>(I1->getType()); 2564 unsigned I1Elem = I1T->getNumElements(); 2565 2566 if (!I2) { 2567 // In this case there is only one underlying vector input. Check for 2568 // the trivial case where we can use the input directly. 2569 if (I1Elem == numElem) { 2570 bool ElemInOrder = true; 2571 for (unsigned i = 0; i < numElem; ++i) { 2572 if (II[i].first != (int) i && II[i].first != -1) { 2573 ElemInOrder = false; 2574 break; 2575 } 2576 } 2577 2578 if (ElemInOrder) 2579 return I1; 2580 } 2581 2582 // A shuffle is needed. 2583 std::vector<Constant *> Mask(numElem); 2584 for (unsigned i = 0; i < numElem; ++i) { 2585 int Idx = II[i].first; 2586 if (Idx == -1) 2587 Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); 2588 else 2589 Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2590 } 2591 2592 Instruction *S = 2593 new ShuffleVectorInst(I1, UndefValue::get(I1T), 2594 ConstantVector::get(Mask), 2595 getReplacementName(IBeforeJ ? I : J, 2596 true, o)); 2597 S->insertBefore(IBeforeJ ? J : I); 2598 return S; 2599 } 2600 2601 VectorType *I2T = cast<VectorType>(I2->getType()); 2602 unsigned I2Elem = I2T->getNumElements(); 2603 2604 // This input comes from two distinct vectors. The first step is to 2605 // make sure that both vectors are the same length. If not, the 2606 // smaller one will need to grow before they can be shuffled together. 2607 if (I1Elem < I2Elem) { 2608 std::vector<Constant *> Mask(I2Elem); 2609 unsigned v = 0; 2610 for (; v < I1Elem; ++v) 2611 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2612 for (; v < I2Elem; ++v) 2613 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2614 2615 Instruction *NewI1 = 2616 new ShuffleVectorInst(I1, UndefValue::get(I1T), 2617 ConstantVector::get(Mask), 2618 getReplacementName(IBeforeJ ? I : J, 2619 true, o, 1)); 2620 NewI1->insertBefore(IBeforeJ ? J : I); 2621 I1 = NewI1; 2622 I1Elem = I2Elem; 2623 } else if (I1Elem > I2Elem) { 2624 std::vector<Constant *> Mask(I1Elem); 2625 unsigned v = 0; 2626 for (; v < I2Elem; ++v) 2627 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2628 for (; v < I1Elem; ++v) 2629 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2630 2631 Instruction *NewI2 = 2632 new ShuffleVectorInst(I2, UndefValue::get(I2T), 2633 ConstantVector::get(Mask), 2634 getReplacementName(IBeforeJ ? I : J, 2635 true, o, 1)); 2636 NewI2->insertBefore(IBeforeJ ? J : I); 2637 I2 = NewI2; 2638 } 2639 2640 // Now that both I1 and I2 are the same length we can shuffle them 2641 // together (and use the result). 2642 std::vector<Constant *> Mask(numElem); 2643 for (unsigned v = 0; v < numElem; ++v) { 2644 if (II[v].first == -1) { 2645 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2646 } else { 2647 int Idx = II[v].first + II[v].second * I1Elem; 2648 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2649 } 2650 } 2651 2652 Instruction *NewOp = 2653 new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), 2654 getReplacementName(IBeforeJ ? I : J, true, o)); 2655 NewOp->insertBefore(IBeforeJ ? J : I); 2656 return NewOp; 2657 } 2658 } 2659 2660 Type *ArgType = ArgTypeL; 2661 if (numElemL < numElemH) { 2662 if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, 2663 ArgTypeL, VArgType, IBeforeJ, 1)) { 2664 // This is another short-circuit case: we're combining a scalar into 2665 // a vector that is formed by an IE chain. We've just expanded the IE 2666 // chain, now insert the scalar and we're done. 2667 2668 Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, 2669 getReplacementName(IBeforeJ ? I : J, true, o)); 2670 S->insertBefore(IBeforeJ ? J : I); 2671 return S; 2672 } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, 2673 ArgTypeH, IBeforeJ)) { 2674 // The two vector inputs to the shuffle must be the same length, 2675 // so extend the smaller vector to be the same length as the larger one. 2676 Instruction *NLOp; 2677 if (numElemL > 1) { 2678 2679 std::vector<Constant *> Mask(numElemH); 2680 unsigned v = 0; 2681 for (; v < numElemL; ++v) 2682 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2683 for (; v < numElemH; ++v) 2684 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2685 2686 NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), 2687 ConstantVector::get(Mask), 2688 getReplacementName(IBeforeJ ? I : J, 2689 true, o, 1)); 2690 } else { 2691 NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, 2692 getReplacementName(IBeforeJ ? I : J, 2693 true, o, 1)); 2694 } 2695 2696 NLOp->insertBefore(IBeforeJ ? J : I); 2697 LOp = NLOp; 2698 } 2699 2700 ArgType = ArgTypeH; 2701 } else if (numElemL > numElemH) { 2702 if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, 2703 ArgTypeH, VArgType, IBeforeJ)) { 2704 Instruction *S = 2705 InsertElementInst::Create(LOp, HOp, 2706 ConstantInt::get(Type::getInt32Ty(Context), 2707 numElemL), 2708 getReplacementName(IBeforeJ ? I : J, 2709 true, o)); 2710 S->insertBefore(IBeforeJ ? J : I); 2711 return S; 2712 } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, 2713 ArgTypeL, IBeforeJ)) { 2714 Instruction *NHOp; 2715 if (numElemH > 1) { 2716 std::vector<Constant *> Mask(numElemL); 2717 unsigned v = 0; 2718 for (; v < numElemH; ++v) 2719 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2720 for (; v < numElemL; ++v) 2721 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 2722 2723 NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), 2724 ConstantVector::get(Mask), 2725 getReplacementName(IBeforeJ ? I : J, 2726 true, o, 1)); 2727 } else { 2728 NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, 2729 getReplacementName(IBeforeJ ? I : J, 2730 true, o, 1)); 2731 } 2732 2733 NHOp->insertBefore(IBeforeJ ? J : I); 2734 HOp = NHOp; 2735 } 2736 } 2737 2738 if (ArgType->isVectorTy()) { 2739 unsigned numElem = VArgType->getVectorNumElements(); 2740 std::vector<Constant*> Mask(numElem); 2741 for (unsigned v = 0; v < numElem; ++v) { 2742 unsigned Idx = v; 2743 // If the low vector was expanded, we need to skip the extra 2744 // undefined entries. 2745 if (v >= numElemL && numElemH > numElemL) 2746 Idx += (numElemH - numElemL); 2747 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 2748 } 2749 2750 Instruction *BV = new ShuffleVectorInst(LOp, HOp, 2751 ConstantVector::get(Mask), 2752 getReplacementName(IBeforeJ ? I : J, true, o)); 2753 BV->insertBefore(IBeforeJ ? J : I); 2754 return BV; 2755 } 2756 2757 Instruction *BV1 = InsertElementInst::Create( 2758 UndefValue::get(VArgType), LOp, CV0, 2759 getReplacementName(IBeforeJ ? I : J, 2760 true, o, 1)); 2761 BV1->insertBefore(IBeforeJ ? J : I); 2762 Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, 2763 getReplacementName(IBeforeJ ? I : J, 2764 true, o, 2)); 2765 BV2->insertBefore(IBeforeJ ? J : I); 2766 return BV2; 2767 } 2768 2769 // This function creates an array of values that will be used as the inputs 2770 // to the vector instruction that fuses I with J. 2771 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 2772 Instruction *I, Instruction *J, 2773 SmallVectorImpl<Value *> &ReplacedOperands, 2774 bool IBeforeJ) { 2775 unsigned NumOperands = I->getNumOperands(); 2776 2777 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 2778 // Iterate backward so that we look at the store pointer 2779 // first and know whether or not we need to flip the inputs. 2780 2781 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 2782 // This is the pointer for a load/store instruction. 2783 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); 2784 continue; 2785 } else if (isa<CallInst>(I)) { 2786 Function *F = cast<CallInst>(I)->getCalledFunction(); 2787 Intrinsic::ID IID = F->getIntrinsicID(); 2788 if (o == NumOperands-1) { 2789 BasicBlock &BB = *I->getParent(); 2790 2791 Module *M = BB.getParent()->getParent(); 2792 Type *ArgTypeI = I->getType(); 2793 Type *ArgTypeJ = J->getType(); 2794 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 2795 2796 ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); 2797 continue; 2798 } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || 2799 IID == Intrinsic::cttz) && o == 1) { 2800 // The second argument of powi/ctlz/cttz is a single integer/constant 2801 // and we've already checked that both arguments are equal. 2802 // As a result, we just keep I's second argument. 2803 ReplacedOperands[o] = I->getOperand(o); 2804 continue; 2805 } 2806 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 2807 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 2808 continue; 2809 } 2810 2811 ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); 2812 } 2813 } 2814 2815 // This function creates two values that represent the outputs of the 2816 // original I and J instructions. These are generally vector shuffles 2817 // or extracts. In many cases, these will end up being unused and, thus, 2818 // eliminated by later passes. 2819 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 2820 Instruction *J, Instruction *K, 2821 Instruction *&InsertionPt, 2822 Instruction *&K1, Instruction *&K2) { 2823 if (isa<StoreInst>(I)) 2824 return; 2825 2826 Type *IType = I->getType(); 2827 Type *JType = J->getType(); 2828 2829 VectorType *VType = getVecTypeForPair(IType, JType); 2830 unsigned numElem = VType->getNumElements(); 2831 2832 unsigned numElemI = getNumScalarElements(IType); 2833 unsigned numElemJ = getNumScalarElements(JType); 2834 2835 if (IType->isVectorTy()) { 2836 std::vector<Constant *> Mask1(numElemI), Mask2(numElemI); 2837 for (unsigned v = 0; v < numElemI; ++v) { 2838 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2839 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v); 2840 } 2841 2842 K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 2843 ConstantVector::get(Mask1), 2844 getReplacementName(K, false, 1)); 2845 } else { 2846 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 2847 K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); 2848 } 2849 2850 if (JType->isVectorTy()) { 2851 std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ); 2852 for (unsigned v = 0; v < numElemJ; ++v) { 2853 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 2854 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v); 2855 } 2856 2857 K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 2858 ConstantVector::get(Mask2), 2859 getReplacementName(K, false, 2)); 2860 } else { 2861 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1); 2862 K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); 2863 } 2864 2865 K1->insertAfter(K); 2866 K2->insertAfter(K1); 2867 InsertionPt = K2; 2868 } 2869 2870 // Move all uses of the function I (including pairing-induced uses) after J. 2871 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 2872 DenseSet<ValuePair> &LoadMoveSetPairs, 2873 Instruction *I, Instruction *J) { 2874 // Skip to the first instruction past I. 2875 BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); 2876 2877 DenseSet<Value *> Users; 2878 AliasSetTracker WriteSet(*AA); 2879 if (I->mayWriteToMemory()) WriteSet.add(I); 2880 2881 for (; cast<Instruction>(L) != J; ++L) 2882 (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs); 2883 2884 assert(cast<Instruction>(L) == J && 2885 "Tracking has not proceeded far enough to check for dependencies"); 2886 // If J is now in the use set of I, then trackUsesOfI will return true 2887 // and we have a dependency cycle (and the fusing operation must abort). 2888 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); 2889 } 2890 2891 // Move all uses of the function I (including pairing-induced uses) after J. 2892 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 2893 DenseSet<ValuePair> &LoadMoveSetPairs, 2894 Instruction *&InsertionPt, 2895 Instruction *I, Instruction *J) { 2896 // Skip to the first instruction past I. 2897 BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); 2898 2899 DenseSet<Value *> Users; 2900 AliasSetTracker WriteSet(*AA); 2901 if (I->mayWriteToMemory()) WriteSet.add(I); 2902 2903 for (; cast<Instruction>(L) != J;) { 2904 if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) { 2905 // Move this instruction 2906 Instruction *InstToMove = &*L++; 2907 2908 DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 2909 " to after " << *InsertionPt << "\n"); 2910 InstToMove->removeFromParent(); 2911 InstToMove->insertAfter(InsertionPt); 2912 InsertionPt = InstToMove; 2913 } else { 2914 ++L; 2915 } 2916 } 2917 } 2918 2919 // Collect all load instruction that are in the move set of a given first 2920 // pair member. These loads depend on the first instruction, I, and so need 2921 // to be moved after J (the second instruction) when the pair is fused. 2922 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 2923 DenseMap<Value *, Value *> &ChosenPairs, 2924 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 2925 DenseSet<ValuePair> &LoadMoveSetPairs, 2926 Instruction *I) { 2927 // Skip to the first instruction past I. 2928 BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); 2929 2930 DenseSet<Value *> Users; 2931 AliasSetTracker WriteSet(*AA); 2932 if (I->mayWriteToMemory()) WriteSet.add(I); 2933 2934 // Note: We cannot end the loop when we reach J because J could be moved 2935 // farther down the use chain by another instruction pairing. Also, J 2936 // could be before I if this is an inverted input. 2937 for (BasicBlock::iterator E = BB.end(); L != E; ++L) { 2938 if (trackUsesOfI(Users, WriteSet, I, &*L)) { 2939 if (L->mayReadFromMemory()) { 2940 LoadMoveSet[&*L].push_back(I); 2941 LoadMoveSetPairs.insert(ValuePair(&*L, I)); 2942 } 2943 } 2944 } 2945 } 2946 2947 // In cases where both load/stores and the computation of their pointers 2948 // are chosen for vectorization, we can end up in a situation where the 2949 // aliasing analysis starts returning different query results as the 2950 // process of fusing instruction pairs continues. Because the algorithm 2951 // relies on finding the same use dags here as were found earlier, we'll 2952 // need to precompute the necessary aliasing information here and then 2953 // manually update it during the fusion process. 2954 void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 2955 std::vector<Value *> &PairableInsts, 2956 DenseMap<Value *, Value *> &ChosenPairs, 2957 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 2958 DenseSet<ValuePair> &LoadMoveSetPairs) { 2959 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 2960 PIE = PairableInsts.end(); PI != PIE; ++PI) { 2961 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 2962 if (P == ChosenPairs.end()) continue; 2963 2964 Instruction *I = cast<Instruction>(P->first); 2965 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, 2966 LoadMoveSetPairs, I); 2967 } 2968 } 2969 2970 // This function fuses the chosen instruction pairs into vector instructions, 2971 // taking care preserve any needed scalar outputs and, then, it reorders the 2972 // remaining instructions as needed (users of the first member of the pair 2973 // need to be moved to after the location of the second member of the pair 2974 // because the vector instruction is inserted in the location of the pair's 2975 // second member). 2976 void BBVectorize::fuseChosenPairs(BasicBlock &BB, 2977 std::vector<Value *> &PairableInsts, 2978 DenseMap<Value *, Value *> &ChosenPairs, 2979 DenseSet<ValuePair> &FixedOrderPairs, 2980 DenseMap<VPPair, unsigned> &PairConnectionTypes, 2981 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 2982 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { 2983 LLVMContext& Context = BB.getContext(); 2984 2985 // During the vectorization process, the order of the pairs to be fused 2986 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 2987 // list. After a pair is fused, the flipped pair is removed from the list. 2988 DenseSet<ValuePair> FlippedPairs; 2989 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 2990 E = ChosenPairs.end(); P != E; ++P) 2991 FlippedPairs.insert(ValuePair(P->second, P->first)); 2992 for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), 2993 E = FlippedPairs.end(); P != E; ++P) 2994 ChosenPairs.insert(*P); 2995 2996 DenseMap<Value *, std::vector<Value *> > LoadMoveSet; 2997 DenseSet<ValuePair> LoadMoveSetPairs; 2998 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, 2999 LoadMoveSet, LoadMoveSetPairs); 3000 3001 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 3002 3003 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 3004 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI); 3005 if (P == ChosenPairs.end()) { 3006 ++PI; 3007 continue; 3008 } 3009 3010 if (getDepthFactor(P->first) == 0) { 3011 // These instructions are not really fused, but are tracked as though 3012 // they are. Any case in which it would be interesting to fuse them 3013 // will be taken care of by InstCombine. 3014 --NumFusedOps; 3015 ++PI; 3016 continue; 3017 } 3018 3019 Instruction *I = cast<Instruction>(P->first), 3020 *J = cast<Instruction>(P->second); 3021 3022 DEBUG(dbgs() << "BBV: fusing: " << *I << 3023 " <-> " << *J << "\n"); 3024 3025 // Remove the pair and flipped pair from the list. 3026 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 3027 assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 3028 ChosenPairs.erase(FP); 3029 ChosenPairs.erase(P); 3030 3031 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { 3032 DEBUG(dbgs() << "BBV: fusion of: " << *I << 3033 " <-> " << *J << 3034 " aborted because of non-trivial dependency cycle\n"); 3035 --NumFusedOps; 3036 ++PI; 3037 continue; 3038 } 3039 3040 // If the pair must have the other order, then flip it. 3041 bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); 3042 if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { 3043 // This pair does not have a fixed order, and so we might want to 3044 // flip it if that will yield fewer shuffles. We count the number 3045 // of dependencies connected via swaps, and those directly connected, 3046 // and flip the order if the number of swaps is greater. 3047 bool OrigOrder = true; 3048 DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = 3049 ConnectedPairDeps.find(ValuePair(I, J)); 3050 if (IJ == ConnectedPairDeps.end()) { 3051 IJ = ConnectedPairDeps.find(ValuePair(J, I)); 3052 OrigOrder = false; 3053 } 3054 3055 if (IJ != ConnectedPairDeps.end()) { 3056 unsigned NumDepsDirect = 0, NumDepsSwap = 0; 3057 for (std::vector<ValuePair>::iterator T = IJ->second.begin(), 3058 TE = IJ->second.end(); T != TE; ++T) { 3059 VPPair Q(IJ->first, *T); 3060 DenseMap<VPPair, unsigned>::iterator R = 3061 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 3062 assert(R != PairConnectionTypes.end() && 3063 "Cannot find pair connection type"); 3064 if (R->second == PairConnectionDirect) 3065 ++NumDepsDirect; 3066 else if (R->second == PairConnectionSwap) 3067 ++NumDepsSwap; 3068 } 3069 3070 if (!OrigOrder) 3071 std::swap(NumDepsDirect, NumDepsSwap); 3072 3073 if (NumDepsSwap > NumDepsDirect) { 3074 FlipPairOrder = true; 3075 DEBUG(dbgs() << "BBV: reordering pair: " << *I << 3076 " <-> " << *J << "\n"); 3077 } 3078 } 3079 } 3080 3081 Instruction *L = I, *H = J; 3082 if (FlipPairOrder) 3083 std::swap(H, L); 3084 3085 // If the pair being fused uses the opposite order from that in the pair 3086 // connection map, then we need to flip the types. 3087 DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = 3088 ConnectedPairs.find(ValuePair(H, L)); 3089 if (HL != ConnectedPairs.end()) 3090 for (std::vector<ValuePair>::iterator T = HL->second.begin(), 3091 TE = HL->second.end(); T != TE; ++T) { 3092 VPPair Q(HL->first, *T); 3093 DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); 3094 assert(R != PairConnectionTypes.end() && 3095 "Cannot find pair connection type"); 3096 if (R->second == PairConnectionDirect) 3097 R->second = PairConnectionSwap; 3098 else if (R->second == PairConnectionSwap) 3099 R->second = PairConnectionDirect; 3100 } 3101 3102 bool LBeforeH = !FlipPairOrder; 3103 unsigned NumOperands = I->getNumOperands(); 3104 SmallVector<Value *, 3> ReplacedOperands(NumOperands); 3105 getReplacementInputsForPair(Context, L, H, ReplacedOperands, 3106 LBeforeH); 3107 3108 // Make a copy of the original operation, change its type to the vector 3109 // type and replace its operands with the vector operands. 3110 Instruction *K = L->clone(); 3111 if (L->hasName()) 3112 K->takeName(L); 3113 else if (H->hasName()) 3114 K->takeName(H); 3115 3116 if (auto CS = CallSite(K)) { 3117 SmallVector<Type *, 3> Tys; 3118 FunctionType *Old = CS.getFunctionType(); 3119 unsigned NumOld = Old->getNumParams(); 3120 assert(NumOld <= ReplacedOperands.size()); 3121 for (unsigned i = 0; i != NumOld; ++i) 3122 Tys.push_back(ReplacedOperands[i]->getType()); 3123 CS.mutateFunctionType( 3124 FunctionType::get(getVecTypeForPair(L->getType(), H->getType()), 3125 Tys, Old->isVarArg())); 3126 } else if (!isa<StoreInst>(K)) 3127 K->mutateType(getVecTypeForPair(L->getType(), H->getType())); 3128 3129 unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, 3130 LLVMContext::MD_noalias, LLVMContext::MD_fpmath, 3131 LLVMContext::MD_invariant_group}; 3132 combineMetadata(K, H, KnownIDs); 3133 K->intersectOptionalDataWith(H); 3134 3135 for (unsigned o = 0; o < NumOperands; ++o) 3136 K->setOperand(o, ReplacedOperands[o]); 3137 3138 K->insertAfter(J); 3139 3140 // Instruction insertion point: 3141 Instruction *InsertionPt = K; 3142 Instruction *K1 = nullptr, *K2 = nullptr; 3143 replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); 3144 3145 // The use dag of the first original instruction must be moved to after 3146 // the location of the second instruction. The entire use dag of the 3147 // first instruction is disjoint from the input dag of the second 3148 // (by definition), and so commutes with it. 3149 3150 moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); 3151 3152 if (!isa<StoreInst>(I)) { 3153 L->replaceAllUsesWith(K1); 3154 H->replaceAllUsesWith(K2); 3155 } 3156 3157 // Instructions that may read from memory may be in the load move set. 3158 // Once an instruction is fused, we no longer need its move set, and so 3159 // the values of the map never need to be updated. However, when a load 3160 // is fused, we need to merge the entries from both instructions in the 3161 // pair in case those instructions were in the move set of some other 3162 // yet-to-be-fused pair. The loads in question are the keys of the map. 3163 if (I->mayReadFromMemory()) { 3164 std::vector<ValuePair> NewSetMembers; 3165 DenseMap<Value *, std::vector<Value *> >::iterator II = 3166 LoadMoveSet.find(I); 3167 if (II != LoadMoveSet.end()) 3168 for (std::vector<Value *>::iterator N = II->second.begin(), 3169 NE = II->second.end(); N != NE; ++N) 3170 NewSetMembers.push_back(ValuePair(K, *N)); 3171 DenseMap<Value *, std::vector<Value *> >::iterator JJ = 3172 LoadMoveSet.find(J); 3173 if (JJ != LoadMoveSet.end()) 3174 for (std::vector<Value *>::iterator N = JJ->second.begin(), 3175 NE = JJ->second.end(); N != NE; ++N) 3176 NewSetMembers.push_back(ValuePair(K, *N)); 3177 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 3178 AE = NewSetMembers.end(); A != AE; ++A) { 3179 LoadMoveSet[A->first].push_back(A->second); 3180 LoadMoveSetPairs.insert(*A); 3181 } 3182 } 3183 3184 // Before removing I, set the iterator to the next instruction. 3185 PI = std::next(BasicBlock::iterator(I)); 3186 if (cast<Instruction>(PI) == J) 3187 ++PI; 3188 3189 SE->forgetValue(I); 3190 SE->forgetValue(J); 3191 I->eraseFromParent(); 3192 J->eraseFromParent(); 3193 3194 DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << 3195 BB << "\n"); 3196 } 3197 3198 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 3199 } 3200 } 3201 3202 char BBVectorize::ID = 0; 3203 static const char bb_vectorize_name[] = "Basic-Block Vectorization"; 3204 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 3205 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 3206 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 3207 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 3208 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 3209 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 3210 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 3211 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 3212 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) 3213 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 3214 3215 BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { 3216 return new BBVectorize(C); 3217 } 3218 3219 bool 3220 llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { 3221 BBVectorize BBVectorizer(P, *BB.getParent(), C); 3222 return BBVectorizer.vectorizeBB(BB); 3223 } 3224 3225 //===----------------------------------------------------------------------===// 3226 VectorizeConfig::VectorizeConfig() { 3227 VectorBits = ::VectorBits; 3228 VectorizeBools = !::NoBools; 3229 VectorizeInts = !::NoInts; 3230 VectorizeFloats = !::NoFloats; 3231 VectorizePointers = !::NoPointers; 3232 VectorizeCasts = !::NoCasts; 3233 VectorizeMath = !::NoMath; 3234 VectorizeBitManipulations = !::NoBitManipulation; 3235 VectorizeFMA = !::NoFMA; 3236 VectorizeSelect = !::NoSelect; 3237 VectorizeCmp = !::NoCmp; 3238 VectorizeGEP = !::NoGEP; 3239 VectorizeMemOps = !::NoMemOps; 3240 AlignedOnly = ::AlignedOnly; 3241 ReqChainDepth= ::ReqChainDepth; 3242 SearchLimit = ::SearchLimit; 3243 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; 3244 SplatBreaksChain = ::SplatBreaksChain; 3245 MaxInsts = ::MaxInsts; 3246 MaxPairs = ::MaxPairs; 3247 MaxIter = ::MaxIter; 3248 Pow2LenOnly = ::Pow2LenOnly; 3249 NoMemOpBoost = ::NoMemOpBoost; 3250 FastDep = ::FastDep; 3251 } 3252