      1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This pass eliminates allocas by either converting them into vectors or
     11 // by migrating them to local address space.
     12 //
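         //
         // As a rough illustration (hypothetical IR, not taken from a real test
         // case), a private array such as
         //
         //   %tmp = alloca [4 x i32]
         //
         // may either be rewritten in terms of a <4 x i32> vector (which can then
         // be kept in registers), or be replaced by an LDS (addrspace(3)) array
         // sized for the maximum work-group size and indexed by the linearized
         // workitem id.
         //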
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "AMDGPU.h"
     16 #include "AMDGPUSubtarget.h"
     17 #include "llvm/Analysis/ValueTracking.h"
     18 #include "llvm/IR/IRBuilder.h"
     19 #include "llvm/IR/IntrinsicInst.h"
     20 #include "llvm/IR/MDBuilder.h"
     21 #include "llvm/Support/Debug.h"
     22 #include "llvm/Support/raw_ostream.h"
     23 
     24 #define DEBUG_TYPE "amdgpu-promote-alloca"
     25 
     26 using namespace llvm;
     27 
     28 namespace {
     29 
      30 // FIXME: This can create globals, so it should be a module pass.
     31 class AMDGPUPromoteAlloca : public FunctionPass {
     32 private:
     33   const TargetMachine *TM;
     34   Module *Mod;
     35   const DataLayout *DL;
     36   MDNode *MaxWorkGroupSizeRange;
     37 
     38   // FIXME: This should be per-kernel.
     39   uint32_t LocalMemLimit;
     40   uint32_t CurrentLocalMemUsage;
     41 
     42   bool IsAMDGCN;
     43   bool IsAMDHSA;
     44 
     45   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
     46   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
     47 
     48   /// BaseAlloca is the alloca root the search started from.
     49   /// Val may be that alloca or a recursive user of it.
     50   bool collectUsesWithPtrTypes(Value *BaseAlloca,
     51                                Value *Val,
     52                                std::vector<Value*> &WorkList) const;
     53 
      54   /// Val is a pointer derived from Alloca. OpIdx0/OpIdx1 are the operand
      55   /// indices of an instruction with 2 pointer inputs (e.g. select, icmp).
      56   /// Returns true if the other pointer operand is null or derived from the
      57   /// same alloca. Val must be the same value as one of UseInst's operands.
     58   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
     59                                        Instruction *UseInst,
     60                                        int OpIdx0, int OpIdx1) const;
     61 
     62 public:
     63   static char ID;
     64 
     65   AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
     66     FunctionPass(ID),
     67     TM(TM_),
     68     Mod(nullptr),
     69     DL(nullptr),
     70     MaxWorkGroupSizeRange(nullptr),
     71     LocalMemLimit(0),
     72     CurrentLocalMemUsage(0),
     73     IsAMDGCN(false),
     74     IsAMDHSA(false) { }
     75 
     76   bool doInitialization(Module &M) override;
     77   bool runOnFunction(Function &F) override;
     78 
     79   const char *getPassName() const override {
     80     return "AMDGPU Promote Alloca";
     81   }
     82 
     83   void handleAlloca(AllocaInst &I);
     84 
     85   void getAnalysisUsage(AnalysisUsage &AU) const override {
     86     AU.setPreservesCFG();
     87     FunctionPass::getAnalysisUsage(AU);
     88   }
     89 };
     90 
     91 } // End anonymous namespace
     92 
     93 char AMDGPUPromoteAlloca::ID = 0;
     94 
     95 INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
     96                    "AMDGPU promote alloca to vector or LDS", false, false)
     97 
     98 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
     99 
    100 
    101 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
    102   if (!TM)
    103     return false;
    104 
    105   Mod = &M;
    106   DL = &Mod->getDataLayout();
    107 
     108   // Range metadata for the maximum workitem id and workgroup size.
     109   //
     110   // FIXME: Should get this from the subtarget. The runtime-enforced maximum
     111   // is usually 256.
    112   MDBuilder MDB(Mod->getContext());
    113   MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
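           // Note that range metadata is half-open, so this asserts the annotated
           // values lie in [0, 2048).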
    114 
    115   const Triple &TT = TM->getTargetTriple();
    116 
    117   IsAMDGCN = TT.getArch() == Triple::amdgcn;
    118   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
    119 
    120   return false;
    121 }
    122 
    123 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
    124   if (!TM || skipFunction(F))
    125     return false;
    126 
    127   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
    128   if (!ST.isPromoteAllocaEnabled())
    129     return false;
    130 
    131   FunctionType *FTy = F.getFunctionType();
    132 
    133   // If the function has any arguments in the local address space, then it's
    134   // possible these arguments require the entire local memory space, so
    135   // we cannot use local memory in the pass.
    136   for (Type *ParamTy : FTy->params()) {
    137     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    138     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    139       LocalMemLimit = 0;
    140       DEBUG(dbgs() << "Function has local memory argument. Promoting to "
    141                       "local memory disabled.\n");
    142       return false;
    143     }
    144   }
    145 
    146   LocalMemLimit = ST.getLocalMemorySize();
    147   if (LocalMemLimit == 0)
    148     return false;
    149 
    150   const DataLayout &DL = Mod->getDataLayout();
    151 
    152   // Check how much local memory is being used by global objects
    153   CurrentLocalMemUsage = 0;
    154   for (GlobalVariable &GV : Mod->globals()) {
    155     if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
    156       continue;
    157 
    158     for (const User *U : GV.users()) {
    159       const Instruction *Use = dyn_cast<Instruction>(U);
    160       if (!Use)
    161         continue;
    162 
    163       if (Use->getParent()->getParent() == &F) {
    164         unsigned Align = GV.getAlignment();
    165         if (Align == 0)
    166           Align = DL.getABITypeAlignment(GV.getValueType());
    167 
    168         // FIXME: Try to account for padding here. The padding is currently
    169         // determined from the inverse order of uses in the function. I'm not
    170         // sure if the use list order is in any way connected to this, so the
    171         // total reported size is likely incorrect.
    172         uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
    173         CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
    174         CurrentLocalMemUsage += AllocSize;
    175         break;
    176       }
    177     }
    178   }
    179 
    180   unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
    181 
    182   // Restrict local memory usage so that we don't drastically reduce occupancy,
    183   // unless it is already significantly reduced.
    184 
    185   // TODO: Have some sort of hint or other heuristics to guess occupancy based
     186   // on other factors.
    187   unsigned OccupancyHint
    188     = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0);
    189   if (OccupancyHint == 0)
    190     OccupancyHint = 7;
    191 
    192   // Clamp to max value.
    193   OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());
    194 
    195   // Check the hint but ignore it if it's obviously wrong from the existing LDS
    196   // usage.
    197   MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
    198 
    199 
    200   // Round up to the next tier of usage.
    201   unsigned MaxSizeWithWaveCount
    202     = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
    203 
     204   // The program is possibly broken: it already uses more local memory than is available.
    205   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    206     return false;
    207 
    208   LocalMemLimit = MaxSizeWithWaveCount;
    209 
    210   DEBUG(
    211     dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
    212     << "  Rounding size to " << MaxSizeWithWaveCount
    213     << " with a maximum occupancy of " << MaxOccupancy << '\n'
    214     << " and " << (LocalMemLimit - CurrentLocalMemUsage)
    215     << " available for promotion\n"
    216   );
    217 
    218   BasicBlock &EntryBB = *F.begin();
    219   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
    220     AllocaInst *AI = dyn_cast<AllocaInst>(I);
    221 
    222     ++I;
    223     if (AI)
    224       handleAlloca(*AI);
    225   }
    226 
    227   return true;
    228 }
    229 
    230 std::pair<Value *, Value *>
    231 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
    232   if (!IsAMDHSA) {
    233     Function *LocalSizeYFn
    234       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    235     Function *LocalSizeZFn
    236       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
    237 
    238     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    239     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
    240 
    241     LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    242     LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    243 
    244     return std::make_pair(LocalSizeY, LocalSizeZ);
    245   }
    246 
    247   // We must read the size out of the dispatch pointer.
    248   assert(IsAMDGCN);
    249 
    250   // We are indexing into this struct, and want to extract the workgroup_size_*
    251   // fields.
    252   //
    253   //   typedef struct hsa_kernel_dispatch_packet_s {
    254   //     uint16_t header;
    255   //     uint16_t setup;
     256   //     uint16_t workgroup_size_x;
    257   //     uint16_t workgroup_size_y;
    258   //     uint16_t workgroup_size_z;
    259   //     uint16_t reserved0;
     260   //     uint32_t grid_size_x;
     261   //     uint32_t grid_size_y;
    262   //     uint32_t grid_size_z;
    263   //
    264   //     uint32_t private_segment_size;
    265   //     uint32_t group_segment_size;
    266   //     uint64_t kernel_object;
    267   //
    268   // #ifdef HSA_LARGE_MODEL
    269   //     void *kernarg_address;
    270   // #elif defined HSA_LITTLE_ENDIAN
    271   //     void *kernarg_address;
    272   //     uint32_t reserved1;
    273   // #else
    274   //     uint32_t reserved1;
    275   //     void *kernarg_address;
    276   // #endif
    277   //     uint64_t reserved2;
    278   //     hsa_signal_t completion_signal; // uint64_t wrapper
    279   //   } hsa_kernel_dispatch_packet_t
    280   //
    281   Function *DispatchPtrFn
    282     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
    283 
    284   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
    285   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
    286   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
    287 
    288   // Size of the dispatch packet struct.
    289   DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
    290 
    291   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
    292   Value *CastDispatchPtr = Builder.CreateBitCast(
    293     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
    294 
     295   // We could do a single 64-bit load here, but it's likely that the basic
     296   // 32-bit load and extract sequence is already present, and it is probably
     297   // easier to CSE this. The loads should be mergeable later anyway.
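           // i32 index 1 of the packet covers workgroup_size_x (low 16 bits) and
           // workgroup_size_y (high 16 bits); i32 index 2 covers workgroup_size_z
           // and the zero reserved0 field.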
    298   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
    299   LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
    300 
    301   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
    302   LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
    303 
    304   MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
    305   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
    306   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
    307   LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    308 
    309   // Extract y component. Upper half of LoadZU should be zero already.
    310   Value *Y = Builder.CreateLShr(LoadXY, 16);
    311 
    312   return std::make_pair(Y, LoadZU);
    313 }
    314 
    315 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
    316   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
    317 
    318   switch (N) {
    319   case 0:
    320     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
    321       : Intrinsic::r600_read_tidig_x;
    322     break;
    323   case 1:
    324     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
    325       : Intrinsic::r600_read_tidig_y;
    326     break;
    327 
    328   case 2:
    329     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
    330       : Intrinsic::r600_read_tidig_z;
    331     break;
    332   default:
    333     llvm_unreachable("invalid dimension");
    334   }
    335 
    336   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
    337   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
    338   CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    339 
    340   return CI;
    341 }
    342 
    343 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
    344   return VectorType::get(ArrayTy->getArrayElementType(),
    345                          ArrayTy->getArrayNumElements());
    346 }
    347 
    348 static Value *
    349 calculateVectorIndex(Value *Ptr,
    350                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
    351   if (isa<AllocaInst>(Ptr))
    352     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
    353 
    354   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
    355 
    356   auto I = GEPIdx.find(GEP);
    357   return I == GEPIdx.end() ? nullptr : I->second;
    358 }
    359 
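         // Returns the per-element index of a GEP into the alloca, e.g. for the
         // (hypothetical) access
         //   %p = getelementptr inbounds [4 x i32], [4 x i32]* %alloca, i32 0, i32 %i
         // the result is %i. Returns null for any other GEP shape.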
    360 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
     361   // FIXME: We only support simple cases.
     362   if (GEP->getNumOperands() != 3)
     363     return nullptr;
     364 
     365   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
     366   if (!I0 || !I0->isZero())
     367     return nullptr;
    368 
    369   return GEP->getOperand(2);
    370 }
    371 
     372 // Returns true if Inst is an instruction that the vector promotion below
     372 // knows how to rewrite.
    373 //
    374 // TODO: Check isTriviallyVectorizable for calls and handle other
    375 // instructions.
    376 static bool canVectorizeInst(Instruction *Inst, User *User) {
    377   switch (Inst->getOpcode()) {
    378   case Instruction::Load:
    379   case Instruction::BitCast:
    380   case Instruction::AddrSpaceCast:
    381     return true;
    382   case Instruction::Store: {
     383     // The pointer must be the store's address operand, not the value being stored.
    384     StoreInst *SI = cast<StoreInst>(Inst);
    385     return SI->getPointerOperand() == User;
    386   }
    387   default:
    388     return false;
    389   }
    390 }
    391 
    392 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
    393   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
    394 
    395   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
    396 
     397   // FIXME: There is no reason why we can't support larger arrays; we
     398   // are just being conservative for now.
    399   if (!AllocaTy ||
    400       AllocaTy->getElementType()->isVectorTy() ||
    401       AllocaTy->getNumElements() > 4) {
    402     DEBUG(dbgs() << "  Cannot convert type to vector\n");
    403     return false;
    404   }
    405 
    406   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
    407   std::vector<Value*> WorkList;
    408   for (User *AllocaUser : Alloca->users()) {
    409     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    410     if (!GEP) {
    411       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
    412         return false;
    413 
    414       WorkList.push_back(AllocaUser);
    415       continue;
    416     }
    417 
    418     Value *Index = GEPToVectorIndex(GEP);
    419 
    420     // If we can't compute a vector index from this GEP, then we can't
    421     // promote this alloca to vector.
    422     if (!Index) {
    423       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
    424       return false;
    425     }
    426 
    427     GEPVectorIdx[GEP] = Index;
    428     for (User *GEPUser : AllocaUser->users()) {
    429       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
    430         return false;
    431 
    432       WorkList.push_back(GEPUser);
    433     }
    434   }
    435 
    436   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
    437 
    438   DEBUG(dbgs() << "  Converting alloca to vector "
    439         << *AllocaTy << " -> " << *VectorTy << '\n');
    440 
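           // As a sketch (hypothetical IR), a load through a GEP such as
           //   %p = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i32 %i
           //   %v = load i32, i32* %p
           // is rewritten into a whole-vector load plus an extractelement:
           //   %vec = load <4 x i32>, <4 x i32>* %cast
           //   %v   = extractelement <4 x i32> %vec, i32 %i
           // and stores are handled analogously with insertelement.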
    441   for (Value *V : WorkList) {
    442     Instruction *Inst = cast<Instruction>(V);
    443     IRBuilder<> Builder(Inst);
    444     switch (Inst->getOpcode()) {
    445     case Instruction::Load: {
    446       Value *Ptr = Inst->getOperand(0);
    447       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
    448       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
    449       Value *VecValue = Builder.CreateLoad(BitCast);
    450       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
    451       Inst->replaceAllUsesWith(ExtractElement);
    452       Inst->eraseFromParent();
    453       break;
    454     }
    455     case Instruction::Store: {
    456       Value *Ptr = Inst->getOperand(1);
    457       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
    458       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
    459       Value *VecValue = Builder.CreateLoad(BitCast);
    460       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
    461                                                        Inst->getOperand(0),
    462                                                        Index);
    463       Builder.CreateStore(NewVecValue, BitCast);
    464       Inst->eraseFromParent();
    465       break;
    466     }
    467     case Instruction::BitCast:
    468     case Instruction::AddrSpaceCast:
    469       break;
    470 
    471     default:
    472       Inst->dump();
    473       llvm_unreachable("Inconsistency in instructions promotable to vector");
    474     }
    475   }
    476   return true;
    477 }
    478 
    479 static bool isCallPromotable(CallInst *CI) {
    480   // TODO: We might be able to handle some cases where the callee is a
    481   // constantexpr bitcast of a function.
    482   if (!CI->getCalledFunction())
    483     return false;
    484 
    485   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
    486   if (!II)
    487     return false;
    488 
    489   switch (II->getIntrinsicID()) {
    490   case Intrinsic::memcpy:
    491   case Intrinsic::memmove:
    492   case Intrinsic::memset:
    493   case Intrinsic::lifetime_start:
    494   case Intrinsic::lifetime_end:
    495   case Intrinsic::invariant_start:
    496   case Intrinsic::invariant_end:
    497   case Intrinsic::invariant_group_barrier:
    498   case Intrinsic::objectsize:
    499     return true;
    500   default:
    501     return false;
    502   }
    503 }
    504 
    505 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
    506                                                           Value *Val,
    507                                                           Instruction *Inst,
    508                                                           int OpIdx0,
    509                                                           int OpIdx1) const {
    510   // Figure out which operand is the one we might not be promoting.
    511   Value *OtherOp = Inst->getOperand(OpIdx0);
    512   if (Val == OtherOp)
    513     OtherOp = Inst->getOperand(OpIdx1);
    514 
    515   if (isa<ConstantPointerNull>(OtherOp))
    516     return true;
    517 
    518   Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
    519   if (!isa<AllocaInst>(OtherObj))
    520     return false;
    521 
    522   // TODO: We should be able to replace undefs with the right pointer type.
    523 
    524   // TODO: If we know the other base object is another promotable
    525   // alloca, not necessarily this alloca, we can do this. The
    526   // important part is both must have the same address space at
    527   // the end.
    528   if (OtherObj != BaseAlloca) {
    529     DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
    530     return false;
    531   }
    532 
    533   return true;
    534 }
    535 
    536 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
    537   Value *BaseAlloca,
    538   Value *Val,
    539   std::vector<Value*> &WorkList) const {
    540 
    541   for (User *User : Val->users()) {
    542     if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
    543       continue;
    544 
    545     if (CallInst *CI = dyn_cast<CallInst>(User)) {
    546       if (!isCallPromotable(CI))
    547         return false;
    548 
    549       WorkList.push_back(User);
    550       continue;
    551     }
    552 
    553     Instruction *UseInst = cast<Instruction>(User);
    554     if (UseInst->getOpcode() == Instruction::PtrToInt)
    555       return false;
    556 
    557     if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
    558       if (LI->isVolatile())
    559         return false;
    560 
    561       continue;
    562     }
    563 
    564     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
    565       if (SI->isVolatile())
    566         return false;
    567 
     568       // Reject if the pointer is the value being stored, not the store's address.
    569       if (SI->getPointerOperand() != Val)
    570         return false;
    571     } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
    572       if (RMW->isVolatile())
    573         return false;
    574     } else if (AtomicCmpXchgInst *CAS
    575                = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
    576       if (CAS->isVolatile())
    577         return false;
    578     }
    579 
     580     // Only promote an icmp if we know the other pointer operand is null or
     581     // derived from the same promotable alloca.
    582     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
    583       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
    584         return false;
    585 
    586       // May need to rewrite constant operands.
    587       WorkList.push_back(ICmp);
    588     }
    589 
    590     if (!User->getType()->isPointerTy())
    591       continue;
    592 
    593     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
    594       // Be conservative if an address could be computed outside the bounds of
    595       // the alloca.
    596       if (!GEP->isInBounds())
    597         return false;
    598     }
    599 
    600     // Only promote a select if we know that the other select operand is from
    601     // another pointer that will also be promoted.
    602     if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
    603       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
    604         return false;
    605     }
    606 
    607     // Repeat for phis.
    608     if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
    609       // TODO: Handle more complex cases. We should be able to replace loops
    610       // over arrays.
    611       switch (Phi->getNumIncomingValues()) {
    612       case 1:
    613         break;
    614       case 2:
    615         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
    616           return false;
    617         break;
    618       default:
    619         return false;
    620       }
    621     }
    622 
    623     WorkList.push_back(User);
    624     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
    625       return false;
    626   }
    627 
    628   return true;
    629 }
    630 
     631 // FIXME: Should try to pick the allocas most likely to be profitable first.
    632 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
    633   // Array allocations are probably not worth handling, since an allocation of
    634   // the array type is the canonical form.
    635   if (!I.isStaticAlloca() || I.isArrayAllocation())
    636     return;
    637 
    638   IRBuilder<> Builder(&I);
    639 
    640   // First try to replace the alloca with a vector
    641   Type *AllocaTy = I.getAllocatedType();
    642 
    643   DEBUG(dbgs() << "Trying to promote " << I << '\n');
    644 
    645   if (tryPromoteAllocaToVector(&I)) {
     646     DEBUG(dbgs() << "  alloca was vectorized.\n");
    647     return;
    648   }
    649 
    650   const Function &ContainingFunction = *I.getParent()->getParent();
    651 
    652   // FIXME: We should also try to get this value from the reqd_work_group_size
    653   // function attribute if it is available.
    654   unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
    655 
    656   const DataLayout &DL = Mod->getDataLayout();
    657 
    658   unsigned Align = I.getAlignment();
    659   if (Align == 0)
    660     Align = DL.getABITypeAlignment(I.getAllocatedType());
    661 
    662   // FIXME: This computed padding is likely wrong since it depends on inverse
    663   // usage order.
    664   //
    665   // FIXME: It is also possible that if we're allowed to use all of the memory
     666   // we could end up using more than the maximum due to alignment padding.
    667 
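           // For example (hypothetical numbers), promoting a [4 x i32] alloca with a
           // 256-workitem workgroup reserves 256 * 16 = 4096 bytes of LDS.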
    668   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
    669   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
    670   NewSize += AllocSize;
    671 
    672   if (NewSize > LocalMemLimit) {
    673     DEBUG(dbgs() << "  " << AllocSize
    674           << " bytes of local memory not available to promote\n");
    675     return;
    676   }
    677 
    678   CurrentLocalMemUsage = NewSize;
    679 
    680   std::vector<Value*> WorkList;
    681 
    682   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    683     DEBUG(dbgs() << " Do not know how to convert all uses\n");
    684     return;
    685   }
    686 
    687   DEBUG(dbgs() << "Promoting alloca to local memory\n");
    688 
    689   Function *F = I.getParent()->getParent();
    690 
    691   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
    692   GlobalVariable *GV = new GlobalVariable(
    693       *Mod, GVTy, false, GlobalValue::InternalLinkage,
    694       UndefValue::get(GVTy),
    695       Twine(F->getName()) + Twine('.') + I.getName(),
    696       nullptr,
    697       GlobalVariable::NotThreadLocal,
    698       AMDGPUAS::LOCAL_ADDRESS);
    699   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
    700   GV->setAlignment(I.getAlignment());
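           // The result is roughly (hypothetical example, a [4 x i32] alloca named
           // %buf in kernel @foo with a 256-workitem workgroup):
           //   @foo.buf = internal unnamed_addr addrspace(3) global
           //              [256 x [4 x i32]] undef, align 4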
    701 
    702   Value *TCntY, *TCntZ;
    703 
    704   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
    705   Value *TIdX = getWorkitemID(Builder, 0);
    706   Value *TIdY = getWorkitemID(Builder, 1);
    707   Value *TIdZ = getWorkitemID(Builder, 2);
    708 
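           // Linearize the 3-D workitem id:
           //   TID = tidX * ntidY * ntidZ + tidY * ntidZ + tidZ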
    709   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
    710   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
    711   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
    712   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
    713   TID = Builder.CreateAdd(TID, TIdZ);
    714 
    715   Value *Indices[] = {
    716     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    717     TID
    718   };
    719 
    720   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
    721   I.mutateType(Offset->getType());
    722   I.replaceAllUsesWith(Offset);
    723   I.eraseFromParent();
    724 
    725   for (Value *V : WorkList) {
    726     CallInst *Call = dyn_cast<CallInst>(V);
    727     if (!Call) {
    728       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
    729         Value *Src0 = CI->getOperand(0);
    730         Type *EltTy = Src0->getType()->getPointerElementType();
    731         PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
    732 
    733         if (isa<ConstantPointerNull>(CI->getOperand(0)))
    734           CI->setOperand(0, ConstantPointerNull::get(NewTy));
    735 
    736         if (isa<ConstantPointerNull>(CI->getOperand(1)))
    737           CI->setOperand(1, ConstantPointerNull::get(NewTy));
    738 
    739         continue;
    740       }
    741 
    742       // The operand's value should be corrected on its own.
    743       if (isa<AddrSpaceCastInst>(V))
    744         continue;
    745 
    746       Type *EltTy = V->getType()->getPointerElementType();
    747       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
    748 
    749       // FIXME: It doesn't really make sense to try to do this for all
    750       // instructions.
    751       V->mutateType(NewTy);
    752 
    753       // Adjust the types of any constant operands.
    754       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
    755         if (isa<ConstantPointerNull>(SI->getOperand(1)))
    756           SI->setOperand(1, ConstantPointerNull::get(NewTy));
    757 
    758         if (isa<ConstantPointerNull>(SI->getOperand(2)))
    759           SI->setOperand(2, ConstantPointerNull::get(NewTy));
    760       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
    761         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
    762           if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
    763             Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
    764         }
    765       }
    766 
    767       continue;
    768     }
    769 
    770     IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    771     if (!Intr) {
    772       // FIXME: What is this for? It doesn't make sense to promote arbitrary
    773       // function calls. If the call is to a defined function that can also be
    774       // promoted, we should be able to do this once that function is also
    775       // rewritten.
    776 
    777       std::vector<Type*> ArgTypes;
    778       for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
    779                                 ArgIdx != ArgEnd; ++ArgIdx) {
    780         ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
    781       }
    782       Function *F = Call->getCalledFunction();
    783       FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
    784                                                 F->isVarArg());
    785       Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
    786                                              NewType, F->getAttributes());
    787       Function *NewF = cast<Function>(C);
    788       Call->setCalledFunction(NewF);
    789       continue;
    790     }
    791 
    792     Builder.SetInsertPoint(Intr);
    793     switch (Intr->getIntrinsicID()) {
    794     case Intrinsic::lifetime_start:
    795     case Intrinsic::lifetime_end:
    796       // These intrinsics are for address space 0 only
    797       Intr->eraseFromParent();
    798       continue;
    799     case Intrinsic::memcpy: {
    800       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
    801       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
    802                            MemCpy->getLength(), MemCpy->getAlignment(),
    803                            MemCpy->isVolatile());
    804       Intr->eraseFromParent();
    805       continue;
    806     }
    807     case Intrinsic::memmove: {
    808       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
    809       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
    810                             MemMove->getLength(), MemMove->getAlignment(),
    811                             MemMove->isVolatile());
    812       Intr->eraseFromParent();
    813       continue;
    814     }
    815     case Intrinsic::memset: {
    816       MemSetInst *MemSet = cast<MemSetInst>(Intr);
    817       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
    818                            MemSet->getLength(), MemSet->getAlignment(),
    819                            MemSet->isVolatile());
    820       Intr->eraseFromParent();
    821       continue;
    822     }
    823     case Intrinsic::invariant_start:
    824     case Intrinsic::invariant_end:
    825     case Intrinsic::invariant_group_barrier:
    826       Intr->eraseFromParent();
    827       // FIXME: I think the invariant marker should still theoretically apply,
    828       // but the intrinsics need to be changed to accept pointers with any
    829       // address space.
    830       continue;
    831     case Intrinsic::objectsize: {
    832       Value *Src = Intr->getOperand(0);
    833       Type *SrcTy = Src->getType()->getPointerElementType();
    834       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
    835         Intrinsic::objectsize,
    836         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
    837       );
    838 
    839       CallInst *NewCall
    840         = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
    841       Intr->replaceAllUsesWith(NewCall);
    842       Intr->eraseFromParent();
    843       continue;
    844     }
    845     default:
    846       Intr->dump();
    847       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    848     }
    849   }
    850 }
    851 
    852 FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
    853   return new AMDGPUPromoteAlloca(TM);
    854 }
    855