      1 //===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 /// \file
      9 //==-----------------------------------------------------------------------===//
     10 
     11 #define DEBUG_TYPE "PeepholeOpt"
     12 #ifdef DEBUG
     13 #define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
     14 #else
     15 #define DEBUGME 0
     16 #endif
     17 
     18 #include "AMDILDevices.h"
     19 #include "AMDGPUInstrInfo.h"
     20 #include "llvm/ADT/Statistic.h"
     21 #include "llvm/ADT/StringExtras.h"
     22 #include "llvm/ADT/StringRef.h"
     23 #include "llvm/ADT/Twine.h"
     24 #include "llvm/IR/Constants.h"
     25 #include "llvm/CodeGen/MachineFunction.h"
     26 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
     27 #include "llvm/IR/Function.h"
     28 #include "llvm/IR/Instructions.h"
     29 #include "llvm/IR/Module.h"
     30 #include "llvm/Support/Debug.h"
     31 #include "llvm/Support/MathExtras.h"
     32 
     33 #include <sstream>
     34 
     35 #if 0
     36 STATISTIC(PointerAssignments, "Number of dynamic pointer "
     37     "assignments discovered");
     38 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
     39 #endif
     40 
     41 using namespace llvm;
     42 // The peephole optimization pass is used to do simple last-minute optimizations
     43 // that are required for correct code or to remove redundant functions.
     44 namespace {
     45 
     46 class OpaqueType;
     47 
     48 class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
     49 public:
     50   TargetMachine &TM;
     51   static char ID;
     52   AMDGPUPeepholeOpt(TargetMachine &tm);
     53   ~AMDGPUPeepholeOpt();
     54   const char *getPassName() const;
     55   bool runOnFunction(Function &F);
     56   bool doInitialization(Module &M);
     57   bool doFinalization(Module &M);
     58   void getAnalysisUsage(AnalysisUsage &AU) const;
     59 protected:
     60 private:
     61   // Function to initiate all of the instruction level optimizations.
     62   bool instLevelOptimizations(BasicBlock::iterator *inst);
     63   // Quick check to see if we need to dump all of the pointers into the
     64   // arena. If so, we mark all pointers as living in the arena. This is a
     65   // workaround for aliasing of pointers in a struct/union.
     66   bool dumpAllIntoArena(Function &F);
     67   // Because we don't want to invalidate any pointers while inside
     68   // safeNestedForEach, we push atomic conversions into a vector and handle
     69   // them later. This function does the conversions if required.
     70   void doAtomicConversionIfNeeded(Function &F);
     71   // Because __amdil_is_constant cannot be properly evaluated if
     72   // optimizations are disabled, the calls are placed in a vector
     73   // and evaluated after the __amdil_image* functions are evaluated
     74   // which should allow the __amdil_is_constant function to be
     75   // evaluated correctly.
     76   void doIsConstCallConversionIfNeeded();
     77   bool mChanged;
     78   bool mDebug;
     79   bool mConvertAtomics;
     80   CodeGenOpt::Level optLevel;
     81   // Run a series of tests to see if we can optimize a CALL instruction.
     82   bool optimizeCallInst(BasicBlock::iterator *bbb);
     83   // A peephole optimization to optimize bit extract sequences.
     84   bool optimizeBitExtract(Instruction *inst);
     85   // A peephole optimization to optimize bit insert sequences.
     86   bool optimizeBitInsert(Instruction *inst);
     87   bool setupBitInsert(Instruction *base,
     88                       Instruction *&src,
     89                       Constant *&mask,
     90                       Constant *&shift);
     91   // Expand the bit field insert instruction on versions of OpenCL that
     92   // don't support it.
     93   bool expandBFI(CallInst *CI);
     94   // Expand the bit field mask instruction on versions of OpenCL that
     95   // don't support it.
     96   bool expandBFM(CallInst *CI);
     97   // On 7XX and 8XX hardware we do not have 24-bit signed operations, so in
     98   // this case we need to expand them. These functions check for the 24-bit
     99   // functions and then expand them.
    100   bool isSigned24BitOps(CallInst *CI);
    101   void expandSigned24BitOps(CallInst *CI);
    102   // One optimization that can occur is that if the required workgroup size is
    103   // specified then the result of get_local_size is known at compile time and
    104   // can be returned accordingly.
    105   bool isRWGLocalOpt(CallInst *CI);
    106   // On Northern Islands cards, the division is slightly less accurate than on
    107   // previous generations, so we need to utilize a more accurate division there.
    108   // On all other cards we can translate the accurate divide to a normal divide.
    109   bool convertAccurateDivide(CallInst *CI);
    110   void expandAccurateDivide(CallInst *CI);
    111   // If the alignment is set incorrectly, it can produce really inefficient
    112   // code. This checks for this scenario and fixes it if possible.
    113   bool correctMisalignedMemOp(Instruction *inst);
    114 
    115   // If we are in no-opt mode, then we need to make sure that
    116   // local samplers are properly propagated, as constant propagation
    117   // doesn't occur and we need to know the value of kernel-defined
    118   // samplers at compile time.
    119   bool propagateSamplerInst(CallInst *CI);
    120 
    121   // Helper functions
    122 
    123   // Group of functions that recursively calculate the size of a structure based
    124   // on its sub-types.
    125   size_t getTypeSize(Type * const T, bool dereferencePtr = false);
    126   size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
    127   size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
    128   size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
    129   size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
    130   size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
    131   size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
    132   size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
    133 
    134   LLVMContext *mCTX;
    135   Function *mF;
    136   const AMDGPUSubtarget *mSTM;
    137   SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
    138   SmallVector<CallInst *, 16> isConstVec;
    139 }; // class AMDGPUPeepholeOpt
    140   char AMDGPUPeepholeOpt::ID = 0;
    141 
    142 // A template function that has two levels of looping before calling the
    143 // function with a pointer to the current iterator.
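        // The functor F receives a pointer to the inner iterator so that it can
        // erase the current instruction. When F returns true it is expected to have
        // advanced the iterator past the erased instruction itself; otherwise the
        // inner loop advances it.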
    144 template<class InputIterator, class SecondIterator, class Function>
    145 Function safeNestedForEach(InputIterator First, InputIterator Last,
    146                               SecondIterator S, Function F) {
    147   for ( ; First != Last; ++First) {
    148     SecondIterator sf, sl;
    149     for (sf = First->begin(), sl = First->end();
    150          sf != sl; )  {
    151       if (!F(&sf)) {
    152         ++sf;
    153       }
    154     }
    155   }
    156   return F;
    157 }
    158 
    159 } // anonymous namespace
    160 
    161 namespace llvm {
    162   FunctionPass *
    163   createAMDGPUPeepholeOpt(TargetMachine &tm) {
    164     return new AMDGPUPeepholeOpt(tm);
    165   }
    166 } // llvm namespace
    167 
    168 AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
    169   : FunctionPass(ID), TM(tm)  {
    170   mDebug = DEBUGME;
    171   optLevel = TM.getOptLevel();
    172 
    173 }
    174 
    175 AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()  {
    176 }
    177 
    178 const char *
    179 AMDGPUPeepholeOpt::getPassName() const  {
    180   return "AMDGPU PeepHole Optimization Pass";
    181 }
    182 
    183 bool
    184 containsPointerType(Type *Ty)  {
    185   if (!Ty) {
    186     return false;
    187   }
    188   switch(Ty->getTypeID()) {
    189   default:
    190     return false;
    191   case Type::StructTyID: {
    192     const StructType *ST = dyn_cast<StructType>(Ty);
    193     for (StructType::element_iterator stb = ST->element_begin(),
    194            ste = ST->element_end(); stb != ste; ++stb) {
    195       if (!containsPointerType(*stb)) {
    196         continue;
    197       }
    198       return true;
    199     }
    200     break;
    201   }
    202   case Type::VectorTyID:
    203   case Type::ArrayTyID:
    204     return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
    205   case Type::PointerTyID:
    206     return true;
    207   };
    208   return false;
    209 }
    210 
    211 bool
    212 AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)  {
    213   bool dumpAll = false;
    214   for (Function::const_arg_iterator cab = F.arg_begin(),
    215        cae = F.arg_end(); cab != cae; ++cab) {
    216     const Argument *arg = cab;
    217     const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    218     if (!PT) {
    219       continue;
    220     }
    221     Type *DereferencedType = PT->getElementType();
    222     if (!dyn_cast<StructType>(DereferencedType)) {
    224       continue;
    225     }
    226     if (!containsPointerType(DereferencedType)) {
    227       continue;
    228     }
    229     // FIXME: Because a pointer inside of a struct/union may be aliased to
    230     // another pointer we need to take the conservative approach and place all
    231     // pointers into the arena until more advanced detection is implemented.
    232     dumpAll = true;
    233   }
    234   return dumpAll;
    235 }
    236 void
    237 AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
    238   if (isConstVec.empty()) {
    239     return;
    240   }
    241   for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    242     CallInst *CI = isConstVec[x];
    243     Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    244     Type *aType = Type::getInt32Ty(*mCTX);
    245     Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
    246       : ConstantInt::get(aType, 0);
    247     CI->replaceAllUsesWith(Val);
    248     CI->eraseFromParent();
    249   }
    250   isConstVec.clear();
    251 }
    252 void
    253 AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)  {
    254   // Don't do anything if we don't have any atomic operations.
    255   if (atomicFuncs.empty()) {
    256     return;
    257   }
    258   // Change the function name for the atomic if it is required
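          // The callee is the last operand of a CallInst, so replacing that operand
          // redirects each recorded call to the _noret variant collected in
          // optimizeCallInst.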
    259   uint32_t size = atomicFuncs.size();
    260   for (uint32_t x = 0; x < size; ++x) {
    261     atomicFuncs[x].first->setOperand(
    262         atomicFuncs[x].first->getNumOperands()-1,
    263         atomicFuncs[x].second);
    264 
    265   }
    266   mChanged = true;
    267   if (mConvertAtomics) {
    268     return;
    269   }
    270 }
    271 
    272 bool
    273 AMDGPUPeepholeOpt::runOnFunction(Function &MF)  {
    274   mChanged = false;
    275   mF = &MF;
    276   mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
    277   if (mDebug) {
    278     MF.dump();
    279   }
    280   mCTX = &MF.getType()->getContext();
    281   mConvertAtomics = true;
    282   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
    283      std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
    284                   this));
    285 
    286   doAtomicConversionIfNeeded(MF);
    287   doIsConstCallConversionIfNeeded();
    288 
    289   if (mDebug) {
    290     MF.dump();
    291   }
    292   return mChanged;
    293 }
    294 
    295 bool
    296 AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)  {
    297   Instruction *inst = (*bbb);
    298   CallInst *CI = dyn_cast<CallInst>(inst);
    299   if (!CI) {
    300     return false;
    301   }
    302   if (isSigned24BitOps(CI)) {
    303     expandSigned24BitOps(CI);
    304     ++(*bbb);
    305     CI->eraseFromParent();
    306     return true;
    307   }
    308   if (propagateSamplerInst(CI)) {
    309     return false;
    310   }
    311   if (expandBFI(CI) || expandBFM(CI)) {
    312     ++(*bbb);
    313     CI->eraseFromParent();
    314     return true;
    315   }
    316   if (convertAccurateDivide(CI)) {
    317     expandAccurateDivide(CI);
    318     ++(*bbb);
    319     CI->eraseFromParent();
    320     return true;
    321   }
    322 
    323   StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
    324   if (calleeName.startswith("__amdil_is_constant")) {
    325     // If we do not have optimizations, then this
    326     // cannot be properly evaluated, so we add the
    327     // call instruction to a vector and process
    328     // them at the end of processing after the
    329     // samplers have been correctly handled.
    330     if (optLevel == CodeGenOpt::None) {
    331       isConstVec.push_back(CI);
    332       return false;
    333     } else {
    334       Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    335       Type *aType = Type::getInt32Ty(*mCTX);
    336       Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
    337         : ConstantInt::get(aType, 0);
    338       CI->replaceAllUsesWith(Val);
    339       ++(*bbb);
    340       CI->eraseFromParent();
    341       return true;
    342     }
    343   }
    344 
    345   if (calleeName.equals("__amdil_is_asic_id_i32")) {
    346     ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    347     Type *aType = Type::getInt32Ty(*mCTX);
    348     Value *Val = CV;
    349     if (Val) {
    350       Val = ConstantInt::get(aType,
    351           mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    352     } else {
    353       Val = ConstantInt::get(aType, 0);
    354     }
    355     CI->replaceAllUsesWith(Val);
    356     ++(*bbb);
    357     CI->eraseFromParent();
    358     return true;
    359   }
    360   Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
    361   if (!F) {
    362     return false;
    363   }
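          // Atomics whose result is never used (and that are not exchanges, which
          // always need their return value) can be redirected to _noret variants.
          // Record the pair here and rewrite the call later so iterators stay valid.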
    364   if (F->getName().startswith("__atom") && !CI->getNumUses()
    365       && F->getName().find("_xchg") == StringRef::npos) {
    366     std::string buffer(F->getName().str() + "_noret");
    367     F = dyn_cast<Function>(
    368           F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    369     atomicFuncs.push_back(std::make_pair(CI, F));
    370   }
    371 
    372   if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
    373       && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    374     return false;
    375   }
    376   if (!mConvertAtomics) {
    377     return false;
    378   }
    379   StringRef name = F->getName();
    380   if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    381     mConvertAtomics = false;
    382   }
    383   return false;
    384 }
    385 
    386 bool
    387 AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
    388     Instruction *&src,
    389     Constant *&mask,
    390     Constant *&shift) {
    391   if (!base) {
    392     if (mDebug) {
    393       dbgs() << "Null pointer passed into function.\n";
    394     }
    395     return false;
    396   }
    397   bool andOp = false;
    398   if (base->getOpcode() == Instruction::Shl) {
    399     shift = dyn_cast<Constant>(base->getOperand(1));
    400   } else if (base->getOpcode() == Instruction::And) {
    401     mask = dyn_cast<Constant>(base->getOperand(1));
    402     andOp = true;
    403   } else {
    404     if (mDebug) {
    405       dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    406     }
    407     // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
    408     return false;
    409   }
    410   src = dyn_cast<Instruction>(base->getOperand(0));
    411   if (!src) {
    412     if (mDebug) {
    413       dbgs() << "Failed setup since the base operand is not an instruction!\n";
    414     }
    415     return false;
    416   }
    417   // If we find an 'and' operation, then we don't need to
    418   // find the next operation as we already know the
    419   // bits that are valid at this point.
    420   if (andOp) {
    421     return true;
    422   }
    423   if (src->getOpcode() == Instruction::Shl && !shift) {
    424     shift = dyn_cast<Constant>(src->getOperand(1));
    425     src = dyn_cast<Instruction>(src->getOperand(0));
    426   } else if (src->getOpcode() == Instruction::And && !mask) {
    427     mask = dyn_cast<Constant>(src->getOperand(1));
    428   }
    429   if (!mask && !shift) {
    430     if (mDebug) {
    431       dbgs() << "Failed setup since both mask and shift are NULL!\n";
    432     }
    433     // Did not find a constant mask or a shift.
    434     return false;
    435   }
    436   return true;
    437 }
    438 bool
    439 AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)  {
    440   if (!inst) {
    441     return false;
    442   }
    443   if (!inst->isBinaryOp()) {
    444     return false;
    445   }
    446   if (inst->getOpcode() != Instruction::Or) {
    447     return false;
    448   }
    449   if (optLevel == CodeGenOpt::None) {
    450     return false;
    451   }
    452   // We want to do an optimization on a sequence of ops that in the end equals a
    453   // single ISA instruction.
    454   // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
    455   // Some simplified versions of this pattern are as follows:
    456   // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
    457   // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
    458   // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
    459   // (A & B) | (D << F) when (1 << F) >= B
    460   // (A << C) | (D & E) when (1 << C) >= E
    461   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    462     // The HD4XXX hardware doesn't support the ubit_insert instruction.
    463     return false;
    464   }
    465   Type *aType = inst->getType();
    466   bool isVector = aType->isVectorTy();
    467   int numEle = 1;
    468   // This optimization only works on 32bit integers.
    469   if (aType->getScalarType()
    470       != Type::getInt32Ty(inst->getContext())) {
    471     return false;
    472   }
    473   if (isVector) {
    474     const VectorType *VT = dyn_cast<VectorType>(aType);
    475     numEle = VT->getNumElements();
    476     // We currently cannot support more than 4 elements in an intrinsic and we
    477     // cannot support Vec3 types.
    478     if (numEle > 4 || numEle == 3) {
    479       return false;
    480     }
    481   }
    482   // TODO: Handle vectors.
    483   if (isVector) {
    484     if (mDebug) {
    485       dbgs() << "!!! Vectors are not supported yet!\n";
    486     }
    487     return false;
    488   }
    489   Instruction *LHSSrc = NULL, *RHSSrc = NULL;
    490   Constant *LHSMask = NULL, *RHSMask = NULL;
    491   Constant *LHSShift = NULL, *RHSShift = NULL;
    492   Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
    493   Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
    494   if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    495     if (mDebug) {
    496       dbgs() << "Found an OR Operation that failed setup!\n";
    497       inst->dump();
    498       if (LHS) { LHS->dump(); }
    499       if (LHSSrc) { LHSSrc->dump(); }
    500       if (LHSMask) { LHSMask->dump(); }
    501       if (LHSShift) { LHSShift->dump(); }
    502     }
    503     // There was an issue with the setup for BitInsert.
    504     return false;
    505   }
    506   if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    507     if (mDebug) {
    508       dbgs() << "Found an OR Operation that failed setup!\n";
    509       inst->dump();
    510       if (RHS) { RHS->dump(); }
    511       if (RHSSrc) { RHSSrc->dump(); }
    512       if (RHSMask) { RHSMask->dump(); }
    513       if (RHSShift) { RHSShift->dump(); }
    514     }
    515     // There was an issue with the setup for BitInsert.
    516     return false;
    517   }
    518   if (mDebug) {
    519     dbgs() << "Found an OR operation that can possibly be optimized to a ubit insert!\n";
    520     dbgs() << "Op:        "; inst->dump();
    521     dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    522     dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    523     dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    524     dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    525     dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    526     dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    527     dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    528     dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
    529   }
    530   Constant *offset = NULL;
    531   Constant *width = NULL;
    532   uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
    533   uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
    534   uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
    535   uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
    536   lhsMaskVal = (LHSMask
    537       ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
    538   rhsMaskVal = (RHSMask
    539       ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
    540   lhsShiftVal = (LHSShift
    541       ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
    542   rhsShiftVal = (RHSShift
    543       ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
    544   lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
    545   rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
    546   lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
    547   rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
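          // Width is the number of bits each side contributes and offset is where
          // those bits start; when a side has no mask, its shifted value is assumed
          // to occupy every bit above the shift amount.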
    548   // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
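          // Identical non-zero masks mean the two fields overlap completely, so the
          // pattern cannot be expressed as a single bit insert.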
    549   if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    550     return false;
    551   }
    552   if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    553     offset = ConstantInt::get(aType, lhsMaskOffset, false);
    554     width = ConstantInt::get(aType, lhsMaskWidth, false);
    555     RHSSrc = RHS;
    556     if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
    557       return false;
    558     }
    559     if (!LHSShift) {
    560       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    561           "MaskShr", LHS);
    562     } else if (lhsShiftVal != lhsMaskOffset) {
    563       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    564           "MaskShr", LHS);
    565     }
    566     if (mDebug) {
    567       dbgs() << "Optimizing LHS!\n";
    568     }
    569   } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    570     offset = ConstantInt::get(aType, rhsMaskOffset, false);
    571     width = ConstantInt::get(aType, rhsMaskWidth, false);
    572     LHSSrc = RHSSrc;
    573     RHSSrc = LHS;
    574     if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
    575       return false;
    576     }
    577     if (!RHSShift) {
    578       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    579           "MaskShr", RHS);
    580     } else if (rhsShiftVal != rhsMaskOffset) {
    581       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    582           "MaskShr", RHS);
    583     }
    584     if (mDebug) {
    585       dbgs() << "Optimizing RHS!\n";
    586     }
    587   } else {
    588     if (mDebug) {
    589       dbgs() << "Failed constraint 3!\n";
    590     }
    591     return false;
    592   }
    593   if (mDebug) {
    594     dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    595     dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    596     dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    597     dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    598   }
    599   if (!offset || !width) {
    600     if (mDebug) {
    601       dbgs() << "Either width or offset are NULL, failed detection!\n";
    602     }
    603     return false;
    604   }
    605   // Let's create the function signature.
    606   std::vector<Type *> callTypes;
    607   callTypes.push_back(aType);
    608   callTypes.push_back(aType);
    609   callTypes.push_back(aType);
    610   callTypes.push_back(aType);
    611   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
    612   std::string name = "__amdil_ubit_insert";
    613   if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
    614   Function *Func =
    615     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
    616         getOrInsertFunction(StringRef(name), funcType));
    617   Value *Operands[4] = {
    618     width,
    619     offset,
    620     LHSSrc,
    621     RHSSrc
    622   };
    623   CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
    624   if (mDebug) {
    625     dbgs() << "Old Inst: ";
    626     inst->dump();
    627     dbgs() << "New Inst: ";
    628     CI->dump();
    629     dbgs() << "\n\n";
    630   }
    631   CI->insertBefore(inst);
    632   inst->replaceAllUsesWith(CI);
    633   return true;
    634 }
    635 
    636 bool
    637 AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)  {
    638   if (!inst) {
    639     return false;
    640   }
    641   if (!inst->isBinaryOp()) {
    642     return false;
    643   }
    644   if (inst->getOpcode() != Instruction::And) {
    645     return false;
    646   }
    647   if (optLevel == CodeGenOpt::None) {
    648     return false;
    649   }
    650   // We want to do some simple optimizations on Shift right/And patterns. The
    651   // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
    652   // value smaller than 32 and C is a mask. If C is a constant value, then the
    653   // following transformation can occur. For signed integers, it turns into the
    654   // function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned
    655   // integers, it turns into the function call dst =
    656   // __amdil_ubit_extract(log2(C), B, A). The function __amdil_[u|i]bit_extract
    657   // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
    658   // Evergreen hardware.
    659   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    660     // This does not work on HD4XXX hardware.
    661     return false;
    662   }
    663   Type *aType = inst->getType();
    664   bool isVector = aType->isVectorTy();
    665 
    666   // XXX Support vector types
    667   if (isVector) {
    668     return false;
    669   }
    670   int numEle = 1;
    671   // This only works on 32bit integers
    672   if (aType->getScalarType()
    673       != Type::getInt32Ty(inst->getContext())) {
    674     return false;
    675   }
    676   if (isVector) {
    677     const VectorType *VT = dyn_cast<VectorType>(aType);
    678     numEle = VT->getNumElements();
    679     // We currently cannot support more than 4 elements in an intrinsic and we
    680     // cannot support Vec3 types.
    681     if (numEle > 4 || numEle == 3) {
    682       return false;
    683     }
    684   }
    685   BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
    686   // If the first operand is not a shift instruction, then we can return as it
    687   // doesn't match this pattern.
    688   if (!ShiftInst || !ShiftInst->isShift()) {
    689     return false;
    690   }
    691   // If it is a shift left, then we don't match this pattern.
    692   if (ShiftInst->getOpcode() == Instruction::Shl) {
    693     return false;
    694   }
    695   bool isSigned = ShiftInst->isArithmeticShift();
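          // An arithmetic shift (ashr) marks the extract as signed; here that only
          // affects how the width and shift constants are materialized below.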
    696   Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
    697   Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
    698   // Let's make sure that the shift value and the and-mask are constant integers.
    699   if (!AndMask || !ShrVal) {
    700     return false;
    701   }
    702   Constant *newMaskConst;
    703   Constant *shiftValConst;
    704   if (isVector) {
    705     // Handle the vector case
    706     std::vector<Constant *> maskVals;
    707     std::vector<Constant *> shiftVals;
    708     ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    709     ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    710     Type *scalarType = AndMaskVec->getType()->getScalarType();
    711     assert(AndMaskVec->getNumOperands() ==
    712            ShrValVec->getNumOperands() && "cannot have a "
    713            "combination where the number of elements to a "
    714            "shift and an and are different!");
    715     for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
    716       ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
    717       ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
    718       if (!AndCI || !ShiftIC) {
    719         return false;
    720       }
    721       uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
    722       if (!isMask_32(maskVal)) {
    723         return false;
    724       }
    725       maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    726       uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
    727       // If the mask or shiftval is greater than the bitcount, then break out.
    728       if (maskVal >= 32 || shiftVal >= 32) {
    729         return false;
    730       }
    731       // If the mask val is greater than the number of original bits left
    732       // then this optimization is invalid.
    733       if (maskVal > (32 - shiftVal)) {
    734         return false;
    735       }
    736       maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
    737       shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    738     }
    739     newMaskConst = ConstantVector::get(maskVals);
    740     shiftValConst = ConstantVector::get(shiftVals);
    741   } else {
    742     // Handle the scalar case
    743     uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    744     // This must be a mask value where all lower bits are set to 1 and then any
    745     // bit higher is set to 0.
    746     if (!isMask_32(maskVal)) {
    747       return false;
    748     }
    749     maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    750     // Count the number of bits set in the mask; this is the width of the
    751     // resulting bit set that is extracted from the source value.
    752     uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    753     // If the mask or shift val is greater than the bitcount, then break out.
    754     if (maskVal >= 32 || shiftVal >= 32) {
    755       return false;
    756     }
    757     // If the mask val is greater than the number of original bits left then
    758     // this optimization is invalid.
    759     if (maskVal > (32 - shiftVal)) {
    760       return false;
    761     }
    762     newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    763     shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
    764   }
    765   // Let's create the function signature.
    766   std::vector<Type *> callTypes;
    767   callTypes.push_back(aType);
    768   callTypes.push_back(aType);
    769   callTypes.push_back(aType);
    770   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
    771   std::string name = "llvm.AMDGPU.bit.extract.u32";
    772   if (isVector) {
    773     name += ".v" + itostr(numEle) + "i32";
    774   } else {
    775     name += ".";
    776   }
    777   // Let's create the function.
    778   Function *Func =
    779     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
    780                        getOrInsertFunction(StringRef(name), funcType));
    781   Value *Operands[3] = {
    782     ShiftInst->getOperand(0),
    783     shiftValConst,
    784     newMaskConst
    785   };
    786   // Let's create the call with the operands.
    787   CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
    788   CI->setDoesNotAccessMemory();
    789   CI->insertBefore(inst);
    790   inst->replaceAllUsesWith(CI);
    791   return true;
    792 }
    793 
    794 bool
    795 AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
    796   if (!CI) {
    797     return false;
    798   }
    799   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
    800   if (!LHS->getName().startswith("__amdil_bfi")) {
    801     return false;
    802   }
    803   Type* type = CI->getOperand(0)->getType();
    804   Constant *negOneConst = NULL;
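          // ~A is computed below as A xor -1, so build an all-ones constant that
          // matches the operand shape (scalar, or one element per lane for vectors).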
    805   if (type->isVectorTy()) {
    806     std::vector<Constant *> negOneVals;
    807     negOneConst = ConstantInt::get(CI->getContext(),
    808         APInt(32, StringRef("-1"), 10));
    809     for (size_t x = 0,
    810         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
    811       negOneVals.push_back(negOneConst);
    812     }
    813     negOneConst = ConstantVector::get(negOneVals);
    814   } else {
    815     negOneConst = ConstantInt::get(CI->getContext(),
    816         APInt(32, StringRef("-1"), 10));
    817   }
    818   // __amdil_bfi => (A & B) | (~A & C)
    819   BinaryOperator *lhs =
    820     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
    821         CI->getOperand(1), "bfi_and", CI);
    822   BinaryOperator *rhs =
    823     BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
    824         "bfi_not", CI);
    825   rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
    826       "bfi_and", CI);
    827   lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
    828   CI->replaceAllUsesWith(lhs);
    829   return true;
    830 }
    831 
    832 bool
    833 AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
    834   if (!CI) {
    835     return false;
    836   }
    837   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
    838   if (!LHS->getName().startswith("__amdil_bfm")) {
    839     return false;
    840   }
    841   // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
    842   Constant *newMaskConst = NULL;
    843   Constant *newShiftConst = NULL;
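          // newShiftConst is the integer constant 1; it is reused both as the value
          // shifted in "1 << (src0 & 0x1F)" and as the subtrahend for the "- 1".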
    844   Type* type = CI->getOperand(0)->getType();
    845   if (type->isVectorTy()) {
    846     std::vector<Constant*> newMaskVals, newShiftVals;
    847     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    848     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    849     for (size_t x = 0,
    850         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
    851       newMaskVals.push_back(newMaskConst);
    852       newShiftVals.push_back(newShiftConst);
    853     }
    854     newMaskConst = ConstantVector::get(newMaskVals);
    855     newShiftConst = ConstantVector::get(newShiftVals);
    856   } else {
    857     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    858     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    859   }
    860   BinaryOperator *lhs =
    861     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
    862         newMaskConst, "bfm_mask", CI);
    863   lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
    864       lhs, "bfm_shl", CI);
    865   lhs = BinaryOperator::Create(Instruction::Sub, lhs,
    866       newShiftConst, "bfm_sub", CI);
    867   BinaryOperator *rhs =
    868     BinaryOperator::Create(Instruction::And, CI->getOperand(1),
    869         newMaskConst, "bfm_mask", CI);
    870   lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
    871   CI->replaceAllUsesWith(lhs);
    872   return true;
    873 }
    874 
    875 bool
    876 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)  {
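          // Return true only when the current instruction has been erased and *bbb
          // has already been advanced past it; safeNestedForEach then skips its own
          // increment.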
    877   Instruction *inst = (*bbb);
    878   if (optimizeCallInst(bbb)) {
    879     return true;
    880   }
    881   if (optimizeBitExtract(inst)) {
    882     return false;
    883   }
    884   if (optimizeBitInsert(inst)) {
    885     return false;
    886   }
    887   if (correctMisalignedMemOp(inst)) {
    888     return false;
    889   }
    890   return false;
    891 }
    892 bool
    893 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
    894   LoadInst *linst = dyn_cast<LoadInst>(inst);
    895   StoreInst *sinst = dyn_cast<StoreInst>(inst);
    896   unsigned alignment;
    897   Type* Ty = inst->getType();
    898   if (linst) {
    899     alignment = linst->getAlignment();
    900     Ty = inst->getType();
    901   } else if (sinst) {
    902     alignment = sinst->getAlignment();
    903     Ty = sinst->getValueOperand()->getType();
    904   } else {
    905     return false;
    906   }
    907   unsigned size = getTypeSize(Ty);
    908   if (size <= alignment) {
    909     return false;
    910   }
    911   if (!Ty->isStructTy()) {
    912     return false;
    913   }
    914   if (alignment < 4) {
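            // An alignment of 0 tells the code generator to use the type's natural
            // (ABI) alignment instead of the under-alignment recorded on the memop.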
    915     if (linst) {
    916       linst->setAlignment(0);
    917       return true;
    918     } else if (sinst) {
    919       sinst->setAlignment(0);
    920       return true;
    921     }
    922   }
    923   return false;
    924 }
    925 bool
    926 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)  {
    927   if (!CI) {
    928     return false;
    929   }
    930   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
    931   StringRef name = LHS->getName();
    932   // __amdil_imul24_high is covered by the __amdil_imul24 prefix check below.
    933   if (!name.startswith("__amdil_imad24") && !name.startswith("__amdil_imul24")) {
    934     return false;
    935   }
    936   if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    937     return false;
    938   }
    939   return true;
    940 }
    941 
    942 void
    943 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)  {
    944   assert(isSigned24BitOps(CI) && "Must be a "
    945       "signed 24 bit operation to call this function!");
    946   Value *LHS = CI->getOperand(CI->getNumOperands()-1);
    947   // On 7XX and 8XX we do not have signed 24bit, so we need to
    948   // expand it to the following:
    949   // imul24 turns into 32bit imul
    950   // imad24 turns into 32bit imad
    951   // imul24_high turns into 32bit imulhigh
    952   if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    953     Type *aType = CI->getOperand(0)->getType();
    954     bool isVector = aType->isVectorTy();
    955     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    956     std::vector<Type*> callTypes;
    957     callTypes.push_back(CI->getOperand(0)->getType());
    958     callTypes.push_back(CI->getOperand(1)->getType());
    959     callTypes.push_back(CI->getOperand(2)->getType());
    960     FunctionType *funcType =
    961       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    962     std::string name = "__amdil_imad";
    963     if (isVector) {
    964       name += "_v" + itostr(numEle) + "i32";
    965     } else {
    966       name += "_i32";
    967     }
    968     Function *Func = dyn_cast<Function>(
    969                        CI->getParent()->getParent()->getParent()->
    970                        getOrInsertFunction(StringRef(name), funcType));
    971     Value *Operands[3] = {
    972       CI->getOperand(0),
    973       CI->getOperand(1),
    974       CI->getOperand(2)
    975     };
    976     CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    977     nCI->insertBefore(CI);
    978     CI->replaceAllUsesWith(nCI);
    979   } else if (LHS->getName().startswith("__amdil_imul24") && !LHS->getName().startswith("__amdil_imul24_high")) {
    980     BinaryOperator *mulOp =
    981       BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
    982           CI->getOperand(1), "imul24", CI);
    983     CI->replaceAllUsesWith(mulOp);
    984   } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    985     Type *aType = CI->getOperand(0)->getType();
    986 
    987     bool isVector = aType->isVectorTy();
    988     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    989     std::vector<Type*> callTypes;
    990     callTypes.push_back(CI->getOperand(0)->getType());
    991     callTypes.push_back(CI->getOperand(1)->getType());
    992     FunctionType *funcType =
    993       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    994     std::string name = "__amdil_imul_high";
    995     if (isVector) {
    996       name += "_v" + itostr(numEle) + "i32";
    997     } else {
    998       name += "_i32";
    999     }
   1000     Function *Func = dyn_cast<Function>(
   1001                        CI->getParent()->getParent()->getParent()->
   1002                        getOrInsertFunction(StringRef(name), funcType));
   1003     Value *Operands[2] = {
   1004       CI->getOperand(0),
   1005       CI->getOperand(1)
   1006     };
   1007     CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
   1008     nCI->insertBefore(CI);
   1009     CI->replaceAllUsesWith(nCI);
   1010   }
   1011 }
   1012 
   1013 bool
   1014 AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)  {
   1015   return (CI != NULL
   1016           && CI->getOperand(CI->getNumOperands() - 1)->getName()
   1017           == "__amdil_get_local_size_int");
   1018 }
   1019 
   1020 bool
   1021 AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)  {
   1022   if (!CI) {
   1023     return false;
   1024   }
   1025   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
   1026       && (mSTM->getDeviceName() == "cayman")) {
   1027     return false;
   1028   }
   1029   return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
   1030       == "__amdil_improved_div";
   1031 }
   1032 
   1033 void
   1034 AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)  {
   1035   assert(convertAccurateDivide(CI)
   1036          && "expanding accurate divide can only happen if it is expandable!");
   1037   BinaryOperator *divOp =
   1038     BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
   1039                            CI->getOperand(1), "fdiv32", CI);
   1040   CI->replaceAllUsesWith(divOp);
   1041 }
   1042 
   1043 bool
   1044 AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
   1045   if (optLevel != CodeGenOpt::None) {
   1046     return false;
   1047   }
   1048 
   1049   if (!CI) {
   1050     return false;
   1051   }
   1052 
   1053   unsigned funcNameIdx = CI->getNumOperands() - 1;
   1055   StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
   1056   if (calleeName != "__amdil_image2d_read_norm"
   1057    && calleeName != "__amdil_image2d_read_unnorm"
   1058    && calleeName != "__amdil_image3d_read_norm"
   1059    && calleeName != "__amdil_image3d_read_unnorm") {
   1060     return false;
   1061   }
   1062 
   1063   unsigned samplerIdx = 1;
   1065   Value *sampler = CI->getOperand(samplerIdx);
   1066   LoadInst *lInst = dyn_cast<LoadInst>(sampler);
   1067   if (!lInst) {
   1068     return false;
   1069   }
   1070 
   1071   if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1072     return false;
   1073   }
   1074 
   1075   GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
   1076   // If we are loading from what is not a global value, then we
   1077   // fail and return.
   1078   if (!gv) {
   1079     return false;
   1080   }
   1081 
   1082   // If we don't have an initializer, or the initializer is not a 32-bit
   1083   // integer, we fail.
   1084   if (!gv->hasInitializer()
   1085       || !gv->getInitializer()->getType()->isIntegerTy(32)) {
   1086     return false;
   1087   }
   1088 
   1089   // Now that we have the global variable initializer, let's replace
   1090   // all uses of the load instruction with the samplerVal and
   1091   // reparse the __amdil_is_constant() function.
   1092   Constant *samplerVal = gv->getInitializer();
   1093   lInst->replaceAllUsesWith(samplerVal);
   1094   return true;
   1095 }
   1096 
   1097 bool
   1098 AMDGPUPeepholeOpt::doInitialization(Module &M)  {
   1099   return false;
   1100 }
   1101 
   1102 bool
   1103 AMDGPUPeepholeOpt::doFinalization(Module &M)  {
   1104   return false;
   1105 }
   1106 
   1107 void
   1108 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const  {
   1109   AU.addRequired<MachineFunctionAnalysis>();
   1110   FunctionPass::getAnalysisUsage(AU);
   1111   AU.setPreservesAll();
   1112 }
   1113 
   1114 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
   1115   size_t size = 0;
   1116   if (!T) {
   1117     return size;
   1118   }
   1119   switch (T->getTypeID()) {
   1120   case Type::X86_FP80TyID:
   1121   case Type::FP128TyID:
   1122   case Type::PPC_FP128TyID:
   1123   case Type::LabelTyID:
   1124     assert(0 && "These types are not supported by this backend");
   1125   default:
   1126   case Type::FloatTyID:
   1127   case Type::DoubleTyID:
   1128     size = T->getPrimitiveSizeInBits() >> 3;
   1129     break;
   1130   case Type::PointerTyID:
   1131     size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
   1132     break;
   1133   case Type::IntegerTyID:
   1134     size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
   1135     break;
   1136   case Type::StructTyID:
   1137     size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
   1138     break;
   1139   case Type::ArrayTyID:
   1140     size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
   1141     break;
   1142   case Type::FunctionTyID:
   1143     size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
   1144     break;
   1145   case Type::VectorTyID:
   1146     size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
   1147     break;
   1148   };
   1149   return size;
   1150 }
   1151 
   1152 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
   1153     bool dereferencePtr) {
   1154   size_t size = 0;
   1155   if (!ST) {
   1156     return size;
   1157   }
   1158   Type *curType;
   1159   StructType::element_iterator eib;
   1160   StructType::element_iterator eie;
   1161   for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
   1162     curType = *eib;
   1163     size += getTypeSize(curType, dereferencePtr);
   1164   }
   1165   return size;
   1166 }
   1167 
   1168 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
   1169     bool dereferencePtr) {
   1170   return IT ? (IT->getBitWidth() >> 3) : 0;
   1171 }
   1172 
   1173 size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
   1174     bool dereferencePtr) {
   1175     assert(0 && "Should not be able to calculate the size of a function type");
   1176     return 0;
   1177 }
   1178 
   1179 size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
   1180     bool dereferencePtr) {
   1181   return (size_t)(AT ? (getTypeSize(AT->getElementType(),
   1182                                     dereferencePtr) * AT->getNumElements())
   1183                      : 0);
   1184 }
   1185 
   1186 size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
   1187     bool dereferencePtr) {
   1188   return VT ? (VT->getBitWidth() >> 3) : 0;
   1189 }
   1190 
   1191 size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
   1192     bool dereferencePtr) {
   1193   if (!PT) {
   1194     return 0;
   1195   }
   1196   Type *CT = PT->getElementType();
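          // Structs in the private address space are sized by their contents (they
          // live in the arena); any other pointer counts as 4 bytes unless we are
          // asked to dereference it.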
   1197   if (CT->getTypeID() == Type::StructTyID &&
   1198       PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
   1199     return getTypeSize(dyn_cast<StructType>(CT));
   1200   } else if (dereferencePtr) {
   1201     size_t size = 0;
   1202     for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
   1203       size += getTypeSize(PT->getContainedType(x), dereferencePtr);
   1204     }
   1205     return size;
   1206   } else {
   1207     return 4;
   1208   }
   1209 }
   1210 
   1211 size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
   1212     bool dereferencePtr) {
   1213   //assert(0 && "Should not be able to calculate the size of an opaque type");
   1214   return 4;
   1215 }
   1216