      1 //===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //==-----------------------------------------------------------------------===//
      9 
     10 #include "AMDILDevices.h"
     11 #include "AMDGPUInstrInfo.h"
     12 #include "llvm/ADT/Statistic.h"
     13 #include "llvm/ADT/StringExtras.h"
     14 #include "llvm/ADT/StringRef.h"
     15 #include "llvm/ADT/Twine.h"
     16 #include "llvm/Constants.h"
     17 #include "llvm/CodeGen/MachineFunction.h"
     18 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
     19 #include "llvm/Function.h"
     20 #include "llvm/Instructions.h"
     21 #include "llvm/Module.h"
     22 #include "llvm/Support/Debug.h"
     23 #include "llvm/Support/MathExtras.h"
     24 
     25 #include <sstream>
     26 
     27 #if 0
     28 STATISTIC(PointerAssignments, "Number of dynamic pointer "
     29     "assignments discovered");
     30 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
     31 #endif
     32 
     33 using namespace llvm;
     34 // The Peephole optimization pass is used to do simple last-minute optimizations
     35 // that are required for correct code or to remove redundant functions.
     36 namespace {
     37 
     38 class OpaqueType;
     39 
     40 class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
     41 public:
     42   TargetMachine &TM;
     43   static char ID;
     44   AMDGPUPeepholeOpt(TargetMachine &tm);
     45   ~AMDGPUPeepholeOpt();
     46   const char *getPassName() const;
     47   bool runOnFunction(Function &F);
     48   bool doInitialization(Module &M);
     49   bool doFinalization(Module &M);
     50   void getAnalysisUsage(AnalysisUsage &AU) const;
     51 protected:
     52 private:
     53   // Function to initiate all of the instruction level optimizations.
     54   bool instLevelOptimizations(BasicBlock::iterator *inst);
     55   // Quick check to see if we need to dump all of the pointers into the
     56   // arena. If this is the case, then we set all pointers to exist in the arena.
     57   // This is a workaround for aliasing of pointers in a struct/union.
     58   bool dumpAllIntoArena(Function &F);
     59   // Because I don't want to invalidate any pointers while in
     60   // safeNestedForEach, I push atomic conversions to a vector and handle
     61   // them later. This function does the conversions if required.
     62   void doAtomicConversionIfNeeded(Function &F);
     63   // Because __amdil_is_constant cannot be properly evaluated if
     64   // optimizations are disabled, the calls are placed in a vector
     65   // and evaluated after the __amdil_image* functions are evaluated,
     66   // which should allow the __amdil_is_constant function to be
     67   // evaluated correctly.
     68   void doIsConstCallConversionIfNeeded();
     69   bool mChanged;
     70   bool mDebug;
     71   bool mConvertAtomics;
     72   CodeGenOpt::Level optLevel;
     73   // Run a series of tests to see if we can optimize a CALL instruction.
     74   bool optimizeCallInst(BasicBlock::iterator *bbb);
     75   // A peephole optimization to optimize bit extract sequences.
     76   bool optimizeBitExtract(Instruction *inst);
     77   // A peephole optimization to optimize bit insert sequences.
     78   bool optimizeBitInsert(Instruction *inst);
     79   bool setupBitInsert(Instruction *base,
     80                       Instruction *&src,
     81                       Constant *&mask,
     82                       Constant *&shift);
     83   // Expand the bit field insert instruction on versions of OpenCL that
     84   // don't support it.
     85   bool expandBFI(CallInst *CI);
     86   // Expand the bit field mask instruction on versions of OpenCL that
     87   // don't support it.
     88   bool expandBFM(CallInst *CI);
     89   // On 7XX and 8XX hardware, we do not have 24-bit signed operations, so in
     90   // this case we need to expand them. These functions check for 24-bit functions
     91   // and then expand them.
     92   bool isSigned24BitOps(CallInst *CI);
     93   void expandSigned24BitOps(CallInst *CI);
     94   // One optimization that can occur is that if the required workgroup size is
     95   // specified then the result of get_local_size is known at compile time and
     96   // can be returned accordingly.
     97   bool isRWGLocalOpt(CallInst *CI);
     98   // On Northern Islands cards, the division is slightly less accurate than on
     99   // previous generations, so we need to utilize a more accurate division there.
    100   // On all other cards we can translate the accurate divide to a normal divide.
    101   bool convertAccurateDivide(CallInst *CI);
    102   void expandAccurateDivide(CallInst *CI);
    103   // If the alignment is set incorrectly, it can produce really inefficient
    104   // code. This checks for this scenario and fixes it if possible.
    105   bool correctMisalignedMemOp(Instruction *inst);
    106 
    107   // If we are in no-opt mode, then we need to make sure that
    108   // local samplers are properly propagated, as constant propagation
    109   // doesn't occur and we need to know the value of kernel-defined
    110   // samplers at compile time.
    111   bool propagateSamplerInst(CallInst *CI);
    112 
    113   // Helper functions
    114 
    115   // Group of functions that recursively calculate the size of a structure based
    116   // on its sub-types.
    117   size_t getTypeSize(Type * const T, bool dereferencePtr = false);
    118   size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
    119   size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
    120   size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
    121   size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
    122   size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
    123   size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
    124   size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
    125 
    126   LLVMContext *mCTX;
    127   Function *mF;
    128   const AMDGPUSubtarget *mSTM;
    129   SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
    130   SmallVector<CallInst *, 16> isConstVec;
    131 }; // class AMDGPUPeepholeOpt
    132   char AMDGPUPeepholeOpt::ID = 0;
    133 
    134 // A template function that has two levels of looping before calling the
    135 // function with a pointer to the current iterator.
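        // If F returns true, it is expected to have advanced (and possibly erased)
        // the inner iterator itself, so the loop below only increments sf when F
        // returns false.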
    136 template<class InputIterator, class SecondIterator, class Function>
    137 Function safeNestedForEach(InputIterator First, InputIterator Last,
    138                               SecondIterator S, Function F)
    139 {
    140   for ( ; First != Last; ++First) {
    141     SecondIterator sf, sl;
    142     for (sf = First->begin(), sl = First->end();
    143          sf != sl; )  {
    144       if (!F(&sf)) {
    145         ++sf;
    146       }
    147     }
    148   }
    149   return F;
    150 }
    151 
    152 } // anonymous namespace
    153 
    154 namespace llvm {
    155   FunctionPass *
    156   createAMDGPUPeepholeOpt(TargetMachine &tm)
    157   {
    158     return new AMDGPUPeepholeOpt(tm);
    159   }
    160 } // llvm namespace
    161 
    162 AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
    163   : FunctionPass(ID), TM(tm)
    164 {
    165   mDebug = false;
    166   optLevel = TM.getOptLevel();
    167 
    168 }
    169 
    170 AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()
    171 {
    172 }
    173 
    174 const char *
    175 AMDGPUPeepholeOpt::getPassName() const
    176 {
    177   return "AMDGPU PeepHole Optimization Pass";
    178 }
    179 
    180 bool
    181 containsPointerType(Type *Ty)
    182 {
    183   if (!Ty) {
    184     return false;
    185   }
    186   switch(Ty->getTypeID()) {
    187   default:
    188     return false;
    189   case Type::StructTyID: {
    190     const StructType *ST = dyn_cast<StructType>(Ty);
    191     for (StructType::element_iterator stb = ST->element_begin(),
    192            ste = ST->element_end(); stb != ste; ++stb) {
    193       if (!containsPointerType(*stb)) {
    194         continue;
    195       }
    196       return true;
    197     }
    198     break;
    199   }
    200   case Type::VectorTyID:
    201   case Type::ArrayTyID:
    202     return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
    203   case Type::PointerTyID:
    204     return true;
    205   };
    206   return false;
    207 }
    208 
    209 bool
    210 AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)
    211 {
    212   bool dumpAll = false;
    213   for (Function::const_arg_iterator cab = F.arg_begin(),
    214        cae = F.arg_end(); cab != cae; ++cab) {
    215     const Argument *arg = cab;
    216     const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    217     if (!PT) {
    218       continue;
    219     }
    220     Type *DereferencedType = PT->getElementType();
    221     if (!dyn_cast<StructType>(DereferencedType)) {
    223       continue;
    224     }
    225     if (!containsPointerType(DereferencedType)) {
    226       continue;
    227     }
    228     // FIXME: Because a pointer inside of a struct/union may be aliased to
    229     // another pointer, we need to take the conservative approach and place all
    230     // pointers into the arena until more advanced detection is implemented.
    231     dumpAll = true;
    232   }
    233   return dumpAll;
    234 }
    235 void
    236 AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
    237 {
    238   if (isConstVec.empty()) {
    239     return;
    240   }
    241   for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    242     CallInst *CI = isConstVec[x];
    243     Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    244     Type *aType = Type::getInt32Ty(*mCTX);
    245     Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
    246       : ConstantInt::get(aType, 0);
    247     CI->replaceAllUsesWith(Val);
    248     CI->eraseFromParent();
    249   }
    250   isConstVec.clear();
    251 }
    252 void
    253 AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
    254 {
    255   // Don't do anything if we don't have any atomic operations.
    256   if (atomicFuncs.empty()) {
    257     return;
    258   }
    259   // Change the function name for the atomic if it is required
    260   uint32_t size = atomicFuncs.size();
    261   for (uint32_t x = 0; x < size; ++x) {
    262     atomicFuncs[x].first->setOperand(
    263         atomicFuncs[x].first->getNumOperands()-1,
    264         atomicFuncs[x].second);
    265 
    266   }
    267   mChanged = true;
    268   if (mConvertAtomics) {
    269     return;
    270   }
    271 }
    272 
    273 bool
    274 AMDGPUPeepholeOpt::runOnFunction(Function &MF)
    275 {
    276   mChanged = false;
    277   mF = &MF;
    278   mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
    279   if (mDebug) {
    280     MF.dump();
    281   }
    282   mCTX = &MF.getType()->getContext();
    283   mConvertAtomics = true;
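          // Walk every instruction in every basic block; std::bind1st/std::mem_fun
          // bind 'this' so instLevelOptimizations can be used as the callback that
          // safeNestedForEach invokes with a pointer to each iterator.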
    284   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
    285      std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
    286                   this));
    287 
    288   doAtomicConversionIfNeeded(MF);
    289   doIsConstCallConversionIfNeeded();
    290 
    291   if (mDebug) {
    292     MF.dump();
    293   }
    294   return mChanged;
    295 }
    296 
    297 bool
    298 AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
    299 {
    300   Instruction *inst = (*bbb);
    301   CallInst *CI = dyn_cast<CallInst>(inst);
    302   if (!CI) {
    303     return false;
    304   }
    305   if (isSigned24BitOps(CI)) {
    306     expandSigned24BitOps(CI);
    307     ++(*bbb);
    308     CI->eraseFromParent();
    309     return true;
    310   }
    311   if (propagateSamplerInst(CI)) {
    312     return false;
    313   }
    314   if (expandBFI(CI) || expandBFM(CI)) {
    315     ++(*bbb);
    316     CI->eraseFromParent();
    317     return true;
    318   }
    319   if (convertAccurateDivide(CI)) {
    320     expandAccurateDivide(CI);
    321     ++(*bbb);
    322     CI->eraseFromParent();
    323     return true;
    324   }
    325 
    326   StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
    327   if (calleeName.startswith("__amdil_is_constant")) {
    328     // If we do not have optimizations, then this
    329     // cannot be properly evaluated, so we add the
    330     // call instructions to a vector and process
    331     // them at the end of processing, after the
    332     // samplers have been correctly handled.
    333     if (optLevel == CodeGenOpt::None) {
    334       isConstVec.push_back(CI);
    335       return false;
    336     } else {
    337       Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    338       Type *aType = Type::getInt32Ty(*mCTX);
    339       Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
    340         : ConstantInt::get(aType, 0);
    341       CI->replaceAllUsesWith(Val);
    342       ++(*bbb);
    343       CI->eraseFromParent();
    344       return true;
    345     }
    346   }
    347 
    348   if (calleeName.equals("__amdil_is_asic_id_i32")) {
    349     ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    350     Type *aType = Type::getInt32Ty(*mCTX);
    351     Value *Val = CV;
    352     if (Val) {
    353       Val = ConstantInt::get(aType,
    354           mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    355     } else {
    356       Val = ConstantInt::get(aType, 0);
    357     }
    358     CI->replaceAllUsesWith(Val);
    359     ++(*bbb);
    360     CI->eraseFromParent();
    361     return true;
    362   }
    363   Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
    364   if (!F) {
    365     return false;
    366   }
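          // Atomics whose return value is never used (exchange atomics excluded) are
          // redirected to the '_noret' variants; the call is only recorded here and
          // the actual operand rewrite happens later in doAtomicConversionIfNeeded().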
    367   if (F->getName().startswith("__atom") && !CI->getNumUses()
    368       && F->getName().find("_xchg") == StringRef::npos) {
    369     std::string buffer(F->getName().str() + "_noret");
    370     F = dyn_cast<Function>(
    371           F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    372     atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
    373   }
    374 
    375   if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
    376       && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    377     return false;
    378   }
    379   if (!mConvertAtomics) {
    380     return false;
    381   }
    382   StringRef name = F->getName();
    383   if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    384     mConvertAtomics = false;
    385   }
    386   return false;
    387 }
    388 
    389 bool
    390 AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
    391     Instruction *&src,
    392     Constant *&mask,
    393     Constant *&shift)
    394 {
    395   if (!base) {
    396     if (mDebug) {
    397       dbgs() << "Null pointer passed into function.\n";
    398     }
    399     return false;
    400   }
    401   bool andOp = false;
    402   if (base->getOpcode() == Instruction::Shl) {
    403     shift = dyn_cast<Constant>(base->getOperand(1));
    404   } else if (base->getOpcode() == Instruction::And) {
    405     mask = dyn_cast<Constant>(base->getOperand(1));
    406     andOp = true;
    407   } else {
    408     if (mDebug) {
    409       dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    410     }
    411     // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
    412     return false;
    413   }
    414   src = dyn_cast<Instruction>(base->getOperand(0));
    415   if (!src) {
    416     if (mDebug) {
    417       dbgs() << "Failed setup since the base operand is not an instruction!\n";
    418     }
    419     return false;
    420   }
    421   // If we find an 'and' operation, then we don't need to
    422   // find the next operation as we already know the
    423   // bits that are valid at this point.
    424   if (andOp) {
    425     return true;
    426   }
    427   if (src->getOpcode() == Instruction::Shl && !shift) {
    428     shift = dyn_cast<Constant>(src->getOperand(1));
    429     src = dyn_cast<Instruction>(src->getOperand(0));
    430   } else if (src->getOpcode() == Instruction::And && !mask) {
    431     mask = dyn_cast<Constant>(src->getOperand(1));
    432   }
    433   if (!mask && !shift) {
    434     if (mDebug) {
    435       dbgs() << "Failed setup since both mask and shift are NULL!\n";
    436     }
    437     // Did not find a constant mask or a shift.
    438     return false;
    439   }
    440   return true;
    441 }
    442 bool
    443 AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)
    444 {
    445   if (!inst) {
    446     return false;
    447   }
    448   if (!inst->isBinaryOp()) {
    449     return false;
    450   }
    451   if (inst->getOpcode() != Instruction::Or) {
    452     return false;
    453   }
    454   if (optLevel == CodeGenOpt::None) {
    455     return false;
    456   }
    457   // We want to do an optimization on a sequence of ops that in the end equals a
    458   // single ISA instruction.
    459   // The base pattern for this optimization is: ((A & B) << C) | ((D & E) << F)
    460   // Some simplified versions of this pattern are as follows:
    461   // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
    462   // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
    463   // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
    464   // (A & B) | (D << F) when (1 << F) >= B
    465   // (A << C) | (D & E) when (1 << C) >= E
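          // For illustration (example not in the original comment):
          //   (x & 0xffff0000) | (y & 0x0000ffff)
          // fits the first simplified pattern and can be emitted as a single
          // ubit_insert that places the high 16 bits of x above the low 16 bits of y.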
    466   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    467     // The HD4XXX hardware doesn't support the ubit_insert instruction.
    468     return false;
    469   }
    470   Type *aType = inst->getType();
    471   bool isVector = aType->isVectorTy();
    472   int numEle = 1;
    473   // This optimization only works on 32-bit integers.
    474   if (aType->getScalarType()
    475       != Type::getInt32Ty(inst->getContext())) {
    476     return false;
    477   }
    478   if (isVector) {
    479     const VectorType *VT = dyn_cast<VectorType>(aType);
    480     numEle = VT->getNumElements();
    481     // We currently cannot support more than 4 elements in an intrinsic and we
    482     // cannot support Vec3 types.
    483     if (numEle > 4 || numEle == 3) {
    484       return false;
    485     }
    486   }
    487   // TODO: Handle vectors.
    488   if (isVector) {
    489     if (mDebug) {
    490       dbgs() << "!!! Vectors are not supported yet!\n";
    491     }
    492     return false;
    493   }
    494   Instruction *LHSSrc = NULL, *RHSSrc = NULL;
    495   Constant *LHSMask = NULL, *RHSMask = NULL;
    496   Constant *LHSShift = NULL, *RHSShift = NULL;
    497   Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
    498   Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
    499   if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    500     if (mDebug) {
    501       dbgs() << "Found an OR Operation that failed setup!\n";
    502       inst->dump();
    503       if (LHS) { LHS->dump(); }
    504       if (LHSSrc) { LHSSrc->dump(); }
    505       if (LHSMask) { LHSMask->dump(); }
    506       if (LHSShift) { LHSShift->dump(); }
    507     }
    508     // There was an issue with the setup for BitInsert.
    509     return false;
    510   }
    511   if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    512     if (mDebug) {
    513       dbgs() << "Found an OR Operation that failed setup!\n";
    514       inst->dump();
    515       if (RHS) { RHS->dump(); }
    516       if (RHSSrc) { RHSSrc->dump(); }
    517       if (RHSMask) { RHSMask->dump(); }
    518       if (RHSShift) { RHSShift->dump(); }
    519     }
    520     // There was an issue with the setup for BitInsert.
    521     return false;
    522   }
    523   if (mDebug) {
    524     dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
    525     dbgs() << "Op:        "; inst->dump();
    526     dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    527     dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    528     dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    529     dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    530     dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    531     dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    532     dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    533     dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
    534   }
    535   Constant *offset = NULL;
    536   Constant *width = NULL;
    537   int32_t lhsMaskVal = 0, rhsMaskVal = 0;
    538   int32_t lhsShiftVal = 0, rhsShiftVal = 0;
    539   int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
    540   int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
    541   lhsMaskVal = (int32_t)(LHSMask
    542       ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
    543   rhsMaskVal = (int32_t)(RHSMask
    544       ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
    545   lhsShiftVal = (int32_t)(LHSShift
    546       ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
    547   rhsShiftVal = (int32_t)(RHSShift
    548       ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
    549   lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
    550   rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
    551   lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
    552   rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
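          // When no explicit mask is present, everything at or above the shift amount
          // is assumed live, hence width = 32 - shift and offset = shift.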
    553   // TODO: Handle the case of (A & B) | (D & ~B), i.e. inverted masks.
    554   if (mDebug) {
    555       dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
    556       dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
    557       dbgs() << (RHSMask ? " & E)" : ")");
    558       dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
    559       dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
    560       dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
    561       dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
    562       dbgs() << "width(B) = " << lhsMaskWidth;
    563       dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
    564       dbgs() << "offset(B) = " << lhsMaskOffset;
    565       dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
    566       dbgs() << "Constraints: \n";
    567       dbgs() << "\t(1) B ^ E == 0\n";
    568       dbgs() << "\t(2-LHS) B is a mask\n";
    569       dbgs() << "\t(2-RHS) E is a mask\n";
    570       dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
    571       dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
    572   }
    573   if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    574     if (mDebug) {
    575       dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
    576       dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
    577       dbgs() << "Failed constraint 1!\n";
    578     }
    579     return false;
    580   }
    581   if (mDebug) {
    582     dbgs() << "LHS = " << lhsMaskOffset << "";
    583     dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
    584     dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
    585     dbgs() << "\nRHS = " << rhsMaskOffset << "";
    586     dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
    587     dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
    588     dbgs() << "\n";
    589   }
    590   if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    591     offset = ConstantInt::get(aType, lhsMaskOffset, false);
    592     width = ConstantInt::get(aType, lhsMaskWidth, false);
    593     RHSSrc = RHS;
    594     if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
    595       if (mDebug) {
    596         dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
    597         dbgs() << "Failed constraint 2!\n";
    598       }
    599       return false;
    600     }
    601     if (!LHSShift) {
    602       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    603           "MaskShr", LHS);
    604     } else if (lhsShiftVal != lhsMaskOffset) {
    605       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    606           "MaskShr", LHS);
    607     }
    608     if (mDebug) {
    609       dbgs() << "Optimizing LHS!\n";
    610     }
    611   } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    612     offset = ConstantInt::get(aType, rhsMaskOffset, false);
    613     width = ConstantInt::get(aType, rhsMaskWidth, false);
    614     LHSSrc = RHSSrc;
    615     RHSSrc = LHS;
    616     if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
    617       if (mDebug) {
    618         dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
    619         dbgs() << "Failed constraint 2!\n";
    620       }
    621       return false;
    622     }
    623     if (!RHSShift) {
    624       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    625           "MaskShr", RHS);
    626     } else if (rhsShiftVal != rhsMaskOffset) {
    627       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    628           "MaskShr", RHS);
    629     }
    630     if (mDebug) {
    631       dbgs() << "Optimizing RHS!\n";
    632     }
    633   } else {
    634     if (mDebug) {
    635       dbgs() << "Failed constraint 3!\n";
    636     }
    637     return false;
    638   }
    639   if (mDebug) {
    640     dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    641     dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    642     dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    643     dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    644   }
    645   if (!offset || !width) {
    646     if (mDebug) {
    647       dbgs() << "Either width or offset are NULL, failed detection!\n";
    648     }
    649     return false;
    650   }
    651   // Let's create the function signature.
    652   std::vector<Type *> callTypes;
    653   callTypes.push_back(aType);
    654   callTypes.push_back(aType);
    655   callTypes.push_back(aType);
    656   callTypes.push_back(aType);
    657   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
    658   std::string name = "__amdil_ubit_insert";
    659   if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
    660   Function *Func =
    661     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
    662         getOrInsertFunction(llvm::StringRef(name), funcType));
    663   Value *Operands[4] = {
    664     width,
    665     offset,
    666     LHSSrc,
    667     RHSSrc
    668   };
    669   CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
    670   if (mDebug) {
    671     dbgs() << "Old Inst: ";
    672     inst->dump();
    673     dbgs() << "New Inst: ";
    674     CI->dump();
    675     dbgs() << "\n\n";
    676   }
    677   CI->insertBefore(inst);
    678   inst->replaceAllUsesWith(CI);
    679   return true;
    680 }
    681 
    682 bool
    683 AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)
    684 {
    685   if (!inst) {
    686     return false;
    687   }
    688   if (!inst->isBinaryOp()) {
    689     return false;
    690   }
    691   if (inst->getOpcode() != Instruction::And) {
    692     return false;
    693   }
    694   if (optLevel == CodeGenOpt::None) {
    695     return false;
    696   }
    697   // We want to do some simple optimizations on shift-right/and patterns. The
    698   // basic optimization is to turn (A >> B) & C, where A is a 32-bit type, B is
    699   // a value smaller than 32, and C is a mask. If C is a constant value, then
    700   // the following transformation can occur. For signed integers, it turns into
    701   // the function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned
    702   // integers, it turns into the function call dst =
    703   // __amdil_ubit_extract(log2(C), B, A). The function __amdil_[u|i]bit_extract
    704   // can be found in Section 7.9 of the ATI IL spec of the Stream SDK for
    705   // Evergreen hardware.
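          // For illustration (example not in the original comment): (x >> 8) & 0xff
          // has B = 8 and C = 0xff (eight trailing ones), so it becomes a bit-extract
          // of width 8 starting at bit 8 of x.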
    706   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    707     // This does not work on HD4XXX hardware.
    708     return false;
    709   }
    710   Type *aType = inst->getType();
    711   bool isVector = aType->isVectorTy();
    712 
    713   // XXX Support vector types
    714   if (isVector) {
    715     return false;
    716   }
    717   int numEle = 1;
    718   // This only works on 32-bit integers.
    719   if (aType->getScalarType()
    720       != Type::getInt32Ty(inst->getContext())) {
    721     return false;
    722   }
    723   if (isVector) {
    724     const VectorType *VT = dyn_cast<VectorType>(aType);
    725     numEle = VT->getNumElements();
    726     // We currently cannot support more than 4 elements in an intrinsic and we
    727     // cannot support Vec3 types.
    728     if (numEle > 4 || numEle == 3) {
    729       return false;
    730     }
    731   }
    732   BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
    733   // If the first operand is not a shift instruction, then we can return as it
    734   // doesn't match this pattern.
    735   if (!ShiftInst || !ShiftInst->isShift()) {
    736     return false;
    737   }
    738   // If we are a shift left, then we don't match this pattern.
    739   if (ShiftInst->getOpcode() == Instruction::Shl) {
    740     return false;
    741   }
    742   bool isSigned = ShiftInst->isArithmeticShift();
    743   Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
    744   Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
    745   // Let's make sure that the shift value and the and mask are constant integers.
    746   if (!AndMask || !ShrVal) {
    747     return false;
    748   }
    749   Constant *newMaskConst;
    750   Constant *shiftValConst;
    751   if (isVector) {
    752     // Handle the vector case
    753     std::vector<Constant *> maskVals;
    754     std::vector<Constant *> shiftVals;
    755     ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    756     ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    757     Type *scalarType = AndMaskVec->getType()->getScalarType();
    758     assert(AndMaskVec->getNumOperands() ==
    759            ShrValVec->getNumOperands() && "cannot have a "
    760            "combination where the number of elements to a "
    761            "shift and an and are different!");
    762     for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
    763       ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
    764       ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
    765       if (!AndCI || !ShiftIC) {
    766         return false;
    767       }
    768       uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
    769       if (!isMask_32(maskVal)) {
    770         return false;
    771       }
    772       maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    773       uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
    774       // If the mask or shiftval is greater than the bitcount, then break out.
    775       if (maskVal >= 32 || shiftVal >= 32) {
    776         return false;
    777       }
    778       // If the mask val is greater than the number of original bits left,
    779       // then this optimization is invalid.
    780       if (maskVal > (32 - shiftVal)) {
    781         return false;
    782       }
    783       maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
    784       shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    785     }
    786     newMaskConst = ConstantVector::get(maskVals);
    787     shiftValConst = ConstantVector::get(shiftVals);
    788   } else {
    789     // Handle the scalar case
    790     uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    791     // This must be a mask value where all lower bits are set to 1 and then any
    792     // bit higher is set to 0.
    793     if (!isMask_32(maskVal)) {
    794       return false;
    795     }
    796     maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    797     // Count the number of bits set in the mask, this is the width of the
    798     // resulting bit set that is extracted from the source value.
    799     uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    800     // If the mask or shift val is greater than the bitcount, then break out.
    801     if (maskVal >= 32 || shiftVal >= 32) {
    802       return false;
    803     }
    804     // If the mask val is greater than the number of original bits left, then
    805     // this optimization is invalid.
    806     if (maskVal > (32 - shiftVal)) {
    807       return false;
    808     }
    809     newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    810     shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
    811   }
    812   // Let's create the function signature.
    813   std::vector<Type *> callTypes;
    814   callTypes.push_back(aType);
    815   callTypes.push_back(aType);
    816   callTypes.push_back(aType);
    817   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
    818   std::string name = "llvm.AMDIL.bit.extract.u32";
    819   if (isVector) {
    820     name += ".v" + itostr(numEle) + "i32";
    821   } else {
    822     name += ".";
    823   }
    824   // Let's create the function.
    825   Function *Func =
    826     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
    827                        getOrInsertFunction(llvm::StringRef(name), funcType));
    828   Value *Operands[3] = {
    829     ShiftInst->getOperand(0),
    830     shiftValConst,
    831     newMaskConst
    832   };
    833   // Let's create the call with the operands.
    834   CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
    835   CI->setDoesNotAccessMemory();
    836   CI->insertBefore(inst);
    837   inst->replaceAllUsesWith(CI);
    838   return true;
    839 }
    840 
    841 bool
    842 AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
    843 {
    844   if (!CI) {
    845     return false;
    846   }
    847   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
    848   if (!LHS->getName().startswith("__amdil_bfi")) {
    849     return false;
    850   }
    851   Type* type = CI->getOperand(0)->getType();
    852   Constant *negOneConst = NULL;
    853   if (type->isVectorTy()) {
    854     std::vector<Constant *> negOneVals;
    855     negOneConst = ConstantInt::get(CI->getContext(),
    856         APInt(32, StringRef("-1"), 10));
    857     for (size_t x = 0,
    858         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
    859       negOneVals.push_back(negOneConst);
    860     }
    861     negOneConst = ConstantVector::get(negOneVals);
    862   } else {
    863     negOneConst = ConstantInt::get(CI->getContext(),
    864         APInt(32, StringRef("-1"), 10));
    865   }
    866   // __amdil_bfi => (A & B) | (~A & C)
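          // i.e. a bit-field select: each result bit comes from B where A is 1 and
          // from C where A is 0. The expansion below builds exactly that expression.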
    867   BinaryOperator *lhs =
    868     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
    869         CI->getOperand(1), "bfi_and", CI);
    870   BinaryOperator *rhs =
    871     BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
    872         "bfi_not", CI);
    873   rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
    874       "bfi_and", CI);
    875   lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
    876   CI->replaceAllUsesWith(lhs);
    877   return true;
    878 }
    879 
    880 bool
    881 AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
    882 {
    883   if (!CI) {
    884     return false;
    885   }
    886   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
    887   if (!LHS->getName().startswith("__amdil_bfm")) {
    888     return false;
    889   }
    890   // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
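          // For example, src0 = 8 and src1 = 4 give ((1 << 8) - 1) << 4 = 0xff0,
          // an 8-bit-wide mask at bit offset 4.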
    891   Constant *newMaskConst = NULL;
    892   Constant *newShiftConst = NULL;
    893   Type* type = CI->getOperand(0)->getType();
    894   if (type->isVectorTy()) {
    895     std::vector<Constant*> newMaskVals, newShiftVals;
    896     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    897     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    898     for (size_t x = 0,
    899         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
    900       newMaskVals.push_back(newMaskConst);
    901       newShiftVals.push_back(newShiftConst);
    902     }
    903     newMaskConst = ConstantVector::get(newMaskVals);
    904     newShiftConst = ConstantVector::get(newShiftVals);
    905   } else {
    906     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    907     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    908   }
    909   BinaryOperator *lhs =
    910     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
    911         newMaskConst, "bfm_mask", CI);
    912   lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
    913       lhs, "bfm_shl", CI);
    914   lhs = BinaryOperator::Create(Instruction::Sub, lhs,
    915       newShiftConst, "bfm_sub", CI);
    916   BinaryOperator *rhs =
    917     BinaryOperator::Create(Instruction::And, CI->getOperand(1),
    918         newMaskConst, "bfm_mask", CI);
    919   lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
    920   CI->replaceAllUsesWith(lhs);
    921   return true;
    922 }
    923 
    924 bool
    925 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
    926 {
    927   Instruction *inst = (*bbb);
    928   if (optimizeCallInst(bbb)) {
    929     return true;
    930   }
    931   if (optimizeBitExtract(inst)) {
    932     return false;
    933   }
    934   if (optimizeBitInsert(inst)) {
    935     return false;
    936   }
    937   if (correctMisalignedMemOp(inst)) {
    938     return false;
    939   }
    940   return false;
    941 }
    942 bool
    943 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
    944 {
    945   LoadInst *linst = dyn_cast<LoadInst>(inst);
    946   StoreInst *sinst = dyn_cast<StoreInst>(inst);
    947   unsigned alignment;
    948   Type* Ty = inst->getType();
    949   if (linst) {
    950     alignment = linst->getAlignment();
    951     Ty = inst->getType();
    952   } else if (sinst) {
    953     alignment = sinst->getAlignment();
    954     Ty = sinst->getValueOperand()->getType();
    955   } else {
    956     return false;
    957   }
    958   unsigned size = getTypeSize(Ty);
    959   if (size == alignment || size < alignment) {
    960     return false;
    961   }
    962   if (!Ty->isStructTy()) {
    963     return false;
    964   }
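          // An alignment of 0 means "unspecified", which lets the backend fall back
          // to the type's natural (ABI) alignment instead of the under-sized one.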
    965   if (alignment < 4) {
    966     if (linst) {
    967       linst->setAlignment(0);
    968       return true;
    969     } else if (sinst) {
    970       sinst->setAlignment(0);
    971       return true;
    972     }
    973   }
    974   return false;
    975 }
    976 bool
    977 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)
    978 {
    979   if (!CI) {
    980     return false;
    981   }
    982   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
    983   std::string namePrefix = LHS->getName().substr(0, 14);
    984   if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
    985       && namePrefix != "__amdil__imul24_high") {
    986     return false;
    987   }
    988   if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    989     return false;
    990   }
    991   return true;
    992 }
    993 
    994 void
    995 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)
    996 {
    997   assert(isSigned24BitOps(CI) && "Must be a "
    998       "signed 24 bit operation to call this function!");
    999   Value *LHS = CI->getOperand(CI->getNumOperands()-1);
   1000   // On 7XX and 8XX we do not have signed 24-bit operations, so we need to
   1001   // expand them to the following:
   1002   // imul24 turns into 32-bit imul
   1003   // imad24 turns into 32-bit imad
   1004   // imul24_high turns into 32-bit imul_high
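          // For example, a call to __amdil_imul24(a, b) is simply replaced below by a
          // 32-bit 'mul a, b', while imad24 and imul24_high are rebuilt as calls to
          // their 32-bit __amdil_imad / __amdil_imul_high counterparts.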
   1005   if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
   1006     Type *aType = CI->getOperand(0)->getType();
   1007     bool isVector = aType->isVectorTy();
   1008     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
   1009     std::vector<Type*> callTypes;
   1010     callTypes.push_back(CI->getOperand(0)->getType());
   1011     callTypes.push_back(CI->getOperand(1)->getType());
   1012     callTypes.push_back(CI->getOperand(2)->getType());
   1013     FunctionType *funcType =
   1014       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
   1015     std::string name = "__amdil_imad";
   1016     if (isVector) {
   1017       name += "_v" + itostr(numEle) + "i32";
   1018     } else {
   1019       name += "_i32";
   1020     }
   1021     Function *Func = dyn_cast<Function>(
   1022                        CI->getParent()->getParent()->getParent()->
   1023                        getOrInsertFunction(llvm::StringRef(name), funcType));
   1024     Value *Operands[3] = {
   1025       CI->getOperand(0),
   1026       CI->getOperand(1),
   1027       CI->getOperand(2)
   1028     };
   1029     CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
   1030     nCI->insertBefore(CI);
   1031     CI->replaceAllUsesWith(nCI);
   1032   } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
   1033     BinaryOperator *mulOp =
   1034       BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
   1035           CI->getOperand(1), "imul24", CI);
   1036     CI->replaceAllUsesWith(mulOp);
   1037   } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
   1038     Type *aType = CI->getOperand(0)->getType();
   1039 
   1040     bool isVector = aType->isVectorTy();
   1041     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
   1042     std::vector<Type*> callTypes;
   1043     callTypes.push_back(CI->getOperand(0)->getType());
   1044     callTypes.push_back(CI->getOperand(1)->getType());
   1045     FunctionType *funcType =
   1046       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
   1047     std::string name = "__amdil_imul_high";
   1048     if (isVector) {
   1049       name += "_v" + itostr(numEle) + "i32";
   1050     } else {
   1051       name += "_i32";
   1052     }
   1053     Function *Func = dyn_cast<Function>(
   1054                        CI->getParent()->getParent()->getParent()->
   1055                        getOrInsertFunction(llvm::StringRef(name), funcType));
   1056     Value *Operands[2] = {
   1057       CI->getOperand(0),
   1058       CI->getOperand(1)
   1059     };
   1060     CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
   1061     nCI->insertBefore(CI);
   1062     CI->replaceAllUsesWith(nCI);
   1063   }
   1064 }
   1065 
   1066 bool
   1067 AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)
   1068 {
   1069   return (CI != NULL
   1070           && CI->getOperand(CI->getNumOperands() - 1)->getName()
   1071           == "__amdil_get_local_size_int");
   1072 }
   1073 
   1074 bool
   1075 AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)
   1076 {
   1077   if (!CI) {
   1078     return false;
   1079   }
   1080   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
   1081       && (mSTM->getDeviceName() == "cayman")) {
   1082     return false;
   1083   }
   1084   return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
   1085       == "__amdil_improved_div";
   1086 }
   1087 
   1088 void
   1089 AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)
   1090 {
   1091   assert(convertAccurateDivide(CI)
   1092          && "expanding accurate divide can only happen if it is expandable!");
   1093   BinaryOperator *divOp =
   1094     BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
   1095                            CI->getOperand(1), "fdiv32", CI);
   1096   CI->replaceAllUsesWith(divOp);
   1097 }
   1098 
   1099 bool
   1100 AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
   1101 {
   1102   if (optLevel != CodeGenOpt::None) {
   1103     return false;
   1104   }
   1105 
   1106   if (!CI) {
   1107     return false;
   1108   }
   1109 
   1110   unsigned funcNameIdx = CI->getNumOperands() - 1;
   1112   StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
   1113   if (calleeName != "__amdil_image2d_read_norm"
   1114    && calleeName != "__amdil_image2d_read_unnorm"
   1115    && calleeName != "__amdil_image3d_read_norm"
   1116    && calleeName != "__amdil_image3d_read_unnorm") {
   1117     return false;
   1118   }
   1119 
   1120   unsigned samplerIdx = 1;
   1122   Value *sampler = CI->getOperand(samplerIdx);
   1123   LoadInst *lInst = dyn_cast<LoadInst>(sampler);
   1124   if (!lInst) {
   1125     return false;
   1126   }
   1127 
   1128   if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
   1129     return false;
   1130   }
   1131 
   1132   GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
   1133   // If we are loading from what is not a global value, then we
   1134   // fail and return.
   1135   if (!gv) {
   1136     return false;
   1137   }
   1138 
   1139   // If we don't have an initializer or we have an initializer and
   1140   // If we don't have an initializer, or we have an initializer that
   1141   // is not a 32-bit integer, we fail.
   1142       || !gv->getInitializer()->getType()->isIntegerTy(32)) {
   1143       return false;
   1144   }
   1145 
   1146   // Now that we have the global variable initializer, let's replace
   1147   // all uses of the load instruction with the samplerVal and
   1148   // reparse the __amdil_is_constant() function.
   1149   Constant *samplerVal = gv->getInitializer();
   1150   lInst->replaceAllUsesWith(samplerVal);
   1151   return true;
   1152 }
   1153 
   1154 bool
   1155 AMDGPUPeepholeOpt::doInitialization(Module &M)
   1156 {
   1157   return false;
   1158 }
   1159 
   1160 bool
   1161 AMDGPUPeepholeOpt::doFinalization(Module &M)
   1162 {
   1163   return false;
   1164 }
   1165 
   1166 void
   1167 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
   1168 {
   1169   AU.addRequired<MachineFunctionAnalysis>();
   1170   FunctionPass::getAnalysisUsage(AU);
   1171   AU.setPreservesAll();
   1172 }
   1173 
   1174 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
   1175   size_t size = 0;
   1176   if (!T) {
   1177     return size;
   1178   }
   1179   switch (T->getTypeID()) {
   1180   case Type::X86_FP80TyID:
   1181   case Type::FP128TyID:
   1182   case Type::PPC_FP128TyID:
   1183   case Type::LabelTyID:
   1184     assert(0 && "These types are not supported by this backend");
   1185   default:
   1186   case Type::FloatTyID:
   1187   case Type::DoubleTyID:
   1188     size = T->getPrimitiveSizeInBits() >> 3;
   1189     break;
   1190   case Type::PointerTyID:
   1191     size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
   1192     break;
   1193   case Type::IntegerTyID:
   1194     size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
   1195     break;
   1196   case Type::StructTyID:
   1197     size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
   1198     break;
   1199   case Type::ArrayTyID:
   1200     size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
   1201     break;
   1202   case Type::FunctionTyID:
   1203     size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
   1204     break;
   1205   case Type::VectorTyID:
   1206     size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
   1207     break;
   1208   };
   1209   return size;
   1210 }
   1211 
   1212 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
   1213     bool dereferencePtr) {
   1214   size_t size = 0;
   1215   if (!ST) {
   1216     return size;
   1217   }
   1218   Type *curType;
   1219   StructType::element_iterator eib;
   1220   StructType::element_iterator eie;
   1221   for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
   1222     curType = *eib;
   1223     size += getTypeSize(curType, dereferencePtr);
   1224   }
   1225   return size;
   1226 }
   1227 
   1228 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
   1229     bool dereferencePtr) {
   1230   return IT ? (IT->getBitWidth() >> 3) : 0;
   1231 }
   1232 
   1233 size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
   1234     bool dereferencePtr) {
   1235     assert(0 && "Should not be able to calculate the size of a function type");
   1236     return 0;
   1237 }
   1238 
   1239 size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
   1240     bool dereferencePtr) {
   1241   return (size_t)(AT ? (getTypeSize(AT->getElementType(),
   1242                                     dereferencePtr) * AT->getNumElements())
   1243                      : 0);
   1244 }
   1245 
   1246 size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
   1247     bool dereferencePtr) {
   1248   return VT ? (VT->getBitWidth() >> 3) : 0;
   1249 }
   1250 
   1251 size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
   1252     bool dereferencePtr) {
   1253   if (!PT) {
   1254     return 0;
   1255   }
   1256   Type *CT = PT->getElementType();
   1257   if (CT->getTypeID() == Type::StructTyID &&
   1258       PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
   1259     return getTypeSize(dyn_cast<StructType>(CT));
   1260   } else if (dereferencePtr) {
   1261     size_t size = 0;
   1262     for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
   1263       size += getTypeSize(PT->getContainedType(x), dereferencePtr);
   1264     }
   1265     return size;
   1266   } else {
   1267     return 4;
   1268   }
   1269 }
   1270 
   1271 size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
   1272     bool dereferencePtr) {
   1273   //assert(0 && "Should not be able to calculate the size of an opaque type");
   1274   return 4;
   1275 }
   1276