//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//==-----------------------------------------------------------------------===//

#define DEBUG_TYPE "PeepholeOpt"
#ifdef DEBUG
#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
#else
#define DEBUGME 0
#endif

#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

#include <sstream>

#if 0
STATISTIC(PointerAssignments, "Number of dynamic pointer "
                              "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
#endif

using namespace llvm;
// The Peephole optimization pass is used to do simple last-minute optimizations
// that are required for correct code or to remove redundant functions.
namespace {

class OpaqueType;

class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If so, we set all pointers to exist in the arena. This is a
  // workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because we don't want to invalidate any pointers while in the
  // safeNestedForEach function, atomic conversions are pushed to a vector
  // and handled later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
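  // (Sketch of what the expansion produces, following the formula noted in
  // expandBFI below: a call such as
  //   %r = call i32 @__amdil_bfi(i32 %a, i32 %b, i32 %c)
  // is rewritten in place as (%a & %b) | (~%a & %c) using plain And/Xor/Or
  // instructions.)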
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX hardware we do not have 24 bit signed operations, so in
  // this case we need to expand them. These functions check for 24bit functions
  // and then expand.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On Northern Islands cards the division is slightly less accurate than on
  // previous generations, so we need to use a more accurate division there.
  // On all other cards we can translate the accurate divide back to a normal
  // divide.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation
  // doesn't occur and we need to know the value of kernel defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);

  // Helper functions

  // Group of functions that recursively calculate the size of a structure
  // based on its sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  SmallVector<std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
char AMDGPUPeepholeOpt::ID = 0;

// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
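// Note: the callable F is expected to return true when it has already
// advanced (or erased and stepped past) the iterator itself, and false when
// the loop should advance it; runOnFunction below binds
// instLevelOptimizations through std::bind1st/std::mem_fun for exactly this
// contract.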
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                           SecondIterator S, Function F) {
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
         sf != sl; ) {
      if (!F(&sf)) {
        ++sf;
      }
    }
  }
  return F;
}

} // anonymous namespace

namespace llvm {
FunctionPass *
createAMDGPUPeepholeOpt(TargetMachine &tm) {
  return new AMDGPUPeepholeOpt(tm);
}
} // llvm namespace

AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm) {
  mDebug = DEBUGME;
  optLevel = TM.getOptLevel();
}

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
}

const char *
AMDGPUPeepholeOpt::getPassName() const {
  return "AMDGPU PeepHole Optimization Pass";
}

bool
containsPointerType(Type *Ty) {
  if (!Ty) {
    return false;
  }
  switch(Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
         ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  };
  return false;
}

bool
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer, we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
  // Don't do anything if we don't have any atomic operations.
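  // (The pairs in atomicFuncs were queued up in optimizeCallInst: each entry
  // is a no-return-value atomic call together with the matching "_noret"
  // function created there; the loop below simply swaps the callee operand
  // over to that function.)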
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
        atomicFuncs[x].first->getNumOperands()-1,
        atomicFuncs[x].second);
  }
  mChanged = true;
  if (mConvertAtomics) {
    return;
  }
}

bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                  this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  if (mDebug) {
    MF.dump();
  }
  return mChanged;
}

bool
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If we do not have optimizations, then this
    // cannot be properly evaluated, so we add the
    // call instruction to a vector and process
    // them at the end of processing after the
    // samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    } else {
      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
      Type *aType = Type::getInt32Ty(*mCTX);
      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                                : ConstantInt::get(aType, 0);
      CI->replaceAllUsesWith(Val);
      ++(*bbb);
      CI->eraseFromParent();
      return true;
    }
  }

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair(CI, F));
  }

  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}

bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
                                  Instruction *&src,
                                  Constant *&mask,
                                  Constant *&shift) {
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl nor an And, it doesn't fit any of the
    // patterns we are looking for.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals
  // a single ISA instruction.
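  // (On hardware that supports it, the whole sequence collapses into the
  // single __amdil_ubit_insert call that is created at the bottom of this
  // function.)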
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  // TODO: Handle vectors.
  if (isVector) {
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (mDebug) {
    dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
    dbgs() << "Op: "; inst->dump();
    dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  }
  Constant *offset = NULL;
  Constant *width = NULL;
  uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
  uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
  uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    return false;
  }
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  }
  if (!offset || !width) {
    if (mDebug) {
      dbgs() << "Either width or offset are NULL, failed detection!\n";
    }
    return false;
  }
  // Let's create the function signature.
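  // (A sketch of the resulting IR, assuming the scalar i32 case: the original
  //   %or = or i32 %lhs, %rhs
  // is replaced by
  //   %BitInsertOpt = call i32 @__amdil_ubit_insert_u32(i32 <width>,
  //                       i32 <offset>, i32 %LHSSrc, i32 %RHSSrc)
  // matching the Operands array built below.)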
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on shift-right/and patterns. The
  // basic optimization is to turn (A >> B) & C, where A is a 32bit type, B is
  // a value smaller than 32 and C is a mask, into a single instruction. If C
  // is a constant value, then the following transformation can occur. For
  // signed integers, it turns into the function call
  //   dst = __amdil_ibit_extract(log2(C), B, A)
  // For unsigned integers, it turns into the function call
  //   dst = __amdil_ubit_extract(log2(C), B, A)
  // The function __amdil_[u|i]bit_extract can be found in Section 7.9 of the
  // ATI IL spec of the stream SDK for Evergreen hardware.
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // XXX Support vector types
  if (isVector) {
    return false;
  }
  int numEle = 1;
  // This only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If it is a shift left, then it doesn't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Let's make sure that the shift value and the and mask are constant
  // integers.
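  // If either one is not a compile-time constant, the extract width and
  // offset cannot be computed here, so the pattern is rejected.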
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case.
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shiftval is greater than the bitcount, then break out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the number of original bits left,
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case.
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then any
    // bit higher is set to 0.
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask; this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the number of original bits left, then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDGPU.bit.extract.u32";
  if (isVector) {
    name += ".v" + itostr(numEle) + "i32";
  } else {
    name += ".";
  }
  // Let's create the function.
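  // (Illustrative scalar example, not taken from a real kernel: for
  //   %s = lshr i32 %src, 8
  //   %e = and i32 %s, 255
  // the call created below receives the operands (%src, 8, 8), i.e.
  // (source, shift amount, extracted width in bits).)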
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
    shiftValConst,
    newMaskConst
  };
  // Let's create the call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type* type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
                           "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
                               "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type* type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant*> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
                               lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
                               newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    return true;
  }
  if (optimizeBitExtract(inst)) {
    return false;
  }
  if (optimizeBitInsert(inst)) {
    return false;
  }
  if (correctMisalignedMemOp(inst)) {
    return false;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  unsigned alignment;
  Type* Ty = inst->getType();
  if (linst) {
    alignment = linst->getAlignment();
    Ty = inst->getType();
  } else if (sinst) {
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  } else {
    return false;
  }
  unsigned size = getTypeSize(Ty);
  if (size == alignment || size < alignment) {
    return false;
  }
  if (!Ty->isStructTy()) {
    return false;
  }
  if (alignment < 4) {
    if (linst) {
      linst->setAlignment(0);
      return true;
    } else if (sinst) {
      sinst->setAlignment(0);
      return true;
    }
  }
  return false;
}
bool
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
      && namePrefix != "__amdil__imul24_high") {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}

void
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
  assert(isSigned24BitOps(CI) && "Must be a "
         "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24bit, so we need to
  // expand it to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
                             CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();

    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  }
}

bool
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName()
             == "__amdil_get_local_size_int");
}

bool
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
           == "__amdil_improved_div";
}

void
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
                           CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}

bool
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
  if (optLevel != CodeGenOpt::None) {
    return false;
  }

  if (!CI) {
    return false;
  }

  unsigned funcNameIdx = 0;
  funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
      && calleeName != "__amdil_image2d_read_unnorm"
      && calleeName != "__amdil_image3d_read_norm"
      && calleeName != "__amdil_image3d_read_unnorm") {
    return false;
  }

  unsigned samplerIdx = 2;
  samplerIdx = 1;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (!lInst) {
    return false;
  }

  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return false;
  }

  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // fail and return.
  if (!gv) {
    return false;
  }

  // If we don't have an initializer, or we have an initializer and
  // the initializer is not a 32bit integer, we fail.
  if (!gv->hasInitializer()
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
    return false;
  }

  // Now that we have the global variable initializer, let's replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
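  // (Any __amdil_is_constant calls that were queued in isConstVec are folded
  // afterwards by doIsConstCallConversionIfNeeded, which is why that step
  // runs after the image/sampler handling in runOnFunction.)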
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
  return true;
}

bool
AMDGPUPeepholeOpt::doInitialization(Module &M) {
  return false;
}

bool
AMDGPUPeepholeOpt::doFinalization(Module &M) {
  return false;
}

void
AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
}

size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  size_t size = 0;
  if (!T) {
    return size;
  }
  switch (T->getTypeID()) {
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
  default:
  case Type::FloatTyID:
  case Type::DoubleTyID:
    size = T->getPrimitiveSizeInBits() >> 3;
    break;
  case Type::PointerTyID:
    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
    break;
  case Type::IntegerTyID:
    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
    break;
  case Type::StructTyID:
    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
    break;
  case Type::ArrayTyID:
    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
    break;
  case Type::FunctionTyID:
    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
    break;
  case Type::VectorTyID:
    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
    break;
  };
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
                                      bool dereferencePtr) {
  size_t size = 0;
  if (!ST) {
    return size;
  }
  Type *curType;
  StructType::element_iterator eib;
  StructType::element_iterator eie;
  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
    curType = *eib;
    size += getTypeSize(curType, dereferencePtr);
  }
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
                                      bool dereferencePtr) {
  return IT ? (IT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
                                      bool dereferencePtr) {
  assert(0 && "Should not be able to calculate the size of a function type");
  return 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
                                      bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())
                     : 0);
}

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
                                      bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
                                      bool dereferencePtr) {
  if (!PT) {
    return 0;
  }
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    size_t size = 0;
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
    }
    return size;
  } else {
    return 4;
  }
}

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
                                      bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  return 4;
}