//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

#include <sstream>

#if 0
STATISTIC(PointerAssignments, "Number of dynamic pointer "
    "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
#endif

using namespace llvm;
// The peephole optimization pass is used to do simple last-minute
// optimizations that are required for correct code or to remove redundant
// functions.
namespace {

class OpaqueType;

class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If so, all pointers are set to exist in the arena. This is a
  // workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because we don't want to invalidate any pointers while inside
  // safeNestedForEach, atomic conversions are pushed to a vector and handled
  // later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX devices, we do not have 24-bit signed operations, so in
  // this case we need to expand them. These functions check for 24-bit
  // functions and then expand them.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified, then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On Northern Islands cards, the division is slightly less accurate than on
  // previous generations, so we need to utilize a more accurate division. On
  // all other cards we can translate the accurate divide into a normal divide.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation
  // doesn't occur and we need to know the value of kernel-defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);

  // Helper functions.

  // Group of functions that recursively calculate the size of a structure
  // based on its sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  SmallVector<std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
char AMDGPUPeepholeOpt::ID = 0;

// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
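// The functor must return true when it has already advanced or invalidated
// the inner iterator (e.g. after erasing the current instruction); otherwise
// the loop advances the iterator itself.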
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                           SecondIterator S, Function F)
{
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
         sf != sl; ) {
      if (!F(&sf)) {
        ++sf;
      }
    }
  }
  return F;
}

} // anonymous namespace

namespace llvm {
FunctionPass *
createAMDGPUPeepholeOpt(TargetMachine &tm)
{
  return new AMDGPUPeepholeOpt(tm);
}
} // llvm namespace

AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm)
{
  mDebug = false;
  optLevel = TM.getOptLevel();
}

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()
{
}

const char *
AMDGPUPeepholeOpt::getPassName() const
{
  return "AMDGPU PeepHole Optimization Pass";
}

bool
containsPointerType(Type *Ty)
{
  if (!Ty) {
    return false;
  }
  switch (Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
         ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  };
  return false;
}

bool
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)
{
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
{
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
{
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
        atomicFuncs[x].first->getNumOperands()-1,
        atomicFuncs[x].second);
  }
  mChanged = true;
  if (mConvertAtomics) {
    return;
  }
}

bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF)
{
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
      std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                   this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  if (mDebug) {
    MF.dump();
  }
  return mChanged;
}

bool
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
{
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

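  // The remaining optimizations key off the callee's name, which is the
  // call's last operand.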
  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If we do not have optimizations, then this
    // cannot be properly evaluated, so we add the
    // call instruction to a vector and process
    // them at the end of processing after the
    // samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    } else {
      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
      Type *aType = Type::getInt32Ty(*mCTX);
      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                                : ConstantInt::get(aType, 0);
      CI->replaceAllUsesWith(Val);
      ++(*bbb);
      CI->eraseFromParent();
      return true;
    }
  }

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
        F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair(CI, F));
  }

  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}

bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
                                  Instruction *&src,
                                  Constant *&mask,
                                  Constant *&shift)
{
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl nor an And, we don't fit any of the
    // patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)
{
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals
  // a single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32-bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  // TODO: Handle vectors.
  if (isVector) {
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (mDebug) {
    dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
    dbgs() << "Op: "; inst->dump();
    dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  }
  Constant *offset = NULL;
  Constant *width = NULL;
  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (int32_t)(LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (int32_t)(RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (int32_t)(LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (int32_t)(RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  if (mDebug) {
    dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
    dbgs() << (LHSShift ? " << C)" : ")") << " | ((D";
    dbgs() << (RHSMask ? " & E)" : ")");
    dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
    dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
    dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
    dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
    dbgs() << "width(B) = " << lhsMaskWidth;
    dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
    dbgs() << "offset(B) = " << lhsMaskOffset;
    dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
    dbgs() << "Constraints: \n";
    dbgs() << "\t(1) B ^ E == 0\n";
    dbgs() << "\t(2-LHS) B is a mask\n";
    dbgs() << "\t(2-RHS) E is a mask\n";
    dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
    dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
  }
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    if (mDebug) {
      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
      dbgs() << "Failed constraint 1!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "LHS = " << lhsMaskOffset << "";
    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
    dbgs() << "\nRHS = " << rhsMaskOffset << "";
    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
    dbgs() << "\n";
  }
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  }
  if (!offset || !width) {
    if (mDebug) {
      dbgs() << "Either width or offset are NULL, failed detection!\n";
    }
    return false;
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)
{
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on shift-right/and patterns. The
  // basic optimization is to turn (A >> B) & C, where A is a 32-bit type, B is
  // a value smaller than 32 and C is a mask. If C is a constant value, then
  // the following transformation can occur. For signed integers, it turns into
  // the function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned
  // integers, it turns into the function call dst =
  // __amdil_ubit_extract(log2(C), B, A). The function __amdil_[u|i]bit_extract
  // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
  // Evergreen hardware.
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // XXX Support vector types
  if (isVector) {
    return false;
  }
  int numEle = 1;
  // This only works on 32-bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If this is a shift left, then we don't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Let's make sure that the shift value and the and mask are constant
  // integers.
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case.
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shift val is greater than the bitcount, then break out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the number of original bits left,
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case.
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then any
    // bit higher is set to 0.
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask; this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the number of original bits left, then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Let's create the function signature.
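  // The call built below passes (source, shift amount, field width) to the
  // bit-extract intrinsic; the field width was derived above by counting the
  // trailing ones of the mask.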
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDIL.bit.extract.u32";
  if (isVector) {
    name += ".v" + itostr(numEle) + "i32";
  } else {
    name += ".";
  }
  // Let's create the function.
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
    shiftValConst,
    newMaskConst
  };
  // Let's create the call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type* type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
                                   APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(),
                                   APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
                           "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
                               "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type* type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant*> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
                               lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
                               newShiftConst, "bfm_sub", CI);
"bfm_sub", CI); 916 BinaryOperator *rhs = 917 BinaryOperator::Create(Instruction::And, CI->getOperand(1), 918 newMaskConst, "bfm_mask", CI); 919 lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); 920 CI->replaceAllUsesWith(lhs); 921 return true; 922 } 923 924 bool 925 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) 926 { 927 Instruction *inst = (*bbb); 928 if (optimizeCallInst(bbb)) { 929 return true; 930 } 931 if (optimizeBitExtract(inst)) { 932 return false; 933 } 934 if (optimizeBitInsert(inst)) { 935 return false; 936 } 937 if (correctMisalignedMemOp(inst)) { 938 return false; 939 } 940 return false; 941 } 942 bool 943 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) 944 { 945 LoadInst *linst = dyn_cast<LoadInst>(inst); 946 StoreInst *sinst = dyn_cast<StoreInst>(inst); 947 unsigned alignment; 948 Type* Ty = inst->getType(); 949 if (linst) { 950 alignment = linst->getAlignment(); 951 Ty = inst->getType(); 952 } else if (sinst) { 953 alignment = sinst->getAlignment(); 954 Ty = sinst->getValueOperand()->getType(); 955 } else { 956 return false; 957 } 958 unsigned size = getTypeSize(Ty); 959 if (size == alignment || size < alignment) { 960 return false; 961 } 962 if (!Ty->isStructTy()) { 963 return false; 964 } 965 if (alignment < 4) { 966 if (linst) { 967 linst->setAlignment(0); 968 return true; 969 } else if (sinst) { 970 sinst->setAlignment(0); 971 return true; 972 } 973 } 974 return false; 975 } 976 bool 977 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) 978 { 979 if (!CI) { 980 return false; 981 } 982 Value *LHS = CI->getOperand(CI->getNumOperands() - 1); 983 std::string namePrefix = LHS->getName().substr(0, 14); 984 if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" 985 && namePrefix != "__amdil__imul24_high") { 986 return false; 987 } 988 if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { 989 return false; 990 } 991 return true; 992 } 993 994 void 995 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) 996 { 997 assert(isSigned24BitOps(CI) && "Must be a " 998 "signed 24 bit operation to call this function!"); 999 Value *LHS = CI->getOperand(CI->getNumOperands()-1); 1000 // On 7XX and 8XX we do not have signed 24bit, so we need to 1001 // expand it to the following: 1002 // imul24 turns into 32bit imul 1003 // imad24 turns into 32bit imad 1004 // imul24_high turns into 32bit imulhigh 1005 if (LHS->getName().substr(0, 14) == "__amdil_imad24") { 1006 Type *aType = CI->getOperand(0)->getType(); 1007 bool isVector = aType->isVectorTy(); 1008 int numEle = isVector ? 
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
                             CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();

    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  }
}

bool
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)
{
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName()
             == "__amdil_get_local_size_int");
}

bool
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
         == "__amdil_improved_div";
}

void
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)
{
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
                           CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}

bool
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
{
  if (optLevel != CodeGenOpt::None) {
    return false;
  }

  if (!CI) {
    return false;
  }

  unsigned funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
      && calleeName != "__amdil_image2d_read_unnorm"
"__amdil_image2d_read_unnorm" 1115 && calleeName != "__amdil_image3d_read_norm" 1116 && calleeName != "__amdil_image3d_read_unnorm") { 1117 return false; 1118 } 1119 1120 unsigned samplerIdx = 2; 1121 samplerIdx = 1; 1122 Value *sampler = CI->getOperand(samplerIdx); 1123 LoadInst *lInst = dyn_cast<LoadInst>(sampler); 1124 if (!lInst) { 1125 return false; 1126 } 1127 1128 if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 1129 return false; 1130 } 1131 1132 GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); 1133 // If we are loading from what is not a global value, then we 1134 // fail and return. 1135 if (!gv) { 1136 return false; 1137 } 1138 1139 // If we don't have an initializer or we have an initializer and 1140 // the initializer is not a 32bit integer, we fail. 1141 if (!gv->hasInitializer() 1142 || !gv->getInitializer()->getType()->isIntegerTy(32)) { 1143 return false; 1144 } 1145 1146 // Now that we have the global variable initializer, lets replace 1147 // all uses of the load instruction with the samplerVal and 1148 // reparse the __amdil_is_constant() function. 1149 Constant *samplerVal = gv->getInitializer(); 1150 lInst->replaceAllUsesWith(samplerVal); 1151 return true; 1152 } 1153 1154 bool 1155 AMDGPUPeepholeOpt::doInitialization(Module &M) 1156 { 1157 return false; 1158 } 1159 1160 bool 1161 AMDGPUPeepholeOpt::doFinalization(Module &M) 1162 { 1163 return false; 1164 } 1165 1166 void 1167 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const 1168 { 1169 AU.addRequired<MachineFunctionAnalysis>(); 1170 FunctionPass::getAnalysisUsage(AU); 1171 AU.setPreservesAll(); 1172 } 1173 1174 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { 1175 size_t size = 0; 1176 if (!T) { 1177 return size; 1178 } 1179 switch (T->getTypeID()) { 1180 case Type::X86_FP80TyID: 1181 case Type::FP128TyID: 1182 case Type::PPC_FP128TyID: 1183 case Type::LabelTyID: 1184 assert(0 && "These types are not supported by this backend"); 1185 default: 1186 case Type::FloatTyID: 1187 case Type::DoubleTyID: 1188 size = T->getPrimitiveSizeInBits() >> 3; 1189 break; 1190 case Type::PointerTyID: 1191 size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); 1192 break; 1193 case Type::IntegerTyID: 1194 size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); 1195 break; 1196 case Type::StructTyID: 1197 size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); 1198 break; 1199 case Type::ArrayTyID: 1200 size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); 1201 break; 1202 case Type::FunctionTyID: 1203 size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); 1204 break; 1205 case Type::VectorTyID: 1206 size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); 1207 break; 1208 }; 1209 return size; 1210 } 1211 1212 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, 1213 bool dereferencePtr) { 1214 size_t size = 0; 1215 if (!ST) { 1216 return size; 1217 } 1218 Type *curType; 1219 StructType::element_iterator eib; 1220 StructType::element_iterator eie; 1221 for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { 1222 curType = *eib; 1223 size += getTypeSize(curType, dereferencePtr); 1224 } 1225 return size; 1226 } 1227 1228 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, 1229 bool dereferencePtr) { 1230 return IT ? 

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
                                      bool dereferencePtr) {
  assert(0 && "Should not be able to calculate the size of a function type");
  return 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
                                      bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())
                     : 0);
}

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
                                      bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
                                      bool dereferencePtr) {
  if (!PT) {
    return 0;
  }
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    size_t size = 0;
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
    }
    return size;
  } else {
    return 4;
  }
}

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
                                      bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  return 4;
}