/*
 * Copyright 2012, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "Assert.h"
#include "Log.h"
#include "RSTransforms.h"
#include "RSUtils.h"

#include "bcc/Config.h"
#include "bcinfo/MetadataExtractor.h"

#include "slang_version.h"

#include <algorithm>
#include <cstdlib>
#include <functional>
#include <unordered_set>
#include <vector>

#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/MDBuilder.h>
#include <llvm/IR/Module.h>
#include <llvm/Pass.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/Type.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>

#ifndef __DISABLE_ASSERTS
// Only used in bccAssert()
const int kNumExpandedForeachParams = 4;
const int kNumExpandedReduceAccumulatorParams = 4;
#endif

const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";

using namespace bcc;

namespace {

static const bool gEnableRsTbaa = true;

/* RSKernelExpandPass
 *
 * This pass generates functions used to implement calls via
 * rsForEach(), "foreach_<NAME>", or "reduce_<NAME>". We create an
 * inner loop for the function to be invoked over the appropriate data
 * cells of the input/output allocations (adjusting other relevant
 * parameters as we go). We support doing this for any forEach or
 * reduce style compute kernels.
 *
 * In the case of a foreach kernel or a simple reduction kernel, the
 * new function name is the original function name "<NAME>" followed
 * by ".expand" -- "<NAME>.expand".
 *
 * In the case of a general reduction kernel, the kernel's accumulator
 * function is the one transformed, and the new function name is the
 * original accumulator function name "<ACCUMFN>" followed by
 * ".expand" -- "<ACCUMFN>.expand".  Using the name "<ACCUMFN>.expand"
 * for the function generated from the accumulator should not
 * introduce any possibility for name clashes today: The accumulator
 * function <ACCUMFN> must be static, so it cannot also serve as a
 * foreach kernel; and the code for <ACCUMFN>.expand depends only on
 * <ACCUMFN>, not on any other properties of the reduction kernel, so
 * any reduction kernels that share the accumulator <ACCUMFN> can
 * share <ACCUMFN>.expand also.
 *
 * Note that this pass does not delete the original function <NAME> or
 * <ACCUMFN>.  However, if it is inlined into the newly-generated
 * function and not otherwise referenced, then a subsequent pass may
 * delete it.
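 *
 * As a rough sketch (not the literal IR this pass emits), expanding a
 * hypothetical foreach kernel
 *
 *   int RS_KERNEL invert(int v);
 *
 * produces a function along the lines of
 *
 *   void invert.expand(RsExpandKernelDriverInfoPfx *p,
 *                      uint32_t x1, uint32_t x2, uint32_t outstep) {
 *     for (uint32_t x = x1; x < x2; ++x) {
 *       int *out = (int *)p->outPtr[0] + (x - x1);
 *       int *in  = (int *)p->inPtr[0]  + (x - x1);
 *       *out = invert(*in);
 *     }
 *   }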
 */
class RSKernelExpandPass : public llvm::ModulePass {
public:
  static char ID;

private:
  static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h

  typedef std::unordered_set<llvm::Function *> FunctionSet;

  enum RsLaunchDimensionsField {
    RsLaunchDimensionsFieldX,
    RsLaunchDimensionsFieldY,
    RsLaunchDimensionsFieldZ,
    RsLaunchDimensionsFieldLod,
    RsLaunchDimensionsFieldFace,
    RsLaunchDimensionsFieldArray,

    RsLaunchDimensionsFieldCount
  };

  enum RsExpandKernelDriverInfoPfxField {
    RsExpandKernelDriverInfoPfxFieldInPtr,
    RsExpandKernelDriverInfoPfxFieldInStride,
    RsExpandKernelDriverInfoPfxFieldInLen,
    RsExpandKernelDriverInfoPfxFieldOutPtr,
    RsExpandKernelDriverInfoPfxFieldOutStride,
    RsExpandKernelDriverInfoPfxFieldOutLen,
    RsExpandKernelDriverInfoPfxFieldDim,
    RsExpandKernelDriverInfoPfxFieldCurrent,
    RsExpandKernelDriverInfoPfxFieldUsr,
    RsExpandKernelDriverInfoPfxFieldUsrLen,

    RsExpandKernelDriverInfoPfxFieldCount
  };

  llvm::Module *Module;
  llvm::LLVMContext *Context;

  /*
   * Pointers to LLVM type information for the function signatures
   * of expanded functions.  These must be re-calculated for each
   * module the pass is run on.
   */
  llvm::FunctionType *ExpandedForEachType;
  llvm::Type *RsExpandKernelDriverInfoPfxTy;

  // Initialized when we begin to process each Module
  bool mStructExplicitlyPaddedBySlang;
  uint32_t mExportForEachCount;
  const char **mExportForEachNameList;
  const uint32_t *mExportForEachSignatureList;

  // Turns on optimization of allocation stride values.
  bool mEnableStepOpt;

  uint32_t getRootSignature(llvm::Function *Function) {
    const llvm::NamedMDNode *ExportForEachMetadata =
        Module->getNamedMetadata("#rs_export_foreach");

    if (!ExportForEachMetadata) {
      llvm::SmallVector<llvm::Type*, 8> RootArgTys;
      for (llvm::Function::arg_iterator B = Function->arg_begin(),
                                        E = Function->arg_end();
           B != E;
           ++B) {
        RootArgTys.push_back(B->getType());
      }

      // For pre-ICS bitcode, we may not have signature information.  In that
      // case, we use the size of the RootArgTys to select the number of
      // arguments.
      return (1 << RootArgTys.size()) - 1;
    }

    if (ExportForEachMetadata->getNumOperands() == 0) {
      return 0;
    }

    bccAssert(ExportForEachMetadata->getNumOperands() > 0);

    // We only handle the case for legacy root() functions here, so this is
    // hard-coded to look at only the first such function.
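    // Each operand of "#rs_export_foreach" is expected to look like
    // !{!"<decimal signature>"}; we parse the string of the first operand.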
    llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
    if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
      llvm::Metadata *SigMD = SigNode->getOperand(0);
      if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
        llvm::StringRef SigString = SigS->getString();
        uint32_t Signature = 0;
        if (SigString.getAsInteger(10, Signature)) {
          ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
          return 0;
        }
        return Signature;
      }
    }

    return 0;
  }

  bool isStepOptSupported(llvm::Type *AllocType) {

    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);

    if (!mEnableStepOpt) {
      return false;
    }

    if (AllocType == VoidPtrTy) {
      return false;
    }

    if (!PT) {
      return false;
    }

    // remaining conditions are 64-bit only
    if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
      return true;
    }

    // "coerce" suggests an upconverted struct type, which we can't support
    if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
      return false;
    }

    // 2xi64 and i128 suggest an upconverted struct type, which we also can't support
    llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
    llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
    if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
      return false;
    }

    return true;
  }

  // Get the actual value we should use to step through an allocation.
  //
  // Normally the value we use to step through an allocation is given to us by
  // the driver. However, for certain primitive data types, we can derive an
  // integer constant for the step value. We use this integer constant whenever
  // possible to allow further compiler optimizations to take place.
  //
  // DL - Target Data size/layout information.
  // AllocType - Type of allocation (should be a pointer).
  // OrigStep - Original step increment (root.expand() input from driver).
  llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
                            llvm::Value *OrigStep) {
    bccAssert(DL);
    bccAssert(AllocType);
    bccAssert(OrigStep);
    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
    if (isStepOptSupported(AllocType)) {
      llvm::Type *ET = PT->getElementType();
      uint64_t ETSize = DL->getTypeAllocSize(ET);
      llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
      return llvm::ConstantInt::get(Int32Ty, ETSize);
    } else {
      return OrigStep;
    }
  }

  /// Builds the types required by the pass for the given context.
  void buildTypes(void) {
    // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.

    llvm::Type *Int8Ty = llvm::Type::getInt8Ty(*Context);
    llvm::Type *Int8PtrTy = Int8Ty->getPointerTo();
    llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
    llvm::Type *Int32ArrayInputLimitTy = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
    llvm::Type *Int32Array4Ty = llvm::ArrayType::get(Int32Ty, 4);

    /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
     *
     * struct RsLaunchDimensions {
     *   uint32_t x;
     *   uint32_t y;
     *   uint32_t z;
     *   uint32_t lod;
     *   uint32_t face;
     *   uint32_t array[4];
     * };
     */
    llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t x
    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t y
    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t z
    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t lod
    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t face
    RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
    llvm::StructType *RsLaunchDimensionsTy =
        llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");

    /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
     *
     * struct RsExpandKernelDriverInfoPfx {
     *   const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
     *   uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
     *   uint32_t inLen;
     *
     *   uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
     *   uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
     *   uint32_t outLen;
     *
     *   // Dimension of the launch
     *   RsLaunchDimensions dim;
     *
     *   // The walking iterator of the launch
     *   RsLaunchDimensions current;
     *
     *   const void *usr;
     *   uint32_t usrLen;
     *
     *   // Items below this line are not used by the compiler and can be changed in the driver.
     *   // So the compiler must assume there are an unknown number of fields of unknown type
     *   // beginning here.
     * };
     *
     * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
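     *
     * Note: the field order must match the RsExpandKernelDriverInfoPfxField
     * enum defined earlier in this class, since those enum values are used
     * as GEP indices into this struct type.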
     */
    llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
    RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
    RsExpandKernelDriverInfoPfxTy =
        llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");

    // Create the function type for expanded kernels.
    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);

    llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
    // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
    ExpandedForEachType =
        llvm::FunctionType::get(VoidTy,
                                {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty},
                                false);
  }

  /// @brief Create skeleton of the expanded foreach kernel.
  ///
  /// This creates a function with the following signature:
  ///
  ///   void (const RsExpandKernelDriverInfoPfx *p, uint32_t x1, uint32_t x2,
  ///         uint32_t outstep)
  ///
  llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
    llvm::Function *ExpandedFunction =
        llvm::Function::Create(ExpandedForEachType,
                               llvm::GlobalValue::ExternalLinkage,
                               OldName + ".expand", Module);
    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
    (AI++)->setName("p");
    (AI++)->setName("x1");
    (AI++)->setName("x2");
    (AI++)->setName("arg_outstep");
    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
                                                       ExpandedFunction);
    llvm::IRBuilder<> Builder(Begin);
    Builder.CreateRetVoid();
    return ExpandedFunction;
  }

  // Create skeleton of a general reduce kernel's expanded accumulator.
  //
  // This creates a function with the following signature:
  //
  //   void @func.expand(%RsExpandKernelDriverInfoPfx* nocapture %p,
  //                     i32 %x1, i32 %x2, accumType* nocapture %accum)
  //
  llvm::Function *createEmptyExpandedReduceAccumulator(llvm::StringRef OldName,
                                                       llvm::Type *AccumArgTy) {
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
    llvm::FunctionType *ExpandedReduceAccumulatorType =
        llvm::FunctionType::get(VoidTy,
                                {RsExpandKernelDriverInfoPfxTy->getPointerTo(),
                                 Int32Ty, Int32Ty, AccumArgTy}, false);
    llvm::Function *FnExpandedAccumulator =
        llvm::Function::Create(ExpandedReduceAccumulatorType,
                               llvm::GlobalValue::ExternalLinkage,
                               OldName + ".expand", Module);
    bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams);

    llvm::Function::arg_iterator AI = FnExpandedAccumulator->arg_begin();

    using llvm::Attribute;

    llvm::Argument *Arg_p = &(*AI++);
    Arg_p->setName("p");
    Arg_p->addAttr(llvm::AttributeSet::get(*Context, Arg_p->getArgNo() + 1,
                                           llvm::makeArrayRef(Attribute::NoCapture)));

    llvm::Argument *Arg_x1 = &(*AI++);
    Arg_x1->setName("x1");

    llvm::Argument *Arg_x2 = &(*AI++);
    Arg_x2->setName("x2");

    llvm::Argument *Arg_accum = &(*AI++);
    Arg_accum->setName("accum");
    Arg_accum->addAttr(llvm::AttributeSet::get(*Context, Arg_accum->getArgNo() + 1,
                                               llvm::makeArrayRef(Attribute::NoCapture)));

    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
                                                       FnExpandedAccumulator);
    llvm::IRBuilder<> Builder(Begin);
    Builder.CreateRetVoid();

    return FnExpandedAccumulator;
  }

  /// @brief Create an empty loop
  ///
  /// Create a loop of the form:
  ///
  ///   for (i = LowerBound; i < UpperBound; i++)
  ///     ;
  ///
  /// After the loop has been created, the builder is set such that
  /// instructions can be added to the loop body.
  ///
  /// @param Builder The builder to use to build this loop. The current
  ///                position of the builder is the position the loop
  ///                will be inserted.
  /// @param LowerBound The first value of the loop iterator
  /// @param UpperBound The maximal value of the loop iterator
  /// @param LoopIV A reference that will be set to the loop iterator.
  /// @return The BasicBlock that will be executed after the loop.
  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
                               llvm::Value *LowerBound,
                               llvm::Value *UpperBound,
                               llvm::Value **LoopIV) {
    bccAssert(LowerBound->getType() == UpperBound->getType());

    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
    llvm::Value *Cond, *IVNext, *IV, *IVVar;

    CondBB = Builder.GetInsertBlock();
    AfterBB = llvm::SplitBlock(CondBB, &*Builder.GetInsertPoint(), nullptr, nullptr);
    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());

    CondBB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(CondBB);

    // decltype(LowerBound) *ivvar = alloca(sizeof(int))
    // *ivvar = LowerBound
    IVVar = Builder.CreateAlloca(LowerBound->getType(), nullptr, BCC_INDEX_VAR_NAME);
    Builder.CreateStore(LowerBound, IVVar);

    // if (LowerBound < UpperBound)
    //   goto LoopHeader
    // else
    //   goto AfterBB
    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);

    // LoopHeader:
    //   iv = *ivvar
    //   <insertion point here>
    //   iv.next = iv + 1
    //   *ivvar = iv.next
    //   if (iv.next < UpperBound)
    //     goto LoopHeader
    //   else
    //     goto AfterBB
    // AfterBB:
    Builder.SetInsertPoint(HeaderBB);
    IV = Builder.CreateLoad(IVVar, "X");
    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
    Builder.CreateStore(IVNext, IVVar);
    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
    AfterBB->setName("Exit");
    Builder.SetInsertPoint(llvm::cast<llvm::Instruction>(IVNext));

    // Record information about this loop.
    *LoopIV = IV;
    return AfterBB;
  }

  // Finish building the outgoing argument list for calling a ForEach-able function.
  //
  // ArgVector - on input, the non-special arguments
  //             on output, the non-special arguments combined with the special arguments
  //               from SpecialArgVector
  // SpecialArgVector - special arguments (from ExpandSpecialArguments())
  // SpecialArgContextIdx - return value of ExpandSpecialArguments()
  //                        (position of context argument in SpecialArgVector)
  // CalleeFunction - the ForEach-able function being called
  // Builder - for inserting code into the caller function
  template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
  void finishArgList(      llvm::SmallVector<llvm::Value *, ArgVectorLen> &ArgVector,
                     const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
                     const int SpecialArgContextIdx,
                     const llvm::Function &CalleeFunction,
                     llvm::IRBuilder<> &CallerBuilder) {
    /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
     * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
     * two types represent the same thing).  Therefore, we must introduce a pointer cast when
     * generating a call to the kernel function.
     */
    const int ArgContextIdx =
        SpecialArgContextIdx >= 0 ?
            (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
    ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
    if (ArgContextIdx >= 0) {
      llvm::Type *ContextArgType = nullptr;
      int ArgIdx = ArgContextIdx;
      for (const auto &Arg : CalleeFunction.getArgumentList()) {
        if (!ArgIdx--) {
          ContextArgType = Arg.getType();
          break;
        }
      }
      bccAssert(ContextArgType);
      ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
    }
  }

  // GEPHelper() returns a SmallVector of values suitable for passing
  // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for
  // the returned data type.  It is sized so that the SmallVector
  // returned by GEPHelper() never needs to do a heap allocation for
  // any list of GEP indices it encounters in the code.
  typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices;

  // Helper for turning a list of constant integer GEP indices into a
  // SmallVector of llvm::Value*.  The return value is suitable for
  // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP().
  //
  // Inputs:
  //   I32Args should be integers which represent the index arguments
  //   to a GEP instruction.
  //
  // Returns:
  //   Returns a SmallVector of ConstantInts.
  SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
    SmallGEPIndices Out(I32Args.size());
    llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
    std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
                   [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); });
    return Out;
  }

public:
  explicit RSKernelExpandPass(bool pEnableStepOpt = true)
      : ModulePass(ID), Module(nullptr), Context(nullptr),
        mEnableStepOpt(pEnableStepOpt) {

  }

  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    // This pass does not use any other analysis passes, but it does
    // add/wrap the existing functions in the module (thus altering the CFG).
  }

  // Build contribution to outgoing argument list for calling a
  // ForEach-able function or a general reduction accumulator
  // function, based on the special parameters of that function.
  //
  // Signature - metadata bits for the signature of the callee
  // X, Arg_p - values derived directly from expanded function,
  //            suitable for computing arguments for the callee
  // CalleeArgs - contribution is accumulated here
  // Bump - invoked once for each contributed outgoing argument
  // LoopHeaderInsertionPoint - an Instruction in the loop header, before which
  //                            this function can insert loop-invariant loads
  //
  // Return value is the (zero-based) position of the context (Arg_p)
  // argument in the CalleeArgs vector, or a negative value if the
  // context argument is not placed in the CalleeArgs vector.
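  //
  // For example, a callee whose signature bits include Ctxt, X, and Y would
  // receive [Arg_p, X, <load of p->current.y>] from here, with a return
  // value of 0 (the index of Arg_p in CalleeArgs).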
  int ExpandSpecialArguments(uint32_t Signature,
                             llvm::Value *X,
                             llvm::Value *Arg_p,
                             llvm::IRBuilder<> &Builder,
                             llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
                             const std::function<void ()> &Bump,
                             llvm::Instruction *LoopHeaderInsertionPoint) {

    bccAssert(CalleeArgs.empty());

    int Return = -1;
    if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
      CalleeArgs.push_back(Arg_p);
      Bump();
      Return = CalleeArgs.size() - 1;
    }

    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
      CalleeArgs.push_back(X);
      Bump();
    }

    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
        bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
      bccAssert(LoopHeaderInsertionPoint);

      // Y and Z are loop invariant, so they can be hoisted out of the
      // loop.  Set the IRBuilder insertion point to the loop header.
      auto OldInsertionPoint = Builder.saveIP();
      Builder.SetInsertPoint(LoopHeaderInsertionPoint);

      if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
        SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
                                             RsLaunchDimensionsFieldY}));
        llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep");
        CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y"));
        Bump();
      }

      if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
        SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
                                             RsLaunchDimensionsFieldZ}));
        llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep");
        CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z"));
        Bump();
      }

      Builder.restoreIP(OldInsertionPoint);
    }

    return Return;
  }

  // Generate loop-invariant input processing setup code for an expanded
  // ForEach-able function or an expanded general reduction accumulator
  // function.
  //
  // LoopHeader - block at the end of which the setup code will be inserted
  // Arg_p - RSKernelDriverInfo pointer passed to the expanded function
  // TBAAPointer - metadata for marking loads of pointer values out of RSKernelDriverInfo
  // ArgIter - iterator pointing to first input of the UNexpanded function
  // NumInputs - number of inputs (NOT number of ARGUMENTS)
  //
  // InTypes[] - this function saves the input types; they will be used in ExpandInputsBody()
  // InBufPtrs[] - this function sets each array element to point to the first cell / byte
  //               (byte for x86, cell for other platforms) of the corresponding input allocation
  // InStructTempSlots[] - this function sets each array element either to nullptr
  //                       or to the result of an alloca (for the case where the
  //                       calling convention dictates that a value must be passed
  //                       by reference, and so we need a stacked temporary to hold
  //                       a copy of that value)
  void ExpandInputsLoopInvariant(llvm::IRBuilder<> &Builder, llvm::BasicBlock *LoopHeader,
                                 llvm::Value *Arg_p,
                                 llvm::MDNode *TBAAPointer,
                                 llvm::Function::arg_iterator ArgIter,
                                 const size_t NumInputs,
                                 llvm::SmallVectorImpl<llvm::Type *> &InTypes,
                                 llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
                                 llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) {
    bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);

    // Extract information about input slots.  The work done
    // here is loop-invariant, so we can hoist the operations out of the loop.
    auto OldInsertionPoint = Builder.saveIP();
    Builder.SetInsertPoint(LoopHeader->getTerminator());

    for (size_t InputIndex = 0; InputIndex < NumInputs; ++InputIndex, ArgIter++) {
      llvm::Type *InType = ArgIter->getType();

      /*
       * AArch64 calling conventions dictate that structs of sufficient size
       * get passed by pointer instead of passed by value.  This, combined
       * with the fact that we don't allow kernels to operate on pointer
       * data, means that if we see a kernel with a pointer parameter we know
       * that it is a struct input that has been promoted.  As such we don't
       * need to convert its type to a pointer.  Later we will need to know
       * to create a temporary copy on the stack, so we save this information
       * in InStructTempSlots.
       */
      if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
        llvm::Type *ElementType = PtrType->getElementType();
        InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
                                                         "input_struct_slot"));
      } else {
        InType = InType->getPointerTo();
        InStructTempSlots.push_back(nullptr);
      }

      SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
                                             static_cast<int32_t>(InputIndex)}));
      llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
      llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");

      llvm::Value *CastInBufPtr = nullptr;
      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
        CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
      } else {
        // The disagreement between module and x86 target machine datalayout
        // causes mismatched input/output data offset between slang reflected
        // code and bcc codegen for GetElementPtr.  To solve this issue, skip the
        // cast to InType and leave CastInBufPtr as an int8_t*.  The buffer is
        // later indexed with an explicit byte offset computed based on
        // X86_CUSTOM_DL_STRING and then bitcast to the actual input type.
        CastInBufPtr = InBufPtr;
      }

      if (gEnableRsTbaa) {
        InBufPtr->setMetadata("tbaa", TBAAPointer);
      }

      InTypes.push_back(InType);
      InBufPtrs.push_back(CastInBufPtr);
    }

    Builder.restoreIP(OldInsertionPoint);
  }

  // Generate loop-varying input processing code for an expanded ForEach-able function
  // or an expanded general reduction accumulator function.  Also, for the call to the
  // UNexpanded function, collect the portion of the argument list corresponding to the
  // inputs.
  //
  // Arg_x1 - first X coordinate to be processed by the expanded function
  // TBAAAllocation - metadata for marking loads of input values out of allocations
  // NumInputs - number of inputs (NOT number of ARGUMENTS)
  // InTypes[] - this function uses the input types saved by ExpandInputsLoopInvariant()
  //             to convert the byte pointer InPtr to its real type
  // InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant()
  // InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant()
  // IndVar - value of loop induction variable (X coordinate) for a given loop iteration
  //
  // RootArgs - this function sets this to the list of outgoing argument values corresponding
  //            to the inputs
  void ExpandInputsBody(llvm::IRBuilder<> &Builder,
                        llvm::Value *Arg_x1,
                        llvm::MDNode *TBAAAllocation,
                        const size_t NumInputs,
                        const llvm::SmallVectorImpl<llvm::Type *> &InTypes,
                        const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
                        const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots,
                        llvm::Value *IndVar,
                        llvm::SmallVectorImpl<llvm::Value *> &RootArgs) {
    llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1);
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);

    for (size_t Index = 0; Index < NumInputs; ++Index) {

      llvm::Value *InPtr = nullptr;
      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
        InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
      } else {
        // Treat x86 input buffer as byte[], get indexed pointer with explicit
        // byte offset computed using a datalayout based on
        // X86_CUSTOM_DL_STRING, then bitcast it to the actual input type.
        llvm::DataLayout DL(X86_CUSTOM_DL_STRING);
        llvm::Type *InTy = InTypes[Index];
        uint64_t InStep = DL.getTypeAllocSize(InTy->getPointerElementType());
        llvm::Value *OffsetInBytes = Builder.CreateMul(Offset, llvm::ConstantInt::get(Int32Ty, InStep));
        InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], OffsetInBytes);
        InPtr = Builder.CreatePointerCast(InPtr, InTy);
      }

      llvm::Value *Input;
      llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");

      if (gEnableRsTbaa) {
        InputLoad->setMetadata("tbaa", TBAAAllocation);
      }

      if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
        // Pass a pointer to a temporary on the stack, rather than
        // passing a pointer to the original value.  We do not want
        // the kernel to potentially modify the input data.
        //
        // Note: don't annotate with TBAA, since the kernel might
        // have its own TBAA annotations for the pointer argument.
        Builder.CreateStore(InputLoad, TemporarySlot);
        Input = TemporarySlot;
      } else {
        Input = InputLoad;
      }

      RootArgs.push_back(Input);
    }
  }

  /* Performs the actual optimization on a selected function. On success, the
   * Module will contain a new function of the name "<NAME>.expand" that
   * invokes <NAME>() in a loop with the appropriate parameters.
   */
  bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
    ALOGV("Expanding ForEach-able Function %s",
          Function->getName().str().c_str());

    if (!Signature) {
      Signature = getRootSignature(Function);
      if (!Signature) {
        // We couldn't determine how to expand this function based on its
        // function signature.
        return false;
      }
    }

    llvm::DataLayout DL(Module);
    if (!mStructExplicitlyPaddedBySlang && (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING)) {
      DL.reset(X86_CUSTOM_DL_STRING);
    }

    llvm::Function *ExpandedFunction =
      createEmptyExpandedForEachKernel(Function->getName());

    /*
     * Extract the expanded function's parameters.
     * It is guaranteed by createEmptyExpandedForEachKernel that there will
     * be four parameters.
     */

    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

    llvm::Function::arg_iterator ExpandedFunctionArgIter =
      ExpandedFunction->arg_begin();

    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);

    llvm::Value *InStep  = nullptr;
    llvm::Value *OutStep = nullptr;

    // Construct the actual function body.
    llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());

    // Collect and construct the arguments for the kernel().
    // Note that we load any loop-invariant arguments before entering the Loop.
    llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();

    llvm::Type  *InTy     = nullptr;
    llvm::Value *InBufPtr = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
      SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
      llvm::LoadInst *InStepArg = Builder.CreateLoad(
        Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");

      InTy = (FunctionArgIter++)->getType();
      InStep = getStepValue(&DL, InTy, InStepArg);

      InStep->setName("instep");

      SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
      InBufPtr = Builder.CreateLoad(
        Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
    }

    llvm::Type *OutTy = nullptr;
    llvm::Value *OutBasePtr = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
      OutTy = (FunctionArgIter++)->getType();
      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
      OutStep->setName("outstep");
      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
    }

    llvm::Value *UsrData = nullptr;
    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
      llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
      llvm::Value *UsrDataPointerAddr =
        Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
      UsrData->setName("UsrData");
    }

    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
    llvm::Value *IV;
    createLoop(Builder, Arg_x1, Arg_x2, &IV);

    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
    const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                                                            [&FunctionArgIter]() { FunctionArgIter++; },
                                                            LoopHeader->getTerminator());

    bccAssert(FunctionArgIter == Function->arg_end());

    // Populate the actual call to kernel().
    llvm::SmallVector<llvm::Value*, 8> RootArgs;

    llvm::Value *InPtr  = nullptr;
    llvm::Value *OutPtr = nullptr;

    // Calculate the current input and output pointers
    //
    // We always calculate the input/output pointers with a GEP operating on i8
    // values and only cast at the very end to OutTy.  This is because the step
    // between two values is given in bytes.
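    //
    // Concretely, for loop induction value IV the pointers computed below are
    //   OutPtr = (OutTy)((i8 *)out_buf   + (IV - x1) * outstep)
    //   InPtr  = (InTy) ((i8 *)input_buf + (IV - x1) * instep)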
    //
    // TODO: We could further optimize the output by using a GEP operation of
    // type 'OutTy' in cases where the element type of the allocation allows.
    if (OutBasePtr) {
      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
      OutOffset = Builder.CreateMul(OutOffset, OutStep);
      OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
    }

    if (InBufPtr) {
      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
      InOffset = Builder.CreateMul(InOffset, InStep);
      InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
      InPtr = Builder.CreatePointerCast(InPtr, InTy);
    }

    if (InPtr) {
      RootArgs.push_back(InPtr);
    }

    if (OutPtr) {
      RootArgs.push_back(OutPtr);
    }

    if (UsrData) {
      RootArgs.push_back(UsrData);
    }

    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

    Builder.CreateCall(Function, RootArgs);

    return true;
  }

  /* Expand a pass-by-value foreach kernel.
   */
  bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
    ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());

    // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
    llvm::DataLayout DL(Module);
    if (!mStructExplicitlyPaddedBySlang && (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING)) {
      DL.reset(X86_CUSTOM_DL_STRING);
    }
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);

    llvm::Function *ExpandedFunction =
      createEmptyExpandedForEachKernel(Function->getName());

    /*
     * Extract the expanded function's parameters.  It is guaranteed by
     * createEmptyExpandedForEachKernel that there will be four parameters.
     */

    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

    llvm::Function::arg_iterator ExpandedFunctionArgIter =
      ExpandedFunction->arg_begin();

    llvm::Value *Arg_p  = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
    llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
    // Arg_outstep is not used by expanded new-style forEach kernels.

    // Construct the actual function body.
    llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());

    // Create TBAA meta-data.
    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
                 *TBAAAllocation, *TBAAPointer;
    llvm::MDBuilder MDHelper(*Context);

    TBAARenderScriptDistinct =
      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
                                               TBAARenderScriptDistinct);
    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                       TBAARenderScript);
    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
                                                      TBAAAllocation, 0);
    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
                                                    TBAARenderScript);
    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);

    /*
     * Collect and construct the arguments for the kernel().
     *
     * Note that we load any loop-invariant arguments before entering the Loop.
     */
    size_t NumRemainingInputs = Function->arg_size();

    // No usrData parameter on kernels.
    bccAssert(
        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));

    llvm::Function::arg_iterator ArgIter = Function->arg_begin();

    // Check the return type
    llvm::Type     *OutTy            = nullptr;
    llvm::LoadInst *OutBasePtr       = nullptr;
    llvm::Value    *CastedOutBasePtr = nullptr;

    bool PassOutByPointer = false;

    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
      llvm::Type *OutBaseTy = Function->getReturnType();

      if (OutBaseTy->isVoidTy()) {
        PassOutByPointer = true;
        OutTy = ArgIter->getType();

        ArgIter++;
        --NumRemainingInputs;
      } else {
        // We don't increment Args, since we are using the actual return type.
        OutTy = OutBaseTy->getPointerTo();
      }

      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));

      if (gEnableRsTbaa) {
        OutBasePtr->setMetadata("tbaa", TBAAPointer);
      }

      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
        CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
      } else {
        // The disagreement between module and x86 target machine datalayout
        // causes mismatched input/output data offset between slang reflected
        // code and bcc codegen for GetElementPtr.  To solve this issue, skip the
        // cast to OutTy and leave CastedOutBasePtr as an int8_t*.  The buffer
        // is later indexed with an explicit byte offset computed based on
        // X86_CUSTOM_DL_STRING and then bitcast to the actual output type.
        CastedOutBasePtr = OutBasePtr;
      }
    }

    llvm::SmallVector<llvm::Type*, 8>  InTypes;
    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;

    bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);

    // Create the loop structure.
    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
    llvm::Value *IV;
    createLoop(Builder, Arg_x1, Arg_x2, &IV);

    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
    const int CalleeArgsContextIdx =
      ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                             [&NumRemainingInputs]() { --NumRemainingInputs; },
                             LoopHeader->getTerminator());

    // After ExpandSpecialArguments() gets called, NumRemainingInputs
    // counts the number of arguments to the kernel that correspond to
    // an array entry from the InPtr field of the DriverInfo
    // structure.
    const size_t NumInPtrArguments = NumRemainingInputs;

    if (NumInPtrArguments > 0) {
      ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments,
                                InTypes, InBufPtrs, InStructTempSlots);
    }

    // Populate the actual call to kernel().
    llvm::SmallVector<llvm::Value*, 8> RootArgs;

    // Calculate the current input and output pointers.
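    //
    // For iteration IV, the output cell is CastedOutBasePtr[IV - x1]; on x86
    // without slang-applied explicit struct padding, an explicit byte offset
    // of (IV - x1) * sizeof(output element) is applied to a byte pointer
    // instead, and the result is then cast to OutTy.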

    // Output

    llvm::Value *OutPtr = nullptr;
    if (CastedOutBasePtr) {
      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);

      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
        OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
      } else {
        // Treat x86 output buffer as byte[], get indexed pointer with explicit
        // byte offset computed using a datalayout based on
        // X86_CUSTOM_DL_STRING, then bitcast it to the actual output type.
        uint64_t OutStep = DL.getTypeAllocSize(OutTy->getPointerElementType());
        llvm::Value *OutOffsetInBytes = Builder.CreateMul(OutOffset, llvm::ConstantInt::get(Int32Ty, OutStep));
        OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffsetInBytes);
        OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
      }

      if (PassOutByPointer) {
        RootArgs.push_back(OutPtr);
      }
    }

    // Inputs

    if (NumInPtrArguments > 0) {
      ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments,
                       InTypes, InBufPtrs, InStructTempSlots, IV, RootArgs);
    }

    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

    llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);

    if (OutPtr && !PassOutByPointer) {
      RetVal->setName("call.result");
      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
      if (gEnableRsTbaa) {
        Store->setMetadata("tbaa", TBAAAllocation);
      }
    }

    return true;
  }

  // Certain categories of functions that make up a general
  // reduce-style kernel are called directly from the driver with no
  // expansion needed.  For a function in such a category, we need to
  // promote linkage from static to external, to ensure that the
  // function is visible to the driver in the dynamic symbol table.
  // This promotion is safe because we don't have any kind of cross
  // translation unit linkage model (except for linking against
  // RenderScript libraries), so we do not risk name clashes.
  bool PromoteReduceFunction(const char *Name, FunctionSet &PromotedFunctions) {
    if (!Name)  // a presumably-optional function that is not present
      return false;

    llvm::Function *Fn = Module->getFunction(Name);
    bccAssert(Fn != nullptr);
    if (PromotedFunctions.insert(Fn).second) {
      bccAssert(Fn->getLinkage() == llvm::GlobalValue::InternalLinkage);
      Fn->setLinkage(llvm::GlobalValue::ExternalLinkage);
      return true;
    }

    return false;
  }

  // Expand the accumulator function for a general reduce-style kernel.
  //
  // The input is a function of the form
  //
  //   define void @func(accumType* %accum, foo1 in1[, ... fooN inN] [, special arguments])
  //
  // where all arguments except the first are the same as for a foreach kernel.
  //
  // The input accumulator function gets expanded into a function of the form
  //
  //   define void @func.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, accumType* %accum)
  //
  // which performs a serial accumulation of elements [x1, x2) into *%accum.
  //
  // In pseudocode, @func.expand does:
  //
  //   for (i = %x1; i < %x2; ++i) {
  //     func(%accum,
  //          *((foo1 *)p->inPtr[0] + i)[, ...
  //            *((fooN *)p->inPtr[N-1] + i)]
  //          [, p] [, i] [, p->current.y] [, p->current.z]);
  //   }
  //
  // This is very similar to foreach kernel expansion with no output.
  bool ExpandReduceAccumulator(llvm::Function *FnAccumulator, uint32_t Signature, size_t NumInputs) {
    ALOGV("Expanding accumulator %s for general reduce kernel",
          FnAccumulator->getName().str().c_str());

    // Create TBAA meta-data.
    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
                 *TBAAAllocation, *TBAAPointer;
    llvm::MDBuilder MDHelper(*Context);
    TBAARenderScriptDistinct =
      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
                                               TBAARenderScriptDistinct);
    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                       TBAARenderScript);
    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
                                                      TBAAAllocation, 0);
    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
                                                    TBAARenderScript);
    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);

    auto AccumulatorArgIter = FnAccumulator->arg_begin();

    // Create empty accumulator function.
    llvm::Function *FnExpandedAccumulator =
        createEmptyExpandedReduceAccumulator(FnAccumulator->getName(),
                                             (AccumulatorArgIter++)->getType());

    // Extract the expanded accumulator's parameters.  It is
    // guaranteed by createEmptyExpandedReduceAccumulator that
    // there will be 4 parameters.
    bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams);
    auto ExpandedAccumulatorArgIter = FnExpandedAccumulator->arg_begin();
    llvm::Value *Arg_p     = &*(ExpandedAccumulatorArgIter++);
    llvm::Value *Arg_x1    = &*(ExpandedAccumulatorArgIter++);
    llvm::Value *Arg_x2    = &*(ExpandedAccumulatorArgIter++);
    llvm::Value *Arg_accum = &*(ExpandedAccumulatorArgIter++);

    // Construct the actual function body.
    llvm::IRBuilder<> Builder(&*FnExpandedAccumulator->getEntryBlock().begin());

    // Create the loop structure.
    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
    llvm::Value *IndVar;
    createLoop(Builder, Arg_x1, Arg_x2, &IndVar);

    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
    const int CalleeArgsContextIdx =
      ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs,
                             [](){}, LoopHeader->getTerminator());

    llvm::SmallVector<llvm::Type*, 8> InTypes;
    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
    ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs,
                              InTypes, InBufPtrs, InStructTempSlots);

    // Populate the actual call to the original accumulator.
    llvm::SmallVector<llvm::Value*, 8> RootArgs;
    RootArgs.push_back(Arg_accum);
    ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InTypes, InBufPtrs, InStructTempSlots,
                     IndVar, RootArgs);
    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder);
    Builder.CreateCall(FnAccumulator, RootArgs);

    return true;
  }

  // Create a combiner function for a general reduce-style kernel that lacks one,
  // by calling the accumulator function.
  //
  // The accumulator function must be of the form
  //
  //   define void @accumFn(accumType* %accum, accumType %in)
  //
  // A combiner function will be generated of the form
  //
  //   define void @accumFn.combiner(accumType* %accum, accumType* %other) {
  //     %1 = load accumType, accumType* %other
  //     call void @accumFn(accumType* %accum, accumType %1);
  //   }
  bool CreateReduceCombinerFromAccumulator(llvm::Function *FnAccumulator) {
    ALOGV("Creating combiner from accumulator %s for general reduce kernel",
          FnAccumulator->getName().str().c_str());

    using llvm::Attribute;

    bccAssert(FnAccumulator->arg_size() == 2);
    auto AccumulatorArgIter = FnAccumulator->arg_begin();
    llvm::Value *AccumulatorArg_accum = &*(AccumulatorArgIter++);
    llvm::Value *AccumulatorArg_in    = &*(AccumulatorArgIter++);
    llvm::Type *AccumulatorArgType = AccumulatorArg_accum->getType();
    bccAssert(AccumulatorArgType->isPointerTy());

    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
    llvm::FunctionType *CombinerType =
        llvm::FunctionType::get(VoidTy, { AccumulatorArgType, AccumulatorArgType }, false);
    llvm::Function *FnCombiner =
        llvm::Function::Create(CombinerType, llvm::GlobalValue::ExternalLinkage,
                               nameReduceCombinerFromAccumulator(FnAccumulator->getName()),
                               Module);

    auto CombinerArgIter = FnCombiner->arg_begin();

    llvm::Argument *CombinerArg_accum = &(*CombinerArgIter++);
    CombinerArg_accum->setName("accum");
    CombinerArg_accum->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_accum->getArgNo() + 1,
                                                       llvm::makeArrayRef(Attribute::NoCapture)));

    llvm::Argument *CombinerArg_other = &(*CombinerArgIter++);
    CombinerArg_other->setName("other");
    CombinerArg_other->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_other->getArgNo() + 1,
                                                       llvm::makeArrayRef(Attribute::NoCapture)));

    llvm::BasicBlock *BB = llvm::BasicBlock::Create(*Context, "BB", FnCombiner);
    llvm::IRBuilder<> Builder(BB);

    if (AccumulatorArg_in->getType()->isPointerTy()) {
      // Types of sufficient size get passed by pointer-to-copy rather
      // than passed by value.  An accumulator cannot take a pointer
      // at the user level; so if we see a pointer here, we know that
      // we have a pass-by-pointer-to-copy case.
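      // (For instance, a large accumulator element type that the platform ABI
      // lowers to be passed via a pointer to a caller-made copy.)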
      llvm::Type *ElementType = AccumulatorArg_in->getType()->getPointerElementType();
      llvm::Value *TempMem = Builder.CreateAlloca(ElementType, nullptr, "caller_copy");
      Builder.CreateStore(Builder.CreateLoad(CombinerArg_other), TempMem);
      Builder.CreateCall(FnAccumulator, { CombinerArg_accum, TempMem });
    } else {
      llvm::Value *TypeAdjustedOther = CombinerArg_other;
      if (AccumulatorArgType->getPointerElementType() != AccumulatorArg_in->getType()) {
        // Call lowering by the frontend has done some type coercion
        TypeAdjustedOther = Builder.CreatePointerCast(CombinerArg_other,
                                                      AccumulatorArg_in->getType()->getPointerTo(),
                                                      "cast");
      }
      llvm::Value *DerefOther = Builder.CreateLoad(TypeAdjustedOther);
      Builder.CreateCall(FnAccumulator, { CombinerArg_accum, DerefOther });
    }
    Builder.CreateRetVoid();

    return true;
  }

  /// @brief Checks if pointers to allocation internals are exposed
  ///
  /// This function verifies if, through the parameters passed to the kernel
  /// or through calls to the runtime library, the script gains access to
  /// pointers pointing to data within a RenderScript Allocation.
  /// Only if we know that we control all loads from and stores to data within
  /// RenderScript allocations, and that the run-time internal accesses are
  /// all annotated with RenderScript TBAA metadata, can we safely use TBAA
  /// to distinguish between generic and from-allocation pointers.
  bool allocPointersExposed(llvm::Module &Module) {
    // Old-style kernel functions can expose pointers to elements within
    // allocations.
    // TODO: Extend analysis to allow simple cases of old-style kernels.
    for (size_t i = 0; i < mExportForEachCount; ++i) {
      const char *Name = mExportForEachNameList[i];
      uint32_t Signature = mExportForEachSignatureList[i];
      if (Module.getFunction(Name) &&
          !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
        return true;
      }
    }

    // Check for library functions that expose a pointer to an Allocation or
    // that are not yet annotated with RenderScript-specific tbaa information.
    static const std::vector<const char *> Funcs{
      // rsGetElementAt(...)
      "_Z14rsGetElementAt13rs_allocationj",
      "_Z14rsGetElementAt13rs_allocationjj",
      "_Z14rsGetElementAt13rs_allocationjjj",

      // rsSetElementAt()
      "_Z14rsSetElementAt13rs_allocationPvj",
      "_Z14rsSetElementAt13rs_allocationPvjj",
      "_Z14rsSetElementAt13rs_allocationPvjjj",

      // rsGetElementAtYuv_uchar_Y()
      "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",

      // rsGetElementAtYuv_uchar_U()
      "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",

      // rsGetElementAtYuv_uchar_V()
      "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
    };

    for (auto FI : Funcs) {
      llvm::Function *Function = Module.getFunction(FI);

      if (!Function) {
        ALOGE("Missing run-time function '%s'", FI);
        return true;
      }

      if (Function->getNumUses() > 0) {
        return true;
      }
    }

    return false;
  }

  /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
  ///
  /// The TBAA metadata used to annotate loads/stores from RenderScript
  /// Allocations is generated in a separate TBAA tree with a
  /// "RenderScript Distinct TBAA" root node.  LLVM does assume may-alias for
  /// all nodes in unrelated alias analysis trees.
  /// This function makes the "RenderScript TBAA" node (which is parented by
  /// the Distinct TBAA root) a subtree of the normal C/C++ TBAA tree,
  /// alongside the normal C/C++ types.  With the trees connected, every
  /// access to an Allocation resolves to must-alias when compared to a
  /// normal C/C++ access.
  void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
    llvm::MDBuilder MDHelper(*Context);
    llvm::MDNode *TBAARenderScriptDistinct =
      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
    llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
        "RenderScript TBAA", TBAARenderScriptDistinct);
    llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA");
    TBAARenderScript->replaceOperandWith(1, TBAARoot);
  }

  virtual bool runOnModule(llvm::Module &Module) {
    bool Changed = false;
    this->Module = &Module;
    Context = &Module.getContext();

    buildTypes();

    bcinfo::MetadataExtractor me(&Module);
    if (!me.extract()) {
      ALOGE("Could not extract metadata from module!");
      return false;
    }

    mStructExplicitlyPaddedBySlang =
      (me.getCompilerVersion() >= SlangVersion::N_STRUCT_EXPLICIT_PADDING);

    // Expand forEach_* style kernels.
    mExportForEachCount = me.getExportForEachSignatureCount();
    mExportForEachNameList = me.getExportForEachNameList();
    mExportForEachSignatureList = me.getExportForEachSignatureList();

    for (size_t i = 0; i < mExportForEachCount; ++i) {
      const char *name = mExportForEachNameList[i];
      uint32_t signature = mExportForEachSignatureList[i];
      llvm::Function *kernel = Module.getFunction(name);
      if (kernel) {
        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
          Changed |= ExpandForEach(kernel, signature);
          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
        } else if (kernel->getReturnType()->isVoidTy()) {
          Changed |= ExpandOldStyleForEach(kernel, signature);
          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
        } else {
          // There are some graphics root functions that are not
          // expanded, but that will be called directly.  For those
          // functions, we can not set the linkage to internal.
        }
      }
    }

    // Process general reduce_* style functions.
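    // For each exported reduction: promote any initializer, combiner, and
    // outconverter to external linkage, expand the accumulator, and, if no
    // combiner was supplied, synthesize one from the accumulator.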
    const size_t ExportReduceCount = me.getExportReduceCount();
    const bcinfo::MetadataExtractor::Reduce *ExportReduceList = me.getExportReduceList();
    // Note that functions can be shared between kernels
    FunctionSet PromotedFunctions, ExpandedAccumulators, AccumulatorsForCombiners;

    for (size_t i = 0; i < ExportReduceCount; ++i) {
      Changed |= PromoteReduceFunction(ExportReduceList[i].mInitializerName, PromotedFunctions);
      Changed |= PromoteReduceFunction(ExportReduceList[i].mCombinerName, PromotedFunctions);
      Changed |= PromoteReduceFunction(ExportReduceList[i].mOutConverterName, PromotedFunctions);

      // Accumulator
      llvm::Function *accumulator = Module.getFunction(ExportReduceList[i].mAccumulatorName);
      bccAssert(accumulator != nullptr);
      if (ExpandedAccumulators.insert(accumulator).second)
        Changed |= ExpandReduceAccumulator(accumulator,
                                           ExportReduceList[i].mSignature,
                                           ExportReduceList[i].mInputCount);
      if (!ExportReduceList[i].mCombinerName) {
        if (AccumulatorsForCombiners.insert(accumulator).second)
          Changed |= CreateReduceCombinerFromAccumulator(accumulator);
      }
    }

    if (gEnableRsTbaa && !allocPointersExposed(Module)) {
      connectRenderScriptTBAAMetadata(Module);
    }

    return Changed;
  }

  virtual const char *getPassName() const {
    return "forEach_* and reduce_* function expansion";
  }

}; // end RSKernelExpandPass

} // end anonymous namespace

char RSKernelExpandPass::ID = 0;
static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");

namespace bcc {

const char BCC_INDEX_VAR_NAME[] = "rsIndex";

llvm::ModulePass *
createRSKernelExpandPass(bool pEnableStepOpt) {
  return new RSKernelExpandPass(pEnableStepOpt);
}

} // end namespace bcc