#include "rsCpuScriptGroup2.h"

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <set>
#include <sstream>
#include <string>
#include <vector>

#ifndef RS_COMPATIBILITY_LIB
#include "bcc/Config.h"
#endif

#include "cpu_ref/rsCpuCore.h"
#include "rsClosure.h"
#include "rsContext.h"
#include "rsCpuCore.h"
#include "rsCpuExecutable.h"
#include "rsCpuScript.h"
#include "rsScript.h"
#include "rsScriptGroup2.h"
#include "rsScriptIntrinsic.h"

using std::string;
using std::vector;

namespace android {
namespace renderscript {

namespace {

// NOTE(review): appears unused in this file — confirm before removing.
const size_t DefaultKernelArgCount = 2;

// Root expansion function for the fallback (non-fused) execution path.
// Batch::run() installs this as mtls.kernel with kinfo->usr pointing at the
// batch's closure list. For each closure in the batch it redirects kinfo's
// input/output pointers to the closure's argument and return-value
// Allocations (offset to the current strip/row), then calls that closure's
// expanded kernel. The mutated launch state (inLen, inStride, usr) is
// restored before returning.
void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
               uint32_t xend, uint32_t outstep) {
    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
    // kinfo is shared launch state; it is temporarily mutated per closure
    // and restored at the end of this function.
    RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);

    const size_t oldInLen = mutable_kinfo->inLen;

    // Save the full inStride array so it can be restored after the loop.
    decltype(mutable_kinfo->inStride) oldInStride;
    memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));

    for (CPUClosure* cpuClosure : closures) {
        const Closure* closure = cpuClosure->mClosure;

        // There had better be enough space in mutable_kinfo
        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);

        // Point each kernel input at the corresponding argument Allocation,
        // advanced to the start of the strip being processed.
        for (size_t i = 0; i < closure->mNumArg; i++) {
            const void* arg = closure->mArgs[i];
            const Allocation* a = (const Allocation*)arg;
            const uint32_t eStride = a->mHal.state.elementSizeBytes;
            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
                    eStride * xstart;
            if (kinfo->dim.y > 1) {
                // 2D allocation: advance to the current row.
                ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
            }
            mutable_kinfo->inPtr[i] = ptr;
            mutable_kinfo->inStride[i] = eStride;
        }
        mutable_kinfo->inLen = closure->mNumArg;

        // The output goes to the closure's return-value Allocation.
        const Allocation* out = closure->mReturnValue;
        const uint32_t ostep = out->mHal.state.elementSizeBytes;
        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
                ostep * xstart;
        if (kinfo->dim.y > 1) {
            ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
        }

        mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);

        // The implementation of an intrinsic relies on kinfo->usr being
        // the "this" pointer to the intrinsic (an RsdCpuScriptIntrinsic object)
        mutable_kinfo->usr = cpuClosure->mSi;

        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
    }

    // Restore the launch state mutated above.
    mutable_kinfo->inLen = oldInLen;
    mutable_kinfo->usr = &closures;
    memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
}

}  // namespace

// A Batch owns a private copy of its name and its CPUClosure objects.
Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
    mGroup(group), mFunc(nullptr) {
    // strndup with strlen(name) is equivalent to strdup(name).
    mName = strndup(name, strlen(name));
}

Batch::~Batch() {
    for (CPUClosure* c : mClosures) {
        delete c;
    }
    free(mName);
}

// Returns true if cpuClosure cannot be appended to this batch, i.e. the
// current batch must be closed and a new one started for it.
bool Batch::conflict(CPUClosure* cpuClosure) const {
    if (mClosures.empty()) {
        return false;
    }

    const Closure* closure = cpuClosure->mClosure;

    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
        // An invoke should be in a batch by itself, so it conflicts with any other
        // closure.
        return true;
    }

    const auto& globalDeps = closure->mGlobalDeps;
    const auto& argDeps = closure->mArgDeps;

    // A dependence on any already-batched closure through a global, or through
    // an argument bound to a field (non-null fieldId), is a conflict.
    for (CPUClosure* c : mClosures) {
        const Closure* batched = c->mClosure;
        if (globalDeps.find(batched) != globalDeps.end()) {
            return true;
        }
        const auto& it = argDeps.find(batched);
        if (it != argDeps.end()) {
            const auto& args = (*it).second;
            for (const auto &p1 : *args) {
                if (p1.second.get() != nullptr) {
                    return true;
                }
            }
        }
    }

    // The compiler fusion pass in bcc expects that kernels chained up through
    // (1st) input and output.

    const Closure* lastBatched = mClosures.back()->mClosure;
    const auto& it = argDeps.find(lastBatched);

    if (it == argDeps.end()) {
        return true;
    }

    const auto& args = (*it).second;
    for (const auto &p1 : *args) {
        if (p1.first == 0 && p1.second.get() == nullptr) {
            // The new closure depends on the last batched closure's return
            // value (fieldId being nullptr) for its first argument (argument 0)
            return false;
        }
    }

    return true;
}

// Partitions the group's closures into batches: each closure is appended to
// the current batch until Batch::conflict() says it cannot be, at which point
// a new batch ("Batch<i>") is started. On non-compat builds the batches are
// then compiled into a fused shared object and each batch's entry point is
// resolved from it. The whole construction runs under the driver mutex.
CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
                                         const ScriptGroupBase *sg) :
    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
    mExecutable(nullptr), mScriptObj(nullptr) {
    rsAssert(!mGroup->mClosures.empty());

    mCpuRefImpl->lockMutex();
    Batch* batch = new Batch(this, "Batch0");
    int i = 0;
    for (Closure* closure: mGroup->mClosures) {
        CPUClosure* cc;
        const IDBase* funcID = closure->mFunctionID.get();
        RsdCpuScriptImpl* si =
            (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
        if (closure->mIsKernel) {
            // Kernel closure: capture the expanded kernel entry point now.
            MTLaunchStructForEach mtls;
            si->forEachKernelSetup(funcID->mSlot, &mtls);
            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
        } else {
            cc = new CPUClosure(closure, si);
        }

        if (batch->conflict(cc)) {
            mBatches.push_back(batch);
            std::stringstream ss;
            ss << "Batch" << ++i;
            std::string batchStr(ss.str());
            batch = new Batch(this, batchStr.c_str());
        }

        batch->mClosures.push_back(cc);
    }

    rsAssert(!batch->mClosures.empty());
    mBatches.push_back(batch);

#ifndef RS_COMPATIBILITY_LIB
    compile(mGroup->mCacheDir);
    // Only resolve batch entry points when compile() produced both a loaded
    // shared object and a matching executable.
    if (mScriptObj != nullptr && mExecutable != nullptr) {
        for (Batch* batch : mBatches) {
            batch->resolveFuncPtr(mScriptObj);
        }
    }
#endif  // RS_COMPATIBILITY_LIB
    mCpuRefImpl->unlockMutex();
}

// Looks up this batch's entry point in the compiled shared object. Kernel
// batches are emitted with an ".expand" suffix; invoke batches keep the
// plain batch name.
void Batch::resolveFuncPtr(void* sharedObj) {
    std::string funcName(mName);
    if (mClosures.front()->mClosure->mIsKernel) {
        funcName.append(".expand");
    }
    mFunc = dlsym(sharedObj, funcName.c_str());
    rsAssert (mFunc != nullptr);
}

CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
    for (Batch* batch : mBatches) {
        delete batch;
    }
    delete mExecutable;
    // TODO: move this dlclose into ~ScriptExecutable().
    if (mScriptObj != nullptr) {
        dlclose(mScriptObj);
    }
}

namespace {

#ifndef RS_COMPATIBILITY_LIB

// Returns the path of the core bitcode library to link against, and sets
// *coreLibRelaxedPath to the relaxed-precision variant ("" if none applies).
string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
    *coreLibRelaxedPath = "";

    // If we're debugging, use the debug library.
    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
        return SYSLIBPATH_BC"/libclcore_debug.bc";
    }

    // Check for a platform specific library

#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
    // NEON-capable ARMv7a devices can use an accelerated math library
    // for all reduced precision scripts.
    // ARMv8 does not use NEON, as ASIMD can be used with all precision
    // levels.
    *coreLibRelaxedPath = SYSLIBPATH_BC"/libclcore_neon.bc";
#endif

#if defined(__i386__) || defined(__x86_64__)
    // x86 devices will use an optimized library.
    return SYSLIBPATH_BC"/libclcore_x86.bc";
#else
    return SYSLIBPATH_BC"/libclcore.bc";
#endif
}

// Builds the bcc command line in *args for fusing the given bitcode inputs.
// kernelBatches/invokeBatches carry "-merge"/"-invoke" batch descriptors.
// The output filename is intentionally pushed last so callers can pop and
// replace it (see the clone-name handling in CpuScriptGroup2Impl::compile).
void setupCompileArguments(
        const vector<const char*>& inputs, const vector<string>& kernelBatches,
        const vector<string>& invokeBatches,
        const char* outputDir, const char* outputFileName,
        const char* coreLibPath, const char* coreLibRelaxedPath,
        const bool emitGlobalInfo, const bool emitGlobalInfoSkipConstant,
        int optLevel, vector<const char*>* args) {
    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
    args->push_back("-fPIC");
    args->push_back("-embedRSInfo");
    if (emitGlobalInfo) {
        args->push_back("-rs-global-info");
        if (emitGlobalInfoSkipConstant) {
            args->push_back("-rs-global-info-skip-constant");
        }
    }
    args->push_back("-mtriple");
    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
    args->push_back("-bclib");
    args->push_back(coreLibPath);
    args->push_back("-bclib_relaxed");
    args->push_back(coreLibRelaxedPath);
    for (const char* input : inputs) {
        args->push_back(input);
    }
    for (const string& batch : kernelBatches) {
        args->push_back("-merge");
        args->push_back(batch.c_str());
    }
    for (const string& batch : invokeBatches) {
        args->push_back("-invoke");
        args->push_back(batch.c_str());
    }
    args->push_back("-output_path");
    args->push_back(outputDir);

    // Only -O0 and -O3 are supported; anything else falls back to -O3.
    args->push_back("-O");
    switch (optLevel) {
    case 0:
        args->push_back("0");
        break;
    case 3:
        args->push_back("3");
        break;
    default:
        ALOGW("Expected optimization level of 0 or 3. Received %d", optLevel);
        args->push_back("3");
        break;
    }

    // The output filename has to be the last, in case we need to pop it out and
    // replace with a different name.
    args->push_back("-o");
    args->push_back(outputFileName);
}

// Appends "<input index>,<slot>." to ss, identifying the closure's function
// by the position of its script's bitcode file within `inputs` and by its
// slot number. Intrinsics must have been ruled out by the caller.
void generateSourceSlot(RsdCpuReferenceImpl* ctxt,
                        const Closure& closure,
                        const std::vector<const char*>& inputs,
                        std::stringstream& ss) {
    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
    const Script* script = funcID->mScript;

    rsAssert (!script->isIntrinsic());

    const RsdCpuScriptImpl *cpuScript =
            (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();

    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
                      inputs.begin();

    ss << index << "," << funcID->mSlot << ".";
}

#endif  // RS_COMPATIBILITY_LIB

}  // anonymous namespace

// This function is used by the debugger to inspect ScriptGroup
// compilations.
//
// "__attribute__((noinline))" and "__asm__" are used to prevent the
// function call from being eliminated as a no-op (see the "noinline"
// attribute in gcc documentation).
//
// "__attribute__((weak))" is used to prevent callers from recognizing
// that this is guaranteed to be the function definition, recognizing
// that certain arguments are unused, and optimizing away the passing
// of those arguments (see the LLVM optimization
// DeadArgumentElimination). Theoretically, the compiler could get
// aggressive enough with link-time optimization that even marking the
// entry point as a weak definition wouldn't solve the problem.
//
extern __attribute__((noinline)) __attribute__((weak))
void debugHintScriptGroup2(const char* groupName,
                           const uint32_t groupNameSize,
                           const ExpandFuncTy* kernel,
                           const uint32_t kernelCount) {
    ALOGV("group name: %d:%s\n", groupNameSize, groupName);
    for (uint32_t i=0; i < kernelCount; ++i) {
        const char* f1 = (const char*)(kernel[i]);
        // Empty asm keeps the loop body from being optimized away.
        __asm__ __volatile__("");
        ALOGV(" closure: %p\n", (const void*)f1);
    }
    // do nothing, this is just a hook point for the debugger.
    return;
}

// Fuses the group's kernels into a single shared object via bcc, reusing a
// cached build from cacheDir when its embedded checksum matches. On success
// mScriptObj holds the dlopen handle and mExecutable the executable wrapper;
// on any failure the function returns early, leaving the group to run on the
// fallback (unfused) path.
void CpuScriptGroup2Impl::compile(const char* cacheDir) {
#ifndef RS_COMPATIBILITY_LIB
    // Nothing to fuse for a group of fewer than two closures.
    if (mGroup->mClosures.size() < 2) {
        return;
    }

    const int optLevel = getCpuRefImpl()->getContext()->getOptLevel();
    if (optLevel == 0) {
        std::vector<ExpandFuncTy> kernels;
        for (const Batch* b : mBatches)
            for (const CPUClosure* c : b->mClosures)
                kernels.push_back(c->mFunc);

        if (kernels.size()) {
            // pass this information on to the debugger via a hint function.
            debugHintScriptGroup2(mGroup->mName,
                                  strlen(mGroup->mName),
                                  kernels.data(),
                                  kernels.size());
        }

        // skip script group compilation forcing the driver to use the fallback
        // execution path which currently has better support for debugging.
        return;
    }

    // Collect the distinct bitcode files of all participating scripts,
    // de-duplicated by string content (not pointer identity).
    auto comparator = [](const char* str1, const char* str2) -> bool {
        return strcmp(str1, str2) < 0;
    };
    std::set<const char*, decltype(comparator)> inputSet(comparator);

    for (Closure* closure : mGroup->mClosures) {
        const Script* script = closure->mFunctionID.get()->mScript;

        // If any script is an intrinsic, give up trying fusing the kernels.
        if (script->isIntrinsic()) {
            return;
        }

        const RsdCpuScriptImpl *cpuScript =
                (const RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(script);

        const char* bitcodeFilename = cpuScript->getBitcodeFilePath();
        inputSet.insert(bitcodeFilename);
    }

    std::vector<const char*> inputs(inputSet.begin(), inputSet.end());

    // Build the "-merge"/"-invoke" batch descriptors, e.g. "Batch0:0,1.0,2.".
    std::vector<string> kernelBatches;
    std::vector<string> invokeBatches;

    int i = 0;
    for (const auto& batch : mBatches) {
        rsAssert(batch->size() > 0);

        std::stringstream ss;
        ss << batch->mName << ":";

        if (!batch->mClosures.front()->mClosure->mIsKernel) {
            // An invoke is always alone in its batch (see Batch::conflict).
            rsAssert(batch->size() == 1);
            generateSourceSlot(mCpuRefImpl, *batch->mClosures.front()->mClosure, inputs, ss);
            invokeBatches.push_back(ss.str());
        } else {
            for (const auto& cpuClosure : batch->mClosures) {
                generateSourceSlot(mCpuRefImpl, *cpuClosure->mClosure, inputs, ss);
            }
            kernelBatches.push_back(ss.str());
        }
    }

    rsAssert(cacheDir != nullptr);
    string objFilePath(cacheDir);
    objFilePath.append("/");
    objFilePath.append(mGroup->mName);
    objFilePath.append(".o");

    const char* resName = mGroup->mName;
    string coreLibRelaxedPath;
    const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
                                               &coreLibRelaxedPath);

    vector<const char*> arguments;
    bool emitGlobalInfo = getCpuRefImpl()->getEmbedGlobalInfo();
    bool emitGlobalInfoSkipConstant = getCpuRefImpl()->getEmbedGlobalInfoSkipConstant();
    setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir,
                          resName, coreLibPath.c_str(), coreLibRelaxedPath.c_str(),
                          emitGlobalInfo, emitGlobalInfoSkipConstant,
                          optLevel, &arguments);

    // Checksum is computed over the command line (excluding the trailing
    // output filename, hence size()-1) plus all input files, including the
    // core libraries appended below.
    std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1,
                                                       arguments.data()));

    inputs.push_back(coreLibPath.c_str());
    inputs.push_back(coreLibRelaxedPath.c_str());

    uint32_t checksum = constructBuildChecksum(nullptr, 0, cmdLine.get(),
                                               inputs.data(), inputs.size());

    if (checksum == 0) {
        return;
    }

    std::stringstream ss;
    ss << std::hex << checksum;
    std::string checksumStr(ss.str());

    //===--------------------------------------------------------------------===//
    // Try to load a shared lib from code cache matching filename and checksum
    //===--------------------------------------------------------------------===//

    bool alreadyLoaded = false;
    std::string cloneName;

    // Never reuse a cached build when recompilation is forced or in a debug
    // context.
    const bool useRSDebugContext =
            (mCpuRefImpl->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG);
    const bool reuse = !is_force_recompile() && !useRSDebugContext;
    if (reuse) {
        mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nullptr,
                                                           &alreadyLoaded);
    }
    if (mScriptObj != nullptr) {
        // A shared library named resName is found in code cache directory
        // cacheDir, and loaded with the handle stored in mScriptObj.

        mExecutable = ScriptExecutable::createFromSharedObject(
                mScriptObj, checksum);

        if (mExecutable != nullptr) {
            // The loaded shared library in mScriptObj has a matching checksum.
            // An executable object has been created.
            return;
        }

        ALOGV("Failed to create an executable object from so file due to "
              "mismatching checksum");

        if (alreadyLoaded) {
            // The shared object found in code cache has already been loaded.
            // A different file name is needed for the new shared library, to
            // avoid corrupting the currently loaded instance.

            cloneName.append(resName);
            cloneName.append("#");
            cloneName.append(SharedLibraryUtils::getRandomString(6).c_str());

            // The last element in arguments is the output filename.
            arguments.pop_back();
            arguments.push_back(cloneName.c_str());
        }

        dlclose(mScriptObj);
        mScriptObj = nullptr;
    }

    //===--------------------------------------------------------------------===//
    // Fuse the input kernels and generate native code in an object file
    //===--------------------------------------------------------------------===//

    arguments.push_back("-build-checksum");
    arguments.push_back(checksumStr.c_str());
    arguments.push_back(nullptr);

    bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
                                      arguments.size()-1,
                                      arguments.data());
    if (!compiled) {
        return;
    }

    //===--------------------------------------------------------------------===//
    // Create and load the shared lib
    //===--------------------------------------------------------------------===//

    std::string SOPath;

    if (!SharedLibraryUtils::createSharedLibrary(
            getCpuRefImpl()->getContext()->getDriverName(), cacheDir, resName,
            reuse, &SOPath)) {
        ALOGE("Failed to link object file '%s'", resName);
        unlink(objFilePath.c_str());
        return;
    }

    // The intermediate object file is no longer needed once linked.
    unlink(objFilePath.c_str());

    if (reuse) {
        mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    } else {
        mScriptObj = SharedLibraryUtils::loadAndDeleteSharedLibrary(SOPath.c_str());
    }
    if (mScriptObj == nullptr) {
        ALOGE("Unable to load '%s'", resName);
        return;
    }

    if (alreadyLoaded) {
        // Delete the temporary, random-named file that we created to avoid
        // interfering with an already loaded shared library.
        string cloneFilePath(cacheDir);
        cloneFilePath.append("/");
        cloneFilePath.append(cloneName.c_str());
        cloneFilePath.append(".so");
        unlink(cloneFilePath.c_str());
    }

    mExecutable = ScriptExecutable::createFromSharedObject(mScriptObj);

#endif  // RS_COMPATIBILITY_LIB
}

// Runs all batches in order, pushing each batch's bound globals first.
void CpuScriptGroup2Impl::execute() {
    for (auto batch : mBatches) {
        batch->setGlobalsForBatch();
        batch->run();
    }
}

// Copies each closure's bound global values into the corresponding script
// globals — directly through the fused executable's field addresses when a
// fused build is loaded, otherwise via the Script setVar/setVarObj API.
void Batch::setGlobalsForBatch() {
    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const IDBase* funcID = closure->mFunctionID.get();
        Script* s = funcID->mScript;;
        for (const auto& p : closure->mGlobals) {
            const int64_t value = p.second.first;
            int size = p.second.second;
            if (value == 0 && size == 0) {
                // This indicates the current closure depends on another closure for a
                // global in their shared module (script). In this case we don't need to
                // copy the value. For example, an invoke initializes a global variable
                // which a kernel later reads.
                continue;
            }
            rsAssert(p.first != nullptr);
            Script* script = p.first->mScript;
            // Globals may only be bound on the closure's own script.
            rsAssert(script == s);
            RsdCpuReferenceImpl* ctxt = mGroup->getCpuRefImpl();
            const RsdCpuScriptImpl *cpuScript =
                    (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
            int slot = p.first->mSlot;
            ScriptExecutable* exec = mGroup->getExecutable();
            if (exec != nullptr) {
                // Fused shared object loaded: write straight to the global's
                // address in that module.
                const char* varName = cpuScript->getFieldName(slot);
                void* addr = exec->getFieldAddress(varName);
                if (size < 0) {
                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
                                 (rs_object_base*)addr, (ObjectBase*)value);
                } else {
                    memcpy(addr, (const void*)&value, size);
                }
            } else {
                // We use -1 size to indicate an ObjectBase rather than a primitive type
                if (size < 0) {
                    s->setVarObj(slot, (ObjectBase*)value);
                } else {
                    s->setVar(slot, (const void*)&value, size);
                }
            }
        }
    }
}

// Executes this batch via one of three paths:
//  - a single invoke closure: call the fused invoke function if resolved,
//    otherwise invoke through the script;
//  - kernel batch with a fused function: one launch of the fused kernel;
//  - kernel batch without one (fallback): launch groupRoot, which runs each
//    closure's kernel in sequence per strip.
void Batch::run() {
    if (!mClosures.front()->mClosure->mIsKernel) {
        rsAssert(mClosures.size() == 1);

        // This batch contains a single closure for an invoke function
        CPUClosure* cc = mClosures.front();
        const Closure* c = cc->mClosure;

        if (mFunc != nullptr) {
            // TODO: Need align pointers for x86_64.
            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
        } else {
            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
            rsAssert(invokeID != nullptr);
            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
        }

        return;
    }

    if (mFunc != nullptr) {
        // Fused path: inputs come from the first closure, the output is the
        // last closure's return value.
        MTLaunchStructForEach mtls;
        const CPUClosure* firstCpuClosure = mClosures.front();
        const CPUClosure* lastCpuClosure = mClosures.back();

        firstCpuClosure->mSi->forEachMtlsSetup(
                (const Allocation**)firstCpuClosure->mClosure->mArgs,
                firstCpuClosure->mClosure->mNumArg,
                lastCpuClosure->mClosure->mReturnValue,
                nullptr, 0, nullptr, &mtls);

        mtls.script = nullptr;
        mtls.fep.usr = nullptr;
        mtls.kernel = (ForEachFunc_t)mFunc;

        mGroup->getCpuRefImpl()->launchForEach(
                (const Allocation**)firstCpuClosure->mClosure->mArgs,
                firstCpuClosure->mClosure->mNumArg,
                lastCpuClosure->mClosure->mReturnValue,
                nullptr, &mtls);

        return;
    }

    // Fallback path: run each kernel's pre-launch hook, then launch groupRoot
    // with the closure list as user data, then run the post-launch hooks.
    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const ScriptKernelID* kernelID =
                (const ScriptKernelID*)closure->mFunctionID.get();
        cpuClosure->mSi->preLaunch(kernelID->mSlot,
                                   (const Allocation**)closure->mArgs,
                                   closure->mNumArg, closure->mReturnValue,
                                   nullptr, 0, nullptr);
    }

    const CPUClosure* cpuClosure = mClosures.front();
    const Closure* closure = cpuClosure->mClosure;
    MTLaunchStructForEach mtls;

    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
                                          closure->mNumArg,
                                          closure->mReturnValue,
                                          nullptr, 0, nullptr, &mtls)) {

        mtls.script = nullptr;
        mtls.kernel = &groupRoot;
        mtls.fep.usr = &mClosures;

        mGroup->getCpuRefImpl()->launchForEach(nullptr, 0, nullptr, nullptr, &mtls);
    }

    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const ScriptKernelID* kernelID =
                (const ScriptKernelID*)closure->mFunctionID.get();
        cpuClosure->mSi->postLaunch(kernelID->mSlot,
                                    (const Allocation**)closure->mArgs,
                                    closure->mNumArg, closure->mReturnValue,
                                    nullptr, 0, nullptr);
    }
}

}  // namespace renderscript
}  // namespace android