#include "rsCpuScriptGroup2.h"

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <set>
#include <sstream>
#include <string>
#include <vector>

#ifndef RS_COMPATIBILITY_LIB
#include "bcc/Config.h"
#endif

#include "cpu_ref/rsCpuCore.h"
#include "rsClosure.h"
#include "rsContext.h"
#include "rsCpuCore.h"
#include "rsCpuExecutable.h"
#include "rsCpuScript.h"
#include "rsScript.h"
#include "rsScriptGroup2.h"
#include "rsScriptIntrinsic.h"

using std::string;
using std::vector;

namespace android {
namespace renderscript {

namespace {

const size_t DefaultKernelArgCount = 2;

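// groupRoot() is the fallback expansion function used when a batch could not be
// fused into a single compiled kernel: Batch::run() points mtls.kernel at it and
// passes the batch's closure list through kinfo->usr. For each closure it patches
// the input/output pointers and strides in kinfo for the current (xstart, y)
// strip, temporarily sets kinfo->usr to the closure's script (which intrinsics
// rely on as their "this" pointer), and calls the closure's expanded kernel.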
void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
               uint32_t xend, uint32_t outstep) {
    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
    RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);

    const size_t oldInLen = mutable_kinfo->inLen;

    decltype(mutable_kinfo->inStride) oldInStride;
    memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));

    for (CPUClosure* cpuClosure : closures) {
        const Closure* closure = cpuClosure->mClosure;

        // There had better be enough space in mutable_kinfo
        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);

        for (size_t i = 0; i < closure->mNumArg; i++) {
            const void* arg = closure->mArgs[i];
            const Allocation* a = (const Allocation*)arg;
            const uint32_t eStride = a->mHal.state.elementSizeBytes;
            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
                    eStride * xstart;
            if (kinfo->dim.y > 1) {
                ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
            }
            mutable_kinfo->inPtr[i] = ptr;
            mutable_kinfo->inStride[i] = eStride;
        }
        mutable_kinfo->inLen = closure->mNumArg;

        const Allocation* out = closure->mReturnValue;
        const uint32_t ostep = out->mHal.state.elementSizeBytes;
        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
                ostep * xstart;
        if (kinfo->dim.y > 1) {
            ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
        }

        mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);

        // The implementation of an intrinsic relies on kinfo->usr being
        // the "this" pointer to the intrinsic (an RsdCpuScriptIntrinsic object)
        mutable_kinfo->usr = cpuClosure->mSi;

        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
    }

    mutable_kinfo->inLen = oldInLen;
    mutable_kinfo->usr = &closures;
    memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
}

}  // namespace

Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
    mGroup(group), mFunc(nullptr) {
    mName = strndup(name, strlen(name));
}

Batch::~Batch() {
    for (CPUClosure* c : mClosures) {
        delete c;
    }
    free(mName);
}

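// Returns true if adding cpuClosure to this batch would break fusibility:
// invokes always get a batch of their own; a closure that depends on a global
// or on a field of an already-batched closure conflicts; and a kernel closure
// only extends the batch if its first argument (argument 0) consumes the return
// value of the last closure already in the batch, which is the chaining shape
// the bcc fusion pass expects.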
bool Batch::conflict(CPUClosure* cpuClosure) const {
    if (mClosures.empty()) {
        return false;
    }

    const Closure* closure = cpuClosure->mClosure;

    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
        // An invoke should be in a batch by itself, so it conflicts with any other
        // closure.
        return true;
    }

    const auto& globalDeps = closure->mGlobalDeps;
    const auto& argDeps = closure->mArgDeps;

    for (CPUClosure* c : mClosures) {
        const Closure* batched = c->mClosure;
        if (globalDeps.find(batched) != globalDeps.end()) {
            return true;
        }
        const auto& it = argDeps.find(batched);
        if (it != argDeps.end()) {
            const auto& args = (*it).second;
            for (const auto &p1 : *args) {
                if (p1.second.get() != nullptr) {
                    return true;
                }
            }
        }
    }

    // The kernel fusion pass in bcc expects kernels to be chained up through the
    // (1st) input and the output.

    const Closure* lastBatched = mClosures.back()->mClosure;
    const auto& it = argDeps.find(lastBatched);

    if (it == argDeps.end()) {
        return true;
    }

    const auto& args = (*it).second;
    for (const auto &p1 : *args) {
        if (p1.first == 0 && p1.second.get() == nullptr) {
            // The new closure depends on the last batched closure's return
            // value (fieldId being nullptr) for its first argument (argument 0).
            return false;
        }
    }

    return true;
}

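// Partitions the group's closures into batches: each closure is appended to the
// current batch until Batch::conflict() says it can no longer be fused, at which
// point the batch is sealed and a new one ("Batch1", "Batch2", ...) is started.
// Outside of the compatibility library build, the batches are then compiled into
// a single shared object and each batch's fused entry point is resolved from it.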
CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
                                         const ScriptGroupBase *sg) :
    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
    mExecutable(nullptr), mScriptObj(nullptr) {
    rsAssert(!mGroup->mClosures.empty());

    mCpuRefImpl->lockMutex();
    Batch* batch = new Batch(this, "Batch0");
    int i = 0;
    for (Closure* closure: mGroup->mClosures) {
        CPUClosure* cc;
        const IDBase* funcID = closure->mFunctionID.get();
        RsdCpuScriptImpl* si =
                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
        if (closure->mIsKernel) {
            MTLaunchStructForEach mtls;
            si->forEachKernelSetup(funcID->mSlot, &mtls);
            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
        } else {
            cc = new CPUClosure(closure, si);
        }

        if (batch->conflict(cc)) {
            mBatches.push_back(batch);
            std::stringstream ss;
            ss << "Batch" << ++i;
            std::string batchStr(ss.str());
            batch = new Batch(this, batchStr.c_str());
        }

        batch->mClosures.push_back(cc);
    }

    rsAssert(!batch->mClosures.empty());
    mBatches.push_back(batch);

#ifndef RS_COMPATIBILITY_LIB
    compile(mGroup->mCacheDir);
    if (mScriptObj != nullptr && mExecutable != nullptr) {
        for (Batch* batch : mBatches) {
            batch->resolveFuncPtr(mScriptObj);
        }
    }
#endif  // RS_COMPATIBILITY_LIB
    mCpuRefImpl->unlockMutex();
}

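// Resolves this batch's fused entry point from the shared object produced by
// compile(). bcc emits kernel batches with an ".expand" suffix (e.g.
// "Batch0.expand"), while invoke batches are looked up under the plain batch name.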
void Batch::resolveFuncPtr(void* sharedObj) {
    std::string funcName(mName);
    if (mClosures.front()->mClosure->mIsKernel) {
        funcName.append(".expand");
    }
    mFunc = dlsym(sharedObj, funcName.c_str());
    rsAssert (mFunc != nullptr);
}

CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
    for (Batch* batch : mBatches) {
        delete batch;
    }
    delete mExecutable;
    // TODO: move this dlclose into ~ScriptExecutable().
    if (mScriptObj != nullptr) {
        dlclose(mScriptObj);
    }
}

namespace {

#ifndef RS_COMPATIBILITY_LIB

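// Chooses the bitcode core library that bcc links against: the debug library for
// debug contexts, an x86-optimized library on x86/x86_64, and the generic library
// otherwise. On NEON-capable ARMv7a builds, coreLibRelaxedPath is also set so that
// relaxed-precision scripts can link the NEON-accelerated math library.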
string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
    *coreLibRelaxedPath = "";

    // If we're debugging, use the debug library.
    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
        return SYSLIBPATH_BC"/libclcore_debug.bc";
    }

    // Check for a platform specific library

#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
    // NEON-capable ARMv7a devices can use an accelerated math library
    // for all reduced precision scripts.
    // ARMv8 does not use NEON, as ASIMD can be used with all precision
    // levels.
    *coreLibRelaxedPath = SYSLIBPATH_BC"/libclcore_neon.bc";
#endif

#if defined(__i386__) || defined(__x86_64__)
    // x86 devices will use an optimized library.
    return SYSLIBPATH_BC"/libclcore_x86.bc";
#else
    return SYSLIBPATH_BC"/libclcore.bc";
#endif
}

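// Assembles the bcc command line used to fuse the group's bitcode files. As a
// rough, hedged sketch (actual paths, the target triple, and the batch
// descriptors depend on the device and on the group being compiled), the
// resulting invocation looks something like:
//
//   bcc -fPIC -embedRSInfo -mtriple <triple>
//       -bclib <path>/libclcore.bc -bclib_relaxed <path>/libclcore_neon.bc
//       script1.bc script2.bc
//       -merge Batch0:0,1.1,0. -invoke Batch1:0,2.
//       -output_path <cacheDir> -O 3 -o <groupName>
//
// The output filename is deliberately pushed last so that compile() can pop it
// off and substitute a clone name when the cached library is already loaded.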
void setupCompileArguments(
        const vector<const char*>& inputs, const vector<string>& kernelBatches,
        const vector<string>& invokeBatches,
        const char* outputDir, const char* outputFileName,
        const char* coreLibPath, const char* coreLibRelaxedPath,
        const bool emitGlobalInfo, const bool emitGlobalInfoSkipConstant,
        int optLevel, vector<const char*>* args) {
    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
    args->push_back("-fPIC");
    args->push_back("-embedRSInfo");
    if (emitGlobalInfo) {
        args->push_back("-rs-global-info");
        if (emitGlobalInfoSkipConstant) {
            args->push_back("-rs-global-info-skip-constant");
        }
    }
    args->push_back("-mtriple");
    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
    args->push_back("-bclib");
    args->push_back(coreLibPath);
    args->push_back("-bclib_relaxed");
    args->push_back(coreLibRelaxedPath);
    for (const char* input : inputs) {
        args->push_back(input);
    }
    for (const string& batch : kernelBatches) {
        args->push_back("-merge");
        args->push_back(batch.c_str());
    }
    for (const string& batch : invokeBatches) {
        args->push_back("-invoke");
        args->push_back(batch.c_str());
    }
    args->push_back("-output_path");
    args->push_back(outputDir);

    args->push_back("-O");
    switch (optLevel) {
    case 0:
        args->push_back("0");
        break;
    case 3:
        args->push_back("3");
        break;
    default:
        ALOGW("Expected optimization level of 0 or 3. Received %d", optLevel);
        args->push_back("3");
        break;
    }

    // The output filename has to be last, in case we need to pop it off and
    // replace it with a different name.
    args->push_back("-o");
    args->push_back(outputFileName);
}

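// Appends one closure to a batch descriptor as "<input index>,<slot>.", where
// <input index> is the position of the closure's bitcode file in the deduplicated
// input list and <slot> is the kernel or invokable slot within that script. For
// example, a closure whose bitcode is inputs[1] and whose slot is 3 contributes
// "1,3." to the descriptor.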
void generateSourceSlot(RsdCpuReferenceImpl* ctxt,
                        const Closure& closure,
                        const std::vector<const char*>& inputs,
                        std::stringstream& ss) {
    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
    const Script* script = funcID->mScript;

    rsAssert (!script->isIntrinsic());

    const RsdCpuScriptImpl *cpuScript =
            (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();

    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
            inputs.begin();

    ss << index << "," << funcID->mSlot << ".";
}

#endif  // RS_COMPATIBILITY_LIB

}  // anonymous namespace

// This function is used by the debugger to inspect ScriptGroup
// compilations.
//
// "__attribute__((noinline))" and "__asm__" are used to prevent the
// function call from being eliminated as a no-op (see the "noinline"
// attribute in gcc documentation).
//
// "__attribute__((weak))" is used to prevent callers from recognizing
// that this is guaranteed to be the function definition, recognizing
// that certain arguments are unused, and optimizing away the passing
// of those arguments (see the LLVM optimization
// DeadArgumentElimination).  Theoretically, the compiler could get
// aggressive enough with link-time optimization that even marking the
// entry point as a weak definition wouldn't solve the problem.
//
extern __attribute__((noinline)) __attribute__((weak))
void debugHintScriptGroup2(const char* groupName,
                           const uint32_t groupNameSize,
                           const ExpandFuncTy* kernel,
                           const uint32_t kernelCount) {
    ALOGV("group name: %d:%s\n", groupNameSize, groupName);
    for (uint32_t i=0; i < kernelCount; ++i) {
        const char* f1 = (const char*)(kernel[i]);
        __asm__ __volatile__("");
        ALOGV("  closure: %p\n", (const void*)f1);
    }
    // do nothing, this is just a hook point for the debugger.
    return;
}

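// Tries to fuse the group's kernels into a single shared object. Several cases
// bail out early and fall back to the unfused groupRoot() path: groups with fewer
// than two closures, -O0 contexts (which instead hand the kernel pointers to the
// debugger via debugHintScriptGroup2), and groups that reference intrinsics.
// Otherwise the bcc command line is built, a cached shared object is reused when
// its embedded build checksum matches, and the group is recompiled and reloaded
// when it does not.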
void CpuScriptGroup2Impl::compile(const char* cacheDir) {
#ifndef RS_COMPATIBILITY_LIB
    if (mGroup->mClosures.size() < 2) {
        return;
    }

    const int optLevel = getCpuRefImpl()->getContext()->getOptLevel();
    if (optLevel == 0) {
        std::vector<ExpandFuncTy> kernels;
        for (const Batch* b : mBatches)
            for (const CPUClosure* c : b->mClosures)
                kernels.push_back(c->mFunc);

        if (kernels.size()) {
            // Pass this information on to the debugger via a hint function.
            debugHintScriptGroup2(mGroup->mName,
                                  strlen(mGroup->mName),
                                  kernels.data(),
                                  kernels.size());
        }

        // Skip script group compilation, forcing the driver to use the fallback
        // execution path, which currently has better support for debugging.
        return;
    }

    auto comparator = [](const char* str1, const char* str2) -> bool {
        return strcmp(str1, str2) < 0;
    };
    std::set<const char*, decltype(comparator)> inputSet(comparator);

    for (Closure* closure : mGroup->mClosures) {
        const Script* script = closure->mFunctionID.get()->mScript;

        // If any script is an intrinsic, give up trying to fuse the kernels.
        if (script->isIntrinsic()) {
            return;
        }

        const RsdCpuScriptImpl *cpuScript =
            (const RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(script);

        const char* bitcodeFilename = cpuScript->getBitcodeFilePath();
        inputSet.insert(bitcodeFilename);
    }

    std::vector<const char*> inputs(inputSet.begin(), inputSet.end());

    std::vector<string> kernelBatches;
    std::vector<string> invokeBatches;

    int i = 0;
    for (const auto& batch : mBatches) {
        rsAssert(batch->size() > 0);

        std::stringstream ss;
        ss << batch->mName << ":";

        if (!batch->mClosures.front()->mClosure->mIsKernel) {
            rsAssert(batch->size() == 1);
            generateSourceSlot(mCpuRefImpl, *batch->mClosures.front()->mClosure, inputs, ss);
            invokeBatches.push_back(ss.str());
        } else {
            for (const auto& cpuClosure : batch->mClosures) {
                generateSourceSlot(mCpuRefImpl, *cpuClosure->mClosure, inputs, ss);
            }
            kernelBatches.push_back(ss.str());
        }
    }

    rsAssert(cacheDir != nullptr);
    string objFilePath(cacheDir);
    objFilePath.append("/");
    objFilePath.append(mGroup->mName);
    objFilePath.append(".o");

    const char* resName = mGroup->mName;
    string coreLibRelaxedPath;
    const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
                                               &coreLibRelaxedPath);

    vector<const char*> arguments;
    bool emitGlobalInfo = getCpuRefImpl()->getEmbedGlobalInfo();
    bool emitGlobalInfoSkipConstant = getCpuRefImpl()->getEmbedGlobalInfoSkipConstant();
    setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir,
                          resName, coreLibPath.c_str(), coreLibRelaxedPath.c_str(),
                          emitGlobalInfo, emitGlobalInfoSkipConstant,
                          optLevel, &arguments);

    std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1,
                                                       arguments.data()));

    inputs.push_back(coreLibPath.c_str());
    inputs.push_back(coreLibRelaxedPath.c_str());

    uint32_t checksum = constructBuildChecksum(nullptr, 0, cmdLine.get(),
                                               inputs.data(), inputs.size());

    if (checksum == 0) {
        return;
    }

    std::stringstream ss;
    ss << std::hex << checksum;
    std::string checksumStr(ss.str());

    //===--------------------------------------------------------------------===//
    // Try to load a shared lib from code cache matching filename and checksum
    //===--------------------------------------------------------------------===//

    bool alreadyLoaded = false;
    std::string cloneName;

    const bool useRSDebugContext =
            (mCpuRefImpl->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG);
    const bool reuse = !is_force_recompile() && !useRSDebugContext;
    if (reuse) {
        mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nullptr,
                                                           &alreadyLoaded);
    }
    if (mScriptObj != nullptr) {
        // A shared library named resName is found in code cache directory
        // cacheDir, and loaded with the handle stored in mScriptObj.

        mExecutable = ScriptExecutable::createFromSharedObject(
            mScriptObj, checksum);

        if (mExecutable != nullptr) {
            // The loaded shared library in mScriptObj has a matching checksum.
            // An executable object has been created.
            return;
        }

        ALOGV("Failed to create an executable object from so file due to "
              "mismatching checksum");

        if (alreadyLoaded) {
            // The shared object found in code cache has already been loaded.
            // A different file name is needed for the new shared library, to
            // avoid corrupting the currently loaded instance.

            cloneName.append(resName);
            cloneName.append("#");
            cloneName.append(SharedLibraryUtils::getRandomString(6).c_str());

            // The last element in arguments is the output filename.
            arguments.pop_back();
            arguments.push_back(cloneName.c_str());
        }

        dlclose(mScriptObj);
        mScriptObj = nullptr;
    }

    //===--------------------------------------------------------------------===//
    // Fuse the input kernels and generate native code in an object file
    //===--------------------------------------------------------------------===//

    arguments.push_back("-build-checksum");
    arguments.push_back(checksumStr.c_str());
    arguments.push_back(nullptr);

    bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
                                      arguments.size()-1,
                                      arguments.data());
    if (!compiled) {
        return;
    }

    //===--------------------------------------------------------------------===//
    // Create and load the shared lib
    //===--------------------------------------------------------------------===//

    std::string SOPath;

    if (!SharedLibraryUtils::createSharedLibrary(
            getCpuRefImpl()->getContext()->getDriverName(), cacheDir, resName,
            reuse, &SOPath)) {
        ALOGE("Failed to link object file '%s'", resName);
        unlink(objFilePath.c_str());
        return;
    }

    unlink(objFilePath.c_str());

    if (reuse) {
        mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    } else {
        mScriptObj = SharedLibraryUtils::loadAndDeleteSharedLibrary(SOPath.c_str());
    }
    if (mScriptObj == nullptr) {
        ALOGE("Unable to load '%s'", resName);
        return;
    }

    if (alreadyLoaded) {
        // Delete the temporary, random-named file that we created to avoid
        // interfering with an already loaded shared library.
        string cloneFilePath(cacheDir);
        cloneFilePath.append("/");
        cloneFilePath.append(cloneName.c_str());
        cloneFilePath.append(".so");
        unlink(cloneFilePath.c_str());
    }

    mExecutable = ScriptExecutable::createFromSharedObject(mScriptObj);

#endif  // RS_COMPATIBILITY_LIB
}

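// Runs the script group batch by batch: each batch first pushes the global values
// captured by its closures into the target scripts, then launches through
// Batch::run().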
void CpuScriptGroup2Impl::execute() {
    for (auto batch : mBatches) {
        batch->setGlobalsForBatch();
        batch->run();
    }
}

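// Copies the (value, size) pairs captured by each closure into the globals they
// bind. A (0, 0) pair marks a value produced by another closure in the same
// module and is skipped; a negative size marks an ObjectBase and is set via
// rsrSetObject()/setVarObj(); anything else is copied as a primitive of size
// bytes, either directly through the executable's field address or via setVar().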
void Batch::setGlobalsForBatch() {
    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const IDBase* funcID = closure->mFunctionID.get();
        Script* s = funcID->mScript;
        for (const auto& p : closure->mGlobals) {
            const int64_t value = p.second.first;
            int size = p.second.second;
            if (value == 0 && size == 0) {
                // This indicates the current closure depends on another closure for a
                // global in their shared module (script). In this case we don't need to
                // copy the value. For example, an invoke initializes a global variable
                // which a kernel later reads.
                continue;
            }
            rsAssert(p.first != nullptr);
            Script* script = p.first->mScript;
            rsAssert(script == s);
            RsdCpuReferenceImpl* ctxt = mGroup->getCpuRefImpl();
            const RsdCpuScriptImpl *cpuScript =
                    (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
            int slot = p.first->mSlot;
            ScriptExecutable* exec = mGroup->getExecutable();
            if (exec != nullptr) {
                const char* varName = cpuScript->getFieldName(slot);
                void* addr = exec->getFieldAddress(varName);
                if (size < 0) {
                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
                                 (rs_object_base*)addr, (ObjectBase*)value);
                } else {
                    memcpy(addr, (const void*)&value, size);
                }
            } else {
                // We use -1 size to indicate an ObjectBase rather than a primitive type.
                if (size < 0) {
                    s->setVarObj(slot, (ObjectBase*)value);
                } else {
                    s->setVar(slot, (const void*)&value, size);
                }
            }
        }
    }
}

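// Launches one batch. An invoke batch calls its single invokable, preferring the
// fused wrapper when compile() produced one. A kernel batch goes through the
// fused ".expand" function when available; otherwise each closure's preLaunch()
// hook runs, the whole closure list is driven through groupRoot() in a single
// launchForEach(), and the postLaunch() hooks run afterwards.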
void Batch::run() {
    if (!mClosures.front()->mClosure->mIsKernel) {
        rsAssert(mClosures.size() == 1);

        // This batch contains a single closure for an invoke function.
        CPUClosure* cc = mClosures.front();
        const Closure* c = cc->mClosure;

        if (mFunc != nullptr) {
            // TODO: Need to align pointers for x86_64.
            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
        } else {
            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
            rsAssert(invokeID != nullptr);
            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
        }

        return;
    }

    if (mFunc != nullptr) {
        MTLaunchStructForEach mtls;
        const CPUClosure* firstCpuClosure = mClosures.front();
        const CPUClosure* lastCpuClosure = mClosures.back();

        firstCpuClosure->mSi->forEachMtlsSetup(
                (const Allocation**)firstCpuClosure->mClosure->mArgs,
                firstCpuClosure->mClosure->mNumArg,
                lastCpuClosure->mClosure->mReturnValue,
                nullptr, 0, nullptr, &mtls);

        mtls.script = nullptr;
        mtls.fep.usr = nullptr;
        mtls.kernel = (ForEachFunc_t)mFunc;

        mGroup->getCpuRefImpl()->launchForEach(
                (const Allocation**)firstCpuClosure->mClosure->mArgs,
                firstCpuClosure->mClosure->mNumArg,
                lastCpuClosure->mClosure->mReturnValue,
                nullptr, &mtls);

        return;
    }

    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const ScriptKernelID* kernelID =
                (const ScriptKernelID*)closure->mFunctionID.get();
        cpuClosure->mSi->preLaunch(kernelID->mSlot,
                                   (const Allocation**)closure->mArgs,
                                   closure->mNumArg, closure->mReturnValue,
                                   nullptr, 0, nullptr);
    }

    const CPUClosure* cpuClosure = mClosures.front();
    const Closure* closure = cpuClosure->mClosure;
    MTLaunchStructForEach mtls;

    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
                                          closure->mNumArg,
                                          closure->mReturnValue,
                                          nullptr, 0, nullptr, &mtls)) {

        mtls.script = nullptr;
        mtls.kernel = &groupRoot;
        mtls.fep.usr = &mClosures;

        mGroup->getCpuRefImpl()->launchForEach(nullptr, 0, nullptr, nullptr, &mtls);
    }

    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const ScriptKernelID* kernelID =
                (const ScriptKernelID*)closure->mFunctionID.get();
        cpuClosure->mSi->postLaunch(kernelID->mSlot,
                                    (const Allocation**)closure->mArgs,
                                    closure->mNumArg, closure->mReturnValue,
                                    nullptr, 0, nullptr);
    }
}

}  // namespace renderscript
}  // namespace android