Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2011-2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "rsCpuCore.h"
     18 #include "rsCpuScript.h"
     19 #include "rsCpuExecutable.h"
     20 
     21 #ifdef RS_COMPATIBILITY_LIB
     22     #include <stdio.h>
     23     #include <sys/stat.h>
     24     #include <unistd.h>
     25 #else
     26     #include "rsCppUtils.h"
     27 
     28     #include <bcc/Config.h>
     29     #include <bcinfo/MetadataExtractor.h>
     30 
     31     #include <zlib.h>
     32     #include <sys/file.h>
     33     #include <sys/types.h>
     34     #include <unistd.h>
     35 
     36     #include <string>
     37     #include <vector>
     38 #endif
     39 
     40 #include <set>
     41 #include <string>
     42 #include <dlfcn.h>
     43 #include <stdlib.h>
     44 #include <string.h>
     45 #include <iostream>
     46 #include <sstream>
     47 
     48 namespace {
     49 
     50 static const bool kDebugGlobalVariables = false;
     51 
     52 static bool allocationLODIsNull(const android::renderscript::Allocation *alloc) {
     53   // Even if alloc != nullptr, mallocPtr could be null if
     54   // IO_OUTPUT/IO_INPUT with no bound surface.
     55   return alloc && alloc->mHal.drvState.lod[0].mallocPtr == nullptr;
     56 }
     57 
     58 #ifndef RS_COMPATIBILITY_LIB
     59 
     60 static void setCompileArguments(std::vector<const char*>* args,
     61                                 const std::string& bcFileName,
     62                                 const char* cacheDir, const char* resName,
     63                                 const char* core_lib, bool useRSDebugContext,
     64                                 const char* bccPluginName, bool emitGlobalInfo,
     65                                 int optLevel, bool emitGlobalInfoSkipConstant) {
     66     rsAssert(cacheDir && resName && core_lib);
     67     args->push_back(android::renderscript::RsdCpuScriptImpl::BCC_EXE_PATH);
     68     args->push_back("-unroll-runtime");
     69     args->push_back("-scalarize-load-store");
     70     if (emitGlobalInfo) {
     71         args->push_back("-rs-global-info");
     72         if (emitGlobalInfoSkipConstant) {
     73             args->push_back("-rs-global-info-skip-constant");
     74         }
     75     }
     76     args->push_back("-o");
     77     args->push_back(resName);
     78     args->push_back("-output_path");
     79     args->push_back(cacheDir);
     80     args->push_back("-bclib");
     81     args->push_back(core_lib);
     82     args->push_back("-mtriple");
     83     args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
     84     args->push_back("-O");
     85 
     86     switch (optLevel) {
     87     case 0:
     88         args->push_back("0");
     89         break;
     90     case 3:
     91         args->push_back("3");
     92         break;
     93     default:
     94         ALOGW("Expected optimization level of 0 or 3. Received %d", optLevel);
     95         args->push_back("3");
     96         break;
     97     }
     98 
     99     // Enable workaround for A53 codegen by default.
    100 #if defined(__aarch64__) && !defined(DISABLE_A53_WORKAROUND)
    101     args->push_back("-aarch64-fix-cortex-a53-835769");
    102 #endif
    103 
    104     // Execute the bcc compiler.
    105     if (useRSDebugContext) {
    106         args->push_back("-rs-debug-ctx");
    107     } else {
    108         // Only load additional libraries for compiles that don't use
    109         // the debug context.
    110         if (bccPluginName && strlen(bccPluginName) > 0) {
    111 #ifdef __ANDROID__
    112             // For Android, -plugin option must be used in order to load the
    113             // vendor plugin from the sphal namespace.
    114             args->push_back("-plugin");
    115 #else
    116             args->push_back("-load");
    117 #endif
    118             args->push_back(bccPluginName);
    119         }
    120     }
    121 
    122     args->push_back("-fPIC");
    123     args->push_back("-embedRSInfo");
    124 
    125     args->push_back(bcFileName.c_str());
    126     args->push_back(nullptr);
    127 }
    128 
    129 static bool compileBitcode(const std::string &bcFileName,
    130                            const char *bitcode,
    131                            size_t bitcodeSize,
    132                            std::vector<const char *> &compileArguments) {
    133     rsAssert(bitcode && bitcodeSize);
    134 
    135     FILE *bcfile = fopen(bcFileName.c_str(), "w");
    136     if (!bcfile) {
    137         ALOGE("Could not write to %s", bcFileName.c_str());
    138         return false;
    139     }
    140     size_t nwritten = fwrite(bitcode, 1, bitcodeSize, bcfile);
    141     fclose(bcfile);
    142     if (nwritten != bitcodeSize) {
    143         ALOGE("Could not write %zu bytes to %s", bitcodeSize,
    144               bcFileName.c_str());
    145         return false;
    146     }
    147 
    148     return android::renderscript::rsuExecuteCommand(
    149                    android::renderscript::RsdCpuScriptImpl::BCC_EXE_PATH,
    150                    compileArguments.size()-1, compileArguments.data());
    151 }
    152 
    153 // The checksum is unnecessary under a few conditions, since the primary
    154 // use-case for it is debugging. If we are loading something from the
    155 // system partition (read-only), we know that it was precompiled as part of
    156 // application ahead of time (and thus the checksum is completely
    157 // unnecessary). The checksum is also unnecessary on release (non-debug)
    158 // builds, as the only way to get a shared object is to have compiled the
    159 // script once already. On a release build, there is no way to adjust the
    160 // other libraries/dependencies, and so the only reason to recompile would
    161 // be for a source APK change or an OTA. In either case, the APK would be
    162 // reinstalled, which would already clear the code_cache/ directory.
    163 bool isChecksumNeeded(const char *cacheDir) {
    164     static const std::string sysLibPathVndk = getVndkSysLibPath();
    165     if ((::strcmp(SYSLIBPATH, cacheDir) == 0) ||
    166         (::strcmp(sysLibPathVndk.c_str(), cacheDir) == 0) ||
    167         (::strcmp(SYSLIBPATH_VENDOR, cacheDir) == 0))
    168         return false;
    169     char buf[PROP_VALUE_MAX];
    170     android::renderscript::property_get("ro.debuggable", buf, "");
    171     return (buf[0] == '1');
    172 }
    173 
    174 bool addFileToChecksum(const char *fileName, uint32_t &checksum) {
    175     int FD = open(fileName, O_RDONLY);
    176     if (FD == -1) {
    177         ALOGE("Cannot open file \'%s\' to compute checksum", fileName);
    178         return false;
    179     }
    180 
    181     char buf[256];
    182     while (true) {
    183         ssize_t nread = read(FD, buf, sizeof(buf));
    184         if (nread < 0) { // bail out on failed read
    185             ALOGE("Error while computing checksum for file \'%s\'", fileName);
    186             return false;
    187         }
    188 
    189         checksum = adler32(checksum, (const unsigned char *) buf, nread);
    190         if (static_cast<size_t>(nread) < sizeof(buf)) // EOF
    191             break;
    192     }
    193 
    194     if (close(FD) != 0) {
    195         ALOGE("Cannot close file \'%s\' after computing checksum", fileName);
    196         return false;
    197     }
    198     return true;
    199 }
    200 
    201 #endif  // !defined(RS_COMPATIBILITY_LIB)
    202 }  // namespace
    203 
    204 namespace android {
    205 namespace renderscript {
    206 
    207 #ifndef RS_COMPATIBILITY_LIB
    208 
    209 uint32_t constructBuildChecksum(uint8_t const *bitcode, size_t bitcodeSize,
    210                                 const char *commandLine,
    211                                 const char** bccFiles, size_t numFiles) {
    212     uint32_t checksum = adler32(0L, Z_NULL, 0);
    213 
    214     // include checksum of bitcode
    215     if (bitcode != nullptr && bitcodeSize > 0) {
    216         checksum = adler32(checksum, bitcode, bitcodeSize);
    217     }
    218 
    219     // include checksum of command line arguments
    220     checksum = adler32(checksum, (const unsigned char *) commandLine,
    221                        strlen(commandLine));
    222 
    223     // include checksum of bccFiles
    224     for (size_t i = 0; i < numFiles; i++) {
    225         const char* bccFile = bccFiles[i];
    226         if (bccFile[0] != 0 && !addFileToChecksum(bccFile, checksum)) {
    227             // return empty checksum instead of something partial/corrupt
    228             return 0;
    229         }
    230     }
    231 
    232     return checksum;
    233 }
    234 
    235 #endif  // !RS_COMPATIBILITY_LIB
    236 
    237 RsdCpuScriptImpl::RsdCpuScriptImpl(RsdCpuReferenceImpl *ctx, const Script *s) {
    238     mCtx = ctx;
    239     mScript = s;
    240 
    241     mScriptSO = nullptr;
    242 
    243     mRoot = nullptr;
    244     mRootExpand = nullptr;
    245     mInit = nullptr;
    246     mFreeChildren = nullptr;
    247     mScriptExec = nullptr;
    248 
    249     mBoundAllocs = nullptr;
    250     mIntrinsicData = nullptr;
    251     mIsThreadable = true;
    252 
    253     mBuildChecksum = 0;
    254     mChecksumNeeded = false;
    255 }
    256 
    257 bool RsdCpuScriptImpl::storeRSInfoFromSO() {
    258     // The shared object may have an invalid build checksum.
    259     // Validate and fail early.
    260     mScriptExec = ScriptExecutable::createFromSharedObject(
    261             mScriptSO, mChecksumNeeded ? mBuildChecksum : 0);
    262 
    263     if (mScriptExec == nullptr) {
    264         return false;
    265     }
    266 
    267     mRoot = (RootFunc_t) dlsym(mScriptSO, "root");
    268     if (mRoot) {
    269         //ALOGE("Found root(): %p", mRoot);
    270     }
    271     mRootExpand = (RootFunc_t) dlsym(mScriptSO, "root.expand");
    272     if (mRootExpand) {
    273         //ALOGE("Found root.expand(): %p", mRootExpand);
    274     }
    275     mInit = (InitOrDtorFunc_t) dlsym(mScriptSO, "init");
    276     if (mInit) {
    277         //ALOGE("Found init(): %p", mInit);
    278     }
    279     mFreeChildren = (InitOrDtorFunc_t) dlsym(mScriptSO, ".rs.dtor");
    280     if (mFreeChildren) {
    281         //ALOGE("Found .rs.dtor(): %p", mFreeChildren);
    282     }
    283 
    284     size_t varCount = mScriptExec->getExportedVariableCount();
    285     if (varCount > 0) {
    286         mBoundAllocs = new Allocation *[varCount];
    287         memset(mBoundAllocs, 0, varCount * sizeof(*mBoundAllocs));
    288     }
    289 
    290     mIsThreadable = mScriptExec->getThreadable();
    291     //ALOGE("Script isThreadable? %d", mIsThreadable);
    292 
    293     if (kDebugGlobalVariables) {
    294         mScriptExec->dumpGlobalInfo();
    295     }
    296 
    297     return true;
    298 }
    299 
    300 bool RsdCpuScriptImpl::init(char const *resName, char const *cacheDir,
    301                             uint8_t const *bitcode, size_t bitcodeSize,
    302                             uint32_t flags, char const *bccPluginName) {
    303     //ALOGE("rsdScriptCreate %p %p %p %p %i %i %p", rsc, resName, cacheDir,
    304     // bitcode, bitcodeSize, flags, lookupFunc);
    305     //ALOGE("rsdScriptInit %p %p", rsc, script);
    306 
    307     mCtx->lockMutex();
    308 #ifndef RS_COMPATIBILITY_LIB
    309     bool useRSDebugContext = false;
    310 
    311     bcinfo::MetadataExtractor bitcodeMetadata((const char *) bitcode, bitcodeSize);
    312     if (!bitcodeMetadata.extract()) {
    313         ALOGE("Could not extract metadata from bitcode");
    314         mCtx->unlockMutex();
    315         return false;
    316     }
    317 
    318     const char* core_lib = findCoreLib(bitcodeMetadata, (const char*)bitcode, bitcodeSize);
    319 
    320     if (mCtx->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
    321         useRSDebugContext = true;
    322     }
    323 
    324     int optLevel = mCtx->getContext()->getOptLevel();
    325 
    326     std::string bcFileName(cacheDir);
    327     bcFileName.append("/");
    328     bcFileName.append(resName);
    329     bcFileName.append(".bc");
    330 
    331     std::vector<const char*> compileArguments;
    332     bool emitGlobalInfo = mCtx->getEmbedGlobalInfo();
    333     bool emitGlobalInfoSkipConstant = mCtx->getEmbedGlobalInfoSkipConstant();
    334     setCompileArguments(&compileArguments, bcFileName, cacheDir, resName, core_lib,
    335                         useRSDebugContext, bccPluginName, emitGlobalInfo,
    336                         optLevel, emitGlobalInfoSkipConstant);
    337 
    338     mChecksumNeeded = isChecksumNeeded(cacheDir);
    339     if (mChecksumNeeded) {
    340         std::vector<const char *> bccFiles = { BCC_EXE_PATH,
    341                                                core_lib,
    342                                              };
    343 
    344         // The last argument of compileArguments is a nullptr, so remove 1 from
    345         // the size.
    346         std::unique_ptr<const char> compileCommandLine(
    347             rsuJoinStrings(compileArguments.size()-1, compileArguments.data()));
    348 
    349         mBuildChecksum = constructBuildChecksum(bitcode, bitcodeSize,
    350                                                 compileCommandLine.get(),
    351                                                 bccFiles.data(), bccFiles.size());
    352 
    353         if (mBuildChecksum == 0) {
    354             // cannot compute checksum but verification is enabled
    355             mCtx->unlockMutex();
    356             return false;
    357         }
    358     }
    359     else {
    360         // add a dummy/constant as a checksum if verification is disabled
    361         mBuildChecksum = 0xabadcafe;
    362     }
    363 
    364     // Append build checksum to commandline
    365     // Handle the terminal nullptr in compileArguments
    366     compileArguments.pop_back();
    367     compileArguments.push_back("-build-checksum");
    368     std::stringstream ss;
    369     ss << std::hex << mBuildChecksum;
    370     std::string checksumStr(ss.str());
    371     compileArguments.push_back(checksumStr.c_str());
    372     compileArguments.push_back(nullptr);
    373 
    374     const bool reuse = !is_force_recompile() && !useRSDebugContext;
    375     if (reuse) {
    376         mScriptSO = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    377 
    378         // Read RS info from the shared object to detect checksum mismatch
    379         if (mScriptSO != nullptr && !storeRSInfoFromSO()) {
    380             dlclose(mScriptSO);
    381             mScriptSO = nullptr;
    382         }
    383     }
    384 
    385     // If reuse is desired and we can't, it's either not there or out of date.
    386     // We compile the bit code and try loading again.
    387     if (mScriptSO == nullptr) {
    388         if (!compileBitcode(bcFileName, (const char*)bitcode, bitcodeSize,
    389                             compileArguments))
    390         {
    391             ALOGE("bcc: FAILS to compile '%s'", resName);
    392             mCtx->unlockMutex();
    393             return false;
    394         }
    395 
    396         std::string SOPath;
    397 
    398         if (!SharedLibraryUtils::createSharedLibrary(
    399                 mCtx->getContext()->getDriverName(), cacheDir, resName, reuse,
    400                 &SOPath)) {
    401             ALOGE("Linker: Failed to link object file '%s'", resName);
    402             mCtx->unlockMutex();
    403             return false;
    404         }
    405 
    406         if (reuse) {
    407             mScriptSO = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    408         } else {
    409             mScriptSO = SharedLibraryUtils::loadAndDeleteSharedLibrary(SOPath.c_str());
    410         }
    411         if (mScriptSO == nullptr) {
    412             ALOGE("Unable to load '%s'", resName);
    413             mCtx->unlockMutex();
    414             return false;
    415         }
    416 
    417         // Read RS symbol information from the .so.
    418         if (!storeRSInfoFromSO()) {
    419             goto error;
    420         }
    421     }
    422 
    423     mBitcodeFilePath.assign(bcFileName.c_str());
    424 
    425 #else  // RS_COMPATIBILITY_LIB is defined
    426     const char *nativeLibDir = mCtx->getContext()->getNativeLibDir();
    427     mScriptSO = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nativeLibDir);
    428 
    429     if (!mScriptSO) {
    430         goto error;
    431     }
    432 
    433     if (!storeRSInfoFromSO()) {
    434         goto error;
    435     }
    436 #endif
    437     mCtx->unlockMutex();
    438     return true;
    439 
    440 error:
    441 
    442     mCtx->unlockMutex();
    443     if (mScriptSO) {
    444         dlclose(mScriptSO);
    445         mScriptSO = nullptr;
    446     }
    447     return false;
    448 }
    449 
    450 #ifndef RS_COMPATIBILITY_LIB
    451 
    452 const char* RsdCpuScriptImpl::findCoreLib(const bcinfo::MetadataExtractor& ME, const char* bitcode,
    453                                           size_t bitcodeSize) {
    454     const char* defaultLib = SYSLIBPATH_BC"/libclcore.bc";
    455 
    456     // If we're debugging, use the debug library.
    457     if (mCtx->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
    458         if (ME.hasDebugInfo()) {
    459             return SYSLIBPATH_BC"/libclcore_debug_g.bc";
    460         }
    461         return SYSLIBPATH_BC"/libclcore_debug.bc";
    462     }
    463 
    464     if (ME.hasDebugInfo()) {
    465         return SYSLIBPATH_BC"/libclcore_g.bc";
    466     }
    467 
    468     // If a callback has been registered to specify a library, use that.
    469     RSSelectRTCallback selectRTCallback = mCtx->getSelectRTCallback();
    470     if (selectRTCallback != nullptr) {
    471         return selectRTCallback((const char*)bitcode, bitcodeSize);
    472     }
    473 
    474     // Check for a platform specific library
    475 #if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
    476     enum bcinfo::RSFloatPrecision prec = ME.getRSFloatPrecision();
    477     if (prec == bcinfo::RS_FP_Relaxed) {
    478         // NEON-capable ARMv7a devices can use an accelerated math library
    479         // for all reduced precision scripts.
    480         // ARMv8 does not use NEON, as ASIMD can be used with all precision
    481         // levels.
    482         return SYSLIBPATH_BC"/libclcore_neon.bc";
    483     } else {
    484         return defaultLib;
    485     }
    486 #elif defined(__i386__) || defined(__x86_64__)
    487     // x86 devices will use an optimized library.
    488     return SYSLIBPATH_BC"/libclcore_x86.bc";
    489 #else
    490     return defaultLib;
    491 #endif
    492 }
    493 
    494 #endif
    495 
    496 void RsdCpuScriptImpl::populateScript(Script *script) {
    497     // Copy info over to runtime
    498     script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
    499     script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
    500     script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
    501     script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
    502     script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
    503     script->mHal.info.exportedPragmaKeyList = mScriptExec->getPragmaKeys();
    504     script->mHal.info.exportedPragmaValueList = mScriptExec->getPragmaValues();
    505 
    506     // Bug, need to stash in metadata
    507     if (mRootExpand) {
    508         script->mHal.info.root = mRootExpand;
    509     } else {
    510         script->mHal.info.root = mRoot;
    511     }
    512 }
    513 
    514 // Set up the launch dimensions, and write the values of the launch
    515 // dimensions into the mtls start/end fields.
    516 //
    517 // Inputs:
    518 //    baseDim - base shape of the input
    519 //         sc - used to constrain the launch dimensions
    520 //
    521 // Returns:
    522 //   True on success, false on failure to set up
    523 bool RsdCpuScriptImpl::setUpMtlsDimensions(MTLaunchStructCommon *mtls,
    524                                            const RsLaunchDimensions &baseDim,
    525                                            const RsScriptCall *sc) {
    526     rsAssert(mtls);
    527 
    528 #define SET_UP_DIMENSION(DIM_FIELD, SC_FIELD) do {            \
    529     if (!sc || (sc->SC_FIELD##End == 0)) {                    \
    530         mtls->end.DIM_FIELD = baseDim.DIM_FIELD;              \
    531     } else {                                                  \
    532         mtls->start.DIM_FIELD =                               \
    533             rsMin(baseDim.DIM_FIELD, sc->SC_FIELD##Start);    \
    534         mtls->end.DIM_FIELD =                                 \
    535             rsMin(baseDim.DIM_FIELD, sc->SC_FIELD##End);      \
    536         if (mtls->start.DIM_FIELD >= mtls->end.DIM_FIELD) {   \
    537             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, \
    538                 "Failed to launch kernel; Invalid "           \
    539                 #SC_FIELD "Start or " #SC_FIELD "End.");      \
    540             return false;                                     \
    541         }                                                     \
    542     }} while(0)
    543 
    544     SET_UP_DIMENSION(x, x);
    545     SET_UP_DIMENSION(y, y);
    546     SET_UP_DIMENSION(z, z);
    547     // Checks and setup of fields other than x, y, z are ignored, since those
    548     // fields are not used in the runtime and are not visible in the Java API.
    549 #undef SET_UP_DIMENSION
    550 
    551     return true;
    552 }
    553 
    554 // Preliminary work to prepare a general reduce-style kernel for launch.
    555 bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation ** ains,
    556                                        uint32_t inLen,
    557                                        const Allocation * aout,
    558                                        const RsScriptCall *sc,
    559                                        MTLaunchStructReduce *mtls) {
    560     rsAssert(ains && (inLen >= 1) && aout);
    561     memset(mtls, 0, sizeof(MTLaunchStructReduce));
    562     mtls->dimPtr = &mtls->redp.dim;
    563 
    564     for (int index = inLen; --index >= 0;) {
    565         if (allocationLODIsNull(ains[index])) {
    566             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    567                                          "reduce called with null in allocations");
    568             return false;
    569         }
    570     }
    571 
    572     if (allocationLODIsNull(aout)) {
    573         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    574                                      "reduce called with null out allocation");
    575         return false;
    576     }
    577 
    578     const Allocation *ain0   = ains[0];
    579     const Type       *inType = ain0->getType();
    580 
    581     mtls->redp.dim.x = inType->getDimX();
    582     mtls->redp.dim.y = inType->getDimY();
    583     mtls->redp.dim.z = inType->getDimZ();
    584 
    585     for (int Index = inLen; --Index >= 1;) {
    586         if (!ain0->hasSameDims(ains[Index])) {
    587             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    588                                          "Failed to launch reduction kernel;"
    589                                          "dimensions of input allocations do not match.");
    590             return false;
    591         }
    592     }
    593 
    594     if (!setUpMtlsDimensions(mtls, mtls->redp.dim, sc)) {
    595         return false;
    596     }
    597 
    598     // The X & Y walkers always want 0-1 min even if dim is not present
    599     mtls->end.x = rsMax((uint32_t)1, mtls->end.x);
    600     mtls->end.y = rsMax((uint32_t)1, mtls->end.y);
    601 
    602     mtls->rs = mCtx;
    603 
    604     mtls->mSliceNum    = 0;
    605     mtls->mSliceSize   = 1;
    606     mtls->isThreadable = mIsThreadable;
    607 
    608     // Set up output,
    609     mtls->redp.outLen = 1;
    610     mtls->redp.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
    611     mtls->redp.outStride[0] = aout->getType()->getElementSizeBytes();
    612 
    613     // Set up input.
    614     memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
    615     mtls->redp.inLen = inLen;
    616     for (int index = inLen; --index >= 0;) {
    617         mtls->redp.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
    618         mtls->redp.inStride[index] = ains[index]->getType()->getElementSizeBytes();
    619     }
    620 
    621     // All validation passed, ok to launch threads
    622     return true;
    623 }
    624 
    625 // Preliminary work to prepare a forEach-style kernel for launch.
    626 bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
    627                                         uint32_t inLen,
    628                                         Allocation * aout,
    629                                         const void * usr, uint32_t usrLen,
    630                                         const RsScriptCall *sc,
    631                                         MTLaunchStructForEach *mtls) {
    632     if (ains == nullptr && inLen != 0) {
    633         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    634           "rsForEach called with none-zero inLen with null in allocations");
    635         return false;
    636     }
    637 
    638     memset(mtls, 0, sizeof(MTLaunchStructForEach));
    639     mtls->dimPtr = &mtls->fep.dim;
    640 
    641     for (int index = inLen; --index >= 0;) {
    642         if (allocationLODIsNull(ains[index])) {
    643             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    644                                          "rsForEach called with null in allocations");
    645             return false;
    646         }
    647     }
    648 
    649     if (allocationLODIsNull(aout)) {
    650         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    651                                      "rsForEach called with null out allocations");
    652         return false;
    653     }
    654 
    655     // The only situation where ains[j] is null is when inLen==1 and j==0;
    656     // and that can only happen for an old-style kernel in API level 11~13,
    657     // where the input allocation cannot be skipped if the output allocation is specified.
    658     if (inLen != 0)
    659         rsAssert((inLen == 1) || (ains[0] != nullptr));
    660 
    661     if (inLen > 0 && ains[0]) {
    662         const Allocation *ain0   = ains[0];
    663         const Type       *inType = ain0->getType();
    664 
    665         mtls->fep.dim.x = inType->getDimX();
    666         mtls->fep.dim.y = inType->getDimY();
    667         mtls->fep.dim.z = inType->getDimZ();
    668 
    669         for (int Index = inLen; --Index >= 1;) {
    670             if (!ain0->hasSameDims(ains[Index])) {
    671                 mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    672                   "Failed to launch kernel; dimensions of input "
    673                   "allocations do not match.");
    674                 return false;
    675             }
    676         }
    677     } else if (aout != nullptr) {
    678         const Type *outType = aout->getType();
    679 
    680         mtls->fep.dim.x = outType->getDimX();
    681         mtls->fep.dim.y = outType->getDimY();
    682         mtls->fep.dim.z = outType->getDimZ();
    683 
    684     } else if (sc != nullptr) {
    685         mtls->fep.dim.x = sc->xEnd;
    686         mtls->fep.dim.y = sc->yEnd;
    687         mtls->fep.dim.z = 0;
    688     } else {
    689         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    690                                      "rsForEach called with null allocations");
    691         return false;
    692     }
    693 
    694     if (inLen > 0 && aout != nullptr) {
    695         if (ains[0] && !ains[0]->hasSameDims(aout)) {
    696             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    697               "Failed to launch kernel; dimensions of input and output allocations do not match.");
    698 
    699             return false;
    700         }
    701     }
    702 
    703     if (!setUpMtlsDimensions(mtls, mtls->fep.dim, sc)) {
    704         return false;
    705     }
    706 
    707     // The X & Y walkers always want 0-1 min even if dim is not present
    708     mtls->end.x    = rsMax((uint32_t)1, mtls->end.x);
    709     mtls->end.y    = rsMax((uint32_t)1, mtls->end.y);
    710     mtls->rs       = mCtx;
    711     if (ains) {
    712         memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
    713     }
    714     mtls->aout[0]    = aout;
    715     mtls->fep.usr    = usr;
    716     mtls->fep.usrLen = usrLen;
    717     mtls->mSliceSize = 1;
    718     mtls->mSliceNum  = 0;
    719 
    720     mtls->isThreadable  = mIsThreadable;
    721 
    722     if (inLen > 0) {
    723         mtls->fep.inLen = inLen;
    724         for (int index = inLen; --index >= 0;) {
    725             if (ains[index] == nullptr) {
    726                 // In old style kernels, the first and only input allocation could be null.
    727                 // Not allowed in newer styles.
    728                 rsAssert(inLen == 1 && index == 0);
    729                 continue;
    730             }
    731             mtls->fep.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
    732             mtls->fep.inStride[index] = ains[index]->getType()->getElementSizeBytes();
    733         }
    734     }
    735 
    736     if (aout != nullptr) {
    737         mtls->fep.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
    738         mtls->fep.outStride[0] = aout->getType()->getElementSizeBytes();
    739     }
    740 
    741     // All validation passed, ok to launch threads
    742     return true;
    743 }
    744 
    745 
    746 void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
    747                                      const Allocation ** ains,
    748                                      uint32_t inLen,
    749                                      Allocation * aout,
    750                                      const void * usr,
    751                                      uint32_t usrLen,
    752                                      const RsScriptCall *sc) {
    753 
    754     MTLaunchStructForEach mtls;
    755 
    756     if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) {
    757         forEachKernelSetup(slot, &mtls);
    758 
    759         RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
    760         mCtx->launchForEach(ains, inLen, aout, sc, &mtls);
    761         mCtx->setTLS(oldTLS);
    762     }
    763 }
    764 
    765 void RsdCpuScriptImpl::invokeReduce(uint32_t slot,
    766                                     const Allocation ** ains, uint32_t inLen,
    767                                     Allocation *aout,
    768                                     const RsScriptCall *sc) {
    769   MTLaunchStructReduce mtls;
    770 
    771   if (reduceMtlsSetup(ains, inLen, aout, sc, &mtls)) {
    772     reduceKernelSetup(slot, &mtls);
    773     RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this);
    774     mCtx->launchReduce(ains, inLen, aout, &mtls);
    775     mCtx->setTLS(oldTLS);
    776   }
    777 }
    778 
    779 void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) {
    780     mtls->script = this;
    781     mtls->fep.slot = slot;
    782     mtls->kernel = mScriptExec->getForEachFunction(slot);
    783     rsAssert(mtls->kernel != nullptr);
    784 }
    785 
    786 void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {
    787     mtls->script = this;
    788     mtls->redp.slot = slot;
    789 
    790     const ReduceDescription *desc = mScriptExec->getReduceDescription(slot);
    791     mtls->accumFunc = desc->accumFunc;
    792     mtls->initFunc  = desc->initFunc;   // might legally be nullptr
    793     mtls->combFunc  = desc->combFunc;   // might legally be nullptr
    794     mtls->outFunc   = desc->outFunc;    // might legally be nullptr
    795     mtls->accumSize = desc->accumSize;
    796 
    797     rsAssert(mtls->accumFunc != nullptr);
    798 }
    799 
    800 int RsdCpuScriptImpl::invokeRoot() {
    801     RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
    802     int ret = mRoot();
    803     mCtx->setTLS(oldTLS);
    804     return ret;
    805 }
    806 
    807 void RsdCpuScriptImpl::invokeInit() {
    808     if (mInit) {
    809         mInit();
    810     }
    811 }
    812 
    813 void RsdCpuScriptImpl::invokeFreeChildren() {
    814     if (mFreeChildren) {
    815         mFreeChildren();
    816     }
    817 }
    818 
    819 void RsdCpuScriptImpl::invokeFunction(uint32_t slot, const void *params,
    820                                       size_t paramLength) {
    821     //ALOGE("invoke %i %p %zu", slot, params, paramLength);
    822     void * ap = nullptr;
    823 
    824 #if defined(__x86_64__)
    825     // The invoked function could have input parameter of vector type for example float4 which
    826     // requires void* params to be 16 bytes aligned when using SSE instructions for x86_64 platform.
    827     // So try to align void* params before passing them into RS exported function.
    828 
    829     if ((uint8_t)(uint64_t)params & 0x0F) {
    830         if ((ap = (void*)memalign(16, paramLength)) != nullptr) {
    831             memcpy(ap, params, paramLength);
    832         } else {
    833             ALOGE("x86_64: invokeFunction memalign error, still use params which"
    834                   " is not 16 bytes aligned.");
    835         }
    836     }
    837 #endif
    838 
    839     RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
    840     reinterpret_cast<void (*)(const void *, uint32_t)>(
    841         mScriptExec->getInvokeFunction(slot))(ap? (const void *) ap: params, paramLength);
    842 
    843 #if defined(__x86_64__)
    844     free(ap);
    845 #endif
    846 
    847     mCtx->setTLS(oldTLS);
    848 }
    849 
    850 void RsdCpuScriptImpl::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
    851     //rsAssert(!script->mFieldIsObject[slot]);
    852     //ALOGE("setGlobalVar %i %p %zu", slot, data, dataLength);
    853 
    854     //if (mIntrinsicID) {
    855         //mIntrinsicFuncs.setVar(dc, script, drv->mIntrinsicData, slot, data, dataLength);
    856         //return;
    857     //}
    858 
    859     int32_t *destPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    860     if (!destPtr) {
    861         //ALOGV("Calling setVar on slot = %i which is null", slot);
    862         return;
    863     }
    864 
    865     memcpy(destPtr, data, dataLength);
    866 }
    867 
    868 void RsdCpuScriptImpl::getGlobalVar(uint32_t slot, void *data, size_t dataLength) {
    869     //rsAssert(!script->mFieldIsObject[slot]);
    870     //ALOGE("getGlobalVar %i %p %zu", slot, data, dataLength);
    871 
    872     int32_t *srcPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    873     if (!srcPtr) {
    874         //ALOGV("Calling setVar on slot = %i which is null", slot);
    875         return;
    876     }
    877     memcpy(data, srcPtr, dataLength);
    878 }
    879 
    880 
    881 void RsdCpuScriptImpl::setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
    882                                                 const Element *elem,
    883                                                 const uint32_t *dims, size_t dimLength) {
    884     int32_t *destPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    885     if (!destPtr) {
    886         //ALOGV("Calling setVar on slot = %i which is null", slot);
    887         return;
    888     }
    889 
    890     // We want to look at dimension in terms of integer components,
    891     // but dimLength is given in terms of bytes.
    892     dimLength /= sizeof(int);
    893 
    894     // Only a single dimension is currently supported.
    895     rsAssert(dimLength == 1);
    896     if (dimLength == 1) {
    897         // First do the increment loop.
    898         size_t stride = elem->getSizeBytes();
    899         const char *cVal = reinterpret_cast<const char *>(data);
    900         for (uint32_t i = 0; i < dims[0]; i++) {
    901             elem->incRefs(cVal);
    902             cVal += stride;
    903         }
    904 
    905         // Decrement loop comes after (to prevent race conditions).
    906         char *oldVal = reinterpret_cast<char *>(destPtr);
    907         for (uint32_t i = 0; i < dims[0]; i++) {
    908             elem->decRefs(oldVal);
    909             oldVal += stride;
    910         }
    911     }
    912 
    913     memcpy(destPtr, data, dataLength);
    914 }
    915 
    916 void RsdCpuScriptImpl::setGlobalBind(uint32_t slot, Allocation *data) {
    917 
    918     //rsAssert(!script->mFieldIsObject[slot]);
    919     //ALOGE("setGlobalBind %i %p", slot, data);
    920 
    921     int32_t *destPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    922     if (!destPtr) {
    923         //ALOGV("Calling setVar on slot = %i which is null", slot);
    924         return;
    925     }
    926 
    927     void *ptr = nullptr;
    928     mBoundAllocs[slot] = data;
    929     if (data) {
    930         ptr = data->mHal.drvState.lod[0].mallocPtr;
    931     }
    932     memcpy(destPtr, &ptr, sizeof(void *));
    933 }
    934 
    935 void RsdCpuScriptImpl::setGlobalObj(uint32_t slot, ObjectBase *data) {
    936 
    937     //rsAssert(script->mFieldIsObject[slot]);
    938     //ALOGE("setGlobalObj %i %p", slot, data);
    939 
    940     int32_t *destPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    941     if (!destPtr) {
    942         //ALOGV("Calling setVar on slot = %i which is null", slot);
    943         return;
    944     }
    945 
    946     rsrSetObject(mCtx->getContext(), (rs_object_base *)destPtr, data);
    947 }
    948 
    949 const char* RsdCpuScriptImpl::getFieldName(uint32_t slot) const {
    950     return mScriptExec->getFieldName(slot);
    951 }
    952 
    953 RsdCpuScriptImpl::~RsdCpuScriptImpl() {
    954     delete mScriptExec;
    955     delete[] mBoundAllocs;
    956     if (mScriptSO) {
    957         dlclose(mScriptSO);
    958     }
    959 }
    960 
    961 Allocation * RsdCpuScriptImpl::getAllocationForPointer(const void *ptr) const {
    962     if (!ptr) {
    963         return nullptr;
    964     }
    965 
    966     for (uint32_t ct=0; ct < mScript->mHal.info.exportedVariableCount; ct++) {
    967         Allocation *a = mBoundAllocs[ct];
    968         if (!a) continue;
    969         if (a->mHal.drvState.lod[0].mallocPtr == ptr) {
    970             return a;
    971         }
    972     }
    973     ALOGE("rsGetAllocation, failed to find %p", ptr);
    974     return nullptr;
    975 }
    976 
    977 int RsdCpuScriptImpl::getGlobalEntries() const {
    978     return mScriptExec->getGlobalEntries();
    979 }
    980 
    981 const char * RsdCpuScriptImpl::getGlobalName(int i) const {
    982     return mScriptExec->getGlobalName(i);
    983 }
    984 
    985 const void * RsdCpuScriptImpl::getGlobalAddress(int i) const {
    986     return mScriptExec->getGlobalAddress(i);
    987 }
    988 
    989 size_t RsdCpuScriptImpl::getGlobalSize(int i) const {
    990     return mScriptExec->getGlobalSize(i);
    991 }
    992 
    993 uint32_t RsdCpuScriptImpl::getGlobalProperties(int i) const {
    994     return mScriptExec->getGlobalProperties(i);
    995 }
    996 
    997 void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains,
    998                                  uint32_t inLen, Allocation * aout,
    999                                  const void * usr, uint32_t usrLen,
   1000                                  const RsScriptCall *sc) {}
   1001 
   1002 void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains,
   1003                                   uint32_t inLen, Allocation * aout,
   1004                                   const void * usr, uint32_t usrLen,
   1005                                   const RsScriptCall *sc) {}
   1006 
   1007 
   1008 } // namespace renderscript
   1009 } // namespace android
   1010