Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2011-2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "rsCpuCore.h"
     18 #include "rsCpuScript.h"
     19 #include "rsCpuExecutable.h"
     20 
     21 #ifdef RS_COMPATIBILITY_LIB
     22     #include <stdio.h>
     23     #include <sys/stat.h>
     24     #include <unistd.h>
     25 #else
     26     #include "rsCppUtils.h"
     27 
     28     #include <bcc/Config.h>
     29     #include <bcinfo/MetadataExtractor.h>
     30 
     31     #include <zlib.h>
     32     #include <sys/file.h>
     33     #include <sys/types.h>
     34     #include <unistd.h>
     35 
     36     #include <string>
     37     #include <vector>
     38 #endif
     39 
     40 #include <set>
     41 #include <string>
     42 #include <dlfcn.h>
     43 #include <stdlib.h>
     44 #include <string.h>
     45 #include <iostream>
     46 #include <sstream>
     47 
     48 namespace {
     49 
     50 static const bool kDebugGlobalVariables = false;
     51 
     52 static bool allocationLODIsNull(const android::renderscript::Allocation *alloc) {
     53   // Even if alloc != nullptr, mallocPtr could be null if
     54   // IO_OUTPUT/IO_INPUT with no bound surface.
     55   return alloc && alloc->mHal.drvState.lod[0].mallocPtr == nullptr;
     56 }
     57 
     58 #ifndef RS_COMPATIBILITY_LIB
     59 
     60 static void setCompileArguments(std::vector<const char*>* args,
     61                                 const std::string& bcFileName,
     62                                 const char* cacheDir, const char* resName,
     63                                 const char* core_lib, bool useRSDebugContext,
     64                                 const char* bccPluginName, bool emitGlobalInfo,
     65                                 int optLevel, bool emitGlobalInfoSkipConstant) {
     66     rsAssert(cacheDir && resName && core_lib);
     67     args->push_back(android::renderscript::RsdCpuScriptImpl::BCC_EXE_PATH);
     68     args->push_back("-unroll-runtime");
     69     args->push_back("-scalarize-load-store");
     70     if (emitGlobalInfo) {
     71         args->push_back("-rs-global-info");
     72         if (emitGlobalInfoSkipConstant) {
     73             args->push_back("-rs-global-info-skip-constant");
     74         }
     75     }
     76     args->push_back("-o");
     77     args->push_back(resName);
     78     args->push_back("-output_path");
     79     args->push_back(cacheDir);
     80     args->push_back("-bclib");
     81     args->push_back(core_lib);
     82     args->push_back("-mtriple");
     83     args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
     84     args->push_back("-O");
     85 
     86     switch (optLevel) {
     87     case 0:
     88         args->push_back("0");
     89         break;
     90     case 3:
     91         args->push_back("3");
     92         break;
     93     default:
     94         ALOGW("Expected optimization level of 0 or 3. Received %d", optLevel);
     95         args->push_back("3");
     96         break;
     97     }
     98 
     99     // Enable workaround for A53 codegen by default.
    100 #if defined(__aarch64__) && !defined(DISABLE_A53_WORKAROUND)
    101     args->push_back("-aarch64-fix-cortex-a53-835769");
    102 #endif
    103 
    104     // Execute the bcc compiler.
    105     if (useRSDebugContext) {
    106         args->push_back("-rs-debug-ctx");
    107     } else {
    108         // Only load additional libraries for compiles that don't use
    109         // the debug context.
    110         if (bccPluginName && strlen(bccPluginName) > 0) {
    111             args->push_back("-load");
    112             args->push_back(bccPluginName);
    113         }
    114     }
    115 
    116     args->push_back("-fPIC");
    117     args->push_back("-embedRSInfo");
    118 
    119     args->push_back(bcFileName.c_str());
    120     args->push_back(nullptr);
    121 }
    122 
    123 static bool compileBitcode(const std::string &bcFileName,
    124                            const char *bitcode,
    125                            size_t bitcodeSize,
    126                            std::vector<const char *> &compileArguments) {
    127     rsAssert(bitcode && bitcodeSize);
    128 
    129     FILE *bcfile = fopen(bcFileName.c_str(), "w");
    130     if (!bcfile) {
    131         ALOGE("Could not write to %s", bcFileName.c_str());
    132         return false;
    133     }
    134     size_t nwritten = fwrite(bitcode, 1, bitcodeSize, bcfile);
    135     fclose(bcfile);
    136     if (nwritten != bitcodeSize) {
    137         ALOGE("Could not write %zu bytes to %s", bitcodeSize,
    138               bcFileName.c_str());
    139         return false;
    140     }
    141 
    142     return android::renderscript::rsuExecuteCommand(
    143                    android::renderscript::RsdCpuScriptImpl::BCC_EXE_PATH,
    144                    compileArguments.size()-1, compileArguments.data());
    145 }
    146 
    147 // The checksum is unnecessary under a few conditions, since the primary
    148 // use-case for it is debugging. If we are loading something from the
    149 // system partition (read-only), we know that it was precompiled as part of
    150 // application ahead of time (and thus the checksum is completely
    151 // unnecessary). The checksum is also unnecessary on release (non-debug)
    152 // builds, as the only way to get a shared object is to have compiled the
    153 // script once already. On a release build, there is no way to adjust the
    154 // other libraries/dependencies, and so the only reason to recompile would
    155 // be for a source APK change or an OTA. In either case, the APK would be
    156 // reinstalled, which would already clear the code_cache/ directory.
    157 bool isChecksumNeeded(const char *cacheDir) {
    158     if ((::strcmp(SYSLIBPATH, cacheDir) == 0) ||
    159         (::strcmp(SYSLIBPATH_VENDOR, cacheDir) == 0))
    160         return false;
    161     char buf[PROP_VALUE_MAX];
    162     android::renderscript::property_get("ro.debuggable", buf, "");
    163     return (buf[0] == '1');
    164 }
    165 
    166 bool addFileToChecksum(const char *fileName, uint32_t &checksum) {
    167     int FD = open(fileName, O_RDONLY);
    168     if (FD == -1) {
    169         ALOGE("Cannot open file \'%s\' to compute checksum", fileName);
    170         return false;
    171     }
    172 
    173     char buf[256];
    174     while (true) {
    175         ssize_t nread = read(FD, buf, sizeof(buf));
    176         if (nread < 0) { // bail out on failed read
    177             ALOGE("Error while computing checksum for file \'%s\'", fileName);
    178             return false;
    179         }
    180 
    181         checksum = adler32(checksum, (const unsigned char *) buf, nread);
    182         if (static_cast<size_t>(nread) < sizeof(buf)) // EOF
    183             break;
    184     }
    185 
    186     if (close(FD) != 0) {
    187         ALOGE("Cannot close file \'%s\' after computing checksum", fileName);
    188         return false;
    189     }
    190     return true;
    191 }
    192 
    193 #endif  // !defined(RS_COMPATIBILITY_LIB)
    194 }  // namespace
    195 
    196 namespace android {
    197 namespace renderscript {
    198 
    199 #ifndef RS_COMPATIBILITY_LIB
    200 
    201 uint32_t constructBuildChecksum(uint8_t const *bitcode, size_t bitcodeSize,
    202                                 const char *commandLine,
    203                                 const char** bccFiles, size_t numFiles) {
    204     uint32_t checksum = adler32(0L, Z_NULL, 0);
    205 
    206     // include checksum of bitcode
    207     if (bitcode != nullptr && bitcodeSize > 0) {
    208         checksum = adler32(checksum, bitcode, bitcodeSize);
    209     }
    210 
    211     // include checksum of command line arguments
    212     checksum = adler32(checksum, (const unsigned char *) commandLine,
    213                        strlen(commandLine));
    214 
    215     // include checksum of bccFiles
    216     for (size_t i = 0; i < numFiles; i++) {
    217         const char* bccFile = bccFiles[i];
    218         if (bccFile[0] != 0 && !addFileToChecksum(bccFile, checksum)) {
    219             // return empty checksum instead of something partial/corrupt
    220             return 0;
    221         }
    222     }
    223 
    224     return checksum;
    225 }
    226 
    227 #endif  // !RS_COMPATIBILITY_LIB
    228 
    229 RsdCpuScriptImpl::RsdCpuScriptImpl(RsdCpuReferenceImpl *ctx, const Script *s) {
    230     mCtx = ctx;
    231     mScript = s;
    232 
    233     mScriptSO = nullptr;
    234 
    235     mRoot = nullptr;
    236     mRootExpand = nullptr;
    237     mInit = nullptr;
    238     mFreeChildren = nullptr;
    239     mScriptExec = nullptr;
    240 
    241     mBoundAllocs = nullptr;
    242     mIntrinsicData = nullptr;
    243     mIsThreadable = true;
    244 
    245     mBuildChecksum = 0;
    246     mChecksumNeeded = false;
    247 }
    248 
    249 bool RsdCpuScriptImpl::storeRSInfoFromSO() {
    250     // The shared object may have an invalid build checksum.
    251     // Validate and fail early.
    252     mScriptExec = ScriptExecutable::createFromSharedObject(
    253             mScriptSO, mChecksumNeeded ? mBuildChecksum : 0);
    254 
    255     if (mScriptExec == nullptr) {
    256         return false;
    257     }
    258 
    259     mRoot = (RootFunc_t) dlsym(mScriptSO, "root");
    260     if (mRoot) {
    261         //ALOGE("Found root(): %p", mRoot);
    262     }
    263     mRootExpand = (RootFunc_t) dlsym(mScriptSO, "root.expand");
    264     if (mRootExpand) {
    265         //ALOGE("Found root.expand(): %p", mRootExpand);
    266     }
    267     mInit = (InitOrDtorFunc_t) dlsym(mScriptSO, "init");
    268     if (mInit) {
    269         //ALOGE("Found init(): %p", mInit);
    270     }
    271     mFreeChildren = (InitOrDtorFunc_t) dlsym(mScriptSO, ".rs.dtor");
    272     if (mFreeChildren) {
    273         //ALOGE("Found .rs.dtor(): %p", mFreeChildren);
    274     }
    275 
    276     size_t varCount = mScriptExec->getExportedVariableCount();
    277     if (varCount > 0) {
    278         mBoundAllocs = new Allocation *[varCount];
    279         memset(mBoundAllocs, 0, varCount * sizeof(*mBoundAllocs));
    280     }
    281 
    282     mIsThreadable = mScriptExec->getThreadable();
    283     //ALOGE("Script isThreadable? %d", mIsThreadable);
    284 
    285     if (kDebugGlobalVariables) {
    286         mScriptExec->dumpGlobalInfo();
    287     }
    288 
    289     return true;
    290 }
    291 
    292 bool RsdCpuScriptImpl::init(char const *resName, char const *cacheDir,
    293                             uint8_t const *bitcode, size_t bitcodeSize,
    294                             uint32_t flags, char const *bccPluginName) {
    295     //ALOGE("rsdScriptCreate %p %p %p %p %i %i %p", rsc, resName, cacheDir,
    296     // bitcode, bitcodeSize, flags, lookupFunc);
    297     //ALOGE("rsdScriptInit %p %p", rsc, script);
    298 
    299     mCtx->lockMutex();
    300 #ifndef RS_COMPATIBILITY_LIB
    301     bool useRSDebugContext = false;
    302 
    303     bcinfo::MetadataExtractor bitcodeMetadata((const char *) bitcode, bitcodeSize);
    304     if (!bitcodeMetadata.extract()) {
    305         ALOGE("Could not extract metadata from bitcode");
    306         mCtx->unlockMutex();
    307         return false;
    308     }
    309 
    310     const char* core_lib = findCoreLib(bitcodeMetadata, (const char*)bitcode, bitcodeSize);
    311 
    312     if (mCtx->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
    313         useRSDebugContext = true;
    314     }
    315 
    316     int optLevel = mCtx->getContext()->getOptLevel();
    317 
    318     std::string bcFileName(cacheDir);
    319     bcFileName.append("/");
    320     bcFileName.append(resName);
    321     bcFileName.append(".bc");
    322 
    323     std::vector<const char*> compileArguments;
    324     bool emitGlobalInfo = mCtx->getEmbedGlobalInfo();
    325     bool emitGlobalInfoSkipConstant = mCtx->getEmbedGlobalInfoSkipConstant();
    326     setCompileArguments(&compileArguments, bcFileName, cacheDir, resName, core_lib,
    327                         useRSDebugContext, bccPluginName, emitGlobalInfo,
    328                         optLevel, emitGlobalInfoSkipConstant);
    329 
    330     mChecksumNeeded = isChecksumNeeded(cacheDir);
    331     if (mChecksumNeeded) {
    332         std::vector<const char *> bccFiles = { BCC_EXE_PATH,
    333                                                core_lib,
    334                                              };
    335 
    336         // The last argument of compileArguments is a nullptr, so remove 1 from
    337         // the size.
    338         std::unique_ptr<const char> compileCommandLine(
    339             rsuJoinStrings(compileArguments.size()-1, compileArguments.data()));
    340 
    341         mBuildChecksum = constructBuildChecksum(bitcode, bitcodeSize,
    342                                                 compileCommandLine.get(),
    343                                                 bccFiles.data(), bccFiles.size());
    344 
    345         if (mBuildChecksum == 0) {
    346             // cannot compute checksum but verification is enabled
    347             mCtx->unlockMutex();
    348             return false;
    349         }
    350     }
    351     else {
    352         // add a dummy/constant as a checksum if verification is disabled
    353         mBuildChecksum = 0xabadcafe;
    354     }
    355 
    356     // Append build checksum to commandline
    357     // Handle the terminal nullptr in compileArguments
    358     compileArguments.pop_back();
    359     compileArguments.push_back("-build-checksum");
    360     std::stringstream ss;
    361     ss << std::hex << mBuildChecksum;
    362     std::string checksumStr(ss.str());
    363     compileArguments.push_back(checksumStr.c_str());
    364     compileArguments.push_back(nullptr);
    365 
    366     const bool reuse = !is_force_recompile() && !useRSDebugContext;
    367     if (reuse) {
    368         mScriptSO = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    369 
    370         // Read RS info from the shared object to detect checksum mismatch
    371         if (mScriptSO != nullptr && !storeRSInfoFromSO()) {
    372             dlclose(mScriptSO);
    373             mScriptSO = nullptr;
    374         }
    375     }
    376 
    377     // If reuse is desired and we can't, it's either not there or out of date.
    378     // We compile the bit code and try loading again.
    379     if (mScriptSO == nullptr) {
    380         if (!compileBitcode(bcFileName, (const char*)bitcode, bitcodeSize,
    381                             compileArguments))
    382         {
    383             ALOGE("bcc: FAILS to compile '%s'", resName);
    384             mCtx->unlockMutex();
    385             return false;
    386         }
    387 
    388         std::string SOPath;
    389 
    390         if (!SharedLibraryUtils::createSharedLibrary(
    391                 mCtx->getContext()->getDriverName(), cacheDir, resName, reuse,
    392                 &SOPath)) {
    393             ALOGE("Linker: Failed to link object file '%s'", resName);
    394             mCtx->unlockMutex();
    395             return false;
    396         }
    397 
    398         if (reuse) {
    399             mScriptSO = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    400         } else {
    401             mScriptSO = SharedLibraryUtils::loadAndDeleteSharedLibrary(SOPath.c_str());
    402         }
    403         if (mScriptSO == nullptr) {
    404             ALOGE("Unable to load '%s'", resName);
    405             mCtx->unlockMutex();
    406             return false;
    407         }
    408 
    409         // Read RS symbol information from the .so.
    410         if (!storeRSInfoFromSO()) {
    411             goto error;
    412         }
    413     }
    414 
    415     mBitcodeFilePath.assign(bcFileName.c_str());
    416 
    417 #else  // RS_COMPATIBILITY_LIB is defined
    418     const char *nativeLibDir = mCtx->getContext()->getNativeLibDir();
    419     mScriptSO = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nativeLibDir);
    420 
    421     if (!mScriptSO) {
    422         goto error;
    423     }
    424 
    425     if (!storeRSInfoFromSO()) {
    426         goto error;
    427     }
    428 #endif
    429     mCtx->unlockMutex();
    430     return true;
    431 
    432 error:
    433 
    434     mCtx->unlockMutex();
    435     if (mScriptSO) {
    436         dlclose(mScriptSO);
    437         mScriptSO = nullptr;
    438     }
    439     return false;
    440 }
    441 
    442 #ifndef RS_COMPATIBILITY_LIB
    443 
    444 const char* RsdCpuScriptImpl::findCoreLib(const bcinfo::MetadataExtractor& ME, const char* bitcode,
    445                                           size_t bitcodeSize) {
    446     const char* defaultLib = SYSLIBPATH_BC"/libclcore.bc";
    447 
    448     // If we're debugging, use the debug library.
    449     if (mCtx->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
    450         return SYSLIBPATH_BC"/libclcore_debug.bc";
    451     }
    452 
    453     if (ME.hasDebugInfo()) {
    454         return SYSLIBPATH_BC"/libclcore_g.bc";
    455     }
    456 
    457     // If a callback has been registered to specify a library, use that.
    458     RSSelectRTCallback selectRTCallback = mCtx->getSelectRTCallback();
    459     if (selectRTCallback != nullptr) {
    460         return selectRTCallback((const char*)bitcode, bitcodeSize);
    461     }
    462 
    463     // Check for a platform specific library
    464 #if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
    465     enum bcinfo::RSFloatPrecision prec = ME.getRSFloatPrecision();
    466     if (prec == bcinfo::RS_FP_Relaxed) {
    467         // NEON-capable ARMv7a devices can use an accelerated math library
    468         // for all reduced precision scripts.
    469         // ARMv8 does not use NEON, as ASIMD can be used with all precision
    470         // levels.
    471         return SYSLIBPATH_BC"/libclcore_neon.bc";
    472     } else {
    473         return defaultLib;
    474     }
    475 #elif defined(__i386__) || defined(__x86_64__)
    476     // x86 devices will use an optimized library.
    477     return SYSLIBPATH_BC"/libclcore_x86.bc";
    478 #else
    479     return defaultLib;
    480 #endif
    481 }
    482 
    483 #endif
    484 
    485 void RsdCpuScriptImpl::populateScript(Script *script) {
    486     // Copy info over to runtime
    487     script->mHal.info.exportedFunctionCount = mScriptExec->getExportedFunctionCount();
    488     script->mHal.info.exportedReduceCount = mScriptExec->getExportedReduceCount();
    489     script->mHal.info.exportedForEachCount = mScriptExec->getExportedForEachCount();
    490     script->mHal.info.exportedVariableCount = mScriptExec->getExportedVariableCount();
    491     script->mHal.info.exportedPragmaCount = mScriptExec->getPragmaCount();;
    492     script->mHal.info.exportedPragmaKeyList = mScriptExec->getPragmaKeys();
    493     script->mHal.info.exportedPragmaValueList = mScriptExec->getPragmaValues();
    494 
    495     // Bug, need to stash in metadata
    496     if (mRootExpand) {
    497         script->mHal.info.root = mRootExpand;
    498     } else {
    499         script->mHal.info.root = mRoot;
    500     }
    501 }
    502 
    503 // Set up the launch dimensions, and write the values of the launch
    504 // dimensions into the mtls start/end fields.
    505 //
    506 // Inputs:
    507 //    baseDim - base shape of the input
    508 //         sc - used to constrain the launch dimensions
    509 //
    510 // Returns:
    511 //   True on success, false on failure to set up
    512 bool RsdCpuScriptImpl::setUpMtlsDimensions(MTLaunchStructCommon *mtls,
    513                                            const RsLaunchDimensions &baseDim,
    514                                            const RsScriptCall *sc) {
    515     rsAssert(mtls);
    516 
    517 #define SET_UP_DIMENSION(DIM_FIELD, SC_FIELD) do {            \
    518     if (!sc || (sc->SC_FIELD##End == 0)) {                    \
    519         mtls->end.DIM_FIELD = baseDim.DIM_FIELD;              \
    520     } else {                                                  \
    521         mtls->start.DIM_FIELD =                               \
    522             rsMin(baseDim.DIM_FIELD, sc->SC_FIELD##Start);    \
    523         mtls->end.DIM_FIELD =                                 \
    524             rsMin(baseDim.DIM_FIELD, sc->SC_FIELD##End);      \
    525         if (mtls->start.DIM_FIELD >= mtls->end.DIM_FIELD) {   \
    526             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, \
    527                 "Failed to launch kernel; Invalid "           \
    528                 #SC_FIELD "Start or " #SC_FIELD "End.");      \
    529             return false;                                     \
    530         }                                                     \
    531     }} while(0)
    532 
    533     SET_UP_DIMENSION(x, x);
    534     SET_UP_DIMENSION(y, y);
    535     SET_UP_DIMENSION(z, z);
    536     // Checks and setup of fields other than x, y, z are ignored, since those
    537     // fields are not used in the runtime and are not visible in the Java API.
    538 #undef SET_UP_DIMENSION
    539 
    540     return true;
    541 }
    542 
    543 // Preliminary work to prepare a general reduce-style kernel for launch.
    544 bool RsdCpuScriptImpl::reduceMtlsSetup(const Allocation ** ains,
    545                                        uint32_t inLen,
    546                                        const Allocation * aout,
    547                                        const RsScriptCall *sc,
    548                                        MTLaunchStructReduce *mtls) {
    549     rsAssert(ains && (inLen >= 1) && aout);
    550     memset(mtls, 0, sizeof(MTLaunchStructReduce));
    551     mtls->dimPtr = &mtls->redp.dim;
    552 
    553     for (int index = inLen; --index >= 0;) {
    554         if (allocationLODIsNull(ains[index])) {
    555             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    556                                          "reduce called with null in allocations");
    557             return false;
    558         }
    559     }
    560 
    561     if (allocationLODIsNull(aout)) {
    562         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    563                                      "reduce called with null out allocation");
    564         return false;
    565     }
    566 
    567     const Allocation *ain0   = ains[0];
    568     const Type       *inType = ain0->getType();
    569 
    570     mtls->redp.dim.x = inType->getDimX();
    571     mtls->redp.dim.y = inType->getDimY();
    572     mtls->redp.dim.z = inType->getDimZ();
    573 
    574     for (int Index = inLen; --Index >= 1;) {
    575         if (!ain0->hasSameDims(ains[Index])) {
    576             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    577                                          "Failed to launch reduction kernel;"
    578                                          "dimensions of input allocations do not match.");
    579             return false;
    580         }
    581     }
    582 
    583     if (!setUpMtlsDimensions(mtls, mtls->redp.dim, sc)) {
    584         return false;
    585     }
    586 
    587     // The X & Y walkers always want 0-1 min even if dim is not present
    588     mtls->end.x = rsMax((uint32_t)1, mtls->end.x);
    589     mtls->end.y = rsMax((uint32_t)1, mtls->end.y);
    590 
    591     mtls->rs = mCtx;
    592 
    593     mtls->mSliceNum    = 0;
    594     mtls->mSliceSize   = 1;
    595     mtls->isThreadable = mIsThreadable;
    596 
    597     // Set up output,
    598     mtls->redp.outLen = 1;
    599     mtls->redp.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
    600     mtls->redp.outStride[0] = aout->getType()->getElementSizeBytes();
    601 
    602     // Set up input.
    603     memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
    604     mtls->redp.inLen = inLen;
    605     for (int index = inLen; --index >= 0;) {
    606         mtls->redp.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
    607         mtls->redp.inStride[index] = ains[index]->getType()->getElementSizeBytes();
    608     }
    609 
    610     // All validation passed, ok to launch threads
    611     return true;
    612 }
    613 
    614 // Preliminary work to prepare a forEach-style kernel for launch.
    615 bool RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains,
    616                                         uint32_t inLen,
    617                                         Allocation * aout,
    618                                         const void * usr, uint32_t usrLen,
    619                                         const RsScriptCall *sc,
    620                                         MTLaunchStructForEach *mtls) {
    621     if (ains == nullptr && inLen != 0) {
    622         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    623           "rsForEach called with none-zero inLen with null in allocations");
    624         return false;
    625     }
    626 
    627     memset(mtls, 0, sizeof(MTLaunchStructForEach));
    628     mtls->dimPtr = &mtls->fep.dim;
    629 
    630     for (int index = inLen; --index >= 0;) {
    631         if (allocationLODIsNull(ains[index])) {
    632             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    633                                          "rsForEach called with null in allocations");
    634             return false;
    635         }
    636     }
    637 
    638     if (allocationLODIsNull(aout)) {
    639         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    640                                      "rsForEach called with null out allocations");
    641         return false;
    642     }
    643 
    644     // The only situation where ains[j] is null is when inLen==1 and j==0;
    645     // and that can only happen for an old-style kernel in API level 11~13,
    646     // where the input allocation cannot be skipped if the output allocation is specified.
    647     if (inLen != 0)
    648         rsAssert((inLen == 1) || (ains[0] != nullptr));
    649 
    650     if (inLen > 0 && ains[0]) {
    651         const Allocation *ain0   = ains[0];
    652         const Type       *inType = ain0->getType();
    653 
    654         mtls->fep.dim.x = inType->getDimX();
    655         mtls->fep.dim.y = inType->getDimY();
    656         mtls->fep.dim.z = inType->getDimZ();
    657 
    658         for (int Index = inLen; --Index >= 1;) {
    659             if (!ain0->hasSameDims(ains[Index])) {
    660                 mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    661                   "Failed to launch kernel; dimensions of input "
    662                   "allocations do not match.");
    663                 return false;
    664             }
    665         }
    666     } else if (aout != nullptr) {
    667         const Type *outType = aout->getType();
    668 
    669         mtls->fep.dim.x = outType->getDimX();
    670         mtls->fep.dim.y = outType->getDimY();
    671         mtls->fep.dim.z = outType->getDimZ();
    672 
    673     } else if (sc != nullptr) {
    674         mtls->fep.dim.x = sc->xEnd;
    675         mtls->fep.dim.y = sc->yEnd;
    676         mtls->fep.dim.z = 0;
    677     } else {
    678         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    679                                      "rsForEach called with null allocations");
    680         return false;
    681     }
    682 
    683     if (inLen > 0 && aout != nullptr) {
    684         if (ains[0] && !ains[0]->hasSameDims(aout)) {
    685             mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
    686               "Failed to launch kernel; dimensions of input and output allocations do not match.");
    687 
    688             return false;
    689         }
    690     }
    691 
    692     if (!setUpMtlsDimensions(mtls, mtls->fep.dim, sc)) {
    693         return false;
    694     }
    695 
    696     // The X & Y walkers always want 0-1 min even if dim is not present
    697     mtls->end.x    = rsMax((uint32_t)1, mtls->end.x);
    698     mtls->end.y    = rsMax((uint32_t)1, mtls->end.y);
    699     mtls->rs       = mCtx;
    700     if (ains) {
    701         memcpy(mtls->ains, ains, inLen * sizeof(ains[0]));
    702     }
    703     mtls->aout[0]    = aout;
    704     mtls->fep.usr    = usr;
    705     mtls->fep.usrLen = usrLen;
    706     mtls->mSliceSize = 1;
    707     mtls->mSliceNum  = 0;
    708 
    709     mtls->isThreadable  = mIsThreadable;
    710 
    711     if (inLen > 0) {
    712         mtls->fep.inLen = inLen;
    713         for (int index = inLen; --index >= 0;) {
    714             if (ains[index] == nullptr) {
    715                 // In old style kernels, the first and only input allocation could be null.
    716                 // Not allowed in newer styles.
    717                 rsAssert(inLen == 1 && index == 0);
    718                 continue;
    719             }
    720             mtls->fep.inPtr[index] = (const uint8_t*)ains[index]->mHal.drvState.lod[0].mallocPtr;
    721             mtls->fep.inStride[index] = ains[index]->getType()->getElementSizeBytes();
    722         }
    723     }
    724 
    725     if (aout != nullptr) {
    726         mtls->fep.outPtr[0] = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
    727         mtls->fep.outStride[0] = aout->getType()->getElementSizeBytes();
    728     }
    729 
    730     // All validation passed, ok to launch threads
    731     return true;
    732 }
    733 
    734 
    735 void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
    736                                      const Allocation ** ains,
    737                                      uint32_t inLen,
    738                                      Allocation * aout,
    739                                      const void * usr,
    740                                      uint32_t usrLen,
    741                                      const RsScriptCall *sc) {
    742 
    743     MTLaunchStructForEach mtls;
    744 
    745     if (forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls)) {
    746         forEachKernelSetup(slot, &mtls);
    747 
    748         RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
    749         mCtx->launchForEach(ains, inLen, aout, sc, &mtls);
    750         mCtx->setTLS(oldTLS);
    751     }
    752 }
    753 
    754 void RsdCpuScriptImpl::invokeReduce(uint32_t slot,
    755                                     const Allocation ** ains, uint32_t inLen,
    756                                     Allocation *aout,
    757                                     const RsScriptCall *sc) {
    758   MTLaunchStructReduce mtls;
    759 
    760   if (reduceMtlsSetup(ains, inLen, aout, sc, &mtls)) {
    761     reduceKernelSetup(slot, &mtls);
    762     RsdCpuScriptImpl *oldTLS = mCtx->setTLS(this);
    763     mCtx->launchReduce(ains, inLen, aout, &mtls);
    764     mCtx->setTLS(oldTLS);
    765   }
    766 }
    767 
    768 void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStructForEach *mtls) {
    769     mtls->script = this;
    770     mtls->fep.slot = slot;
    771     mtls->kernel = mScriptExec->getForEachFunction(slot);
    772     rsAssert(mtls->kernel != nullptr);
    773 }
    774 
    775 void RsdCpuScriptImpl::reduceKernelSetup(uint32_t slot, MTLaunchStructReduce *mtls) {
    776     mtls->script = this;
    777     mtls->redp.slot = slot;
    778 
    779     const ReduceDescription *desc = mScriptExec->getReduceDescription(slot);
    780     mtls->accumFunc = desc->accumFunc;
    781     mtls->initFunc  = desc->initFunc;   // might legally be nullptr
    782     mtls->combFunc  = desc->combFunc;   // might legally be nullptr
    783     mtls->outFunc   = desc->outFunc;    // might legally be nullptr
    784     mtls->accumSize = desc->accumSize;
    785 
    786     rsAssert(mtls->accumFunc != nullptr);
    787 }
    788 
    789 int RsdCpuScriptImpl::invokeRoot() {
    790     RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
    791     int ret = mRoot();
    792     mCtx->setTLS(oldTLS);
    793     return ret;
    794 }
    795 
    796 void RsdCpuScriptImpl::invokeInit() {
    797     if (mInit) {
    798         mInit();
    799     }
    800 }
    801 
    802 void RsdCpuScriptImpl::invokeFreeChildren() {
    803     if (mFreeChildren) {
    804         mFreeChildren();
    805     }
    806 }
    807 
    808 void RsdCpuScriptImpl::invokeFunction(uint32_t slot, const void *params,
    809                                       size_t paramLength) {
    810     //ALOGE("invoke %i %p %zu", slot, params, paramLength);
    811     void * ap = nullptr;
    812 
    813 #if defined(__x86_64__)
    814     // The invoked function could have input parameter of vector type for example float4 which
    815     // requires void* params to be 16 bytes aligned when using SSE instructions for x86_64 platform.
    816     // So try to align void* params before passing them into RS exported function.
    817 
    818     if ((uint8_t)(uint64_t)params & 0x0F) {
    819         if ((ap = (void*)memalign(16, paramLength)) != nullptr) {
    820             memcpy(ap, params, paramLength);
    821         } else {
    822             ALOGE("x86_64: invokeFunction memalign error, still use params which"
    823                   " is not 16 bytes aligned.");
    824         }
    825     }
    826 #endif
    827 
    828     RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
    829     reinterpret_cast<void (*)(const void *, uint32_t)>(
    830         mScriptExec->getInvokeFunction(slot))(ap? (const void *) ap: params, paramLength);
    831 
    832 #if defined(__x86_64__)
    833     free(ap);
    834 #endif
    835 
    836     mCtx->setTLS(oldTLS);
    837 }
    838 
    839 void RsdCpuScriptImpl::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
    840     //rsAssert(!script->mFieldIsObject[slot]);
    841     //ALOGE("setGlobalVar %i %p %zu", slot, data, dataLength);
    842 
    843     //if (mIntrinsicID) {
    844         //mIntrinsicFuncs.setVar(dc, script, drv->mIntrinsicData, slot, data, dataLength);
    845         //return;
    846     //}
    847 
    848     int32_t *destPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    849     if (!destPtr) {
    850         //ALOGV("Calling setVar on slot = %i which is null", slot);
    851         return;
    852     }
    853 
    854     memcpy(destPtr, data, dataLength);
    855 }
    856 
    857 void RsdCpuScriptImpl::getGlobalVar(uint32_t slot, void *data, size_t dataLength) {
    858     //rsAssert(!script->mFieldIsObject[slot]);
    859     //ALOGE("getGlobalVar %i %p %zu", slot, data, dataLength);
    860 
    861     int32_t *srcPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    862     if (!srcPtr) {
    863         //ALOGV("Calling setVar on slot = %i which is null", slot);
    864         return;
    865     }
    866     memcpy(data, srcPtr, dataLength);
    867 }
    868 
    869 
    870 void RsdCpuScriptImpl::setGlobalVarWithElemDims(uint32_t slot, const void *data, size_t dataLength,
    871                                                 const Element *elem,
    872                                                 const uint32_t *dims, size_t dimLength) {
    873     int32_t *destPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    874     if (!destPtr) {
    875         //ALOGV("Calling setVar on slot = %i which is null", slot);
    876         return;
    877     }
    878 
    879     // We want to look at dimension in terms of integer components,
    880     // but dimLength is given in terms of bytes.
    881     dimLength /= sizeof(int);
    882 
    883     // Only a single dimension is currently supported.
    884     rsAssert(dimLength == 1);
    885     if (dimLength == 1) {
    886         // First do the increment loop.
    887         size_t stride = elem->getSizeBytes();
    888         const char *cVal = reinterpret_cast<const char *>(data);
    889         for (uint32_t i = 0; i < dims[0]; i++) {
    890             elem->incRefs(cVal);
    891             cVal += stride;
    892         }
    893 
    894         // Decrement loop comes after (to prevent race conditions).
    895         char *oldVal = reinterpret_cast<char *>(destPtr);
    896         for (uint32_t i = 0; i < dims[0]; i++) {
    897             elem->decRefs(oldVal);
    898             oldVal += stride;
    899         }
    900     }
    901 
    902     memcpy(destPtr, data, dataLength);
    903 }
    904 
    905 void RsdCpuScriptImpl::setGlobalBind(uint32_t slot, Allocation *data) {
    906 
    907     //rsAssert(!script->mFieldIsObject[slot]);
    908     //ALOGE("setGlobalBind %i %p", slot, data);
    909 
    910     int32_t *destPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    911     if (!destPtr) {
    912         //ALOGV("Calling setVar on slot = %i which is null", slot);
    913         return;
    914     }
    915 
    916     void *ptr = nullptr;
    917     mBoundAllocs[slot] = data;
    918     if (data) {
    919         ptr = data->mHal.drvState.lod[0].mallocPtr;
    920     }
    921     memcpy(destPtr, &ptr, sizeof(void *));
    922 }
    923 
    924 void RsdCpuScriptImpl::setGlobalObj(uint32_t slot, ObjectBase *data) {
    925 
    926     //rsAssert(script->mFieldIsObject[slot]);
    927     //ALOGE("setGlobalObj %i %p", slot, data);
    928 
    929     int32_t *destPtr = reinterpret_cast<int32_t *>(mScriptExec->getFieldAddress(slot));
    930     if (!destPtr) {
    931         //ALOGV("Calling setVar on slot = %i which is null", slot);
    932         return;
    933     }
    934 
    935     rsrSetObject(mCtx->getContext(), (rs_object_base *)destPtr, data);
    936 }
    937 
    938 const char* RsdCpuScriptImpl::getFieldName(uint32_t slot) const {
    939     return mScriptExec->getFieldName(slot);
    940 }
    941 
    942 RsdCpuScriptImpl::~RsdCpuScriptImpl() {
    943     delete mScriptExec;
    944     delete[] mBoundAllocs;
    945     if (mScriptSO) {
    946         dlclose(mScriptSO);
    947     }
    948 }
    949 
    950 Allocation * RsdCpuScriptImpl::getAllocationForPointer(const void *ptr) const {
    951     if (!ptr) {
    952         return nullptr;
    953     }
    954 
    955     for (uint32_t ct=0; ct < mScript->mHal.info.exportedVariableCount; ct++) {
    956         Allocation *a = mBoundAllocs[ct];
    957         if (!a) continue;
    958         if (a->mHal.drvState.lod[0].mallocPtr == ptr) {
    959             return a;
    960         }
    961     }
    962     ALOGE("rsGetAllocation, failed to find %p", ptr);
    963     return nullptr;
    964 }
    965 
    966 int RsdCpuScriptImpl::getGlobalEntries() const {
    967     return mScriptExec->getGlobalEntries();
    968 }
    969 
    970 const char * RsdCpuScriptImpl::getGlobalName(int i) const {
    971     return mScriptExec->getGlobalName(i);
    972 }
    973 
    974 const void * RsdCpuScriptImpl::getGlobalAddress(int i) const {
    975     return mScriptExec->getGlobalAddress(i);
    976 }
    977 
    978 size_t RsdCpuScriptImpl::getGlobalSize(int i) const {
    979     return mScriptExec->getGlobalSize(i);
    980 }
    981 
    982 uint32_t RsdCpuScriptImpl::getGlobalProperties(int i) const {
    983     return mScriptExec->getGlobalProperties(i);
    984 }
    985 
    986 void RsdCpuScriptImpl::preLaunch(uint32_t slot, const Allocation ** ains,
    987                                  uint32_t inLen, Allocation * aout,
    988                                  const void * usr, uint32_t usrLen,
    989                                  const RsScriptCall *sc) {}
    990 
    991 void RsdCpuScriptImpl::postLaunch(uint32_t slot, const Allocation ** ains,
    992                                   uint32_t inLen, Allocation * aout,
    993                                   const void * usr, uint32_t usrLen,
    994                                   const RsScriptCall *sc) {}
    995 
    996 
    997 } // namespace renderscript
    998 } // namespace android
    999