/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "rsCpuCore.h"
#include "rsCpuScript.h"
#include "rsCpuScriptGroup.h"
#include "rsCpuScriptGroup2.h"

#include <malloc.h>
#include "rsContext.h"

#include <sys/types.h>
#include <sys/resource.h>
#include <sched.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

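// Logging helper for the reduce paths below: emits an ALOGV only when the
// launch's reduce log level (mtls->logReduce, copied from the context's
// mLogReduce debug property in launchReduce) is at least `level`. The
// do/while(0) wrapper makes the macro behave as a single statement after an
// unbraced `if`.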
#define REDUCE_ALOGV(mtls, level, ...) do { if ((mtls)->logReduce >= (level)) ALOGV(__VA_ARGS__); } while(0)

static pthread_key_t gThreadTLSKey = 0;
static uint32_t gThreadTLSKeyCount = 0;
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;

namespace android {
namespace renderscript {

bool gArchUseSIMD = false;

RsdCpuReference::~RsdCpuReference() {
}

RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
        uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn,
        RSSelectRTCallback pSelectRTCallback,
        const char *pBccPluginName) {

    RsdCpuReferenceImpl *cpu = new RsdCpuReferenceImpl(rsc);
    if (!cpu) {
        return nullptr;
    }
    if (!cpu->init(version_major, version_minor, lfn, slfn)) {
        delete cpu;
        return nullptr;
    }

    cpu->setSelectRTCallback(pSelectRTCallback);
    if (pBccPluginName) {
        cpu->setBccPluginName(pBccPluginName);
    }

    return cpu;
}


Context * RsdCpuReference::getTlsContext() {
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    return tls->mContext;
}

const Script * RsdCpuReference::getTlsScript() {
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    return tls->mScript;
}

pthread_key_t RsdCpuReference::getThreadTLSKey() { return gThreadTLSKey; }

////////////////////////////////////////////////////////////
///

RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
    mRSC = rsc;

    version_major = 0;
    version_minor = 0;
    mInKernel = false;
    memset(&mWorkers, 0, sizeof(mWorkers));
    memset(&mTlsStruct, 0, sizeof(mTlsStruct));
    mExit = false;
    mSelectRTCallback = nullptr;
    mEmbedGlobalInfo = true;
    mEmbedGlobalInfoSkipConstant = true;
}


void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
    RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;

    uint32_t idx = __sync_fetch_and_add(&dc->mWorkers.mLaunchCount, 1);

    //ALOGV("RS helperThread starting %p idx=%i", dc, idx);

    dc->mWorkers.mLaunchSignals[idx].init();
    dc->mWorkers.mNativeThreadId[idx] = gettid();

    memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
    int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
    if (status) {
        ALOGE("pthread_setspecific %i", status);
    }

#if 0
    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
    cpu_set_t cpuset;
    memset(&cpuset, 0, sizeof(cpuset));
    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
    int ret = syscall(241, dc->mWorkers.mNativeThreadId[idx],
              sizeof(cpuset), &cpuset);
    ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
#endif

    while (!dc->mExit) {
        dc->mWorkers.mLaunchSignals[idx].wait();
        if (dc->mWorkers.mLaunchCallback) {
           // idx + 1 is used because the calling thread is always worker 0.
           dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx + 1);
        }
        __sync_fetch_and_sub(&dc->mWorkers.mRunningCount, 1);
        dc->mWorkers.mCompleteSignal.set();
    }

    //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
    return nullptr;
}

// Launch a kernel.
// The callback function is called to execute the kernel.
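// Protocol, as implemented below and in helperThreadProc(): the running
// count is set to the worker count and every launch signal is raised; each
// helper runs the callback once (as worker idx + 1), decrements the running
// count, and raises mCompleteSignal. The calling thread runs the callback
// itself as worker 0, then waits for the running count to drain to zero.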
void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
    mWorkers.mLaunchData = data;
    mWorkers.mLaunchCallback = cbk;

    // Fast path for very small launches.
    MTLaunchStructCommon *mtls = (MTLaunchStructCommon *)data;
    if (mtls && mtls->dimPtr->y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) {
        if (mWorkers.mLaunchCallback) {
            mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
        }
        return;
    }

    mWorkers.mRunningCount = mWorkers.mCount;
    __sync_synchronize();

    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }

    // We use the calling thread as one of the workers so we can start without
    // the delay of a thread wakeup.
    if (mWorkers.mLaunchCallback) {
        mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
    }

    while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
        mWorkers.mCompleteSignal.wait();
    }
}


void RsdCpuReferenceImpl::lockMutex() {
    pthread_mutex_lock(&gInitMutex);
}

void RsdCpuReferenceImpl::unlockMutex() {
    pthread_mutex_unlock(&gInitMutex);
}

// Determine if the CPU we're running on supports SIMD instructions.
static void GetCpuInfo() {
    // Read the CPU flags from /proc/cpuinfo.
    FILE *cpuinfo = fopen("/proc/cpuinfo", "r");

    if (!cpuinfo) {
        return;
    }

    char cpuinfostr[4096];
    // fgets() reads one line at a time (up to a newline or EOF), so scan the
    // whole "cpuinfo" file before deciding whether SIMD can be used.
    while (fgets(cpuinfostr, sizeof(cpuinfostr), cpuinfo)) {
#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
        gArchUseSIMD = strstr(cpuinfostr, " neon") || strstr(cpuinfostr, " asimd");
#elif defined(ARCH_X86_HAVE_SSSE3)
        gArchUseSIMD = strstr(cpuinfostr, " ssse3");
#endif
        if (gArchUseSIMD) {
            break;
        }
    }
    fclose(cpuinfo);
}

bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
                               sym_lookup_t lfn, script_lookup_t slfn) {
    mSymLookupFn = lfn;
    mScriptLookupFn = slfn;

    lockMutex();
    if (!gThreadTLSKeyCount) {
        int status = pthread_key_create(&gThreadTLSKey, nullptr);
        if (status) {
            ALOGE("Failed to init thread tls key.");
            unlockMutex();
            return false;
        }
    }
    gThreadTLSKeyCount++;
    unlockMutex();

    mTlsStruct.mContext = mRSC;
    mTlsStruct.mScript = nullptr;
    int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
    if (status) {
        ALOGE("pthread_setspecific %i", status);
    }

    mPageSize = sysconf(_SC_PAGE_SIZE);
    // ALOGV("page size = %ld", mPageSize);

    GetCpuInfo();

    int cpu = sysconf(_SC_NPROCESSORS_CONF);
    if (mRSC->props.mDebugMaxThreads) {
        cpu = mRSC->props.mDebugMaxThreads;
    }
    if (cpu < 2) {
        mWorkers.mCount = 0;
        return true;
    }

    // Subtract one from the cpu count because we also use the command thread as a worker.
    mWorkers.mCount = (uint32_t)(cpu - 1);

    if (mRSC->props.mLogScripts) {
        ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount + 1);
    }

    mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
    mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
    mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
    mWorkers.mLaunchCallback = nullptr;

    mWorkers.mCompleteSignal.init();

    mWorkers.mRunningCount = mWorkers.mCount;
    mWorkers.mLaunchCount = 0;
    __sync_synchronize();

    pthread_attr_t threadAttr;
    status = pthread_attr_init(&threadAttr);
    if (status) {
        ALOGE("Failed to init thread attribute.");
        return false;
    }

    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
        if (status) {
            mWorkers.mCount = ct;
            ALOGE("Created fewer than expected number of RS threads.");
            break;
        }
    }
    while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
        usleep(100);
    }

    pthread_attr_destroy(&threadAttr);
    return true;
}


void RsdCpuReferenceImpl::setPriority(int32_t priority) {
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], priority);
    }
}

RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
    mExit = true;
    mWorkers.mLaunchData = nullptr;
    mWorkers.mLaunchCallback = nullptr;
    mWorkers.mRunningCount = mWorkers.mCount;
    __sync_synchronize();
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }
    void *res;
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        pthread_join(mWorkers.mThreadId[ct], &res);
    }
    // b/23109602
    // TODO: Refactor the implementation with a thread pool to
    // fix the race condition in the destructor.
    // rsAssert(__sync_fetch_and_or(&mWorkers.mRunningCount, 0) == 0);
    free(mWorkers.mThreadId);
    free(mWorkers.mNativeThreadId);
    delete[] mWorkers.mLaunchSignals;

    // Global structure cleanup.
    lockMutex();
    --gThreadTLSKeyCount;
    if (!gThreadTLSKeyCount) {
        pthread_key_delete(gThreadTLSKey);
    }
    unlockMutex();
}

// Set up the appropriate input and output pointers to the kernel driver info structure.
// Inputs:
//   mtls - The MTLaunchStruct holding information about the kernel launch
//   fep - The forEach parameters (driver info structure)
//   x, y, z, lod, face, a1, a2, a3, a4 - The start offsets into each dimension
static inline void FepPtrSetup(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo *fep,
                               uint32_t x, uint32_t y,
                               uint32_t z = 0, uint32_t lod = 0,
                               RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
                               uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) {
    // When rsForEach passes a null input allocation (as opposed to no input),
    // fep->inLen can be 1 with mtls->ains[0] being null.
    // This should only happen with old-style kernels.
    for (uint32_t i = 0; i < fep->inLen; i++) {
        if (mtls->ains[i] == nullptr) {
            rsAssert(fep->inLen == 1);
            continue;
        }
        fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
    }
    if (mtls->aout[0] != nullptr) {
        fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
    }
}

// Set up the appropriate input and output pointers to the kernel driver info structure.
// Inputs:
//   mtls - The MTLaunchStruct holding information about the kernel launch
//   redp - The reduce parameters (driver info structure)
//   x, y, z - The start offsets into each dimension
static inline void RedpPtrSetup(const MTLaunchStructReduce *mtls, RsExpandKernelDriverInfo *redp,
                                uint32_t x, uint32_t y, uint32_t z) {
    for (uint32_t i = 0; i < redp->inLen; i++) {
        redp->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z);
    }
}

static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
    if (start >= end) {
        *p = start;
        return val;
    }

    uint32_t div = end - start;

    uint32_t n = val / div;
    *p = (val - (n * div)) + start;
    return n;
}
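
// Worked example (illustrative numbers): sliceInt() peels one dimension off
// a linear slice number, writing that dimension's coordinate into *p and
// returning the quotient for the remaining dimensions. With start = 2,
// end = 5 (so div = 3) and val = 7: n = 7 / 3 = 2, *p = (7 - 6) + 2 = 3, and
// 2 is returned. A degenerate dimension (start >= end) passes val through
// unchanged.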

static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
    uint32_t r = sliceNum;
    r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
    r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
    r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
    r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
    r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
    r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
    r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
    return r == 0;
}
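
// SelectOuterSlice() decodes sliceNum as a mixed-radix number over the outer
// dimensions (z, then lod, face, and the four array axes), storing one
// coordinate per dimension into `info`. It returns true while sliceNum maps
// to a valid coordinate tuple, and false once sliceNum reaches the product
// of the outer extents, which terminates the walkers' loops below.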

static bool SelectZSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo* info, uint32_t sliceNum) {
    return sliceInt(&info->current.z, sliceNum, mtls->start.z, mtls->end.z) == 0;
}

static void walk_general_foreach(void *usr, uint32_t idx) {
    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
    RsExpandKernelDriverInfo fep = mtls->fep;
    fep.lid = idx;
    ForEachFunc_t fn = mtls->kernel;

    while (1) {
        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);

        if (!SelectOuterSlice(mtls, &fep, slice)) {
            return;
        }

        for (fep.current.y = mtls->start.y; fep.current.y < mtls->end.y;
             fep.current.y++) {

            FepPtrSetup(mtls, &fep, mtls->start.x,
                        fep.current.y, fep.current.z, fep.current.lod,
                        (RsAllocationCubemapFace)fep.current.face,
                        fep.current.array[0], fep.current.array[1],
                        fep.current.array[2], fep.current.array[3]);

            fn(&fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
        }
    }
}

static void walk_2d_foreach(void *usr, uint32_t idx) {
    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
    RsExpandKernelDriverInfo fep = mtls->fep;
    fep.lid = idx;
    ForEachFunc_t fn = mtls->kernel;

    while (1) {
        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
        uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
        uint32_t yEnd   = yStart + mtls->mSliceSize;

        yEnd = rsMin(yEnd, mtls->end.y);

        if (yEnd <= yStart) {
            return;
        }

        for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
            FepPtrSetup(mtls, &fep, mtls->start.x, fep.current.y);

            fn(&fep, mtls->start.x, mtls->end.x, fep.outStride[0]);
        }
    }
}

static void walk_1d_foreach(void *usr, uint32_t idx) {
    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
    RsExpandKernelDriverInfo fep = mtls->fep;
    fep.lid = idx;
    ForEachFunc_t fn = mtls->kernel;

    while (1) {
        uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
        uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
        uint32_t xEnd   = xStart + mtls->mSliceSize;

        xEnd = rsMin(xEnd, mtls->end.x);

        if (xEnd <= xStart) {
            return;
        }

        FepPtrSetup(mtls, &fep, xStart, 0);

        fn(&fep, xStart, xEnd, fep.outStride[0]);
    }
}

// The function format_bytes() is an auxiliary function to assist in logging.
//
// Bytes are read from an input (inBuf) and written (as pairs of hex digits)
// to an output (outBuf).
//
// Output format:
// - starts with ": "
// - each input byte is translated to a pair of hex digits
// - bytes are separated by "." except that every fourth separator is "|"
// - if the input is sufficiently long, the output is truncated and terminated with "..."
//
// Arguments:
// - outBuf  -- Pointer to buffer of type "FormatBuf" into which output is written
// - inBuf   -- Pointer to bytes which are to be formatted into outBuf
// - inBytes -- Number of bytes in inBuf
//
// Constant:
// - kFormatInBytesMax -- Only min(kFormatInBytesMax, inBytes) bytes will be read
//                        from inBuf
//
// Return value:
// - pointer (const char *) to output (which is part of outBuf)
//
static const int kFormatInBytesMax = 16;
// ": " + 2 digits per byte + 1 separator between bytes + "..." + null
typedef char FormatBuf[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1];
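// With kFormatInBytesMax = 16, that works out to 2 + 32 + 15 + 3 + 1 = 53
// bytes.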
static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) {
  strlcpy(*outBuf, ": ", sizeof(*outBuf));
  int pos = 2;
  const int lim = std::min(kFormatInBytesMax, inBytes);
  for (int i = 0; i < lim; ++i) {
    if (i) {
      sprintf(*outBuf + pos, (i % 4 ? "." : "|"));
      ++pos;
    }
    sprintf(*outBuf + pos, "%02x", inBuf[i]);
    pos += 2;
  }
  if (kFormatInBytesMax < inBytes)
    strlcpy(*outBuf + pos, "...", sizeof(FormatBuf) - pos);
  return *outBuf;
}
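
// Example (illustrative): the five bytes 01 02 03 04 05 format as
// ": 01.02.03.04|05"; the separator is '.' except after every fourth byte,
// where it is '|'.
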
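// Lazily assign an accumulator to a worker thread the first time it claims a
// slice. With an outconverter, every thread gets a scratch accumulator out
// of accumAlloc; without one, the thread that claims index 0 accumulates
// directly into the output allocation and later threads use accumAlloc. The
// accumulator is then initialized via initFunc, or zero-filled if there is
// no initializer.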
static void reduce_get_accumulator(uint8_t *&accumPtr, const MTLaunchStructReduce *mtls,
                                   const char *walkerName, uint32_t threadIdx) {
  rsAssert(!accumPtr);

  uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
  if (mtls->outFunc) {
    accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx;
  } else {
    if (accumIdx == 0) {
      accumPtr = mtls->redp.outPtr[0];
    } else {
      accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
    }
  }
  REDUCE_ALOGV(mtls, 2, "%s(%p): idx = %u got accumCount %u and accumPtr %p",
               walkerName, mtls->accumFunc, threadIdx, accumIdx, accumPtr);
  // initialize accumulator
  if (mtls->initFunc) {
    mtls->initFunc(accumPtr);
  } else {
    memset(accumPtr, 0, mtls->accumSize);
  }
}

static void walk_1d_reduce(void *usr, uint32_t idx) {
  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
  RsExpandKernelDriverInfo redp = mtls->redp;

  // find accumulator
  uint8_t *&accumPtr = mtls->accumPtr[idx];
  if (!accumPtr) {
    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
  }

  // accumulate
  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
  while (1) {
    uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
    uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
    uint32_t xEnd   = xStart + mtls->mSliceSize;

    xEnd = rsMin(xEnd, mtls->end.x);

    if (xEnd <= xStart) {
      return;
    }

    RedpPtrSetup(mtls, &redp, xStart, 0, 0);
    fn(&redp, xStart, xEnd, accumPtr);

    // Emit log line after slice has been run, so that we can include
    // the results of the run on that line.
    FormatBuf fmt;
    if (mtls->logReduce >= 3) {
      format_bytes(&fmt, accumPtr, mtls->accumSize);
    } else {
      fmt[0] = 0;
    }
    REDUCE_ALOGV(mtls, 2, "walk_1d_reduce(%p): idx = %u, x in [%u, %u)%s",
                 mtls->accumFunc, idx, xStart, xEnd, fmt);
  }
}

static void walk_2d_reduce(void *usr, uint32_t idx) {
  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
  RsExpandKernelDriverInfo redp = mtls->redp;

  // find accumulator
  uint8_t *&accumPtr = mtls->accumPtr[idx];
  if (!accumPtr) {
    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
  }

  // accumulate
  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
  while (1) {
    uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
    uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
    uint32_t yEnd   = yStart + mtls->mSliceSize;

    yEnd = rsMin(yEnd, mtls->end.y);

    if (yEnd <= yStart) {
      return;
    }

    for (redp.current.y = yStart; redp.current.y < yEnd; redp.current.y++) {
      RedpPtrSetup(mtls, &redp, mtls->start.x, redp.current.y, 0);
      fn(&redp, mtls->start.x, mtls->end.x, accumPtr);
    }

    FormatBuf fmt;
    if (mtls->logReduce >= 3) {
      format_bytes(&fmt, accumPtr, mtls->accumSize);
    } else {
      fmt[0] = 0;
    }
    REDUCE_ALOGV(mtls, 2, "walk_2d_reduce(%p): idx = %u, y in [%u, %u)%s",
                 mtls->accumFunc, idx, yStart, yEnd, fmt);
  }
}

static void walk_3d_reduce(void *usr, uint32_t idx) {
  const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
  RsExpandKernelDriverInfo redp = mtls->redp;

  // find accumulator
  uint8_t *&accumPtr = mtls->accumPtr[idx];
  if (!accumPtr) {
    reduce_get_accumulator(accumPtr, mtls, __func__, idx);
  }

  // accumulate
  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
  while (1) {
    uint32_t slice  = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);

    if (!SelectZSlice(mtls, &redp, slice)) {
      return;
    }

    for (redp.current.y = mtls->start.y; redp.current.y < mtls->end.y; redp.current.y++) {
      RedpPtrSetup(mtls, &redp, mtls->start.x, redp.current.y, redp.current.z);
      fn(&redp, mtls->start.x, mtls->end.x, accumPtr);
    }

    FormatBuf fmt;
    if (mtls->logReduce >= 3) {
      format_bytes(&fmt, accumPtr, mtls->accumSize);
    } else {
      fmt[0] = 0;
    }
    REDUCE_ALOGV(mtls, 2, "walk_3d_reduce(%p): idx = %u, z = %u%s",
                 mtls->accumFunc, idx, redp.current.z, fmt);
  }
}

// Launch a general reduce-style kernel.
// Inputs:
//   ains[0..inLen-1]: Array of allocations that contain the inputs
//   aout:             The allocation that will hold the output
//   mtls:             Holds launch parameters
void RsdCpuReferenceImpl::launchReduce(const Allocation ** ains,
                                       uint32_t inLen,
                                       Allocation * aout,
                                       MTLaunchStructReduce *mtls) {
  mtls->logReduce = mRSC->props.mLogReduce;
  if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
    launchReduceParallel(ains, inLen, aout, mtls);
  } else {
    launchReduceSerial(ains, inLen, aout, mtls);
  }
}

// Launch a general reduce-style kernel, single-threaded.
// Inputs:
//   ains[0..inLen-1]: Array of allocations that contain the inputs
//   aout:             The allocation that will hold the output
//   mtls:             Holds launch parameters
void RsdCpuReferenceImpl::launchReduceSerial(const Allocation ** ains,
                                             uint32_t inLen,
                                             Allocation * aout,
                                             MTLaunchStructReduce *mtls) {
  REDUCE_ALOGV(mtls, 1, "launchReduceSerial(%p): %u x %u x %u", mtls->accumFunc,
               mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z);

  // In the presence of an outconverter, we allocate temporary memory for
  // the accumulator.
  //
  // In the absence of an outconverter, we use the output allocation as the
  // accumulator.
  uint8_t *const accumPtr = (mtls->outFunc
                             ? static_cast<uint8_t *>(malloc(mtls->accumSize))
                             : mtls->redp.outPtr[0]);

  // initialize
  if (mtls->initFunc) {
    mtls->initFunc(accumPtr);
  } else {
    memset(accumPtr, 0, mtls->accumSize);
  }

  // accumulate
  const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
  uint32_t slice = 0;
  while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
    for (mtls->redp.current.y = mtls->start.y;
         mtls->redp.current.y < mtls->end.y;
         mtls->redp.current.y++) {
      RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
      fn(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
    }
  }

  // outconvert
  if (mtls->outFunc) {
    mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
    free(accumPtr);
  }
}

// Launch a general reduce-style kernel, multi-threaded.
// Inputs:
//   ains[0..inLen-1]: Array of allocations that contain the inputs
//   aout:             The allocation that will hold the output
//   mtls:             Holds launch parameters
void RsdCpuReferenceImpl::launchReduceParallel(const Allocation ** ains,
                                               uint32_t inLen,
                                               Allocation * aout,
                                               MTLaunchStructReduce *mtls) {
  // For now, we don't know how to go parallel in the absence of a combiner.
  if (!mtls->combFunc) {
    launchReduceSerial(ains, inLen, aout, mtls);
    return;
  }

  // Number of threads = "main thread" + number of other (worker) threads
  const uint32_t numThreads = mWorkers.mCount + 1;

  // In the absence of an outconverter, we use the output allocation as
  // an accumulator, and therefore need to allocate one fewer accumulator.
  const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr);

  // If mDebugReduceSplitAccum, then we want each accumulator to start
  // on a page boundary.  (TODO: Would some unit smaller than a page
  // be sufficient to avoid false sharing?)
  if (mRSC->props.mDebugReduceSplitAccum) {
    // Round up accumulator size to an integral number of pages
    mtls->accumStride =
        (unsigned(mtls->accumSize) + unsigned(mPageSize)-1) &
        ~(unsigned(mPageSize)-1);
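    // Example (illustrative numbers): with accumSize = 24 and a 4096-byte
    // page, accumStride rounds up to 4096.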
    // Each accumulator gets its own page.  Alternatively, if we just
    // wanted to make sure no two accumulators are on the same page,
    // we could instead do
    //   allocSize = mtls->accumStride * (numAllocation - 1) + mtls->accumSize
    const size_t allocSize = mtls->accumStride * numAllocAccum;
    mtls->accumAlloc = static_cast<uint8_t *>(memalign(mPageSize, allocSize));
  } else {
    mtls->accumStride = mtls->accumSize;
    mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum));
  }

  const size_t accumPtrArrayBytes = sizeof(uint8_t *) * numThreads;
  mtls->accumPtr = static_cast<uint8_t **>(malloc(accumPtrArrayBytes));
  memset(mtls->accumPtr, 0, accumPtrArrayBytes);

  mtls->accumCount = 0;

  rsAssert(!mInKernel);
  mInKernel = true;
  REDUCE_ALOGV(mtls, 1, "launchReduceParallel(%p): %u x %u x %u, %u threads, accumAlloc = %p",
               mtls->accumFunc,
               mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z,
               numThreads, mtls->accumAlloc);
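  // Pick the walker that matches the dimensionality of the launch. For the
  // 1D and 2D cases, the slice size targets roughly four slices per thread;
  // e.g. (illustrative numbers) with dim.y = 512 and numThreads = 4,
  // mSliceSize = max(1, 512 / 16) = 32 rows per slice.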
  if (mtls->redp.dim.z > 1) {
    mtls->mSliceSize = 1;
    launchThreads(walk_3d_reduce, mtls);
  } else if (mtls->redp.dim.y > 1) {
    mtls->mSliceSize = rsMax(1U, mtls->redp.dim.y / (numThreads * 4));
    launchThreads(walk_2d_reduce, mtls);
  } else {
    mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
    launchThreads(walk_1d_reduce, mtls);
  }
  mInKernel = false;

  // Combine accumulators and identify final accumulator
  uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]);
  //   Loop over accumulators, combining into finalAccumPtr.  If finalAccumPtr
  //   is null, then the first accumulator we find becomes finalAccumPtr.
  for (unsigned idx = 0; idx < mtls->accumCount; ++idx) {
    uint8_t *const thisAccumPtr = mtls->accumPtr[idx];
    if (finalAccumPtr) {
      if (finalAccumPtr != thisAccumPtr) {
        if (mtls->combFunc) {
          if (mtls->logReduce >= 3) {
            FormatBuf fmt;
            REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulating into%s",
                         mtls->accumFunc,
                         format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
            REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p):    accumulator[%d]%s",
                         mtls->accumFunc, idx,
                         format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
          }
          mtls->combFunc(finalAccumPtr, thisAccumPtr);
        } else {
          rsAssert(!"expected combiner");
        }
      }
    } else {
      finalAccumPtr = thisAccumPtr;
    }
  }
  rsAssert(finalAccumPtr != nullptr);
  if (mtls->logReduce >= 3) {
    FormatBuf fmt;
    REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final accumulator%s",
                 mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
  }

  // Outconvert
  if (mtls->outFunc) {
    mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
    if (mtls->logReduce >= 3) {
      FormatBuf fmt;
      REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final outconverted result%s",
                   mtls->accumFunc,
                   format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
    }
  }

  // Clean up
  free(mtls->accumPtr);
  free(mtls->accumAlloc);
}


void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                                        uint32_t inLen,
                                        Allocation* aout,
                                        const RsScriptCall* sc,
                                        MTLaunchStructForEach* mtls) {

    //android::StopWatch kernel_time("kernel time");

    bool outerDims = (mtls->start.z != mtls->end.z) ||
                     (mtls->start.face != mtls->end.face) ||
                     (mtls->start.lod != mtls->end.lod) ||
                     (mtls->start.array[0] != mtls->end.array[0]) ||
                     (mtls->start.array[1] != mtls->end.array[1]) ||
                     (mtls->start.array[2] != mtls->end.array[2]) ||
                     (mtls->start.array[3] != mtls->end.array[3]);

    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
        const size_t targetByteChunk = 16 * 1024;
        mInKernel = true;  // NOTE: The guard immediately above ensures this was !mInKernel

        if (outerDims) {
            // No fancy logic for chunk size
            mtls->mSliceSize = 1;
            launchThreads(walk_general_foreach, mtls);
        } else if (mtls->fep.dim.y > 1) {
            uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
            uint32_t s2 = 0;

            // This chooses our slice size to rate limit atomic ops to
            // one per 16k bytes of reads/writes.
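            // For example (illustrative numbers): with a 2048-byte row
            // stride, s2 = 16384 / 2048 = 8 rows per slice.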
            if ((mtls->aout[0] != nullptr) && mtls->aout[0]->mHal.drvState.lod[0].stride) {
                s2 = targetByteChunk / mtls->aout[0]->mHal.drvState.lod[0].stride;
            } else if (mtls->ains[0]) {
                s2 = targetByteChunk / mtls->ains[0]->mHal.drvState.lod[0].stride;
            } else {
                // Launch-option-only case:
                // use s1 based only on the dimensions.
                s2 = s1;
            }
            mtls->mSliceSize = rsMin(s1, s2);

            if (mtls->mSliceSize < 1) {
                mtls->mSliceSize = 1;
            }

            launchThreads(walk_2d_foreach, mtls);
        } else {
            uint32_t s1 = mtls->fep.dim.x / ((mWorkers.mCount + 1) * 4);
            uint32_t s2 = 0;

            // This chooses our slice size to rate limit atomic ops to
            // one per 16k bytes of reads/writes.
            if ((mtls->aout[0] != nullptr) && mtls->aout[0]->getType()->getElementSizeBytes()) {
                s2 = targetByteChunk / mtls->aout[0]->getType()->getElementSizeBytes();
            } else if (mtls->ains[0]) {
                s2 = targetByteChunk / mtls->ains[0]->getType()->getElementSizeBytes();
            } else {
                // Launch-option-only case:
                // use s1 based only on the dimensions.
                s2 = s1;
            }
            mtls->mSliceSize = rsMin(s1, s2);

            if (mtls->mSliceSize < 1) {
                mtls->mSliceSize = 1;
            }

            launchThreads(walk_1d_foreach, mtls);
        }
        mInKernel = false;

    } else {
        ForEachFunc_t fn = mtls->kernel;
        uint32_t slice = 0;

        while (SelectOuterSlice(mtls, &mtls->fep, slice++)) {
            for (mtls->fep.current.y = mtls->start.y;
                 mtls->fep.current.y < mtls->end.y;
                 mtls->fep.current.y++) {

                FepPtrSetup(mtls, &mtls->fep, mtls->start.x,
                            mtls->fep.current.y, mtls->fep.current.z, mtls->fep.current.lod,
                            (RsAllocationCubemapFace) mtls->fep.current.face,
                            mtls->fep.current.array[0], mtls->fep.current.array[1],
                            mtls->fep.current.array[2], mtls->fep.current.array[3]);

                fn(&mtls->fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
            }
        }
    }
}

RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
    //ALOGE("setTls %p", sc);
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    rsAssert(tls);
    RsdCpuScriptImpl *old = tls->mImpl;
    tls->mImpl = sc;
    tls->mContext = mRSC;
    if (sc) {
        tls->mScript = sc->getScript();
    } else {
        tls->mScript = nullptr;
    }
    return old;
}

const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
    return mSymLookupFn(mRSC, name);
}


RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
                                    char const *resName, char const *cacheDir,
                                    uint8_t const *bitcode, size_t bitcodeSize,
                                    uint32_t flags) {

    RsdCpuScriptImpl *i = new RsdCpuScriptImpl(this, s);
    if (!i->init(resName, cacheDir, bitcode, bitcodeSize, flags,
                 getBccPluginName())) {
        delete i;
        return nullptr;
    }
    return i;
}

extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
                                             const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
                                           const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
                                                const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
                                             const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx,
                                                 const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx,
                                              const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e);

RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
                                    RsScriptIntrinsicID iid, Element *e) {

    RsdCpuScriptImpl *i = nullptr;
    switch (iid) {
    case RS_SCRIPT_INTRINSIC_ID_3DLUT:
        i = rsdIntrinsic_3DLUT(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
        i = rsdIntrinsic_Convolve3x3(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
        i = rsdIntrinsic_ColorMatrix(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_LUT:
        i = rsdIntrinsic_LUT(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
        i = rsdIntrinsic_Convolve5x5(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_BLUR:
        i = rsdIntrinsic_Blur(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
        i = rsdIntrinsic_YuvToRGB(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_BLEND:
        i = rsdIntrinsic_Blend(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_HISTOGRAM:
        i = rsdIntrinsic_Histogram(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_RESIZE:
        i = rsdIntrinsic_Resize(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_BLAS:
        i = rsdIntrinsic_BLAS(this, s, e);
        break;

    default:
        rsAssert(0);
    }

    return i;
}

void* RsdCpuReferenceImpl::createScriptGroup(const ScriptGroupBase *sg) {
  switch (sg->getApiVersion()) {
    case ScriptGroupBase::SG_V1: {
      CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
      if (!sgi->init()) {
        delete sgi;
        return nullptr;
      }
      return sgi;
    }
    case ScriptGroupBase::SG_V2: {
      return new CpuScriptGroup2Impl(this, sg);
    }
  }
  return nullptr;
}

} // namespace renderscript
} // namespace android