Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "rsCpuIntrinsic.h"
     18 #include "rsCpuIntrinsicInlines.h"
     19 
     20 using namespace android;
     21 using namespace android::renderscript;
     22 
     23 namespace android {
     24 namespace renderscript {
     25 
     26 
     27 class RsdCpuScriptIntrinsicBlur : public RsdCpuScriptIntrinsic {
     28 public:
     29     virtual void populateScript(Script *);
     30     virtual void invokeFreeChildren();
     31 
     32     virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
     33     virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
     34 
     35     virtual ~RsdCpuScriptIntrinsicBlur();
     36     RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
     37 
     38 protected:
     39     float mFp[104];
     40     short mIp[104];
     41     void **mScratch;
     42     size_t *mScratchSize;
     43     float mRadius;
     44     int mIradius;
     45     ObjectBaseRef<Allocation> mAlloc;
     46 
     47     static void kernelU4(const RsForEachStubParamStruct *p,
     48                          uint32_t xstart, uint32_t xend,
     49                          uint32_t instep, uint32_t outstep);
     50     static void kernelU1(const RsForEachStubParamStruct *p,
     51                          uint32_t xstart, uint32_t xend,
     52                          uint32_t instep, uint32_t outstep);
     53     void ComputeGaussianWeights();
     54 };
     55 
     56 }
     57 }
     58 
     59 
     60 void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() {
     61     memset(mFp, 0, sizeof(mFp));
     62     memset(mIp, 0, sizeof(mIp));
     63 
     64     // Compute gaussian weights for the blur
     65     // e is the euler's number
     66     float e = 2.718281828459045f;
     67     float pi = 3.1415926535897932f;
     68     // g(x) = ( 1 / sqrt( 2 * pi ) * sigma) * e ^ ( -x^2 / 2 * sigma^2 )
     69     // x is of the form [-radius .. 0 .. radius]
     70     // and sigma varies with radius.
     71     // Based on some experimental radius values and sigma's
     72     // we approximately fit sigma = f(radius) as
     73     // sigma = radius * 0.4  + 0.6
     74     // The larger the radius gets, the more our gaussian blur
     75     // will resemble a box blur since with large sigma
     76     // the gaussian curve begins to lose its shape
     77     float sigma = 0.4f * mRadius + 0.6f;
     78 
     79     // Now compute the coefficients. We will store some redundant values to save
     80     // some math during the blur calculations precompute some values
     81     float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
     82     float coeff2 = - 1.0f / (2.0f * sigma * sigma);
     83 
     84     float normalizeFactor = 0.0f;
     85     float floatR = 0.0f;
     86     int r;
     87     mIradius = (float)ceil(mRadius) + 0.5f;
     88     for (r = -mIradius; r <= mIradius; r ++) {
     89         floatR = (float)r;
     90         mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
     91         normalizeFactor += mFp[r + mIradius];
     92     }
     93 
     94     //Now we need to normalize the weights because all our coefficients need to add up to one
     95     normalizeFactor = 1.0f / normalizeFactor;
     96     for (r = -mIradius; r <= mIradius; r ++) {
     97         mFp[r + mIradius] *= normalizeFactor;
     98         mIp[r + mIradius] = (short)(mIp[r + mIradius] * 32768);
     99     }
    100 }
    101 
    102 void RsdCpuScriptIntrinsicBlur::setGlobalObj(uint32_t slot, ObjectBase *data) {
    103     rsAssert(slot == 1);
    104     mAlloc.set(static_cast<Allocation *>(data));
    105 }
    106 
    107 void RsdCpuScriptIntrinsicBlur::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
    108     rsAssert(slot == 0);
    109     mRadius = ((const float *)data)[0];
    110     ComputeGaussianWeights();
    111 }
    112 
    113 
    114 
    115 static void OneVU4(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
    116                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
    117 
    118     const uchar *pi = ptrIn + x*4;
    119 
    120     float4 blurredPixel = 0;
    121     for (int r = -iradius; r <= iradius; r ++) {
    122         int validY = rsMax((y + r), 0);
    123         validY = rsMin(validY, (int)(p->dimY - 1));
    124         const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
    125         float4 pf = convert_float4(pvy[0]);
    126         blurredPixel += pf * gPtr[0];
    127         gPtr++;
    128     }
    129 
    130     out->xyzw = blurredPixel;
    131 }
    132 
    133 static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
    134                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
    135 
    136     const uchar *pi = ptrIn + x;
    137 
    138     float blurredPixel = 0;
    139     for (int r = -iradius; r <= iradius; r ++) {
    140         int validY = rsMax((y + r), 0);
    141         validY = rsMin(validY, (int)(p->dimY - 1));
    142         float pf = (float)pi[validY * iStride];
    143         blurredPixel += pf * gPtr[0];
    144         gPtr++;
    145     }
    146 
    147     out[0] = blurredPixel;
    148 }
    149 
    150 extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct);
    151 extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
    152 extern "C" void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
    153 
    154 static void OneVFU4(float4 *out,
    155                     const uchar *ptrIn, int iStride, const float* gPtr, int ct,
    156                     int x1, int x2) {
    157 
    158 #if defined(ARCH_ARM_HAVE_VFP)
    159     if (gArchUseSIMD) {
    160         int t = (x2 - x1);
    161         t &= ~1;
    162         if(t) {
    163             rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
    164         }
    165         x1 += t;
    166     }
    167 #endif
    168 
    169     while(x2 > x1) {
    170         const uchar *pi = ptrIn;
    171         float4 blurredPixel = 0;
    172         const float* gp = gPtr;
    173 
    174         for (int r = 0; r < ct; r++) {
    175             float4 pf = convert_float4(((const uchar4 *)pi)[0]);
    176             blurredPixel += pf * gp[0];
    177             pi += iStride;
    178             gp++;
    179         }
    180         out->xyzw = blurredPixel;
    181         x1++;
    182         out++;
    183         ptrIn+=4;
    184     }
    185 }
    186 
    187 static void OneVFU1(float *out,
    188                     const uchar *ptrIn, int iStride, const float* gPtr, int ct, int x1, int x2) {
    189 
    190     int len = x2 - x1;
    191 
    192     while((x2 > x1) && (((uintptr_t)ptrIn) & 0x3)) {
    193         const uchar *pi = ptrIn;
    194         float blurredPixel = 0;
    195         const float* gp = gPtr;
    196 
    197         for (int r = 0; r < ct; r++) {
    198             float pf = (float)pi[0];
    199             blurredPixel += pf * gp[0];
    200             pi += iStride;
    201             gp++;
    202         }
    203         out[0] = blurredPixel;
    204         x1++;
    205         out++;
    206         ptrIn++;
    207         len--;
    208     }
    209 
    210 #if defined(ARCH_ARM_HAVE_VFP)
    211     if (gArchUseSIMD && (x2 > x1)) {
    212         int t = (x2 - x1) >> 2;
    213         t &= ~1;
    214         if(t) {
    215             rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
    216             len -= t << 2;
    217             ptrIn += t << 2;
    218             out += t << 2;
    219         }
    220     }
    221 #endif
    222 
    223     while(len > 0) {
    224         const uchar *pi = ptrIn;
    225         float blurredPixel = 0;
    226         const float* gp = gPtr;
    227 
    228         for (int r = 0; r < ct; r++) {
    229             float pf = (float)pi[0];
    230             blurredPixel += pf * gp[0];
    231             pi += iStride;
    232             gp++;
    233         }
    234         out[0] = blurredPixel;
    235         len--;
    236         out++;
    237         ptrIn++;
    238     }
    239 }
    240 
    241 static void OneHU4(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
    242                    const float4 *ptrIn, const float* gPtr, int iradius) {
    243 
    244     float4 blurredPixel = 0;
    245     for (int r = -iradius; r <= iradius; r ++) {
    246         int validX = rsMax((x + r), 0);
    247         validX = rsMin(validX, (int)(p->dimX - 1));
    248         float4 pf = ptrIn[validX];
    249         blurredPixel += pf * gPtr[0];
    250         gPtr++;
    251     }
    252 
    253     out->xyzw = convert_uchar4(blurredPixel);
    254 }
    255 
    256 static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x,
    257                    const float *ptrIn, const float* gPtr, int iradius) {
    258 
    259     float blurredPixel = 0;
    260     for (int r = -iradius; r <= iradius; r ++) {
    261         int validX = rsMax((x + r), 0);
    262         validX = rsMin(validX, (int)(p->dimX - 1));
    263         float pf = ptrIn[validX];
    264         blurredPixel += pf * gPtr[0];
    265         gPtr++;
    266     }
    267 
    268     out[0] = (uchar)blurredPixel;
    269 }
    270 
    271 
    272 void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
    273                                          uint32_t xstart, uint32_t xend,
    274                                          uint32_t instep, uint32_t outstep) {
    275 
    276     float4 stackbuf[2048];
    277     float4 *buf = &stackbuf[0];
    278     RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
    279     if (!cp->mAlloc.get()) {
    280         ALOGE("Blur executed without input, skipping");
    281         return;
    282     }
    283     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    284     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    285 
    286     uchar4 *out = (uchar4 *)p->out;
    287     uint32_t x1 = xstart;
    288     uint32_t x2 = xend;
    289 
    290     if (p->dimX > 2048) {
    291         if ((p->dimX > cp->mScratchSize[p->lid]) || !cp->mScratch[p->lid]) {
    292             // Pad the side of the allocation by one unit to allow alignment later
    293             cp->mScratch[p->lid] = realloc(cp->mScratch[p->lid], (p->dimX + 1) * 16);
    294             cp->mScratchSize[p->lid] = p->dimX;
    295         }
    296         // realloc only aligns to 8 bytes so we manually align to 16.
    297         buf = (float4 *) ((((intptr_t)cp->mScratch[p->lid]) + 15) & ~0xf);
    298     }
    299     float4 *fout = (float4 *)buf;
    300     int y = p->y;
    301     if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius))) {
    302         const uchar *pi = pin + (y - cp->mIradius) * stride;
    303         OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
    304     } else {
    305         while(x2 > x1) {
    306             OneVU4(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
    307             fout++;
    308             x1++;
    309         }
    310     }
    311 
    312     x1 = xstart;
    313     while ((x1 < (uint32_t)cp->mIradius) && (x1 < x2)) {
    314         OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
    315         out++;
    316         x1++;
    317     }
    318 #if defined(ARCH_ARM_HAVE_VFP)
    319     if (gArchUseSIMD) {
    320         if ((x1 + cp->mIradius) < x2) {
    321             rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
    322                                    cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
    323             out += (x2 - cp->mIradius) - x1;
    324             x1 = x2 - cp->mIradius;
    325         }
    326     }
    327 #endif
    328     while(x2 > x1) {
    329         OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
    330         out++;
    331         x1++;
    332     }
    333 }
    334 
    335 void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
    336                                          uint32_t xstart, uint32_t xend,
    337                                          uint32_t instep, uint32_t outstep) {
    338     float buf[4 * 2048];
    339     RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
    340     if (!cp->mAlloc.get()) {
    341         ALOGE("Blur executed without input, skipping");
    342         return;
    343     }
    344     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    345     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    346 
    347     uchar *out = (uchar *)p->out;
    348     uint32_t x1 = xstart;
    349     uint32_t x2 = xend;
    350 
    351     float *fout = (float *)buf;
    352     int y = p->y;
    353     if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius -1))) {
    354         const uchar *pi = pin + (y - cp->mIradius) * stride;
    355         OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
    356     } else {
    357         while(x2 > x1) {
    358             OneVU1(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
    359             fout++;
    360             x1++;
    361         }
    362     }
    363 
    364     x1 = xstart;
    365     while ((x1 < x2) &&
    366            ((x1 < (uint32_t)cp->mIradius) || (((uintptr_t)out) & 0x3))) {
    367         OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius);
    368         out++;
    369         x1++;
    370     }
    371 #if defined(ARCH_ARM_HAVE_VFP)
    372     if (gArchUseSIMD) {
    373         if ((x1 + cp->mIradius) < x2) {
    374             uint32_t len = x2 - (x1 + cp->mIradius);
    375             len &= ~3;
    376             if (len > 0) {
    377                 rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
    378                                        cp->mIradius * 2 + 1, x1, x1 + len);
    379                 out += len;
    380                 x1 += len;
    381             }
    382         }
    383     }
    384 #endif
    385     while(x2 > x1) {
    386         OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius);
    387         out++;
    388         x1++;
    389     }
    390 }
    391 
    392 RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx,
    393                                                      const Script *s, const Element *e)
    394             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLUR) {
    395 
    396     mRootPtr = NULL;
    397     if (e->getType() == RS_TYPE_UNSIGNED_8) {
    398         switch (e->getVectorSize()) {
    399         case 1:
    400             mRootPtr = &kernelU1;
    401             break;
    402         case 4:
    403             mRootPtr = &kernelU4;
    404             break;
    405         }
    406     }
    407     rsAssert(mRootPtr);
    408     mRadius = 5;
    409 
    410     mScratch = new void *[mCtx->getThreadCount()];
    411     mScratchSize = new size_t[mCtx->getThreadCount()];
    412     memset(mScratch, 0, sizeof(void *) * mCtx->getThreadCount());
    413     memset(mScratchSize, 0, sizeof(size_t) * mCtx->getThreadCount());
    414 
    415     ComputeGaussianWeights();
    416 }
    417 
    418 RsdCpuScriptIntrinsicBlur::~RsdCpuScriptIntrinsicBlur() {
    419     uint32_t threads = mCtx->getThreadCount();
    420     if (mScratch) {
    421         for (size_t i = 0; i < threads; i++) {
    422             if (mScratch[i]) {
    423                 free(mScratch[i]);
    424             }
    425         }
    426         delete []mScratch;
    427     }
    428     if (mScratchSize) {
    429         delete []mScratchSize;
    430     }
    431 }
    432 
    433 void RsdCpuScriptIntrinsicBlur::populateScript(Script *s) {
    434     s->mHal.info.exportedVariableCount = 2;
    435 }
    436 
    437 void RsdCpuScriptIntrinsicBlur::invokeFreeChildren() {
    438     mAlloc.clear();
    439 }
    440 
    441 
    442 RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
    443 
    444     return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
    445 }
    446 
    447 
    448