Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 
     18 #include "rsCpuIntrinsic.h"
     19 #include "rsCpuIntrinsicInlines.h"
     20 
     21 namespace android {
     22 namespace renderscript {
     23 
     24 
     25 class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic {
     26 public:
     27     void populateScript(Script *) override;
     28     void invokeFreeChildren() override;
     29 
     30     void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
     31     void setGlobalObj(uint32_t slot, ObjectBase *data) override;
     32 
     33     ~RsdCpuScriptIntrinsicConvolve3x3() override;
     34     RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
     35 
     36 protected:
     37     float mFp[16];
     38     short mIp[16];
     39     ObjectBaseRef<const Allocation> mAlloc;
     40     ObjectBaseRef<const Element> mElement;
     41 
     42     static void kernelU1(const RsExpandKernelDriverInfo *info,
     43                          uint32_t xstart, uint32_t xend,
     44                          uint32_t outstep);
     45     static void kernelU2(const RsExpandKernelDriverInfo *info,
     46                          uint32_t xstart, uint32_t xend,
     47                          uint32_t outstep);
     48     static void kernelU4(const RsExpandKernelDriverInfo *info,
     49                          uint32_t xstart, uint32_t xend,
     50                          uint32_t outstep);
     51     static void kernelF1(const RsExpandKernelDriverInfo *info,
     52                          uint32_t xstart, uint32_t xend,
     53                          uint32_t outstep);
     54     static void kernelF2(const RsExpandKernelDriverInfo *info,
     55                          uint32_t xstart, uint32_t xend,
     56                          uint32_t outstep);
     57     static void kernelF4(const RsExpandKernelDriverInfo *info,
     58                          uint32_t xstart, uint32_t xend,
     59                          uint32_t outstep);
     60 };
     61 
     62 void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
     63     rsAssert(slot == 1);
     64     mAlloc.set(static_cast<Allocation *>(data));
     65 }
     66 
     67 void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
     68                                                     size_t dataLength) {
     69     rsAssert(slot == 0);
     70     memcpy (&mFp, data, dataLength);
     71     for(int ct=0; ct < 9; ct++) {
     72         if (mFp[ct] >= 0) {
     73             mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
     74         } else {
     75             mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
     76         }
     77     }
     78 }
     79 
     80 extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1,
     81                                           const void *y2, const short *coef, uint32_t count);
     82 
     83 
     84 static void ConvolveOneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
     85                           const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
     86                           const float* coeff) {
     87 
     88     uint32_t x1 = rsMax((int32_t)x-1, 0);
     89     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
     90 
     91     float4 px = convert_float4(py0[x1]) * coeff[0] +
     92                 convert_float4(py0[x]) * coeff[1] +
     93                 convert_float4(py0[x2]) * coeff[2] +
     94                 convert_float4(py1[x1]) * coeff[3] +
     95                 convert_float4(py1[x]) * coeff[4] +
     96                 convert_float4(py1[x2]) * coeff[5] +
     97                 convert_float4(py2[x1]) * coeff[6] +
     98                 convert_float4(py2[x]) * coeff[7] +
     99                 convert_float4(py2[x2]) * coeff[8];
    100 
    101     px = clamp(px + 0.5f, 0.f, 255.f);
    102     uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
    103     *out = o;
    104 }
    105 
    106 static void ConvolveOneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
    107                           const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
    108                           const float* coeff) {
    109 
    110     uint32_t x1 = rsMax((int32_t)x-1, 0);
    111     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    112 
    113     float2 px = convert_float2(py0[x1]) * coeff[0] +
    114                 convert_float2(py0[x]) * coeff[1] +
    115                 convert_float2(py0[x2]) * coeff[2] +
    116                 convert_float2(py1[x1]) * coeff[3] +
    117                 convert_float2(py1[x]) * coeff[4] +
    118                 convert_float2(py1[x2]) * coeff[5] +
    119                 convert_float2(py2[x1]) * coeff[6] +
    120                 convert_float2(py2[x]) * coeff[7] +
    121                 convert_float2(py2[x2]) * coeff[8];
    122 
    123     px = clamp(px + 0.5f, 0.f, 255.f);
    124     *out = convert_uchar2(px);
    125 }
    126 
    127 static void ConvolveOneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
    128                           const uchar *py0, const uchar *py1, const uchar *py2,
    129                           const float* coeff) {
    130 
    131     uint32_t x1 = rsMax((int32_t)x-1, 0);
    132     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    133 
    134     float px = ((float)py0[x1]) * coeff[0] +
    135                ((float)py0[x]) * coeff[1] +
    136                ((float)py0[x2]) * coeff[2] +
    137                ((float)py1[x1]) * coeff[3] +
    138                ((float)py1[x]) * coeff[4] +
    139                ((float)py1[x2]) * coeff[5] +
    140                ((float)py2[x1]) * coeff[6] +
    141                ((float)py2[x]) * coeff[7] +
    142                ((float)py2[x2]) * coeff[8];
    143     *out = clamp(px + 0.5f, 0.f, 255.f);
    144 }
    145 
    146 static void ConvolveOneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
    147                           const float4 *py0, const float4 *py1, const float4 *py2,
    148                           const float* coeff) {
    149 
    150     uint32_t x1 = rsMax((int32_t)x-1, 0);
    151     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    152     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
    153            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
    154            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
    155 }
    156 
    157 static void ConvolveOneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
    158                           const float2 *py0, const float2 *py1, const float2 *py2,
    159                           const float* coeff) {
    160 
    161     uint32_t x1 = rsMax((int32_t)x-1, 0);
    162     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    163     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
    164            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
    165            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
    166 }
    167 
    168 static void ConvolveOneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
    169                           const float *py0, const float *py1, const float *py2,
    170                           const float* coeff) {
    171 
    172     uint32_t x1 = rsMax((int32_t)x-1, 0);
    173     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    174     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
    175            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
    176            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
    177 }
    178 
    179 void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelDriverInfo *info,
    180                                                 uint32_t xstart, uint32_t xend,
    181                                                 uint32_t outstep) {
    182     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    183 
    184     if (!cp->mAlloc.get()) {
    185         ALOGE("Convolve3x3 executed without input, skipping");
    186         return;
    187     }
    188     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    189     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    190 
    191     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    192     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    193     const uchar4 *py0 = (const uchar4 *)(pin + stride * y2);
    194     const uchar4 *py1 = (const uchar4 *)(pin + stride * info->current.y);
    195     const uchar4 *py2 = (const uchar4 *)(pin + stride * y1);
    196 
    197     uchar4 *out = (uchar4 *)info->outPtr[0];
    198     uint32_t x1 = xstart;
    199     uint32_t x2 = xend;
    200     if(x1 == 0) {
    201         ConvolveOneU4(info, 0, out, py0, py1, py2, cp->mFp);
    202         x1 ++;
    203         out++;
    204     }
    205 
    206     if(x2 > x1) {
    207 #if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
    208         if (gArchUseSIMD) {
    209             int32_t len = (x2 - x1 - 1) >> 1;
    210             if(len > 0) {
    211                 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    212                 x1 += len << 1;
    213                 out += len << 1;
    214             }
    215         }
    216 #endif
    217 
    218         while(x1 != x2) {
    219             ConvolveOneU4(info, x1, out, py0, py1, py2, cp->mFp);
    220             out++;
    221             x1++;
    222         }
    223     }
    224 }
    225 
    226 void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelDriverInfo *info,
    227                                                 uint32_t xstart, uint32_t xend,
    228                                                 uint32_t outstep) {
    229     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    230 
    231     if (!cp->mAlloc.get()) {
    232         ALOGE("Convolve3x3 executed without input, skipping");
    233         return;
    234     }
    235     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    236     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    237 
    238     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    239     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    240     const uchar2 *py0 = (const uchar2 *)(pin + stride * y2);
    241     const uchar2 *py1 = (const uchar2 *)(pin + stride * info->current.y);
    242     const uchar2 *py2 = (const uchar2 *)(pin + stride * y1);
    243 
    244     uchar2 *out = (uchar2 *)info->outPtr[0];
    245     uint32_t x1 = xstart;
    246     uint32_t x2 = xend;
    247     if(x1 == 0) {
    248         ConvolveOneU2(info, 0, out, py0, py1, py2, cp->mFp);
    249         x1 ++;
    250         out++;
    251     }
    252 
    253     if(x2 > x1) {
    254 #if 0//defined(ARCH_ARM_HAVE_NEON)
    255         int32_t len = (x2 - x1 - 1) >> 1;
    256         if(len > 0) {
    257             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    258             x1 += len << 1;
    259             out += len << 1;
    260         }
    261 #endif
    262 
    263         while(x1 != x2) {
    264             ConvolveOneU2(info, x1, out, py0, py1, py2, cp->mFp);
    265             out++;
    266             x1++;
    267         }
    268     }
    269 }
    270 
    271 void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelDriverInfo *info,
    272                                                 uint32_t xstart, uint32_t xend,
    273                                                 uint32_t outstep) {
    274     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    275 
    276     if (!cp->mAlloc.get()) {
    277         ALOGE("Convolve3x3 executed without input, skipping");
    278         return;
    279     }
    280     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    281     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    282 
    283     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    284     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    285     const uchar *py0 = (const uchar *)(pin + stride * y2);
    286     const uchar *py1 = (const uchar *)(pin + stride * info->current.y);
    287     const uchar *py2 = (const uchar *)(pin + stride * y1);
    288 
    289     uchar *out = (uchar *)info->outPtr[0];
    290     uint32_t x1 = xstart;
    291     uint32_t x2 = xend;
    292     if(x1 == 0) {
    293         ConvolveOneU1(info, 0, out, py0, py1, py2, cp->mFp);
    294         x1 ++;
    295         out++;
    296     }
    297 
    298     if(x2 > x1) {
    299 #if 0//defined(ARCH_ARM_HAVE_NEON)
    300         int32_t len = (x2 - x1 - 1) >> 1;
    301         if(len > 0) {
    302             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    303             x1 += len << 1;
    304             out += len << 1;
    305         }
    306 #endif
    307 
    308         while(x1 != x2) {
    309             ConvolveOneU1(info, x1, out, py0, py1, py2, cp->mFp);
    310             out++;
    311             x1++;
    312         }
    313     }
    314 }
    315 
    316 void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelDriverInfo *info,
    317                                                 uint32_t xstart, uint32_t xend,
    318                                                 uint32_t outstep) {
    319     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    320 
    321     if (!cp->mAlloc.get()) {
    322         ALOGE("Convolve3x3 executed without input, skipping");
    323         return;
    324     }
    325     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    326     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    327 
    328     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    329     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    330     const float4 *py0 = (const float4 *)(pin + stride * y2);
    331     const float4 *py1 = (const float4 *)(pin + stride * info->current.y);
    332     const float4 *py2 = (const float4 *)(pin + stride * y1);
    333 
    334     float4 *out = (float4 *)info->outPtr[0];
    335     uint32_t x1 = xstart;
    336     uint32_t x2 = xend;
    337     if(x1 == 0) {
    338         ConvolveOneF4(info, 0, out, py0, py1, py2, cp->mFp);
    339         x1 ++;
    340         out++;
    341     }
    342 
    343     if(x2 > x1) {
    344 #if 0//defined(ARCH_ARM_HAVE_NEON)
    345         int32_t len = (x2 - x1 - 1) >> 1;
    346         if(len > 0) {
    347             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    348             x1 += len << 1;
    349             out += len << 1;
    350         }
    351 #endif
    352 
    353         while(x1 != x2) {
    354             ConvolveOneF4(info, x1, out, py0, py1, py2, cp->mFp);
    355             out++;
    356             x1++;
    357         }
    358     }
    359 }
    360 
    361 void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelDriverInfo *info,
    362                                                 uint32_t xstart, uint32_t xend,
    363                                                 uint32_t outstep) {
    364     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    365 
    366     if (!cp->mAlloc.get()) {
    367         ALOGE("Convolve3x3 executed without input, skipping");
    368         return;
    369     }
    370     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    371     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    372 
    373     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    374     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    375     const float2 *py0 = (const float2 *)(pin + stride * y2);
    376     const float2 *py1 = (const float2 *)(pin + stride * info->current.y);
    377     const float2 *py2 = (const float2 *)(pin + stride * y1);
    378 
    379     float2 *out = (float2 *)info->outPtr[0];
    380     uint32_t x1 = xstart;
    381     uint32_t x2 = xend;
    382     if(x1 == 0) {
    383         ConvolveOneF2(info, 0, out, py0, py1, py2, cp->mFp);
    384         x1 ++;
    385         out++;
    386     }
    387 
    388     if(x2 > x1) {
    389 #if 0//defined(ARCH_ARM_HAVE_NEON)
    390         int32_t len = (x2 - x1 - 1) >> 1;
    391         if(len > 0) {
    392             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    393             x1 += len << 1;
    394             out += len << 1;
    395         }
    396 #endif
    397 
    398         while(x1 != x2) {
    399             ConvolveOneF2(info, x1, out, py0, py1, py2, cp->mFp);
    400             out++;
    401             x1++;
    402         }
    403     }
    404 }
    405 void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelDriverInfo *info,
    406                                                 uint32_t xstart, uint32_t xend,
    407                                                 uint32_t outstep) {
    408     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    409 
    410     if (!cp->mAlloc.get()) {
    411         ALOGE("Convolve3x3 executed without input, skipping");
    412         return;
    413     }
    414     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    415     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    416 
    417     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    418     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    419     const float *py0 = (const float *)(pin + stride * y2);
    420     const float *py1 = (const float *)(pin + stride * info->current.y);
    421     const float *py2 = (const float *)(pin + stride * y1);
    422 
    423     float *out = (float *)info->outPtr[0];
    424     uint32_t x1 = xstart;
    425     uint32_t x2 = xend;
    426     if(x1 == 0) {
    427         ConvolveOneF1(info, 0, out, py0, py1, py2, cp->mFp);
    428         x1 ++;
    429         out++;
    430     }
    431 
    432     if(x2 > x1) {
    433 #if 0//defined(ARCH_ARM_HAVE_NEON)
    434         int32_t len = (x2 - x1 - 1) >> 1;
    435         if(len > 0) {
    436             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    437             x1 += len << 1;
    438             out += len << 1;
    439         }
    440 #endif
    441 
    442         while(x1 != x2) {
    443             ConvolveOneF1(info, x1, out, py0, py1, py2, cp->mFp);
    444             out++;
    445             x1++;
    446         }
    447     }
    448 }
    449 
    450 RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
    451             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
    452             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
    453 
    454     if (e->getType() == RS_TYPE_FLOAT_32) {
    455         switch(e->getVectorSize()) {
    456         case 1:
    457             mRootPtr = &kernelF1;
    458             break;
    459         case 2:
    460             mRootPtr = &kernelF2;
    461             break;
    462         case 3:
    463         case 4:
    464             mRootPtr = &kernelF4;
    465             break;
    466         }
    467     } else {
    468         switch(e->getVectorSize()) {
    469         case 1:
    470             mRootPtr = &kernelU1;
    471             break;
    472         case 2:
    473             mRootPtr = &kernelU2;
    474             break;
    475         case 3:
    476         case 4:
    477             mRootPtr = &kernelU4;
    478             break;
    479         }
    480     }
    481     for(int ct=0; ct < 9; ct++) {
    482         mFp[ct] = 1.f / 9.f;
    483         mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
    484     }
    485 }
    486 
    487 RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() {
    488 }
    489 
    490 void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) {
    491     s->mHal.info.exportedVariableCount = 2;
    492 }
    493 
    494 void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
    495     mAlloc.clear();
    496 }
    497 
    498 RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
    499 
    500     return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
    501 }
    502 
    503 } // namespace renderscript
    504 } // namespace android
    505