Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 
     18 #include "rsCpuIntrinsic.h"
     19 #include "rsCpuIntrinsicInlines.h"
     20 
     21 using namespace android;
     22 using namespace android::renderscript;
     23 
     24 namespace android {
     25 namespace renderscript {
     26 
     27 
     28 class RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic {
     29 public:
     30     void populateScript(Script *) override;
     31     void invokeFreeChildren() override;
     32 
     33     void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
     34     void setGlobalObj(uint32_t slot, ObjectBase *data) override;
     35 
     36     ~RsdCpuScriptIntrinsicConvolve3x3() override;
     37     RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
     38 
     39 protected:
     40     float mFp[16];
     41     short mIp[16];
     42     ObjectBaseRef<const Allocation> mAlloc;
     43     ObjectBaseRef<const Element> mElement;
     44 
     45     static void kernelU1(const RsExpandKernelDriverInfo *info,
     46                          uint32_t xstart, uint32_t xend,
     47                          uint32_t outstep);
     48     static void kernelU2(const RsExpandKernelDriverInfo *info,
     49                          uint32_t xstart, uint32_t xend,
     50                          uint32_t outstep);
     51     static void kernelU4(const RsExpandKernelDriverInfo *info,
     52                          uint32_t xstart, uint32_t xend,
     53                          uint32_t outstep);
     54     static void kernelF1(const RsExpandKernelDriverInfo *info,
     55                          uint32_t xstart, uint32_t xend,
     56                          uint32_t outstep);
     57     static void kernelF2(const RsExpandKernelDriverInfo *info,
     58                          uint32_t xstart, uint32_t xend,
     59                          uint32_t outstep);
     60     static void kernelF4(const RsExpandKernelDriverInfo *info,
     61                          uint32_t xstart, uint32_t xend,
     62                          uint32_t outstep);
     63 };
     64 
     65 }
     66 }
     67 
     68 
     69 void RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
     70     rsAssert(slot == 1);
     71     mAlloc.set(static_cast<Allocation *>(data));
     72 }
     73 
     74 void RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
     75                                                     size_t dataLength) {
     76     rsAssert(slot == 0);
     77     memcpy (&mFp, data, dataLength);
     78     for(int ct=0; ct < 9; ct++) {
     79         if (mFp[ct] >= 0) {
     80             mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
     81         } else {
     82             mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
     83         }
     84     }
     85 }
     86 
     // Externally defined convolution routine, used by kernelU4 only when
     // gArchUseSIMD is set. As called there: y0/y1/y2 point one pixel to the left of
     // the first output pixel in each of the three input rows, `coef` receives the
     // coefficients scaled by 256 (mIp), and the caller advances by count << 1
     // pixels afterwards, i.e. each unit of `count` accounts for two output pixels.
     extern "C" void rsdIntrinsicConvolve3x3_K(void *dst,
                                               const void *y0,
                                               const void *y1,
                                               const void *y2,
                                               const short *coef,
                                               uint32_t count);
     89 
     90 
     91 static void ConvolveOneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
     92                           const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
     93                           const float* coeff) {
     94 
     95     uint32_t x1 = rsMax((int32_t)x-1, 0);
     96     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
     97 
     98     float4 px = convert_float4(py0[x1]) * coeff[0] +
     99                 convert_float4(py0[x]) * coeff[1] +
    100                 convert_float4(py0[x2]) * coeff[2] +
    101                 convert_float4(py1[x1]) * coeff[3] +
    102                 convert_float4(py1[x]) * coeff[4] +
    103                 convert_float4(py1[x2]) * coeff[5] +
    104                 convert_float4(py2[x1]) * coeff[6] +
    105                 convert_float4(py2[x]) * coeff[7] +
    106                 convert_float4(py2[x2]) * coeff[8];
    107 
    108     px = clamp(px + 0.5f, 0.f, 255.f);
    109     uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
    110     *out = o;
    111 }
    112 
    113 static void ConvolveOneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
    114                           const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
    115                           const float* coeff) {
    116 
    117     uint32_t x1 = rsMax((int32_t)x-1, 0);
    118     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    119 
    120     float2 px = convert_float2(py0[x1]) * coeff[0] +
    121                 convert_float2(py0[x]) * coeff[1] +
    122                 convert_float2(py0[x2]) * coeff[2] +
    123                 convert_float2(py1[x1]) * coeff[3] +
    124                 convert_float2(py1[x]) * coeff[4] +
    125                 convert_float2(py1[x2]) * coeff[5] +
    126                 convert_float2(py2[x1]) * coeff[6] +
    127                 convert_float2(py2[x]) * coeff[7] +
    128                 convert_float2(py2[x2]) * coeff[8];
    129 
    130     px = clamp(px + 0.5f, 0.f, 255.f);
    131     *out = convert_uchar2(px);
    132 }
    133 
    134 static void ConvolveOneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
    135                           const uchar *py0, const uchar *py1, const uchar *py2,
    136                           const float* coeff) {
    137 
    138     uint32_t x1 = rsMax((int32_t)x-1, 0);
    139     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    140 
    141     float px = ((float)py0[x1]) * coeff[0] +
    142                ((float)py0[x]) * coeff[1] +
    143                ((float)py0[x2]) * coeff[2] +
    144                ((float)py1[x1]) * coeff[3] +
    145                ((float)py1[x]) * coeff[4] +
    146                ((float)py1[x2]) * coeff[5] +
    147                ((float)py2[x1]) * coeff[6] +
    148                ((float)py2[x]) * coeff[7] +
    149                ((float)py2[x2]) * coeff[8];
    150     *out = clamp(px + 0.5f, 0.f, 255.f);
    151 }
    152 
    153 static void ConvolveOneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
    154                           const float4 *py0, const float4 *py1, const float4 *py2,
    155                           const float* coeff) {
    156 
    157     uint32_t x1 = rsMax((int32_t)x-1, 0);
    158     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    159     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
    160            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
    161            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
    162 }
    163 
    164 static void ConvolveOneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
    165                           const float2 *py0, const float2 *py1, const float2 *py2,
    166                           const float* coeff) {
    167 
    168     uint32_t x1 = rsMax((int32_t)x-1, 0);
    169     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    170     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
    171            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
    172            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
    173 }
    174 
    175 static void ConvolveOneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
    176                           const float *py0, const float *py1, const float *py2,
    177                           const float* coeff) {
    178 
    179     uint32_t x1 = rsMax((int32_t)x-1, 0);
    180     uint32_t x2 = rsMin((int32_t)x+1, (int32_t)info->dim.x-1);
    181     *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
    182            (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
    183            (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
    184 }
    185 
    186 void RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsExpandKernelDriverInfo *info,
    187                                                 uint32_t xstart, uint32_t xend,
    188                                                 uint32_t outstep) {
    189     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    190 
    191     if (!cp->mAlloc.get()) {
    192         ALOGE("Convolve3x3 executed without input, skipping");
    193         return;
    194     }
    195     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    196     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    197 
    198     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    199     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    200     const uchar4 *py0 = (const uchar4 *)(pin + stride * y2);
    201     const uchar4 *py1 = (const uchar4 *)(pin + stride * info->current.y);
    202     const uchar4 *py2 = (const uchar4 *)(pin + stride * y1);
    203 
    204     uchar4 *out = (uchar4 *)info->outPtr[0];
    205     uint32_t x1 = xstart;
    206     uint32_t x2 = xend;
    207     if(x1 == 0) {
    208         ConvolveOneU4(info, 0, out, py0, py1, py2, cp->mFp);
    209         x1 ++;
    210         out++;
    211     }
    212 
    213     if(x2 > x1) {
    214 #if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
    215         if (gArchUseSIMD) {
    216             int32_t len = (x2 - x1 - 1) >> 1;
    217             if(len > 0) {
    218                 rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    219                 x1 += len << 1;
    220                 out += len << 1;
    221             }
    222         }
    223 #endif
    224 
    225         while(x1 != x2) {
    226             ConvolveOneU4(info, x1, out, py0, py1, py2, cp->mFp);
    227             out++;
    228             x1++;
    229         }
    230     }
    231 }
    232 
    233 void RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsExpandKernelDriverInfo *info,
    234                                                 uint32_t xstart, uint32_t xend,
    235                                                 uint32_t outstep) {
    236     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    237 
    238     if (!cp->mAlloc.get()) {
    239         ALOGE("Convolve3x3 executed without input, skipping");
    240         return;
    241     }
    242     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    243     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    244 
    245     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    246     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    247     const uchar2 *py0 = (const uchar2 *)(pin + stride * y2);
    248     const uchar2 *py1 = (const uchar2 *)(pin + stride * info->current.y);
    249     const uchar2 *py2 = (const uchar2 *)(pin + stride * y1);
    250 
    251     uchar2 *out = (uchar2 *)info->outPtr[0];
    252     uint32_t x1 = xstart;
    253     uint32_t x2 = xend;
    254     if(x1 == 0) {
    255         ConvolveOneU2(info, 0, out, py0, py1, py2, cp->mFp);
    256         x1 ++;
    257         out++;
    258     }
    259 
    260     if(x2 > x1) {
    261 #if 0//defined(ARCH_ARM_HAVE_NEON)
    262         int32_t len = (x2 - x1 - 1) >> 1;
    263         if(len > 0) {
    264             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    265             x1 += len << 1;
    266             out += len << 1;
    267         }
    268 #endif
    269 
    270         while(x1 != x2) {
    271             ConvolveOneU2(info, x1, out, py0, py1, py2, cp->mFp);
    272             out++;
    273             x1++;
    274         }
    275     }
    276 }
    277 
    278 void RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsExpandKernelDriverInfo *info,
    279                                                 uint32_t xstart, uint32_t xend,
    280                                                 uint32_t outstep) {
    281     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    282 
    283     if (!cp->mAlloc.get()) {
    284         ALOGE("Convolve3x3 executed without input, skipping");
    285         return;
    286     }
    287     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    288     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    289 
    290     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    291     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    292     const uchar *py0 = (const uchar *)(pin + stride * y2);
    293     const uchar *py1 = (const uchar *)(pin + stride * info->current.y);
    294     const uchar *py2 = (const uchar *)(pin + stride * y1);
    295 
    296     uchar *out = (uchar *)info->outPtr[0];
    297     uint32_t x1 = xstart;
    298     uint32_t x2 = xend;
    299     if(x1 == 0) {
    300         ConvolveOneU1(info, 0, out, py0, py1, py2, cp->mFp);
    301         x1 ++;
    302         out++;
    303     }
    304 
    305     if(x2 > x1) {
    306 #if 0//defined(ARCH_ARM_HAVE_NEON)
    307         int32_t len = (x2 - x1 - 1) >> 1;
    308         if(len > 0) {
    309             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    310             x1 += len << 1;
    311             out += len << 1;
    312         }
    313 #endif
    314 
    315         while(x1 != x2) {
    316             ConvolveOneU1(info, x1, out, py0, py1, py2, cp->mFp);
    317             out++;
    318             x1++;
    319         }
    320     }
    321 }
    322 
    323 void RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsExpandKernelDriverInfo *info,
    324                                                 uint32_t xstart, uint32_t xend,
    325                                                 uint32_t outstep) {
    326     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    327 
    328     if (!cp->mAlloc.get()) {
    329         ALOGE("Convolve3x3 executed without input, skipping");
    330         return;
    331     }
    332     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    333     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    334 
    335     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    336     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    337     const float4 *py0 = (const float4 *)(pin + stride * y2);
    338     const float4 *py1 = (const float4 *)(pin + stride * info->current.y);
    339     const float4 *py2 = (const float4 *)(pin + stride * y1);
    340 
    341     float4 *out = (float4 *)info->outPtr[0];
    342     uint32_t x1 = xstart;
    343     uint32_t x2 = xend;
    344     if(x1 == 0) {
    345         ConvolveOneF4(info, 0, out, py0, py1, py2, cp->mFp);
    346         x1 ++;
    347         out++;
    348     }
    349 
    350     if(x2 > x1) {
    351 #if 0//defined(ARCH_ARM_HAVE_NEON)
    352         int32_t len = (x2 - x1 - 1) >> 1;
    353         if(len > 0) {
    354             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    355             x1 += len << 1;
    356             out += len << 1;
    357         }
    358 #endif
    359 
    360         while(x1 != x2) {
    361             ConvolveOneF4(info, x1, out, py0, py1, py2, cp->mFp);
    362             out++;
    363             x1++;
    364         }
    365     }
    366 }
    367 
    368 void RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsExpandKernelDriverInfo *info,
    369                                                 uint32_t xstart, uint32_t xend,
    370                                                 uint32_t outstep) {
    371     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    372 
    373     if (!cp->mAlloc.get()) {
    374         ALOGE("Convolve3x3 executed without input, skipping");
    375         return;
    376     }
    377     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    378     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    379 
    380     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    381     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    382     const float2 *py0 = (const float2 *)(pin + stride * y2);
    383     const float2 *py1 = (const float2 *)(pin + stride * info->current.y);
    384     const float2 *py2 = (const float2 *)(pin + stride * y1);
    385 
    386     float2 *out = (float2 *)info->outPtr[0];
    387     uint32_t x1 = xstart;
    388     uint32_t x2 = xend;
    389     if(x1 == 0) {
    390         ConvolveOneF2(info, 0, out, py0, py1, py2, cp->mFp);
    391         x1 ++;
    392         out++;
    393     }
    394 
    395     if(x2 > x1) {
    396 #if 0//defined(ARCH_ARM_HAVE_NEON)
    397         int32_t len = (x2 - x1 - 1) >> 1;
    398         if(len > 0) {
    399             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    400             x1 += len << 1;
    401             out += len << 1;
    402         }
    403 #endif
    404 
    405         while(x1 != x2) {
    406             ConvolveOneF2(info, x1, out, py0, py1, py2, cp->mFp);
    407             out++;
    408             x1++;
    409         }
    410     }
    411 }
    412 void RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsExpandKernelDriverInfo *info,
    413                                                 uint32_t xstart, uint32_t xend,
    414                                                 uint32_t outstep) {
    415     RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)info->usr;
    416 
    417     if (!cp->mAlloc.get()) {
    418         ALOGE("Convolve3x3 executed without input, skipping");
    419         return;
    420     }
    421     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    422     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
    423 
    424     uint32_t y1 = rsMin((int32_t)info->current.y + 1, (int32_t)(info->dim.y-1));
    425     uint32_t y2 = rsMax((int32_t)info->current.y - 1, 0);
    426     const float *py0 = (const float *)(pin + stride * y2);
    427     const float *py1 = (const float *)(pin + stride * info->current.y);
    428     const float *py2 = (const float *)(pin + stride * y1);
    429 
    430     float *out = (float *)info->outPtr[0];
    431     uint32_t x1 = xstart;
    432     uint32_t x2 = xend;
    433     if(x1 == 0) {
    434         ConvolveOneF1(info, 0, out, py0, py1, py2, cp->mFp);
    435         x1 ++;
    436         out++;
    437     }
    438 
    439     if(x2 > x1) {
    440 #if 0//defined(ARCH_ARM_HAVE_NEON)
    441         int32_t len = (x2 - x1 - 1) >> 1;
    442         if(len > 0) {
    443             rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
    444             x1 += len << 1;
    445             out += len << 1;
    446         }
    447 #endif
    448 
    449         while(x1 != x2) {
    450             ConvolveOneF1(info, x1, out, py0, py1, py2, cp->mFp);
    451             out++;
    452             x1++;
    453         }
    454     }
    455 }
    456 
    457 RsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
    458             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
    459             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
    460 
    461     if (e->getType() == RS_TYPE_FLOAT_32) {
    462         switch(e->getVectorSize()) {
    463         case 1:
    464             mRootPtr = &kernelF1;
    465             break;
    466         case 2:
    467             mRootPtr = &kernelF2;
    468             break;
    469         case 3:
    470         case 4:
    471             mRootPtr = &kernelF4;
    472             break;
    473         }
    474     } else {
    475         switch(e->getVectorSize()) {
    476         case 1:
    477             mRootPtr = &kernelU1;
    478             break;
    479         case 2:
    480             mRootPtr = &kernelU2;
    481             break;
    482         case 3:
    483         case 4:
    484             mRootPtr = &kernelU4;
    485             break;
    486         }
    487     }
    488     for(int ct=0; ct < 9; ct++) {
    489         mFp[ct] = 1.f / 9.f;
    490         mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
    491     }
    492 }
    493 
    494 RsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() {
    495 }
    496 
    497 void RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) {
    498     s->mHal.info.exportedVariableCount = 2;
    499 }
    500 
    501 void RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
    502     mAlloc.clear();
    503 }
    504 
    505 
    506 RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
    507 
    508     return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
    509 }
    510