Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <sys/mman.h>
     18 #include <unistd.h>
     19 
     20 #include "rsCpuIntrinsic.h"
     21 #include "rsCpuIntrinsicInlines.h"
     22 #include "linkloader/include/MemChunk.h"
     23 #include "linkloader/utils/flush_cpu_cache.h"
     24 
     25 #include <sys/mman.h>
     26 #include <stddef.h>
     27 #include <stdint.h>
     28 #include <stdlib.h>
     29 //#include <utils/StopWatch.h>
     30 
     31 
     32 /*  uint kernel
     33  *  Q0  D0:  Load slot for R
     34  *      D1:  Load slot for G
     35  *  Q1  D2:  Load slot for B
     36  *      D3:  Load slot for A
     37  *  Q2  D4:  Matrix
     38  *      D5:  =
     39  *  Q3  D6:  =
     40  *      D7:  =
     41  *  Q4  D8:  Add R
     42  *      D9:
     43  *  Q5  D10: Add G
     44  *      D11:
     45  *  Q6  D12: Add B
     46  *      D13:
     47  *  Q7  D14: Add A
     48  *      D15:
     49  *  Q8  D16:  I32: R Sum
     50  *      D17:
     51  *  Q9  D18:  I32: G Sum
     52  *      D19:
     53  *  Q10 D20:  I32: B Sum
     54  *      D21:
     55  *  Q11 D22:  I32: A Sum
     56  *      D23:
     57  *  Q12 D24:  U16: expanded R
     58  *      D25:
     59  *  Q13 D26:  U16: expanded G
     60  *      D27:
     61  *  Q14 D28:  U16: expanded B
     62  *      D29:
     63  *  Q15 D30:  U16: expanded A
     64  *      D31:
     65  *
     66  */
     67 
     68 /*  float kernel
     69  *  Q0  D0:  Load slot for R
     70  *      D1:  =
     71  *  Q1  D2:  Load slot for G
     72  *      D3:  =
     73  *  Q2  D4:  Load slot for B
     74  *      D5:  =
     75  *  Q3  D6:  Load slot for A
     76  *      D7:  =
     77  *  Q4  D8:  Matrix
     78  *      D9:  =
     79  *  Q5  D10: =
     80  *      D11: =
     81  *  Q6  D12: =
     82  *      D13: =
     83  *  Q7  D14: =
     84  *      D15: =
     85  *  Q8  D16: Add R
     86  *      D17: =
     87  *  Q9  D18: Add G
     88  *      D19: =
     89  *  Q10 D20: Add B
     90  *      D21: =
     91  *  Q11 D22: Add A
     92  *      D23: =
     93  *  Q12 D24: Sum R
     94  *      D25: =
     95  *  Q13 D26: Sum G
     96  *      D27: =
     97  *  Q14 D28: Sum B
     98  *      D29: =
     99  *  Q15 D30: Sum A
    100  *      D31: =
    101  *
    102  */
    103 
    104 
    105 
    106 using namespace android;
    107 using namespace android::renderscript;
    108 
    109 namespace android {
    110 namespace renderscript {
    111 
// Key identifying one specialization of the color-matrix kernel.
// computeKey() clears the whole union through `key` and then fills in the
// bit-fields of `u`; the two members share the same storage.
typedef union {
    uint64_t key;
    struct {
        // Input vector size minus one (0 = scalar ... 3 = four components).
        uint32_t inVecSize          :2;  // [0 - 1]
        // Output vector size minus one, same encoding as inVecSize.
        uint32_t outVecSize         :2;  // [2 - 3]
        // RS_TYPE_FLOAT_32 when the input element is float, otherwise 0.
        uint32_t inType             :4;  // [4 - 7]
        // RS_TYPE_FLOAT_32 when the output element is float, otherwise 0.
        uint32_t outType            :4;  // [8 - 11]
        // Set when the r, g and b columns of the fixed-point matrix are
        // identical and no add term is present.
        uint32_t dot                :1;  // [12]
        uint32_t _unused1           :1;  // [13]
        // Set (integer in/out only) when alpha can be passed through unchanged.
        uint32_t copyAlpha          :1;  // [14]
        uint32_t _unused2           :1;  // [15]
        // Bit i is set when matrix coefficient i is non-zero and still
        // relevant for the selected input/output vector sizes.
        uint32_t coeffMask          :16; // [16-31]
        // Bit j is set when the add term for output channel j is needed.
        uint32_t addMask            :4;  // [32-35]
    } u;
} Key_t;
    127 
    128 //Re-enable when intrinsic is fixed
#if defined(ARCH_ARM64_USE_INTRINSICS)
// Table of code entry points handed to the ARM64 assembly kernels below.
// It is filled in by the rsdIntrinsicColorMatrixSetup_*_K routines and then
// passed (read-only) to rsdIntrinsicColorMatrix_*_K.
typedef struct {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;

// Fixed-point kernel: `mult` points at int16_t coefficients and `add` at
// int32_t add terms (kernel() passes the class's ip / ipa arrays).
extern "C" void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

// Float kernel: same shape as the integer kernel but with float coefficients
// and add terms.
extern "C" void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 */
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);
#endif
    162 
// CPU implementation of the color-matrix intrinsic: multiplies each input
// element by a 4x4 matrix and adds a 4-component offset.  Depending on the
// build configuration the work is done by generated ARM code (build()), by
// pre-built assembly kernels, or by the portable per-pixel fallback.
class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
public:
    virtual void populateScript(Script *);

    // Slot 0 replaces the matrix (fp), slot 1 replaces the add vector (fpa).
    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);

    virtual ~RsdCpuScriptIntrinsicColorMatrix();
    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);

protected:
    // Matrix coefficients and add terms exactly as supplied by setGlobalVar().
    float fp[16];
    float fpa[4];

    // The following four fields are read as constants
    // by the SIMD assembly code.
    // updateCoeffCache() derives them from fp / fpa:
    //   ip     = fp  * 256   (rounded, fixed point)
    //   ipa    = fpa * 65536 (rounded, fixed point)
    //   tmpFp  = fp  * fpMul
    //   tmpFpa = fpa * addMul (+ 0.5 when fpMul > 254)
    short ip[16];
    int ipa[4];
    float tmpFp[16];
    float tmpFpa[4];
#if defined(ARCH_ARM64_USE_INTRINSICS)
    FunctionTab_t mFnTab;
#endif

    // Per-row launch callback; installed as mRootPtr by setGlobalVar().
    static void kernel(const RsForEachStubParamStruct *p,
                       uint32_t xstart, uint32_t xend,
                       uint32_t instep, uint32_t outstep);
    void updateCoeffCache(float fpMul, float addMul);

    // Key of the current specialization; kernel() reads the vector sizes and
    // element types from it.
    Key_t mLastKey;
    // Buffer that build() maps and fills with generated code, and its size.
    unsigned char *mBuf;
    size_t mBufSize;

    Key_t computeKey(const Element *ein, const Element *eout);

    // Generates specialized code for `key` into mBuf; returns false when no
    // code was generated.
    bool build(Key_t key);

    // Optional optimized kernel; kernel() calls it with the number of
    // 4-pixel groups in `count`.
    void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);

};
    207 
    208 }
    209 }
    210 
    211 
    212 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
    213         const Element *ein, const Element *eout) {
    214 
    215     Key_t key;
    216     key.key = 0;
    217 
    218     // Compute a unique code key for this operation
    219 
    220     // Add to the key the input and output types
    221     bool hasFloat = false;
    222     if (ein->getType() == RS_TYPE_FLOAT_32) {
    223         hasFloat = true;
    224         key.u.inType = RS_TYPE_FLOAT_32;
    225         rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
    226     }
    227     if (eout->getType() == RS_TYPE_FLOAT_32) {
    228         hasFloat = true;
    229         key.u.outType = RS_TYPE_FLOAT_32;
    230         rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
    231     }
    232 
    233     // Mask in the bits indicating which coefficients in the
    234     // color matrix are needed.
    235     if (hasFloat) {
    236         for (uint32_t i=0; i < 16; i++) {
    237             if (fabs(fp[i]) != 0.f) {
    238                 key.u.coeffMask |= 1 << i;
    239             }
    240         }
    241         if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
    242         if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
    243         if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
    244         if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
    245 
    246     } else {
    247         for (uint32_t i=0; i < 16; i++) {
    248             if (ip[i] != 0) {
    249                 key.u.coeffMask |= 1 << i;
    250             }
    251         }
    252         if (ipa[0] != 0) key.u.addMask |= 0x1;
    253         if (ipa[1] != 0) key.u.addMask |= 0x2;
    254         if (ipa[2] != 0) key.u.addMask |= 0x4;
    255         if (ipa[3] != 0) key.u.addMask |= 0x8;
    256     }
    257 
    258     // Look for a dot product where the r,g,b colums are the same
    259     if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
    260         (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
    261         (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
    262         (ip[12] == ip[13]) && (ip[12] == ip[14])) {
    263 
    264         if (!key.u.addMask) key.u.dot = 1;
    265     }
    266 
    267     // Is alpha a simple copy
    268     if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
    269         key.u.copyAlpha = !(key.u.inType || key.u.outType);
    270     }
    271 
    272     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    273 
    274     switch (ein->getVectorSize()) {
    275     case 4:
    276         key.u.inVecSize = 3;
    277         break;
    278     case 3:
    279         key.u.inVecSize = 2;
    280         key.u.coeffMask &= ~0xF000;
    281         break;
    282     case 2:
    283         key.u.inVecSize = 1;
    284         key.u.coeffMask &= ~0xFF00;
    285         break;
    286     default:
    287         key.u.coeffMask &= ~0xFFF0;
    288         break;
    289     }
    290 
    291     switch (eout->getVectorSize()) {
    292     case 4:
    293         key.u.outVecSize = 3;
    294         break;
    295     case 3:
    296         key.u.outVecSize = 2;
    297         key.u.coeffMask &= ~0x8888;
    298         key.u.addMask &= 7;
    299         break;
    300     case 2:
    301         key.u.outVecSize = 1;
    302         key.u.coeffMask &= ~0xCCCC;
    303         key.u.addMask &= 3;
    304         break;
    305     default:
    306         key.u.coeffMask &= ~0xEEEE;
    307         key.u.addMask &= 1;
    308         break;
    309     }
    310 
    311     if (key.u.inType && !key.u.outType) {
    312         key.u.addMask |= 1;
    313         if (key.u.outVecSize > 0) key.u.addMask |= 2;
    314         if (key.u.outVecSize > 1) key.u.addMask |= 4;
    315         if (key.u.outVecSize > 2) key.u.addMask |= 8;
    316     }
    317 
    318     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    319     return key;
    320 }
    321 
    322 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    323 
// Declares the three external symbols that describe one pre-assembled code
// fragment named `x`:
//   _N_ColorMatrix_<x>      first word of the fragment (its address is the
//                           copy source used by ADD_CHUNK)
//   _N_ColorMatrix_<x>_end  NOTE(review): presumably marks the end of the
//                           fragment; not referenced in the code visible here
//   _N_ColorMatrix_<x>_len  word whose value is the fragment length in bytes
//                           (ADD_CHUNK passes it to memcpy as the size)
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float flavour) and the two epilogue pieces
// that build() places around the loop branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads, one per input vector size: u8 for the integer path, u8f for u8
// input on the float path, f32 for float input.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores, one per output vector size.  There is no 3-wide u8 / f32u store:
// build() uses the 4-wide fragment for 3-component output.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-path helpers: widen after load, narrow before store, the special
// dot-product fragment, and per-channel add fragments.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
    371 
    372 #define ADD_CHUNK(x) \
    373     memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
    374     buf += _N_ColorMatrix_##x##_len
    375 
    376 
    377 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
    378     size_t off = (target - buf - 8) >> 2;
    379     rsAssert(((off & 0xff000000) == 0) ||
    380            ((off & 0xff000000) == 0xff000000));
    381 
    382     uint32_t op = (condition << 28);
    383     op |= 0xa << 24;  // branch
    384     op |= 0xffffff & off;
    385     ((uint32_t *)buf)[0] = op;
    386     return buf + 4;
    387 }
    388 
    389 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
    390     rsAssert(vd < 32);
    391     rsAssert(vm < 32);
    392     rsAssert(vn < 32);
    393 
    394     uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
    395     op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
    396     op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
    397     return op;
    398 }
    399 
    400 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    401     //vmlal.s16 Q#1, D#1, D#2[#]
    402     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
    403     ((uint32_t *)buf)[0] = op;
    404     return buf + 4;
    405 }
    406 
    407 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    408     //vmull.s16 Q#1, D#1, D#2[#]
    409     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
    410     ((uint32_t *)buf)[0] = op;
    411     return buf + 4;
    412 }
    413 
    414 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    415     //vqadd.s32 Q#1, Q#1, Q#2
    416     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    417     ((uint32_t *)buf)[0] = op;
    418     return buf + 4;
    419 }
    420 
    421 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    422     //vmlal.f32 Q#1, D#1, D#2[#]
    423     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
    424     ((uint32_t *)buf)[0] = op;
    425     return buf + 4;
    426 }
    427 
    428 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    429     //vmull.f32 Q#1, D#1, D#2[#]
    430     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
    431     ((uint32_t *)buf)[0] = op;
    432     return buf + 4;
    433 }
    434 
    435 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    436     //vadd.f32 Q#1, D#1, D#2
    437     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    438     ((uint32_t *)buf)[0] = op;
    439     return buf + 4;
    440 }
    441 
    442 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
    443     //vmov.32 Q#1, #imm
    444     rsAssert(imm == 0);
    445     uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
    446     ((uint32_t *)buf)[0] = op;
    447     return buf + 4;
    448 }
    449 
    450 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    451     //vadd.f32 Q#1, D#1, D#2
    452     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    453     ((uint32_t *)buf)[0] = op;
    454     return buf + 4;
    455 }
    456 #endif
    457 
    458 #if defined(ARCH_X86_HAVE_SSSE3)
    459 extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
    460                                   const short *coef, uint32_t count);
    461 extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
    462                                   const short *coef, uint32_t count);
    463 extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
    464                                   const short *coef, uint32_t count);
    465 
    466 void * selectKernel(Key_t key)
    467 {
    468     void * kernel = NULL;
    469 
    470     // inType, outType float if nonzero
    471     if (!(key.u.inType || key.u.outType)) {
    472         if (key.u.dot)
    473             kernel = (void *)rsdIntrinsicColorMatrixDot_K;
    474         else if (key.u.copyAlpha)
    475             kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
    476         else
    477             kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
    478     }
    479 
    480     return kernel;
    481 }
    482 #endif
    483 
// Generates a specialized ARM (32-bit NEON) kernel for `key` into a freshly
// mapped buffer (mBuf / mBufSize) by stitching together pre-assembled
// fragments (ADD_CHUNK) and individually encoded multiply / add
// instructions, then makes the buffer executable.
//
// Returns true when the code was generated and protected successfully;
// returns false when the mapping or mprotect fails, or when the build is not
// configured for 32-bit ARM intrinsics.
//
// Register usage of the generated code follows the "uint kernel" and
// "float kernel" tables at the top of this file.
bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    mBufSize = 4096;
    //StopWatch build_time("rs cm: build time");
    // The buffer is writable while code is emitted and is switched to
    // read+execute at the end.
    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANON, -1, 0);
    if (mBuf == MAP_FAILED) {
        mBuf = NULL;
        return false;
    }

    // buf is the write cursor; buf2 remembers the loop entry (just after the
    // prefix) that the final branch jumps back to.
    uint8_t *buf = mBuf;
    uint8_t *buf2 = NULL;

    // ops[i][j] describes the product "input channel i * coefficient for
    // output channel j":
    //   0 = not needed
    //   2 = first product for output j (emitted as a multiply)
    //   3 = later product for output j (emitted as a multiply-accumulate)
    // Only rows 0..3 are used.  opInit[j] becomes 1 once output j has
    // received its first product.
    int ops[5][4];
    int opInit[4] = {0, 0, 0, 0};

    memset(ops, 0, sizeof(ops));
    for (int i=0; i < 4; i++) {
        if (key.u.coeffMask & (1 << (i*4))) {
            ops[i][0] = 0x2 | opInit[0];
            opInit[0] = 1;
        }
        // For a dot product only output 0 is computed here.
        if (!key.u.dot) {
            if (key.u.coeffMask & (1 << (1 + i*4))) {
                ops[i][1] = 0x2 | opInit[1];
                opInit[1] = 1;
            }
            if (key.u.coeffMask & (1 << (2 + i*4))) {
                ops[i][2] = 0x2 | opInit[2];
                opInit[2] = 1;
            }
        }
        // When alpha is a plain copy no alpha products are generated.
        if (!key.u.copyAlpha) {
            if (key.u.coeffMask & (1 << (3 + i*4))) {
                ops[i][3] = 0x2 | opInit[3];
                opInit[3] = 1;
            }
        }
    }

    if (key.u.inType || key.u.outType) {
        // ---- Float path: at least one side is float. ----
        key.u.copyAlpha = 0;
        ADD_CHUNK(prefix_f);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        if (key.u.inType) {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_f32_4);
                break;
            case 2:
                ADD_CHUNK(load_f32_3);
                break;
            case 1:
                ADD_CHUNK(load_f32_2);
                break;
            case 0:
                ADD_CHUNK(load_f32_1);
                break;
            }
        } else {
            // u8 input feeding the float pipeline.
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_u8f_4);
                break;
            case 2:
                ADD_CHUNK(load_u8f_3);
                break;
            case 1:
                ADD_CHUNK(load_u8f_2);
                break;
            case 0:
                ADD_CHUNK(load_u8f_1);
                break;
            }
        }

        // Products: sum register Q(12+j), input channel i in D(i*2), and the
        // coefficient taken from lane (j & 1) of D(8 + i*2 + (j >> 1)).
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                case 3:
                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                }
            }
        }
        // Move each result into Q(j): sum + add, sum only, add only, or zero
        // when output j has neither products nor an add term.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVADD_F32(buf, j, 12+j, 8+j);
                } else {
                    buf = addVORR_32(buf, j, 12+j, 12+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, j, 8+j, 8+j);
                } else {
                    buf = addVMOV_32(buf, j, 0);
                }
            }
        }

        if (key.u.outType) {
            switch(key.u.outVecSize) {
            case 3:
                ADD_CHUNK(store_f32_4);
                break;
            case 2:
                ADD_CHUNK(store_f32_3);
                break;
            case 1:
                ADD_CHUNK(store_f32_2);
                break;
            case 0:
                ADD_CHUNK(store_f32_1);
                break;
            }
        } else {
            // Float pipeline with u8 output; 3-component output uses the
            // 4-wide store fragment.
            switch(key.u.outVecSize) {
            case 3:
            case 2:
                ADD_CHUNK(store_f32u_4);
                break;
            case 1:
                ADD_CHUNK(store_f32u_2);
                break;
            case 0:
                ADD_CHUNK(store_f32u_1);
                break;
            }
        }


    } else {
        // ---- Integer path: u8 in, u8 out. ----
        // Add the function prefix
        // Store the address for the loop return
        ADD_CHUNK(prefix_i);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        switch(key.u.inVecSize) {
        case 3:
            ADD_CHUNK(load_u8_4);
            // With copyAlpha only r,g,b are widened.
            if (key.u.copyAlpha) {
                ADD_CHUNK(unpack_u8_3);
            } else {
                ADD_CHUNK(unpack_u8_4);
            }
            break;
        case 2:
            ADD_CHUNK(load_u8_3);
            ADD_CHUNK(unpack_u8_3);
            break;
        case 1:
            ADD_CHUNK(load_u8_2);
            ADD_CHUNK(unpack_u8_2);
            break;
        case 0:
            ADD_CHUNK(load_u8_1);
            ADD_CHUNK(unpack_u8_1);
            break;
        }

        // Add multiply and accumulate
        // use MULL to init the output register,
        // use MLAL from there
        // Sum register Q(8+j), expanded input channel i in D(24 + i*2), and
        // the coefficient taken from lane j of D(4+i).
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                case 3:
                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                }
            }
        }
        // Apply the add terms held in Q(4+j): saturating add onto an existing
        // sum, or a plain move when output j has no products.  Outputs with
        // neither products nor an add term get no instruction here.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
                }
            }
        }

        // If we have a dot product, perform the special pack.
        if (key.u.dot) {
            ADD_CHUNK(pack_u8_1);
            ADD_CHUNK(dot);
        } else {
            switch(key.u.outVecSize) {
            case 3:
                if (key.u.copyAlpha) {
                    ADD_CHUNK(pack_u8_3);
                } else {
                    ADD_CHUNK(pack_u8_4);
                }
                break;
            case 2:
                ADD_CHUNK(pack_u8_3);
                break;
            case 1:
                ADD_CHUNK(pack_u8_2);
                break;
            case 0:
                ADD_CHUNK(pack_u8_1);
                break;
            }
        }

        // Write out result
        switch(key.u.outVecSize) {
        case 3:
        case 2:
            ADD_CHUNK(store_u8_4);
            break;
        case 1:
            ADD_CHUNK(store_u8_2);
            break;
        case 0:
            ADD_CHUNK(store_u8_1);
            break;
        }
    }

    // NOTE(review): `key` is a by-value parameter and is not read again
    // below, so these two assignments have no effect.  They may have been
    // intended to run before the ops table was computed -- confirm intent.
    if (key.u.inType != key.u.outType) {
        key.u.copyAlpha = 0;
        key.u.dot = 0;
    }

    // Loop, branch, and cleanup
    // 0x01 is the condition code placed in the branch back to the loop entry.
    ADD_CHUNK(postfix1);
    buf = addBranch(buf, buf2, 0x01);
    ADD_CHUNK(postfix2);

    // Switch the buffer from writable to executable.
    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    if (ret == -1) {
        // NOTE(review): ret is always -1 on this path, so the message carries
        // no detail; logging errno would be more useful.  mBuf stays mapped
        // when returning here -- presumably released by the owner; verify.
        ALOGE("mprotect error %i", ret);
        return false;
    }

    // Make sure the instruction stream sees the freshly written code.
    FLUSH_CPU_CACHE(mBuf, (char*) mBuf + mBufSize);
    return true;
#else
    return false;
#endif
}
    745 
    746 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
    747     for(int ct=0; ct < 16; ct++) {
    748         ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
    749         tmpFp[ct] = fp[ct] * fpMul;
    750         //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
    751     }
    752 
    753     float add = 0.f;
    754     if (fpMul > 254.f) add = 0.5f;
    755     for(int ct=0; ct < 4; ct++) {
    756         tmpFpa[ct] = fpa[ct] * addMul + add;
    757         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
    758     }
    759 
    760     for(int ct=0; ct < 4; ct++) {
    761         ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
    762     }
    763 }
    764 
    765 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
    766                                                     size_t dataLength) {
    767     switch(slot) {
    768     case 0:
    769         memcpy (fp, data, sizeof(fp));
    770         break;
    771     case 1:
    772         memcpy (fpa, data, sizeof(fpa));
    773         break;
    774     default:
    775         rsAssert(0);
    776         break;
    777     }
    778     mRootPtr = &kernel;
    779 }
    780 
    781 
    782 static void One(const RsForEachStubParamStruct *p, void *out,
    783                 const void *py, const float* coeff, const float *add,
    784                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
    785 
    786     float4 f = 0.f;
    787     if (fin) {
    788         switch(vsin) {
    789         case 3:
    790             f = ((const float4 *)py)[0];
    791             break;
    792         case 2:
    793             f = ((const float4 *)py)[0];
    794             f.w = 0.f;
    795             break;
    796         case 1:
    797             f.xy = ((const float2 *)py)[0];
    798             break;
    799         case 0:
    800             f.x = ((const float *)py)[0];
    801             break;
    802         }
    803     } else {
    804         switch(vsin) {
    805         case 3:
    806             f = convert_float4(((const uchar4 *)py)[0]);
    807             break;
    808         case 2:
    809             f = convert_float4(((const uchar4 *)py)[0]);
    810             f.w = 0.f;
    811             break;
    812         case 1:
    813             f.xy = convert_float2(((const uchar2 *)py)[0]);
    814             break;
    815         case 0:
    816             f.x = (float)(((const uchar *)py)[0]);
    817             break;
    818         }
    819     }
    820     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
    821 
    822     float4 sum;
    823     sum.x = f.x * coeff[0] +
    824             f.y * coeff[4] +
    825             f.z * coeff[8] +
    826             f.w * coeff[12];
    827     sum.y = f.x * coeff[1] +
    828             f.y * coeff[5] +
    829             f.z * coeff[9] +
    830             f.w * coeff[13];
    831     sum.z = f.x * coeff[2] +
    832             f.y * coeff[6] +
    833             f.z * coeff[10] +
    834             f.w * coeff[14];
    835     sum.w = f.x * coeff[3] +
    836             f.y * coeff[7] +
    837             f.z * coeff[11] +
    838             f.w * coeff[15];
    839     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
    840 
    841     sum.x += add[0];
    842     sum.y += add[1];
    843     sum.z += add[2];
    844     sum.w += add[3];
    845 
    846 
    847     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
    848     if (fout) {
    849         switch(vsout) {
    850         case 3:
    851         case 2:
    852             ((float4 *)out)[0] = sum;
    853             break;
    854         case 1:
    855             ((float2 *)out)[0] = sum.xy;
    856             break;
    857         case 0:
    858             ((float *)out)[0] = sum.x;
    859             break;
    860         }
    861     } else {
    862         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
    863         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
    864         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
    865         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
    866 
    867         switch(vsout) {
    868         case 3:
    869         case 2:
    870             ((uchar4 *)out)[0] = convert_uchar4(sum);
    871             break;
    872         case 1:
    873             ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
    874             break;
    875         case 0:
    876             ((uchar *)out)[0] = sum.x;
    877             break;
    878         }
    879     }
    880     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
    881 }
    882 
    883 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
    884                                               uint32_t xstart, uint32_t xend,
    885                                               uint32_t instep, uint32_t outstep) {
    886     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
    887     uchar *out = (uchar *)p->out;
    888     uchar *in = (uchar *)p->in;
    889     uint32_t x1 = xstart;
    890     uint32_t x2 = xend;
    891 
    892     uint32_t vsin = cp->mLastKey.u.inVecSize;
    893     uint32_t vsout = cp->mLastKey.u.outVecSize;
    894     bool floatIn = !!cp->mLastKey.u.inType;
    895     bool floatOut = !!cp->mLastKey.u.outType;
    896 
    897     //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
    898 
    899     if(x2 > x1) {
    900         int32_t len = x2 - x1;
    901         if (gArchUseSIMD) {
    902             if((cp->mOptKernel != NULL) && (len >= 4)) {
    903                 // The optimized kernel processes 4 pixels at once
    904                 // and requires a minimum of 1 chunk of 4
    905                 cp->mOptKernel(out, in, cp->ip, len >> 2);
    906                 // Update the len and pointers so the generic code can
    907                 // finish any leftover pixels
    908                 len &= ~3;
    909                 x1 += len;
    910                 out += outstep * len;
    911                 in += instep * len;
    912             }
    913 #if defined(ARCH_ARM64_USE_INTRINSICS)
    914             else {
    915                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
    916                     // Currently this generates off by one errors.
    917                     //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
    918                     //x1 += len;
    919                     //out += outstep * len;
    920                     //in += instep * len;
    921                 } else {
    922                     rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
    923                     x1 += len;
    924                     out += outstep * len;
    925                     in += instep * len;
    926                 }
    927             }
    928 #endif
    929         }
    930 
    931         while(x1 != x2) {
    932             One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
    933             out += outstep;
    934             in += instep;
    935             x1++;
    936         }
    937     }
    938 }
    939 
    940 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
    941         uint32_t slot, const Allocation * ain, Allocation * aout,
    942         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
    943 
    944     const Element *ein = ain->mHal.state.type->getElement();
    945     const Element *eout = aout->mHal.state.type->getElement();
    946 
    947     if (ein->getType() == eout->getType()) {
    948         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
    949             updateCoeffCache(1.f, 255.f);
    950         } else {
    951             updateCoeffCache(1.f, 1.f);
    952         }
    953     } else {
    954         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
    955             updateCoeffCache(255.f, 255.f);
    956         } else {
    957             updateCoeffCache(1.f / 255.f, 1.f);
    958         }
    959     }
    960 
    961     Key_t key = computeKey(ain->mHal.state.type->getElement(),
    962                            aout->mHal.state.type->getElement());
    963 #if defined(ARCH_X86_HAVE_SSSE3)
    964     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
    965         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
    966         // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
    967         mLastKey = key;
    968     }
    969 
    970 #else //if !defined(ARCH_X86_HAVE_SSSE3)
    971     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
    972         if (mBuf) munmap(mBuf, mBufSize);
    973         mBuf = NULL;
    974         mOptKernel = NULL;
    975         if (build(key)) {
    976             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
    977         }
    978 #if defined(ARCH_ARM64_USE_INTRINSICS)
    979         else {
    980             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
    981             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
    982             uint32_t mm = 0;
    983             int i;
    984             for (i = 0; i < 4; i++)
    985             {
    986                 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
    987                 m = ((m * 0x249) >> 9) & 15;
    988                 m |= ((key.u.addMask >> i) & 1) << 4;
    989                 mm |= m << (i * 5);
    990             }
    991 
    992             if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
    993                 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
    994             } else {
    995                 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
    996             }
    997         }
    998 #endif
    999         mLastKey = key;
   1000     }
   1001 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
   1002 }
   1003 
   1004 void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
   1005         uint32_t slot, const Allocation * ain, Allocation * aout,
   1006         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
   1007 
   1008 }
   1009 
   1010 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
   1011             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
   1012             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
   1013 
   1014     mLastKey.key = 0;
   1015     mBuf = NULL;
   1016     mBufSize = 0;
   1017     mOptKernel = NULL;
   1018     const static float defaultMatrix[] = {
   1019         1.f, 0.f, 0.f, 0.f,
   1020         0.f, 1.f, 0.f, 0.f,
   1021         0.f, 0.f, 1.f, 0.f,
   1022         0.f, 0.f, 0.f, 1.f
   1023     };
   1024     const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
   1025     setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
   1026     setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
   1027 }
   1028 
   1029 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
   1030     if (mBuf) munmap(mBuf, mBufSize);
   1031     mBuf = NULL;
   1032     mOptKernel = NULL;
   1033 }
   1034 
   1035 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
   1036     s->mHal.info.exportedVariableCount = 2;
   1037 }
   1038 
   1039 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
   1040                                             const Script *s, const Element *e) {
   1041 
   1042     return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
   1043 }
   1044