Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <sys/mman.h>
     18 #include <unistd.h>
     19 
     20 #include "rsCpuIntrinsic.h"
     21 #include "rsCpuIntrinsicInlines.h"
     22 #include "linkloader/include/MemChunk.h"
     23 
     24 #include <sys/mman.h>
     25 #include <stddef.h>
     26 #include <stdint.h>
     27 #include <stdlib.h>
     28 //#include <utils/StopWatch.h>
     29 
     30 
     31 /*  uint kernel
     32  *  Q0  D0:  Load slot for R
     33  *      D1:  Load slot for G
     34  *  Q1  D2:  Load slot for B
     35  *      D3:  Load slot for A
     36  *  Q2  D4:  Matrix
     37  *      D5:  =
     38  *  Q3  D6:  =
     39  *      D7:  =
     40  *  Q4  D8:  Add R
     41  *      D9:
     42  *  Q5  D10: Add G
     43  *      D11:
     44  *  Q6  D12: Add B
     45  *      D13:
     46  *  Q7  D14: Add A
     47  *      D15:
     48  *  Q8  D16:  I32: R Sum
     49  *      D17:
     50  *  Q9  D18:  I32: G Sum
     51  *      D19:
     52  *  Q10 D20:  I32: B Sum
     53  *      D21:
     54  *  Q11 D22:  I32: A Sum
     55  *      D23:
     56  *  Q12 D24:  U16: expanded R
     57  *      D25:
     58  *  Q13 D26:  U16: expanded G
     59  *      D27:
     60  *  Q14 D28:  U16: expanded B
     61  *      D29:
     62  *  Q15 D30:  U16: expanded A
     63  *      D31:
     64  *
     65  */
     66 
     67 /*  float kernel
     68  *  Q0  D0:  Load slot for R
     69  *      D1:  =
     70  *  Q1  D2:  Load slot for G
     71  *      D3:  =
     72  *  Q2  D4:  Load slot for B
     73  *      D5:  =
     74  *  Q3  D6:  Load slot for A
     75  *      D7:  =
     76  *  Q4  D8:  Matrix
     77  *      D9:  =
     78  *  Q5  D10: =
     79  *      D11: =
     80  *  Q6  D12: =
     81  *      D13: =
     82  *  Q7  D14: =
     83  *      D15: =
     84  *  Q8  D16: Add R
     85  *      D17: =
     86  *  Q9  D18: Add G
     87  *      D19: =
     88  *  Q10 D20: Add B
     89  *      D21: =
     90  *  Q11 D22: Add A
     91  *      D23: =
     92  *  Q12 D24: Sum R
     93  *      D25: =
     94  *  Q13 D26: Sum G
     95  *      D27: =
     96  *  Q14 D28: Sum B
     97  *      D29: =
     98  *  Q15 D30: Sum A
     99  *      D31: =
    100  *
    101  */
    102 
    103 
    104 
    105 using namespace android;
    106 using namespace android::renderscript;
    107 
    108 namespace android {
    109 namespace renderscript {
    110 
    111 typedef union {
    112     uint64_t key;
    113     struct {
    114         uint32_t inVecSize          :2;  // [0 - 1]
    115         uint32_t outVecSize         :2;  // [2 - 3]
    116         uint32_t inType             :4;  // [4 - 7]
    117         uint32_t outType            :4;  // [8 - 11]
    118         uint32_t dot                :1;  // [12]
    119         uint32_t _unused1           :1;  // [13]
    120         uint32_t copyAlpha          :1;  // [14]
    121         uint32_t _unused2           :1;  // [15]
    122         uint32_t coeffMask          :16; // [16-31]
    123         uint32_t addMask            :4;  // [32-35]
    124     } u;
    125 } Key_t;
    126 
    127 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
    128 public:
    129     virtual void populateScript(Script *);
    130 
    131     virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
    132 
    133     virtual ~RsdCpuScriptIntrinsicColorMatrix();
    134     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
    135 
    136     virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
    137                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
    138     virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
    139                             const void * usr, uint32_t usrLen, const RsScriptCall *sc);
    140 
    141 protected:
    142     float fp[16];
    143     float fpa[4];
    144 
    145     // The following four fields are read as constants
    146     // by the SIMD assembly code.
    147     short ip[16];
    148     int ipa[16];
    149     float tmpFp[16];
    150     float tmpFpa[16];
    151 
    152     static void kernel(const RsForEachStubParamStruct *p,
    153                        uint32_t xstart, uint32_t xend,
    154                        uint32_t instep, uint32_t outstep);
    155     void updateCoeffCache(float fpMul, float addMul);
    156 
    157     Key_t mLastKey;
    158     unsigned char *mBuf;
    159     size_t mBufSize;
    160 
    161     Key_t computeKey(const Element *ein, const Element *eout);
    162 
    163     bool build(Key_t key);
    164 
    165     void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
    166 
    167 };
    168 
    169 }
    170 }
    171 
    172 
    173 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
    174         const Element *ein, const Element *eout) {
    175 
    176     Key_t key;
    177     key.key = 0;
    178 
    179     // Compute a unique code key for this operation
    180 
    181     // Add to the key the input and output types
    182     bool hasFloat = false;
    183     if (ein->getType() == RS_TYPE_FLOAT_32) {
    184         hasFloat = true;
    185         key.u.inType = RS_TYPE_FLOAT_32;
    186         rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
    187     }
    188     if (eout->getType() == RS_TYPE_FLOAT_32) {
    189         hasFloat = true;
    190         key.u.outType = RS_TYPE_FLOAT_32;
    191         rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
    192     }
    193 
    194     // Mask in the bits indicating which coefficients in the
    195     // color matrix are needed.
    196     if (hasFloat) {
    197         for (uint32_t i=0; i < 16; i++) {
    198             if (fabs(fp[i]) != 0.f) {
    199                 key.u.coeffMask |= 1 << i;
    200             }
    201         }
    202         if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
    203         if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
    204         if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
    205         if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
    206 
    207     } else {
    208         for (uint32_t i=0; i < 16; i++) {
    209             if (ip[i] != 0) {
    210                 key.u.coeffMask |= 1 << i;
    211             }
    212         }
    213         if (ipa[0] != 0) key.u.addMask |= 0x1;
    214         if (ipa[4] != 0) key.u.addMask |= 0x2;
    215         if (ipa[8] != 0) key.u.addMask |= 0x4;
    216         if (ipa[12] != 0) key.u.addMask |= 0x8;
    217     }
    218 
    219     // Look for a dot product where the r,g,b colums are the same
    220     if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
    221         (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
    222         (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
    223         (ip[12] == ip[13]) && (ip[12] == ip[14])) {
    224 
    225         if (!key.u.addMask) key.u.dot = 1;
    226     }
    227 
    228     // Is alpha a simple copy
    229     if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
    230         key.u.copyAlpha = !(key.u.inType || key.u.outType);
    231     }
    232 
    233     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    234 
    235     switch (ein->getVectorSize()) {
    236     case 4:
    237         key.u.inVecSize = 3;
    238         break;
    239     case 3:
    240         key.u.inVecSize = 2;
    241         key.u.coeffMask &= ~0xF000;
    242         break;
    243     case 2:
    244         key.u.inVecSize = 1;
    245         key.u.coeffMask &= ~0xFF00;
    246         break;
    247     default:
    248         key.u.coeffMask &= ~0xFFF0;
    249         break;
    250     }
    251 
    252     switch (eout->getVectorSize()) {
    253     case 4:
    254         key.u.outVecSize = 3;
    255         break;
    256     case 3:
    257         key.u.outVecSize = 2;
    258         key.u.coeffMask &= ~0x8888;
    259         break;
    260     case 2:
    261         key.u.outVecSize = 1;
    262         key.u.coeffMask &= ~0xCCCC;
    263         break;
    264     default:
    265         key.u.coeffMask &= ~0xEEEE;
    266         break;
    267     }
    268 
    269     if (key.u.inType && !key.u.outType) {
    270         key.u.addMask |= 1;
    271         if (key.u.outVecSize > 0) key.u.addMask |= 2;
    272         if (key.u.outVecSize > 1) key.u.addMask |= 4;
    273         if (key.u.outVecSize > 2) key.u.addMask |= 8;
    274     }
    275 
    276     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    277     return key;
    278 }
    279 
    280 #if defined(ARCH_ARM_HAVE_NEON)
    281 
// Declares the three external symbols that describe one pre-assembled code
// chunk named `x`:
//   _N_ColorMatrix_<x>      first word of the chunk (its address is the
//                           copy source used by ADD_CHUNK)
//   _N_ColorMatrix_<x>_end  end marker (not referenced in this file)
//   _N_ColorMatrix_<x>_len  word whose value ADD_CHUNK uses as the byte count
// NOTE(review): the definitions are presumably provided by an accompanying
// assembly source -- confirm.
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float flavour) and the two epilogue pieces
// that build() places around the loop branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loaders: u8 input for the integer path, u8 input for the float path (u8f),
// and float input; the suffix is the vector size.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores: u8 output from the integer path, float output, and u8 output from
// the float path (f32u).  There is no 3-wide u8 store; build() uses the
// 4-wide chunk for 3-component outputs.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-path helpers: widen after load, narrow before store, the special
// dot-product tail, and the add_* chunks (the latter are not referenced by
// build() in this file).
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
    329 
    330 #define ADD_CHUNK(x) \
    331     memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
    332     buf += _N_ColorMatrix_##x##_len
    333 
    334 
    335 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
    336     size_t off = (target - buf - 8) >> 2;
    337     rsAssert(((off & 0xff000000) == 0) ||
    338            ((off & 0xff000000) == 0xff000000));
    339 
    340     uint32_t op = (condition << 28);
    341     op |= 0xa << 24;  // branch
    342     op |= 0xffffff & off;
    343     ((uint32_t *)buf)[0] = op;
    344     return buf + 4;
    345 }
    346 
    347 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
    348     rsAssert(vd < 32);
    349     rsAssert(vm < 32);
    350     rsAssert(vn < 32);
    351 
    352     uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
    353     op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
    354     op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
    355     return op;
    356 }
    357 
    358 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    359     //vmlal.s16 Q#1, D#1, D#2[#]
    360     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
    361     ((uint32_t *)buf)[0] = op;
    362     return buf + 4;
    363 }
    364 
    365 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    366     //vmull.s16 Q#1, D#1, D#2[#]
    367     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
    368     ((uint32_t *)buf)[0] = op;
    369     return buf + 4;
    370 }
    371 
    372 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    373     //vqadd.s32 Q#1, D#1, D#2
    374     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    375     ((uint32_t *)buf)[0] = op;
    376     return buf + 4;
    377 }
    378 
    379 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    380     //vmlal.f32 Q#1, D#1, D#2[#]
    381     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
    382     ((uint32_t *)buf)[0] = op;
    383     return buf + 4;
    384 }
    385 
    386 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    387     //vmull.f32 Q#1, D#1, D#2[#]
    388     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
    389     ((uint32_t *)buf)[0] = op;
    390     return buf + 4;
    391 }
    392 
    393 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    394     //vadd.f32 Q#1, D#1, D#2
    395     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    396     ((uint32_t *)buf)[0] = op;
    397     return buf + 4;
    398 }
    399 
    400 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    401     //vadd.f32 Q#1, D#1, D#2
    402     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    403     ((uint32_t *)buf)[0] = op;
    404     return buf + 4;
    405 }
    406 #endif
    407 
    408 
    409 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
    410 #if defined(ARCH_ARM_HAVE_NEON)
    411     mBufSize = 4096;
    412     //StopWatch build_time("rs cm: build time");
    413     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
    414                                   MAP_PRIVATE | MAP_ANON, -1, 0);
    415     if (!mBuf) {
    416         return false;
    417     }
    418 
    419     uint8_t *buf = mBuf;
    420     uint8_t *buf2 = NULL;
    421 
    422     int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
    423     int opInit[4] = {0, 0, 0, 0};
    424 
    425     memset(ops, 0, sizeof(ops));
    426     for (int i=0; i < 4; i++) {
    427         if (key.u.coeffMask & (1 << (i*4))) {
    428             ops[i][0] = 0x2 | opInit[0];
    429             opInit[0] = 1;
    430         }
    431         if (!key.u.dot) {
    432             if (key.u.coeffMask & (1 << (1 + i*4))) {
    433                 ops[i][1] = 0x2 | opInit[1];
    434                 opInit[1] = 1;
    435             }
    436             if (key.u.coeffMask & (1 << (2 + i*4))) {
    437                 ops[i][2] = 0x2 | opInit[2];
    438                 opInit[2] = 1;
    439             }
    440         }
    441         if (!key.u.copyAlpha) {
    442             if (key.u.coeffMask & (1 << (3 + i*4))) {
    443                 ops[i][3] = 0x2 | opInit[3];
    444                 opInit[3] = 1;
    445             }
    446         }
    447     }
    448 
    449     if (key.u.inType || key.u.outType) {
    450         key.u.copyAlpha = 0;
    451         ADD_CHUNK(prefix_f);
    452         buf2 = buf;
    453 
    454         // Load the incoming r,g,b,a as needed
    455         if (key.u.inType) {
    456             switch(key.u.inVecSize) {
    457             case 3:
    458                 ADD_CHUNK(load_f32_4);
    459                 break;
    460             case 2:
    461                 ADD_CHUNK(load_f32_3);
    462                 break;
    463             case 1:
    464                 ADD_CHUNK(load_f32_2);
    465                 break;
    466             case 0:
    467                 ADD_CHUNK(load_f32_1);
    468                 break;
    469             }
    470         } else {
    471             switch(key.u.inVecSize) {
    472             case 3:
    473                 ADD_CHUNK(load_u8f_4);
    474                 break;
    475             case 2:
    476                 ADD_CHUNK(load_u8f_3);
    477                 break;
    478             case 1:
    479                 ADD_CHUNK(load_u8f_2);
    480                 break;
    481             case 0:
    482                 ADD_CHUNK(load_u8f_1);
    483                 break;
    484             }
    485         }
    486 
    487         for (int i=0; i < 4; i++) {
    488             for (int j=0; j < 4; j++) {
    489                 switch(ops[i][j]) {
    490                 case 0:
    491                     break;
    492                 case 2:
    493                     buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
    494                     break;
    495                 case 3:
    496                     buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
    497                     break;
    498                 }
    499             }
    500         }
    501         for (int j=0; j < 4; j++) {
    502             if (opInit[j]) {
    503                 if (key.u.addMask & (1 << j)) {
    504                     buf = addVADD_F32(buf, j, 12+j, 8+j);
    505                 } else {
    506                     buf = addVORR_32(buf, j, 12+j, 12+j);
    507                 }
    508             } else {
    509                 if (key.u.addMask & (1 << j)) {
    510                     buf = addVADD_F32(buf, j, j, 8+j);
    511                 }
    512             }
    513         }
    514 
    515         if (key.u.outType) {
    516             switch(key.u.outVecSize) {
    517             case 3:
    518                 ADD_CHUNK(store_f32_4);
    519                 break;
    520             case 2:
    521                 ADD_CHUNK(store_f32_3);
    522                 break;
    523             case 1:
    524                 ADD_CHUNK(store_f32_2);
    525                 break;
    526             case 0:
    527                 ADD_CHUNK(store_f32_1);
    528                 break;
    529             }
    530         } else {
    531             switch(key.u.outVecSize) {
    532             case 3:
    533             case 2:
    534                 ADD_CHUNK(store_f32u_4);
    535                 break;
    536             case 1:
    537                 ADD_CHUNK(store_f32u_2);
    538                 break;
    539             case 0:
    540                 ADD_CHUNK(store_f32u_1);
    541                 break;
    542             }
    543         }
    544 
    545 
    546     } else {
    547         // Add the function prefix
    548         // Store the address for the loop return
    549         ADD_CHUNK(prefix_i);
    550         buf2 = buf;
    551 
    552         // Load the incoming r,g,b,a as needed
    553         switch(key.u.inVecSize) {
    554         case 3:
    555             ADD_CHUNK(load_u8_4);
    556             if (key.u.copyAlpha) {
    557                 ADD_CHUNK(unpack_u8_3);
    558             } else {
    559                 ADD_CHUNK(unpack_u8_4);
    560             }
    561             break;
    562         case 2:
    563             ADD_CHUNK(load_u8_3);
    564             ADD_CHUNK(unpack_u8_3);
    565             break;
    566         case 1:
    567             ADD_CHUNK(load_u8_2);
    568             ADD_CHUNK(unpack_u8_2);
    569             break;
    570         case 0:
    571             ADD_CHUNK(load_u8_1);
    572             ADD_CHUNK(unpack_u8_1);
    573             break;
    574         }
    575 
    576         // Add multiply and accumulate
    577         // use MULL to init the output register,
    578         // use MLAL from there
    579         for (int i=0; i < 4; i++) {
    580             for (int j=0; j < 4; j++) {
    581                 switch(ops[i][j]) {
    582                 case 0:
    583                     break;
    584                 case 2:
    585                     buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
    586                     break;
    587                 case 3:
    588                     buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
    589                     break;
    590                 }
    591             }
    592         }
    593         for (int j=0; j < 4; j++) {
    594             if (opInit[j]) {
    595                 if (key.u.addMask & (1 << j)) {
    596                     buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
    597                 }
    598             } else {
    599                 if (key.u.addMask & (1 << j)) {
    600                     buf = addVQADD_S32(buf, 8+j, 12+j, 4+j);
    601                 }
    602             }
    603         }
    604 
    605         // If we have a dot product, perform the special pack.
    606         if (key.u.dot) {
    607             ADD_CHUNK(pack_u8_1);
    608             ADD_CHUNK(dot);
    609         } else {
    610             switch(key.u.outVecSize) {
    611             case 3:
    612                 if (key.u.copyAlpha) {
    613                     ADD_CHUNK(pack_u8_3);
    614                 } else {
    615                     ADD_CHUNK(pack_u8_4);
    616                 }
    617                 break;
    618             case 2:
    619                 ADD_CHUNK(pack_u8_3);
    620                 break;
    621             case 1:
    622                 ADD_CHUNK(pack_u8_2);
    623                 break;
    624             case 0:
    625                 ADD_CHUNK(pack_u8_1);
    626                 break;
    627             }
    628         }
    629 
    630         // Write out result
    631         switch(key.u.outVecSize) {
    632         case 3:
    633         case 2:
    634             ADD_CHUNK(store_u8_4);
    635             break;
    636         case 1:
    637             ADD_CHUNK(store_u8_2);
    638             break;
    639         case 0:
    640             ADD_CHUNK(store_u8_1);
    641             break;
    642         }
    643     }
    644 
    645     if (key.u.inType != key.u.outType) {
    646         key.u.copyAlpha = 0;
    647         key.u.dot = 0;
    648     }
    649 
    650     // Loop, branch, and cleanup
    651     ADD_CHUNK(postfix1);
    652     buf = addBranch(buf, buf2, 0x01);
    653     ADD_CHUNK(postfix2);
    654 
    655     int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    656     if (ret == -1) {
    657         ALOGE("mprotect error %i", ret);
    658         return false;
    659     }
    660 
    661     cacheflush((long)mBuf, (long)mBuf + mBufSize, 0);
    662     return true;
    663 #else
    664     return false;
    665 #endif
    666 }
    667 
    668 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
    669     for(int ct=0; ct < 16; ct++) {
    670         ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
    671         tmpFp[ct] = fp[ct] * fpMul;
    672         //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
    673     }
    674 
    675     float add = 0.f;
    676     if (fpMul > 254.f) add = 0.5f;
    677     for(int ct=0; ct < 4; ct++) {
    678         tmpFpa[ct * 4 + 0] = fpa[ct] * addMul + add;
    679         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
    680         tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
    681         tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
    682         tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
    683     }
    684 
    685     for(int ct=0; ct < 4; ct++) {
    686         ipa[ct * 4 + 0] = (int)(fpa[ct] * 65536.f + 0.5f);
    687         ipa[ct * 4 + 1] = ipa[ct * 4];
    688         ipa[ct * 4 + 2] = ipa[ct * 4];
    689         ipa[ct * 4 + 3] = ipa[ct * 4];
    690     }
    691 }
    692 
    693 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
    694                                                     size_t dataLength) {
    695     switch(slot) {
    696     case 0:
    697         memcpy (fp, data, sizeof(fp));
    698         break;
    699     case 1:
    700         memcpy (fpa, data, sizeof(fpa));
    701         break;
    702     default:
    703         rsAssert(0);
    704         break;
    705     }
    706     mRootPtr = &kernel;
    707 }
    708 
    709 
    710 static void One(const RsForEachStubParamStruct *p, void *out,
    711                 const void *py, const float* coeff, const float *add,
    712                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
    713 
    714     float4 f = 0.f;
    715     if (fin) {
    716         switch(vsin) {
    717         case 3:
    718             f = ((const float4 *)py)[0];
    719             break;
    720         case 2:
    721             f = ((const float4 *)py)[0];
    722             f.w = 0.f;
    723             break;
    724         case 1:
    725             f.xy = ((const float2 *)py)[0];
    726             break;
    727         case 0:
    728             f.x = ((const float *)py)[0];
    729             break;
    730         }
    731     } else {
    732         switch(vsin) {
    733         case 3:
    734             f = convert_float4(((const uchar4 *)py)[0]);
    735             break;
    736         case 2:
    737             f = convert_float4(((const uchar4 *)py)[0]);
    738             f.w = 0.f;
    739             break;
    740         case 1:
    741             f.xy = convert_float2(((const uchar2 *)py)[0]);
    742             break;
    743         case 0:
    744             f.x = (float)(((const uchar *)py)[0]);
    745             break;
    746         }
    747     }
    748     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
    749 
    750     float4 sum;
    751     sum.x = f.x * coeff[0] +
    752             f.y * coeff[4] +
    753             f.z * coeff[8] +
    754             f.w * coeff[12];
    755     sum.y = f.x * coeff[1] +
    756             f.y * coeff[5] +
    757             f.z * coeff[9] +
    758             f.w * coeff[13];
    759     sum.z = f.x * coeff[2] +
    760             f.y * coeff[6] +
    761             f.z * coeff[10] +
    762             f.w * coeff[14];
    763     sum.w = f.x * coeff[3] +
    764             f.y * coeff[7] +
    765             f.z * coeff[11] +
    766             f.w * coeff[15];
    767     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
    768 
    769     sum.x += add[0];
    770     sum.y += add[4];
    771     sum.z += add[8];
    772     sum.w += add[12];
    773 
    774 
    775     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
    776     if (fout) {
    777         switch(vsout) {
    778         case 3:
    779         case 2:
    780             ((float4 *)out)[0] = sum;
    781             break;
    782         case 1:
    783             ((float2 *)out)[0] = sum.xy;
    784             break;
    785         case 0:
    786             ((float *)out)[0] = sum.x;
    787             break;
    788         }
    789     } else {
    790         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
    791         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
    792         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
    793         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
    794 
    795         switch(vsout) {
    796         case 3:
    797         case 2:
    798             ((uchar4 *)out)[0] = convert_uchar4(sum);
    799             break;
    800         case 1:
    801             ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
    802             break;
    803         case 0:
    804             ((uchar *)out)[0] = sum.x;
    805             break;
    806         }
    807     }
    808     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
    809 }
    810 
    811 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
    812                                               uint32_t xstart, uint32_t xend,
    813                                               uint32_t instep, uint32_t outstep) {
    814     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
    815     uchar *out = (uchar *)p->out;
    816     uchar *in = (uchar *)p->in;
    817     uint32_t x1 = xstart;
    818     uint32_t x2 = xend;
    819 
    820     uint32_t vsin = cp->mLastKey.u.inVecSize;
    821     uint32_t vsout = cp->mLastKey.u.outVecSize;
    822     bool floatIn = !!cp->mLastKey.u.inType;
    823     bool floatOut = !!cp->mLastKey.u.outType;
    824 
    825     //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
    826 
    827     if(x2 > x1) {
    828         int32_t len = (x2 - x1) >> 2;
    829         if((cp->mOptKernel != NULL) && (len > 0)) {
    830             cp->mOptKernel(out, in, cp->ip, len);
    831             x1 += len << 2;
    832             out += outstep * (len << 2);
    833             in += instep * (len << 2);
    834         }
    835 
    836         while(x1 != x2) {
    837             One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
    838             out += outstep;
    839             in += instep;
    840             x1++;
    841         }
    842     }
    843 }
    844 
    845 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
    846         uint32_t slot, const Allocation * ain, Allocation * aout,
    847         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
    848 
    849     const Element *ein = ain->mHal.state.type->getElement();
    850     const Element *eout = aout->mHal.state.type->getElement();
    851 
    852     if (ein->getType() == eout->getType()) {
    853         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
    854             updateCoeffCache(1.f, 255.f);
    855         } else {
    856             updateCoeffCache(1.f, 1.f);
    857         }
    858     } else {
    859         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
    860             updateCoeffCache(255.f, 255.f);
    861         } else {
    862             updateCoeffCache(1.f / 255.f, 1.f);
    863         }
    864     }
    865 
    866     Key_t key = computeKey(ain->mHal.state.type->getElement(),
    867                            aout->mHal.state.type->getElement());
    868     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
    869         if (mBuf) munmap(mBuf, mBufSize);
    870         mBuf = NULL;
    871         mOptKernel = NULL;
    872         if (build(key)) {
    873             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
    874             mLastKey = key;
    875         }
    876     }
    877 }
    878 
    879 void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
    880         uint32_t slot, const Allocation * ain, Allocation * aout,
    881         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
    882 
    883 }
    884 
    885 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
    886             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
    887             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
    888 
    889     mLastKey.key = 0;
    890     mBuf = NULL;
    891     mBufSize = 0;
    892     mOptKernel = NULL;
    893     const static float defaultMatrix[] = {
    894         1.f, 0.f, 0.f, 0.f,
    895         0.f, 1.f, 0.f, 0.f,
    896         0.f, 0.f, 1.f, 0.f,
    897         0.f, 0.f, 0.f, 1.f
    898     };
    899     const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
    900     setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
    901     setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
    902 }
    903 
    904 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
    905     if (mBuf) munmap(mBuf, mBufSize);
    906     mBuf = NULL;
    907     mOptKernel = NULL;
    908 }
    909 
// Reports the number of exported script variables: two, matching the slots
// accepted by setGlobalVar() (0 = matrix, 1 = add vector).
void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
    s->mHal.info.exportedVariableCount = 2;
}
    913 
    914 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
    915                                             const Script *s, const Element *e) {
    916 
    917     return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
    918 }
    919 
    920 
    921 
    922