Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <sys/mman.h>
     18 #include <unistd.h>
     19 
     20 #include "rsCpuIntrinsic.h"
     21 #include "rsCpuIntrinsicInlines.h"
     22 
     23 #include <sys/mman.h>
     24 #include <stddef.h>
     25 #include <stdint.h>
     26 #include <stdlib.h>
     27 //#include <utils/StopWatch.h>
     28 
     29 
     30 /*  uint kernel
     31  *  Q0  D0:  Load slot for R
     32  *      D1:  Load slot for G
     33  *  Q1  D2:  Load slot for B
     34  *      D3:  Load slot for A
     35  *  Q2  D4:  Matrix
     36  *      D5:  =
     37  *  Q3  D6:  =
     38  *      D7:  =
     39  *  Q4  D8:  Add R
     40  *      D9:
     41  *  Q5  D10: Add G
     42  *      D11:
     43  *  Q6  D12: Add B
     44  *      D13:
     45  *  Q7  D14: Add A
     46  *      D15:
     47  *  Q8  D16:  I32: R Sum
     48  *      D17:
     49  *  Q9  D18:  I32: G Sum
     50  *      D19:
     51  *  Q10 D20:  I32: B Sum
     52  *      D21:
     53  *  Q11 D22:  I32: A Sum
     54  *      D23:
     55  *  Q12 D24:  U16: expanded R
     56  *      D25:
     57  *  Q13 D26:  U16: expanded G
     58  *      D27:
     59  *  Q14 D28:  U16: expanded B
     60  *      D29:
     61  *  Q15 D30:  U16: expanded A
     62  *      D31:
     63  *
     64  */
     65 
     66 /*  float kernel
     67  *  Q0  D0:  Load slot for R
     68  *      D1:  =
     69  *  Q1  D2:  Load slot for G
     70  *      D3:  =
     71  *  Q2  D4:  Load slot for B
     72  *      D5:  =
     73  *  Q3  D6:  Load slot for A
     74  *      D7:  =
     75  *  Q4  D8:  Matrix
     76  *      D9:  =
     77  *  Q5  D10: =
     78  *      D11: =
     79  *  Q6  D12: =
     80  *      D13: =
     81  *  Q7  D14: =
     82  *      D15: =
     83  *  Q8  D16: Add R
     84  *      D17: =
     85  *  Q9  D18: Add G
     86  *      D19: =
     87  *  Q10 D20: Add B
     88  *      D21: =
     89  *  Q11 D22: Add A
     90  *      D23: =
     91  *  Q12 D24: Sum R
     92  *      D25: =
     93  *  Q13 D26: Sum G
     94  *      D27: =
     95  *  Q14 D28: Sum B
     96  *      D29: =
     97  *  Q15 D30: Sum A
     98  *      D31: =
     99  *
    100  */
    101 
    102 
    103 
    104 namespace android {
    105 namespace renderscript {
    106 
    107 typedef union {
    108     uint64_t key;
    109     struct {
    110         uint32_t inVecSize          :2;  // [0 - 1]
    111         uint32_t outVecSize         :2;  // [2 - 3]
    112         uint32_t inType             :4;  // [4 - 7]
    113         uint32_t outType            :4;  // [8 - 11]
    114         uint32_t dot                :1;  // [12]
    115         uint32_t _unused1           :1;  // [13]
    116         uint32_t copyAlpha          :1;  // [14]
    117         uint32_t _unused2           :1;  // [15]
    118         uint32_t coeffMask          :16; // [16-31]
    119         uint32_t addMask            :4;  // [32-35]
    120     } u;
    121 } Key_t;
    122 
    123 //Re-enable when intrinsic is fixed
    124 #if defined(ARCH_ARM64_USE_INTRINSICS)
    125 typedef struct {
    126     void (*column[4])(void);
    127     void (*store)(void);
    128     void (*load)(void);
    129     void (*store_end)(void);
    130     void (*load_end)(void);
    131 } FunctionTab_t;
    132 
    133 extern "C" void rsdIntrinsicColorMatrix_int_K(
    134              void *out, void const *in, size_t count,
    135              FunctionTab_t const *fns,
    136              int16_t const *mult, int32_t const *add);
    137 
    138 extern "C" void rsdIntrinsicColorMatrix_float_K(
    139              void *out, void const *in, size_t count,
    140              FunctionTab_t const *fns,
    141              float const *mult, float const *add);
    142 
    143 /* The setup functions fill in function tables to be used by above functions;
    144  * this code also eliminates jump-to-another-jump cases by short-circuiting
    145  * empty functions.  While it's not performance critical, it works out easier
    146  * to write the set-up code in assembly than to try to expose the same symbols
    147  * and write the code in C.
    148  */
    149 extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
    150              FunctionTab_t *fns,
    151              uint32_t mask, int dt, int st);
    152 
    153 extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
    154              FunctionTab_t *fns,
    155              uint32_t mask, int dt, int st);
    156 #endif
    157 
    158 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
    159 public:
    160     void populateScript(Script *) override;
    161 
    162     void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
    163 
    164     ~RsdCpuScriptIntrinsicColorMatrix() override;
    165     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
    166 
    167     void preLaunch(uint32_t slot, const Allocation ** ains,
    168                    uint32_t inLen, Allocation * aout, const void * usr,
    169                    uint32_t usrLen, const RsScriptCall *sc) override;
    170 
    171 protected:
    172     float fp[16];
    173     float fpa[4];
    174 
    175     // The following four fields are read as constants
    176     // by the SIMD assembly code.
    177     short ip[16];
    178     int ipa[4];
    179     float tmpFp[16];
    180     float tmpFpa[4];
    181 #if defined(ARCH_ARM64_USE_INTRINSICS)
    182     FunctionTab_t mFnTab;
    183 #endif
    184 
    185     static void kernel(const RsExpandKernelDriverInfo *info,
    186                        uint32_t xstart, uint32_t xend,
    187                        uint32_t outstep);
    188     void updateCoeffCache(float fpMul, float addMul);
    189 
    190     Key_t mLastKey;
    191     unsigned char *mBuf;
    192     size_t mBufSize;
    193 
    194     Key_t computeKey(const Element *ein, const Element *eout);
    195 
    196     bool build(Key_t key);
    197 
    198     void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
    199 
    200 };
    201 
    202 
    203 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
    204         const Element *ein, const Element *eout) {
    205 
    206     Key_t key;
    207     key.key = 0;
    208 
    209     // Compute a unique code key for this operation
    210 
    211     // Add to the key the input and output types
    212     bool hasFloat = false;
    213     if (ein->getType() == RS_TYPE_FLOAT_32) {
    214         hasFloat = true;
    215         key.u.inType = RS_TYPE_FLOAT_32;
    216         rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
    217     }
    218     if (eout->getType() == RS_TYPE_FLOAT_32) {
    219         hasFloat = true;
    220         key.u.outType = RS_TYPE_FLOAT_32;
    221         rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
    222     }
    223 
    224     // Mask in the bits indicating which coefficients in the
    225     // color matrix are needed.
    226     if (hasFloat) {
    227         for (uint32_t i=0; i < 16; i++) {
    228             if (fabs(fp[i]) != 0.f) {
    229                 key.u.coeffMask |= 1 << i;
    230             }
    231         }
    232         if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
    233         if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
    234         if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
    235         if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
    236 
    237     } else {
    238         for (uint32_t i=0; i < 16; i++) {
    239             if (ip[i] != 0) {
    240                 key.u.coeffMask |= 1 << i;
    241             }
    242         }
    243         if (ipa[0] != 0) key.u.addMask |= 0x1;
    244         if (ipa[1] != 0) key.u.addMask |= 0x2;
    245         if (ipa[2] != 0) key.u.addMask |= 0x4;
    246         if (ipa[3] != 0) key.u.addMask |= 0x8;
    247     }
    248 
    249     // Look for a dot product where the r,g,b colums are the same
    250     if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
    251         (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
    252         (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
    253         (ip[12] == ip[13]) && (ip[12] == ip[14])) {
    254 
    255         if (!key.u.addMask) key.u.dot = 1;
    256     }
    257 
    258     // Is alpha a simple copy
    259     if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
    260         key.u.copyAlpha = !(key.u.inType || key.u.outType);
    261     }
    262 
    263     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    264 
    265     switch (ein->getVectorSize()) {
    266     case 4:
    267         key.u.inVecSize = 3;
    268         break;
    269     case 3:
    270         key.u.inVecSize = 2;
    271         key.u.coeffMask &= ~0xF000;
    272         break;
    273     case 2:
    274         key.u.inVecSize = 1;
    275         key.u.coeffMask &= ~0xFF00;
    276         break;
    277     default:
    278         key.u.coeffMask &= ~0xFFF0;
    279         break;
    280     }
    281 
    282     switch (eout->getVectorSize()) {
    283     case 4:
    284         key.u.outVecSize = 3;
    285         break;
    286     case 3:
    287         key.u.outVecSize = 2;
    288         key.u.coeffMask &= ~0x8888;
    289         key.u.addMask &= 7;
    290         break;
    291     case 2:
    292         key.u.outVecSize = 1;
    293         key.u.coeffMask &= ~0xCCCC;
    294         key.u.addMask &= 3;
    295         break;
    296     default:
    297         key.u.coeffMask &= ~0xEEEE;
    298         key.u.addMask &= 1;
    299         break;
    300     }
    301 
    302     if (key.u.inType && !key.u.outType) {
    303         key.u.addMask |= 1;
    304         if (key.u.outVecSize > 0) key.u.addMask |= 2;
    305         if (key.u.outVecSize > 1) key.u.addMask |= 4;
    306         if (key.u.outVecSize > 2) key.u.addMask |= 8;
    307     }
    308 
    309     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    310     return key;
    311 }
    312 
    313 } // namespace renderscript
    314 } // namespace android
    315 
    316 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    317 
    318 #define DEF_SYM(x)                                  \
    319     extern "C" uint32_t _N_ColorMatrix_##x;      \
    320     extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    321     extern "C" uint32_t _N_ColorMatrix_##x##_len;
    322 
    323 DEF_SYM(prefix_i)
    324 DEF_SYM(prefix_f)
    325 DEF_SYM(postfix1)
    326 DEF_SYM(postfix2)
    327 
    328 DEF_SYM(load_u8_4)
    329 DEF_SYM(load_u8_3)
    330 DEF_SYM(load_u8_2)
    331 DEF_SYM(load_u8_1)
    332 DEF_SYM(load_u8f_4)
    333 DEF_SYM(load_u8f_3)
    334 DEF_SYM(load_u8f_2)
    335 DEF_SYM(load_u8f_1)
    336 DEF_SYM(load_f32_4)
    337 DEF_SYM(load_f32_3)
    338 DEF_SYM(load_f32_2)
    339 DEF_SYM(load_f32_1)
    340 
    341 DEF_SYM(store_u8_4)
    342 DEF_SYM(store_u8_2)
    343 DEF_SYM(store_u8_1)
    344 DEF_SYM(store_f32_4)
    345 DEF_SYM(store_f32_3)
    346 DEF_SYM(store_f32_2)
    347 DEF_SYM(store_f32_1)
    348 DEF_SYM(store_f32u_4)
    349 DEF_SYM(store_f32u_2)
    350 DEF_SYM(store_f32u_1)
    351 
    352 DEF_SYM(unpack_u8_4)
    353 DEF_SYM(unpack_u8_3)
    354 DEF_SYM(unpack_u8_2)
    355 DEF_SYM(unpack_u8_1)
    356 DEF_SYM(pack_u8_4)
    357 DEF_SYM(pack_u8_3)
    358 DEF_SYM(pack_u8_2)
    359 DEF_SYM(pack_u8_1)
    360 DEF_SYM(dot)
    361 DEF_SYM(add_0_u8)
    362 DEF_SYM(add_1_u8)
    363 DEF_SYM(add_2_u8)
    364 DEF_SYM(add_3_u8)
    365 
    366 #define ADD_CHUNK(x) \
    367     memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
    368     buf += _N_ColorMatrix_##x##_len
    369 
    370 
    371 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
    372     size_t off = (target - buf - 8) >> 2;
    373     rsAssert(((off & 0xff000000) == 0) ||
    374            ((off & 0xff000000) == 0xff000000));
    375 
    376     uint32_t op = (condition << 28);
    377     op |= 0xa << 24;  // branch
    378     op |= 0xffffff & off;
    379     ((uint32_t *)buf)[0] = op;
    380     return buf + 4;
    381 }
    382 
    383 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
    384     rsAssert(vd < 32);
    385     rsAssert(vm < 32);
    386     rsAssert(vn < 32);
    387 
    388     uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
    389     op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
    390     op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
    391     return op;
    392 }
    393 
    394 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    395     //vmlal.s16 Q#1, D#1, D#2[#]
    396     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
    397     ((uint32_t *)buf)[0] = op;
    398     return buf + 4;
    399 }
    400 
    401 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    402     //vmull.s16 Q#1, D#1, D#2[#]
    403     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
    404     ((uint32_t *)buf)[0] = op;
    405     return buf + 4;
    406 }
    407 
    408 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    409     //vqadd.s32 Q#1, Q#1, Q#2
    410     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    411     ((uint32_t *)buf)[0] = op;
    412     return buf + 4;
    413 }
    414 
    415 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    416     //vmlal.f32 Q#1, D#1, D#2[#]
    417     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
    418     ((uint32_t *)buf)[0] = op;
    419     return buf + 4;
    420 }
    421 
    422 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    423     //vmull.f32 Q#1, D#1, D#2[#]
    424     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
    425     ((uint32_t *)buf)[0] = op;
    426     return buf + 4;
    427 }
    428 
    429 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    430     //vadd.f32 Q#1, D#1, D#2
    431     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    432     ((uint32_t *)buf)[0] = op;
    433     return buf + 4;
    434 }
    435 
    436 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
    437     //vmov.32 Q#1, #imm
    438     rsAssert(imm == 0);
    439     uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
    440     ((uint32_t *)buf)[0] = op;
    441     return buf + 4;
    442 }
    443 
    444 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    445     //vadd.f32 Q#1, D#1, D#2
    446     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    447     ((uint32_t *)buf)[0] = op;
    448     return buf + 4;
    449 }
    450 #endif
    451 
    452 #if defined(ARCH_X86_HAVE_SSSE3)
    453 extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
    454                                   const short *coef, uint32_t count);
    455 extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
    456                                   const short *coef, uint32_t count);
    457 extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
    458                                   const short *coef, uint32_t count);
    459 
    460 using android::renderscript::Key_t;
    461 
    462 void * selectKernel(Key_t key)
    463 {
    464     void * kernel = nullptr;
    465 
    466     // inType, outType float if nonzero
    467     if (!(key.u.inType || key.u.outType)) {
    468         if (key.u.dot)
    469             kernel = (void *)rsdIntrinsicColorMatrixDot_K;
    470         else if (key.u.copyAlpha)
    471             kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
    472         else
    473             kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
    474     }
    475 
    476     return kernel;
    477 }
    478 #endif
    479 
    480 namespace android {
    481 namespace renderscript {
    482 
    483 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
    484 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    485     mBufSize = 4096;
    486     //StopWatch build_time("rs cm: build time");
    487     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
    488                                   MAP_PRIVATE | MAP_ANON, -1, 0);
    489     if (mBuf == MAP_FAILED) {
    490         mBuf = NULL;
    491         return false;
    492     }
    493 
    494     uint8_t *buf = mBuf;
    495     uint8_t *buf2 = nullptr;
    496 
    497     int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
    498     int opInit[4] = {0, 0, 0, 0};
    499 
    500     memset(ops, 0, sizeof(ops));
    501     for (int i=0; i < 4; i++) {
    502         if (key.u.coeffMask & (1 << (i*4))) {
    503             ops[i][0] = 0x2 | opInit[0];
    504             opInit[0] = 1;
    505         }
    506         if (!key.u.dot) {
    507             if (key.u.coeffMask & (1 << (1 + i*4))) {
    508                 ops[i][1] = 0x2 | opInit[1];
    509                 opInit[1] = 1;
    510             }
    511             if (key.u.coeffMask & (1 << (2 + i*4))) {
    512                 ops[i][2] = 0x2 | opInit[2];
    513                 opInit[2] = 1;
    514             }
    515         }
    516         if (!key.u.copyAlpha) {
    517             if (key.u.coeffMask & (1 << (3 + i*4))) {
    518                 ops[i][3] = 0x2 | opInit[3];
    519                 opInit[3] = 1;
    520             }
    521         }
    522     }
    523 
    524     if (key.u.inType || key.u.outType) {
    525         key.u.copyAlpha = 0;
    526         ADD_CHUNK(prefix_f);
    527         buf2 = buf;
    528 
    529         // Load the incoming r,g,b,a as needed
    530         if (key.u.inType) {
    531             switch(key.u.inVecSize) {
    532             case 3:
    533                 ADD_CHUNK(load_f32_4);
    534                 break;
    535             case 2:
    536                 ADD_CHUNK(load_f32_3);
    537                 break;
    538             case 1:
    539                 ADD_CHUNK(load_f32_2);
    540                 break;
    541             case 0:
    542                 ADD_CHUNK(load_f32_1);
    543                 break;
    544             }
    545         } else {
    546             switch(key.u.inVecSize) {
    547             case 3:
    548                 ADD_CHUNK(load_u8f_4);
    549                 break;
    550             case 2:
    551                 ADD_CHUNK(load_u8f_3);
    552                 break;
    553             case 1:
    554                 ADD_CHUNK(load_u8f_2);
    555                 break;
    556             case 0:
    557                 ADD_CHUNK(load_u8f_1);
    558                 break;
    559             }
    560         }
    561 
    562         for (int i=0; i < 4; i++) {
    563             for (int j=0; j < 4; j++) {
    564                 switch(ops[i][j]) {
    565                 case 0:
    566                     break;
    567                 case 2:
    568                     buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
    569                     break;
    570                 case 3:
    571                     buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
    572                     break;
    573                 }
    574             }
    575         }
    576         for (int j=0; j < 4; j++) {
    577             if (opInit[j]) {
    578                 if (key.u.addMask & (1 << j)) {
    579                     buf = addVADD_F32(buf, j, 12+j, 8+j);
    580                 } else {
    581                     buf = addVORR_32(buf, j, 12+j, 12+j);
    582                 }
    583             } else {
    584                 if (key.u.addMask & (1 << j)) {
    585                     buf = addVORR_32(buf, j, 8+j, 8+j);
    586                 } else {
    587                     buf = addVMOV_32(buf, j, 0);
    588                 }
    589             }
    590         }
    591 
    592         if (key.u.outType) {
    593             switch(key.u.outVecSize) {
    594             case 3:
    595                 ADD_CHUNK(store_f32_4);
    596                 break;
    597             case 2:
    598                 ADD_CHUNK(store_f32_3);
    599                 break;
    600             case 1:
    601                 ADD_CHUNK(store_f32_2);
    602                 break;
    603             case 0:
    604                 ADD_CHUNK(store_f32_1);
    605                 break;
    606             }
    607         } else {
    608             switch(key.u.outVecSize) {
    609             case 3:
    610             case 2:
    611                 ADD_CHUNK(store_f32u_4);
    612                 break;
    613             case 1:
    614                 ADD_CHUNK(store_f32u_2);
    615                 break;
    616             case 0:
    617                 ADD_CHUNK(store_f32u_1);
    618                 break;
    619             }
    620         }
    621 
    622 
    623     } else {
    624         // Add the function prefix
    625         // Store the address for the loop return
    626         ADD_CHUNK(prefix_i);
    627         buf2 = buf;
    628 
    629         // Load the incoming r,g,b,a as needed
    630         switch(key.u.inVecSize) {
    631         case 3:
    632             ADD_CHUNK(load_u8_4);
    633             if (key.u.copyAlpha) {
    634                 ADD_CHUNK(unpack_u8_3);
    635             } else {
    636                 ADD_CHUNK(unpack_u8_4);
    637             }
    638             break;
    639         case 2:
    640             ADD_CHUNK(load_u8_3);
    641             ADD_CHUNK(unpack_u8_3);
    642             break;
    643         case 1:
    644             ADD_CHUNK(load_u8_2);
    645             ADD_CHUNK(unpack_u8_2);
    646             break;
    647         case 0:
    648             ADD_CHUNK(load_u8_1);
    649             ADD_CHUNK(unpack_u8_1);
    650             break;
    651         }
    652 
    653         // Add multiply and accumulate
    654         // use MULL to init the output register,
    655         // use MLAL from there
    656         for (int i=0; i < 4; i++) {
    657             for (int j=0; j < 4; j++) {
    658                 switch(ops[i][j]) {
    659                 case 0:
    660                     break;
    661                 case 2:
    662                     buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
    663                     break;
    664                 case 3:
    665                     buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
    666                     break;
    667                 }
    668             }
    669         }
    670         for (int j=0; j < 4; j++) {
    671             if (opInit[j]) {
    672                 if (key.u.addMask & (1 << j)) {
    673                     buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
    674                 }
    675             } else {
    676                 if (key.u.addMask & (1 << j)) {
    677                     buf = addVORR_32(buf, 8+j, 4+j, 4+j);
    678                 }
    679             }
    680         }
    681 
    682         // If we have a dot product, perform the special pack.
    683         if (key.u.dot) {
    684             ADD_CHUNK(pack_u8_1);
    685             ADD_CHUNK(dot);
    686         } else {
    687             switch(key.u.outVecSize) {
    688             case 3:
    689                 if (key.u.copyAlpha) {
    690                     ADD_CHUNK(pack_u8_3);
    691                 } else {
    692                     ADD_CHUNK(pack_u8_4);
    693                 }
    694                 break;
    695             case 2:
    696                 ADD_CHUNK(pack_u8_3);
    697                 break;
    698             case 1:
    699                 ADD_CHUNK(pack_u8_2);
    700                 break;
    701             case 0:
    702                 ADD_CHUNK(pack_u8_1);
    703                 break;
    704             }
    705         }
    706 
    707         // Write out result
    708         switch(key.u.outVecSize) {
    709         case 3:
    710         case 2:
    711             ADD_CHUNK(store_u8_4);
    712             break;
    713         case 1:
    714             ADD_CHUNK(store_u8_2);
    715             break;
    716         case 0:
    717             ADD_CHUNK(store_u8_1);
    718             break;
    719         }
    720     }
    721 
    722     if (key.u.inType != key.u.outType) {
    723         key.u.copyAlpha = 0;
    724         key.u.dot = 0;
    725     }
    726 
    727     // Loop, branch, and cleanup
    728     ADD_CHUNK(postfix1);
    729     buf = addBranch(buf, buf2, 0x01);
    730     ADD_CHUNK(postfix2);
    731 
    732     int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    733     if (ret == -1) {
    734         ALOGE("mprotect error %i", ret);
    735         return false;
    736     }
    737 
    738     __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
    739     return true;
    740 #else
    741     return false;
    742 #endif
    743 }
    744 
    745 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
    746     for(int ct=0; ct < 16; ct++) {
    747         ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
    748         tmpFp[ct] = fp[ct] * fpMul;
    749         //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
    750     }
    751 
    752     float add = 0.f;
    753     if (fpMul > 254.f) add = 0.5f;
    754     for(int ct=0; ct < 4; ct++) {
    755         tmpFpa[ct] = fpa[ct] * addMul + add;
    756         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
    757     }
    758 
    759     for(int ct=0; ct < 4; ct++) {
    760         ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
    761     }
    762 }
    763 
    764 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
    765                                                     size_t dataLength) {
    766     switch(slot) {
    767     case 0:
    768         memcpy (fp, data, sizeof(fp));
    769         break;
    770     case 1:
    771         memcpy (fpa, data, sizeof(fpa));
    772         break;
    773     default:
    774         rsAssert(0);
    775         break;
    776     }
    777     mRootPtr = &kernel;
    778 }
    779 
    780 
    781 static void One(const RsExpandKernelDriverInfo *info, void *out,
    782                 const void *py, const float* coeff, const float *add,
    783                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
    784 
    785     float4 f = 0.f;
    786     if (fin) {
    787         switch(vsin) {
    788         case 3:
    789             f = ((const float4 *)py)[0];
    790             break;
    791         case 2:
    792             f = ((const float4 *)py)[0];
    793             f.w = 0.f;
    794             break;
    795         case 1:
    796             f.xy = ((const float2 *)py)[0];
    797             break;
    798         case 0:
    799             f.x = ((const float *)py)[0];
    800             break;
    801         }
    802     } else {
    803         switch(vsin) {
    804         case 3:
    805             f = convert_float4(((const uchar4 *)py)[0]);
    806             break;
    807         case 2:
    808             f = convert_float4(((const uchar4 *)py)[0]);
    809             f.w = 0.f;
    810             break;
    811         case 1:
    812             f.xy = convert_float2(((const uchar2 *)py)[0]);
    813             break;
    814         case 0:
    815             f.x = (float)(((const uchar *)py)[0]);
    816             break;
    817         }
    818     }
    819     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
    820 
    821     float4 sum;
    822     sum.x = f.x * coeff[0] +
    823             f.y * coeff[4] +
    824             f.z * coeff[8] +
    825             f.w * coeff[12];
    826     sum.y = f.x * coeff[1] +
    827             f.y * coeff[5] +
    828             f.z * coeff[9] +
    829             f.w * coeff[13];
    830     sum.z = f.x * coeff[2] +
    831             f.y * coeff[6] +
    832             f.z * coeff[10] +
    833             f.w * coeff[14];
    834     sum.w = f.x * coeff[3] +
    835             f.y * coeff[7] +
    836             f.z * coeff[11] +
    837             f.w * coeff[15];
    838     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
    839 
    840     sum.x += add[0];
    841     sum.y += add[1];
    842     sum.z += add[2];
    843     sum.w += add[3];
    844 
    845 
    846     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
    847     if (fout) {
    848         switch(vsout) {
    849         case 3:
    850         case 2:
    851             ((float4 *)out)[0] = sum;
    852             break;
    853         case 1:
    854             ((float2 *)out)[0] = sum.xy;
    855             break;
    856         case 0:
    857             ((float *)out)[0] = sum.x;
    858             break;
    859         }
    860     } else {
    861         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
    862         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
    863         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
    864         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
    865 
    866         switch(vsout) {
    867         case 3:
    868         case 2:
    869             ((uchar4 *)out)[0] = convert_uchar4(sum);
    870             break;
    871         case 1:
    872             ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
    873             break;
    874         case 0:
    875             ((uchar *)out)[0] = sum.x;
    876             break;
    877         }
    878     }
    879     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
    880 }
    881 
    882 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
    883                                               uint32_t xstart, uint32_t xend,
    884                                               uint32_t outstep) {
    885     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
    886 
    887     uint32_t instep = info->inStride[0];
    888 
    889     uchar *out = (uchar *)info->outPtr[0];
    890     uchar *in = (uchar *)info->inPtr[0];
    891     uint32_t x1 = xstart;
    892     uint32_t x2 = xend;
    893 
    894     uint32_t vsin = cp->mLastKey.u.inVecSize;
    895     uint32_t vsout = cp->mLastKey.u.outVecSize;
    896     bool floatIn = !!cp->mLastKey.u.inType;
    897     bool floatOut = !!cp->mLastKey.u.outType;
    898 
    899     //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
    900 
    901     if(x2 > x1) {
    902         int32_t len = x2 - x1;
    903         if (gArchUseSIMD) {
    904             if((cp->mOptKernel != nullptr) && (len >= 4)) {
    905                 // The optimized kernel processes 4 pixels at once
    906                 // and requires a minimum of 1 chunk of 4
    907                 cp->mOptKernel(out, in, cp->ip, len >> 2);
    908                 // Update the len and pointers so the generic code can
    909                 // finish any leftover pixels
    910                 len &= ~3;
    911                 x1 += len;
    912                 out += outstep * len;
    913                 in += instep * len;
    914             }
    915 #if defined(ARCH_ARM64_USE_INTRINSICS)
    916             else {
    917                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
    918                     // Currently this generates off by one errors.
    919                     //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
    920                     //x1 += len;
    921                     //out += outstep * len;
    922                     //in += instep * len;
    923                 } else {
    924                     rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
    925                     x1 += len;
    926                     out += outstep * len;
    927                     in += instep * len;
    928                 }
    929             }
    930 #endif
    931         }
    932 
    933         while(x1 != x2) {
    934             One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
    935             out += outstep;
    936             in += instep;
    937             x1++;
    938         }
    939     }
    940 }
    941 
    942 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
    943                                                  const Allocation ** ains,
    944                                                  uint32_t inLen,
    945                                                  Allocation * aout,
    946                                                  const void * usr,
    947                                                  uint32_t usrLen,
    948                                                  const RsScriptCall *sc) {
    949 
    950     const Element *ein = ains[0]->mHal.state.type->getElement();
    951     const Element *eout = aout->mHal.state.type->getElement();
    952 
    953     if (ein->getType() == eout->getType()) {
    954         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
    955             updateCoeffCache(1.f, 255.f);
    956         } else {
    957             updateCoeffCache(1.f, 1.f);
    958         }
    959     } else {
    960         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
    961             updateCoeffCache(255.f, 255.f);
    962         } else {
    963             updateCoeffCache(1.f / 255.f, 1.f);
    964         }
    965     }
    966 
    967     Key_t key = computeKey(ein, eout);
    968 
    969 #if defined(ARCH_X86_HAVE_SSSE3)
    970     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
    971         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
    972         // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
    973         mLastKey = key;
    974     }
    975 
    976 #else //if !defined(ARCH_X86_HAVE_SSSE3)
    977     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
    978         if (mBuf) munmap(mBuf, mBufSize);
    979         mBuf = nullptr;
    980         mOptKernel = nullptr;
    981         if (build(key)) {
    982             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
    983         }
    984 #if defined(ARCH_ARM64_USE_INTRINSICS)
    985         else {
    986             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
    987             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
    988             uint32_t mm = 0;
    989             int i;
    990             for (i = 0; i < 4; i++)
    991             {
    992                 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
    993                 m = ((m * 0x249) >> 9) & 15;
    994                 m |= ((key.u.addMask >> i) & 1) << 4;
    995                 mm |= m << (i * 5);
    996             }
    997 
    998             if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
    999                 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
   1000             } else {
   1001                 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
   1002             }
   1003         }
   1004 #endif
   1005         mLastKey = key;
   1006     }
   1007 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
   1008 }
   1009 
   1010 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
   1011             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
   1012             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
   1013 
   1014     mLastKey.key = 0;
   1015     mBuf = nullptr;
   1016     mBufSize = 0;
   1017     mOptKernel = nullptr;
   1018     const static float defaultMatrix[] = {
   1019         1.f, 0.f, 0.f, 0.f,
   1020         0.f, 1.f, 0.f, 0.f,
   1021         0.f, 0.f, 1.f, 0.f,
   1022         0.f, 0.f, 0.f, 1.f
   1023     };
   1024     const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
   1025     setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
   1026     setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
   1027 }
   1028 
   1029 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
   1030     if (mBuf) munmap(mBuf, mBufSize);
   1031     mBuf = nullptr;
   1032     mOptKernel = nullptr;
   1033 }
   1034 
   1035 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
   1036     s->mHal.info.exportedVariableCount = 2;
   1037 }
   1038 
   1039 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
   1040                                             const Script *s, const Element *e) {
   1041 
   1042     return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
   1043 }
   1044 
   1045 } // namespace renderscript
   1046 } // namespace android
   1047