1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <sys/mman.h> 18 #include <unistd.h> 19 20 #include "rsCpuIntrinsic.h" 21 #include "rsCpuIntrinsicInlines.h" 22 #include "linkloader/include/MemChunk.h" 23 #include "linkloader/utils/flush_cpu_cache.h" 24 25 #include <sys/mman.h> 26 #include <stddef.h> 27 #include <stdint.h> 28 #include <stdlib.h> 29 //#include <utils/StopWatch.h> 30 31 32 /* uint kernel 33 * Q0 D0: Load slot for R 34 * D1: Load slot for G 35 * Q1 D2: Load slot for B 36 * D3: Load slot for A 37 * Q2 D4: Matrix 38 * D5: = 39 * Q3 D6: = 40 * D7: = 41 * Q4 D8: Add R 42 * D9: 43 * Q5 D10: Add G 44 * D11: 45 * Q6 D12: Add B 46 * D13: 47 * Q7 D14: Add A 48 * D15: 49 * Q8 D16: I32: R Sum 50 * D17: 51 * Q9 D18: I32: G Sum 52 * D19: 53 * Q10 D20: I32: B Sum 54 * D21: 55 * Q11 D22: I32: A Sum 56 * D23: 57 * Q12 D24: U16: expanded R 58 * D25: 59 * Q13 D26: U16: expanded G 60 * D27: 61 * Q14 D28: U16: expanded B 62 * D29: 63 * Q15 D30: U16: expanded A 64 * D31: 65 * 66 */ 67 68 /* float kernel 69 * Q0 D0: Load slot for R 70 * D1: = 71 * Q1 D2: Load slot for G 72 * D3: = 73 * Q2 D4: Load slot for B 74 * D5: = 75 * Q3 D6: Load slot for A 76 * D7: = 77 * Q4 D8: Matrix 78 * D9: = 79 * Q5 D10: = 80 * D11: = 81 * Q6 D12: = 82 * D13: = 83 * Q7 D14: = 84 * D15: = 85 * Q8 D16: Add R 86 * D17: = 87 * Q9 D18: Add G 88 * D19: = 89 * Q10 D20: Add B 90 * D21: = 91 * Q11 D22: Add A 92 * D23: = 93 * Q12 D24: Sum R 94 * D25: = 95 * Q13 D26: Sum G 96 * D27: = 97 * Q14 D28: Sum B 98 * D29: = 99 * Q15 D30: Sum A 100 * D31: = 101 * 102 */ 103 104 105 106 using namespace android; 107 using namespace android::renderscript; 108 109 namespace android { 110 namespace renderscript { 111 112 typedef union { 113 uint64_t key; 114 struct { 115 uint32_t inVecSize :2; // [0 - 1] 116 uint32_t outVecSize :2; // [2 - 3] 117 uint32_t inType :4; // [4 - 7] 118 uint32_t outType :4; // [8 - 11] 119 uint32_t dot :1; // [12] 120 uint32_t _unused1 :1; // [13] 121 uint32_t copyAlpha :1; // [14] 122 uint32_t _unused2 :1; // [15] 123 uint32_t coeffMask :16; // [16-31] 124 uint32_t addMask :4; // [32-35] 125 } u; 126 } Key_t; 127 128 //Re-enable when intrinsic is fixed 129 #if defined(ARCH_ARM64_USE_INTRINSICS) 130 typedef struct { 131 void (*column[4])(void); 132 void (*store)(void); 133 void (*load)(void); 134 void (*store_end)(void); 135 void (*load_end)(void); 136 } FunctionTab_t; 137 138 extern "C" void rsdIntrinsicColorMatrix_int_K( 139 void *out, void const *in, size_t count, 140 FunctionTab_t const *fns, 141 int16_t const *mult, int32_t const *add); 142 143 extern "C" void rsdIntrinsicColorMatrix_float_K( 144 void *out, void const *in, size_t count, 145 FunctionTab_t const *fns, 146 float const *mult, float const *add); 147 148 /* The setup functions fill in function tables to be used by above functions; 149 * this code also eliminates jump-to-another-jump cases by short-circuiting 150 * empty functions. While it's not performance critical, it works out easier 151 * to write the set-up code in assembly than to try to expose the same symbols 152 * and write the code in C. 153 */ 154 extern "C" void rsdIntrinsicColorMatrixSetup_int_K( 155 FunctionTab_t *fns, 156 uint32_t mask, int dt, int st); 157 158 extern "C" void rsdIntrinsicColorMatrixSetup_float_K( 159 FunctionTab_t *fns, 160 uint32_t mask, int dt, int st); 161 #endif 162 163 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic { 164 public: 165 virtual void populateScript(Script *); 166 167 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength); 168 169 virtual ~RsdCpuScriptIntrinsicColorMatrix(); 170 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 171 172 virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout, 173 const void * usr, uint32_t usrLen, const RsScriptCall *sc); 174 virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout, 175 const void * usr, uint32_t usrLen, const RsScriptCall *sc); 176 177 protected: 178 float fp[16]; 179 float fpa[4]; 180 181 // The following four fields are read as constants 182 // by the SIMD assembly code. 183 short ip[16]; 184 int ipa[4]; 185 float tmpFp[16]; 186 float tmpFpa[4]; 187 #if defined(ARCH_ARM64_USE_INTRINSICS) 188 FunctionTab_t mFnTab; 189 #endif 190 191 static void kernel(const RsForEachStubParamStruct *p, 192 uint32_t xstart, uint32_t xend, 193 uint32_t instep, uint32_t outstep); 194 void updateCoeffCache(float fpMul, float addMul); 195 196 Key_t mLastKey; 197 unsigned char *mBuf; 198 size_t mBufSize; 199 200 Key_t computeKey(const Element *ein, const Element *eout); 201 202 bool build(Key_t key); 203 204 void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count); 205 206 }; 207 208 } 209 } 210 211 212 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey( 213 const Element *ein, const Element *eout) { 214 215 Key_t key; 216 key.key = 0; 217 218 // Compute a unique code key for this operation 219 220 // Add to the key the input and output types 221 bool hasFloat = false; 222 if (ein->getType() == RS_TYPE_FLOAT_32) { 223 hasFloat = true; 224 key.u.inType = RS_TYPE_FLOAT_32; 225 rsAssert(key.u.inType == RS_TYPE_FLOAT_32); 226 } 227 if (eout->getType() == RS_TYPE_FLOAT_32) { 228 hasFloat = true; 229 key.u.outType = RS_TYPE_FLOAT_32; 230 rsAssert(key.u.outType == RS_TYPE_FLOAT_32); 231 } 232 233 // Mask in the bits indicating which coefficients in the 234 // color matrix are needed. 235 if (hasFloat) { 236 for (uint32_t i=0; i < 16; i++) { 237 if (fabs(fp[i]) != 0.f) { 238 key.u.coeffMask |= 1 << i; 239 } 240 } 241 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1; 242 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2; 243 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4; 244 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8; 245 246 } else { 247 for (uint32_t i=0; i < 16; i++) { 248 if (ip[i] != 0) { 249 key.u.coeffMask |= 1 << i; 250 } 251 } 252 if (ipa[0] != 0) key.u.addMask |= 0x1; 253 if (ipa[1] != 0) key.u.addMask |= 0x2; 254 if (ipa[2] != 0) key.u.addMask |= 0x4; 255 if (ipa[3] != 0) key.u.addMask |= 0x8; 256 } 257 258 // Look for a dot product where the r,g,b colums are the same 259 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) && 260 (ip[4] == ip[5]) && (ip[4] == ip[6]) && 261 (ip[8] == ip[9]) && (ip[8] == ip[10]) && 262 (ip[12] == ip[13]) && (ip[12] == ip[14])) { 263 264 if (!key.u.addMask) key.u.dot = 1; 265 } 266 267 // Is alpha a simple copy 268 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) { 269 key.u.copyAlpha = !(key.u.inType || key.u.outType); 270 } 271 272 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 273 274 switch (ein->getVectorSize()) { 275 case 4: 276 key.u.inVecSize = 3; 277 break; 278 case 3: 279 key.u.inVecSize = 2; 280 key.u.coeffMask &= ~0xF000; 281 break; 282 case 2: 283 key.u.inVecSize = 1; 284 key.u.coeffMask &= ~0xFF00; 285 break; 286 default: 287 key.u.coeffMask &= ~0xFFF0; 288 break; 289 } 290 291 switch (eout->getVectorSize()) { 292 case 4: 293 key.u.outVecSize = 3; 294 break; 295 case 3: 296 key.u.outVecSize = 2; 297 key.u.coeffMask &= ~0x8888; 298 key.u.addMask &= 7; 299 break; 300 case 2: 301 key.u.outVecSize = 1; 302 key.u.coeffMask &= ~0xCCCC; 303 key.u.addMask &= 3; 304 break; 305 default: 306 key.u.coeffMask &= ~0xEEEE; 307 key.u.addMask &= 1; 308 break; 309 } 310 311 if (key.u.inType && !key.u.outType) { 312 key.u.addMask |= 1; 313 if (key.u.outVecSize > 0) key.u.addMask |= 2; 314 if (key.u.outVecSize > 1) key.u.addMask |= 4; 315 if (key.u.outVecSize > 2) key.u.addMask |= 8; 316 } 317 318 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 319 return key; 320 } 321 322 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) 323 324 #define DEF_SYM(x) \ 325 extern "C" uint32_t _N_ColorMatrix_##x; \ 326 extern "C" uint32_t _N_ColorMatrix_##x##_end; \ 327 extern "C" uint32_t _N_ColorMatrix_##x##_len; 328 329 DEF_SYM(prefix_i) 330 DEF_SYM(prefix_f) 331 DEF_SYM(postfix1) 332 DEF_SYM(postfix2) 333 334 DEF_SYM(load_u8_4) 335 DEF_SYM(load_u8_3) 336 DEF_SYM(load_u8_2) 337 DEF_SYM(load_u8_1) 338 DEF_SYM(load_u8f_4) 339 DEF_SYM(load_u8f_3) 340 DEF_SYM(load_u8f_2) 341 DEF_SYM(load_u8f_1) 342 DEF_SYM(load_f32_4) 343 DEF_SYM(load_f32_3) 344 DEF_SYM(load_f32_2) 345 DEF_SYM(load_f32_1) 346 347 DEF_SYM(store_u8_4) 348 DEF_SYM(store_u8_2) 349 DEF_SYM(store_u8_1) 350 DEF_SYM(store_f32_4) 351 DEF_SYM(store_f32_3) 352 DEF_SYM(store_f32_2) 353 DEF_SYM(store_f32_1) 354 DEF_SYM(store_f32u_4) 355 DEF_SYM(store_f32u_2) 356 DEF_SYM(store_f32u_1) 357 358 DEF_SYM(unpack_u8_4) 359 DEF_SYM(unpack_u8_3) 360 DEF_SYM(unpack_u8_2) 361 DEF_SYM(unpack_u8_1) 362 DEF_SYM(pack_u8_4) 363 DEF_SYM(pack_u8_3) 364 DEF_SYM(pack_u8_2) 365 DEF_SYM(pack_u8_1) 366 DEF_SYM(dot) 367 DEF_SYM(add_0_u8) 368 DEF_SYM(add_1_u8) 369 DEF_SYM(add_2_u8) 370 DEF_SYM(add_3_u8) 371 372 #define ADD_CHUNK(x) \ 373 memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \ 374 buf += _N_ColorMatrix_##x##_len 375 376 377 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) { 378 size_t off = (target - buf - 8) >> 2; 379 rsAssert(((off & 0xff000000) == 0) || 380 ((off & 0xff000000) == 0xff000000)); 381 382 uint32_t op = (condition << 28); 383 op |= 0xa << 24; // branch 384 op |= 0xffffff & off; 385 ((uint32_t *)buf)[0] = op; 386 return buf + 4; 387 } 388 389 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) { 390 rsAssert(vd < 32); 391 rsAssert(vm < 32); 392 rsAssert(vn < 32); 393 394 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22); 395 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5); 396 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7); 397 return op; 398 } 399 400 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 401 //vmlal.s16 Q#1, D#1, D#2[#] 402 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 403 ((uint32_t *)buf)[0] = op; 404 return buf + 4; 405 } 406 407 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 408 //vmull.s16 Q#1, D#1, D#2[#] 409 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 410 ((uint32_t *)buf)[0] = op; 411 return buf + 4; 412 } 413 414 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 415 //vqadd.s32 Q#1, Q#1, Q#2 416 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 417 ((uint32_t *)buf)[0] = op; 418 return buf + 4; 419 } 420 421 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 422 //vmlal.f32 Q#1, D#1, D#2[#] 423 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 424 ((uint32_t *)buf)[0] = op; 425 return buf + 4; 426 } 427 428 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 429 //vmull.f32 Q#1, D#1, D#2[#] 430 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 431 ((uint32_t *)buf)[0] = op; 432 return buf + 4; 433 } 434 435 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 436 //vadd.f32 Q#1, D#1, D#2 437 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 438 ((uint32_t *)buf)[0] = op; 439 return buf + 4; 440 } 441 442 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) { 443 //vmov.32 Q#1, #imm 444 rsAssert(imm == 0); 445 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0); 446 ((uint32_t *)buf)[0] = op; 447 return buf + 4; 448 } 449 450 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 451 //vadd.f32 Q#1, D#1, D#2 452 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 453 ((uint32_t *)buf)[0] = op; 454 return buf + 4; 455 } 456 #endif 457 458 #if defined(ARCH_X86_HAVE_SSSE3) 459 extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, 460 const short *coef, uint32_t count); 461 extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, 462 const short *coef, uint32_t count); 463 extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, 464 const short *coef, uint32_t count); 465 466 void * selectKernel(Key_t key) 467 { 468 void * kernel = NULL; 469 470 // inType, outType float if nonzero 471 if (!(key.u.inType || key.u.outType)) { 472 if (key.u.dot) 473 kernel = (void *)rsdIntrinsicColorMatrixDot_K; 474 else if (key.u.copyAlpha) 475 kernel = (void *)rsdIntrinsicColorMatrix3x3_K; 476 else 477 kernel = (void *)rsdIntrinsicColorMatrix4x4_K; 478 } 479 480 return kernel; 481 } 482 #endif 483 484 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) { 485 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) 486 mBufSize = 4096; 487 //StopWatch build_time("rs cm: build time"); 488 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE, 489 MAP_PRIVATE | MAP_ANON, -1, 0); 490 if (mBuf == MAP_FAILED) { 491 mBuf = NULL; 492 return false; 493 } 494 495 uint8_t *buf = mBuf; 496 uint8_t *buf2 = NULL; 497 498 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final 499 int opInit[4] = {0, 0, 0, 0}; 500 501 memset(ops, 0, sizeof(ops)); 502 for (int i=0; i < 4; i++) { 503 if (key.u.coeffMask & (1 << (i*4))) { 504 ops[i][0] = 0x2 | opInit[0]; 505 opInit[0] = 1; 506 } 507 if (!key.u.dot) { 508 if (key.u.coeffMask & (1 << (1 + i*4))) { 509 ops[i][1] = 0x2 | opInit[1]; 510 opInit[1] = 1; 511 } 512 if (key.u.coeffMask & (1 << (2 + i*4))) { 513 ops[i][2] = 0x2 | opInit[2]; 514 opInit[2] = 1; 515 } 516 } 517 if (!key.u.copyAlpha) { 518 if (key.u.coeffMask & (1 << (3 + i*4))) { 519 ops[i][3] = 0x2 | opInit[3]; 520 opInit[3] = 1; 521 } 522 } 523 } 524 525 if (key.u.inType || key.u.outType) { 526 key.u.copyAlpha = 0; 527 ADD_CHUNK(prefix_f); 528 buf2 = buf; 529 530 // Load the incoming r,g,b,a as needed 531 if (key.u.inType) { 532 switch(key.u.inVecSize) { 533 case 3: 534 ADD_CHUNK(load_f32_4); 535 break; 536 case 2: 537 ADD_CHUNK(load_f32_3); 538 break; 539 case 1: 540 ADD_CHUNK(load_f32_2); 541 break; 542 case 0: 543 ADD_CHUNK(load_f32_1); 544 break; 545 } 546 } else { 547 switch(key.u.inVecSize) { 548 case 3: 549 ADD_CHUNK(load_u8f_4); 550 break; 551 case 2: 552 ADD_CHUNK(load_u8f_3); 553 break; 554 case 1: 555 ADD_CHUNK(load_u8f_2); 556 break; 557 case 0: 558 ADD_CHUNK(load_u8f_1); 559 break; 560 } 561 } 562 563 for (int i=0; i < 4; i++) { 564 for (int j=0; j < 4; j++) { 565 switch(ops[i][j]) { 566 case 0: 567 break; 568 case 2: 569 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 570 break; 571 case 3: 572 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 573 break; 574 } 575 } 576 } 577 for (int j=0; j < 4; j++) { 578 if (opInit[j]) { 579 if (key.u.addMask & (1 << j)) { 580 buf = addVADD_F32(buf, j, 12+j, 8+j); 581 } else { 582 buf = addVORR_32(buf, j, 12+j, 12+j); 583 } 584 } else { 585 if (key.u.addMask & (1 << j)) { 586 buf = addVORR_32(buf, j, 8+j, 8+j); 587 } else { 588 buf = addVMOV_32(buf, j, 0); 589 } 590 } 591 } 592 593 if (key.u.outType) { 594 switch(key.u.outVecSize) { 595 case 3: 596 ADD_CHUNK(store_f32_4); 597 break; 598 case 2: 599 ADD_CHUNK(store_f32_3); 600 break; 601 case 1: 602 ADD_CHUNK(store_f32_2); 603 break; 604 case 0: 605 ADD_CHUNK(store_f32_1); 606 break; 607 } 608 } else { 609 switch(key.u.outVecSize) { 610 case 3: 611 case 2: 612 ADD_CHUNK(store_f32u_4); 613 break; 614 case 1: 615 ADD_CHUNK(store_f32u_2); 616 break; 617 case 0: 618 ADD_CHUNK(store_f32u_1); 619 break; 620 } 621 } 622 623 624 } else { 625 // Add the function prefix 626 // Store the address for the loop return 627 ADD_CHUNK(prefix_i); 628 buf2 = buf; 629 630 // Load the incoming r,g,b,a as needed 631 switch(key.u.inVecSize) { 632 case 3: 633 ADD_CHUNK(load_u8_4); 634 if (key.u.copyAlpha) { 635 ADD_CHUNK(unpack_u8_3); 636 } else { 637 ADD_CHUNK(unpack_u8_4); 638 } 639 break; 640 case 2: 641 ADD_CHUNK(load_u8_3); 642 ADD_CHUNK(unpack_u8_3); 643 break; 644 case 1: 645 ADD_CHUNK(load_u8_2); 646 ADD_CHUNK(unpack_u8_2); 647 break; 648 case 0: 649 ADD_CHUNK(load_u8_1); 650 ADD_CHUNK(unpack_u8_1); 651 break; 652 } 653 654 // Add multiply and accumulate 655 // use MULL to init the output register, 656 // use MLAL from there 657 for (int i=0; i < 4; i++) { 658 for (int j=0; j < 4; j++) { 659 switch(ops[i][j]) { 660 case 0: 661 break; 662 case 2: 663 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j); 664 break; 665 case 3: 666 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j); 667 break; 668 } 669 } 670 } 671 for (int j=0; j < 4; j++) { 672 if (opInit[j]) { 673 if (key.u.addMask & (1 << j)) { 674 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j); 675 } 676 } else { 677 if (key.u.addMask & (1 << j)) { 678 buf = addVORR_32(buf, 8+j, 4+j, 4+j); 679 } 680 } 681 } 682 683 // If we have a dot product, perform the special pack. 684 if (key.u.dot) { 685 ADD_CHUNK(pack_u8_1); 686 ADD_CHUNK(dot); 687 } else { 688 switch(key.u.outVecSize) { 689 case 3: 690 if (key.u.copyAlpha) { 691 ADD_CHUNK(pack_u8_3); 692 } else { 693 ADD_CHUNK(pack_u8_4); 694 } 695 break; 696 case 2: 697 ADD_CHUNK(pack_u8_3); 698 break; 699 case 1: 700 ADD_CHUNK(pack_u8_2); 701 break; 702 case 0: 703 ADD_CHUNK(pack_u8_1); 704 break; 705 } 706 } 707 708 // Write out result 709 switch(key.u.outVecSize) { 710 case 3: 711 case 2: 712 ADD_CHUNK(store_u8_4); 713 break; 714 case 1: 715 ADD_CHUNK(store_u8_2); 716 break; 717 case 0: 718 ADD_CHUNK(store_u8_1); 719 break; 720 } 721 } 722 723 if (key.u.inType != key.u.outType) { 724 key.u.copyAlpha = 0; 725 key.u.dot = 0; 726 } 727 728 // Loop, branch, and cleanup 729 ADD_CHUNK(postfix1); 730 buf = addBranch(buf, buf2, 0x01); 731 ADD_CHUNK(postfix2); 732 733 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC); 734 if (ret == -1) { 735 ALOGE("mprotect error %i", ret); 736 return false; 737 } 738 739 FLUSH_CPU_CACHE(mBuf, (char*) mBuf + mBufSize); 740 return true; 741 #else 742 return false; 743 #endif 744 } 745 746 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) { 747 for(int ct=0; ct < 16; ct++) { 748 ip[ct] = (short)(fp[ct] * 256.f + 0.5f); 749 tmpFp[ct] = fp[ct] * fpMul; 750 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]); 751 } 752 753 float add = 0.f; 754 if (fpMul > 254.f) add = 0.5f; 755 for(int ct=0; ct < 4; ct++) { 756 tmpFpa[ct] = fpa[ct] * addMul + add; 757 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]); 758 } 759 760 for(int ct=0; ct < 4; ct++) { 761 ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f); 762 } 763 } 764 765 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data, 766 size_t dataLength) { 767 switch(slot) { 768 case 0: 769 memcpy (fp, data, sizeof(fp)); 770 break; 771 case 1: 772 memcpy (fpa, data, sizeof(fpa)); 773 break; 774 default: 775 rsAssert(0); 776 break; 777 } 778 mRootPtr = &kernel; 779 } 780 781 782 static void One(const RsForEachStubParamStruct *p, void *out, 783 const void *py, const float* coeff, const float *add, 784 uint32_t vsin, uint32_t vsout, bool fin, bool fout) { 785 786 float4 f = 0.f; 787 if (fin) { 788 switch(vsin) { 789 case 3: 790 f = ((const float4 *)py)[0]; 791 break; 792 case 2: 793 f = ((const float4 *)py)[0]; 794 f.w = 0.f; 795 break; 796 case 1: 797 f.xy = ((const float2 *)py)[0]; 798 break; 799 case 0: 800 f.x = ((const float *)py)[0]; 801 break; 802 } 803 } else { 804 switch(vsin) { 805 case 3: 806 f = convert_float4(((const uchar4 *)py)[0]); 807 break; 808 case 2: 809 f = convert_float4(((const uchar4 *)py)[0]); 810 f.w = 0.f; 811 break; 812 case 1: 813 f.xy = convert_float2(((const uchar2 *)py)[0]); 814 break; 815 case 0: 816 f.x = (float)(((const uchar *)py)[0]); 817 break; 818 } 819 } 820 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w); 821 822 float4 sum; 823 sum.x = f.x * coeff[0] + 824 f.y * coeff[4] + 825 f.z * coeff[8] + 826 f.w * coeff[12]; 827 sum.y = f.x * coeff[1] + 828 f.y * coeff[5] + 829 f.z * coeff[9] + 830 f.w * coeff[13]; 831 sum.z = f.x * coeff[2] + 832 f.y * coeff[6] + 833 f.z * coeff[10] + 834 f.w * coeff[14]; 835 sum.w = f.x * coeff[3] + 836 f.y * coeff[7] + 837 f.z * coeff[11] + 838 f.w * coeff[15]; 839 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w); 840 841 sum.x += add[0]; 842 sum.y += add[1]; 843 sum.z += add[2]; 844 sum.w += add[3]; 845 846 847 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w); 848 if (fout) { 849 switch(vsout) { 850 case 3: 851 case 2: 852 ((float4 *)out)[0] = sum; 853 break; 854 case 1: 855 ((float2 *)out)[0] = sum.xy; 856 break; 857 case 0: 858 ((float *)out)[0] = sum.x; 859 break; 860 } 861 } else { 862 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x); 863 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y); 864 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z); 865 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w); 866 867 switch(vsout) { 868 case 3: 869 case 2: 870 ((uchar4 *)out)[0] = convert_uchar4(sum); 871 break; 872 case 1: 873 ((uchar2 *)out)[0] = convert_uchar2(sum.xy); 874 break; 875 case 0: 876 ((uchar *)out)[0] = sum.x; 877 break; 878 } 879 } 880 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]); 881 } 882 883 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p, 884 uint32_t xstart, uint32_t xend, 885 uint32_t instep, uint32_t outstep) { 886 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr; 887 uchar *out = (uchar *)p->out; 888 uchar *in = (uchar *)p->in; 889 uint32_t x1 = xstart; 890 uint32_t x2 = xend; 891 892 uint32_t vsin = cp->mLastKey.u.inVecSize; 893 uint32_t vsout = cp->mLastKey.u.outVecSize; 894 bool floatIn = !!cp->mLastKey.u.inType; 895 bool floatOut = !!cp->mLastKey.u.outType; 896 897 //if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout); 898 899 if(x2 > x1) { 900 int32_t len = x2 - x1; 901 if (gArchUseSIMD) { 902 if((cp->mOptKernel != NULL) && (len >= 4)) { 903 // The optimized kernel processes 4 pixels at once 904 // and requires a minimum of 1 chunk of 4 905 cp->mOptKernel(out, in, cp->ip, len >> 2); 906 // Update the len and pointers so the generic code can 907 // finish any leftover pixels 908 len &= ~3; 909 x1 += len; 910 out += outstep * len; 911 in += instep * len; 912 } 913 #if defined(ARCH_ARM64_USE_INTRINSICS) 914 else { 915 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) { 916 // Currently this generates off by one errors. 917 //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); 918 //x1 += len; 919 //out += outstep * len; 920 //in += instep * len; 921 } else { 922 rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa); 923 x1 += len; 924 out += outstep * len; 925 in += instep * len; 926 } 927 } 928 #endif 929 } 930 931 while(x1 != x2) { 932 One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut); 933 out += outstep; 934 in += instep; 935 x1++; 936 } 937 } 938 } 939 940 void RsdCpuScriptIntrinsicColorMatrix::preLaunch( 941 uint32_t slot, const Allocation * ain, Allocation * aout, 942 const void * usr, uint32_t usrLen, const RsScriptCall *sc) { 943 944 const Element *ein = ain->mHal.state.type->getElement(); 945 const Element *eout = aout->mHal.state.type->getElement(); 946 947 if (ein->getType() == eout->getType()) { 948 if (eout->getType() == RS_TYPE_UNSIGNED_8) { 949 updateCoeffCache(1.f, 255.f); 950 } else { 951 updateCoeffCache(1.f, 1.f); 952 } 953 } else { 954 if (eout->getType() == RS_TYPE_UNSIGNED_8) { 955 updateCoeffCache(255.f, 255.f); 956 } else { 957 updateCoeffCache(1.f / 255.f, 1.f); 958 } 959 } 960 961 Key_t key = computeKey(ain->mHal.state.type->getElement(), 962 aout->mHal.state.type->getElement()); 963 #if defined(ARCH_X86_HAVE_SSSE3) 964 if ((mOptKernel == NULL) || (mLastKey.key != key.key)) { 965 // FIXME: Disable mOptKernel to pass RS color matrix CTS cases 966 // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key); 967 mLastKey = key; 968 } 969 970 #else //if !defined(ARCH_X86_HAVE_SSSE3) 971 if ((mOptKernel == NULL) || (mLastKey.key != key.key)) { 972 if (mBuf) munmap(mBuf, mBufSize); 973 mBuf = NULL; 974 mOptKernel = NULL; 975 if (build(key)) { 976 mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf; 977 } 978 #if defined(ARCH_ARM64_USE_INTRINSICS) 979 else { 980 int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0); 981 int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0); 982 uint32_t mm = 0; 983 int i; 984 for (i = 0; i < 4; i++) 985 { 986 uint32_t m = (key.u.coeffMask >> i) & 0x1111; 987 m = ((m * 0x249) >> 9) & 15; 988 m |= ((key.u.addMask >> i) & 1) << 4; 989 mm |= m << (i * 5); 990 } 991 992 if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) { 993 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st); 994 } else { 995 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st); 996 } 997 } 998 #endif 999 mLastKey = key; 1000 } 1001 #endif //if !defined(ARCH_X86_HAVE_SSSE3) 1002 } 1003 1004 void RsdCpuScriptIntrinsicColorMatrix::postLaunch( 1005 uint32_t slot, const Allocation * ain, Allocation * aout, 1006 const void * usr, uint32_t usrLen, const RsScriptCall *sc) { 1007 1008 } 1009 1010 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix( 1011 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 1012 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) { 1013 1014 mLastKey.key = 0; 1015 mBuf = NULL; 1016 mBufSize = 0; 1017 mOptKernel = NULL; 1018 const static float defaultMatrix[] = { 1019 1.f, 0.f, 0.f, 0.f, 1020 0.f, 1.f, 0.f, 0.f, 1021 0.f, 0.f, 1.f, 0.f, 1022 0.f, 0.f, 0.f, 1.f 1023 }; 1024 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f}; 1025 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix)); 1026 setGlobalVar(1, defaultAdd, sizeof(defaultAdd)); 1027 } 1028 1029 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() { 1030 if (mBuf) munmap(mBuf, mBufSize); 1031 mBuf = NULL; 1032 mOptKernel = NULL; 1033 } 1034 1035 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) { 1036 s->mHal.info.exportedVariableCount = 2; 1037 } 1038 1039 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, 1040 const Script *s, const Element *e) { 1041 1042 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e); 1043 } 1044