1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <sys/mman.h> 18 #include <unistd.h> 19 20 #include "rsCpuIntrinsic.h" 21 #include "rsCpuIntrinsicInlines.h" 22 23 #include <sys/mman.h> 24 #include <stddef.h> 25 #include <stdint.h> 26 #include <stdlib.h> 27 //#include <utils/StopWatch.h> 28 29 30 /* uint kernel 31 * Q0 D0: Load slot for R 32 * D1: Load slot for G 33 * Q1 D2: Load slot for B 34 * D3: Load slot for A 35 * Q2 D4: Matrix 36 * D5: = 37 * Q3 D6: = 38 * D7: = 39 * Q4 D8: Add R 40 * D9: 41 * Q5 D10: Add G 42 * D11: 43 * Q6 D12: Add B 44 * D13: 45 * Q7 D14: Add A 46 * D15: 47 * Q8 D16: I32: R Sum 48 * D17: 49 * Q9 D18: I32: G Sum 50 * D19: 51 * Q10 D20: I32: B Sum 52 * D21: 53 * Q11 D22: I32: A Sum 54 * D23: 55 * Q12 D24: U16: expanded R 56 * D25: 57 * Q13 D26: U16: expanded G 58 * D27: 59 * Q14 D28: U16: expanded B 60 * D29: 61 * Q15 D30: U16: expanded A 62 * D31: 63 * 64 */ 65 66 /* float kernel 67 * Q0 D0: Load slot for R 68 * D1: = 69 * Q1 D2: Load slot for G 70 * D3: = 71 * Q2 D4: Load slot for B 72 * D5: = 73 * Q3 D6: Load slot for A 74 * D7: = 75 * Q4 D8: Matrix 76 * D9: = 77 * Q5 D10: = 78 * D11: = 79 * Q6 D12: = 80 * D13: = 81 * Q7 D14: = 82 * D15: = 83 * Q8 D16: Add R 84 * D17: = 85 * Q9 D18: Add G 86 * D19: = 87 * Q10 D20: Add B 88 * D21: = 89 * Q11 D22: Add A 90 * D23: = 91 * Q12 D24: Sum R 92 * D25: = 93 * Q13 D26: Sum G 94 * D27: = 95 * Q14 D28: Sum B 96 * D29: = 97 * Q15 D30: Sum A 98 * D31: = 99 * 100 */ 101 102 103 104 namespace android { 105 namespace renderscript { 106 107 typedef union { 108 uint64_t key; 109 struct { 110 uint32_t inVecSize :2; // [0 - 1] 111 uint32_t outVecSize :2; // [2 - 3] 112 uint32_t inType :4; // [4 - 7] 113 uint32_t outType :4; // [8 - 11] 114 uint32_t dot :1; // [12] 115 uint32_t _unused1 :1; // [13] 116 uint32_t copyAlpha :1; // [14] 117 uint32_t _unused2 :1; // [15] 118 uint32_t coeffMask :16; // [16-31] 119 uint32_t addMask :4; // [32-35] 120 } u; 121 } Key_t; 122 123 //Re-enable when intrinsic is fixed 124 #if defined(ARCH_ARM64_USE_INTRINSICS) 125 typedef struct { 126 void (*column[4])(void); 127 void (*store)(void); 128 void (*load)(void); 129 void (*store_end)(void); 130 void (*load_end)(void); 131 } FunctionTab_t; 132 133 extern "C" void rsdIntrinsicColorMatrix_int_K( 134 void *out, void const *in, size_t count, 135 FunctionTab_t const *fns, 136 int16_t const *mult, int32_t const *add); 137 138 extern "C" void rsdIntrinsicColorMatrix_float_K( 139 void *out, void const *in, size_t count, 140 FunctionTab_t const *fns, 141 float const *mult, float const *add); 142 143 /* The setup functions fill in function tables to be used by above functions; 144 * this code also eliminates jump-to-another-jump cases by short-circuiting 145 * empty functions. While it's not performance critical, it works out easier 146 * to write the set-up code in assembly than to try to expose the same symbols 147 * and write the code in C. 148 */ 149 extern "C" void rsdIntrinsicColorMatrixSetup_int_K( 150 FunctionTab_t *fns, 151 uint32_t mask, int dt, int st); 152 153 extern "C" void rsdIntrinsicColorMatrixSetup_float_K( 154 FunctionTab_t *fns, 155 uint32_t mask, int dt, int st); 156 #endif 157 158 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic { 159 public: 160 void populateScript(Script *) override; 161 162 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override; 163 164 ~RsdCpuScriptIntrinsicColorMatrix() override; 165 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 166 167 void preLaunch(uint32_t slot, const Allocation ** ains, 168 uint32_t inLen, Allocation * aout, const void * usr, 169 uint32_t usrLen, const RsScriptCall *sc) override; 170 171 protected: 172 float fp[16]; 173 float fpa[4]; 174 175 // The following four fields are read as constants 176 // by the SIMD assembly code. 177 int16_t ip[16]; 178 int ipa[4]; 179 float tmpFp[16]; 180 float tmpFpa[4]; 181 #if defined(ARCH_ARM64_USE_INTRINSICS) 182 FunctionTab_t mFnTab; 183 #endif 184 185 static void kernel(const RsExpandKernelDriverInfo *info, 186 uint32_t xstart, uint32_t xend, 187 uint32_t outstep); 188 void updateCoeffCache(float fpMul, float addMul); 189 190 Key_t mLastKey; 191 unsigned char *mBuf; 192 size_t mBufSize; 193 194 Key_t computeKey(const Element *ein, const Element *eout); 195 196 bool build(Key_t key); 197 198 void (*mOptKernel)(void *dst, const void *src, const int16_t *coef, uint32_t count); 199 200 }; 201 202 203 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey( 204 const Element *ein, const Element *eout) { 205 206 Key_t key; 207 key.key = 0; 208 209 // Compute a unique code key for this operation 210 211 // Add to the key the input and output types 212 bool hasFloat = false; 213 if (ein->getType() == RS_TYPE_FLOAT_32) { 214 hasFloat = true; 215 key.u.inType = RS_TYPE_FLOAT_32; 216 rsAssert(key.u.inType == RS_TYPE_FLOAT_32); 217 } 218 if (eout->getType() == RS_TYPE_FLOAT_32) { 219 hasFloat = true; 220 key.u.outType = RS_TYPE_FLOAT_32; 221 rsAssert(key.u.outType == RS_TYPE_FLOAT_32); 222 } 223 224 // Mask in the bits indicating which coefficients in the 225 // color matrix are needed. 226 if (hasFloat) { 227 for (uint32_t i=0; i < 16; i++) { 228 if (fabs(fp[i]) != 0.f) { 229 key.u.coeffMask |= 1 << i; 230 } 231 } 232 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1; 233 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2; 234 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4; 235 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8; 236 237 } else { 238 for (uint32_t i=0; i < 16; i++) { 239 if (ip[i] != 0) { 240 key.u.coeffMask |= 1 << i; 241 } 242 } 243 if (ipa[0] != 0) key.u.addMask |= 0x1; 244 if (ipa[1] != 0) key.u.addMask |= 0x2; 245 if (ipa[2] != 0) key.u.addMask |= 0x4; 246 if (ipa[3] != 0) key.u.addMask |= 0x8; 247 } 248 249 // Look for a dot product where the r,g,b colums are the same 250 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) && 251 (ip[4] == ip[5]) && (ip[4] == ip[6]) && 252 (ip[8] == ip[9]) && (ip[8] == ip[10]) && 253 (ip[12] == ip[13]) && (ip[12] == ip[14])) { 254 255 if (!key.u.addMask) key.u.dot = 1; 256 } 257 258 // Is alpha a simple copy 259 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) { 260 key.u.copyAlpha = !(key.u.inType || key.u.outType); 261 } 262 263 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 264 265 switch (ein->getVectorSize()) { 266 case 4: 267 key.u.inVecSize = 3; 268 break; 269 case 3: 270 key.u.inVecSize = 2; 271 key.u.coeffMask &= ~0xF000; 272 break; 273 case 2: 274 key.u.inVecSize = 1; 275 key.u.coeffMask &= ~0xFF00; 276 break; 277 default: 278 key.u.coeffMask &= ~0xFFF0; 279 break; 280 } 281 282 switch (eout->getVectorSize()) { 283 case 4: 284 key.u.outVecSize = 3; 285 break; 286 case 3: 287 key.u.outVecSize = 2; 288 key.u.coeffMask &= ~0x8888; 289 key.u.addMask &= 7; 290 break; 291 case 2: 292 key.u.outVecSize = 1; 293 key.u.coeffMask &= ~0xCCCC; 294 key.u.addMask &= 3; 295 break; 296 default: 297 key.u.coeffMask &= ~0xEEEE; 298 key.u.addMask &= 1; 299 break; 300 } 301 302 if (key.u.inType && !key.u.outType) { 303 key.u.addMask |= 1; 304 if (key.u.outVecSize > 0) key.u.addMask |= 2; 305 if (key.u.outVecSize > 1) key.u.addMask |= 4; 306 if (key.u.outVecSize > 2) key.u.addMask |= 8; 307 } 308 309 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 310 return key; 311 } 312 313 } // namespace renderscript 314 } // namespace android 315 316 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) 317 318 #define DEF_SYM(x) \ 319 extern "C" uint32_t _N_ColorMatrix_##x; \ 320 extern "C" uint32_t _N_ColorMatrix_##x##_end; \ 321 extern "C" uint32_t _N_ColorMatrix_##x##_len; 322 323 DEF_SYM(prefix_i) 324 DEF_SYM(prefix_f) 325 DEF_SYM(postfix1) 326 DEF_SYM(postfix2) 327 328 DEF_SYM(load_u8_4) 329 DEF_SYM(load_u8_3) 330 DEF_SYM(load_u8_2) 331 DEF_SYM(load_u8_1) 332 DEF_SYM(load_u8f_4) 333 DEF_SYM(load_u8f_3) 334 DEF_SYM(load_u8f_2) 335 DEF_SYM(load_u8f_1) 336 DEF_SYM(load_f32_4) 337 DEF_SYM(load_f32_3) 338 DEF_SYM(load_f32_2) 339 DEF_SYM(load_f32_1) 340 341 DEF_SYM(store_u8_4) 342 DEF_SYM(store_u8_2) 343 DEF_SYM(store_u8_1) 344 DEF_SYM(store_f32_4) 345 DEF_SYM(store_f32_3) 346 DEF_SYM(store_f32_2) 347 DEF_SYM(store_f32_1) 348 DEF_SYM(store_f32u_4) 349 DEF_SYM(store_f32u_2) 350 DEF_SYM(store_f32u_1) 351 352 DEF_SYM(unpack_u8_4) 353 DEF_SYM(unpack_u8_3) 354 DEF_SYM(unpack_u8_2) 355 DEF_SYM(unpack_u8_1) 356 DEF_SYM(pack_u8_4) 357 DEF_SYM(pack_u8_3) 358 DEF_SYM(pack_u8_2) 359 DEF_SYM(pack_u8_1) 360 DEF_SYM(dot) 361 DEF_SYM(add_0_u8) 362 DEF_SYM(add_1_u8) 363 DEF_SYM(add_2_u8) 364 DEF_SYM(add_3_u8) 365 366 #define ADD_CHUNK(x) \ 367 memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \ 368 buf += _N_ColorMatrix_##x##_len 369 370 371 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) { 372 size_t off = (target - buf - 8) >> 2; 373 rsAssert(((off & 0xff000000) == 0) || 374 ((off & 0xff000000) == 0xff000000)); 375 376 uint32_t op = (condition << 28); 377 op |= 0xa << 24; // branch 378 op |= 0xffffff & off; 379 ((uint32_t *)buf)[0] = op; 380 return buf + 4; 381 } 382 383 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) { 384 rsAssert(vd < 32); 385 rsAssert(vm < 32); 386 rsAssert(vn < 32); 387 388 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22); 389 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5); 390 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7); 391 return op; 392 } 393 394 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 395 //vmlal.s16 Q#1, D#1, D#2[#] 396 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 397 ((uint32_t *)buf)[0] = op; 398 return buf + 4; 399 } 400 401 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 402 //vmull.s16 Q#1, D#1, D#2[#] 403 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 404 ((uint32_t *)buf)[0] = op; 405 return buf + 4; 406 } 407 408 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 409 //vqadd.s32 Q#1, Q#1, Q#2 410 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 411 ((uint32_t *)buf)[0] = op; 412 return buf + 4; 413 } 414 415 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 416 //vmlal.f32 Q#1, D#1, D#2[#] 417 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 418 ((uint32_t *)buf)[0] = op; 419 return buf + 4; 420 } 421 422 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 423 //vmull.f32 Q#1, D#1, D#2[#] 424 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 425 ((uint32_t *)buf)[0] = op; 426 return buf + 4; 427 } 428 429 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 430 //vadd.f32 Q#1, D#1, D#2 431 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 432 ((uint32_t *)buf)[0] = op; 433 return buf + 4; 434 } 435 436 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) { 437 //vmov.32 Q#1, #imm 438 rsAssert(imm == 0); 439 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0); 440 ((uint32_t *)buf)[0] = op; 441 return buf + 4; 442 } 443 444 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 445 //vadd.f32 Q#1, D#1, D#2 446 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 447 ((uint32_t *)buf)[0] = op; 448 return buf + 4; 449 } 450 #endif 451 452 #if defined(ARCH_X86_HAVE_SSSE3) 453 extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, 454 const int16_t *coef, uint32_t count); 455 extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, 456 const int16_t *coef, uint32_t count); 457 extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, 458 const int16_t *coef, uint32_t count); 459 460 using android::renderscript::Key_t; 461 462 void * selectKernel(Key_t key) 463 { 464 void * kernel = nullptr; 465 466 // inType, outType float if nonzero 467 if (!(key.u.inType || key.u.outType)) { 468 if (key.u.dot) 469 kernel = (void *)rsdIntrinsicColorMatrixDot_K; 470 else if (key.u.copyAlpha) 471 kernel = (void *)rsdIntrinsicColorMatrix3x3_K; 472 else 473 kernel = (void *)rsdIntrinsicColorMatrix4x4_K; 474 } 475 476 return kernel; 477 } 478 #endif 479 480 namespace android { 481 namespace renderscript { 482 483 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) { 484 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) 485 mBufSize = 4096; 486 //StopWatch build_time("rs cm: build time"); 487 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE, 488 MAP_PRIVATE | MAP_ANON, -1, 0); 489 if (mBuf == MAP_FAILED) { 490 mBuf = NULL; 491 return false; 492 } 493 494 uint8_t *buf = mBuf; 495 uint8_t *buf2 = nullptr; 496 497 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final 498 int opInit[4] = {0, 0, 0, 0}; 499 500 memset(ops, 0, sizeof(ops)); 501 for (int i=0; i < 4; i++) { 502 if (key.u.coeffMask & (1 << (i*4))) { 503 ops[i][0] = 0x2 | opInit[0]; 504 opInit[0] = 1; 505 } 506 if (!key.u.dot) { 507 if (key.u.coeffMask & (1 << (1 + i*4))) { 508 ops[i][1] = 0x2 | opInit[1]; 509 opInit[1] = 1; 510 } 511 if (key.u.coeffMask & (1 << (2 + i*4))) { 512 ops[i][2] = 0x2 | opInit[2]; 513 opInit[2] = 1; 514 } 515 } 516 if (!key.u.copyAlpha) { 517 if (key.u.coeffMask & (1 << (3 + i*4))) { 518 ops[i][3] = 0x2 | opInit[3]; 519 opInit[3] = 1; 520 } 521 } 522 } 523 524 if (key.u.inType || key.u.outType) { 525 key.u.copyAlpha = 0; 526 ADD_CHUNK(prefix_f); 527 buf2 = buf; 528 529 // Load the incoming r,g,b,a as needed 530 if (key.u.inType) { 531 switch(key.u.inVecSize) { 532 case 3: 533 ADD_CHUNK(load_f32_4); 534 break; 535 case 2: 536 ADD_CHUNK(load_f32_3); 537 break; 538 case 1: 539 ADD_CHUNK(load_f32_2); 540 break; 541 case 0: 542 ADD_CHUNK(load_f32_1); 543 break; 544 } 545 } else { 546 switch(key.u.inVecSize) { 547 case 3: 548 ADD_CHUNK(load_u8f_4); 549 break; 550 case 2: 551 ADD_CHUNK(load_u8f_3); 552 break; 553 case 1: 554 ADD_CHUNK(load_u8f_2); 555 break; 556 case 0: 557 ADD_CHUNK(load_u8f_1); 558 break; 559 } 560 } 561 562 for (int i=0; i < 4; i++) { 563 for (int j=0; j < 4; j++) { 564 switch(ops[i][j]) { 565 case 0: 566 break; 567 case 2: 568 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 569 break; 570 case 3: 571 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 572 break; 573 } 574 } 575 } 576 for (int j=0; j < 4; j++) { 577 if (opInit[j]) { 578 if (key.u.addMask & (1 << j)) { 579 buf = addVADD_F32(buf, j, 12+j, 8+j); 580 } else { 581 buf = addVORR_32(buf, j, 12+j, 12+j); 582 } 583 } else { 584 if (key.u.addMask & (1 << j)) { 585 buf = addVORR_32(buf, j, 8+j, 8+j); 586 } else { 587 buf = addVMOV_32(buf, j, 0); 588 } 589 } 590 } 591 592 if (key.u.outType) { 593 switch(key.u.outVecSize) { 594 case 3: 595 ADD_CHUNK(store_f32_4); 596 break; 597 case 2: 598 ADD_CHUNK(store_f32_3); 599 break; 600 case 1: 601 ADD_CHUNK(store_f32_2); 602 break; 603 case 0: 604 ADD_CHUNK(store_f32_1); 605 break; 606 } 607 } else { 608 switch(key.u.outVecSize) { 609 case 3: 610 case 2: 611 ADD_CHUNK(store_f32u_4); 612 break; 613 case 1: 614 ADD_CHUNK(store_f32u_2); 615 break; 616 case 0: 617 ADD_CHUNK(store_f32u_1); 618 break; 619 } 620 } 621 622 623 } else { 624 // Add the function prefix 625 // Store the address for the loop return 626 ADD_CHUNK(prefix_i); 627 buf2 = buf; 628 629 // Load the incoming r,g,b,a as needed 630 switch(key.u.inVecSize) { 631 case 3: 632 ADD_CHUNK(load_u8_4); 633 if (key.u.copyAlpha) { 634 ADD_CHUNK(unpack_u8_3); 635 } else { 636 ADD_CHUNK(unpack_u8_4); 637 } 638 break; 639 case 2: 640 ADD_CHUNK(load_u8_3); 641 ADD_CHUNK(unpack_u8_3); 642 break; 643 case 1: 644 ADD_CHUNK(load_u8_2); 645 ADD_CHUNK(unpack_u8_2); 646 break; 647 case 0: 648 ADD_CHUNK(load_u8_1); 649 ADD_CHUNK(unpack_u8_1); 650 break; 651 } 652 653 // Add multiply and accumulate 654 // use MULL to init the output register, 655 // use MLAL from there 656 for (int i=0; i < 4; i++) { 657 for (int j=0; j < 4; j++) { 658 switch(ops[i][j]) { 659 case 0: 660 break; 661 case 2: 662 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j); 663 break; 664 case 3: 665 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j); 666 break; 667 } 668 } 669 } 670 for (int j=0; j < 4; j++) { 671 if (opInit[j]) { 672 if (key.u.addMask & (1 << j)) { 673 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j); 674 } 675 } else { 676 if (key.u.addMask & (1 << j)) { 677 buf = addVORR_32(buf, 8+j, 4+j, 4+j); 678 } 679 } 680 } 681 682 // If we have a dot product, perform the special pack. 683 if (key.u.dot) { 684 ADD_CHUNK(pack_u8_1); 685 ADD_CHUNK(dot); 686 } else { 687 switch(key.u.outVecSize) { 688 case 3: 689 if (key.u.copyAlpha) { 690 ADD_CHUNK(pack_u8_3); 691 } else { 692 ADD_CHUNK(pack_u8_4); 693 } 694 break; 695 case 2: 696 ADD_CHUNK(pack_u8_3); 697 break; 698 case 1: 699 ADD_CHUNK(pack_u8_2); 700 break; 701 case 0: 702 ADD_CHUNK(pack_u8_1); 703 break; 704 } 705 } 706 707 // Write out result 708 switch(key.u.outVecSize) { 709 case 3: 710 case 2: 711 ADD_CHUNK(store_u8_4); 712 break; 713 case 1: 714 ADD_CHUNK(store_u8_2); 715 break; 716 case 0: 717 ADD_CHUNK(store_u8_1); 718 break; 719 } 720 } 721 722 if (key.u.inType != key.u.outType) { 723 key.u.copyAlpha = 0; 724 key.u.dot = 0; 725 } 726 727 // Loop, branch, and cleanup 728 ADD_CHUNK(postfix1); 729 buf = addBranch(buf, buf2, 0x01); 730 ADD_CHUNK(postfix2); 731 732 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC); 733 if (ret == -1) { 734 ALOGE("mprotect error %i", ret); 735 return false; 736 } 737 738 __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize); 739 return true; 740 #else 741 return false; 742 #endif 743 } 744 745 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) { 746 for(int ct=0; ct < 16; ct++) { 747 ip[ct] = (int16_t)(fp[ct] * 256.f + 0.5f); 748 tmpFp[ct] = fp[ct] * fpMul; 749 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]); 750 } 751 752 float add = 0.f; 753 if (fpMul > 254.f) add = 0.5f; 754 for(int ct=0; ct < 4; ct++) { 755 tmpFpa[ct] = fpa[ct] * addMul + add; 756 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]); 757 } 758 759 for(int ct=0; ct < 4; ct++) { 760 ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f); 761 } 762 } 763 764 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data, 765 size_t dataLength) { 766 switch(slot) { 767 case 0: 768 memcpy (fp, data, sizeof(fp)); 769 break; 770 case 1: 771 memcpy (fpa, data, sizeof(fpa)); 772 break; 773 default: 774 rsAssert(0); 775 break; 776 } 777 mRootPtr = &kernel; 778 } 779 780 781 static void One(const RsExpandKernelDriverInfo *info, void *out, 782 const void *py, const float* coeff, const float *add, 783 uint32_t vsin, uint32_t vsout, bool fin, bool fout) { 784 785 float4 f = 0.f; 786 if (fin) { 787 switch(vsin) { 788 case 3: 789 f = ((const float4 *)py)[0]; 790 break; 791 case 2: 792 f = ((const float4 *)py)[0]; 793 f.w = 0.f; 794 break; 795 case 1: 796 f.xy = ((const float2 *)py)[0]; 797 break; 798 case 0: 799 f.x = ((const float *)py)[0]; 800 break; 801 } 802 } else { 803 switch(vsin) { 804 case 3: 805 f = convert_float4(((const uchar4 *)py)[0]); 806 break; 807 case 2: 808 f = convert_float4(((const uchar4 *)py)[0]); 809 f.w = 0.f; 810 break; 811 case 1: 812 f.xy = convert_float2(((const uchar2 *)py)[0]); 813 break; 814 case 0: 815 f.x = (float)(((const uchar *)py)[0]); 816 break; 817 } 818 } 819 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w); 820 821 float4 sum; 822 sum.x = f.x * coeff[0] + 823 f.y * coeff[4] + 824 f.z * coeff[8] + 825 f.w * coeff[12]; 826 sum.y = f.x * coeff[1] + 827 f.y * coeff[5] + 828 f.z * coeff[9] + 829 f.w * coeff[13]; 830 sum.z = f.x * coeff[2] + 831 f.y * coeff[6] + 832 f.z * coeff[10] + 833 f.w * coeff[14]; 834 sum.w = f.x * coeff[3] + 835 f.y * coeff[7] + 836 f.z * coeff[11] + 837 f.w * coeff[15]; 838 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w); 839 840 sum.x += add[0]; 841 sum.y += add[1]; 842 sum.z += add[2]; 843 sum.w += add[3]; 844 845 846 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w); 847 if (fout) { 848 switch(vsout) { 849 case 3: 850 case 2: 851 ((float4 *)out)[0] = sum; 852 break; 853 case 1: 854 ((float2 *)out)[0] = sum.xy; 855 break; 856 case 0: 857 ((float *)out)[0] = sum.x; 858 break; 859 } 860 } else { 861 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x); 862 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y); 863 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z); 864 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w); 865 866 switch(vsout) { 867 case 3: 868 case 2: 869 ((uchar4 *)out)[0] = convert_uchar4(sum); 870 break; 871 case 1: 872 ((uchar2 *)out)[0] = convert_uchar2(sum.xy); 873 break; 874 case 0: 875 ((uchar *)out)[0] = sum.x; 876 break; 877 } 878 } 879 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]); 880 } 881 882 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info, 883 uint32_t xstart, uint32_t xend, 884 uint32_t outstep) { 885 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr; 886 887 uint32_t instep = info->inStride[0]; 888 889 uchar *out = (uchar *)info->outPtr[0]; 890 uchar *in = (uchar *)info->inPtr[0]; 891 uint32_t x1 = xstart; 892 uint32_t x2 = xend; 893 894 uint32_t vsin = cp->mLastKey.u.inVecSize; 895 uint32_t vsout = cp->mLastKey.u.outVecSize; 896 bool floatIn = !!cp->mLastKey.u.inType; 897 bool floatOut = !!cp->mLastKey.u.outType; 898 899 //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout); 900 901 if(x2 > x1) { 902 int32_t len = x2 - x1; 903 if (gArchUseSIMD) { 904 if((cp->mOptKernel != nullptr) && (len >= 4)) { 905 // The optimized kernel processes 4 pixels at once 906 // and requires a minimum of 1 chunk of 4 907 cp->mOptKernel(out, in, cp->ip, len >> 2); 908 // Update the len and pointers so the generic code can 909 // finish any leftover pixels 910 len &= ~3; 911 x1 += len; 912 out += outstep * len; 913 in += instep * len; 914 } 915 #if defined(ARCH_ARM64_USE_INTRINSICS) 916 else { 917 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) { 918 // Currently this generates off by one errors. 919 //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); 920 //x1 += len; 921 //out += outstep * len; 922 //in += instep * len; 923 } else { 924 rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa); 925 x1 += len; 926 out += outstep * len; 927 in += instep * len; 928 } 929 } 930 #endif 931 } 932 933 while(x1 != x2) { 934 One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut); 935 out += outstep; 936 in += instep; 937 x1++; 938 } 939 } 940 } 941 942 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot, 943 const Allocation ** ains, 944 uint32_t inLen, 945 Allocation * aout, 946 const void * usr, 947 uint32_t usrLen, 948 const RsScriptCall *sc) { 949 950 const Element *ein = ains[0]->mHal.state.type->getElement(); 951 const Element *eout = aout->mHal.state.type->getElement(); 952 953 if (ein->getType() == eout->getType()) { 954 if (eout->getType() == RS_TYPE_UNSIGNED_8) { 955 updateCoeffCache(1.f, 255.f); 956 } else { 957 updateCoeffCache(1.f, 1.f); 958 } 959 } else { 960 if (eout->getType() == RS_TYPE_UNSIGNED_8) { 961 updateCoeffCache(255.f, 255.f); 962 } else { 963 updateCoeffCache(1.f / 255.f, 1.f); 964 } 965 } 966 967 Key_t key = computeKey(ein, eout); 968 969 #if defined(ARCH_X86_HAVE_SSSE3) 970 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { 971 // FIXME: Disable mOptKernel to pass RS color matrix CTS cases 972 // mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key); 973 mLastKey = key; 974 } 975 976 #else //if !defined(ARCH_X86_HAVE_SSSE3) 977 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { 978 if (mBuf) munmap(mBuf, mBufSize); 979 mBuf = nullptr; 980 mOptKernel = nullptr; 981 if (build(key)) { 982 mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf; 983 } 984 #if defined(ARCH_ARM64_USE_INTRINSICS) 985 else { 986 int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0); 987 int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0); 988 uint32_t mm = 0; 989 int i; 990 for (i = 0; i < 4; i++) 991 { 992 uint32_t m = (key.u.coeffMask >> i) & 0x1111; 993 m = ((m * 0x249) >> 9) & 15; 994 m |= ((key.u.addMask >> i) & 1) << 4; 995 mm |= m << (i * 5); 996 } 997 998 if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) { 999 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st); 1000 } else { 1001 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st); 1002 } 1003 } 1004 #endif 1005 mLastKey = key; 1006 } 1007 #endif //if !defined(ARCH_X86_HAVE_SSSE3) 1008 } 1009 1010 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix( 1011 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 1012 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) { 1013 1014 mLastKey.key = 0; 1015 mBuf = nullptr; 1016 mBufSize = 0; 1017 mOptKernel = nullptr; 1018 const static float defaultMatrix[] = { 1019 1.f, 0.f, 0.f, 0.f, 1020 0.f, 1.f, 0.f, 0.f, 1021 0.f, 0.f, 1.f, 0.f, 1022 0.f, 0.f, 0.f, 1.f 1023 }; 1024 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f}; 1025 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix)); 1026 setGlobalVar(1, defaultAdd, sizeof(defaultAdd)); 1027 } 1028 1029 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() { 1030 if (mBuf) munmap(mBuf, mBufSize); 1031 mBuf = nullptr; 1032 mOptKernel = nullptr; 1033 } 1034 1035 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) { 1036 s->mHal.info.exportedVariableCount = 2; 1037 } 1038 1039 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, 1040 const Script *s, const Element *e) { 1041 1042 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e); 1043 } 1044 1045 } // namespace renderscript 1046 } // namespace android 1047