1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <sys/mman.h> 18 #include <unistd.h> 19 20 #include "rsCpuIntrinsic.h" 21 #include "rsCpuIntrinsicInlines.h" 22 #include "linkloader/include/MemChunk.h" 23 24 #include <sys/mman.h> 25 #include <stddef.h> 26 #include <stdint.h> 27 #include <stdlib.h> 28 //#include <utils/StopWatch.h> 29 30 31 /* uint kernel 32 * Q0 D0: Load slot for R 33 * D1: Load slot for G 34 * Q1 D2: Load slot for B 35 * D3: Load slot for A 36 * Q2 D4: Matrix 37 * D5: = 38 * Q3 D6: = 39 * D7: = 40 * Q4 D8: Add R 41 * D9: 42 * Q5 D10: Add G 43 * D11: 44 * Q6 D12: Add B 45 * D13: 46 * Q7 D14: Add A 47 * D15: 48 * Q8 D16: I32: R Sum 49 * D17: 50 * Q9 D18: I32: G Sum 51 * D19: 52 * Q10 D20: I32: B Sum 53 * D21: 54 * Q11 D22: I32: A Sum 55 * D23: 56 * Q12 D24: U16: expanded R 57 * D25: 58 * Q13 D26: U16: expanded G 59 * D27: 60 * Q14 D28: U16: expanded B 61 * D29: 62 * Q15 D30: U16: expanded A 63 * D31: 64 * 65 */ 66 67 /* float kernel 68 * Q0 D0: Load slot for R 69 * D1: = 70 * Q1 D2: Load slot for G 71 * D3: = 72 * Q2 D4: Load slot for B 73 * D5: = 74 * Q3 D6: Load slot for A 75 * D7: = 76 * Q4 D8: Matrix 77 * D9: = 78 * Q5 D10: = 79 * D11: = 80 * Q6 D12: = 81 * D13: = 82 * Q7 D14: = 83 * D15: = 84 * Q8 D16: Add R 85 * D17: = 86 * Q9 D18: Add G 87 * D19: = 88 * Q10 D20: Add B 89 * D21: = 90 * Q11 D22: Add A 91 * D23: = 92 * Q12 D24: Sum R 93 * D25: = 94 * Q13 D26: Sum G 95 * D27: = 96 * Q14 D28: Sum B 97 * D29: = 98 * Q15 D30: Sum A 99 * D31: = 100 * 101 */ 102 103 104 105 using namespace android; 106 using namespace android::renderscript; 107 108 namespace android { 109 namespace renderscript { 110 111 typedef union { 112 uint64_t key; 113 struct { 114 uint32_t inVecSize :2; // [0 - 1] 115 uint32_t outVecSize :2; // [2 - 3] 116 uint32_t inType :4; // [4 - 7] 117 uint32_t outType :4; // [8 - 11] 118 uint32_t dot :1; // [12] 119 uint32_t _unused1 :1; // [13] 120 uint32_t copyAlpha :1; // [14] 121 uint32_t _unused2 :1; // [15] 122 uint32_t coeffMask :16; // [16-31] 123 uint32_t addMask :4; // [32-35] 124 } u; 125 } Key_t; 126 127 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic { 128 public: 129 virtual void populateScript(Script *); 130 131 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength); 132 133 virtual ~RsdCpuScriptIntrinsicColorMatrix(); 134 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 135 136 virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout, 137 const void * usr, uint32_t usrLen, const RsScriptCall *sc); 138 virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout, 139 const void * usr, uint32_t usrLen, const RsScriptCall *sc); 140 141 protected: 142 float fp[16]; 143 float fpa[4]; 144 145 // The following four fields are read as constants 146 // by the SIMD assembly code. 147 short ip[16]; 148 int ipa[16]; 149 float tmpFp[16]; 150 float tmpFpa[16]; 151 152 static void kernel(const RsForEachStubParamStruct *p, 153 uint32_t xstart, uint32_t xend, 154 uint32_t instep, uint32_t outstep); 155 void updateCoeffCache(float fpMul, float addMul); 156 157 Key_t mLastKey; 158 unsigned char *mBuf; 159 size_t mBufSize; 160 161 Key_t computeKey(const Element *ein, const Element *eout); 162 163 bool build(Key_t key); 164 165 void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count); 166 167 }; 168 169 } 170 } 171 172 173 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey( 174 const Element *ein, const Element *eout) { 175 176 Key_t key; 177 key.key = 0; 178 179 // Compute a unique code key for this operation 180 181 // Add to the key the input and output types 182 bool hasFloat = false; 183 if (ein->getType() == RS_TYPE_FLOAT_32) { 184 hasFloat = true; 185 key.u.inType = RS_TYPE_FLOAT_32; 186 rsAssert(key.u.inType == RS_TYPE_FLOAT_32); 187 } 188 if (eout->getType() == RS_TYPE_FLOAT_32) { 189 hasFloat = true; 190 key.u.outType = RS_TYPE_FLOAT_32; 191 rsAssert(key.u.outType == RS_TYPE_FLOAT_32); 192 } 193 194 // Mask in the bits indicating which coefficients in the 195 // color matrix are needed. 196 if (hasFloat) { 197 for (uint32_t i=0; i < 16; i++) { 198 if (fabs(fp[i]) != 0.f) { 199 key.u.coeffMask |= 1 << i; 200 } 201 } 202 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1; 203 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2; 204 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4; 205 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8; 206 207 } else { 208 for (uint32_t i=0; i < 16; i++) { 209 if (ip[i] != 0) { 210 key.u.coeffMask |= 1 << i; 211 } 212 } 213 if (ipa[0] != 0) key.u.addMask |= 0x1; 214 if (ipa[4] != 0) key.u.addMask |= 0x2; 215 if (ipa[8] != 0) key.u.addMask |= 0x4; 216 if (ipa[12] != 0) key.u.addMask |= 0x8; 217 } 218 219 // Look for a dot product where the r,g,b colums are the same 220 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) && 221 (ip[4] == ip[5]) && (ip[4] == ip[6]) && 222 (ip[8] == ip[9]) && (ip[8] == ip[10]) && 223 (ip[12] == ip[13]) && (ip[12] == ip[14])) { 224 225 if (!key.u.addMask) key.u.dot = 1; 226 } 227 228 // Is alpha a simple copy 229 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) { 230 key.u.copyAlpha = !(key.u.inType || key.u.outType); 231 } 232 233 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 234 235 switch (ein->getVectorSize()) { 236 case 4: 237 key.u.inVecSize = 3; 238 break; 239 case 3: 240 key.u.inVecSize = 2; 241 key.u.coeffMask &= ~0xF000; 242 break; 243 case 2: 244 key.u.inVecSize = 1; 245 key.u.coeffMask &= ~0xFF00; 246 break; 247 default: 248 key.u.coeffMask &= ~0xFFF0; 249 break; 250 } 251 252 switch (eout->getVectorSize()) { 253 case 4: 254 key.u.outVecSize = 3; 255 break; 256 case 3: 257 key.u.outVecSize = 2; 258 key.u.coeffMask &= ~0x8888; 259 break; 260 case 2: 261 key.u.outVecSize = 1; 262 key.u.coeffMask &= ~0xCCCC; 263 break; 264 default: 265 key.u.coeffMask &= ~0xEEEE; 266 break; 267 } 268 269 if (key.u.inType && !key.u.outType) { 270 key.u.addMask |= 1; 271 if (key.u.outVecSize > 0) key.u.addMask |= 2; 272 if (key.u.outVecSize > 1) key.u.addMask |= 4; 273 if (key.u.outVecSize > 2) key.u.addMask |= 8; 274 } 275 276 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 277 return key; 278 } 279 280 #if defined(ARCH_ARM_HAVE_NEON) 281 282 #define DEF_SYM(x) \ 283 extern "C" uint32_t _N_ColorMatrix_##x; \ 284 extern "C" uint32_t _N_ColorMatrix_##x##_end; \ 285 extern "C" uint32_t _N_ColorMatrix_##x##_len; 286 287 DEF_SYM(prefix_i) 288 DEF_SYM(prefix_f) 289 DEF_SYM(postfix1) 290 DEF_SYM(postfix2) 291 292 DEF_SYM(load_u8_4) 293 DEF_SYM(load_u8_3) 294 DEF_SYM(load_u8_2) 295 DEF_SYM(load_u8_1) 296 DEF_SYM(load_u8f_4) 297 DEF_SYM(load_u8f_3) 298 DEF_SYM(load_u8f_2) 299 DEF_SYM(load_u8f_1) 300 DEF_SYM(load_f32_4) 301 DEF_SYM(load_f32_3) 302 DEF_SYM(load_f32_2) 303 DEF_SYM(load_f32_1) 304 305 DEF_SYM(store_u8_4) 306 DEF_SYM(store_u8_2) 307 DEF_SYM(store_u8_1) 308 DEF_SYM(store_f32_4) 309 DEF_SYM(store_f32_3) 310 DEF_SYM(store_f32_2) 311 DEF_SYM(store_f32_1) 312 DEF_SYM(store_f32u_4) 313 DEF_SYM(store_f32u_2) 314 DEF_SYM(store_f32u_1) 315 316 DEF_SYM(unpack_u8_4) 317 DEF_SYM(unpack_u8_3) 318 DEF_SYM(unpack_u8_2) 319 DEF_SYM(unpack_u8_1) 320 DEF_SYM(pack_u8_4) 321 DEF_SYM(pack_u8_3) 322 DEF_SYM(pack_u8_2) 323 DEF_SYM(pack_u8_1) 324 DEF_SYM(dot) 325 DEF_SYM(add_0_u8) 326 DEF_SYM(add_1_u8) 327 DEF_SYM(add_2_u8) 328 DEF_SYM(add_3_u8) 329 330 #define ADD_CHUNK(x) \ 331 memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \ 332 buf += _N_ColorMatrix_##x##_len 333 334 335 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) { 336 size_t off = (target - buf - 8) >> 2; 337 rsAssert(((off & 0xff000000) == 0) || 338 ((off & 0xff000000) == 0xff000000)); 339 340 uint32_t op = (condition << 28); 341 op |= 0xa << 24; // branch 342 op |= 0xffffff & off; 343 ((uint32_t *)buf)[0] = op; 344 return buf + 4; 345 } 346 347 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) { 348 rsAssert(vd < 32); 349 rsAssert(vm < 32); 350 rsAssert(vn < 32); 351 352 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22); 353 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5); 354 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7); 355 return op; 356 } 357 358 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 359 //vmlal.s16 Q#1, D#1, D#2[#] 360 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 361 ((uint32_t *)buf)[0] = op; 362 return buf + 4; 363 } 364 365 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 366 //vmull.s16 Q#1, D#1, D#2[#] 367 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 368 ((uint32_t *)buf)[0] = op; 369 return buf + 4; 370 } 371 372 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 373 //vqadd.s32 Q#1, D#1, D#2 374 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 375 ((uint32_t *)buf)[0] = op; 376 return buf + 4; 377 } 378 379 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 380 //vmlal.f32 Q#1, D#1, D#2[#] 381 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 382 ((uint32_t *)buf)[0] = op; 383 return buf + 4; 384 } 385 386 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 387 //vmull.f32 Q#1, D#1, D#2[#] 388 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 389 ((uint32_t *)buf)[0] = op; 390 return buf + 4; 391 } 392 393 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 394 //vadd.f32 Q#1, D#1, D#2 395 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 396 ((uint32_t *)buf)[0] = op; 397 return buf + 4; 398 } 399 400 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 401 //vadd.f32 Q#1, D#1, D#2 402 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 403 ((uint32_t *)buf)[0] = op; 404 return buf + 4; 405 } 406 #endif 407 408 409 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) { 410 #if defined(ARCH_ARM_HAVE_NEON) 411 mBufSize = 4096; 412 //StopWatch build_time("rs cm: build time"); 413 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE, 414 MAP_PRIVATE | MAP_ANON, -1, 0); 415 if (!mBuf) { 416 return false; 417 } 418 419 uint8_t *buf = mBuf; 420 uint8_t *buf2 = NULL; 421 422 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final 423 int opInit[4] = {0, 0, 0, 0}; 424 425 memset(ops, 0, sizeof(ops)); 426 for (int i=0; i < 4; i++) { 427 if (key.u.coeffMask & (1 << (i*4))) { 428 ops[i][0] = 0x2 | opInit[0]; 429 opInit[0] = 1; 430 } 431 if (!key.u.dot) { 432 if (key.u.coeffMask & (1 << (1 + i*4))) { 433 ops[i][1] = 0x2 | opInit[1]; 434 opInit[1] = 1; 435 } 436 if (key.u.coeffMask & (1 << (2 + i*4))) { 437 ops[i][2] = 0x2 | opInit[2]; 438 opInit[2] = 1; 439 } 440 } 441 if (!key.u.copyAlpha) { 442 if (key.u.coeffMask & (1 << (3 + i*4))) { 443 ops[i][3] = 0x2 | opInit[3]; 444 opInit[3] = 1; 445 } 446 } 447 } 448 449 if (key.u.inType || key.u.outType) { 450 key.u.copyAlpha = 0; 451 ADD_CHUNK(prefix_f); 452 buf2 = buf; 453 454 // Load the incoming r,g,b,a as needed 455 if (key.u.inType) { 456 switch(key.u.inVecSize) { 457 case 3: 458 ADD_CHUNK(load_f32_4); 459 break; 460 case 2: 461 ADD_CHUNK(load_f32_3); 462 break; 463 case 1: 464 ADD_CHUNK(load_f32_2); 465 break; 466 case 0: 467 ADD_CHUNK(load_f32_1); 468 break; 469 } 470 } else { 471 switch(key.u.inVecSize) { 472 case 3: 473 ADD_CHUNK(load_u8f_4); 474 break; 475 case 2: 476 ADD_CHUNK(load_u8f_3); 477 break; 478 case 1: 479 ADD_CHUNK(load_u8f_2); 480 break; 481 case 0: 482 ADD_CHUNK(load_u8f_1); 483 break; 484 } 485 } 486 487 for (int i=0; i < 4; i++) { 488 for (int j=0; j < 4; j++) { 489 switch(ops[i][j]) { 490 case 0: 491 break; 492 case 2: 493 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 494 break; 495 case 3: 496 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 497 break; 498 } 499 } 500 } 501 for (int j=0; j < 4; j++) { 502 if (opInit[j]) { 503 if (key.u.addMask & (1 << j)) { 504 buf = addVADD_F32(buf, j, 12+j, 8+j); 505 } else { 506 buf = addVORR_32(buf, j, 12+j, 12+j); 507 } 508 } else { 509 if (key.u.addMask & (1 << j)) { 510 buf = addVADD_F32(buf, j, j, 8+j); 511 } 512 } 513 } 514 515 if (key.u.outType) { 516 switch(key.u.outVecSize) { 517 case 3: 518 ADD_CHUNK(store_f32_4); 519 break; 520 case 2: 521 ADD_CHUNK(store_f32_3); 522 break; 523 case 1: 524 ADD_CHUNK(store_f32_2); 525 break; 526 case 0: 527 ADD_CHUNK(store_f32_1); 528 break; 529 } 530 } else { 531 switch(key.u.outVecSize) { 532 case 3: 533 case 2: 534 ADD_CHUNK(store_f32u_4); 535 break; 536 case 1: 537 ADD_CHUNK(store_f32u_2); 538 break; 539 case 0: 540 ADD_CHUNK(store_f32u_1); 541 break; 542 } 543 } 544 545 546 } else { 547 // Add the function prefix 548 // Store the address for the loop return 549 ADD_CHUNK(prefix_i); 550 buf2 = buf; 551 552 // Load the incoming r,g,b,a as needed 553 switch(key.u.inVecSize) { 554 case 3: 555 ADD_CHUNK(load_u8_4); 556 if (key.u.copyAlpha) { 557 ADD_CHUNK(unpack_u8_3); 558 } else { 559 ADD_CHUNK(unpack_u8_4); 560 } 561 break; 562 case 2: 563 ADD_CHUNK(load_u8_3); 564 ADD_CHUNK(unpack_u8_3); 565 break; 566 case 1: 567 ADD_CHUNK(load_u8_2); 568 ADD_CHUNK(unpack_u8_2); 569 break; 570 case 0: 571 ADD_CHUNK(load_u8_1); 572 ADD_CHUNK(unpack_u8_1); 573 break; 574 } 575 576 // Add multiply and accumulate 577 // use MULL to init the output register, 578 // use MLAL from there 579 for (int i=0; i < 4; i++) { 580 for (int j=0; j < 4; j++) { 581 switch(ops[i][j]) { 582 case 0: 583 break; 584 case 2: 585 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j); 586 break; 587 case 3: 588 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j); 589 break; 590 } 591 } 592 } 593 for (int j=0; j < 4; j++) { 594 if (opInit[j]) { 595 if (key.u.addMask & (1 << j)) { 596 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j); 597 } 598 } else { 599 if (key.u.addMask & (1 << j)) { 600 buf = addVQADD_S32(buf, 8+j, 12+j, 4+j); 601 } 602 } 603 } 604 605 // If we have a dot product, perform the special pack. 606 if (key.u.dot) { 607 ADD_CHUNK(pack_u8_1); 608 ADD_CHUNK(dot); 609 } else { 610 switch(key.u.outVecSize) { 611 case 3: 612 if (key.u.copyAlpha) { 613 ADD_CHUNK(pack_u8_3); 614 } else { 615 ADD_CHUNK(pack_u8_4); 616 } 617 break; 618 case 2: 619 ADD_CHUNK(pack_u8_3); 620 break; 621 case 1: 622 ADD_CHUNK(pack_u8_2); 623 break; 624 case 0: 625 ADD_CHUNK(pack_u8_1); 626 break; 627 } 628 } 629 630 // Write out result 631 switch(key.u.outVecSize) { 632 case 3: 633 case 2: 634 ADD_CHUNK(store_u8_4); 635 break; 636 case 1: 637 ADD_CHUNK(store_u8_2); 638 break; 639 case 0: 640 ADD_CHUNK(store_u8_1); 641 break; 642 } 643 } 644 645 if (key.u.inType != key.u.outType) { 646 key.u.copyAlpha = 0; 647 key.u.dot = 0; 648 } 649 650 // Loop, branch, and cleanup 651 ADD_CHUNK(postfix1); 652 buf = addBranch(buf, buf2, 0x01); 653 ADD_CHUNK(postfix2); 654 655 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC); 656 if (ret == -1) { 657 ALOGE("mprotect error %i", ret); 658 return false; 659 } 660 661 cacheflush((long)mBuf, (long)mBuf + mBufSize, 0); 662 return true; 663 #else 664 return false; 665 #endif 666 } 667 668 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) { 669 for(int ct=0; ct < 16; ct++) { 670 ip[ct] = (short)(fp[ct] * 256.f + 0.5f); 671 tmpFp[ct] = fp[ct] * fpMul; 672 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]); 673 } 674 675 float add = 0.f; 676 if (fpMul > 254.f) add = 0.5f; 677 for(int ct=0; ct < 4; ct++) { 678 tmpFpa[ct * 4 + 0] = fpa[ct] * addMul + add; 679 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]); 680 tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4]; 681 tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4]; 682 tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4]; 683 } 684 685 for(int ct=0; ct < 4; ct++) { 686 ipa[ct * 4 + 0] = (int)(fpa[ct] * 65536.f + 0.5f); 687 ipa[ct * 4 + 1] = ipa[ct * 4]; 688 ipa[ct * 4 + 2] = ipa[ct * 4]; 689 ipa[ct * 4 + 3] = ipa[ct * 4]; 690 } 691 } 692 693 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data, 694 size_t dataLength) { 695 switch(slot) { 696 case 0: 697 memcpy (fp, data, sizeof(fp)); 698 break; 699 case 1: 700 memcpy (fpa, data, sizeof(fpa)); 701 break; 702 default: 703 rsAssert(0); 704 break; 705 } 706 mRootPtr = &kernel; 707 } 708 709 710 static void One(const RsForEachStubParamStruct *p, void *out, 711 const void *py, const float* coeff, const float *add, 712 uint32_t vsin, uint32_t vsout, bool fin, bool fout) { 713 714 float4 f = 0.f; 715 if (fin) { 716 switch(vsin) { 717 case 3: 718 f = ((const float4 *)py)[0]; 719 break; 720 case 2: 721 f = ((const float4 *)py)[0]; 722 f.w = 0.f; 723 break; 724 case 1: 725 f.xy = ((const float2 *)py)[0]; 726 break; 727 case 0: 728 f.x = ((const float *)py)[0]; 729 break; 730 } 731 } else { 732 switch(vsin) { 733 case 3: 734 f = convert_float4(((const uchar4 *)py)[0]); 735 break; 736 case 2: 737 f = convert_float4(((const uchar4 *)py)[0]); 738 f.w = 0.f; 739 break; 740 case 1: 741 f.xy = convert_float2(((const uchar2 *)py)[0]); 742 break; 743 case 0: 744 f.x = (float)(((const uchar *)py)[0]); 745 break; 746 } 747 } 748 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w); 749 750 float4 sum; 751 sum.x = f.x * coeff[0] + 752 f.y * coeff[4] + 753 f.z * coeff[8] + 754 f.w * coeff[12]; 755 sum.y = f.x * coeff[1] + 756 f.y * coeff[5] + 757 f.z * coeff[9] + 758 f.w * coeff[13]; 759 sum.z = f.x * coeff[2] + 760 f.y * coeff[6] + 761 f.z * coeff[10] + 762 f.w * coeff[14]; 763 sum.w = f.x * coeff[3] + 764 f.y * coeff[7] + 765 f.z * coeff[11] + 766 f.w * coeff[15]; 767 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w); 768 769 sum.x += add[0]; 770 sum.y += add[4]; 771 sum.z += add[8]; 772 sum.w += add[12]; 773 774 775 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w); 776 if (fout) { 777 switch(vsout) { 778 case 3: 779 case 2: 780 ((float4 *)out)[0] = sum; 781 break; 782 case 1: 783 ((float2 *)out)[0] = sum.xy; 784 break; 785 case 0: 786 ((float *)out)[0] = sum.x; 787 break; 788 } 789 } else { 790 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x); 791 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y); 792 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z); 793 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w); 794 795 switch(vsout) { 796 case 3: 797 case 2: 798 ((uchar4 *)out)[0] = convert_uchar4(sum); 799 break; 800 case 1: 801 ((uchar2 *)out)[0] = convert_uchar2(sum.xy); 802 break; 803 case 0: 804 ((uchar *)out)[0] = sum.x; 805 break; 806 } 807 } 808 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]); 809 } 810 811 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p, 812 uint32_t xstart, uint32_t xend, 813 uint32_t instep, uint32_t outstep) { 814 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr; 815 uchar *out = (uchar *)p->out; 816 uchar *in = (uchar *)p->in; 817 uint32_t x1 = xstart; 818 uint32_t x2 = xend; 819 820 uint32_t vsin = cp->mLastKey.u.inVecSize; 821 uint32_t vsout = cp->mLastKey.u.outVecSize; 822 bool floatIn = !!cp->mLastKey.u.inType; 823 bool floatOut = !!cp->mLastKey.u.outType; 824 825 //if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout); 826 827 if(x2 > x1) { 828 int32_t len = (x2 - x1) >> 2; 829 if((cp->mOptKernel != NULL) && (len > 0)) { 830 cp->mOptKernel(out, in, cp->ip, len); 831 x1 += len << 2; 832 out += outstep * (len << 2); 833 in += instep * (len << 2); 834 } 835 836 while(x1 != x2) { 837 One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut); 838 out += outstep; 839 in += instep; 840 x1++; 841 } 842 } 843 } 844 845 void RsdCpuScriptIntrinsicColorMatrix::preLaunch( 846 uint32_t slot, const Allocation * ain, Allocation * aout, 847 const void * usr, uint32_t usrLen, const RsScriptCall *sc) { 848 849 const Element *ein = ain->mHal.state.type->getElement(); 850 const Element *eout = aout->mHal.state.type->getElement(); 851 852 if (ein->getType() == eout->getType()) { 853 if (eout->getType() == RS_TYPE_UNSIGNED_8) { 854 updateCoeffCache(1.f, 255.f); 855 } else { 856 updateCoeffCache(1.f, 1.f); 857 } 858 } else { 859 if (eout->getType() == RS_TYPE_UNSIGNED_8) { 860 updateCoeffCache(255.f, 255.f); 861 } else { 862 updateCoeffCache(1.f / 255.f, 1.f); 863 } 864 } 865 866 Key_t key = computeKey(ain->mHal.state.type->getElement(), 867 aout->mHal.state.type->getElement()); 868 if ((mOptKernel == NULL) || (mLastKey.key != key.key)) { 869 if (mBuf) munmap(mBuf, mBufSize); 870 mBuf = NULL; 871 mOptKernel = NULL; 872 if (build(key)) { 873 mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf; 874 mLastKey = key; 875 } 876 } 877 } 878 879 void RsdCpuScriptIntrinsicColorMatrix::postLaunch( 880 uint32_t slot, const Allocation * ain, Allocation * aout, 881 const void * usr, uint32_t usrLen, const RsScriptCall *sc) { 882 883 } 884 885 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix( 886 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 887 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) { 888 889 mLastKey.key = 0; 890 mBuf = NULL; 891 mBufSize = 0; 892 mOptKernel = NULL; 893 const static float defaultMatrix[] = { 894 1.f, 0.f, 0.f, 0.f, 895 0.f, 1.f, 0.f, 0.f, 896 0.f, 0.f, 1.f, 0.f, 897 0.f, 0.f, 0.f, 1.f 898 }; 899 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f}; 900 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix)); 901 setGlobalVar(1, defaultAdd, sizeof(defaultAdd)); 902 } 903 904 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() { 905 if (mBuf) munmap(mBuf, mBufSize); 906 mBuf = NULL; 907 mOptKernel = NULL; 908 } 909 910 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) { 911 s->mHal.info.exportedVariableCount = 2; 912 } 913 914 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, 915 const Script *s, const Element *e) { 916 917 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e); 918 } 919 920 921 922