/* libs/pixelflinger/codeflinger/texturing.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>

#include <cutils/log.h>

#include "codeflinger/GGLAssembler.h"

#ifdef __ARM_ARCH__
#include <machine/cpu-features.h>
#endif

namespace android {

// ---------------------------------------------------------------------------

// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// ((x<<16 + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ( (x*dx) + dx>>1 ) + x0
// (x*dx) + (dx>>1 + x0)
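// In other words, assuming intToFixedCenter(x) == (x<<16) + 0x8000 (the pixel center
// in 16.16 fixed point), the (dx>>1 + x0) term is independent of x, so it can live in
// the per-scanline iterator value loaded from the context (presumably precomputed by
// the C setup code); the generated code then only needs one MLA per component to add
// x*dx, which is what init_iterated_color() below emits.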

void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;

    if (mSmooth) {
        // NOTE: we could also take this path in the mDithering + !mSmooth case,
        // but it would use up to 4 more registers for the color components
        // for only a little added quality.
        // Currently, this causes the system to run out of registers in
        // some cases (see issue #719496)

        comment("compute initial iterated color (smooth and/or dither case)");

        parts.iterated_packed = 0;
        parts.packed = 0;

        // 0x1: color component
        // 0x2: iterators
        const int optReload = mOptLevel >> 1;
        if (optReload >= 3)         parts.reload = 0; // reload nothing
        else if (optReload == 2)    parts.reload = 2; // reload iterators
        else if (optReload == 1)    parts.reload = 1; // reload colors
        else if (optReload <= 0)    parts.reload = 3; // reload both

        if (!mSmooth) {
            // we're not smoothing (just dithering), so we never have to
            // reload the iterators
            parts.reload &= ~2;
        }

        Scratch scratches(registerFile());
        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
        for (int i=0 ; i<4 ; i++) {
            if (!mInfo[i].iterated)
                continue;

            // this component exists in the destination and is not replaced
            // by a texture unit.
            const int c = (parts.reload & 1) ? t0 : obtainReg();
            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
            parts.argb[i].reg = c;

            if (mInfo[i].smooth) {
                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
                const int dvdx = parts.argb_dx[i].reg;
                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
                MLA(AL, 0, c, x.reg, dvdx, c);

                // adjust the color iterator to make sure it won't overflow
                if (!mAA) {
                    // this is not needed when we're using anti-aliasing
                    // because we will (have to) clamp the components
                    // anyway.
                    int end = scratches.obtain();
                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
                    MLA(AL, 1, end, dvdx, end, c);
                    SUB(MI, 0, c, c, end);
                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
                    scratches.recycle(end);
                }
            }

            if (parts.reload & 1) {
                CONTEXT_STORE(c, generated_vars.argb[i].c);
            }
        }
    } else {
        // We're not smoothing, so we can just use a packed version of the
        // color and extract the components as needed (or not at all if we
        // don't blend)

        // figure out if we need the iterated color
        int load = 0;
        for (int i=0 ; i<4 ; i++) {
            component_info_t& info = mInfo[i];
            if ((info.inDest || info.needed) && !info.replaced)
                load |= 1;
        }

        parts.iterated_packed = 1;
        parts.packed = (!mTextureMachine.mask && !mBlending
                && !mFog && !mDithering);
        parts.reload = 0;
        if (load || parts.packed) {
            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
                comment("load initial iterated color (8888 packed)");
                parts.iterated.setTo(obtainReg(),
                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
                CONTEXT_LOAD(parts.iterated.reg, packed8888);
            } else {
                comment("load initial iterated color (dest format packed)");

                parts.iterated.setTo(obtainReg(), &mCbFormat);

                // pre-mask the iterated color
                const int bits = parts.iterated.size();
                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
                uint32_t mask = 0;
                if (mMasking) {
                    for (int i=0 ; i<4 ; i++) {
                        const int component_mask = 1<<i;
                        const int h = parts.iterated.format.c[i].h;
                        const int l = parts.iterated.format.c[i].l;
                        if (h && (!(mMasking & component_mask))) {
                            mask |= ((1<<(h-l))-1) << l;
                        }
                    }
                }

                if (mMasking && ((mask & size)==0)) {
                    // none of the components are present in the mask
                } else {
                    CONTEXT_LOAD(parts.iterated.reg, packed);
                    if (mCbFormat.size == 1) {
                        AND(AL, 0, parts.iterated.reg,
                                parts.iterated.reg, imm(0xFF));
                    } else if (mCbFormat.size == 2) {
                        MOV(AL, 0, parts.iterated.reg,
                                reg_imm(parts.iterated.reg, LSR, 16));
                    }
                }

                // pre-mask the iterated color
                if (mMasking) {
                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
                            mask, bits);
                }
            }
        }
    }
}

void GGLAssembler::build_iterated_color(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);

    if (!mInfo[component].iterated)
        return;

    if (parts.iterated_packed) {
        // iterated colors are packed, extract the one we need
        extract(fragment, parts.iterated, component);
    } else {
        fragment.h = GGL_COLOR_BITS;
        fragment.l = GGL_COLOR_BITS - 8;
        fragment.flags |= CLEAR_LO;
        // iterated colors are held in their own register,
        // (smooth and/or dithering case)
        if (parts.reload==3) {
            // this implies mSmooth
            Scratch scratches(registerFile());
            int dx = scratches.obtain();
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
            ADD(AL, 0, dx, fragment.reg, dx);
            CONTEXT_STORE(dx, generated_vars.argb[component].c);
        } else if (parts.reload & 1) {
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
        } else {
            // we don't reload, so simply rename the register and mark it as
            // non-CORRUPTIBLE so that the texture env or blending code
            // won't modify this (renamed) register
            regs.recycle(fragment.reg);
            fragment.reg = parts.argb[component].reg;
            fragment.flags &= ~CORRUPTIBLE;
        }
        if (mInfo[component].smooth && mAA) {
            // when using smooth shading AND anti-aliasing, we need to clamp
            // the iterators because there is always an extra pixel on the
            // edges, which most of the time will cause an overflow
            // (since technically it's outside of the domain).
            BIC(AL, 0, fragment.reg, fragment.reg,
                    reg_imm(fragment.reg, ASR, 31));
            component_sat(fragment);
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
{
    // gather some information about the components we need to process...
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
    switch(opcode) {
    case GGL_COPY:
        mLogicOp = 0;
        break;
    case GGL_CLEAR:
    case GGL_SET:
        mLogicOp = LOGIC_OP;
        break;
    case GGL_AND:
    case GGL_AND_REVERSE:
    case GGL_AND_INVERTED:
    case GGL_XOR:
    case GGL_OR:
    case GGL_NOR:
    case GGL_EQUIV:
    case GGL_OR_REVERSE:
    case GGL_OR_INVERTED:
    case GGL_NAND:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
        break;
    case GGL_NOOP:
    case GGL_INVERT:
        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
        break;
    case GGL_COPY_INVERTED:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
        break;
    };
}
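// A quick reading of the switch above: GGL_COPY needs no logic-op pass at all;
// GGL_CLEAR and GGL_SET produce constant results, so neither operand is read;
// ops such as GGL_XOR (dst = src ^ dst) need both the source and the destination;
// GGL_NOOP and GGL_INVERT depend only on the destination; and GGL_COPY_INVERTED
// depends only on the source.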

void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
{
    uint8_t replaced=0;
    mTextureMachine.mask = 0;
    mTextureMachine.activeUnits = 0;
    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (replaced == 0xF) {
            // all components are replaced, skip this TMU.
            tmu.format_idx = 0;
            tmu.mask = 0;
            tmu.replaced = replaced;
            continue;
        }
        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
        tmu.format = c->formats[tmu.format_idx];
        tmu.bits = tmu.format.size*8;
        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
                && tmu.format.size!=3; // XXX: only 8, 16 and 32-bit modes for now

        // 5551 linear filtering is not supported
        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
            tmu.linear = 0;

        tmu.mask = 0;
        tmu.replaced = replaced;

        if (tmu.format_idx) {
            mTextureMachine.activeUnits++;
            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
            if (tmu.env == GGL_REPLACE) {
                replaced |= tmu.mask;
            } else if (tmu.env == GGL_DECAL) {
                if (!tmu.format.c[GGLFormat::ALPHA].h) {
                    // if we don't have alpha, decal does nothing
                    tmu.mask = 0;
                } else {
                    // decal always ignores At
                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
                }
            }
        }
        mTextureMachine.mask |= tmu.mask;
        //printf("%d: mask=%08lx, replaced=%08lx\n",
        //    i, int(tmu.mask), int(tmu.replaced));
    }
    mTextureMachine.replaced = replaced;
    mTextureMachine.directTexture = 0;
    //printf("replaced=%08lx\n", mTextureMachine.replaced);
}
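// Note: decodeTMUNeeds() above walks the units from last to first so that
// 'replaced' accumulates the components written by later GGL_REPLACE stages;
// earlier units can then drop (or skip entirely) work on components that would
// be overwritten anyway.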

void GGLAssembler::init_textures(
        tex_coord_t* coords,
        const reg_t& x, const reg_t& y)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;
    int Rx = x.reg;
    int Ry = y.reg;

    if (mTextureMachine.mask) {
        comment("compute texture coordinates");
    }

    // init texture coordinates for each tmu
    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        {
            // 1:1 texture
            pointer_t& txPtr = coords[i].ptr;
            txPtr.setTo(obtainReg(), tmu.bits);
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
            // merge base & offset
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);                  // x + y*stride
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
            base_offset(txPtr, txPtr, Rx);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = coords[i].s;
            reg_t& t = coords[i].t;
            // s = (x * dsdx)>>16 + ydsdy
            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
            // t = (x * dtdx)>>16 + ydtdy
            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
            s.setTo(obtainReg());
            t.setTo(obtainReg());
            const int need_w = GGL_READ_NEEDS(W, needs.n);
            if (need_w) {
                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
            } else {
                int ydsdy = scratches.obtain();
                int ydtdy = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
            }

            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                recycleReg(s.reg);
                recycleReg(t.reg);
            }
        }

        // direct texture?
        if (!multiTexture && !mBlending && !mDithering && !mFog &&
            cb_format_idx == tmu.format_idx && !tmu.linear &&
            mTextureMachine.replaced == tmu.mask)
        {
            mTextureMachine.directTexture = i + 1;
        }
    }
}
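// Note: GGL_NEEDS_WRAP_11 means the texture is mapped 1:1 onto screen pixels.
// In that case init_textures() above resolves a texel pointer once (data +
// x + y*stride) and the per-pixel code simply advances that pointer by one
// texel (see build_iterate_texture_coordinates), instead of iterating and
// wrapping s/t coordinates.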

void GGLAssembler::build_textures(fragment_parts_t& parts,
                                  Scratch& regs)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;

    // We don't have a way to spill registers automatically, so spill the
    // depth and AA registers now, when we know we may have to.
    // build the spill list...
    uint32_t spill_list = 0;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if (tmu.linear) {
            // we may run out of registers if we have linear filtering
            // at 1 or 4 bytes / pixel on any texture unit.
            if (tmu.format.size == 1) {
                // if depth and AA are enabled, we'll be one register short
                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
            if (tmu.format.size == 4) {
                // if depth or AA is enabled, we'll be one or two registers short
                if (parts.z.reg > 0)
                    spill_list |= 1<<parts.z.reg;
                if (parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
        }
    }

    Spill spill(registerFile(), *this, spill_list);

    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        pointer_t& txPtr = parts.coords[i].ptr;
        pixel_t& texel = parts.texel[i];

        // repeat...
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            comment("fetch texel");
            texel.setTo(regs.obtain(), &tmu.format);
            load(txPtr, texel, WRITE_BACK);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = parts.coords[i].s;
            reg_t& t = parts.coords[i].t;
            if ((mOptLevel&1)==0) {
                comment("reload s/t (multitexture or linear filtering)");
                s.reg = scratches.obtain();
                t.reg = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
            }

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            comment("compute repeat/clamp");
            int u      = scratches.obtain();
            int v      = scratches.obtain();
            int width  = scratches.obtain();
            int height = scratches.obtain();
            int U = 0;
            int V = 0;

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
            CONTEXT_LOAD(height, generated_vars.texture[i].height);

            int FRAC_BITS = 0;
            if (tmu.linear) {
                // linear interpolation
                if (tmu.format.size == 1) {
                    // for 8-bit textures, we can afford
                    // 7 bits of fractional precision at no
                    // additional cost (we can't do 8 bits
                    // because filter8 uses signed 16-bit muls)
                    FRAC_BITS = 7;
                } else if (tmu.format.size == 2) {
                    // filter16() is internally limited to 4 bits, so:
                    // FRAC_BITS=2 generates fewer instructions,
                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
                    // FRAC_BITS=6+ looks good
                    FRAC_BITS = 6;
                } else if (tmu.format.size == 4) {
                    // filter32() is internally limited to 8 bits, so:
                    // FRAC_BITS=4 looks good
                    // FRAC_BITS=5+ looks better, but generates 3 extra
                    // instructions per pixel
                    FRAC_BITS = 6;
                } else {
                    // for all other cases we use 4 bits.
                    FRAC_BITS = 4;
                }
            }
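            // Note: FRAC_BITS is the number of fractional (sub-texel) bits that
            // wrapping() keeps in u/v below; the bilinear filters then work with
            // 2*FRAC_BITS bits of weight precision (the U*V products).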
            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);

            if (tmu.linear) {
                comment("compute linear filtering offsets");
                // pixel size scale
                const int shift = 31 - gglClz(tmu.format.size);
                U = scratches.obtain();
                V = scratches.obtain();

                if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                    return;

                // sample the texel center
                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));

                // get the fractional part of U,V
                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));

                // compute width-1 and height-1
                SUB(AL, 0, width,  width,  imm(1));
                SUB(AL, 0, height, height, imm(1));

                // get the integer part of U,V and clamp/wrap
                // and compute offset to the next texel
                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
                    // u has already been REPEATed
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, width);
                    CMP(AL, u, width);
                    MOV(LT, 0, width, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
                    RSB(GE, 0, width, width, imm(0));
                } else {
                    // u has not been CLAMPed yet
                    // algorithm:
                    //  if ((u>>4) >= width)
                    //      u = width<<4
                    //      width = 0
                    //  else
                    //      width = 1<<shift
                    //  u = u>>4; // get integer part
                    //  if (u<0)
                    //      u = 0
                    //      width = 0
                    //  generated_vars.rt = width

                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
                    MOV(LE, 0, width, imm(0));
                    MOV(GT, 0, width, imm(1 << shift));
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, imm(0));
                    MOV(MI, 0, width, imm(0));
                }
                CONTEXT_STORE(width, generated_vars.rt);

                const int stride = width;
                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
                    // v has already been REPEATed
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, height);
                    CMP(AL, v, height);
                    MOV(LT, 0, height, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
                    RSB(GE, 0, height, height, imm(0));
                    MUL(AL, 0, height, stride, height);
                } else {
                    // v has not been CLAMPed yet
                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
                    MOV(LE, 0, height, imm(0));
                    if (shift) {
                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
                    } else {
                        MOV(GT, 0, height, stride);
                    }
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, imm(0));
                    MOV(MI, 0, height, imm(0));
                }
                CONTEXT_STORE(height, generated_vars.lb);
            }

            scratches.recycle(width);
            scratches.recycle(height);

            // iterate texture coordinates...
            comment("iterate s,t");
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s.reg, s.reg, dsdx);
            ADD(AL, 0, t.reg, t.reg, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                scratches.recycle(s.reg);
                scratches.recycle(t.reg);
            }
            scratches.recycle(dsdx);
            scratches.recycle(dtdx);

            // merge base & offset...
            comment("merge base & offset");
            texel.setTo(regs.obtain(), &tmu.format);
            txPtr.setTo(texel.reg, tmu.bits);
            int stride = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
            SMLABB(AL, u, v, stride, u);    // u + v*stride
            base_offset(txPtr, txPtr, u);

            // load texel
            if (!tmu.linear) {
                comment("fetch texel");
                load(txPtr, texel, 0);
            } else {
                // recycle registers we don't need anymore
                scratches.recycle(u);
                scratches.recycle(v);
                scratches.recycle(stride);

                comment("fetch texel, bilinear");
                switch (tmu.format.size) {
                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                }
            }
        }
    }
}

void GGLAssembler::build_iterate_texture_coordinates(
    const fragment_parts_t& parts)
{
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            const pointer_t& txPtr = parts.coords[i].ptr;
            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
        } else {
            Scratch scratches(registerFile());
            int s = parts.coords[i].s.reg;
            int t = parts.coords[i].t.reg;
            if ((mOptLevel&1)==0) {
                s = scratches.obtain();
                t = scratches.obtain();
                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
            }
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();
            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s, s, dsdx);
            ADD(AL, 0, t, t, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
            }
        }
    }
}
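// The bilinear filters below (filter8/16/32) all follow the same scheme: the four
// neighbouring texels are fetched through the current pointer combined with the
// right/bottom offsets saved in generated_vars.rt and generated_vars.lb, and each
// tap is weighted by the sub-texel position: RB gets U*V, LB gets (1-U)*V, LT gets
// (1-U)*(1-V), and RT gets whatever weight is left (tracked in 'k'), so the four
// weights always sum to the fixed-point representation of 1.0.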

void GGLAssembler::filter8(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    if (tmu.format.components != GGL_ALPHA &&
        tmu.format.components != GGL_LUMINANCE)
    {
        // this is a packed format, and we don't support
        // linear filtering (it's probably RGB 332)
        // Should not happen with OpenGL|ES
        LDRB(AL, texel.reg, txPtr.reg);
        return;
    }

    // ------------------------
    // about ~22 cycles / pixel
    Scratch scratches(registerFile());

    int pixel = scratches.obtain();
    int d     = scratches.obtain();
    int u     = scratches.obtain();
    int k     = scratches.obtain();
    int rt    = scratches.obtain();
    int lb    = scratches.obtain();

    // RB -> U * V

    CONTEXT_LOAD(rt, generated_vars.rt);
    CONTEXT_LOAD(lb, generated_vars.lb);

    int offset = pixel;
    ADD(AL, 0, offset, lb, rt);
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    SMULBB(AL, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));

    // LB -> (1-U) * V
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);

    // RT -> U*(1-V)
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
    SUB(AL, 0, u, k, u);
    SMLABB(AL, texel.reg, pixel, u, d);

    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        texel.format.c[i].h = FRAC_BITS*2+8;
        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
    }
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_LO;
}
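// filter16() below relies on a common trick for weighting a packed 16-bit pixel
// with a single multiply: the pixel is replicated into the upper halfword
// (pixel | pixel<<shift) and masked so that each colour field sits in its own part
// of the 32-bit word with enough zero bits above it; multiplying by a weight no
// larger than 1<<prec and accumulating the four taps then cannot carry from one
// field into the next, and the channels are recovered through the adjusted
// texel.format ranges.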

void GGLAssembler::filter16(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // compute the mask
    // XXX: it would be nice if the mask below could be computed
    // automatically.
    uint32_t mask = 0;
    int shift = 0;
    int prec = 0;
    switch (tmu.format_idx) {
    case GGL_PIXEL_FORMAT_RGB_565:
        // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
        // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
        mask = 0x07E0F81F;
        shift = 16;
        prec = 5;
        break;
    case GGL_PIXEL_FORMAT_RGBA_4444:
        // 0000,1111,0000,1111 | 0000,1111,0000,1111
        mask = 0x0F0F0F0F;
        shift = 12;
        prec = 4;
        break;
    case GGL_PIXEL_FORMAT_LA_88:
        // 0000,0000,1111,1111 | 0000,0000,1111,1111
        // AALL -> 00AA | 00LL
        mask = 0x00FF00FF;
        shift = 8;
        prec = 8;
        break;
    default:
        // unsupported format, do something sensible...
        ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
        LDRH(AL, texel.reg, txPtr.reg);
        return;
    }

    const int adjust = FRAC_BITS*2 - prec;
    const int round  = 0;

    // update the texel format
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_HI|CLEAR_LO;
    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
    }

    // ------------------------
    // about ~40 cycles / pixel
    Scratch scratches(registerFile());

    int pixel = scratches.obtain();
    int d     = scratches.obtain();
    int u     = scratches.obtain();
    int k     = scratches.obtain();

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<prec));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SUB(AL, 0, u, k, u);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    MLA(AL, 0, texel.reg, pixel, u, d);
}

void GGLAssembler::filter24(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // not supported yet (currently disabled)
    load(txPtr, texel, 0);
}
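// filter32() (both variants below) applies the same split-accumulate idea to 8888
// pixels: the four 8-bit channels are separated into two registers (even bytes in
// one, odd bytes in the other), using UXTB16 on ARMv6 or a 0x00FF00FF mask
// otherwise, so a weight no larger than 0x100 can scale two channels per multiply
// without cross-channel carries; the two halves are recombined at the end.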

#if __ARM_ARCH__ >= 6
// ARMv6 version, using UXTB16, and scheduled for Cortex-A8 pipeline
void GGLAssembler::filter32(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;
    const int prescale = 16 - adjust;

    Scratch scratches(registerFile());

    int pixel = scratches.obtain();
    int dh    = scratches.obtain();
    int u     = scratches.obtain();
    int k     = scratches.obtain();

    int temp  = scratches.obtain();
    int dl    = scratches.obtain();

    int offsetrt = scratches.obtain();
    int offsetlb = scratches.obtain();

    int pixellb = offsetlb;

    // RB -> U * V
    CONTEXT_LOAD(offsetrt, generated_vars.rt);
    CONTEXT_LOAD(offsetlb, generated_vars.lb);
    if(!round) {
        MOV(AL, 0, U, reg_imm(U, LSL, prescale));
    }
    ADD(AL, 0, u, offsetrt, offsetlb);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(u));
    if (round) {
        SMULBB(AL, u, U, V);
        RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    } else {
        SMULWB(AL, u, U, V);
        RSB(AL, 0, U, U, imm(1<<(FRAC_BITS+prescale)));
    }
    UXTB16(AL, temp, pixel, 0);
    if (round) {
        ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    LDR(AL, pixellb, txPtr.reg, reg_scale_pre(offsetlb));
    MUL(AL, 0, dh, temp, u);
    UXTB16(AL, temp, pixel, 8);
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    if (round) {
        SMULBB(AL, u, U, V);
    } else {
        SMULWB(AL, u, U, V);
    }
    UXTB16(AL, temp, pixellb, 0);
    if (round) {
        ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    UXTB16(AL, temp, pixellb, 8);
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    if (round) {
        SMULBB(AL, u, U, V);
    } else {
        SMULWB(AL, u, U, V);
    }
    UXTB16(AL, temp, pixel, 0);
    if (round) {
        ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    UXTB16(AL, temp, pixel, 8);
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offsetrt));
    SUB(AL, 0, u, k, u);
    UXTB16(AL, temp, pixel, 0);
    MLA(AL, 0, dh, temp, u, dh);
    UXTB16(AL, temp, pixel, 8);
    MLA(AL, 0, dl, temp, u, dl);

    UXTB16(AL, dh, dh, 8);
    UXTB16(AL, dl, dl, 8);
    ORR(AL, 0, texel.reg, dh, reg_imm(dl, LSL, 8));
}
#else
void GGLAssembler::filter32(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;

    // ------------------------
    // about ~38 cycles / pixel
    Scratch scratches(registerFile());

    int pixel = scratches.obtain();
    int dh    = scratches.obtain();
    int u     = scratches.obtain();
    int k     = scratches.obtain();

    int temp  = scratches.obtain();
    int dl    = scratches.obtain();
    int mask  = scratches.obtain();

    MOV(AL, 0, mask, imm(0xFF));
    ORR(AL, 0, mask, mask, imm(0xFF0000));

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, dh, temp, u);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SUB(AL, 0, u, k, u);
    AND(AL, 0, temp, mask, pixel);
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
    ORR(AL, 0, texel.reg, dh, dl);
}
#endif

void GGLAssembler::build_texture_environment(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    const uint32_t component_mask = 1<<component;
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];

        if (tmu.mask & component_mask) {
            // replace or modulate with this texture
            if ((tmu.replaced & component_mask) == 0) {
                // not replaced by a later tmu...

                Scratch scratches(registerFile());
                pixel_t texel(parts.texel[i]);

                if (multiTexture &&
                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
                    tmu.twrap == GGL_NEEDS_WRAP_11)
                {
                    texel.reg = scratches.obtain();
                    texel.flags |= CORRUPTIBLE;
                    comment("fetch texel (multitexture 1:1)");
                    load(parts.coords[i].ptr, texel, WRITE_BACK);
                }

                component_t incoming(fragment);
                modify(fragment, regs);

                switch (tmu.env) {
                case GGL_REPLACE:
                    extract(fragment, texel, component);
                    break;
                case GGL_MODULATE:
                    modulate(fragment, incoming, texel, component);
                    break;
                case GGL_DECAL:
                    decal(fragment, incoming, texel, component);
                    break;
                case GGL_BLEND:
                    blend(fragment, incoming, texel, component, i);
                    break;
                case GGL_ADD:
                    add(fragment, incoming, texel, component);
                    break;
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------
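// A note on wrapping() below: for GGL_NEEDS_WRAP_REPEAT the coordinate is treated
// as a pure 32-bit fraction of the texture size (integer wraps discarded), so the
// exact texel index would be (coord * size) >> 32; the SMULWB form approximates
// this while dropping the low 16 bits of the coordinate. For
// GGL_NEEDS_WRAP_CLAMP_TO_EDGE the integer part of the 16.16 coordinate is used:
// the non-filtered path clamps it to [0, size-1] here, while the filtered path
// keeps tx_linear fractional bits and leaves the clamping to the caller.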

void GGLAssembler::wrapping(
            int d,
            int coord, int size,
            int tx_wrap, int tx_linear)
{
    // notes:
    // if tx_linear is set, we need 4 extra bits of precision on the result
    // SMULL/UMULL is 3 cycles
    Scratch scratches(registerFile());
    int c = coord;
    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
        // UMULL takes 4 cycles (interlocked), and we can get away with
        // 2 cycles using SMULWB, but we're losing 16 bits of precision
        // out of 32 (this is not a problem because the iterator keeps
        // its full precision)
        // UMULL(AL, 0, size, d, c, size);
        // note: we can't use SMULTB because it's signed.
        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
        SMULWB(AL, d, d, size);
    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
        if (tx_linear) {
            // 1 cycle
            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
        } else {
            // 4 cycles (common case)
            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
            CMP(AL, d, size);
            SUB(GE, 0, d, size, imm(1));
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::modulate(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);

    const int Nt = texel.size();
    // Nt should always be less than 10 bits because it comes
    // from the TMU.

    int Ni = incoming.size();
    // Ni could be big because it comes from previous MODULATEs

    if (Nt == 1) {
        // texel acts as a bit-mask
        // dest = incoming & ((texel << incoming.h)-texel)
        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
        dest.l = incoming.l;
        dest.h = incoming.h;
        dest.flags |= (incoming.flags & CLEAR_LO);
    } else if (Ni == 1) {
        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
        dest.l = 0;
        dest.h = Nt;
    } else {
        int inReg = incoming.reg;
        int shift = incoming.l;
        if ((Nt + Ni) > 32) {
            // we will overflow, reduce the precision of Ni to 8 bits
            // (Note: Nt cannot be more than 10 bits, which happens with
            // 565 textures and GGL_LINEAR)
            shift += Ni-8;
            Ni = 8;
        }

        // modulate by the component with the lowest precision
        if (Nt >= Ni) {
            if (shift) {
                // XXX: we should be able to avoid this shift
                // when shift==16 && Nt<16 && Ni<16, in which
                // case we could use SMULBT below.
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:            (Cf*Ct)/((1<<Ni)-1)
            // approximated with:    Cf*(Ct + Ct>>(Ni-1))>>Ni
            // this operation doesn't change texel's size
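            // (e.g. with Ni == 8 and Ct at full scale 255: Ct + (Ct>>7) == 256, so
            // Cf*256 >> 8 == Cf, matching the exact Cf*255/255; a full-scale texel
            // really does act as a multiply by 1.0)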
            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
            dest.l = Ni;
            dest.h = Nt + Ni;
        } else {
            if (shift && (shift != 16)) {
                // if shift==16, we can use 16-bit mul instructions later
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:            (Cf*Ct)/((1<<Nt)-1)
            // approximated with:    Ct*(Cf + Cf>>(Nt-1))>>Nt
            // this operation doesn't change incoming's size
            Scratch scratches(registerFile());
            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
            if (t == inReg)
                t = scratches.obtain();
            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
            if (Nt<16 && Ni<16) {
                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
                else            SMULBB(AL, dest.reg, t, inReg);
            } else MUL(AL, 0, dest.reg, t, inReg);
            dest.l = Nt;
            dest.h = Nt + Ni;
        }

        // low bits are not valid
        dest.flags |= CLEAR_LO;

        // no need to keep more than 8 bits/component
        if (dest.size() > 8)
            dest.l = dest.h-8;
    }
}

void GGLAssembler::decal(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
    // Av = Af
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);
    extract(factor, incomingTexel, GGLFormat::ALPHA);

    // no need to keep more than 8 bits for decal
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
}

void GGLAssembler::blend(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component, int tmu)
{
    // RGBA:
    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
    // Av = At*Af

    if (component == GGLFormat::ALPHA) {
        modulate(dest, incoming, incomingTexel, component);
        return;
    }

    Scratch locals(registerFile());
    integer_t color(locals.obtain(), 8, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    LDRB(AL, color.reg, mBuilderContext.Rctx,
            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
    extract(factor, incomingTexel, component);

    // no need to keep more than 8 bits for blend
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, color);
}

void GGLAssembler::add(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf + Ct;
    Scratch locals(registerFile());

    component_t incomingTemp(incoming);

    // use "dest" as a temporary for extracting the texel, unless "dest"
    // overlaps "incoming".
    integer_t texel(dest.reg, 32, CORRUPTIBLE);
    if (dest.reg == incomingTemp.reg)
        texel.reg = locals.obtain();
    extract(texel, incomingTexel, component);

    if (texel.s < incomingTemp.size()) {
        expand(texel, texel, incomingTemp.size());
    } else if (texel.s > incomingTemp.size()) {
        if (incomingTemp.flags & CORRUPTIBLE) {
            expand(incomingTemp, incomingTemp, texel.s);
        } else {
            incomingTemp.reg = locals.obtain();
            expand(incomingTemp, incoming, texel.s);
        }
    }

    if (incomingTemp.l) {
        ADD(AL, 0, dest.reg, texel.reg,
                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
    } else {
        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
    }
    dest.l = 0;
    dest.h = texel.size();
    component_sat(dest);
}

// ----------------------------------------------------------------------------

}; // namespace android