/* libs/pixelflinger/codeflinger/texturing.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>

#include <cutils/log.h>

#include "codeflinger/GGLAssembler.h"

#ifdef __ARM_ARCH__
#include <machine/cpu-features.h>
#endif

namespace android {

// ---------------------------------------------------------------------------

// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// ((x<<16 + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ( (x*dx) + dx>>1 ) + x0
// (x*dx) + (dx>>1 + x0)

void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;

    if (mSmooth) {
        // NOTE: we could take this case in the mDithering + !mSmooth case,
        // but this would use up to 4 more registers for the color components
        // for only a little added quality.
        // Currently, this causes the system to run out of registers in
        // some cases (see issue #719496)

        comment("compute initial iterated color (smooth and/or dither case)");

        parts.iterated_packed = 0;
        parts.packed = 0;

        // 0x1: color component
        // 0x2: iterators
        const int optReload = mOptLevel >> 1;
        if (optReload >= 3)         parts.reload = 0; // reload nothing
        else if (optReload == 2)    parts.reload = 2; // reload iterators
        else if (optReload == 1)    parts.reload = 1; // reload colors
        else if (optReload <= 0)    parts.reload = 3; // reload both

        if (!mSmooth) {
            // we're not smoothing (just dithering), we never have to
            // reload the iterators
            parts.reload &= ~2;
        }

        Scratch scratches(registerFile());
        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
        for (int i=0 ; i<4 ; i++) {
            if (!mInfo[i].iterated)
                continue;

            // this component exists in the destination and is not replaced
            // by a texture unit.
            const int c = (parts.reload & 1) ? t0 : obtainReg();
            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
            parts.argb[i].reg = c;

            if (mInfo[i].smooth) {
                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
                const int dvdx = parts.argb_dx[i].reg;
                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
                MLA(AL, 0, c, x.reg, dvdx, c);

                // adjust the color iterator to make sure it won't overflow
                if (!mAA) {
                    // this is not needed when we're using anti-aliasing
                    // because we will (have to) clamp the components
                    // anyway.
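                    // what the generated code below computes, roughly:
                    //   end = c + dvdx*(count>>16);
                    //   if (end < 0)  c -= end;
                    //   c = max(c, 0);
                    // i.e. pull the starting value back so the iterated
                    // color cannot go out of range over the span.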
                    int end = scratches.obtain();
                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
                    MLA(AL, 1, end, dvdx, end, c);
                    SUB(MI, 0, c, c, end);
                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
                    scratches.recycle(end);
                }
            }

            if (parts.reload & 1) {
                CONTEXT_STORE(c, generated_vars.argb[i].c);
            }
        }
    } else {
        // We're not smoothing, so we can
        // just use a packed version of the color and extract the
        // components as needed (or not at all if we don't blend)

        // figure out if we need the iterated color
        int load = 0;
        for (int i=0 ; i<4 ; i++) {
            component_info_t& info = mInfo[i];
            if ((info.inDest || info.needed) && !info.replaced)
                load |= 1;
        }

        parts.iterated_packed = 1;
        parts.packed = (!mTextureMachine.mask && !mBlending
                && !mFog && !mDithering);
        parts.reload = 0;
        if (load || parts.packed) {
            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
                comment("load initial iterated color (8888 packed)");
                parts.iterated.setTo(obtainReg(),
                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
                CONTEXT_LOAD(parts.iterated.reg, packed8888);
            } else {
                comment("load initial iterated color (dest format packed)");

                parts.iterated.setTo(obtainReg(), &mCbFormat);

                // pre-mask the iterated color
                const int bits = parts.iterated.size();
                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
                uint32_t mask = 0;
                if (mMasking) {
                    for (int i=0 ; i<4 ; i++) {
                        const int component_mask = 1<<i;
                        const int h = parts.iterated.format.c[i].h;
                        const int l = parts.iterated.format.c[i].l;
                        if (h && (!(mMasking & component_mask))) {
                            mask |= ((1<<(h-l))-1) << l;
                        }
                    }
                }

                if (mMasking && ((mask & size)==0)) {
                    // none of the components are present in the mask
                } else {
                    CONTEXT_LOAD(parts.iterated.reg, packed);
                    if (mCbFormat.size == 1) {
                        AND(AL, 0, parts.iterated.reg,
                                parts.iterated.reg, imm(0xFF));
                    } else if (mCbFormat.size == 2) {
                        MOV(AL, 0, parts.iterated.reg,
                                reg_imm(parts.iterated.reg, LSR, 16));
                    }
                }

                // pre-mask the iterated color
                if (mMasking) {
                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
                            mask, bits);
                }
            }
        }
    }
}

void GGLAssembler::build_iterated_color(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);

    if (!mInfo[component].iterated)
        return;

    if (parts.iterated_packed) {
        // iterated colors are packed, extract the one we need
        extract(fragment, parts.iterated, component);
    } else {
        fragment.h = GGL_COLOR_BITS;
        fragment.l = GGL_COLOR_BITS - 8;
        fragment.flags |= CLEAR_LO;
        // iterated colors are held in their own register,
        // (smooth and/or dithering case)
        if (parts.reload==3) {
            // this implies mSmooth
            Scratch scratches(registerFile());
            int dx = scratches.obtain();
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
            ADD(AL, 0, dx, fragment.reg, dx);
            CONTEXT_STORE(dx, generated_vars.argb[component].c);
        } else if (parts.reload & 1) {
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
        } else {
            // we don't reload, so simply rename the register and mark as
            // non CORRUPTIBLE so that the texture env or blending code
            // won't modify this (renamed) register
            regs.recycle(fragment.reg);
            fragment.reg = parts.argb[component].reg;
            fragment.flags &= ~CORRUPTIBLE;
        }
        if (mInfo[component].smooth && mAA) {
            // when using smooth shading AND anti-aliasing, we need to clamp
            // the iterators because there is always an extra pixel on the
            // edges, which most of the time will cause an overflow
            // (since technically it's outside of the domain).
            BIC(AL, 0, fragment.reg, fragment.reg,
                    reg_imm(fragment.reg, ASR, 31));
            component_sat(fragment);
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
{
    // gather some information about the components we need to process...
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
    switch(opcode) {
    case GGL_COPY:
        mLogicOp = 0;
        break;
    case GGL_CLEAR:
    case GGL_SET:
        mLogicOp = LOGIC_OP;
        break;
    case GGL_AND:
    case GGL_AND_REVERSE:
    case GGL_AND_INVERTED:
    case GGL_XOR:
    case GGL_OR:
    case GGL_NOR:
    case GGL_EQUIV:
    case GGL_OR_REVERSE:
    case GGL_OR_INVERTED:
    case GGL_NAND:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
        break;
    case GGL_NOOP:
    case GGL_INVERT:
        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
        break;
    case GGL_COPY_INVERTED:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
        break;
    };
}

void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
{
    uint8_t replaced=0;
    mTextureMachine.mask = 0;
    mTextureMachine.activeUnits = 0;
    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (replaced == 0xF) {
            // all components are replaced, skip this TMU.
            tmu.format_idx = 0;
            tmu.mask = 0;
            tmu.replaced = replaced;
            continue;
        }
        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
        tmu.format = c->formats[tmu.format_idx];
        tmu.bits = tmu.format.size*8;
        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
                && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now

        // 5551 linear filtering is not supported
        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
            tmu.linear = 0;

        tmu.mask = 0;
        tmu.replaced = replaced;

        if (tmu.format_idx) {
            mTextureMachine.activeUnits++;
            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
            if (tmu.env == GGL_REPLACE) {
                replaced |= tmu.mask;
            } else if (tmu.env == GGL_DECAL) {
                if (!tmu.format.c[GGLFormat::ALPHA].h) {
                    // if we don't have alpha, decal does nothing
                    tmu.mask = 0;
                } else {
                    // decal always ignores At
                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
                }
            }
        }
        mTextureMachine.mask |= tmu.mask;
        //printf("%d: mask=%08lx, replaced=%08lx\n",
        //    i, int(tmu.mask), int(tmu.replaced));
    }
    mTextureMachine.replaced = replaced;
    mTextureMachine.directTexture = 0;
    //printf("replaced=%08lx\n", mTextureMachine.replaced);
}


void GGLAssembler::init_textures(
        tex_coord_t* coords,
        const reg_t& x, const reg_t& y)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;
    int Rx = x.reg;
    int Ry = y.reg;

    if (mTextureMachine.mask) {
        comment("compute texture coordinates");
    }

    // init texture coordinates for each tmu
    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        {
            // 1:1 texture
            pointer_t& txPtr = coords[i].ptr;
            txPtr.setTo(obtainReg(), tmu.bits);
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
            // merge base & offset
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);                  // x+y*stride
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
            base_offset(txPtr, txPtr, Rx);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = coords[i].s;
            reg_t& t = coords[i].t;
            // s = (x * dsdx)>>16 + ydsdy
            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
            // t = (x * dtdx)>>16 + ydtdy
            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
            s.setTo(obtainReg());
            t.setTo(obtainReg());
            const int need_w = GGL_READ_NEEDS(W, needs.n);
            if (need_w) {
                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
            } else {
                int ydsdy = scratches.obtain();
                int ydtdy = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
            }

            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                recycleReg(s.reg);
                recycleReg(t.reg);
            }
        }

        // direct texture?
        if (!multiTexture && !mBlending && !mDithering && !mFog &&
            cb_format_idx == tmu.format_idx && !tmu.linear &&
            mTextureMachine.replaced == tmu.mask)
        {
            mTextureMachine.directTexture = i + 1;
        }
    }
}

void GGLAssembler::build_textures(  fragment_parts_t& parts,
                                    Scratch& regs)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;

    // We don't have a way to spill registers automatically, so we
    // spill the depth and AA registers up front, when we know we may have to.
    // build the spill list...
    uint32_t spill_list = 0;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if (tmu.linear) {
            // we may run out of registers if we have linear filtering
            // at 1 or 4 bytes / pixel on any texture unit.
            if (tmu.format.size == 1) {
                // if depth and AA are enabled, we'll be short one register
                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
            if (tmu.format.size == 4) {
                // if depth or AA is enabled, we'll be short one or two registers
                if (parts.z.reg > 0)
                    spill_list |= 1<<parts.z.reg;
                if (parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
        }
    }

    Spill spill(registerFile(), *this, spill_list);

    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        pointer_t& txPtr = parts.coords[i].ptr;
        pixel_t& texel = parts.texel[i];

        // repeat...
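        // two paths below: 1:1 textures (WRAP_11 on both axes) fetch the
        // texel directly through the pointer computed in init_textures();
        // everything else goes through the generic repeat/clamp path,
        // optionally followed by bilinear filtering.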
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            comment("fetch texel");
            texel.setTo(regs.obtain(), &tmu.format);
            load(txPtr, texel, WRITE_BACK);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = parts.coords[i].s;
            reg_t& t = parts.coords[i].t;
            if ((mOptLevel&1)==0) {
                comment("reload s/t (multitexture or linear filtering)");
                s.reg = scratches.obtain();
                t.reg = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
            }

            comment("compute repeat/clamp");
            int u       = scratches.obtain();
            int v       = scratches.obtain();
            int width   = scratches.obtain();
            int height  = scratches.obtain();
            int U = 0;
            int V = 0;

            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
            CONTEXT_LOAD(height, generated_vars.texture[i].height);

            int FRAC_BITS = 0;
            if (tmu.linear) {
                // linear interpolation
                if (tmu.format.size == 1) {
                    // for 8-bit textures, we can afford
                    // 7 bits of fractional precision at no
                    // additional cost (we can't do 8 bits
                    // because filter8 uses signed 16-bit muls)
                    FRAC_BITS = 7;
                } else if (tmu.format.size == 2) {
                    // filter16() is internally limited to 4 bits, so:
                    // FRAC_BITS=2 generates fewer instructions,
                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
                    // FRAC_BITS=6+ looks good
                    FRAC_BITS = 6;
                } else if (tmu.format.size == 4) {
                    // filter32() is internally limited to 8 bits, so:
                    // FRAC_BITS=4 looks good
                    // FRAC_BITS=5+ looks better, but generates 3 extra ipp
                    FRAC_BITS = 6;
                } else {
                    // for all other cases we use 4 bits.
                    FRAC_BITS = 4;
                }
            }
            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);

            if (tmu.linear) {
                comment("compute linear filtering offsets");
                // pixel size scale
                const int shift = 31 - gglClz(tmu.format.size);
                U = scratches.obtain();
                V = scratches.obtain();

                // sample the texel center
                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));

                // get the fractional part of U,V
                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));

                // compute width-1 and height-1
                SUB(AL, 0, width,  width,  imm(1));
                SUB(AL, 0, height, height, imm(1));

                // get the integer part of U,V and clamp/wrap
                // and compute offset to the next texel
                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
                    // u has already been REPEATed
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, width);
                    CMP(AL, u, width);
                    MOV(LT, 0, width, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
                    RSB(GE, 0, width, width, imm(0));
                } else {
                    // u has not been CLAMPed yet
                    // algorithm:
                    //  if ((u>>4) >= width)
                    //      u = width<<4
                    //      width = 0
                    //  else
                    //      width = 1<<shift
                    //  u = u>>4; // get integer part
                    //  if (u<0)
                    //      u = 0
                    //      width = 0
                    //  generated_vars.rt = width

                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
                    MOV(LE, 0, width, imm(0));
                    MOV(GT, 0, width, imm(1 << shift));
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, imm(0));
                    MOV(MI, 0, width, imm(0));
                }
                CONTEXT_STORE(width, generated_vars.rt);

                const int stride = width;
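                // generated_vars.rt now holds the byte offset to the texel
                // on the right (0 or a wrap-back offset at the edge); the
                // same computation on v below produces generated_vars.lb,
                // the byte offset to the texel on the next line. the width
                // register is reused here to hold the texture stride.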
                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
                    // v has already been REPEATed
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, height);
                    CMP(AL, v, height);
                    MOV(LT, 0, height, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
                    RSB(GE, 0, height, height, imm(0));
                    MUL(AL, 0, height, stride, height);
                } else {
                    // v has not been CLAMPed yet
                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
                    MOV(LE, 0, height, imm(0));
                    if (shift) {
                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
                    } else {
                        MOV(GT, 0, height, stride);
                    }
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, imm(0));
                    MOV(MI, 0, height, imm(0));
                }
                CONTEXT_STORE(height, generated_vars.lb);
            }

            scratches.recycle(width);
            scratches.recycle(height);

            // iterate texture coordinates...
            comment("iterate s,t");
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();
            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s.reg, s.reg, dsdx);
            ADD(AL, 0, t.reg, t.reg, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                scratches.recycle(s.reg);
                scratches.recycle(t.reg);
            }
            scratches.recycle(dsdx);
            scratches.recycle(dtdx);

            // merge base & offset...
            comment("merge base & offset");
            texel.setTo(regs.obtain(), &tmu.format);
            txPtr.setTo(texel.reg, tmu.bits);
            int stride = scratches.obtain();
            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
            SMLABB(AL, u, v, stride, u);    // u+v*stride
            base_offset(txPtr, txPtr, u);

            // load texel
            if (!tmu.linear) {
                comment("fetch texel");
                load(txPtr, texel, 0);
            } else {
                // recycle registers we don't need anymore
                scratches.recycle(u);
                scratches.recycle(v);
                scratches.recycle(stride);

                comment("fetch texel, bilinear");
                switch (tmu.format.size) {
                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                }
            }
        }
    }
}

void GGLAssembler::build_iterate_texture_coordinates(
    const fragment_parts_t& parts)
{
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            const pointer_t& txPtr = parts.coords[i].ptr;
            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
        } else {
            Scratch scratches(registerFile());
            int s = parts.coords[i].s.reg;
            int t = parts.coords[i].t.reg;
            if ((mOptLevel&1)==0) {
                s = scratches.obtain();
                t = scratches.obtain();
                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
            }
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();
            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s, s, dsdx);
            ADD(AL, 0, t, t, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
            }
        }
    }
}

void GGLAssembler::filter8(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    if (tmu.format.components != GGL_ALPHA &&
        tmu.format.components != GGL_LUMINANCE)
    {
        // this is a packed format, and we don't support
        // linear filtering (it's probably RGB 332)
        // Should not happen with OpenGL|ES
        LDRB(AL, texel.reg, txPtr.reg);
        return;
    }

    // ------------------------
    // about ~22 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();
    int rt   = scratches.obtain();
    int lb   = scratches.obtain();

    // RB -> U * V

    CONTEXT_LOAD(rt, generated_vars.rt);
    CONTEXT_LOAD(lb, generated_vars.lb);

    int offset = pixel;
    ADD(AL, 0, offset, lb, rt);
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    SMULBB(AL, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));

    // LB -> (1-U) * V
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);

    // RT -> U*(1-V)
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
    SUB(AL, 0, u, k, u);
    SMLABB(AL, texel.reg, pixel, u, d);

    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        texel.format.c[i].h = FRAC_BITS*2+8;
        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
    }
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_LO;
}

void GGLAssembler::filter16(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // compute the mask
    // XXX: it would be nice if the mask below could be computed
    // automatically.
    uint32_t mask = 0;
    int shift = 0;
    int prec = 0;
    switch (tmu.format_idx) {
    case GGL_PIXEL_FORMAT_RGB_565:
        // source:  00000ggg.ggg00000 | rrrrr000.000bbbbb
        // result:  gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
        mask = 0x07E0F81F;
        shift = 16;
        prec = 5;
        break;
    case GGL_PIXEL_FORMAT_RGBA_4444:
        // 0000,1111,0000,1111 | 0000,1111,0000,1111
        mask = 0x0F0F0F0F;
        shift = 12;
        prec = 4;
        break;
    case GGL_PIXEL_FORMAT_LA_88:
        // 0000,0000,1111,1111 | 0000,0000,1111,1111
        // AALL -> 00AA | 00LL
        mask = 0x00FF00FF;
        shift = 8;
        prec = 8;
        break;
    default:
        // unsupported format, do something sensible...
        LOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx);
        LDRH(AL, texel.reg, txPtr.reg);
        return;
    }

    const int adjust = FRAC_BITS*2 - prec;
    const int round  = 0;

    // update the texel format
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_HI|CLEAR_LO;
    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
    }

    // ------------------------
    // about ~40 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<prec));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SUB(AL, 0, u, k, u);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    MLA(AL, 0, texel.reg, pixel, u, d);
}

void GGLAssembler::filter24(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // not supported yet (currently disabled)
    load(txPtr, texel, 0);
}

#if __ARM_ARCH__ >= 6
// ARMv6 version, using UXTB16, and scheduled for Cortex-A8 pipeline
void GGLAssembler::filter32(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;
    const int prescale = 16 - adjust;

    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int dh   = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    int temp = scratches.obtain();
    int dl   = scratches.obtain();

    int offsetrt = scratches.obtain();
    int offsetlb = scratches.obtain();

    int pixellb = offsetlb;

    // RB -> U * V
    CONTEXT_LOAD(offsetrt, generated_vars.rt);
    CONTEXT_LOAD(offsetlb, generated_vars.lb);
    if (!round) {
        MOV(AL, 0, U, reg_imm(U, LSL, prescale));
    }
    ADD(AL, 0, u, offsetrt, offsetlb);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(u));
    if (round) {
        SMULBB(AL, u, U, V);
        RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    } else {
        SMULWB(AL, u, U, V);
        RSB(AL, 0, U, U, imm(1<<(FRAC_BITS+prescale)));
    }
    UXTB16(AL, temp, pixel, 0);
    if (round) {
        ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    LDR(AL, pixellb, txPtr.reg, reg_scale_pre(offsetlb));
    MUL(AL, 0, dh, temp, u);
    UXTB16(AL, temp, pixel, 8);
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    if (round) {
        SMULBB(AL, u, U, V);
    } else {
        SMULWB(AL, u, U, V);
    }
    UXTB16(AL, temp, pixellb, 0);
    if (round) {
        ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    UXTB16(AL, temp, pixellb, 8);
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    if (round) {
        SMULBB(AL, u, U, V);
    } else {
        SMULWB(AL, u, U, V);
    }
    UXTB16(AL, temp, pixel, 0);
    if (round) {
        ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    UXTB16(AL, temp, pixel, 8);
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offsetrt));
    SUB(AL, 0, u, k, u);
    UXTB16(AL, temp, pixel, 0);
    MLA(AL, 0, dh, temp, u, dh);
    UXTB16(AL, temp, pixel, 8);
    MLA(AL, 0, dl, temp, u, dl);

    UXTB16(AL, dh, dh, 8);
    UXTB16(AL, dl, dl, 8);
    ORR(AL, 0, texel.reg, dh, reg_imm(dl, LSL, 8));
}
#else
void GGLAssembler::filter32(
        const fragment_parts_t& parts,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;

    // ------------------------
    // about ~38 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int dh   = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    int temp = scratches.obtain();
    int dl   = scratches.obtain();
    int mask = scratches.obtain();

    MOV(AL, 0, mask, imm(0xFF));
    ORR(AL, 0, mask, mask, imm(0xFF0000));

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, dh, temp, u);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SUB(AL, 0, u, k, u);
    AND(AL, 0, temp, mask, pixel);
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
    ORR(AL, 0, texel.reg, dh, dl);
}
#endif

void GGLAssembler::build_texture_environment(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    const uint32_t component_mask = 1<<component;
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];

        if (tmu.mask & component_mask) {
            // replace or modulate with this texture
            if ((tmu.replaced & component_mask) == 0) {
                // not replaced by a later tmu...

                Scratch scratches(registerFile());
                pixel_t texel(parts.texel[i]);
                if (multiTexture &&
                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
                    tmu.twrap == GGL_NEEDS_WRAP_11)
                {
                    texel.reg = scratches.obtain();
                    texel.flags |= CORRUPTIBLE;
                    comment("fetch texel (multitexture 1:1)");
                    load(parts.coords[i].ptr, texel, WRITE_BACK);
                }

                component_t incoming(fragment);
                modify(fragment, regs);

                switch (tmu.env) {
                case GGL_REPLACE:
                    extract(fragment, texel, component);
                    break;
                case GGL_MODULATE:
                    modulate(fragment, incoming, texel, component);
                    break;
                case GGL_DECAL:
                    decal(fragment, incoming, texel, component);
                    break;
                case GGL_BLEND:
                    blend(fragment, incoming, texel, component, i);
                    break;
                case GGL_ADD:
                    add(fragment, incoming, texel, component);
                    break;
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::wrapping(
        int d,
        int coord, int size,
        int tx_wrap, int tx_linear)
{
    // notes:
    // if tx_linear is set, we need 4 extra bits of precision on the result
    // SMULL/UMULL is 3 cycles
    Scratch scratches(registerFile());
    int c = coord;
    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
        // UMULL takes 4 cycles (interlocked), and we can get away with
        // 2 cycles using SMULWB, but we're losing 16 bits of precision
        // out of 32 (this is not a problem because the iterator keeps
        // its full precision)
        // UMULL(AL, 0, size, d, c, size);
        // note: we can't use SMULTB because it's signed.
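        // the two instructions below roughly compute
        //   d = (c*size) >> (32 - tx_linear)
        // i.e. the REPEATed coordinate scaled to texel units, keeping
        // tx_linear extra bits of fraction when linear filtering is enabled.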
        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
        SMULWB(AL, d, d, size);
    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
        if (tx_linear) {
            // 1 cycle
            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
        } else {
            // 4 cycles (common case)
            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
            CMP(AL, d, size);
            SUB(GE, 0, d, size, imm(1));
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::modulate(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);

    const int Nt = texel.size();
        // Nt should always be less than 10 bits because it comes
        // from the TMU.

    int Ni = incoming.size();
        // Ni could be big because it comes from previous MODULATEs

    if (Nt == 1) {
        // texel acts as a bit-mask
        // dest = incoming & ((texel << incoming.h)-texel)
        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
        dest.l = incoming.l;
        dest.h = incoming.h;
        dest.flags |= (incoming.flags & CLEAR_LO);
    } else if (Ni == 1) {
        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
        dest.l = 0;
        dest.h = Nt;
    } else {
        int inReg = incoming.reg;
        int shift = incoming.l;
        if ((Nt + Ni) > 32) {
            // we will overflow, reduce the precision of Ni to 8 bits
            // (Note Nt cannot be more than 10 bits which happens with
            // 565 textures and GGL_LINEAR)
            shift += Ni-8;
            Ni = 8;
        }

        // modulate by the component with the lowest precision
        if (Nt >= Ni) {
            if (shift) {
                // XXX: we should be able to avoid this shift
                // when shift==16 && Nt<16 && Ni<16, in which
                // case we could use SMULBT below.
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Ni)-1)
            // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
            // this operation doesn't change texel's size
            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
            dest.l = Ni;
            dest.h = Nt + Ni;
        } else {
            if (shift && (shift != 16)) {
                // if shift==16, we can use 16-bit mul instructions later
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Nt)-1)
            // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
            // this operation doesn't change incoming's size
            Scratch scratches(registerFile());
            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
            if (t == inReg)
                t = scratches.obtain();
            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
            if (Nt<16 && Ni<16) {
                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
                else            SMULBB(AL, dest.reg, t, inReg);
            } else              MUL(AL, 0, dest.reg, t, inReg);
            dest.l = Nt;
            dest.h = Nt + Ni;
        }

        // low bits are not valid
        dest.flags |= CLEAR_LO;

        // no need to keep more than 8 bits/component
        if (dest.size() > 8)
            dest.l = dest.h-8;
    }
}

void GGLAssembler::decal(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
    // Av = Af
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);
    extract(factor, incomingTexel, GGLFormat::ALPHA);

    // no need to keep more than 8-bits for decal
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
}

void GGLAssembler::blend(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component, int tmu)
{
    // RGBA:
    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
    // Av = At*Af

    if (component == GGLFormat::ALPHA) {
        modulate(dest, incoming, incomingTexel, component);
        return;
    }

    Scratch locals(registerFile());
    integer_t color(locals.obtain(), 8, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    LDRB(AL, color.reg, mBuilderContext.Rctx,
            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
    extract(factor, incomingTexel, component);

    // no need to keep more than 8-bits for blend
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, color);
}

void GGLAssembler::add(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf + Ct;
    Scratch locals(registerFile());

    component_t incomingTemp(incoming);

    // use "dest" as a temporary for extracting the texel, unless "dest"
    // overlaps "incoming".
    integer_t texel(dest.reg, 32, CORRUPTIBLE);
    if (dest.reg == incomingTemp.reg)
        texel.reg = locals.obtain();
    extract(texel, incomingTexel, component);

    if (texel.s < incomingTemp.size()) {
        expand(texel, texel, incomingTemp.size());
    } else if (texel.s > incomingTemp.size()) {
        if (incomingTemp.flags & CORRUPTIBLE) {
            expand(incomingTemp, incomingTemp, texel.s);
        } else {
            incomingTemp.reg = locals.obtain();
            expand(incomingTemp, incoming, texel.s);
        }
    }

    if (incomingTemp.l) {
        ADD(AL, 0, dest.reg, texel.reg,
                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
    } else {
        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
    }
    dest.l = 0;
    dest.h = texel.size();
    component_sat(dest);
}

// ----------------------------------------------------------------------------

}; // namespace android