/* libs/pixelflinger/codeflinger/texturing.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#define LOG_TAG "pixelflinger-code"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

#include <log/log.h>

#include "GGLAssembler.h"

namespace android {

// ---------------------------------------------------------------------------

// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// ((x<<16 + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ( (x*dx) + dx>>1 ) + x0
// (x*dx) + (dx>>1 + x0)
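
// Worked example (illustrative values, not from the original source):
// with x = 3, dx = 0x00010000 (1.0 in 16.16 fixed point) and
// x0 = 0x00200000, intToFixedCenter(3) = (3<<16) + 0x8000, so the
// iterator starts at 3*dx + (dx>>1 + x0) = 0x00238000, i.e. the value
// sampled at the pixel center x + 0.5.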
void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
{
    context_t const* c = mBuilderContext.c;

    if (mSmooth) {
        // NOTE: we could take this case in the mDithering + !mSmooth case,
        // but this would use up to 4 more registers for the color components
        // for only a little added quality.
        // Currently, this causes the system to run out of registers in
        // some cases (see issue #719496)

        comment("compute initial iterated color (smooth and/or dither case)");

        parts.iterated_packed = 0;
        parts.packed = 0;

        // 0x1: color component
        // 0x2: iterators
        const int optReload = mOptLevel >> 1;
        if (optReload >= 3)         parts.reload = 0; // reload nothing
        else if (optReload == 2)    parts.reload = 2; // reload iterators
        else if (optReload == 1)    parts.reload = 1; // reload colors
        else if (optReload <= 0)    parts.reload = 3; // reload both

        if (!mSmooth) {
            // we're not smoothing (just dithering), we never have to
            // reload the iterators
            parts.reload &= ~2;
        }

        Scratch scratches(registerFile());
        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
        for (int i=0 ; i<4 ; i++) {
            if (!mInfo[i].iterated)
                continue;

            // this component exists in the destination and is not replaced
            // by a texture unit.
            const int c = (parts.reload & 1) ? t0 : obtainReg();
            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
            parts.argb[i].reg = c;

            if (mInfo[i].smooth) {
                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
                const int dvdx = parts.argb_dx[i].reg;
                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
                MLA(AL, 0, c, x.reg, dvdx, c);

                // adjust the color iterator to make sure it won't overflow
                if (!mAA) {
                    // this is not needed when we're using anti-aliasing
                    // because we will (have to) clamp the components
                    // anyway.
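                    // What follows (sketch): end = c + (count>>16)*dvdx is
                    // the iterated value at the end of the span; if it
                    // underflows (MI after the flag-setting MLA), the start
                    // is shifted up by the overshoot, then clamped to zero
                    // with BIC.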
                    int end = scratches.obtain();
                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
                    MLA(AL, 1, end, dvdx, end, c);
                    SUB(MI, 0, c, c, end);
                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
                    scratches.recycle(end);
                }
            }

            if (parts.reload & 1) {
                CONTEXT_STORE(c, generated_vars.argb[i].c);
            }
        }
    } else {
        // We're not smoothing, so we can just use a packed version
        // of the color and extract the components as needed
        // (or not at all if we don't blend)

        // figure out if we need the iterated color
        int load = 0;
        for (int i=0 ; i<4 ; i++) {
            component_info_t& info = mInfo[i];
            if ((info.inDest || info.needed) && !info.replaced)
                load |= 1;
        }

        parts.iterated_packed = 1;
        parts.packed = (!mTextureMachine.mask && !mBlending
                && !mFog && !mDithering);
        parts.reload = 0;
        if (load || parts.packed) {
            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
                comment("load initial iterated color (8888 packed)");
                parts.iterated.setTo(obtainReg(),
                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
                CONTEXT_LOAD(parts.iterated.reg, packed8888);
            } else {
                comment("load initial iterated color (dest format packed)");

                parts.iterated.setTo(obtainReg(), &mCbFormat);

                // pre-mask the iterated color
                const int bits = parts.iterated.size();
                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
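                // 'size' is an all-ones mask 'bits' wide; the bits>=32 case
                // sidesteps the undefined 1<<32 shift by letting the
                // unsigned subtraction wrap around to 0xFFFFFFFF.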
                uint32_t mask = 0;
                if (mMasking) {
                    for (int i=0 ; i<4 ; i++) {
                        const int component_mask = 1<<i;
                        const int h = parts.iterated.format.c[i].h;
                        const int l = parts.iterated.format.c[i].l;
                        if (h && (!(mMasking & component_mask))) {
                            mask |= ((1<<(h-l))-1) << l;
                        }
                    }
                }

                if (mMasking && ((mask & size)==0)) {
                    // none of the components are present in the mask
                } else {
                    CONTEXT_LOAD(parts.iterated.reg, packed);
                    if (mCbFormat.size == 1) {
                        AND(AL, 0, parts.iterated.reg,
                                parts.iterated.reg, imm(0xFF));
                    } else if (mCbFormat.size == 2) {
                        MOV(AL, 0, parts.iterated.reg,
                                reg_imm(parts.iterated.reg, LSR, 16));
                    }
                }

                // pre-mask the iterated color
                if (mMasking) {
                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
                            mask, bits);
                }
            }
        }
    }
}

void GGLAssembler::build_iterated_color(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);

    if (!mInfo[component].iterated)
        return;

    if (parts.iterated_packed) {
        // iterated colors are packed, extract the one we need
        extract(fragment, parts.iterated, component);
    } else {
        fragment.h = GGL_COLOR_BITS;
        fragment.l = GGL_COLOR_BITS - 8;
        fragment.flags |= CLEAR_LO;
        // iterated colors are held in their own register
        // (smooth and/or dithering case)
        if (parts.reload==3) {
            // this implies mSmooth
            Scratch scratches(registerFile());
            int dx = scratches.obtain();
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
            ADD(AL, 0, dx, fragment.reg, dx);
            CONTEXT_STORE(dx, generated_vars.argb[component].c);
        } else if (parts.reload & 1) {
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
        } else {
            // we don't reload, so simply rename the register and mark as
            // non-CORRUPTIBLE so that the texture env or blending code
            // won't modify this (renamed) register
            regs.recycle(fragment.reg);
            fragment.reg = parts.argb[component].reg;
            fragment.flags &= ~CORRUPTIBLE;
        }
        if (mInfo[component].smooth && mAA) {
            // when using smooth shading AND anti-aliasing, we need to clamp
            // the iterators because there is always an extra pixel on the
            // edges, which most of the time will cause an overflow
            // (since technically it's outside of the domain).
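            // BIC against the value's own sign bits (reg ASR 31 is all ones
            // when the value is negative) zeroes a negative iterator in one
            // instruction; component_sat() then handles the high side.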
            BIC(AL, 0, fragment.reg, fragment.reg,
                    reg_imm(fragment.reg, ASR, 31));
            component_sat(fragment);
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
{
    // gather some information about the components we need to process...
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
    switch(opcode) {
    case GGL_COPY:
        mLogicOp = 0;
        break;
    case GGL_CLEAR:
    case GGL_SET:
        mLogicOp = LOGIC_OP;
        break;
    case GGL_AND:
    case GGL_AND_REVERSE:
    case GGL_AND_INVERTED:
    case GGL_XOR:
    case GGL_OR:
    case GGL_NOR:
    case GGL_EQUIV:
    case GGL_OR_REVERSE:
    case GGL_OR_INVERTED:
    case GGL_NAND:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
        break;
    case GGL_NOOP:
    case GGL_INVERT:
        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
        break;
    case GGL_COPY_INVERTED:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
        break;
    }
}

void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
{
    uint8_t replaced=0;
    mTextureMachine.mask = 0;
    mTextureMachine.activeUnits = 0;
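    // Walk the units in reverse so that 'replaced' accumulates the
    // components that a later (higher-numbered) unit will overwrite with
    // GGL_REPLACE; earlier units can then skip those components entirely.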
    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (replaced == 0xF) {
            // all components are replaced, skip this TMU.
            tmu.format_idx = 0;
            tmu.mask = 0;
            tmu.replaced = replaced;
            continue;
        }
        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
        tmu.format = c->formats[tmu.format_idx];
        tmu.bits = tmu.format.size*8;
        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
                && tmu.format.size!=3; // XXX: only 8, 16 and 32-bit modes for now

        // 5551 linear filtering is not supported
        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
            tmu.linear = 0;

        tmu.mask = 0;
        tmu.replaced = replaced;

        if (tmu.format_idx) {
            mTextureMachine.activeUnits++;
            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
            if (tmu.env == GGL_REPLACE) {
                replaced |= tmu.mask;
            } else if (tmu.env == GGL_DECAL) {
                if (!tmu.format.c[GGLFormat::ALPHA].h) {
                    // if we don't have alpha, decal does nothing
                    tmu.mask = 0;
                } else {
                    // decal always ignores At
                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
                }
            }
        }
        mTextureMachine.mask |= tmu.mask;
        //printf("%d: mask=%08lx, replaced=%08lx\n",
        //    i, int(tmu.mask), int(tmu.replaced));
    }
    mTextureMachine.replaced = replaced;
    mTextureMachine.directTexture = 0;
    //printf("replaced=%08lx\n", mTextureMachine.replaced);
}

void GGLAssembler::init_textures(
        tex_coord_t* coords,
        const reg_t& x, const reg_t& y)
{
    const needs_t& needs = mBuilderContext.needs;
    int Rx = x.reg;
    int Ry = y.reg;

    if (mTextureMachine.mask) {
        comment("compute texture coordinates");
    }

    // init texture coordinates for each tmu
    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        {
            // 1:1 texture
            pointer_t& txPtr = coords[i].ptr;
            txPtr.setTo(obtainReg(), tmu.bits);
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
            // merge base & offset
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            base_offset(txPtr, txPtr, Rx);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = coords[i].s;
            reg_t& t = coords[i].t;
            // s = (x * dsdx)>>16 + ydsdy
            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
            // t = (x * dtdx)>>16 + ydtdy
            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
            s.setTo(obtainReg());
            t.setTo(obtainReg());
            const int need_w = GGL_READ_NEEDS(W, needs.n);
            if (need_w) {
                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
            } else {
                int ydsdy = scratches.obtain();
                int ydtdy = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
            }

            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                recycleReg(s.reg);
                recycleReg(t.reg);
            }
        }

        // direct texture?
        if (!multiTexture && !mBlending && !mDithering && !mFog &&
            cb_format_idx == tmu.format_idx && !tmu.linear &&
            mTextureMachine.replaced == tmu.mask)
        {
            mTextureMachine.directTexture = i + 1;
        }
    }
}

void GGLAssembler::build_textures(  fragment_parts_t& parts,
                                    Scratch& regs)
{
    // We don't have a way to spill registers automatically, so spill the
    // depth and AA registers up front when we know we may need to.
    // Build the spill list...
    uint32_t spill_list = 0;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if (tmu.linear) {
            // we may run out of registers if we have linear filtering
            // at 1 or 4 bytes / pixel on any texture unit.
            if (tmu.format.size == 1) {
                // if depth and AA are enabled, we'll be one register short
                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
            if (tmu.format.size == 4) {
                // if depth or AA is enabled, we'll be 1 or 2 registers short
                if (parts.z.reg > 0)
                    spill_list |= 1<<parts.z.reg;
                if (parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
        }
    }

    Spill spill(registerFile(), *this, spill_list);

    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        pointer_t& txPtr = parts.coords[i].ptr;
        pixel_t& texel = parts.texel[i];

        // repeat...
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            comment("fetch texel");
            texel.setTo(regs.obtain(), &tmu.format);
            load(txPtr, texel, WRITE_BACK);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = parts.coords[i].s;
            reg_t& t = parts.coords[i].t;
            if ((mOptLevel&1)==0) {
                comment("reload s/t (multitexture or linear filtering)");
                s.reg = scratches.obtain();
                t.reg = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
            }

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            comment("compute repeat/clamp");
            int u       = scratches.obtain();
            int v       = scratches.obtain();
            int width   = scratches.obtain();
            int height  = scratches.obtain();
            int U = 0;
            int V = 0;

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
            CONTEXT_LOAD(height, generated_vars.texture[i].height);

            int FRAC_BITS = 0;
            if (tmu.linear) {
                // linear interpolation
                if (tmu.format.size == 1) {
                    // for 8-bit textures, we can afford
                    // 7 bits of fractional precision at no
                    // additional cost (we can't do 8 bits
                    // because filter8 uses signed 16-bit muls)
                    FRAC_BITS = 7;
                } else if (tmu.format.size == 2) {
                    // filter16() is internally limited to 4 bits, so:
                    // FRAC_BITS=2 generates fewer instructions,
                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
                    // FRAC_BITS=6+ looks good
                    FRAC_BITS = 6;
                } else if (tmu.format.size == 4) {
                    // filter32() is internally limited to 8 bits, so:
                    // FRAC_BITS=4 looks good
                    // FRAC_BITS=5+ looks better, but generates 3 extra
                    // instructions per pixel
                    FRAC_BITS = 6;
                } else {
                    // for all other cases we use 4 bits.
                    FRAC_BITS = 4;
                }
            }
            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);

            if (tmu.linear) {
                comment("compute linear filtering offsets");
                // pixel size scale
                const int shift = 31 - gglClz(tmu.format.size);
                U = scratches.obtain();
                V = scratches.obtain();

                if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                    return;

                // sample the texel center
                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));

                // get the fractional part of U,V
                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));

                // compute width-1 and height-1
                SUB(AL, 0, width,  width,  imm(1));
                SUB(AL, 0, height, height, imm(1));

                // get the integer part of U,V and clamp/wrap
                // and compute offset to the next texel
                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
                    // u has already been REPEATed
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, width);
                    CMP(AL, u, width);
                    MOV(LT, 0, width, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
                    RSB(GE, 0, width, width, imm(0));
                } else {
                    // u has not been CLAMPed yet
                    // algorithm:
                    // if ((u>>4) >= width)
                    //      u = width<<4
                    //      width = 0
                    // else
                    //      width = 1<<shift
                    // u = u>>4; // get integer part
                    // if (u<0)
                    //      u = 0
                    //      width = 0
                    // generated_vars.rt = width

                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
                    MOV(LE, 0, width, imm(0));
                    MOV(GT, 0, width, imm(1 << shift));
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, imm(0));
                    MOV(MI, 0, width, imm(0));
                }
                CONTEXT_STORE(width, generated_vars.rt);

                const int stride = width;
                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
                    // v has already been REPEATed
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, height);
                    CMP(AL, v, height);
                    MOV(LT, 0, height, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
                    RSB(GE, 0, height, height, imm(0));
                    MUL(AL, 0, height, stride, height);
                } else {
                    // v has not been CLAMPed yet
                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
                    MOV(LE, 0, height, imm(0));
                    if (shift) {
                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
                    } else {
                        MOV(GT, 0, height, stride);
                    }
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, imm(0));
                    MOV(MI, 0, height, imm(0));
                }
                CONTEXT_STORE(height, generated_vars.lb);
            }

            scratches.recycle(width);
            scratches.recycle(height);

            // iterate texture coordinates...
            comment("iterate s,t");
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s.reg, s.reg, dsdx);
            ADD(AL, 0, t.reg, t.reg, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                scratches.recycle(s.reg);
                scratches.recycle(t.reg);
            }
            scratches.recycle(dsdx);
            scratches.recycle(dtdx);

            // merge base & offset...
            comment("merge base & offset");
            texel.setTo(regs.obtain(), &tmu.format);
            txPtr.setTo(texel.reg, tmu.bits);
            int stride = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            SMLABB(AL, u, v, stride, u);    // u+v*stride
            base_offset(txPtr, txPtr, u);

            // load texel
            if (!tmu.linear) {
                comment("fetch texel");
                load(txPtr, texel, 0);
            } else {
                // recycle registers we don't need anymore
                scratches.recycle(u);
                scratches.recycle(v);
                scratches.recycle(stride);

                comment("fetch texel, bilinear");
                switch (tmu.format.size) {
                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                }
            }
        }
    }
}

void GGLAssembler::build_iterate_texture_coordinates(
    const fragment_parts_t& parts)
{
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            const pointer_t& txPtr = parts.coords[i].ptr;
            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
        } else {
            Scratch scratches(registerFile());
            int s = parts.coords[i].s.reg;
            int t = parts.coords[i].t.reg;
            if ((mOptLevel&1)==0) {
                s = scratches.obtain();
                t = scratches.obtain();
                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
            }
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();
            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s, s, dsdx);
            ADD(AL, 0, t, t, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
            }
        }
    }
}

void GGLAssembler::filter8(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    if (tmu.format.components != GGL_ALPHA &&
        tmu.format.components != GGL_LUMINANCE)
    {
        // this is a packed format, and we don't support
        // linear filtering (it's probably RGB 332)
        // Should not happen with OpenGL|ES
        LDRB(AL, texel.reg, txPtr.reg);
        return;
    }

    // ------------------------
    // about ~22 cycles / pixel
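    //
    // Bilinear weighting (sketch): with fractional offsets U,V at FRAC_BITS
    // of precision, the filtered texel is
    //   LT*(1-U)*(1-V) + RT*U*(1-V) + LB*(1-U)*V + RB*U*V
    // where the four weights sum to 1<<(2*FRAC_BITS). 'k' below tracks the
    // remaining weight so that the last tap (RT) uses exactly the leftover,
    // keeping the total weight constant.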
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();
    int rt   = scratches.obtain();
    int lb   = scratches.obtain();

    // RB -> U * V

    CONTEXT_LOAD(rt, generated_vars.rt);
    CONTEXT_LOAD(lb, generated_vars.lb);

    int offset = pixel;
    ADD(AL, 0, offset, lb, rt);
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    SMULBB(AL, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));

    // LB -> (1-U) * V
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);

    // RT -> U*(1-V)
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
    SUB(AL, 0, u, k, u);
    SMLABB(AL, texel.reg, pixel, u, d);

    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        texel.format.c[i].h = FRAC_BITS*2+8;
        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
    }
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_LO;
}

void GGLAssembler::filter16(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // compute the mask
    // XXX: it would be nice if the mask below could be computed
    // automatically.
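    //
    // The widening trick used below (sketch): OR-ing the 16-bit pixel with
    // itself shifted left by 'shift' and AND-ing with 'mask' spreads the
    // color fields apart inside one 32-bit register, leaving 'prec' bits of
    // headroom between them, so a single MUL by a weight <= 1<<prec scales
    // every component at once without cross-field carries.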
    uint32_t mask = 0;
    int shift = 0;
    int prec = 0;
    switch (tmu.format_idx) {
        case GGL_PIXEL_FORMAT_RGB_565:
            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
            mask = 0x07E0F81F;
            shift = 16;
            prec = 5;
            break;
        case GGL_PIXEL_FORMAT_RGBA_4444:
            // 0000,1111,0000,1111 | 0000,1111,0000,1111
            mask = 0x0F0F0F0F;
            shift = 12;
            prec = 4;
            break;
        case GGL_PIXEL_FORMAT_LA_88:
            // 0000,0000,1111,1111 | 0000,0000,1111,1111
            // AALL -> 00AA | 00LL
            mask = 0x00FF00FF;
            shift = 8;
            prec = 8;
            break;
        default:
            // unsupported format, do something sensible...
            ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
            LDRH(AL, texel.reg, txPtr.reg);
            return;
    }

    const int adjust = FRAC_BITS*2 - prec;
    const int round  = 0;

    // update the texel format
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_HI|CLEAR_LO;
    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
    }

    // ------------------------
    // about ~40 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<prec));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SUB(AL, 0, u, k, u);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    MLA(AL, 0, texel.reg, pixel, u, d);
}

void GGLAssembler::filter24(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int /*U*/, int /*V*/, pointer_t& txPtr,
        int /*FRAC_BITS*/)
{
    // not supported yet (currently disabled)
    load(txPtr, texel, 0);
}

void GGLAssembler::filter32(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;

    // ------------------------
    // about ~38 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int dh   = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    int temp = scratches.obtain();
    int dl   = scratches.obtain();
    int mask = scratches.obtain();

    MOV(AL, 0, mask, imm(0xFF));
    ORR(AL, 0, mask, mask, imm(0xFF0000));
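    // mask = 0x00FF00FF: the 8888 pixel is split into its even bytes
    // (pixel & mask) and odd bytes ((pixel>>8) & mask) so each MUL below
    // scales two channels at once with 8 bits of headroom between them;
    // dh and dl accumulate the two halves separately.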

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, dh, temp, u);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SUB(AL, 0, u, k, u);
    AND(AL, 0, temp, mask, pixel);
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
    ORR(AL, 0, texel.reg, dh, dl);
}

void GGLAssembler::build_texture_environment(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    const uint32_t component_mask = 1<<component;
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];

        if (tmu.mask & component_mask) {
            // replace or modulate with this texture
            if ((tmu.replaced & component_mask) == 0) {
                // not replaced by a later tmu...

                Scratch scratches(registerFile());
                pixel_t texel(parts.texel[i]);

                if (multiTexture &&
                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
                    tmu.twrap == GGL_NEEDS_WRAP_11)
                {
                    texel.reg = scratches.obtain();
                    texel.flags |= CORRUPTIBLE;
                    comment("fetch texel (multitexture 1:1)");
                    load(parts.coords[i].ptr, texel, WRITE_BACK);
                }

                component_t incoming(fragment);
                modify(fragment, regs);

                switch (tmu.env) {
                case GGL_REPLACE:
                    extract(fragment, texel, component);
                    break;
                case GGL_MODULATE:
                    modulate(fragment, incoming, texel, component);
                    break;
                case GGL_DECAL:
                    decal(fragment, incoming, texel, component);
                    break;
                case GGL_BLEND:
                    blend(fragment, incoming, texel, component, i);
                    break;
                case GGL_ADD:
                    add(fragment, incoming, texel, component);
                    break;
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::wrapping(
            int d,
            int coord, int size,
            int tx_wrap, int tx_linear)
{
    // notes:
    // if tx_linear is set, we need 4 extra bits of precision on the result
    // SMULL/UMULL is 3 cycles
    Scratch scratches(registerFile());
    int c = coord;
    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
        // UMULL takes 4 cycles (interlocked), and we can get away with
        // 2 cycles using SMULWB, but we're losing 16 bits of precision
        // out of 32 (this is not a problem because the iterator keeps
        // its full precision)
        // UMULL(AL, 0, size, d, c, size);
        // note: we can't use SMULTB because it's signed.
        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
        SMULWB(AL, d, d, size);
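        // In C terms (sketch):
        //   d = (int32_t)(((int64_t)(c >> (16 - tx_linear))
        //           * (int16_t)size) >> 16);
        // i.e. the coordinate's 16-bit fraction (plus tx_linear guard bits)
        // scaled by the texture size.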
    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
        if (tx_linear) {
            // 1 cycle
            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
        } else {
            // 4 cycles (common case)
            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
            CMP(AL, d, size);
            SUB(GE, 0, d, size, imm(1));
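            // C equivalent (sketch):
            //   d = coord >> 16;
            //   if (d < 0)     d = 0;         // BIC with the sign mask
            //   if (d >= size) d = size - 1;  // CMP/SUB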
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::modulate(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);

    const int Nt = texel.size();
        // Nt should always be less than 10 bits because it comes
        // from the TMU.

    int Ni = incoming.size();
        // Ni could be big because it comes from previous MODULATEs

    if (Nt == 1) {
        // texel acts as a bit-mask
        // dest = incoming & ((texel << incoming.h)-texel)
        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
        dest.l = incoming.l;
        dest.h = incoming.h;
        dest.flags |= (incoming.flags & CLEAR_LO);
    } else if (Ni == 1) {
        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
        dest.l = 0;
        dest.h = Nt;
    } else {
        int inReg = incoming.reg;
        int shift = incoming.l;
        if ((Nt + Ni) > 32) {
            // we will overflow, reduce the precision of Ni to 8 bits
            // (Note: Nt cannot be more than 10 bits, which happens with
            // 565 textures and GGL_LINEAR)
            shift += Ni-8;
            Ni = 8;
        }

        // modulate by the component with the lowest precision
        if (Nt >= Ni) {
            if (shift) {
                // XXX: we should be able to avoid this shift
                // when shift==16 && Nt<16 && Ni<16, in which case
                // we could use SMULBT below.
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Ni)-1)
            // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
            // this operation doesn't change texel's size
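            // Example (illustrative): with Ni=8 and Ct=255,
            // Ct + (Ct>>7) = 256, so Cf*256>>8 == Cf; a full-intensity
            // texel leaves the fragment unchanged, exactly as the true
            // division by 255 would.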
            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
            dest.l = Ni;
            dest.h = Nt + Ni;
        } else {
            if (shift && (shift != 16)) {
                // if shift==16, we can use 16-bit mul instructions later
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Nt)-1)
            // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
            // this operation doesn't change incoming's size
            Scratch scratches(registerFile());
            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
            if (t == inReg)
                t = scratches.obtain();
            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
            if (Nt<16 && Ni<16) {
                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
                else            SMULBB(AL, dest.reg, t, inReg);
            } else              MUL(AL, 0, dest.reg, t, inReg);
            dest.l = Nt;
            dest.h = Nt + Ni;
        }

        // low bits are not valid
        dest.flags |= CLEAR_LO;

        // no need to keep more than 8 bits/component
        if (dest.size() > 8)
            dest.l = dest.h-8;
    }
}

void GGLAssembler::decal(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
    // Av = Af
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);
    extract(factor, incomingTexel, GGLFormat::ALPHA);

    // no need to keep more than 8 bits for decal
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
}

void GGLAssembler::blend(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component, int tmu)
{
    // RGBA:
    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
    // Av = At*Af

    if (component == GGLFormat::ALPHA) {
        modulate(dest, incoming, incomingTexel, component);
        return;
    }

    Scratch locals(registerFile());
    integer_t color(locals.obtain(), 8, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    LDRB(AL, color.reg, mBuilderContext.Rctx,
            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
    extract(factor, incomingTexel, component);

    // no need to keep more than 8 bits for blend
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, color);
}

void GGLAssembler::add(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf + Ct;
    Scratch locals(registerFile());

    component_t incomingTemp(incoming);

    // use "dest" as a temporary for extracting the texel, unless "dest"
    // overlaps "incoming".
    integer_t texel(dest.reg, 32, CORRUPTIBLE);
    if (dest.reg == incomingTemp.reg)
        texel.reg = locals.obtain();
    extract(texel, incomingTexel, component);

    if (texel.s < incomingTemp.size()) {
        expand(texel, texel, incomingTemp.size());
    } else if (texel.s > incomingTemp.size()) {
        if (incomingTemp.flags & CORRUPTIBLE) {
            expand(incomingTemp, incomingTemp, texel.s);
        } else {
            incomingTemp.reg = locals.obtain();
            expand(incomingTemp, incoming, texel.s);
        }
    }

    if (incomingTemp.l) {
        ADD(AL, 0, dest.reg, texel.reg,
                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
    } else {
        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
    }
    dest.l = 0;
    dest.h = texel.size();
    component_sat(dest);
}

// ----------------------------------------------------------------------------

}; // namespace android