Home | History | Annotate | Download | only in libpixelflinger
      1 /* libs/pixelflinger/scanline.cpp
      2 **
      3 ** Copyright 2006-2011, The Android Open Source Project
      4 **
      5 ** Licensed under the Apache License, Version 2.0 (the "License");
      6 ** you may not use this file except in compliance with the License.
      7 ** You may obtain a copy of the License at
      8 **
      9 **     http://www.apache.org/licenses/LICENSE-2.0
     10 **
     11 ** Unless required by applicable law or agreed to in writing, software
     12 ** distributed under the License is distributed on an "AS IS" BASIS,
     13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 ** See the License for the specific language governing permissions and
     15 ** limitations under the License.
     16 */
     17 
     18 
     19 #define LOG_TAG "pixelflinger"
     20 
     21 #include <assert.h>
     22 #include <stdlib.h>
     23 #include <stdio.h>
     24 #include <string.h>
     25 
     26 #include <cutils/memory.h>
     27 #include <cutils/log.h>
     28 
     29 #include "buffer.h"
     30 #include "scanline.h"
     31 
     32 #include "codeflinger/CodeCache.h"
     33 #include "codeflinger/GGLAssembler.h"
     34 #include "codeflinger/ARMAssembler.h"
     35 #if defined(__mips__)
     36 #include "codeflinger/MIPSAssembler.h"
     37 #endif
     38 //#include "codeflinger/ARMAssemblerOptimizer.h"
     39 
     40 // ----------------------------------------------------------------------------
     41 
// Possible values of ANDROID_CODEGEN (how the pixel pipeline is selected).
#define ANDROID_CODEGEN_GENERIC     0   // force generic pixel pipeline
#define ANDROID_CODEGEN_C           1   // hand-written C, fallback generic
#define ANDROID_CODEGEN_ASM         2   // hand-written asm, fallback generic
#define ANDROID_CODEGEN_GENERATED   3   // hand-written asm, fallback codegen

// Release (NDEBUG) and debug builds currently pick the same strategy; the two
// branches differ only in the ANDROID_RELEASE / ANDROID_DEBUG marker macro.
#ifdef NDEBUG
#   define ANDROID_RELEASE
#   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
#else
#   define ANDROID_DEBUG
#   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
#endif

// 1 on ARM and MIPS targets, 0 elsewhere. Gates the code cache, the
// ScanlineAssembly class and the code-generation path of pick_scanline().
#if defined(__arm__) || defined(__mips__)
#   define ANDROID_ARM_CODEGEN  1
#else
#   define ANDROID_ARM_CODEGEN  0
#endif

// When non-zero, pick_scanline() compiles out all of its shortcut lookups and
// goes straight to its fallback path.
#define DEBUG__CODEGEN_ONLY     0

/* Set to 1 to dump to the log the states that need a new
 * code-generated scanline callback, i.e. those that don't
 * have a corresponding shortcut function.
 */
#define DEBUG_NEEDS  0

// Size argument passed to every new ScanlineAssembly; twice as large on MIPS.
#ifdef __mips__
#define ASSEMBLY_SCRATCH_SIZE   4096
#else
#define ASSEMBLY_SCRATCH_SIZE   2048
#endif
     75 // ----------------------------------------------------------------------------
     76 namespace android {
     77 // ----------------------------------------------------------------------------
     78 
// Candidates for context_t::init_y (see ggl_init_scanline() and the
// shortcut table).
static void init_y(context_t*, int32_t);
static void init_y_noop(context_t*, int32_t);
static void init_y_packed(context_t*, int32_t);
static void init_y_error(context_t*, int32_t);

// Candidates for context_t::step_y.
static void step_y__generic(context_t* c);
static void step_y__nop(context_t*);
static void step_y__smooth(context_t* c);
static void step_y__tmu(context_t* c);
static void step_y__w(context_t* c);

// Candidates for context_t::scanline: the generic pipeline, the perspective
// wrappers installed by ggl_pick_scanline(), and the specialised routines
// referenced from the shortcut table and pick_scanline().
static void scanline(context_t* c);
static void scanline_perspective(context_t* c);
static void scanline_perspective_single(context_t* c);
static void scanline_t32cb16blend(context_t* c);
static void scanline_t32cb16blend_dither(context_t* c);
static void scanline_t32cb16blend_srca(context_t* c);
static void scanline_t32cb16blend_clamp(context_t* c);
static void scanline_t32cb16blend_clamp_dither(context_t* c);
static void scanline_t32cb16blend_clamp_mod(context_t* c);
static void scanline_x32cb16blend_clamp_mod(context_t* c);
static void scanline_t32cb16blend_clamp_mod_dither(context_t* c);
static void scanline_x32cb16blend_clamp_mod_dither(context_t* c);
static void scanline_t32cb16(context_t* c);
static void scanline_t32cb16_dither(context_t* c);
static void scanline_t32cb16_clamp(context_t* c);
static void scanline_t32cb16_clamp_dither(context_t* c);
static void scanline_col32cb16blend(context_t* c);
static void scanline_t16cb16_clamp(context_t* c);
static void scanline_t16cb16blend_clamp_mod(context_t* c);
static void scanline_memcpy(context_t* c);
static void scanline_memset8(context_t* c);
static void scanline_memset16(context_t* c);
static void scanline_memset32(context_t* c);
static void scanline_noop(context_t* c);
static void scanline_set(context_t* c);
static void scanline_clear(context_t* c);

// Rectangle-level routines (yc is a size_t count; presumably the number of
// scanlines to process -- confirm at the definitions).
static void rect_generic(context_t* c, size_t yc);
static void rect_memcpy(context_t* c, size_t yc);

// C-linkage span routines defined outside this file, selected per target
// architecture. NOTE(review): presumably the hand-written assembly versions;
// confirm against the arch-specific sources.
#if defined( __arm__)
extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
extern "C" void scanline_col32cb16blend_arm(uint16_t *dst, uint32_t col, size_t ct);
#elif defined(__mips__)
extern "C" void scanline_t32cb16blend_mips(uint16_t*, uint32_t*, size_t);
#endif
    128 
    129 // ----------------------------------------------------------------------------
    130 
    131 static inline uint16_t  convertAbgr8888ToRgb565(uint32_t  pix)
    132 {
    133     return uint16_t( ((pix << 8) & 0xf800) |
    134                       ((pix >> 5) & 0x07e0) |
    135                       ((pix >> 19) & 0x001f) );
    136 }
    137 
// One row of the shortcut table: a filter over the context's "needs" plus
// the callbacks pick_scanline() installs when that filter matches.
struct shortcut_t {
    // pattern tested with c->state.needs.match()
    needs_filter_t  filter;
    // human-readable description of the case handled by this row
    const char*     desc;
    // copied into context_t::scanline on a match
    void            (*scanline)(context_t*);
    // copied into context_t::init_y on a match
    void            (*init_y)(context_t*, int32_t);
};
    144 
    145 // Keep in sync with needs
    146 
    147 /* To understand the values here, have a look at:
    148  *     system/core/include/private/pixelflinger/ggl_context.h
    149  *
    150  * Especially the lines defining and using GGL_RESERVE_NEEDS
    151  *
    152  * Quick reminders:
    153  *   - the last nibble of the first value is the destination buffer format.
    154  *   - the last nibble of the third value is the source texture format
    155  *   - formats: 4=rgb565 1=abgr8888 2=xbgr8888
    156  *
    157  * In the descriptions below:
    158  *
    159  *   SRC      means we copy the source pixels to the destination
    160  *
    161  *   SRC_OVER means we blend the source pixels to the destination
    162  *            with dstFactor = 1-srcA, srcFactor=1  (premultiplied source).
    163  *            This mode is otherwise called 'blend'.
    164  *
 *   SRCA_OVER means we blend the source pixels to the destination
 *             with dstFactor=1-srcA, srcFactor=srcA (non-premul source).
 *             This mode is otherwise called 'blend_srca'.
    168  *
    169  *   clamp    means we fetch source pixels from a texture with u/v clamping
    170  *
    171  *   mod      means the source pixels are modulated (multiplied) by the
    172  *            a/r/g/b of the current context's color. Typically used for
    173  *            fade-in / fade-out.
    174  *
    175  *   dither   means we dither 32 bit values to 16 bits
    176  */
// Table of hand-written scanline routines. pick_scanline() scans it from the
// top and installs the first row whose filter matches the current needs, so
// the order of the rows matters.
// Each filter is written as two rows of four words; going by the masks used
// below, the first row holds the expected values and the second the bits that
// are compared (NOTE(review): confirm against needs_t::match in
// ggl_context.h).
static shortcut_t shortcuts[] = {
    { { { 0x03515104, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend SRC_OVER", scanline_t32cb16blend, init_y_noop },
    { { { 0x03010104, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC", scanline_t32cb16, init_y_noop  },
    /* same as first entry, but with dithering */
    { { { 0x03515104, 0x00000177, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend SRC_OVER dither", scanline_t32cb16blend_dither, init_y_noop },
    /* same as second entry, but with dithering */
    { { { 0x03010104, 0x00000177, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC dither", scanline_t32cb16_dither, init_y_noop  },
    /* this is used during the boot animation - CHEAT: ignore dithering */
    /* (the second mask word clears bit 8, the bit that differs between the
     *  dithered and non-dithered rows above) */
    { { { 0x03545404, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFEFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend dst:ONE_MINUS_SRCA src:SRCA", scanline_t32cb16blend_srca, init_y_noop },
    /* special case for arbitrary texture coordinates (think scaling) */
    { { { 0x03515104, 0x00000077, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp", scanline_t32cb16blend_clamp, init_y },
    { { { 0x03515104, 0x00000177, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp dither", scanline_t32cb16blend_clamp_dither, init_y },
    /* another case used during emulation */
    { { { 0x03515104, 0x00000077, { 0x00001001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp modulate", scanline_t32cb16blend_clamp_mod, init_y },
    /* and this */
    { { { 0x03515104, 0x00000077, { 0x00001002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC_OVER clamp modulate", scanline_x32cb16blend_clamp_mod, init_y },
    { { { 0x03515104, 0x00000177, { 0x00001001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp modulate dither", scanline_t32cb16blend_clamp_mod_dither, init_y },
    { { { 0x03515104, 0x00000177, { 0x00001002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC_OVER clamp modulate dither", scanline_x32cb16blend_clamp_mod_dither, init_y },
    { { { 0x03010104, 0x00000077, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
    { { { 0x03010104, 0x00000077, { 0x00000002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
    { { { 0x03010104, 0x00000177, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
    { { { 0x03010104, 0x00000177, { 0x00000002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
    { { { 0x03010104, 0x00000077, { 0x00000004, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 565 tx, SRC clamp", scanline_t16cb16_clamp, init_y  },
    /* NOTE(review): this row's texture word (0x00001004) and routine name
     * (..._clamp_mod) follow the pattern of the "clamp modulate" rows above,
     * but the description string does not mention "modulate". */
    { { { 0x03515104, 0x00000077, { 0x00001004, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 565 tx, SRC_OVER clamp", scanline_t16cb16blend_clamp_mod, init_y  },
    { { { 0x03515104, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0xFFFFFFFF } } },
        "565 fb, 8888 fixed color", scanline_col32cb16blend, init_y_packed  },
    /* The remaining rows only look at a few bits of the first two words. */
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x00000000, 0x00000007, { 0x00000000, 0x00000000 } } },
        "(nop) alpha test", scanline_noop, init_y_noop },
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x00000000, 0x00000070, { 0x00000000, 0x00000000 } } },
        "(nop) depth test", scanline_noop, init_y_noop },
    { { { 0x05000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x0F000000, 0x00000080, { 0x00000000, 0x00000000 } } },
        "(nop) logic_op", scanline_noop, init_y_noop },
    { { { 0xF0000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0xF0000000, 0x00000080, { 0x00000000, 0x00000000 } } },
        "(nop) color mask", scanline_noop, init_y_noop },
    { { { 0x0F000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(set) logic_op", scanline_set, init_y_noop },
    { { { 0x00000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(clear) logic_op", scanline_clear, init_y_noop },
    { { { 0x03000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFF00, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(clear) blending 0/0", scanline_clear, init_y_noop },
    /* last resort: flags a colour-buffer format field (low 6 bits) of 0 */
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x0000003F, 0x00000000, { 0x00000000, 0x00000000 } } },
        "(error) invalid color-buffer format", scanline_noop, init_y_error },
};
// Filter used by pick_scanline() to detect the plain-copy case. Its masks
// leave out the low 6 bits of the first and third words (the format fields),
// which pick_scanline() compares on its own, and bit 8 of the second word
// (the bit that differs between dithered and non-dithered shortcut rows).
static const needs_filter_t noblend1to1 = {
        // (disregard dithering, see below)
        { 0x03010100, 0x00000077, { 0x00000A00, 0x00000000 } },
        { 0xFFFFFFC0, 0xFFFFFEFF, { 0xFFFFFFC0, 0x0000003F } }
};
// Filter used by pick_scanline() to select one of the scanline_memset*
// routines. The colour-buffer format bits (low 6 bits of the first word) are
// masked out; pick_scanline() switches on the format's pixel size instead.
static  const needs_filter_t fill16noblend = {
        { 0x03010100, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFFC0, 0xFFFFFFFF, { 0x0000003F, 0x0000003F } }
};
    272 
    273 // ----------------------------------------------------------------------------
    274 
    275 #if ANDROID_ARM_CODEGEN
    276 
    277 #if defined(__mips__)
    278 static CodeCache gCodeCache(32 * 1024);
    279 #else
    280 static CodeCache gCodeCache(12 * 1024);
    281 #endif
    282 
    283 class ScanlineAssembly : public Assembly {
    284     AssemblyKey<needs_t> mKey;
    285 public:
    286     ScanlineAssembly(needs_t needs, size_t size)
    287         : Assembly(size), mKey(needs) { }
    288     const AssemblyKey<needs_t>& key() const { return mKey; }
    289 };
    290 #endif
    291 
    292 // ----------------------------------------------------------------------------
    293 
    294 void ggl_init_scanline(context_t* c)
    295 {
    296     c->init_y = init_y;
    297     c->step_y = step_y__generic;
    298     c->scanline = scanline;
    299 }
    300 
    301 void ggl_uninit_scanline(context_t* c)
    302 {
    303     if (c->state.buffers.coverage)
    304         free(c->state.buffers.coverage);
    305 #if ANDROID_ARM_CODEGEN
    306     if (c->scanline_as)
    307         c->scanline_as->decStrong(c);
    308 #endif
    309 }
    310 
    311 // ----------------------------------------------------------------------------
    312 
// Chooses the scanline / init_y / step_y callbacks for the context's current
// "needs". Tried in order:
//   1. same-format copy        -> scanline_memcpy
//   2. unblended solid fill    -> scanline_memset8/16/32
//   3. first matching row of the shortcut table
//   4. fallback: generated code from gCodeCache (built on a cache miss) when
//      ANDROID_ARM_CODEGEN is set, otherwise the generic scanline().
// Steps 1-3 are compiled out when DEBUG__CODEGEN_ONLY is non-zero.
static void pick_scanline(context_t* c)
{
#if (!defined(DEBUG__CODEGEN_ONLY) || (DEBUG__CODEGEN_ONLY == 0))

#if ANDROID_CODEGEN == ANDROID_CODEGEN_GENERIC
    // generic pipeline forced at build time: nothing to pick
    c->init_y = init_y;
    c->step_y = step_y__generic;
    c->scanline = scanline;
    return;
#endif

    //printf("*** needs [%08lx:%08lx:%08lx:%08lx]\n",
    //    c->state.needs.n, c->state.needs.p,
    //    c->state.needs.t[0], c->state.needs.t[1]);

    // first handle the special case that we cannot test with a filter
    // (texture 0 and the colour buffer must share the same format)
    const uint32_t cb_format = GGL_READ_NEEDS(CB_FORMAT, c->state.needs.n);
    if (GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0]) == cb_format) {
        if (c->state.needs.match(noblend1to1)) {
            // this will match regardless of dithering state, since both
            // src and dest have the same format anyway, there is no dithering
            // to be done.
            const GGLFormat* f =
                &(c->formats[GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0])]);
            if ((f->components == GGL_RGB) ||
                (f->components == GGL_RGBA) ||
                (f->components == GGL_LUMINANCE) ||
                (f->components == GGL_LUMINANCE_ALPHA))
            {
                // format must have all of RGB components
                // (so the current color doesn't show through)
                c->scanline = scanline_memcpy;
                c->init_y = init_y_noop;
                return;
            }
        }
    }

    if (c->state.needs.match(fill16noblend)) {
        c->init_y = init_y_packed;
        // pick the memset variant by the colour buffer's pixel size; any
        // other size falls through to the lookups below, which set init_y
        // again before returning
        switch (c->formats[cb_format].size) {
        case 1: c->scanline = scanline_memset8;  return;
        case 2: c->scanline = scanline_memset16; return;
        case 4: c->scanline = scanline_memset32; return;
        }
    }

    // first matching shortcut wins
    const int numFilters = sizeof(shortcuts)/sizeof(shortcut_t);
    for (int i=0 ; i<numFilters ; i++) {
        if (c->state.needs.match(shortcuts[i].filter)) {
            c->scanline = shortcuts[i].scanline;
            c->init_y = shortcuts[i].init_y;
            return;
        }
    }

#if DEBUG_NEEDS
    ALOGI("Needs: n=0x%08x p=0x%08x t0=0x%08x t1=0x%08x",
         c->state.needs.n, c->state.needs.p,
         c->state.needs.t[0], c->state.needs.t[1]);
#endif

#endif // DEBUG__CODEGEN_ONLY

    // no shortcut applies: use the generic per-line setup and stepping
    c->init_y = init_y;
    c->step_y = step_y__generic;

#if ANDROID_ARM_CODEGEN
    // we're going to have to generate some code...
    // here, generate code for our pixel pipeline
    const AssemblyKey<needs_t> key(c->state.needs);
    sp<Assembly> assembly = gCodeCache.lookup(key);
    if (assembly == 0) {
        // create a new assembly region
        sp<ScanlineAssembly> a = new ScanlineAssembly(c->state.needs,
                ASSEMBLY_SCRATCH_SIZE);
        // initialize our assembler
#if defined(__arm__)
        GGLAssembler assembler( new ARMAssembler(a) );
        //GGLAssembler assembler(
        //        new ARMAssemblerOptimizer(new ARMAssembler(a)) );
#endif
#if defined(__mips__)
        GGLAssembler assembler( new ArmToMipsAssembler(a) );
#endif
        // generate the scanline code for the given needs
        int err = assembler.scanline(c->state.needs, c);
        if (ggl_likely(!err)) {
            // finally, cache this assembly
            err = gCodeCache.cache(a->key(), a);
        }
        if (ggl_unlikely(err)) {
            // generation or caching failed: install do-nothing callbacks.
            // c->scanline_as is left untouched on this path.
            ALOGE("error generating or caching assembly. Reverting to NOP.");
            c->scanline = scanline_noop;
            c->init_y = init_y_noop;
            c->step_y = step_y__nop;
            return;
        }
        assembly = a;
    }

    // release the previous assembly
    if (c->scanline_as) {
        c->scanline_as->decStrong(c);
    }

    //ALOGI("using generated pixel-pipeline");
    // the context stores a raw pointer, so it takes its own strong reference
    // (dropped again here on the next pick, or in ggl_uninit_scanline())
    c->scanline_as = assembly.get();
    c->scanline_as->incStrong(c); //  hold on to assembly
    // the start of the assembly is called as the scanline routine
    c->scanline = (void(*)(context_t* c))assembly->base();
#else
//    ALOGW("using generic (slow) pixel-pipeline");
    c->scanline = scanline;
#endif
}
    428 
    429 void ggl_pick_scanline(context_t* c)
    430 {
    431     pick_scanline(c);
    432     if ((c->state.enables & GGL_ENABLE_W) &&
    433         (c->state.enables & GGL_ENABLE_TMUS))
    434     {
    435         c->span = c->scanline;
    436         c->scanline = scanline_perspective;
    437         if (!(c->state.enabled_tmu & (c->state.enabled_tmu - 1))) {
    438             // only one TMU enabled
    439             c->scanline = scanline_perspective_single;
    440         }
    441     }
    442 }
    443 
    444 // ----------------------------------------------------------------------------
    445 
// Helpers of the generic (non-generated) pixel pipeline. They are defined
// below only when that pipeline is compiled in.
static void blending(context_t* c, pixel_t* fragment, pixel_t* fb);
static void blend_factor(context_t* c, pixel_t* r, uint32_t factor,
        const pixel_t* src, const pixel_t* dst);
static void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv);
    450 
    451 #if ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
    452 
// no need to compile the generic-pipeline, it can't be reached
// (pick_scanline() never selects it in this configuration; this empty
// definition keeps the reference in ggl_init_scanline() resolvable)
void scanline(context_t*)
{
}
    457 
    458 #else
    459 
    460 void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv)
    461 {
    462     if (su && sv) {
    463         if (su > sv) {
    464             v = ggl_expand(v, sv, su);
    465             sv = su;
    466         } else if (su < sv) {
    467             u = ggl_expand(u, su, sv);
    468             su = sv;
    469         }
    470     }
    471 }
    472 
    473 void blending(context_t* c, pixel_t* fragment, pixel_t* fb)
    474 {
    475     rescale(fragment->c[0], fragment->s[0], fb->c[0], fb->s[0]);
    476     rescale(fragment->c[1], fragment->s[1], fb->c[1], fb->s[1]);
    477     rescale(fragment->c[2], fragment->s[2], fb->c[2], fb->s[2]);
    478     rescale(fragment->c[3], fragment->s[3], fb->c[3], fb->s[3]);
    479 
    480     pixel_t sf, df;
    481     blend_factor(c, &sf, c->state.blend.src, fragment, fb);
    482     blend_factor(c, &df, c->state.blend.dst, fragment, fb);
    483 
    484     fragment->c[1] =
    485             gglMulAddx(fragment->c[1], sf.c[1], gglMulx(fb->c[1], df.c[1]));
    486     fragment->c[2] =
    487             gglMulAddx(fragment->c[2], sf.c[2], gglMulx(fb->c[2], df.c[2]));
    488     fragment->c[3] =
    489             gglMulAddx(fragment->c[3], sf.c[3], gglMulx(fb->c[3], df.c[3]));
    490 
    491     if (c->state.blend.alpha_separate) {
    492         blend_factor(c, &sf, c->state.blend.src_alpha, fragment, fb);
    493         blend_factor(c, &df, c->state.blend.dst_alpha, fragment, fb);
    494     }
    495 
    496     fragment->c[0] =
    497             gglMulAddx(fragment->c[0], sf.c[0], gglMulx(fb->c[0], df.c[0]));
    498 
    499     // clamp to 1.0
    500     if (fragment->c[0] >= (1LU<<fragment->s[0]))
    501         fragment->c[0] = (1<<fragment->s[0])-1;
    502     if (fragment->c[1] >= (1LU<<fragment->s[1]))
    503         fragment->c[1] = (1<<fragment->s[1])-1;
    504     if (fragment->c[2] >= (1LU<<fragment->s[2]))
    505         fragment->c[2] = (1<<fragment->s[2])-1;
    506     if (fragment->c[3] >= (1LU<<fragment->s[3]))
    507         fragment->c[3] = (1<<fragment->s[3])-1;
    508 }
    509 
    510 static inline int blendfactor(uint32_t x, uint32_t size, uint32_t def = 0)
    511 {
    512     if (!size)
    513         return def;
    514 
    515     // scale to 16 bits
    516     if (size > 16) {
    517         x >>= (size - 16);
    518     } else if (size < 16) {
    519         x = ggl_expand(x, size, 16);
    520     }
    521     x += x >> 15;
    522     return x;
    523 }
    524 
    525 void blend_factor(context_t* c, pixel_t* r,
    526         uint32_t factor, const pixel_t* src, const pixel_t* dst)
    527 {
    528     switch (factor) {
    529         case GGL_ZERO:
    530             r->c[1] =
    531             r->c[2] =
    532             r->c[3] =
    533             r->c[0] = 0;
    534             break;
    535         case GGL_ONE:
    536             r->c[1] =
    537             r->c[2] =
    538             r->c[3] =
    539             r->c[0] = FIXED_ONE;
    540             break;
    541         case GGL_DST_COLOR:
    542             r->c[1] = blendfactor(dst->c[1], dst->s[1]);
    543             r->c[2] = blendfactor(dst->c[2], dst->s[2]);
    544             r->c[3] = blendfactor(dst->c[3], dst->s[3]);
    545             r->c[0] = blendfactor(dst->c[0], dst->s[0]);
    546             break;
    547         case GGL_SRC_COLOR:
    548             r->c[1] = blendfactor(src->c[1], src->s[1]);
    549             r->c[2] = blendfactor(src->c[2], src->s[2]);
    550             r->c[3] = blendfactor(src->c[3], src->s[3]);
    551             r->c[0] = blendfactor(src->c[0], src->s[0]);
    552             break;
    553         case GGL_ONE_MINUS_DST_COLOR:
    554             r->c[1] = FIXED_ONE - blendfactor(dst->c[1], dst->s[1]);
    555             r->c[2] = FIXED_ONE - blendfactor(dst->c[2], dst->s[2]);
    556             r->c[3] = FIXED_ONE - blendfactor(dst->c[3], dst->s[3]);
    557             r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0]);
    558             break;
    559         case GGL_ONE_MINUS_SRC_COLOR:
    560             r->c[1] = FIXED_ONE - blendfactor(src->c[1], src->s[1]);
    561             r->c[2] = FIXED_ONE - blendfactor(src->c[2], src->s[2]);
    562             r->c[3] = FIXED_ONE - blendfactor(src->c[3], src->s[3]);
    563             r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0]);
    564             break;
    565         case GGL_SRC_ALPHA:
    566             r->c[1] =
    567             r->c[2] =
    568             r->c[3] =
    569             r->c[0] = blendfactor(src->c[0], src->s[0], FIXED_ONE);
    570             break;
    571         case GGL_ONE_MINUS_SRC_ALPHA:
    572             r->c[1] =
    573             r->c[2] =
    574             r->c[3] =
    575             r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0], FIXED_ONE);
    576             break;
    577         case GGL_DST_ALPHA:
    578             r->c[1] =
    579             r->c[2] =
    580             r->c[3] =
    581             r->c[0] = blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
    582             break;
    583         case GGL_ONE_MINUS_DST_ALPHA:
    584             r->c[1] =
    585             r->c[2] =
    586             r->c[3] =
    587             r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
    588             break;
    589         case GGL_SRC_ALPHA_SATURATE:
    590             // XXX: GGL_SRC_ALPHA_SATURATE
    591             break;
    592     }
    593 }
    594 
    595 static GGLfixed wrapping(int32_t coord, uint32_t size, int tx_wrap)
    596 {
    597     GGLfixed d;
    598     if (tx_wrap == GGL_REPEAT) {
    599         d = (uint32_t(coord)>>16) * size;
    600     } else if (tx_wrap == GGL_CLAMP) { // CLAMP_TO_EDGE semantics
    601         const GGLfixed clamp_min = FIXED_HALF;
    602         const GGLfixed clamp_max = (size << 16) - FIXED_HALF;
    603         if (coord < clamp_min)     coord = clamp_min;
    604         if (coord > clamp_max)     coord = clamp_max;
    605         d = coord;
    606     } else { // 1:1
    607         const GGLfixed clamp_min = 0;
    608         const GGLfixed clamp_max = (size << 16);
    609         if (coord < clamp_min)     coord = clamp_min;
    610         if (coord > clamp_max)     coord = clamp_max;
    611         d = coord;
    612     }
    613     return d;
    614 }
    615 
    616 static inline
    617 GGLcolor ADJUST_COLOR_ITERATOR(GGLcolor v, GGLcolor dvdx, int len)
    618 {
    619     const int32_t end = dvdx * (len-1) + v;
    620     if (end < 0)
    621         v -= end;
    622     v &= ~(v>>31);
    623     return v;
    624 }
    625 
    626 void scanline(context_t* c)
    627 {
    628     const uint32_t enables = c->state.enables;
    629     const int xs = c->iterators.xl;
    630     const int x1 = c->iterators.xr;
    631 	int xc = x1 - xs;
    632     const int16_t* covPtr = c->state.buffers.coverage + xs;
    633 
    634     // All iterated values are sampled at the pixel center
    635 
    636     // reset iterators for that scanline...
    637     GGLcolor r, g, b, a;
    638     iterators_t& ci = c->iterators;
    639     if (enables & GGL_ENABLE_SMOOTH) {
    640         r = (xs * c->shade.drdx) + ci.ydrdy;
    641         g = (xs * c->shade.dgdx) + ci.ydgdy;
    642         b = (xs * c->shade.dbdx) + ci.ydbdy;
    643         a = (xs * c->shade.dadx) + ci.ydady;
    644         r = ADJUST_COLOR_ITERATOR(r, c->shade.drdx, xc);
    645         g = ADJUST_COLOR_ITERATOR(g, c->shade.dgdx, xc);
    646         b = ADJUST_COLOR_ITERATOR(b, c->shade.dbdx, xc);
    647         a = ADJUST_COLOR_ITERATOR(a, c->shade.dadx, xc);
    648     } else {
    649         r = ci.ydrdy;
    650         g = ci.ydgdy;
    651         b = ci.ydbdy;
    652         a = ci.ydady;
    653     }
    654 
    655     // z iterators are 1.31
    656     GGLfixed z = (xs * c->shade.dzdx) + ci.ydzdy;
    657     GGLfixed f = (xs * c->shade.dfdx) + ci.ydfdy;
    658 
    659     struct {
    660         GGLfixed s, t;
    661     } tc[GGL_TEXTURE_UNIT_COUNT];
    662     if (enables & GGL_ENABLE_TMUS) {
    663         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
    664             if (c->state.texture[i].enable) {
    665                 texture_iterators_t& ti = c->state.texture[i].iterators;
    666                 if (enables & GGL_ENABLE_W) {
    667                     tc[i].s = ti.ydsdy;
    668                     tc[i].t = ti.ydtdy;
    669                 } else {
    670                     tc[i].s = (xs * ti.dsdx) + ti.ydsdy;
    671                     tc[i].t = (xs * ti.dtdx) + ti.ydtdy;
    672                 }
    673             }
    674         }
    675     }
    676 
    677     pixel_t fragment;
    678     pixel_t texel;
    679     pixel_t fb;
    680 
    681 	uint32_t x = xs;
    682 	uint32_t y = c->iterators.y;
    683 
    684 	while (xc--) {
    685 
    686         { // just a scope
    687 
    688 		// read color (convert to 8 bits by keeping only the integer part)
    689         fragment.s[1] = fragment.s[2] =
    690         fragment.s[3] = fragment.s[0] = 8;
    691         fragment.c[1] = r >> (GGL_COLOR_BITS-8);
    692         fragment.c[2] = g >> (GGL_COLOR_BITS-8);
    693         fragment.c[3] = b >> (GGL_COLOR_BITS-8);
    694         fragment.c[0] = a >> (GGL_COLOR_BITS-8);
    695 
    696 		// texturing
    697         if (enables & GGL_ENABLE_TMUS) {
    698             for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
    699                 texture_t& tx = c->state.texture[i];
    700                 if (!tx.enable)
    701                     continue;
    702                 texture_iterators_t& ti = tx.iterators;
    703                 int32_t u, v;
    704 
    705                 // s-coordinate
    706                 if (tx.s_coord != GGL_ONE_TO_ONE) {
    707                     const int w = tx.surface.width;
    708                     u = wrapping(tc[i].s, w, tx.s_wrap);
    709                     tc[i].s += ti.dsdx;
    710                 } else {
    711                     u = (((tx.shade.is0>>16) + x)<<16) + FIXED_HALF;
    712                 }
    713 
    714                 // t-coordinate
    715                 if (tx.t_coord != GGL_ONE_TO_ONE) {
    716                     const int h = tx.surface.height;
    717                     v = wrapping(tc[i].t, h, tx.t_wrap);
    718                     tc[i].t += ti.dtdx;
    719                 } else {
    720                     v = (((tx.shade.it0>>16) + y)<<16) + FIXED_HALF;
    721                 }
    722 
    723                 // read texture
    724                 if (tx.mag_filter == GGL_NEAREST &&
    725                     tx.min_filter == GGL_NEAREST)
    726                 {
    727                     u >>= 16;
    728                     v >>= 16;
    729                     tx.surface.read(&tx.surface, c, u, v, &texel);
    730                 } else {
    731                     const int w = tx.surface.width;
    732                     const int h = tx.surface.height;
    733                     u -= FIXED_HALF;
    734                     v -= FIXED_HALF;
    735                     int u0 = u >> 16;
    736                     int v0 = v >> 16;
    737                     int u1 = u0 + 1;
    738                     int v1 = v0 + 1;
    739                     if (tx.s_wrap == GGL_REPEAT) {
    740                         if (u0<0)  u0 += w;
    741                         if (u1<0)  u1 += w;
    742                         if (u0>=w) u0 -= w;
    743                         if (u1>=w) u1 -= w;
    744                     } else {
    745                         if (u0<0)  u0 = 0;
    746                         if (u1<0)  u1 = 0;
    747                         if (u0>=w) u0 = w-1;
    748                         if (u1>=w) u1 = w-1;
    749                     }
    750                     if (tx.t_wrap == GGL_REPEAT) {
    751                         if (v0<0)  v0 += h;
    752                         if (v1<0)  v1 += h;
    753                         if (v0>=h) v0 -= h;
    754                         if (v1>=h) v1 -= h;
    755                     } else {
    756                         if (v0<0)  v0 = 0;
    757                         if (v1<0)  v1 = 0;
    758                         if (v0>=h) v0 = h-1;
    759                         if (v1>=h) v1 = h-1;
    760                     }
    761                     pixel_t texels[4];
    762                     uint32_t mm[4];
    763                     tx.surface.read(&tx.surface, c, u0, v0, &texels[0]);
    764                     tx.surface.read(&tx.surface, c, u0, v1, &texels[1]);
    765                     tx.surface.read(&tx.surface, c, u1, v0, &texels[2]);
    766                     tx.surface.read(&tx.surface, c, u1, v1, &texels[3]);
    767                     u = (u >> 12) & 0xF;
    768                     v = (v >> 12) & 0xF;
    769                     u += u>>3;
    770                     v += v>>3;
    771                     mm[0] = (0x10 - u) * (0x10 - v);
    772                     mm[1] = (0x10 - u) * v;
    773                     mm[2] = u * (0x10 - v);
    774                     mm[3] = 0x100 - (mm[0] + mm[1] + mm[2]);
    775                     for (int j=0 ; j<4 ; j++) {
    776                         texel.s[j] = texels[0].s[j];
    777                         if (!texel.s[j]) continue;
    778                         texel.s[j] += 8;
    779                         texel.c[j] =    texels[0].c[j]*mm[0] +
    780                                         texels[1].c[j]*mm[1] +
    781                                         texels[2].c[j]*mm[2] +
    782                                         texels[3].c[j]*mm[3] ;
    783                     }
    784                 }
    785 
    786                 // Texture environnement...
    787                 for (int j=0 ; j<4 ; j++) {
    788                     uint32_t& Cf = fragment.c[j];
    789                     uint32_t& Ct = texel.c[j];
    790                     uint8_t& sf  = fragment.s[j];
    791                     uint8_t& st  = texel.s[j];
    792                     uint32_t At = texel.c[0];
    793                     uint8_t sat = texel.s[0];
    794                     switch (tx.env) {
    795                     case GGL_REPLACE:
    796                         if (st) {
    797                             Cf = Ct;
    798                             sf = st;
    799                         }
    800                         break;
    801                     case GGL_MODULATE:
    802                         if (st) {
    803                             uint32_t factor = Ct + (Ct>>(st-1));
    804                             Cf = (Cf * factor) >> st;
    805                         }
    806                         break;
    807                     case GGL_DECAL:
    808                         if (sat) {
    809                             rescale(Cf, sf, Ct, st);
    810                             Cf += ((Ct - Cf) * (At + (At>>(sat-1)))) >> sat;
    811                         }
    812                         break;
    813                     case GGL_BLEND:
    814                         if (st) {
    815                             uint32_t Cc = tx.env_color[i];
    816                             if (sf>8)       Cc = (Cc * ((1<<sf)-1))>>8;
    817                             else if (sf<8)  Cc = (Cc - (Cc>>(8-sf)))>>(8-sf);
    818                             uint32_t factor = Ct + (Ct>>(st-1));
    819                             Cf = ((((1<<st) - factor) * Cf) + Ct*Cc)>>st;
    820                         }
    821                         break;
    822                     case GGL_ADD:
    823                         if (st) {
    824                             rescale(Cf, sf, Ct, st);
    825                             Cf += Ct;
    826                         }
    827                         break;
    828                     }
    829                 }
    830             }
    831 		}
    832 
    833         // coverage application
    834         if (enables & GGL_ENABLE_AA) {
    835             int16_t cf = *covPtr++;
    836             fragment.c[0] = (int64_t(fragment.c[0]) * cf) >> 15;
    837         }
    838 
    839         // alpha-test
    840         if (enables & GGL_ENABLE_ALPHA_TEST) {
    841             GGLcolor ref = c->state.alpha_test.ref;
    842             GGLcolor alpha = (uint64_t(fragment.c[0]) *
    843                     ((1<<GGL_COLOR_BITS)-1)) / ((1<<fragment.s[0])-1);
    844             switch (c->state.alpha_test.func) {
    845             case GGL_NEVER:     goto discard;
    846             case GGL_LESS:      if (alpha<ref)  break; goto discard;
    847             case GGL_EQUAL:     if (alpha==ref) break; goto discard;
    848             case GGL_LEQUAL:    if (alpha<=ref) break; goto discard;
    849             case GGL_GREATER:   if (alpha>ref)  break; goto discard;
    850             case GGL_NOTEQUAL:  if (alpha!=ref) break; goto discard;
    851             case GGL_GEQUAL:    if (alpha>=ref) break; goto discard;
    852             }
    853         }
    854 
    855         // depth test
    856         if (c->state.buffers.depth.format) {
    857             if (enables & GGL_ENABLE_DEPTH_TEST) {
    858                 surface_t* cb = &(c->state.buffers.depth);
    859                 uint16_t* p = (uint16_t*)(cb->data)+(x+(cb->stride*y));
    860                 uint16_t zz = uint32_t(z)>>(16);
    861                 uint16_t depth = *p;
    862                 switch (c->state.depth_test.func) {
    863                 case GGL_NEVER:     goto discard;
    864                 case GGL_LESS:      if (zz<depth)    break; goto discard;
    865                 case GGL_EQUAL:     if (zz==depth)   break; goto discard;
    866                 case GGL_LEQUAL:    if (zz<=depth)   break; goto discard;
    867                 case GGL_GREATER:   if (zz>depth)    break; goto discard;
    868                 case GGL_NOTEQUAL:  if (zz!=depth)   break; goto discard;
    869                 case GGL_GEQUAL:    if (zz>=depth)   break; goto discard;
    870                 }
    871                 // depth buffer is not enabled, if depth-test is not enabled
    872 /*
    873         fragment.s[1] = fragment.s[2] =
    874         fragment.s[3] = fragment.s[0] = 8;
    875         fragment.c[1] =
    876         fragment.c[2] =
    877         fragment.c[3] =
    878         fragment.c[0] = 255 - (zz>>8);
    879 */
    880                 if (c->state.mask.depth) {
    881                     *p = zz;
    882                 }
    883             }
    884         }
    885 
    886         // fog
    887         if (enables & GGL_ENABLE_FOG) {
    888             for (int i=1 ; i<=3 ; i++) {
    889                 GGLfixed fc = (c->state.fog.color[i] * 0x10000) / 0xFF;
    890                 uint32_t& c = fragment.c[i];
    891                 uint8_t& s  = fragment.s[i];
    892                 c = (c * 0x10000) / ((1<<s)-1);
    893                 c = gglMulAddx(c, f, gglMulx(fc, 0x10000 - f));
    894                 s = 16;
    895             }
    896         }
    897 
    898         // blending
    899         if (enables & GGL_ENABLE_BLENDING) {
    900             fb.c[1] = fb.c[2] = fb.c[3] = fb.c[0] = 0; // placate valgrind
    901             fb.s[1] = fb.s[2] = fb.s[3] = fb.s[0] = 0;
    902             c->state.buffers.color.read(
    903                     &(c->state.buffers.color), c, x, y, &fb);
    904             blending( c, &fragment, &fb );
    905         }
    906 
    907 		// write
    908         c->state.buffers.color.write(
    909                 &(c->state.buffers.color), c, x, y, &fragment);
    910         }
    911 
    912 discard:
    913 		// iterate...
    914         x += 1;
    915         if (enables & GGL_ENABLE_SMOOTH) {
    916             r += c->shade.drdx;
    917             g += c->shade.dgdx;
    918             b += c->shade.dbdx;
    919             a += c->shade.dadx;
    920         }
    921         z += c->shade.dzdx;
    922         f += c->shade.dfdx;
    923 	}
    924 }
    925 
    926 #endif // ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
    927 
    928 // ----------------------------------------------------------------------------
    929 #if 0
    930 #pragma mark -
    931 #pragma mark Scanline
    932 #endif
    933 
    934 /* Used to parse a 32-bit source texture linearly. Usage is:
    935  *
    936  * horz_iterator32  hi(context);
    937  * while (...) {
    938  *    uint32_t  src_pixel = hi.get_pixel32();
    939  *    ...
    940  * }
    941  *
    942  * Use only for one-to-one texture mapping.
    943  */
    944 struct horz_iterator32 {
    945     horz_iterator32(context_t* c) {
    946         const int x = c->iterators.xl;
    947         const int y = c->iterators.y;
    948         texture_t& tx = c->state.texture[0];
    949         const int32_t u = (tx.shade.is0>>16) + x;
    950         const int32_t v = (tx.shade.it0>>16) + y;
    951         m_src = reinterpret_cast<uint32_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
    952     }
    953     uint32_t  get_pixel32() {
    954         return *m_src++;
    955     }
    956 protected:
    957     uint32_t* m_src;
    958 };
    959 
    960 /* A variant for 16-bit source textures. */
    961 struct horz_iterator16 {
    962     horz_iterator16(context_t* c) {
    963         const int x = c->iterators.xl;
    964         const int y = c->iterators.y;
    965         texture_t& tx = c->state.texture[0];
    966         const int32_t u = (tx.shade.is0>>16) + x;
    967         const int32_t v = (tx.shade.it0>>16) + y;
    968         m_src = reinterpret_cast<uint16_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
    969     }
    970     uint16_t  get_pixel16() {
    971         return *m_src++;
    972     }
    973 protected:
    974     uint16_t* m_src;
    975 };
    976 
    977 /* A clamp iterator is used to iterate inside a texture with GGL_CLAMP.
    978  * After initialization, call get_src16() or get_src32() to get the current
    979  * texture pixel value.
    980  */
    981 struct clamp_iterator {
    982     clamp_iterator(context_t* c) {
    983         const int xs = c->iterators.xl;
    984         texture_t& tx = c->state.texture[0];
    985         texture_iterators_t& ti = tx.iterators;
    986         m_s = (xs * ti.dsdx) + ti.ydsdy;
    987         m_t = (xs * ti.dtdx) + ti.ydtdy;
    988         m_ds = ti.dsdx;
    989         m_dt = ti.dtdx;
    990         m_width_m1 = tx.surface.width - 1;
    991         m_height_m1 = tx.surface.height - 1;
    992         m_data = tx.surface.data;
    993         m_stride = tx.surface.stride;
    994     }
    995     uint16_t get_pixel16() {
    996         int  u, v;
    997         get_uv(u, v);
    998         uint16_t* src = reinterpret_cast<uint16_t*>(m_data) + (u + (m_stride*v));
    999         return src[0];
   1000     }
   1001     uint32_t get_pixel32() {
   1002         int  u, v;
   1003         get_uv(u, v);
   1004         uint32_t* src = reinterpret_cast<uint32_t*>(m_data) + (u + (m_stride*v));
   1005         return src[0];
   1006     }
   1007 private:
   1008     void   get_uv(int& u, int& v) {
   1009         int  uu = m_s >> 16;
   1010         int  vv = m_t >> 16;
   1011         if (uu < 0)
   1012             uu = 0;
   1013         if (uu > m_width_m1)
   1014             uu = m_width_m1;
   1015         if (vv < 0)
   1016             vv = 0;
   1017         if (vv > m_height_m1)
   1018             vv = m_height_m1;
   1019         u = uu;
   1020         v = vv;
   1021         m_s += m_ds;
   1022         m_t += m_dt;
   1023     }
   1024 
   1025     GGLfixed  m_s, m_t;
   1026     GGLfixed  m_ds, m_dt;
   1027     int       m_width_m1, m_height_m1;
   1028     uint8_t*  m_data;
   1029     int       m_stride;
   1030 };
   1031 
   1032 /*
   1033  * The 'horizontal clamp iterator' variant corresponds to the case where
   1034  * the 'v' coordinate doesn't change. This is useful to avoid one mult and
   1035  * extra adds / checks per pixels, if the blending/processing operation after
   1036  * this is very fast.
   1037  */
   1038 static int is_context_horizontal(const context_t* c) {
   1039     return (c->state.texture[0].iterators.dtdx == 0);
   1040 }
   1041 
   1042 struct horz_clamp_iterator {
   1043     uint16_t  get_pixel16() {
   1044         int  u = m_s >> 16;
   1045         m_s += m_ds;
   1046         if (u < 0)
   1047             u = 0;
   1048         if (u > m_width_m1)
   1049             u = m_width_m1;
   1050         const uint16_t* src = reinterpret_cast<const uint16_t*>(m_data);
   1051         return src[u];
   1052     }
   1053     uint32_t  get_pixel32() {
   1054         int  u = m_s >> 16;
   1055         m_s += m_ds;
   1056         if (u < 0)
   1057             u = 0;
   1058         if (u > m_width_m1)
   1059             u = m_width_m1;
   1060         const uint32_t* src = reinterpret_cast<const uint32_t*>(m_data);
   1061         return src[u];
   1062     }
   1063 protected:
   1064     void init(const context_t* c, int shift);
   1065     GGLfixed       m_s;
   1066     GGLfixed       m_ds;
   1067     int            m_width_m1;
   1068     const uint8_t* m_data;
   1069 };
   1070 
   1071 void horz_clamp_iterator::init(const context_t* c, int shift)
   1072 {
   1073     const int xs = c->iterators.xl;
   1074     const texture_t& tx = c->state.texture[0];
   1075     const texture_iterators_t& ti = tx.iterators;
   1076     m_s = (xs * ti.dsdx) + ti.ydsdy;
   1077     m_ds = ti.dsdx;
   1078     m_width_m1 = tx.surface.width-1;
   1079     m_data = tx.surface.data;
   1080 
   1081     GGLfixed t = (xs * ti.dtdx) + ti.ydtdy;
   1082     int      v = t >> 16;
   1083     if (v < 0)
   1084         v = 0;
   1085     else if (v >= (int)tx.surface.height)
   1086         v = (int)tx.surface.height-1;
   1087 
   1088     m_data += (tx.surface.stride*v) << shift;
   1089 }
   1090 
   1091 struct horz_clamp_iterator16 : horz_clamp_iterator {
   1092     horz_clamp_iterator16(const context_t* c) {
   1093         init(c,1);
   1094     };
   1095 };
   1096 
   1097 struct horz_clamp_iterator32 : horz_clamp_iterator {
   1098     horz_clamp_iterator32(context_t* c) {
   1099         init(c,2);
   1100     };
   1101 };
   1102 
   1103 /* This is used to perform dithering operations.
   1104  */
   1105 struct ditherer {
   1106     ditherer(const context_t* c) {
   1107         const int x = c->iterators.xl;
   1108         const int y = c->iterators.y;
   1109         m_line = &c->ditherMatrix[ ((y & GGL_DITHER_MASK)<<GGL_DITHER_ORDER_SHIFT) ];
   1110         m_index = x & GGL_DITHER_MASK;
   1111     }
   1112     void step(void) {
   1113         m_index++;
   1114     }
   1115     int  get_value(void) {
   1116         int ret = m_line[m_index & GGL_DITHER_MASK];
   1117         m_index++;
   1118         return ret;
   1119     }
   1120     uint16_t abgr8888ToRgb565(uint32_t s) {
   1121         uint32_t r = s & 0xff;
   1122         uint32_t g = (s >> 8) & 0xff;
   1123         uint32_t b = (s >> 16) & 0xff;
   1124         return rgb888ToRgb565(r,g,b);
   1125     }
   1126     /* The following assumes that r/g/b are in the 0..255 range each */
   1127     uint16_t rgb888ToRgb565(uint32_t& r, uint32_t& g, uint32_t &b) {
   1128         int threshold = get_value();
   1129         /* dither in on GGL_DITHER_BITS, and each of r, g, b is on 8 bits */
   1130         r += (threshold >> (GGL_DITHER_BITS-8 +5));
   1131         g += (threshold >> (GGL_DITHER_BITS-8 +6));
   1132         b += (threshold >> (GGL_DITHER_BITS-8 +5));
   1133         if (r > 0xff)
   1134             r = 0xff;
   1135         if (g > 0xff)
   1136             g = 0xff;
   1137         if (b > 0xff)
   1138             b = 0xff;
   1139         return uint16_t(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
   1140     }
   1141 protected:
   1142     const uint8_t* m_line;
   1143     int            m_index;
   1144 };
   1145 
   1146 /* This structure is used to blend (SRC_OVER) 32-bit source pixels
   1147  * onto 16-bit destination ones. Usage is simply:
   1148  *
   1149  *   blender.blend(<32-bit-src-pixel-value>,<ptr-to-16-bit-dest-pixel>)
   1150  */
   1151 struct blender_32to16 {
   1152     blender_32to16(context_t* c) { }
   1153     void write(uint32_t s, uint16_t* dst) {
   1154         if (s == 0)
   1155             return;
   1156         s = GGL_RGBA_TO_HOST(s);
   1157         int sA = (s>>24);
   1158         if (sA == 0xff) {
   1159             *dst = convertAbgr8888ToRgb565(s);
   1160         } else {
   1161             int f = 0x100 - (sA + (sA>>7));
   1162             int sR = (s >> (   3))&0x1F;
   1163             int sG = (s >> ( 8+2))&0x3F;
   1164             int sB = (s >> (16+3))&0x1F;
   1165             uint16_t d = *dst;
   1166             int dR = (d>>11)&0x1f;
   1167             int dG = (d>>5)&0x3f;
   1168             int dB = (d)&0x1f;
   1169             sR += (f*dR)>>8;
   1170             sG += (f*dG)>>8;
   1171             sB += (f*dB)>>8;
   1172             *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1173         }
   1174     }
   1175     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1176         if (s == 0) {
   1177             di.step();
   1178             return;
   1179         }
   1180         s = GGL_RGBA_TO_HOST(s);
   1181         int sA = (s>>24);
   1182         if (sA == 0xff) {
   1183             *dst = di.abgr8888ToRgb565(s);
   1184         } else {
   1185             int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1186             int f = 0x100 - (sA + (sA>>7));
   1187             int sR = (s >> (   3))&0x1F;
   1188             int sG = (s >> ( 8+2))&0x3F;
   1189             int sB = (s >> (16+3))&0x1F;
   1190             uint16_t d = *dst;
   1191             int dR = (d>>11)&0x1f;
   1192             int dG = (d>>5)&0x3f;
   1193             int dB = (d)&0x1f;
   1194             sR = ((sR << 8) + f*dR + threshold)>>8;
   1195             sG = ((sG << 8) + f*dG + threshold)>>8;
   1196             sB = ((sB << 8) + f*dB + threshold)>>8;
   1197             if (sR > 0x1f) sR = 0x1f;
   1198             if (sG > 0x3f) sG = 0x3f;
   1199             if (sB > 0x1f) sB = 0x1f;
   1200             *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1201         }
   1202     }
   1203 };
   1204 
   1205 /* This blender does the same for the 'blend_srca' operation.
   1206  * where dstFactor=srcA*(1-srcA) srcFactor=srcA
   1207  */
   1208 struct blender_32to16_srcA {
   1209     blender_32to16_srcA(const context_t* c) { }
   1210     void write(uint32_t s, uint16_t* dst) {
   1211         if (!s) {
   1212             return;
   1213         }
   1214         uint16_t d = *dst;
   1215         s = GGL_RGBA_TO_HOST(s);
   1216         int sR = (s >> (   3))&0x1F;
   1217         int sG = (s >> ( 8+2))&0x3F;
   1218         int sB = (s >> (16+3))&0x1F;
   1219         int sA = (s>>24);
   1220         int f1 = (sA + (sA>>7));
   1221         int f2 = 0x100-f1;
   1222         int dR = (d>>11)&0x1f;
   1223         int dG = (d>>5)&0x3f;
   1224         int dB = (d)&0x1f;
   1225         sR = (f1*sR + f2*dR)>>8;
   1226         sG = (f1*sG + f2*dG)>>8;
   1227         sB = (f1*sB + f2*dB)>>8;
   1228         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1229     }
   1230 };
   1231 
   1232 /* Common init code the modulating blenders */
   1233 struct blender_modulate {
   1234     void init(const context_t* c) {
   1235         const int r = c->iterators.ydrdy >> (GGL_COLOR_BITS-8);
   1236         const int g = c->iterators.ydgdy >> (GGL_COLOR_BITS-8);
   1237         const int b = c->iterators.ydbdy >> (GGL_COLOR_BITS-8);
   1238         const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
   1239         m_r = r + (r >> 7);
   1240         m_g = g + (g >> 7);
   1241         m_b = b + (b >> 7);
   1242         m_a = a + (a >> 7);
   1243     }
   1244 protected:
   1245     int m_r, m_g, m_b, m_a;
   1246 };
   1247 
   1248 /* This blender does a normal blend after modulation.
   1249  */
   1250 struct blender_32to16_modulate : blender_modulate {
   1251     blender_32to16_modulate(const context_t* c) {
   1252         init(c);
   1253     }
   1254     void write(uint32_t s, uint16_t* dst) {
   1255         // blend source and destination
   1256         if (!s) {
   1257             return;
   1258         }
   1259         s = GGL_RGBA_TO_HOST(s);
   1260 
   1261         /* We need to modulate s */
   1262         uint32_t  sA = (s >> 24);
   1263         uint32_t  sB = (s >> 16) & 0xff;
   1264         uint32_t  sG = (s >> 8) & 0xff;
   1265         uint32_t  sR = s & 0xff;
   1266 
   1267         sA = (sA*m_a) >> 8;
   1268         /* Keep R/G/B scaled to 5.8 or 6.8 fixed float format */
   1269         sR = (sR*m_r) >> (8 - 5);
   1270         sG = (sG*m_g) >> (8 - 6);
   1271         sB = (sB*m_b) >> (8 - 5);
   1272 
   1273         /* Now do a normal blend */
   1274         int f = 0x100 - (sA + (sA>>7));
   1275         uint16_t d = *dst;
   1276         int dR = (d>>11)&0x1f;
   1277         int dG = (d>>5)&0x3f;
   1278         int dB = (d)&0x1f;
   1279         sR = (sR + f*dR)>>8;
   1280         sG = (sG + f*dG)>>8;
   1281         sB = (sB + f*dB)>>8;
   1282         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1283     }
   1284     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1285         // blend source and destination
   1286         if (!s) {
   1287             di.step();
   1288             return;
   1289         }
   1290         s = GGL_RGBA_TO_HOST(s);
   1291 
   1292         /* We need to modulate s */
   1293         uint32_t  sA = (s >> 24);
   1294         uint32_t  sB = (s >> 16) & 0xff;
   1295         uint32_t  sG = (s >> 8) & 0xff;
   1296         uint32_t  sR = s & 0xff;
   1297 
   1298         sA = (sA*m_a) >> 8;
   1299         /* keep R/G/B scaled to 5.8 or 6.8 fixed float format */
   1300         sR = (sR*m_r) >> (8 - 5);
   1301         sG = (sG*m_g) >> (8 - 6);
   1302         sB = (sB*m_b) >> (8 - 5);
   1303 
   1304         /* Scale threshold to 0.8 fixed float format */
   1305         int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1306         int f = 0x100 - (sA + (sA>>7));
   1307         uint16_t d = *dst;
   1308         int dR = (d>>11)&0x1f;
   1309         int dG = (d>>5)&0x3f;
   1310         int dB = (d)&0x1f;
   1311         sR = (sR + f*dR + threshold)>>8;
   1312         sG = (sG + f*dG + threshold)>>8;
   1313         sB = (sB + f*dB + threshold)>>8;
   1314         if (sR > 0x1f) sR = 0x1f;
   1315         if (sG > 0x3f) sG = 0x3f;
   1316         if (sB > 0x1f) sB = 0x1f;
   1317         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1318     }
   1319 };
   1320 
   1321 /* same as 32to16_modulate, except that the input is xRGB, instead of ARGB */
   1322 struct blender_x32to16_modulate : blender_modulate {
   1323     blender_x32to16_modulate(const context_t* c) {
   1324         init(c);
   1325     }
   1326     void write(uint32_t s, uint16_t* dst) {
   1327         s = GGL_RGBA_TO_HOST(s);
   1328 
   1329         uint32_t  sB = (s >> 16) & 0xff;
   1330         uint32_t  sG = (s >> 8) & 0xff;
   1331         uint32_t  sR = s & 0xff;
   1332 
   1333         /* Keep R/G/B in 5.8 or 6.8 format */
   1334         sR = (sR*m_r) >> (8 - 5);
   1335         sG = (sG*m_g) >> (8 - 6);
   1336         sB = (sB*m_b) >> (8 - 5);
   1337 
   1338         int f = 0x100 - m_a;
   1339         uint16_t d = *dst;
   1340         int dR = (d>>11)&0x1f;
   1341         int dG = (d>>5)&0x3f;
   1342         int dB = (d)&0x1f;
   1343         sR = (sR + f*dR)>>8;
   1344         sG = (sG + f*dG)>>8;
   1345         sB = (sB + f*dB)>>8;
   1346         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1347     }
   1348     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1349         s = GGL_RGBA_TO_HOST(s);
   1350 
   1351         uint32_t  sB = (s >> 16) & 0xff;
   1352         uint32_t  sG = (s >> 8) & 0xff;
   1353         uint32_t  sR = s & 0xff;
   1354 
   1355         sR = (sR*m_r) >> (8 - 5);
   1356         sG = (sG*m_g) >> (8 - 6);
   1357         sB = (sB*m_b) >> (8 - 5);
   1358 
   1359         /* Now do a normal blend */
   1360         int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1361         int f = 0x100 - m_a;
   1362         uint16_t d = *dst;
   1363         int dR = (d>>11)&0x1f;
   1364         int dG = (d>>5)&0x3f;
   1365         int dB = (d)&0x1f;
   1366         sR = (sR + f*dR + threshold)>>8;
   1367         sG = (sG + f*dG + threshold)>>8;
   1368         sB = (sB + f*dB + threshold)>>8;
   1369         if (sR > 0x1f) sR = 0x1f;
   1370         if (sG > 0x3f) sG = 0x3f;
   1371         if (sB > 0x1f) sB = 0x1f;
   1372         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1373     }
   1374 };
   1375 
   1376 /* Same as above, but source is 16bit rgb565 */
   1377 struct blender_16to16_modulate : blender_modulate {
   1378     blender_16to16_modulate(const context_t* c) {
   1379         init(c);
   1380     }
   1381     void write(uint16_t s16, uint16_t* dst) {
   1382         uint32_t  s = s16;
   1383 
   1384         uint32_t  sR = s >> 11;
   1385         uint32_t  sG = (s >> 5) & 0x3f;
   1386         uint32_t  sB = s & 0x1f;
   1387 
   1388         sR = (sR*m_r);
   1389         sG = (sG*m_g);
   1390         sB = (sB*m_b);
   1391 
   1392         int f = 0x100 - m_a;
   1393         uint16_t d = *dst;
   1394         int dR = (d>>11)&0x1f;
   1395         int dG = (d>>5)&0x3f;
   1396         int dB = (d)&0x1f;
   1397         sR = (sR + f*dR)>>8;
   1398         sG = (sG + f*dG)>>8;
   1399         sB = (sB + f*dB)>>8;
   1400         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1401     }
   1402 };
   1403 
   1404 /* This is used to iterate over a 16-bit destination color buffer.
   1405  * Usage is:
   1406  *
   1407  *   dst_iterator16  di(context);
   1408  *   while (di.count--) {
   1409  *       <do stuff with dest pixel at di.dst>
   1410  *       di.dst++;
   1411  *   }
   1412  */
   1413 struct dst_iterator16 {
   1414     dst_iterator16(const context_t* c) {
   1415         const int x = c->iterators.xl;
   1416         const int width = c->iterators.xr - x;
   1417         const int32_t y = c->iterators.y;
   1418         const surface_t* cb = &(c->state.buffers.color);
   1419         count = width;
   1420         dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   1421     }
   1422     int        count;
   1423     uint16_t*  dst;
   1424 };
   1425 
   1426 
   1427 static void scanline_t32cb16_clamp(context_t* c)
   1428 {
   1429     dst_iterator16  di(c);
   1430 
   1431     if (is_context_horizontal(c)) {
   1432         /* Special case for simple horizontal scaling */
   1433         horz_clamp_iterator32 ci(c);
   1434         while (di.count--) {
   1435             uint32_t s = ci.get_pixel32();
   1436             *di.dst++ = convertAbgr8888ToRgb565(s);
   1437         }
   1438     } else {
   1439         /* General case */
   1440         clamp_iterator ci(c);
   1441         while (di.count--) {
   1442             uint32_t s = ci.get_pixel32();
   1443             *di.dst++ = convertAbgr8888ToRgb565(s);
   1444         }
   1445     }
   1446 }
   1447 
   1448 static void scanline_t32cb16_dither(context_t* c)
   1449 {
   1450     horz_iterator32 si(c);
   1451     dst_iterator16  di(c);
   1452     ditherer        dither(c);
   1453 
   1454     while (di.count--) {
   1455         uint32_t s = si.get_pixel32();
   1456         *di.dst++ = dither.abgr8888ToRgb565(s);
   1457     }
   1458 }
   1459 
   1460 static void scanline_t32cb16_clamp_dither(context_t* c)
   1461 {
   1462     dst_iterator16  di(c);
   1463     ditherer        dither(c);
   1464 
   1465     if (is_context_horizontal(c)) {
   1466         /* Special case for simple horizontal scaling */
   1467         horz_clamp_iterator32 ci(c);
   1468         while (di.count--) {
   1469             uint32_t s = ci.get_pixel32();
   1470             *di.dst++ = dither.abgr8888ToRgb565(s);
   1471         }
   1472     } else {
   1473         /* General case */
   1474         clamp_iterator ci(c);
   1475         while (di.count--) {
   1476             uint32_t s = ci.get_pixel32();
   1477             *di.dst++ = dither.abgr8888ToRgb565(s);
   1478         }
   1479     }
   1480 }
   1481 
   1482 static void scanline_t32cb16blend_dither(context_t* c)
   1483 {
   1484     dst_iterator16 di(c);
   1485     ditherer       dither(c);
   1486     blender_32to16 bl(c);
   1487     horz_iterator32  hi(c);
   1488     while (di.count--) {
   1489         uint32_t s = hi.get_pixel32();
   1490         bl.write(s, di.dst, dither);
   1491         di.dst++;
   1492     }
   1493 }
   1494 
   1495 static void scanline_t32cb16blend_clamp(context_t* c)
   1496 {
   1497     dst_iterator16  di(c);
   1498     blender_32to16  bl(c);
   1499 
   1500     if (is_context_horizontal(c)) {
   1501         horz_clamp_iterator32 ci(c);
   1502         while (di.count--) {
   1503             uint32_t s = ci.get_pixel32();
   1504             bl.write(s, di.dst);
   1505             di.dst++;
   1506         }
   1507     } else {
   1508         clamp_iterator ci(c);
   1509         while (di.count--) {
   1510             uint32_t s = ci.get_pixel32();
   1511             bl.write(s, di.dst);
   1512             di.dst++;
   1513         }
   1514     }
   1515 }
   1516 
   1517 static void scanline_t32cb16blend_clamp_dither(context_t* c)
   1518 {
   1519     dst_iterator16 di(c);
   1520     ditherer       dither(c);
   1521     blender_32to16 bl(c);
   1522 
   1523     clamp_iterator ci(c);
   1524     while (di.count--) {
   1525         uint32_t s = ci.get_pixel32();
   1526         bl.write(s, di.dst, dither);
   1527         di.dst++;
   1528     }
   1529 }
   1530 
   1531 void scanline_t32cb16blend_clamp_mod(context_t* c)
   1532 {
   1533     dst_iterator16 di(c);
   1534     blender_32to16_modulate bl(c);
   1535 
   1536     clamp_iterator ci(c);
   1537     while (di.count--) {
   1538         uint32_t s = ci.get_pixel32();
   1539         bl.write(s, di.dst);
   1540         di.dst++;
   1541     }
   1542 }
   1543 
   1544 void scanline_t32cb16blend_clamp_mod_dither(context_t* c)
   1545 {
   1546     dst_iterator16 di(c);
   1547     blender_32to16_modulate bl(c);
   1548     ditherer dither(c);
   1549 
   1550     clamp_iterator ci(c);
   1551     while (di.count--) {
   1552         uint32_t s = ci.get_pixel32();
   1553         bl.write(s, di.dst, dither);
   1554         di.dst++;
   1555     }
   1556 }
   1557 
   1558 /* Variant of scanline_t32cb16blend_clamp_mod with a xRGB texture */
   1559 void scanline_x32cb16blend_clamp_mod(context_t* c)
   1560 {
   1561     dst_iterator16 di(c);
   1562     blender_x32to16_modulate  bl(c);
   1563 
   1564     clamp_iterator ci(c);
   1565     while (di.count--) {
   1566         uint32_t s = ci.get_pixel32();
   1567         bl.write(s, di.dst);
   1568         di.dst++;
   1569     }
   1570 }
   1571 
   1572 void scanline_x32cb16blend_clamp_mod_dither(context_t* c)
   1573 {
   1574     dst_iterator16 di(c);
   1575     blender_x32to16_modulate  bl(c);
   1576     ditherer dither(c);
   1577 
   1578     clamp_iterator ci(c);
   1579     while (di.count--) {
   1580         uint32_t s = ci.get_pixel32();
   1581         bl.write(s, di.dst, dither);
   1582         di.dst++;
   1583     }
   1584 }
   1585 
   1586 void scanline_t16cb16_clamp(context_t* c)
   1587 {
   1588     dst_iterator16  di(c);
   1589 
   1590     /* Special case for simple horizontal scaling */
   1591     if (is_context_horizontal(c)) {
   1592         horz_clamp_iterator16 ci(c);
   1593         while (di.count--) {
   1594             *di.dst++ = ci.get_pixel16();
   1595         }
   1596     } else {
   1597         clamp_iterator ci(c);
   1598         while (di.count--) {
   1599             *di.dst++ = ci.get_pixel16();
   1600         }
   1601     }
   1602 }
   1603 
   1604 
   1605 
   /*
    * Evaluates an iterated quantity at the center of the first pixel of
    * row y:
    *     v = v0 + (y + 0.5) * dvdy + 0.5 * dvdx
    * The two half-steps are folded into a single (dvdx + dvdy) >> 1 term.
    */
   template <typename T, typename U>
   static inline __attribute__((const))
   T interpolate(int y, T v0, U dvdx, U dvdy) {
       return (v0 + ((dvdx + dvdy) >> 1)) + (dvdy * y);
   }
   1613 
   1614 // ----------------------------------------------------------------------------
   1615 #if 0
   1616 #pragma mark -
   1617 #endif
   1618 
   1619 void init_y(context_t* c, int32_t ys)
   1620 {
   1621     const uint32_t enables = c->state.enables;
   1622 
   1623     // compute iterators...
   1624     iterators_t& ci = c->iterators;
   1625 
   1626     // sample in the center
   1627     ci.y = ys;
   1628 
   1629     if (enables & (GGL_ENABLE_DEPTH_TEST|GGL_ENABLE_W|GGL_ENABLE_FOG)) {
   1630         ci.ydzdy = interpolate(ys, c->shade.z0, c->shade.dzdx, c->shade.dzdy);
   1631         ci.ydwdy = interpolate(ys, c->shade.w0, c->shade.dwdx, c->shade.dwdy);
   1632         ci.ydfdy = interpolate(ys, c->shade.f0, c->shade.dfdx, c->shade.dfdy);
   1633     }
   1634 
   1635     if (ggl_unlikely(enables & GGL_ENABLE_SMOOTH)) {
   1636         ci.ydrdy = interpolate(ys, c->shade.r0, c->shade.drdx, c->shade.drdy);
   1637         ci.ydgdy = interpolate(ys, c->shade.g0, c->shade.dgdx, c->shade.dgdy);
   1638         ci.ydbdy = interpolate(ys, c->shade.b0, c->shade.dbdx, c->shade.dbdy);
   1639         ci.ydady = interpolate(ys, c->shade.a0, c->shade.dadx, c->shade.dady);
   1640         c->step_y = step_y__smooth;
   1641     } else {
   1642         ci.ydrdy = c->shade.r0;
   1643         ci.ydgdy = c->shade.g0;
   1644         ci.ydbdy = c->shade.b0;
   1645         ci.ydady = c->shade.a0;
   1646         // XXX: do only if needed, or make sure this is fast
   1647         c->packed = ggl_pack_color(c, c->state.buffers.color.format,
   1648                 ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
   1649         c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
   1650                 ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
   1651     }
   1652 
   1653     // initialize the variables we need in the shader
   1654     generated_vars_t& gen = c->generated_vars;
   1655     gen.argb[GGLFormat::ALPHA].c  = ci.ydady;
   1656     gen.argb[GGLFormat::ALPHA].dx = c->shade.dadx;
   1657     gen.argb[GGLFormat::RED  ].c  = ci.ydrdy;
   1658     gen.argb[GGLFormat::RED  ].dx = c->shade.drdx;
   1659     gen.argb[GGLFormat::GREEN].c  = ci.ydgdy;
   1660     gen.argb[GGLFormat::GREEN].dx = c->shade.dgdx;
   1661     gen.argb[GGLFormat::BLUE ].c  = ci.ydbdy;
   1662     gen.argb[GGLFormat::BLUE ].dx = c->shade.dbdx;
   1663     gen.dzdx = c->shade.dzdx;
   1664     gen.f    = ci.ydfdy;
   1665     gen.dfdx = c->shade.dfdx;
   1666 
   1667     if (enables & GGL_ENABLE_TMUS) {
   1668         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1669             texture_t& t = c->state.texture[i];
   1670             if (!t.enable) continue;
   1671 
   1672             texture_iterators_t& ti = t.iterators;
   1673             if (t.s_coord == GGL_ONE_TO_ONE && t.t_coord == GGL_ONE_TO_ONE) {
   1674                 // we need to set all of these to 0 because in some cases
   1675                 // step_y__generic() or step_y__tmu() will be used and
   1676                 // therefore will update dtdy, however, in 1:1 mode
   1677                 // this is always done by the scanline rasterizer.
   1678                 ti.dsdx = ti.dsdy = ti.dtdx = ti.dtdy = 0;
   1679                 ti.ydsdy = t.shade.is0;
   1680                 ti.ydtdy = t.shade.it0;
   1681             } else {
   1682                 const int adjustSWrap = ((t.s_wrap==GGL_CLAMP)?0:16);
   1683                 const int adjustTWrap = ((t.t_wrap==GGL_CLAMP)?0:16);
   1684                 ti.sscale = t.shade.sscale + adjustSWrap;
   1685                 ti.tscale = t.shade.tscale + adjustTWrap;
   1686                 if (!(enables & GGL_ENABLE_W)) {
   1687                     // S coordinate
   1688                     const int32_t sscale = ti.sscale;
   1689                     const int32_t sy = interpolate(ys,
   1690                             t.shade.is0, t.shade.idsdx, t.shade.idsdy);
   1691                     if (sscale>=0) {
   1692                         ti.ydsdy= sy            << sscale;
   1693                         ti.dsdx = t.shade.idsdx << sscale;
   1694                         ti.dsdy = t.shade.idsdy << sscale;
   1695                     } else {
   1696                         ti.ydsdy= sy            >> -sscale;
   1697                         ti.dsdx = t.shade.idsdx >> -sscale;
   1698                         ti.dsdy = t.shade.idsdy >> -sscale;
   1699                     }
   1700                     // T coordinate
   1701                     const int32_t tscale = ti.tscale;
   1702                     const int32_t ty = interpolate(ys,
   1703                             t.shade.it0, t.shade.idtdx, t.shade.idtdy);
   1704                     if (tscale>=0) {
   1705                         ti.ydtdy= ty            << tscale;
   1706                         ti.dtdx = t.shade.idtdx << tscale;
   1707                         ti.dtdy = t.shade.idtdy << tscale;
   1708                     } else {
   1709                         ti.ydtdy= ty            >> -tscale;
   1710                         ti.dtdx = t.shade.idtdx >> -tscale;
   1711                         ti.dtdy = t.shade.idtdy >> -tscale;
   1712                     }
   1713                 }
   1714             }
   1715             // mirror for generated code...
   1716             generated_tex_vars_t& gen = c->generated_vars.texture[i];
   1717             gen.width   = t.surface.width;
   1718             gen.height  = t.surface.height;
   1719             gen.stride  = t.surface.stride;
   1720             gen.data    = int32_t(t.surface.data);
   1721             gen.dsdx = ti.dsdx;
   1722             gen.dtdx = ti.dtdx;
   1723         }
   1724     }
   1725 
   1726     // choose the y-stepper
   1727     c->step_y = step_y__nop;
   1728     if (enables & GGL_ENABLE_FOG) {
   1729         c->step_y = step_y__generic;
   1730     } else if (enables & GGL_ENABLE_TMUS) {
   1731         if (enables & GGL_ENABLE_SMOOTH) {
   1732             c->step_y = step_y__generic;
   1733         } else if (enables & GGL_ENABLE_W) {
   1734             c->step_y = step_y__w;
   1735         } else {
   1736             c->step_y = step_y__tmu;
   1737         }
   1738     } else {
   1739         if (enables & GGL_ENABLE_SMOOTH) {
   1740             c->step_y = step_y__smooth;
   1741         }
   1742     }
   1743 
   1744     // choose the rectangle blitter
   1745     c->rect = rect_generic;
   1746     if ((c->step_y == step_y__nop) &&
   1747         (c->scanline == scanline_memcpy))
   1748     {
   1749         c->rect = rect_memcpy;
   1750     }
   1751 }
   1752 
   1753 void init_y_packed(context_t* c, int32_t y0)
   1754 {
   1755     uint8_t f = c->state.buffers.color.format;
   1756     c->packed = ggl_pack_color(c, f,
   1757             c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
   1758     c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
   1759             c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
   1760     c->iterators.y = y0;
   1761     c->step_y = step_y__nop;
   1762     // choose the rectangle blitter
   1763     c->rect = rect_generic;
   1764     if (c->scanline == scanline_memcpy) {
   1765         c->rect = rect_memcpy;
   1766     }
   1767 }
   1768 
   1769 void init_y_noop(context_t* c, int32_t y0)
   1770 {
   1771     c->iterators.y = y0;
   1772     c->step_y = step_y__nop;
   1773     // choose the rectangle blitter
   1774     c->rect = rect_generic;
   1775     if (c->scanline == scanline_memcpy) {
   1776         c->rect = rect_memcpy;
   1777     }
   1778 }
   1779 
   1780 void init_y_error(context_t* c, int32_t y0)
   1781 {
   1782     // woooops, shoud never happen,
   1783     // fail gracefully (don't display anything)
   1784     init_y_noop(c, y0);
   1785     ALOGE("color-buffer has an invalid format!");
   1786 }
   1787 
   1788 // ----------------------------------------------------------------------------
   1789 #if 0
   1790 #pragma mark -
   1791 #endif
   1792 
   1793 void step_y__generic(context_t* c)
   1794 {
   1795     const uint32_t enables = c->state.enables;
   1796 
   1797     // iterate...
   1798     iterators_t& ci = c->iterators;
   1799     ci.y += 1;
   1800 
   1801     if (enables & GGL_ENABLE_SMOOTH) {
   1802         ci.ydrdy += c->shade.drdy;
   1803         ci.ydgdy += c->shade.dgdy;
   1804         ci.ydbdy += c->shade.dbdy;
   1805         ci.ydady += c->shade.dady;
   1806     }
   1807 
   1808     const uint32_t mask =
   1809             GGL_ENABLE_DEPTH_TEST |
   1810             GGL_ENABLE_W |
   1811             GGL_ENABLE_FOG;
   1812     if (enables & mask) {
   1813         ci.ydzdy += c->shade.dzdy;
   1814         ci.ydwdy += c->shade.dwdy;
   1815         ci.ydfdy += c->shade.dfdy;
   1816     }
   1817 
   1818     if ((enables & GGL_ENABLE_TMUS) && (!(enables & GGL_ENABLE_W))) {
   1819         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1820             if (c->state.texture[i].enable) {
   1821                 texture_iterators_t& ti = c->state.texture[i].iterators;
   1822                 ti.ydsdy += ti.dsdy;
   1823                 ti.ydtdy += ti.dtdy;
   1824             }
   1825         }
   1826     }
   1827 }
   1828 
   1829 void step_y__nop(context_t* c)
   1830 {
   1831     c->iterators.y += 1;
   1832     c->iterators.ydzdy += c->shade.dzdy;
   1833 }
   1834 
   1835 void step_y__smooth(context_t* c)
   1836 {
   1837     iterators_t& ci = c->iterators;
   1838     ci.y += 1;
   1839     ci.ydrdy += c->shade.drdy;
   1840     ci.ydgdy += c->shade.dgdy;
   1841     ci.ydbdy += c->shade.dbdy;
   1842     ci.ydady += c->shade.dady;
   1843     ci.ydzdy += c->shade.dzdy;
   1844 }
   1845 
   1846 void step_y__w(context_t* c)
   1847 {
   1848     iterators_t& ci = c->iterators;
   1849     ci.y += 1;
   1850     ci.ydzdy += c->shade.dzdy;
   1851     ci.ydwdy += c->shade.dwdy;
   1852 }
   1853 
   1854 void step_y__tmu(context_t* c)
   1855 {
   1856     iterators_t& ci = c->iterators;
   1857     ci.y += 1;
   1858     ci.ydzdy += c->shade.dzdy;
   1859     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1860         if (c->state.texture[i].enable) {
   1861             texture_iterators_t& ti = c->state.texture[i].iterators;
   1862             ti.ydsdy += ti.dsdy;
   1863             ti.ydtdy += ti.dtdy;
   1864         }
   1865     }
   1866 }
   1867 
   1868 // ----------------------------------------------------------------------------
   1869 #if 0
   1870 #pragma mark -
   1871 #endif
   1872 
   1873 void scanline_perspective(context_t* c)
   1874 {
   1875     struct {
   1876         union {
   1877             struct {
   1878                 int32_t s, sq;
   1879                 int32_t t, tq;
   1880             };
   1881             struct {
   1882                 int32_t v, q;
   1883             } st[2];
   1884         };
   1885     } tc[GGL_TEXTURE_UNIT_COUNT] __attribute__((aligned(16)));
   1886 
   1887     // XXX: we should have a special case when dwdx = 0
   1888 
   1889     // 32 pixels spans works okay. 16 is a lot better,
   1890     // but hey, it's a software renderer...
   1891     const uint32_t SPAN_BITS = 5;
   1892     const uint32_t ys = c->iterators.y;
   1893     const uint32_t xs = c->iterators.xl;
   1894     const uint32_t x1 = c->iterators.xr;
   1895 	const uint32_t xc = x1 - xs;
   1896     uint32_t remainder = xc & ((1<<SPAN_BITS)-1);
   1897     uint32_t numSpans = xc >> SPAN_BITS;
   1898 
   1899     const iterators_t& ci = c->iterators;
   1900     int32_t w0 = (xs * c->shade.dwdx) + ci.ydwdy;
   1901     int32_t q0 = gglRecipQ(w0, 30);
   1902     const int iwscale = 32 - gglClz(q0);
   1903 
   1904     const int32_t dwdx = c->shade.dwdx << SPAN_BITS;
   1905     int32_t xl = c->iterators.xl;
   1906 
   1907     // We process s & t with a loop to reduce the code size
   1908     // (and i-cache pressure).
   1909 
   1910     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1911         const texture_t& tmu = c->state.texture[i];
   1912         if (!tmu.enable) continue;
   1913         int32_t s =   tmu.shade.is0 +
   1914                      (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
   1915                      ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
   1916         int32_t t =   tmu.shade.it0 +
   1917                      (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
   1918                      ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
   1919         tc[i].s  = s;
   1920         tc[i].t  = t;
   1921         tc[i].sq = gglMulx(s, q0, iwscale);
   1922         tc[i].tq = gglMulx(t, q0, iwscale);
   1923     }
   1924 
   1925     int32_t span = 0;
   1926     do {
   1927         int32_t w1;
   1928         if (ggl_likely(numSpans)) {
   1929             w1 = w0 + dwdx;
   1930         } else {
   1931             if (remainder) {
   1932                 // finish off the scanline...
   1933                 span = remainder;
   1934                 w1 = (c->shade.dwdx * span) + w0;
   1935             } else {
   1936                 break;
   1937             }
   1938         }
   1939         int32_t q1 = gglRecipQ(w1, 30);
   1940         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1941             texture_t& tmu = c->state.texture[i];
   1942             if (!tmu.enable) continue;
   1943             texture_iterators_t& ti = tmu.iterators;
   1944 
   1945             for (int j=0 ; j<2 ; j++) {
   1946                 int32_t v = tc[i].st[j].v;
   1947                 if (span)   v += (tmu.shade.st[j].dx)*span;
   1948                 else        v += (tmu.shade.st[j].dx)<<SPAN_BITS;
   1949                 const int32_t v0 = tc[i].st[j].q;
   1950                 const int32_t v1 = gglMulx(v, q1, iwscale);
   1951                 int32_t dvdx = v1 - v0;
   1952                 if (span)   dvdx /= span;
   1953                 else        dvdx >>= SPAN_BITS;
   1954                 tc[i].st[j].v = v;
   1955                 tc[i].st[j].q = v1;
   1956 
   1957                 const int scale = ti.st[j].scale + (iwscale - 30);
   1958                 if (scale >= 0) {
   1959                     ti.st[j].ydvdy = v0   << scale;
   1960                     ti.st[j].dvdx  = dvdx << scale;
   1961                 } else {
   1962                     ti.st[j].ydvdy = v0   >> -scale;
   1963                     ti.st[j].dvdx  = dvdx >> -scale;
   1964                 }
   1965             }
   1966             generated_tex_vars_t& gen = c->generated_vars.texture[i];
   1967             gen.dsdx = ti.st[0].dvdx;
   1968             gen.dtdx = ti.st[1].dvdx;
   1969         }
   1970         c->iterators.xl = xl;
   1971         c->iterators.xr = xl = xl + (span ? span : (1<<SPAN_BITS));
   1972         w0 = w1;
   1973         q0 = q1;
   1974         c->span(c);
   1975     } while(numSpans--);
   1976 }
   1977 
   1978 void scanline_perspective_single(context_t* c)
   1979 {
   1980     // 32 pixels spans works okay. 16 is a lot better,
   1981     // but hey, it's a software renderer...
   1982     const uint32_t SPAN_BITS = 5;
   1983     const uint32_t ys = c->iterators.y;
   1984     const uint32_t xs = c->iterators.xl;
   1985     const uint32_t x1 = c->iterators.xr;
   1986 	const uint32_t xc = x1 - xs;
   1987 
   1988     const iterators_t& ci = c->iterators;
   1989     int32_t w = (xs * c->shade.dwdx) + ci.ydwdy;
   1990     int32_t iw = gglRecipQ(w, 30);
   1991     const int iwscale = 32 - gglClz(iw);
   1992 
   1993     const int i = 31 - gglClz(c->state.enabled_tmu);
   1994     generated_tex_vars_t& gen = c->generated_vars.texture[i];
   1995     texture_t& tmu = c->state.texture[i];
   1996     texture_iterators_t& ti = tmu.iterators;
   1997     const int sscale = ti.sscale + (iwscale - 30);
   1998     const int tscale = ti.tscale + (iwscale - 30);
   1999     int32_t s =   tmu.shade.is0 +
   2000                  (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
   2001                  ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
   2002     int32_t t =   tmu.shade.it0 +
   2003                  (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
   2004                  ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
   2005     int32_t s0 = gglMulx(s, iw, iwscale);
   2006     int32_t t0 = gglMulx(t, iw, iwscale);
   2007     int32_t xl = c->iterators.xl;
   2008 
   2009     int32_t sq, tq, dsdx, dtdx;
   2010     int32_t premainder = xc & ((1<<SPAN_BITS)-1);
   2011     uint32_t numSpans = xc >> SPAN_BITS;
   2012     if (c->shade.dwdx == 0) {
   2013         // XXX: we could choose to do this if the error is small enough
   2014         numSpans = 0;
   2015         premainder = xc;
   2016         goto no_perspective;
   2017     }
   2018 
   2019     if (premainder) {
   2020         w += c->shade.dwdx   * premainder;
   2021         iw = gglRecipQ(w, 30);
   2022 no_perspective:
   2023         s += tmu.shade.idsdx * premainder;
   2024         t += tmu.shade.idtdx * premainder;
   2025         sq = gglMulx(s, iw, iwscale);
   2026         tq = gglMulx(t, iw, iwscale);
   2027         dsdx = (sq - s0) / premainder;
   2028         dtdx = (tq - t0) / premainder;
   2029         c->iterators.xl = xl;
   2030         c->iterators.xr = xl = xl + premainder;
   2031         goto finish;
   2032     }
   2033 
   2034     while (numSpans--) {
   2035         w += c->shade.dwdx   << SPAN_BITS;
   2036         s += tmu.shade.idsdx << SPAN_BITS;
   2037         t += tmu.shade.idtdx << SPAN_BITS;
   2038         iw = gglRecipQ(w, 30);
   2039         sq = gglMulx(s, iw, iwscale);
   2040         tq = gglMulx(t, iw, iwscale);
   2041         dsdx = (sq - s0) >> SPAN_BITS;
   2042         dtdx = (tq - t0) >> SPAN_BITS;
   2043         c->iterators.xl = xl;
   2044         c->iterators.xr = xl = xl + (1<<SPAN_BITS);
   2045 finish:
   2046         if (sscale >= 0) {
   2047             ti.ydsdy = s0   << sscale;
   2048             ti.dsdx  = dsdx << sscale;
   2049         } else {
   2050             ti.ydsdy = s0   >>-sscale;
   2051             ti.dsdx  = dsdx >>-sscale;
   2052         }
   2053         if (tscale >= 0) {
   2054             ti.ydtdy = t0   << tscale;
   2055             ti.dtdx  = dtdx << tscale;
   2056         } else {
   2057             ti.ydtdy = t0   >>-tscale;
   2058             ti.dtdx  = dtdx >>-tscale;
   2059         }
   2060         s0 = sq;
   2061         t0 = tq;
   2062         gen.dsdx = ti.dsdx;
   2063         gen.dtdx = ti.dtdx;
   2064         c->span(c);
   2065     }
   2066 }
   2067 
   2068 // ----------------------------------------------------------------------------
   2069 
   2070 void scanline_col32cb16blend(context_t* c)
   2071 {
   2072     int32_t x = c->iterators.xl;
   2073     size_t ct = c->iterators.xr - x;
   2074     int32_t y = c->iterators.y;
   2075     surface_t* cb = &(c->state.buffers.color);
   2076     union {
   2077         uint16_t* dst;
   2078         uint32_t* dst32;
   2079     };
   2080     dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2081 
   2082 #if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__))
   2083 #if defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2084     scanline_col32cb16blend_neon(dst, &(c->packed8888), ct);
   2085 #else  // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2086     scanline_col32cb16blend_arm(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
   2087 #endif // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2088 #else
   2089     uint32_t s = GGL_RGBA_TO_HOST(c->packed8888);
   2090     int sA = (s>>24);
   2091     int f = 0x100 - (sA + (sA>>7));
   2092     while (ct--) {
   2093         uint16_t d = *dst;
   2094         int dR = (d>>11)&0x1f;
   2095         int dG = (d>>5)&0x3f;
   2096         int dB = (d)&0x1f;
   2097         int sR = (s >> (   3))&0x1F;
   2098         int sG = (s >> ( 8+2))&0x3F;
   2099         int sB = (s >> (16+3))&0x1F;
   2100         sR += (f*dR)>>8;
   2101         sG += (f*dG)>>8;
   2102         sB += (f*dB)>>8;
   2103         *dst++ = uint16_t((sR<<11)|(sG<<5)|sB);
   2104     }
   2105 #endif
   2106 
   2107 }
   2108 
   2109 void scanline_t32cb16(context_t* c)
   2110 {
   2111     int32_t x = c->iterators.xl;
   2112     size_t ct = c->iterators.xr - x;
   2113     int32_t y = c->iterators.y;
   2114     surface_t* cb = &(c->state.buffers.color);
   2115     union {
   2116         uint16_t* dst;
   2117         uint32_t* dst32;
   2118     };
   2119     dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2120 
   2121     surface_t* tex = &(c->state.texture[0].surface);
   2122     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2123     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2124     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
   2125     int sR, sG, sB;
   2126     uint32_t s, d;
   2127 
   2128     if (ct==1 || uint32_t(dst)&2) {
   2129 last_one:
   2130         s = GGL_RGBA_TO_HOST( *src++ );
   2131         *dst++ = convertAbgr8888ToRgb565(s);
   2132         ct--;
   2133     }
   2134 
   2135     while (ct >= 2) {
   2136 #if BYTE_ORDER == BIG_ENDIAN
   2137         s = GGL_RGBA_TO_HOST( *src++ );
   2138         d = convertAbgr8888ToRgb565_hi16(s);
   2139 
   2140         s = GGL_RGBA_TO_HOST( *src++ );
   2141         d |= convertAbgr8888ToRgb565(s);
   2142 #else
   2143         s = GGL_RGBA_TO_HOST( *src++ );
   2144         d = convertAbgr8888ToRgb565(s);
   2145 
   2146         s = GGL_RGBA_TO_HOST( *src++ );
   2147         d |= convertAbgr8888ToRgb565(s) << 16;
   2148 #endif
   2149         *dst32++ = d;
   2150         ct -= 2;
   2151     }
   2152 
   2153     if (ct > 0) {
   2154         goto last_one;
   2155     }
   2156 }
   2157 
   2158 void scanline_t32cb16blend(context_t* c)
   2159 {
   2160 #if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__arm__) || defined(__mips)))
   2161     int32_t x = c->iterators.xl;
   2162     size_t ct = c->iterators.xr - x;
   2163     int32_t y = c->iterators.y;
   2164     surface_t* cb = &(c->state.buffers.color);
   2165     uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2166 
   2167     surface_t* tex = &(c->state.texture[0].surface);
   2168     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2169     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2170     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
   2171 
   2172 #ifdef __arm__
   2173     scanline_t32cb16blend_arm(dst, src, ct);
   2174 #else
   2175     scanline_t32cb16blend_mips(dst, src, ct);
   2176 #endif
   2177 #else
   2178     dst_iterator16  di(c);
   2179     horz_iterator32  hi(c);
   2180     blender_32to16  bl(c);
   2181     while (di.count--) {
   2182         uint32_t s = hi.get_pixel32();
   2183         bl.write(s, di.dst);
   2184         di.dst++;
   2185     }
   2186 #endif
   2187 }
   2188 
   2189 void scanline_t32cb16blend_srca(context_t* c)
   2190 {
   2191     dst_iterator16  di(c);
   2192     horz_iterator32  hi(c);
   2193     blender_32to16_srcA  blender(c);
   2194 
   2195     while (di.count--) {
   2196         uint32_t s = hi.get_pixel32();
   2197         blender.write(s,di.dst);
   2198         di.dst++;
   2199     }
   2200 }
   2201 
   2202 void scanline_t16cb16blend_clamp_mod(context_t* c)
   2203 {
   2204     const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
   2205     if (a == 0) {
   2206         return;
   2207     }
   2208 
   2209     if (a == 255) {
   2210         scanline_t16cb16_clamp(c);
   2211         return;
   2212     }
   2213 
   2214     dst_iterator16  di(c);
   2215     blender_16to16_modulate  blender(c);
   2216     clamp_iterator  ci(c);
   2217 
   2218     while (di.count--) {
   2219         uint16_t s = ci.get_pixel16();
   2220         blender.write(s, di.dst);
   2221         di.dst++;
   2222     }
   2223 }
   2224 
   2225 void scanline_memcpy(context_t* c)
   2226 {
   2227     int32_t x = c->iterators.xl;
   2228     size_t ct = c->iterators.xr - x;
   2229     int32_t y = c->iterators.y;
   2230     surface_t* cb = &(c->state.buffers.color);
   2231     const GGLFormat* fp = &(c->formats[cb->format]);
   2232     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2233                             (x + (cb->stride * y)) * fp->size;
   2234 
   2235     surface_t* tex = &(c->state.texture[0].surface);
   2236     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2237     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2238     uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
   2239                             (u + (tex->stride * v)) * fp->size;
   2240 
   2241     const size_t size = ct * fp->size;
   2242     memcpy(dst, src, size);
   2243 }
   2244 
   2245 void scanline_memset8(context_t* c)
   2246 {
   2247     int32_t x = c->iterators.xl;
   2248     size_t ct = c->iterators.xr - x;
   2249     int32_t y = c->iterators.y;
   2250     surface_t* cb = &(c->state.buffers.color);
   2251     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) + (x+(cb->stride*y));
   2252     uint32_t packed = c->packed;
   2253     memset(dst, packed, ct);
   2254 }
   2255 
   2256 void scanline_memset16(context_t* c)
   2257 {
   2258     int32_t x = c->iterators.xl;
   2259     size_t ct = c->iterators.xr - x;
   2260     int32_t y = c->iterators.y;
   2261     surface_t* cb = &(c->state.buffers.color);
   2262     uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2263     uint32_t packed = c->packed;
   2264     android_memset16(dst, packed, ct*2);
   2265 }
   2266 
   2267 void scanline_memset32(context_t* c)
   2268 {
   2269     int32_t x = c->iterators.xl;
   2270     size_t ct = c->iterators.xr - x;
   2271     int32_t y = c->iterators.y;
   2272     surface_t* cb = &(c->state.buffers.color);
   2273     uint32_t* dst = reinterpret_cast<uint32_t*>(cb->data) + (x+(cb->stride*y));
   2274     uint32_t packed = GGL_HOST_TO_RGBA(c->packed);
   2275     android_memset32(dst, packed, ct*4);
   2276 }
   2277 
   2278 void scanline_clear(context_t* c)
   2279 {
   2280     int32_t x = c->iterators.xl;
   2281     size_t ct = c->iterators.xr - x;
   2282     int32_t y = c->iterators.y;
   2283     surface_t* cb = &(c->state.buffers.color);
   2284     const GGLFormat* fp = &(c->formats[cb->format]);
   2285     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2286                             (x + (cb->stride * y)) * fp->size;
   2287     const size_t size = ct * fp->size;
   2288     memset(dst, 0, size);
   2289 }
   2290 
   2291 void scanline_set(context_t* c)
   2292 {
   2293     int32_t x = c->iterators.xl;
   2294     size_t ct = c->iterators.xr - x;
   2295     int32_t y = c->iterators.y;
   2296     surface_t* cb = &(c->state.buffers.color);
   2297     const GGLFormat* fp = &(c->formats[cb->format]);
   2298     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2299                             (x + (cb->stride * y)) * fp->size;
   2300     const size_t size = ct * fp->size;
   2301     memset(dst, 0xFF, size);
   2302 }
   2303 
   2304 void scanline_noop(context_t* c)
   2305 {
   2306 }
   2307 
   2308 void rect_generic(context_t* c, size_t yc)
   2309 {
   2310     do {
   2311         c->scanline(c);
   2312         c->step_y(c);
   2313     } while (--yc);
   2314 }
   2315 
   2316 void rect_memcpy(context_t* c, size_t yc)
   2317 {
   2318     int32_t x = c->iterators.xl;
   2319     size_t ct = c->iterators.xr - x;
   2320     int32_t y = c->iterators.y;
   2321     surface_t* cb = &(c->state.buffers.color);
   2322     const GGLFormat* fp = &(c->formats[cb->format]);
   2323     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2324                             (x + (cb->stride * y)) * fp->size;
   2325 
   2326     surface_t* tex = &(c->state.texture[0].surface);
   2327     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2328     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2329     uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
   2330                             (u + (tex->stride * v)) * fp->size;
   2331 
   2332     if (cb->stride == tex->stride && ct == size_t(cb->stride)) {
   2333         memcpy(dst, src, ct * fp->size * yc);
   2334     } else {
   2335         const size_t size = ct * fp->size;
   2336         const size_t dbpr = cb->stride  * fp->size;
   2337         const size_t sbpr = tex->stride * fp->size;
   2338         do {
   2339             memcpy(dst, src, size);
   2340             dst += dbpr;
   2341             src += sbpr;
   2342         } while (--yc);
   2343     }
   2344 }
   2345 // ----------------------------------------------------------------------------
   2346 }; // namespace android
   2347 
   2348