Home | History | Annotate | Download | only in libpixelflinger
      1 /* libs/pixelflinger/scanline.cpp
      2 **
      3 ** Copyright 2006-2011, The Android Open Source Project
      4 **
      5 ** Licensed under the Apache License, Version 2.0 (the "License");
      6 ** you may not use this file except in compliance with the License.
      7 ** You may obtain a copy of the License at
      8 **
      9 **     http://www.apache.org/licenses/LICENSE-2.0
     10 **
     11 ** Unless required by applicable law or agreed to in writing, software
     12 ** distributed under the License is distributed on an "AS IS" BASIS,
     13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 ** See the License for the specific language governing permissions and
     15 ** limitations under the License.
     16 */
     17 
     18 
     19 #define LOG_TAG "pixelflinger"
     20 
     21 #include <assert.h>
     22 #include <stdlib.h>
     23 #include <stdio.h>
     24 #include <string.h>
     25 
     26 #include <cutils/memory.h>
     27 #include <cutils/log.h>
     28 
     29 #ifdef __arm__
     30 #include <machine/cpu-features.h>
     31 #endif
     32 
     33 #include "buffer.h"
     34 #include "scanline.h"
     35 
     36 #include "codeflinger/CodeCache.h"
     37 #include "codeflinger/GGLAssembler.h"
     38 #if defined(__arm__)
     39 #include "codeflinger/ARMAssembler.h"
     40 #elif defined(__aarch64__)
     41 #include "codeflinger/Arm64Assembler.h"
     42 #elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
     43 #include "codeflinger/MIPSAssembler.h"
     44 #elif defined(__mips__) && defined(__LP64__)
     45 #include "codeflinger/MIPS64Assembler.h"
     46 #endif
     47 //#include "codeflinger/ARMAssemblerOptimizer.h"
     48 
     49 // ----------------------------------------------------------------------------
     50 
     51 #define ANDROID_CODEGEN_GENERIC     0   // force generic pixel pipeline
     52 #define ANDROID_CODEGEN_C           1   // hand-written C, fallback generic
     53 #define ANDROID_CODEGEN_ASM         2   // hand-written asm, fallback generic
     54 #define ANDROID_CODEGEN_GENERATED   3   // hand-written asm, fallback codegen
     55 
     56 #ifdef NDEBUG
     57 #   define ANDROID_RELEASE
     58 #   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
     59 #else
     60 #   define ANDROID_DEBUG
     61 #   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
     62 #endif
     63 
     64 #if defined(__arm__) || (defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))) || defined(__aarch64__)
     65 #   define ANDROID_ARM_CODEGEN  1
     66 #else
     67 #   define ANDROID_ARM_CODEGEN  0
     68 #endif
     69 
     70 #define DEBUG__CODEGEN_ONLY     0
     71 
     72 /* Set to 1 to dump to the log the states that need a new
     73  * code-generated scanline callback, i.e. those that don't
     74  * have a corresponding shortcut function.
     75  */
     76 #define DEBUG_NEEDS  0
     77 
     78 #if defined( __mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))
     79 #define ASSEMBLY_SCRATCH_SIZE   4096
     80 #elif defined(__aarch64__)
     81 #define ASSEMBLY_SCRATCH_SIZE   8192
     82 #else
     83 #define ASSEMBLY_SCRATCH_SIZE   2048
     84 #endif
     85 
     86 // ----------------------------------------------------------------------------
     87 namespace android {
     88 // ----------------------------------------------------------------------------
     89 
     90 static void init_y(context_t*, int32_t);
     91 static void init_y_noop(context_t*, int32_t);
     92 static void init_y_packed(context_t*, int32_t);
     93 static void init_y_error(context_t*, int32_t);
     94 
     95 static void step_y__generic(context_t* c);
     96 static void step_y__nop(context_t*);
     97 static void step_y__smooth(context_t* c);
     98 static void step_y__tmu(context_t* c);
     99 static void step_y__w(context_t* c);
    100 
    101 static void scanline(context_t* c);
    102 static void scanline_perspective(context_t* c);
    103 static void scanline_perspective_single(context_t* c);
    104 static void scanline_t32cb16blend(context_t* c);
    105 static void scanline_t32cb16blend_dither(context_t* c);
    106 static void scanline_t32cb16blend_srca(context_t* c);
    107 static void scanline_t32cb16blend_clamp(context_t* c);
    108 static void scanline_t32cb16blend_clamp_dither(context_t* c);
    109 static void scanline_t32cb16blend_clamp_mod(context_t* c);
    110 static void scanline_x32cb16blend_clamp_mod(context_t* c);
    111 static void scanline_t32cb16blend_clamp_mod_dither(context_t* c);
    112 static void scanline_x32cb16blend_clamp_mod_dither(context_t* c);
    113 static void scanline_t32cb16(context_t* c);
    114 static void scanline_t32cb16_dither(context_t* c);
    115 static void scanline_t32cb16_clamp(context_t* c);
    116 static void scanline_t32cb16_clamp_dither(context_t* c);
    117 static void scanline_col32cb16blend(context_t* c);
    118 static void scanline_t16cb16_clamp(context_t* c);
    119 static void scanline_t16cb16blend_clamp_mod(context_t* c);
    120 static void scanline_memcpy(context_t* c);
    121 static void scanline_memset8(context_t* c);
    122 static void scanline_memset16(context_t* c);
    123 static void scanline_memset32(context_t* c);
    124 static void scanline_noop(context_t* c);
    125 static void scanline_set(context_t* c);
    126 static void scanline_clear(context_t* c);
    127 
    128 static void rect_generic(context_t* c, size_t yc);
    129 static void rect_memcpy(context_t* c, size_t yc);
    130 
    131 #if defined( __arm__)
    132 extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
    133 extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
    134 extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
    135 extern "C" void scanline_col32cb16blend_arm(uint16_t *dst, uint32_t col, size_t ct);
    136 #elif defined(__aarch64__)
    137 extern "C" void scanline_t32cb16blend_arm64(uint16_t*, uint32_t*, size_t);
    138 extern "C" void scanline_col32cb16blend_arm64(uint16_t *dst, uint32_t col, size_t ct);
    139 #elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
    140 extern "C" void scanline_t32cb16blend_mips(uint16_t*, uint32_t*, size_t);
    141 #elif defined(__mips__) && defined(__LP64__)
    142 extern "C" void scanline_t32cb16blend_mips64(uint16_t*, uint32_t*, size_t);
    143 extern "C" void scanline_col32cb16blend_mips64(uint16_t *dst, uint32_t col, size_t ct);
    144 #endif
    145 
    146 // ----------------------------------------------------------------------------
    147 
    148 static inline uint16_t  convertAbgr8888ToRgb565(uint32_t  pix)
    149 {
    150     return uint16_t( ((pix << 8) & 0xf800) |
    151                       ((pix >> 5) & 0x07e0) |
    152                       ((pix >> 19) & 0x001f) );
    153 }
    154 
/*
 * One entry of the shortcut table: a needs filter paired with the
 * hand-written routines to install when the filter matches.
 */
struct shortcut_t {
    // tested against the context's needs with needs.match() in pick_scanline()
    needs_filter_t  filter;
    // human-readable description of the case handled by this entry
    const char*     desc;
    // routine copied into context_t::scanline on a match
    void            (*scanline)(context_t*);
    // routine copied into context_t::init_y on a match
    void            (*init_y)(context_t*, int32_t);
};
    161 
    162 // Keep in sync with needs
    163 
    164 /* To understand the values here, have a look at:
    165  *     system/core/include/private/pixelflinger/ggl_context.h
    166  *
    167  * Especially the lines defining and using GGL_RESERVE_NEEDS
    168  *
    169  * Quick reminders:
    170  *   - the last nibble of the first value is the destination buffer format.
    171  *   - the last nibble of the third value is the source texture format
    172  *   - formats: 4=rgb565 1=abgr8888 2=xbgr8888
    173  *
    174  * In the descriptions below:
    175  *
    176  *   SRC      means we copy the source pixels to the destination
    177  *
    178  *   SRC_OVER means we blend the source pixels to the destination
    179  *            with dstFactor = 1-srcA, srcFactor=1  (premultiplied source).
    180  *            This mode is otherwise called 'blend'.
    181  *
 *   SRCA_OVER means we blend the source pixels to the destination
 *             with dstFactor=1-srcA srcFactor=srcA (non-premul source).
 *             This mode is otherwise called 'blend_srca'
    185  *
    186  *   clamp    means we fetch source pixels from a texture with u/v clamping
    187  *
    188  *   mod      means the source pixels are modulated (multiplied) by the
    189  *            a/r/g/b of the current context's color. Typically used for
    190  *            fade-in / fade-out.
    191  *
    192  *   dither   means we dither 32 bit values to 16 bits
    193  */
/*
 * Table of pre-written scanline routines. pick_scanline() walks it from the
 * first entry to the last and installs the scanline/init_y pair of the first
 * entry whose filter matches the context's needs, so entry order matters.
 * NOTE(review): each filter initializer holds two needs-shaped groups,
 * presumably a value followed by a mask -- confirm against needs_filter_t.
 */
static shortcut_t shortcuts[] = {
    { { { 0x03515104, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend SRC_OVER", scanline_t32cb16blend, init_y_noop },
    { { { 0x03010104, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC", scanline_t32cb16, init_y_noop  },
    /* same as first entry, but with dithering */
    { { { 0x03515104, 0x00000177, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend SRC_OVER dither", scanline_t32cb16blend_dither, init_y_noop },
    /* same as second entry, but with dithering */
    { { { 0x03010104, 0x00000177, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC dither", scanline_t32cb16_dither, init_y_noop  },
    /* this is used during the boot animation - CHEAT: ignore dithering */
    { { { 0x03545404, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFEFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend dst:ONE_MINUS_SRCA src:SRCA", scanline_t32cb16blend_srca, init_y_noop },
    /* special case for arbitrary texture coordinates (think scaling) */
    { { { 0x03515104, 0x00000077, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp", scanline_t32cb16blend_clamp, init_y },
    { { { 0x03515104, 0x00000177, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp dither", scanline_t32cb16blend_clamp_dither, init_y },
    /* another case used during emulation */
    { { { 0x03515104, 0x00000077, { 0x00001001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp modulate", scanline_t32cb16blend_clamp_mod, init_y },
    /* and this */
    { { { 0x03515104, 0x00000077, { 0x00001002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC_OVER clamp modulate", scanline_x32cb16blend_clamp_mod, init_y },
    { { { 0x03515104, 0x00000177, { 0x00001001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp modulate dither", scanline_t32cb16blend_clamp_mod_dither, init_y },
    { { { 0x03515104, 0x00000177, { 0x00001002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC_OVER clamp modulate dither", scanline_x32cb16blend_clamp_mod_dither, init_y },
    { { { 0x03010104, 0x00000077, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
    { { { 0x03010104, 0x00000077, { 0x00000002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
    { { { 0x03010104, 0x00000177, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
    { { { 0x03010104, 0x00000177, { 0x00000002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
    { { { 0x03010104, 0x00000077, { 0x00000004, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 565 tx, SRC clamp", scanline_t16cb16_clamp, init_y  },
    { { { 0x03515104, 0x00000077, { 0x00001004, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 565 tx, SRC_OVER clamp", scanline_t16cb16blend_clamp_mod, init_y  },
    { { { 0x03515104, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0xFFFFFFFF } } },
        "565 fb, 8888 fixed color", scanline_col32cb16blend, init_y_packed  },
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x00000000, 0x00000007, { 0x00000000, 0x00000000 } } },
        "(nop) alpha test", scanline_noop, init_y_noop },
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x00000000, 0x00000070, { 0x00000000, 0x00000000 } } },
        "(nop) depth test", scanline_noop, init_y_noop },
    { { { 0x05000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x0F000000, 0x00000080, { 0x00000000, 0x00000000 } } },
        "(nop) logic_op", scanline_noop, init_y_noop },
    { { { 0xF0000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0xF0000000, 0x00000080, { 0x00000000, 0x00000000 } } },
        "(nop) color mask", scanline_noop, init_y_noop },
    { { { 0x0F000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(set) logic_op", scanline_set, init_y_noop },
    { { { 0x00000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(clear) logic_op", scanline_clear, init_y_noop },
    { { { 0x03000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFF00, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(clear) blending 0/0", scanline_clear, init_y_noop },
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x0000003F, 0x00000000, { 0x00000000, 0x00000000 } } },
        "(error) invalid color-buffer format", scanline_noop, init_y_error },
};
// Filter tested by pick_scanline() once the texture format has been found
// equal to the color-buffer format; a match (with a suitable component set)
// selects scanline_memcpy.
static const needs_filter_t noblend1to1 = {
        // (disregard dithering, see below)
        { 0x03010100, 0x00000077, { 0x00000A00, 0x00000000 } },
        { 0xFFFFFFC0, 0xFFFFFEFF, { 0xFFFFFFC0, 0x0000003F } }
};
// Filter tested by pick_scanline() to select one of the scanline_memset*
// routines, chosen by the size of the color-buffer format.
static  const needs_filter_t fill16noblend = {
        { 0x03010100, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFFC0, 0xFFFFFFFF, { 0x0000003F, 0x0000003F } }
};
    289 
    290 // ----------------------------------------------------------------------------
    291 
    292 #if ANDROID_ARM_CODEGEN
    293 
// File-wide cache of generated scanline assemblies; pick_scanline() looks
// keys up in it and stores newly generated code into it. The constructor
// argument is chosen per target architecture.
// NOTE(review): presumably the cache capacity in bytes -- confirm against
// CodeCache.
#if defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))
static CodeCache gCodeCache(32 * 1024);
#elif defined(__aarch64__)
static CodeCache gCodeCache(48 * 1024);
#else
static CodeCache gCodeCache(12 * 1024);
#endif
    301 
    302 class ScanlineAssembly : public Assembly {
    303     AssemblyKey<needs_t> mKey;
    304 public:
    305     ScanlineAssembly(needs_t needs, size_t size)
    306         : Assembly(size), mKey(needs) { }
    307     const AssemblyKey<needs_t>& key() const { return mKey; }
    308 };
    309 #endif
    310 
    311 // ----------------------------------------------------------------------------
    312 
    313 void ggl_init_scanline(context_t* c)
    314 {
    315     c->init_y = init_y;
    316     c->step_y = step_y__generic;
    317     c->scanline = scanline;
    318 }
    319 
    320 void ggl_uninit_scanline(context_t* c)
    321 {
    322     if (c->state.buffers.coverage)
    323         free(c->state.buffers.coverage);
    324 #if ANDROID_ARM_CODEGEN
    325     if (c->scanline_as)
    326         c->scanline_as->decStrong(c);
    327 #endif
    328 }
    329 
    330 // ----------------------------------------------------------------------------
    331 
/*
 * Chooses the scanline routine (and matching init_y / step_y callbacks) for
 * the current c->state.needs and stores them in the context.
 *
 * Candidates are tried in this order, the first hit returning immediately:
 *   1. texture format equal to color-buffer format and noblend1to1 matching
 *      -> scanline_memcpy
 *   2. fill16noblend matching -> scanline_memset8/16/32 by pixel size
 *   3. the first matching entry of the shortcuts table
 * These three paths do not assign c->step_y.
 *
 * Otherwise init_y/step_y are set to the generic versions and, when
 * ANDROID_ARM_CODEGEN is set, an assembly is looked up in (or generated and
 * added to) gCodeCache and installed as c->scanline. If generation or caching
 * fails, the no-op callbacks are installed instead. Without code generation
 * the generic scanline() is used.
 */
static void pick_scanline(context_t* c)
{
#if (!defined(DEBUG__CODEGEN_ONLY) || (DEBUG__CODEGEN_ONLY == 0))

#if ANDROID_CODEGEN == ANDROID_CODEGEN_GENERIC
    c->init_y = init_y;
    c->step_y = step_y__generic;
    c->scanline = scanline;
    return;
#endif

    //printf("*** needs [%08lx:%08lx:%08lx:%08lx]\n",
    //    c->state.needs.n, c->state.needs.p,
    //    c->state.needs.t[0], c->state.needs.t[1]);

    // first handle the special case that we cannot test with a filter
    const uint32_t cb_format = GGL_READ_NEEDS(CB_FORMAT, c->state.needs.n);
    if (GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0]) == cb_format) {
        if (c->state.needs.match(noblend1to1)) {
            // this will match regardless of dithering state, since both
            // src and dest have the same format anyway, there is no dithering
            // to be done.
            const GGLFormat* f =
                &(c->formats[GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0])]);
            if ((f->components == GGL_RGB) ||
                (f->components == GGL_RGBA) ||
                (f->components == GGL_LUMINANCE) ||
                (f->components == GGL_LUMINANCE_ALPHA))
            {
                // format must have all of RGB components
                // (so the current color doesn't show through)
                c->scanline = scanline_memcpy;
                c->init_y = init_y_noop;
                return;
            }
        }
    }

    if (c->state.needs.match(fill16noblend)) {
        // init_y is set before the size is known; if no case below matches,
        // execution continues and a later assignment overwrites it
        c->init_y = init_y_packed;
        switch (c->formats[cb_format].size) {
        case 1: c->scanline = scanline_memset8;  return;
        case 2: c->scanline = scanline_memset16; return;
        case 4: c->scanline = scanline_memset32; return;
        }
    }

    // linear scan of the shortcut table; the first matching entry wins
    const int numFilters = sizeof(shortcuts)/sizeof(shortcut_t);
    for (int i=0 ; i<numFilters ; i++) {
        if (c->state.needs.match(shortcuts[i].filter)) {
            c->scanline = shortcuts[i].scanline;
            c->init_y = shortcuts[i].init_y;
            return;
        }
    }

#if DEBUG_NEEDS
    ALOGI("Needs: n=0x%08x p=0x%08x t0=0x%08x t1=0x%08x",
         c->state.needs.n, c->state.needs.p,
         c->state.needs.t[0], c->state.needs.t[1]);
#endif

#endif // DEBUG__CODEGEN_ONLY

    // no shortcut applies: fall back to the generic per-line callbacks
    c->init_y = init_y;
    c->step_y = step_y__generic;

#if ANDROID_ARM_CODEGEN
    // we're going to have to generate some code...
    // here, generate code for our pixel pipeline
    const AssemblyKey<needs_t> key(c->state.needs);
    sp<Assembly> assembly = gCodeCache.lookup(key);
    if (assembly == 0) {
        // create a new assembly region
        sp<ScanlineAssembly> a = new ScanlineAssembly(c->state.needs,
                ASSEMBLY_SCRATCH_SIZE);
        // initialize our assembler
#if defined(__arm__)
        GGLAssembler assembler( new ARMAssembler(a) );
        //GGLAssembler assembler(
        //        new ARMAssemblerOptimizer(new ARMAssembler(a)) );
#endif
#if defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
        GGLAssembler assembler( new ArmToMipsAssembler(a) );
#elif defined(__mips__) && defined(__LP64__)
        GGLAssembler assembler( new ArmToMips64Assembler(a) );
#elif defined(__aarch64__)
        GGLAssembler assembler( new ArmToArm64Assembler(a) );
#endif
        // generate the scanline code for the given needs
        bool err = assembler.scanline(c->state.needs, c) != 0;
        if (ggl_likely(!err)) {
            // finally, cache this assembly
            err = gCodeCache.cache(a->key(), a) < 0;
        }
        if (ggl_unlikely(err)) {
            ALOGE("error generating or caching assembly. Reverting to NOP.");
            c->scanline = scanline_noop;
            c->init_y = init_y_noop;
            c->step_y = step_y__nop;
            return;
        }
        assembly = a;
    }

    // release the previous assembly
    // (the local sp<> above still holds a reference on the new one)
    if (c->scanline_as) {
        c->scanline_as->decStrong(c);
    }

    //ALOGI("using generated pixel-pipeline");
    c->scanline_as = assembly.get();
    c->scanline_as->incStrong(c); //  hold on to assembly
    // the start of the assembly is called as the scanline routine
    c->scanline = (void(*)(context_t* c))assembly->base();
#else
//    ALOGW("using generic (slow) pixel-pipeline");
    c->scanline = scanline;
#endif
}
    451 
    452 void ggl_pick_scanline(context_t* c)
    453 {
    454     pick_scanline(c);
    455     if ((c->state.enables & GGL_ENABLE_W) &&
    456         (c->state.enables & GGL_ENABLE_TMUS))
    457     {
    458         c->span = c->scanline;
    459         c->scanline = scanline_perspective;
    460         if (!(c->state.enabled_tmu & (c->state.enabled_tmu - 1))) {
    461             // only one TMU enabled
    462             c->scanline = scanline_perspective_single;
    463         }
    464     }
    465 }
    466 
    467 // ----------------------------------------------------------------------------
    468 
    469 static void blending(context_t* c, pixel_t* fragment, pixel_t* fb);
    470 static void blend_factor(context_t* c, pixel_t* r, uint32_t factor,
    471         const pixel_t* src, const pixel_t* dst);
    472 static void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv);
    473 
    474 #if ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
    475 
// Intentionally empty: with code generation enabled the generic pixel
// pipeline is not compiled (per the original note, it can't be reached).
// This stub only keeps the `scanline` symbol defined, since it is still
// assigned in ggl_init_scanline().
void scanline(context_t*)
{
}
    480 
    481 #else
    482 
    483 void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv)
    484 {
    485     if (su && sv) {
    486         if (su > sv) {
    487             v = ggl_expand(v, sv, su);
    488             sv = su;
    489         } else if (su < sv) {
    490             u = ggl_expand(u, su, sv);
    491             su = sv;
    492         }
    493     }
    494 }
    495 
    496 void blending(context_t* c, pixel_t* fragment, pixel_t* fb)
    497 {
    498     rescale(fragment->c[0], fragment->s[0], fb->c[0], fb->s[0]);
    499     rescale(fragment->c[1], fragment->s[1], fb->c[1], fb->s[1]);
    500     rescale(fragment->c[2], fragment->s[2], fb->c[2], fb->s[2]);
    501     rescale(fragment->c[3], fragment->s[3], fb->c[3], fb->s[3]);
    502 
    503     pixel_t sf, df;
    504     blend_factor(c, &sf, c->state.blend.src, fragment, fb);
    505     blend_factor(c, &df, c->state.blend.dst, fragment, fb);
    506 
    507     fragment->c[1] =
    508             gglMulAddx(fragment->c[1], sf.c[1], gglMulx(fb->c[1], df.c[1]));
    509     fragment->c[2] =
    510             gglMulAddx(fragment->c[2], sf.c[2], gglMulx(fb->c[2], df.c[2]));
    511     fragment->c[3] =
    512             gglMulAddx(fragment->c[3], sf.c[3], gglMulx(fb->c[3], df.c[3]));
    513 
    514     if (c->state.blend.alpha_separate) {
    515         blend_factor(c, &sf, c->state.blend.src_alpha, fragment, fb);
    516         blend_factor(c, &df, c->state.blend.dst_alpha, fragment, fb);
    517     }
    518 
    519     fragment->c[0] =
    520             gglMulAddx(fragment->c[0], sf.c[0], gglMulx(fb->c[0], df.c[0]));
    521 
    522     // clamp to 1.0
    523     if (fragment->c[0] >= (1LU<<fragment->s[0]))
    524         fragment->c[0] = (1<<fragment->s[0])-1;
    525     if (fragment->c[1] >= (1LU<<fragment->s[1]))
    526         fragment->c[1] = (1<<fragment->s[1])-1;
    527     if (fragment->c[2] >= (1LU<<fragment->s[2]))
    528         fragment->c[2] = (1<<fragment->s[2])-1;
    529     if (fragment->c[3] >= (1LU<<fragment->s[3]))
    530         fragment->c[3] = (1<<fragment->s[3])-1;
    531 }
    532 
    533 static inline int blendfactor(uint32_t x, uint32_t size, uint32_t def = 0)
    534 {
    535     if (!size)
    536         return def;
    537 
    538     // scale to 16 bits
    539     if (size > 16) {
    540         x >>= (size - 16);
    541     } else if (size < 16) {
    542         x = ggl_expand(x, size, 16);
    543     }
    544     x += x >> 15;
    545     return x;
    546 }
    547 
    548 void blend_factor(context_t* /*c*/, pixel_t* r,
    549         uint32_t factor, const pixel_t* src, const pixel_t* dst)
    550 {
    551     switch (factor) {
    552         case GGL_ZERO:
    553             r->c[1] =
    554             r->c[2] =
    555             r->c[3] =
    556             r->c[0] = 0;
    557             break;
    558         case GGL_ONE:
    559             r->c[1] =
    560             r->c[2] =
    561             r->c[3] =
    562             r->c[0] = FIXED_ONE;
    563             break;
    564         case GGL_DST_COLOR:
    565             r->c[1] = blendfactor(dst->c[1], dst->s[1]);
    566             r->c[2] = blendfactor(dst->c[2], dst->s[2]);
    567             r->c[3] = blendfactor(dst->c[3], dst->s[3]);
    568             r->c[0] = blendfactor(dst->c[0], dst->s[0]);
    569             break;
    570         case GGL_SRC_COLOR:
    571             r->c[1] = blendfactor(src->c[1], src->s[1]);
    572             r->c[2] = blendfactor(src->c[2], src->s[2]);
    573             r->c[3] = blendfactor(src->c[3], src->s[3]);
    574             r->c[0] = blendfactor(src->c[0], src->s[0]);
    575             break;
    576         case GGL_ONE_MINUS_DST_COLOR:
    577             r->c[1] = FIXED_ONE - blendfactor(dst->c[1], dst->s[1]);
    578             r->c[2] = FIXED_ONE - blendfactor(dst->c[2], dst->s[2]);
    579             r->c[3] = FIXED_ONE - blendfactor(dst->c[3], dst->s[3]);
    580             r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0]);
    581             break;
    582         case GGL_ONE_MINUS_SRC_COLOR:
    583             r->c[1] = FIXED_ONE - blendfactor(src->c[1], src->s[1]);
    584             r->c[2] = FIXED_ONE - blendfactor(src->c[2], src->s[2]);
    585             r->c[3] = FIXED_ONE - blendfactor(src->c[3], src->s[3]);
    586             r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0]);
    587             break;
    588         case GGL_SRC_ALPHA:
    589             r->c[1] =
    590             r->c[2] =
    591             r->c[3] =
    592             r->c[0] = blendfactor(src->c[0], src->s[0], FIXED_ONE);
    593             break;
    594         case GGL_ONE_MINUS_SRC_ALPHA:
    595             r->c[1] =
    596             r->c[2] =
    597             r->c[3] =
    598             r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0], FIXED_ONE);
    599             break;
    600         case GGL_DST_ALPHA:
    601             r->c[1] =
    602             r->c[2] =
    603             r->c[3] =
    604             r->c[0] = blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
    605             break;
    606         case GGL_ONE_MINUS_DST_ALPHA:
    607             r->c[1] =
    608             r->c[2] =
    609             r->c[3] =
    610             r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
    611             break;
    612         case GGL_SRC_ALPHA_SATURATE:
    613             // XXX: GGL_SRC_ALPHA_SATURATE
    614             break;
    615     }
    616 }
    617 
    618 static GGLfixed wrapping(int32_t coord, uint32_t size, int tx_wrap)
    619 {
    620     GGLfixed d;
    621     if (tx_wrap == GGL_REPEAT) {
    622         d = (uint32_t(coord)>>16) * size;
    623     } else if (tx_wrap == GGL_CLAMP) { // CLAMP_TO_EDGE semantics
    624         const GGLfixed clamp_min = FIXED_HALF;
    625         const GGLfixed clamp_max = (size << 16) - FIXED_HALF;
    626         if (coord < clamp_min)     coord = clamp_min;
    627         if (coord > clamp_max)     coord = clamp_max;
    628         d = coord;
    629     } else { // 1:1
    630         const GGLfixed clamp_min = 0;
    631         const GGLfixed clamp_max = (size << 16);
    632         if (coord < clamp_min)     coord = clamp_min;
    633         if (coord > clamp_max)     coord = clamp_max;
    634         d = coord;
    635     }
    636     return d;
    637 }
    638 
    639 static inline
    640 GGLcolor ADJUST_COLOR_ITERATOR(GGLcolor v, GGLcolor dvdx, int len)
    641 {
    642     const int32_t end = dvdx * (len-1) + v;
    643     if (end < 0)
    644         v -= end;
    645     v &= ~(v>>31);
    646     return v;
    647 }
    648 
    649 void scanline(context_t* c)
    650 {
    651     const uint32_t enables = c->state.enables;
    652     const int xs = c->iterators.xl;
    653     const int x1 = c->iterators.xr;
    654 	int xc = x1 - xs;
    655     const int16_t* covPtr = c->state.buffers.coverage + xs;
    656 
    657     // All iterated values are sampled at the pixel center
    658 
    659     // reset iterators for that scanline...
    660     GGLcolor r, g, b, a;
    661     iterators_t& ci = c->iterators;
    662     if (enables & GGL_ENABLE_SMOOTH) {
    663         r = (xs * c->shade.drdx) + ci.ydrdy;
    664         g = (xs * c->shade.dgdx) + ci.ydgdy;
    665         b = (xs * c->shade.dbdx) + ci.ydbdy;
    666         a = (xs * c->shade.dadx) + ci.ydady;
    667         r = ADJUST_COLOR_ITERATOR(r, c->shade.drdx, xc);
    668         g = ADJUST_COLOR_ITERATOR(g, c->shade.dgdx, xc);
    669         b = ADJUST_COLOR_ITERATOR(b, c->shade.dbdx, xc);
    670         a = ADJUST_COLOR_ITERATOR(a, c->shade.dadx, xc);
    671     } else {
    672         r = ci.ydrdy;
    673         g = ci.ydgdy;
    674         b = ci.ydbdy;
    675         a = ci.ydady;
    676     }
    677 
    678     // z iterators are 1.31
    679     GGLfixed z = (xs * c->shade.dzdx) + ci.ydzdy;
    680     GGLfixed f = (xs * c->shade.dfdx) + ci.ydfdy;
    681 
    682     struct {
    683         GGLfixed s, t;
    684     } tc[GGL_TEXTURE_UNIT_COUNT];
    685     if (enables & GGL_ENABLE_TMUS) {
    686         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
    687             if (c->state.texture[i].enable) {
    688                 texture_iterators_t& ti = c->state.texture[i].iterators;
    689                 if (enables & GGL_ENABLE_W) {
    690                     tc[i].s = ti.ydsdy;
    691                     tc[i].t = ti.ydtdy;
    692                 } else {
    693                     tc[i].s = (xs * ti.dsdx) + ti.ydsdy;
    694                     tc[i].t = (xs * ti.dtdx) + ti.ydtdy;
    695                 }
    696             }
    697         }
    698     }
    699 
    700     pixel_t fragment;
    701     pixel_t texel;
    702     pixel_t fb;
    703 
    704 	uint32_t x = xs;
    705 	uint32_t y = c->iterators.y;
    706 
    707 	while (xc--) {
    708 
    709         { // just a scope
    710 
    711 		// read color (convert to 8 bits by keeping only the integer part)
    712         fragment.s[1] = fragment.s[2] =
    713         fragment.s[3] = fragment.s[0] = 8;
    714         fragment.c[1] = r >> (GGL_COLOR_BITS-8);
    715         fragment.c[2] = g >> (GGL_COLOR_BITS-8);
    716         fragment.c[3] = b >> (GGL_COLOR_BITS-8);
    717         fragment.c[0] = a >> (GGL_COLOR_BITS-8);
    718 
    719 		// texturing
    720         if (enables & GGL_ENABLE_TMUS) {
    721             for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
    722                 texture_t& tx = c->state.texture[i];
    723                 if (!tx.enable)
    724                     continue;
    725                 texture_iterators_t& ti = tx.iterators;
    726                 int32_t u, v;
    727 
    728                 // s-coordinate
    729                 if (tx.s_coord != GGL_ONE_TO_ONE) {
    730                     const int w = tx.surface.width;
    731                     u = wrapping(tc[i].s, w, tx.s_wrap);
    732                     tc[i].s += ti.dsdx;
    733                 } else {
    734                     u = (((tx.shade.is0>>16) + x)<<16) + FIXED_HALF;
    735                 }
    736 
    737                 // t-coordinate
    738                 if (tx.t_coord != GGL_ONE_TO_ONE) {
    739                     const int h = tx.surface.height;
    740                     v = wrapping(tc[i].t, h, tx.t_wrap);
    741                     tc[i].t += ti.dtdx;
    742                 } else {
    743                     v = (((tx.shade.it0>>16) + y)<<16) + FIXED_HALF;
    744                 }
    745 
    746                 // read texture
    747                 if (tx.mag_filter == GGL_NEAREST &&
    748                     tx.min_filter == GGL_NEAREST)
    749                 {
    750                     u >>= 16;
    751                     v >>= 16;
    752                     tx.surface.read(&tx.surface, c, u, v, &texel);
    753                 } else {
    754                     const int w = tx.surface.width;
    755                     const int h = tx.surface.height;
    756                     u -= FIXED_HALF;
    757                     v -= FIXED_HALF;
    758                     int u0 = u >> 16;
    759                     int v0 = v >> 16;
    760                     int u1 = u0 + 1;
    761                     int v1 = v0 + 1;
    762                     if (tx.s_wrap == GGL_REPEAT) {
    763                         if (u0<0)  u0 += w;
    764                         if (u1<0)  u1 += w;
    765                         if (u0>=w) u0 -= w;
    766                         if (u1>=w) u1 -= w;
    767                     } else {
    768                         if (u0<0)  u0 = 0;
    769                         if (u1<0)  u1 = 0;
    770                         if (u0>=w) u0 = w-1;
    771                         if (u1>=w) u1 = w-1;
    772                     }
    773                     if (tx.t_wrap == GGL_REPEAT) {
    774                         if (v0<0)  v0 += h;
    775                         if (v1<0)  v1 += h;
    776                         if (v0>=h) v0 -= h;
    777                         if (v1>=h) v1 -= h;
    778                     } else {
    779                         if (v0<0)  v0 = 0;
    780                         if (v1<0)  v1 = 0;
    781                         if (v0>=h) v0 = h-1;
    782                         if (v1>=h) v1 = h-1;
    783                     }
    784                     pixel_t texels[4];
    785                     uint32_t mm[4];
    786                     tx.surface.read(&tx.surface, c, u0, v0, &texels[0]);
    787                     tx.surface.read(&tx.surface, c, u0, v1, &texels[1]);
    788                     tx.surface.read(&tx.surface, c, u1, v0, &texels[2]);
    789                     tx.surface.read(&tx.surface, c, u1, v1, &texels[3]);
    790                     u = (u >> 12) & 0xF;
    791                     v = (v >> 12) & 0xF;
    792                     u += u>>3;
    793                     v += v>>3;
    794                     mm[0] = (0x10 - u) * (0x10 - v);
    795                     mm[1] = (0x10 - u) * v;
    796                     mm[2] = u * (0x10 - v);
    797                     mm[3] = 0x100 - (mm[0] + mm[1] + mm[2]);
    798                     for (int j=0 ; j<4 ; j++) {
    799                         texel.s[j] = texels[0].s[j];
    800                         if (!texel.s[j]) continue;
    801                         texel.s[j] += 8;
    802                         texel.c[j] =    texels[0].c[j]*mm[0] +
    803                                         texels[1].c[j]*mm[1] +
    804                                         texels[2].c[j]*mm[2] +
    805                                         texels[3].c[j]*mm[3] ;
    806                     }
    807                 }
    808 
    809                 // Texture environnement...
    810                 for (int j=0 ; j<4 ; j++) {
    811                     uint32_t& Cf = fragment.c[j];
    812                     uint32_t& Ct = texel.c[j];
    813                     uint8_t& sf  = fragment.s[j];
    814                     uint8_t& st  = texel.s[j];
    815                     uint32_t At = texel.c[0];
    816                     uint8_t sat = texel.s[0];
    817                     switch (tx.env) {
    818                     case GGL_REPLACE:
    819                         if (st) {
    820                             Cf = Ct;
    821                             sf = st;
    822                         }
    823                         break;
    824                     case GGL_MODULATE:
    825                         if (st) {
    826                             uint32_t factor = Ct + (Ct>>(st-1));
    827                             Cf = (Cf * factor) >> st;
    828                         }
    829                         break;
    830                     case GGL_DECAL:
    831                         if (sat) {
    832                             rescale(Cf, sf, Ct, st);
    833                             Cf += ((Ct - Cf) * (At + (At>>(sat-1)))) >> sat;
    834                         }
    835                         break;
    836                     case GGL_BLEND:
    837                         if (st) {
    838                             uint32_t Cc = tx.env_color[i];
    839                             if (sf>8)       Cc = (Cc * ((1<<sf)-1))>>8;
    840                             else if (sf<8)  Cc = (Cc - (Cc>>(8-sf)))>>(8-sf);
    841                             uint32_t factor = Ct + (Ct>>(st-1));
    842                             Cf = ((((1<<st) - factor) * Cf) + Ct*Cc)>>st;
    843                         }
    844                         break;
    845                     case GGL_ADD:
    846                         if (st) {
    847                             rescale(Cf, sf, Ct, st);
    848                             Cf += Ct;
    849                         }
    850                         break;
    851                     }
    852                 }
    853             }
    854 		}
    855 
    856         // coverage application
    857         if (enables & GGL_ENABLE_AA) {
    858             int16_t cf = *covPtr++;
    859             fragment.c[0] = (int64_t(fragment.c[0]) * cf) >> 15;
    860         }
    861 
    862         // alpha-test
    863         if (enables & GGL_ENABLE_ALPHA_TEST) {
    864             GGLcolor ref = c->state.alpha_test.ref;
    865             GGLcolor alpha = (uint64_t(fragment.c[0]) *
    866                     ((1<<GGL_COLOR_BITS)-1)) / ((1<<fragment.s[0])-1);
    867             switch (c->state.alpha_test.func) {
    868             case GGL_NEVER:     goto discard;
    869             case GGL_LESS:      if (alpha<ref)  break; goto discard;
    870             case GGL_EQUAL:     if (alpha==ref) break; goto discard;
    871             case GGL_LEQUAL:    if (alpha<=ref) break; goto discard;
    872             case GGL_GREATER:   if (alpha>ref)  break; goto discard;
    873             case GGL_NOTEQUAL:  if (alpha!=ref) break; goto discard;
    874             case GGL_GEQUAL:    if (alpha>=ref) break; goto discard;
    875             }
    876         }
    877 
    878         // depth test
    879         if (c->state.buffers.depth.format) {
    880             if (enables & GGL_ENABLE_DEPTH_TEST) {
    881                 surface_t* cb = &(c->state.buffers.depth);
    882                 uint16_t* p = (uint16_t*)(cb->data)+(x+(cb->stride*y));
    883                 uint16_t zz = uint32_t(z)>>(16);
    884                 uint16_t depth = *p;
    885                 switch (c->state.depth_test.func) {
    886                 case GGL_NEVER:     goto discard;
    887                 case GGL_LESS:      if (zz<depth)    break; goto discard;
    888                 case GGL_EQUAL:     if (zz==depth)   break; goto discard;
    889                 case GGL_LEQUAL:    if (zz<=depth)   break; goto discard;
    890                 case GGL_GREATER:   if (zz>depth)    break; goto discard;
    891                 case GGL_NOTEQUAL:  if (zz!=depth)   break; goto discard;
    892                 case GGL_GEQUAL:    if (zz>=depth)   break; goto discard;
    893                 }
    894                 // depth buffer is not enabled, if depth-test is not enabled
    895 /*
    896         fragment.s[1] = fragment.s[2] =
    897         fragment.s[3] = fragment.s[0] = 8;
    898         fragment.c[1] =
    899         fragment.c[2] =
    900         fragment.c[3] =
    901         fragment.c[0] = 255 - (zz>>8);
    902 */
    903                 if (c->state.mask.depth) {
    904                     *p = zz;
    905                 }
    906             }
    907         }
    908 
    909         // fog
    910         if (enables & GGL_ENABLE_FOG) {
    911             for (int i=1 ; i<=3 ; i++) {
    912                 GGLfixed fc = (c->state.fog.color[i] * 0x10000) / 0xFF;
    913                 uint32_t& c = fragment.c[i];
    914                 uint8_t& s  = fragment.s[i];
    915                 c = (c * 0x10000) / ((1<<s)-1);
    916                 c = gglMulAddx(c, f, gglMulx(fc, 0x10000 - f));
    917                 s = 16;
    918             }
    919         }
    920 
    921         // blending
    922         if (enables & GGL_ENABLE_BLENDING) {
    923             fb.c[1] = fb.c[2] = fb.c[3] = fb.c[0] = 0; // placate valgrind
    924             fb.s[1] = fb.s[2] = fb.s[3] = fb.s[0] = 0;
    925             c->state.buffers.color.read(
    926                     &(c->state.buffers.color), c, x, y, &fb);
    927             blending( c, &fragment, &fb );
    928         }
    929 
    930 		// write
    931         c->state.buffers.color.write(
    932                 &(c->state.buffers.color), c, x, y, &fragment);
    933         }
    934 
    935 discard:
    936 		// iterate...
    937         x += 1;
    938         if (enables & GGL_ENABLE_SMOOTH) {
    939             r += c->shade.drdx;
    940             g += c->shade.dgdx;
    941             b += c->shade.dbdx;
    942             a += c->shade.dadx;
    943         }
    944         z += c->shade.dzdx;
    945         f += c->shade.dfdx;
    946 	}
    947 }
    948 
    949 #endif // ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
    950 
    951 // ----------------------------------------------------------------------------
    952 #if 0
    953 #pragma mark -
    954 #pragma mark Scanline
    955 #endif
    956 
    957 /* Used to parse a 32-bit source texture linearly. Usage is:
    958  *
    959  * horz_iterator32  hi(context);
    960  * while (...) {
    961  *    uint32_t  src_pixel = hi.get_pixel32();
    962  *    ...
    963  * }
    964  *
    965  * Use only for one-to-one texture mapping.
    966  */
    967 struct horz_iterator32 {
    968     horz_iterator32(context_t* c) {
    969         const int x = c->iterators.xl;
    970         const int y = c->iterators.y;
    971         texture_t& tx = c->state.texture[0];
    972         const int32_t u = (tx.shade.is0>>16) + x;
    973         const int32_t v = (tx.shade.it0>>16) + y;
    974         m_src = reinterpret_cast<uint32_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
    975     }
    976     uint32_t  get_pixel32() {
    977         return *m_src++;
    978     }
    979 protected:
    980     uint32_t* m_src;
    981 };
    982 
    983 /* A variant for 16-bit source textures. */
    984 struct horz_iterator16 {
    985     horz_iterator16(context_t* c) {
    986         const int x = c->iterators.xl;
    987         const int y = c->iterators.y;
    988         texture_t& tx = c->state.texture[0];
    989         const int32_t u = (tx.shade.is0>>16) + x;
    990         const int32_t v = (tx.shade.it0>>16) + y;
    991         m_src = reinterpret_cast<uint16_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
    992     }
    993     uint16_t  get_pixel16() {
    994         return *m_src++;
    995     }
    996 protected:
    997     uint16_t* m_src;
    998 };
    999 
   1000 /* A clamp iterator is used to iterate inside a texture with GGL_CLAMP.
   1001  * After initialization, call get_src16() or get_src32() to get the current
   1002  * texture pixel value.
   1003  */
   1004 struct clamp_iterator {
   1005     clamp_iterator(context_t* c) {
   1006         const int xs = c->iterators.xl;
   1007         texture_t& tx = c->state.texture[0];
   1008         texture_iterators_t& ti = tx.iterators;
   1009         m_s = (xs * ti.dsdx) + ti.ydsdy;
   1010         m_t = (xs * ti.dtdx) + ti.ydtdy;
   1011         m_ds = ti.dsdx;
   1012         m_dt = ti.dtdx;
   1013         m_width_m1 = tx.surface.width - 1;
   1014         m_height_m1 = tx.surface.height - 1;
   1015         m_data = tx.surface.data;
   1016         m_stride = tx.surface.stride;
   1017     }
   1018     uint16_t get_pixel16() {
   1019         int  u, v;
   1020         get_uv(u, v);
   1021         uint16_t* src = reinterpret_cast<uint16_t*>(m_data) + (u + (m_stride*v));
   1022         return src[0];
   1023     }
   1024     uint32_t get_pixel32() {
   1025         int  u, v;
   1026         get_uv(u, v);
   1027         uint32_t* src = reinterpret_cast<uint32_t*>(m_data) + (u + (m_stride*v));
   1028         return src[0];
   1029     }
   1030 private:
   1031     void   get_uv(int& u, int& v) {
   1032         int  uu = m_s >> 16;
   1033         int  vv = m_t >> 16;
   1034         if (uu < 0)
   1035             uu = 0;
   1036         if (uu > m_width_m1)
   1037             uu = m_width_m1;
   1038         if (vv < 0)
   1039             vv = 0;
   1040         if (vv > m_height_m1)
   1041             vv = m_height_m1;
   1042         u = uu;
   1043         v = vv;
   1044         m_s += m_ds;
   1045         m_t += m_dt;
   1046     }
   1047 
   1048     GGLfixed  m_s, m_t;
   1049     GGLfixed  m_ds, m_dt;
   1050     int       m_width_m1, m_height_m1;
   1051     uint8_t*  m_data;
   1052     int       m_stride;
   1053 };
   1054 
/*
 * The 'horizontal clamp iterator' variant corresponds to the case where
 * the 'v' coordinate doesn't change. This is useful to avoid one multiply
 * and extra adds / checks per pixel, if the blending/processing operation
 * after this is very fast.
 */
   1061 static int is_context_horizontal(const context_t* c) {
   1062     return (c->state.texture[0].iterators.dtdx == 0);
   1063 }
   1064 
   1065 struct horz_clamp_iterator {
   1066     uint16_t  get_pixel16() {
   1067         int  u = m_s >> 16;
   1068         m_s += m_ds;
   1069         if (u < 0)
   1070             u = 0;
   1071         if (u > m_width_m1)
   1072             u = m_width_m1;
   1073         const uint16_t* src = reinterpret_cast<const uint16_t*>(m_data);
   1074         return src[u];
   1075     }
   1076     uint32_t  get_pixel32() {
   1077         int  u = m_s >> 16;
   1078         m_s += m_ds;
   1079         if (u < 0)
   1080             u = 0;
   1081         if (u > m_width_m1)
   1082             u = m_width_m1;
   1083         const uint32_t* src = reinterpret_cast<const uint32_t*>(m_data);
   1084         return src[u];
   1085     }
   1086 protected:
   1087     void init(const context_t* c, int shift);
   1088     GGLfixed       m_s;
   1089     GGLfixed       m_ds;
   1090     int            m_width_m1;
   1091     const uint8_t* m_data;
   1092 };
   1093 
   1094 void horz_clamp_iterator::init(const context_t* c, int shift)
   1095 {
   1096     const int xs = c->iterators.xl;
   1097     const texture_t& tx = c->state.texture[0];
   1098     const texture_iterators_t& ti = tx.iterators;
   1099     m_s = (xs * ti.dsdx) + ti.ydsdy;
   1100     m_ds = ti.dsdx;
   1101     m_width_m1 = tx.surface.width-1;
   1102     m_data = tx.surface.data;
   1103 
   1104     GGLfixed t = (xs * ti.dtdx) + ti.ydtdy;
   1105     int      v = t >> 16;
   1106     if (v < 0)
   1107         v = 0;
   1108     else if (v >= (int)tx.surface.height)
   1109         v = (int)tx.surface.height-1;
   1110 
   1111     m_data += (tx.surface.stride*v) << shift;
   1112 }
   1113 
   1114 struct horz_clamp_iterator16 : horz_clamp_iterator {
   1115     horz_clamp_iterator16(const context_t* c) {
   1116         init(c,1);
   1117     };
   1118 };
   1119 
   1120 struct horz_clamp_iterator32 : horz_clamp_iterator {
   1121     horz_clamp_iterator32(context_t* c) {
   1122         init(c,2);
   1123     };
   1124 };
   1125 
   1126 /* This is used to perform dithering operations.
   1127  */
   1128 struct ditherer {
   1129     ditherer(const context_t* c) {
   1130         const int x = c->iterators.xl;
   1131         const int y = c->iterators.y;
   1132         m_line = &c->ditherMatrix[ ((y & GGL_DITHER_MASK)<<GGL_DITHER_ORDER_SHIFT) ];
   1133         m_index = x & GGL_DITHER_MASK;
   1134     }
   1135     void step(void) {
   1136         m_index++;
   1137     }
   1138     int  get_value(void) {
   1139         int ret = m_line[m_index & GGL_DITHER_MASK];
   1140         m_index++;
   1141         return ret;
   1142     }
   1143     uint16_t abgr8888ToRgb565(uint32_t s) {
   1144         uint32_t r = s & 0xff;
   1145         uint32_t g = (s >> 8) & 0xff;
   1146         uint32_t b = (s >> 16) & 0xff;
   1147         return rgb888ToRgb565(r,g,b);
   1148     }
   1149     /* The following assumes that r/g/b are in the 0..255 range each */
   1150     uint16_t rgb888ToRgb565(uint32_t& r, uint32_t& g, uint32_t &b) {
   1151         int threshold = get_value();
   1152         /* dither in on GGL_DITHER_BITS, and each of r, g, b is on 8 bits */
   1153         r += (threshold >> (GGL_DITHER_BITS-8 +5));
   1154         g += (threshold >> (GGL_DITHER_BITS-8 +6));
   1155         b += (threshold >> (GGL_DITHER_BITS-8 +5));
   1156         if (r > 0xff)
   1157             r = 0xff;
   1158         if (g > 0xff)
   1159             g = 0xff;
   1160         if (b > 0xff)
   1161             b = 0xff;
   1162         return uint16_t(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
   1163     }
   1164 protected:
   1165     const uint8_t* m_line;
   1166     int            m_index;
   1167 };
   1168 
   1169 /* This structure is used to blend (SRC_OVER) 32-bit source pixels
   1170  * onto 16-bit destination ones. Usage is simply:
   1171  *
   1172  *   blender.blend(<32-bit-src-pixel-value>,<ptr-to-16-bit-dest-pixel>)
   1173  */
   1174 struct blender_32to16 {
   1175     blender_32to16(context_t* /*c*/) { }
   1176     void write(uint32_t s, uint16_t* dst) {
   1177         if (s == 0)
   1178             return;
   1179         s = GGL_RGBA_TO_HOST(s);
   1180         int sA = (s>>24);
   1181         if (sA == 0xff) {
   1182             *dst = convertAbgr8888ToRgb565(s);
   1183         } else {
   1184             int f = 0x100 - (sA + (sA>>7));
   1185             int sR = (s >> (   3))&0x1F;
   1186             int sG = (s >> ( 8+2))&0x3F;
   1187             int sB = (s >> (16+3))&0x1F;
   1188             uint16_t d = *dst;
   1189             int dR = (d>>11)&0x1f;
   1190             int dG = (d>>5)&0x3f;
   1191             int dB = (d)&0x1f;
   1192             sR += (f*dR)>>8;
   1193             sG += (f*dG)>>8;
   1194             sB += (f*dB)>>8;
   1195             *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1196         }
   1197     }
   1198     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1199         if (s == 0) {
   1200             di.step();
   1201             return;
   1202         }
   1203         s = GGL_RGBA_TO_HOST(s);
   1204         int sA = (s>>24);
   1205         if (sA == 0xff) {
   1206             *dst = di.abgr8888ToRgb565(s);
   1207         } else {
   1208             int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1209             int f = 0x100 - (sA + (sA>>7));
   1210             int sR = (s >> (   3))&0x1F;
   1211             int sG = (s >> ( 8+2))&0x3F;
   1212             int sB = (s >> (16+3))&0x1F;
   1213             uint16_t d = *dst;
   1214             int dR = (d>>11)&0x1f;
   1215             int dG = (d>>5)&0x3f;
   1216             int dB = (d)&0x1f;
   1217             sR = ((sR << 8) + f*dR + threshold)>>8;
   1218             sG = ((sG << 8) + f*dG + threshold)>>8;
   1219             sB = ((sB << 8) + f*dB + threshold)>>8;
   1220             if (sR > 0x1f) sR = 0x1f;
   1221             if (sG > 0x3f) sG = 0x3f;
   1222             if (sB > 0x1f) sB = 0x1f;
   1223             *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1224         }
   1225     }
   1226 };
   1227 
   1228 /* This blender does the same for the 'blend_srca' operation.
   1229  * where dstFactor=srcA*(1-srcA) srcFactor=srcA
   1230  */
   1231 struct blender_32to16_srcA {
   1232     blender_32to16_srcA(const context_t* /*c*/) { }
   1233     void write(uint32_t s, uint16_t* dst) {
   1234         if (!s) {
   1235             return;
   1236         }
   1237         uint16_t d = *dst;
   1238         s = GGL_RGBA_TO_HOST(s);
   1239         int sR = (s >> (   3))&0x1F;
   1240         int sG = (s >> ( 8+2))&0x3F;
   1241         int sB = (s >> (16+3))&0x1F;
   1242         int sA = (s>>24);
   1243         int f1 = (sA + (sA>>7));
   1244         int f2 = 0x100-f1;
   1245         int dR = (d>>11)&0x1f;
   1246         int dG = (d>>5)&0x3f;
   1247         int dB = (d)&0x1f;
   1248         sR = (f1*sR + f2*dR)>>8;
   1249         sG = (f1*sG + f2*dG)>>8;
   1250         sB = (f1*sB + f2*dB)>>8;
   1251         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1252     }
   1253 };
   1254 
   1255 /* Common init code the modulating blenders */
   1256 struct blender_modulate {
   1257     void init(const context_t* c) {
   1258         const int r = c->iterators.ydrdy >> (GGL_COLOR_BITS-8);
   1259         const int g = c->iterators.ydgdy >> (GGL_COLOR_BITS-8);
   1260         const int b = c->iterators.ydbdy >> (GGL_COLOR_BITS-8);
   1261         const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
   1262         m_r = r + (r >> 7);
   1263         m_g = g + (g >> 7);
   1264         m_b = b + (b >> 7);
   1265         m_a = a + (a >> 7);
   1266     }
   1267 protected:
   1268     int m_r, m_g, m_b, m_a;
   1269 };
   1270 
   1271 /* This blender does a normal blend after modulation.
   1272  */
   1273 struct blender_32to16_modulate : blender_modulate {
   1274     blender_32to16_modulate(const context_t* c) {
   1275         init(c);
   1276     }
   1277     void write(uint32_t s, uint16_t* dst) {
   1278         // blend source and destination
   1279         if (!s) {
   1280             return;
   1281         }
   1282         s = GGL_RGBA_TO_HOST(s);
   1283 
   1284         /* We need to modulate s */
   1285         uint32_t  sA = (s >> 24);
   1286         uint32_t  sB = (s >> 16) & 0xff;
   1287         uint32_t  sG = (s >> 8) & 0xff;
   1288         uint32_t  sR = s & 0xff;
   1289 
   1290         sA = (sA*m_a) >> 8;
   1291         /* Keep R/G/B scaled to 5.8 or 6.8 fixed float format */
   1292         sR = (sR*m_r) >> (8 - 5);
   1293         sG = (sG*m_g) >> (8 - 6);
   1294         sB = (sB*m_b) >> (8 - 5);
   1295 
   1296         /* Now do a normal blend */
   1297         int f = 0x100 - (sA + (sA>>7));
   1298         uint16_t d = *dst;
   1299         int dR = (d>>11)&0x1f;
   1300         int dG = (d>>5)&0x3f;
   1301         int dB = (d)&0x1f;
   1302         sR = (sR + f*dR)>>8;
   1303         sG = (sG + f*dG)>>8;
   1304         sB = (sB + f*dB)>>8;
   1305         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1306     }
   1307     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1308         // blend source and destination
   1309         if (!s) {
   1310             di.step();
   1311             return;
   1312         }
   1313         s = GGL_RGBA_TO_HOST(s);
   1314 
   1315         /* We need to modulate s */
   1316         uint32_t  sA = (s >> 24);
   1317         uint32_t  sB = (s >> 16) & 0xff;
   1318         uint32_t  sG = (s >> 8) & 0xff;
   1319         uint32_t  sR = s & 0xff;
   1320 
   1321         sA = (sA*m_a) >> 8;
   1322         /* keep R/G/B scaled to 5.8 or 6.8 fixed float format */
   1323         sR = (sR*m_r) >> (8 - 5);
   1324         sG = (sG*m_g) >> (8 - 6);
   1325         sB = (sB*m_b) >> (8 - 5);
   1326 
   1327         /* Scale threshold to 0.8 fixed float format */
   1328         int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1329         int f = 0x100 - (sA + (sA>>7));
   1330         uint16_t d = *dst;
   1331         int dR = (d>>11)&0x1f;
   1332         int dG = (d>>5)&0x3f;
   1333         int dB = (d)&0x1f;
   1334         sR = (sR + f*dR + threshold)>>8;
   1335         sG = (sG + f*dG + threshold)>>8;
   1336         sB = (sB + f*dB + threshold)>>8;
   1337         if (sR > 0x1f) sR = 0x1f;
   1338         if (sG > 0x3f) sG = 0x3f;
   1339         if (sB > 0x1f) sB = 0x1f;
   1340         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1341     }
   1342 };
   1343 
   1344 /* same as 32to16_modulate, except that the input is xRGB, instead of ARGB */
   1345 struct blender_x32to16_modulate : blender_modulate {
   1346     blender_x32to16_modulate(const context_t* c) {
   1347         init(c);
   1348     }
   1349     void write(uint32_t s, uint16_t* dst) {
   1350         s = GGL_RGBA_TO_HOST(s);
   1351 
   1352         uint32_t  sB = (s >> 16) & 0xff;
   1353         uint32_t  sG = (s >> 8) & 0xff;
   1354         uint32_t  sR = s & 0xff;
   1355 
   1356         /* Keep R/G/B in 5.8 or 6.8 format */
   1357         sR = (sR*m_r) >> (8 - 5);
   1358         sG = (sG*m_g) >> (8 - 6);
   1359         sB = (sB*m_b) >> (8 - 5);
   1360 
   1361         int f = 0x100 - m_a;
   1362         uint16_t d = *dst;
   1363         int dR = (d>>11)&0x1f;
   1364         int dG = (d>>5)&0x3f;
   1365         int dB = (d)&0x1f;
   1366         sR = (sR + f*dR)>>8;
   1367         sG = (sG + f*dG)>>8;
   1368         sB = (sB + f*dB)>>8;
   1369         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1370     }
   1371     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1372         s = GGL_RGBA_TO_HOST(s);
   1373 
   1374         uint32_t  sB = (s >> 16) & 0xff;
   1375         uint32_t  sG = (s >> 8) & 0xff;
   1376         uint32_t  sR = s & 0xff;
   1377 
   1378         sR = (sR*m_r) >> (8 - 5);
   1379         sG = (sG*m_g) >> (8 - 6);
   1380         sB = (sB*m_b) >> (8 - 5);
   1381 
   1382         /* Now do a normal blend */
   1383         int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1384         int f = 0x100 - m_a;
   1385         uint16_t d = *dst;
   1386         int dR = (d>>11)&0x1f;
   1387         int dG = (d>>5)&0x3f;
   1388         int dB = (d)&0x1f;
   1389         sR = (sR + f*dR + threshold)>>8;
   1390         sG = (sG + f*dG + threshold)>>8;
   1391         sB = (sB + f*dB + threshold)>>8;
   1392         if (sR > 0x1f) sR = 0x1f;
   1393         if (sG > 0x3f) sG = 0x3f;
   1394         if (sB > 0x1f) sB = 0x1f;
   1395         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1396     }
   1397 };
   1398 
   1399 /* Same as above, but source is 16bit rgb565 */
   1400 struct blender_16to16_modulate : blender_modulate {
   1401     blender_16to16_modulate(const context_t* c) {
   1402         init(c);
   1403     }
   1404     void write(uint16_t s16, uint16_t* dst) {
   1405         uint32_t  s = s16;
   1406 
   1407         uint32_t  sR = s >> 11;
   1408         uint32_t  sG = (s >> 5) & 0x3f;
   1409         uint32_t  sB = s & 0x1f;
   1410 
   1411         sR = (sR*m_r);
   1412         sG = (sG*m_g);
   1413         sB = (sB*m_b);
   1414 
   1415         int f = 0x100 - m_a;
   1416         uint16_t d = *dst;
   1417         int dR = (d>>11)&0x1f;
   1418         int dG = (d>>5)&0x3f;
   1419         int dB = (d)&0x1f;
   1420         sR = (sR + f*dR)>>8;
   1421         sG = (sG + f*dG)>>8;
   1422         sB = (sB + f*dB)>>8;
   1423         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1424     }
   1425 };
   1426 
   1427 /* This is used to iterate over a 16-bit destination color buffer.
   1428  * Usage is:
   1429  *
   1430  *   dst_iterator16  di(context);
   1431  *   while (di.count--) {
   1432  *       <do stuff with dest pixel at di.dst>
   1433  *       di.dst++;
   1434  *   }
   1435  */
   1436 struct dst_iterator16 {
   1437     dst_iterator16(const context_t* c) {
   1438         const int x = c->iterators.xl;
   1439         const int width = c->iterators.xr - x;
   1440         const int32_t y = c->iterators.y;
   1441         const surface_t* cb = &(c->state.buffers.color);
   1442         count = width;
   1443         dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   1444     }
   1445     int        count;
   1446     uint16_t*  dst;
   1447 };
   1448 
   1449 
   1450 static void scanline_t32cb16_clamp(context_t* c)
   1451 {
   1452     dst_iterator16  di(c);
   1453 
   1454     if (is_context_horizontal(c)) {
   1455         /* Special case for simple horizontal scaling */
   1456         horz_clamp_iterator32 ci(c);
   1457         while (di.count--) {
   1458             uint32_t s = ci.get_pixel32();
   1459             *di.dst++ = convertAbgr8888ToRgb565(s);
   1460         }
   1461     } else {
   1462         /* General case */
   1463         clamp_iterator ci(c);
   1464         while (di.count--) {
   1465             uint32_t s = ci.get_pixel32();
   1466             *di.dst++ = convertAbgr8888ToRgb565(s);
   1467         }
   1468     }
   1469 }
   1470 
   1471 static void scanline_t32cb16_dither(context_t* c)
   1472 {
   1473     horz_iterator32 si(c);
   1474     dst_iterator16  di(c);
   1475     ditherer        dither(c);
   1476 
   1477     while (di.count--) {
   1478         uint32_t s = si.get_pixel32();
   1479         *di.dst++ = dither.abgr8888ToRgb565(s);
   1480     }
   1481 }
   1482 
   1483 static void scanline_t32cb16_clamp_dither(context_t* c)
   1484 {
   1485     dst_iterator16  di(c);
   1486     ditherer        dither(c);
   1487 
   1488     if (is_context_horizontal(c)) {
   1489         /* Special case for simple horizontal scaling */
   1490         horz_clamp_iterator32 ci(c);
   1491         while (di.count--) {
   1492             uint32_t s = ci.get_pixel32();
   1493             *di.dst++ = dither.abgr8888ToRgb565(s);
   1494         }
   1495     } else {
   1496         /* General case */
   1497         clamp_iterator ci(c);
   1498         while (di.count--) {
   1499             uint32_t s = ci.get_pixel32();
   1500             *di.dst++ = dither.abgr8888ToRgb565(s);
   1501         }
   1502     }
   1503 }
   1504 
   1505 static void scanline_t32cb16blend_dither(context_t* c)
   1506 {
   1507     dst_iterator16 di(c);
   1508     ditherer       dither(c);
   1509     blender_32to16 bl(c);
   1510     horz_iterator32  hi(c);
   1511     while (di.count--) {
   1512         uint32_t s = hi.get_pixel32();
   1513         bl.write(s, di.dst, dither);
   1514         di.dst++;
   1515     }
   1516 }
   1517 
   1518 static void scanline_t32cb16blend_clamp(context_t* c)
   1519 {
   1520     dst_iterator16  di(c);
   1521     blender_32to16  bl(c);
   1522 
   1523     if (is_context_horizontal(c)) {
   1524         horz_clamp_iterator32 ci(c);
   1525         while (di.count--) {
   1526             uint32_t s = ci.get_pixel32();
   1527             bl.write(s, di.dst);
   1528             di.dst++;
   1529         }
   1530     } else {
   1531         clamp_iterator ci(c);
   1532         while (di.count--) {
   1533             uint32_t s = ci.get_pixel32();
   1534             bl.write(s, di.dst);
   1535             di.dst++;
   1536         }
   1537     }
   1538 }
   1539 
   1540 static void scanline_t32cb16blend_clamp_dither(context_t* c)
   1541 {
   1542     dst_iterator16 di(c);
   1543     ditherer       dither(c);
   1544     blender_32to16 bl(c);
   1545 
   1546     clamp_iterator ci(c);
   1547     while (di.count--) {
   1548         uint32_t s = ci.get_pixel32();
   1549         bl.write(s, di.dst, dither);
   1550         di.dst++;
   1551     }
   1552 }
   1553 
   1554 void scanline_t32cb16blend_clamp_mod(context_t* c)
   1555 {
   1556     dst_iterator16 di(c);
   1557     blender_32to16_modulate bl(c);
   1558 
   1559     clamp_iterator ci(c);
   1560     while (di.count--) {
   1561         uint32_t s = ci.get_pixel32();
   1562         bl.write(s, di.dst);
   1563         di.dst++;
   1564     }
   1565 }
   1566 
   1567 void scanline_t32cb16blend_clamp_mod_dither(context_t* c)
   1568 {
   1569     dst_iterator16 di(c);
   1570     blender_32to16_modulate bl(c);
   1571     ditherer dither(c);
   1572 
   1573     clamp_iterator ci(c);
   1574     while (di.count--) {
   1575         uint32_t s = ci.get_pixel32();
   1576         bl.write(s, di.dst, dither);
   1577         di.dst++;
   1578     }
   1579 }
   1580 
   1581 /* Variant of scanline_t32cb16blend_clamp_mod with a xRGB texture */
   1582 void scanline_x32cb16blend_clamp_mod(context_t* c)
   1583 {
   1584     dst_iterator16 di(c);
   1585     blender_x32to16_modulate  bl(c);
   1586 
   1587     clamp_iterator ci(c);
   1588     while (di.count--) {
   1589         uint32_t s = ci.get_pixel32();
   1590         bl.write(s, di.dst);
   1591         di.dst++;
   1592     }
   1593 }
   1594 
   1595 void scanline_x32cb16blend_clamp_mod_dither(context_t* c)
   1596 {
   1597     dst_iterator16 di(c);
   1598     blender_x32to16_modulate  bl(c);
   1599     ditherer dither(c);
   1600 
   1601     clamp_iterator ci(c);
   1602     while (di.count--) {
   1603         uint32_t s = ci.get_pixel32();
   1604         bl.write(s, di.dst, dither);
   1605         di.dst++;
   1606     }
   1607 }
   1608 
   1609 void scanline_t16cb16_clamp(context_t* c)
   1610 {
   1611     dst_iterator16  di(c);
   1612 
   1613     /* Special case for simple horizontal scaling */
   1614     if (is_context_horizontal(c)) {
   1615         horz_clamp_iterator16 ci(c);
   1616         while (di.count--) {
   1617             *di.dst++ = ci.get_pixel16();
   1618         }
   1619     } else {
   1620         clamp_iterator ci(c);
   1621         while (di.count--) {
   1622             *di.dst++ = ci.get_pixel16();
   1623         }
   1624     }
   1625 }
   1626 
   1627 
   1628 
   1629 template <typename T, typename U>
   1630 static inline __attribute__((const))
   1631 T interpolate(int y, T v0, U dvdx, U dvdy) {
   1632     // interpolates in pixel's centers
   1633     // v = v0 + (y + 0.5) * dvdy + (0.5 * dvdx)
   1634     return (y * dvdy) + (v0 + ((dvdy + dvdx) >> 1));
   1635 }
   1636 
   1637 // ----------------------------------------------------------------------------
   1638 #if 0
   1639 #pragma mark -
   1640 #endif
   1641 
   1642 void init_y(context_t* c, int32_t ys)
   1643 {
   1644     const uint32_t enables = c->state.enables;
   1645 
   1646     // compute iterators...
   1647     iterators_t& ci = c->iterators;
   1648 
   1649     // sample in the center
   1650     ci.y = ys;
   1651 
   1652     if (enables & (GGL_ENABLE_DEPTH_TEST|GGL_ENABLE_W|GGL_ENABLE_FOG)) {
   1653         ci.ydzdy = interpolate(ys, c->shade.z0, c->shade.dzdx, c->shade.dzdy);
   1654         ci.ydwdy = interpolate(ys, c->shade.w0, c->shade.dwdx, c->shade.dwdy);
   1655         ci.ydfdy = interpolate(ys, c->shade.f0, c->shade.dfdx, c->shade.dfdy);
   1656     }
   1657 
   1658     if (ggl_unlikely(enables & GGL_ENABLE_SMOOTH)) {
   1659         ci.ydrdy = interpolate(ys, c->shade.r0, c->shade.drdx, c->shade.drdy);
   1660         ci.ydgdy = interpolate(ys, c->shade.g0, c->shade.dgdx, c->shade.dgdy);
   1661         ci.ydbdy = interpolate(ys, c->shade.b0, c->shade.dbdx, c->shade.dbdy);
   1662         ci.ydady = interpolate(ys, c->shade.a0, c->shade.dadx, c->shade.dady);
   1663         c->step_y = step_y__smooth;
   1664     } else {
   1665         ci.ydrdy = c->shade.r0;
   1666         ci.ydgdy = c->shade.g0;
   1667         ci.ydbdy = c->shade.b0;
   1668         ci.ydady = c->shade.a0;
   1669         // XXX: do only if needed, or make sure this is fast
   1670         c->packed = ggl_pack_color(c, c->state.buffers.color.format,
   1671                 ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
   1672         c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
   1673                 ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
   1674     }
   1675 
   1676     // initialize the variables we need in the shader
   1677     generated_vars_t& gen = c->generated_vars;
   1678     gen.argb[GGLFormat::ALPHA].c  = ci.ydady;
   1679     gen.argb[GGLFormat::ALPHA].dx = c->shade.dadx;
   1680     gen.argb[GGLFormat::RED  ].c  = ci.ydrdy;
   1681     gen.argb[GGLFormat::RED  ].dx = c->shade.drdx;
   1682     gen.argb[GGLFormat::GREEN].c  = ci.ydgdy;
   1683     gen.argb[GGLFormat::GREEN].dx = c->shade.dgdx;
   1684     gen.argb[GGLFormat::BLUE ].c  = ci.ydbdy;
   1685     gen.argb[GGLFormat::BLUE ].dx = c->shade.dbdx;
   1686     gen.dzdx = c->shade.dzdx;
   1687     gen.f    = ci.ydfdy;
   1688     gen.dfdx = c->shade.dfdx;
   1689 
   1690     if (enables & GGL_ENABLE_TMUS) {
   1691         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1692             texture_t& t = c->state.texture[i];
   1693             if (!t.enable) continue;
   1694 
   1695             texture_iterators_t& ti = t.iterators;
   1696             if (t.s_coord == GGL_ONE_TO_ONE && t.t_coord == GGL_ONE_TO_ONE) {
   1697                 // we need to set all of these to 0 because in some cases
   1698                 // step_y__generic() or step_y__tmu() will be used and
   1699                 // therefore will update dtdy, however, in 1:1 mode
   1700                 // this is always done by the scanline rasterizer.
   1701                 ti.dsdx = ti.dsdy = ti.dtdx = ti.dtdy = 0;
   1702                 ti.ydsdy = t.shade.is0;
   1703                 ti.ydtdy = t.shade.it0;
   1704             } else {
   1705                 const int adjustSWrap = ((t.s_wrap==GGL_CLAMP)?0:16);
   1706                 const int adjustTWrap = ((t.t_wrap==GGL_CLAMP)?0:16);
   1707                 ti.sscale = t.shade.sscale + adjustSWrap;
   1708                 ti.tscale = t.shade.tscale + adjustTWrap;
   1709                 if (!(enables & GGL_ENABLE_W)) {
   1710                     // S coordinate
   1711                     const int32_t sscale = ti.sscale;
   1712                     const int32_t sy = interpolate(ys,
   1713                             t.shade.is0, t.shade.idsdx, t.shade.idsdy);
   1714                     if (sscale>=0) {
   1715                         ti.ydsdy= sy            << sscale;
   1716                         ti.dsdx = t.shade.idsdx << sscale;
   1717                         ti.dsdy = t.shade.idsdy << sscale;
   1718                     } else {
   1719                         ti.ydsdy= sy            >> -sscale;
   1720                         ti.dsdx = t.shade.idsdx >> -sscale;
   1721                         ti.dsdy = t.shade.idsdy >> -sscale;
   1722                     }
   1723                     // T coordinate
   1724                     const int32_t tscale = ti.tscale;
   1725                     const int32_t ty = interpolate(ys,
   1726                             t.shade.it0, t.shade.idtdx, t.shade.idtdy);
   1727                     if (tscale>=0) {
   1728                         ti.ydtdy= ty            << tscale;
   1729                         ti.dtdx = t.shade.idtdx << tscale;
   1730                         ti.dtdy = t.shade.idtdy << tscale;
   1731                     } else {
   1732                         ti.ydtdy= ty            >> -tscale;
   1733                         ti.dtdx = t.shade.idtdx >> -tscale;
   1734                         ti.dtdy = t.shade.idtdy >> -tscale;
   1735                     }
   1736                 }
   1737             }
   1738             // mirror for generated code...
   1739             generated_tex_vars_t& gen = c->generated_vars.texture[i];
   1740             gen.width   = t.surface.width;
   1741             gen.height  = t.surface.height;
   1742             gen.stride  = t.surface.stride;
   1743             gen.data    = uintptr_t(t.surface.data);
   1744             gen.dsdx = ti.dsdx;
   1745             gen.dtdx = ti.dtdx;
   1746         }
   1747     }
   1748 
   1749     // choose the y-stepper
   1750     c->step_y = step_y__nop;
   1751     if (enables & GGL_ENABLE_FOG) {
   1752         c->step_y = step_y__generic;
   1753     } else if (enables & GGL_ENABLE_TMUS) {
   1754         if (enables & GGL_ENABLE_SMOOTH) {
   1755             c->step_y = step_y__generic;
   1756         } else if (enables & GGL_ENABLE_W) {
   1757             c->step_y = step_y__w;
   1758         } else {
   1759             c->step_y = step_y__tmu;
   1760         }
   1761     } else {
   1762         if (enables & GGL_ENABLE_SMOOTH) {
   1763             c->step_y = step_y__smooth;
   1764         }
   1765     }
   1766 
   1767     // choose the rectangle blitter
   1768     c->rect = rect_generic;
   1769     if ((c->step_y == step_y__nop) &&
   1770         (c->scanline == scanline_memcpy))
   1771     {
   1772         c->rect = rect_memcpy;
   1773     }
   1774 }
   1775 
   1776 void init_y_packed(context_t* c, int32_t y0)
   1777 {
   1778     uint8_t f = c->state.buffers.color.format;
   1779     c->packed = ggl_pack_color(c, f,
   1780             c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
   1781     c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
   1782             c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
   1783     c->iterators.y = y0;
   1784     c->step_y = step_y__nop;
   1785     // choose the rectangle blitter
   1786     c->rect = rect_generic;
   1787     if (c->scanline == scanline_memcpy) {
   1788         c->rect = rect_memcpy;
   1789     }
   1790 }
   1791 
   1792 void init_y_noop(context_t* c, int32_t y0)
   1793 {
   1794     c->iterators.y = y0;
   1795     c->step_y = step_y__nop;
   1796     // choose the rectangle blitter
   1797     c->rect = rect_generic;
   1798     if (c->scanline == scanline_memcpy) {
   1799         c->rect = rect_memcpy;
   1800     }
   1801 }
   1802 
   1803 void init_y_error(context_t* c, int32_t y0)
   1804 {
   1805     // woooops, shoud never happen,
   1806     // fail gracefully (don't display anything)
   1807     init_y_noop(c, y0);
   1808     ALOGE("color-buffer has an invalid format!");
   1809 }
   1810 
   1811 // ----------------------------------------------------------------------------
   1812 #if 0
   1813 #pragma mark -
   1814 #endif
   1815 
   1816 void step_y__generic(context_t* c)
   1817 {
   1818     const uint32_t enables = c->state.enables;
   1819 
   1820     // iterate...
   1821     iterators_t& ci = c->iterators;
   1822     ci.y += 1;
   1823 
   1824     if (enables & GGL_ENABLE_SMOOTH) {
   1825         ci.ydrdy += c->shade.drdy;
   1826         ci.ydgdy += c->shade.dgdy;
   1827         ci.ydbdy += c->shade.dbdy;
   1828         ci.ydady += c->shade.dady;
   1829     }
   1830 
   1831     const uint32_t mask =
   1832             GGL_ENABLE_DEPTH_TEST |
   1833             GGL_ENABLE_W |
   1834             GGL_ENABLE_FOG;
   1835     if (enables & mask) {
   1836         ci.ydzdy += c->shade.dzdy;
   1837         ci.ydwdy += c->shade.dwdy;
   1838         ci.ydfdy += c->shade.dfdy;
   1839     }
   1840 
   1841     if ((enables & GGL_ENABLE_TMUS) && (!(enables & GGL_ENABLE_W))) {
   1842         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1843             if (c->state.texture[i].enable) {
   1844                 texture_iterators_t& ti = c->state.texture[i].iterators;
   1845                 ti.ydsdy += ti.dsdy;
   1846                 ti.ydtdy += ti.dtdy;
   1847             }
   1848         }
   1849     }
   1850 }
   1851 
   1852 void step_y__nop(context_t* c)
   1853 {
   1854     c->iterators.y += 1;
   1855     c->iterators.ydzdy += c->shade.dzdy;
   1856 }
   1857 
   1858 void step_y__smooth(context_t* c)
   1859 {
   1860     iterators_t& ci = c->iterators;
   1861     ci.y += 1;
   1862     ci.ydrdy += c->shade.drdy;
   1863     ci.ydgdy += c->shade.dgdy;
   1864     ci.ydbdy += c->shade.dbdy;
   1865     ci.ydady += c->shade.dady;
   1866     ci.ydzdy += c->shade.dzdy;
   1867 }
   1868 
   1869 void step_y__w(context_t* c)
   1870 {
   1871     iterators_t& ci = c->iterators;
   1872     ci.y += 1;
   1873     ci.ydzdy += c->shade.dzdy;
   1874     ci.ydwdy += c->shade.dwdy;
   1875 }
   1876 
   1877 void step_y__tmu(context_t* c)
   1878 {
   1879     iterators_t& ci = c->iterators;
   1880     ci.y += 1;
   1881     ci.ydzdy += c->shade.dzdy;
   1882     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1883         if (c->state.texture[i].enable) {
   1884             texture_iterators_t& ti = c->state.texture[i].iterators;
   1885             ti.ydsdy += ti.dsdy;
   1886             ti.ydtdy += ti.dtdy;
   1887         }
   1888     }
   1889 }
   1890 
   1891 // ----------------------------------------------------------------------------
   1892 #if 0
   1893 #pragma mark -
   1894 #endif
   1895 
   1896 void scanline_perspective(context_t* c)
   1897 {
   1898     struct {
   1899         union {
   1900             struct {
   1901                 int32_t s, sq;
   1902                 int32_t t, tq;
   1903             } sqtq;
   1904             struct {
   1905                 int32_t v, q;
   1906             } st[2];
   1907         };
   1908     } tc[GGL_TEXTURE_UNIT_COUNT] __attribute__((aligned(16)));
   1909 
   1910     // XXX: we should have a special case when dwdx = 0
   1911 
   1912     // 32 pixels spans works okay. 16 is a lot better,
   1913     // but hey, it's a software renderer...
   1914     const uint32_t SPAN_BITS = 5;
   1915     const uint32_t ys = c->iterators.y;
   1916     const uint32_t xs = c->iterators.xl;
   1917     const uint32_t x1 = c->iterators.xr;
   1918 	const uint32_t xc = x1 - xs;
   1919     uint32_t remainder = xc & ((1<<SPAN_BITS)-1);
   1920     uint32_t numSpans = xc >> SPAN_BITS;
   1921 
   1922     const iterators_t& ci = c->iterators;
   1923     int32_t w0 = (xs * c->shade.dwdx) + ci.ydwdy;
   1924     int32_t q0 = gglRecipQ(w0, 30);
   1925     const int iwscale = 32 - gglClz(q0);
   1926 
   1927     const int32_t dwdx = c->shade.dwdx << SPAN_BITS;
   1928     int32_t xl = c->iterators.xl;
   1929 
   1930     // We process s & t with a loop to reduce the code size
   1931     // (and i-cache pressure).
   1932 
   1933     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1934         const texture_t& tmu = c->state.texture[i];
   1935         if (!tmu.enable) continue;
   1936         int32_t s =   tmu.shade.is0 +
   1937                      (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
   1938                      ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
   1939         int32_t t =   tmu.shade.it0 +
   1940                      (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
   1941                      ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
   1942         tc[i].sqtq.s  = s;
   1943         tc[i].sqtq.t  = t;
   1944         tc[i].sqtq.sq = gglMulx(s, q0, iwscale);
   1945         tc[i].sqtq.tq = gglMulx(t, q0, iwscale);
   1946     }
   1947 
   1948     int32_t span = 0;
   1949     do {
   1950         int32_t w1;
   1951         if (ggl_likely(numSpans)) {
   1952             w1 = w0 + dwdx;
   1953         } else {
   1954             if (remainder) {
   1955                 // finish off the scanline...
   1956                 span = remainder;
   1957                 w1 = (c->shade.dwdx * span) + w0;
   1958             } else {
   1959                 break;
   1960             }
   1961         }
   1962         int32_t q1 = gglRecipQ(w1, 30);
   1963         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1964             texture_t& tmu = c->state.texture[i];
   1965             if (!tmu.enable) continue;
   1966             texture_iterators_t& ti = tmu.iterators;
   1967 
   1968             for (int j=0 ; j<2 ; j++) {
   1969                 int32_t v = tc[i].st[j].v;
   1970                 if (span)   v += (tmu.shade.st[j].dx)*span;
   1971                 else        v += (tmu.shade.st[j].dx)<<SPAN_BITS;
   1972                 const int32_t v0 = tc[i].st[j].q;
   1973                 const int32_t v1 = gglMulx(v, q1, iwscale);
   1974                 int32_t dvdx = v1 - v0;
   1975                 if (span)   dvdx /= span;
   1976                 else        dvdx >>= SPAN_BITS;
   1977                 tc[i].st[j].v = v;
   1978                 tc[i].st[j].q = v1;
   1979 
   1980                 const int scale = ti.st[j].scale + (iwscale - 30);
   1981                 if (scale >= 0) {
   1982                     ti.st[j].ydvdy = v0   << scale;
   1983                     ti.st[j].dvdx  = dvdx << scale;
   1984                 } else {
   1985                     ti.st[j].ydvdy = v0   >> -scale;
   1986                     ti.st[j].dvdx  = dvdx >> -scale;
   1987                 }
   1988             }
   1989             generated_tex_vars_t& gen = c->generated_vars.texture[i];
   1990             gen.dsdx = ti.st[0].dvdx;
   1991             gen.dtdx = ti.st[1].dvdx;
   1992         }
   1993         c->iterators.xl = xl;
   1994         c->iterators.xr = xl = xl + (span ? span : (1<<SPAN_BITS));
   1995         w0 = w1;
   1996         q0 = q1;
   1997         c->span(c);
   1998     } while(numSpans--);
   1999 }
   2000 
   2001 void scanline_perspective_single(context_t* c)
   2002 {
   2003     // 32 pixels spans works okay. 16 is a lot better,
   2004     // but hey, it's a software renderer...
   2005     const uint32_t SPAN_BITS = 5;
   2006     const uint32_t ys = c->iterators.y;
   2007     const uint32_t xs = c->iterators.xl;
   2008     const uint32_t x1 = c->iterators.xr;
   2009 	const uint32_t xc = x1 - xs;
   2010 
   2011     const iterators_t& ci = c->iterators;
   2012     int32_t w = (xs * c->shade.dwdx) + ci.ydwdy;
   2013     int32_t iw = gglRecipQ(w, 30);
   2014     const int iwscale = 32 - gglClz(iw);
   2015 
   2016     const int i = 31 - gglClz(c->state.enabled_tmu);
   2017     generated_tex_vars_t& gen = c->generated_vars.texture[i];
   2018     texture_t& tmu = c->state.texture[i];
   2019     texture_iterators_t& ti = tmu.iterators;
   2020     const int sscale = ti.sscale + (iwscale - 30);
   2021     const int tscale = ti.tscale + (iwscale - 30);
   2022     int32_t s =   tmu.shade.is0 +
   2023                  (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
   2024                  ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
   2025     int32_t t =   tmu.shade.it0 +
   2026                  (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
   2027                  ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
   2028     int32_t s0 = gglMulx(s, iw, iwscale);
   2029     int32_t t0 = gglMulx(t, iw, iwscale);
   2030     int32_t xl = c->iterators.xl;
   2031 
   2032     int32_t sq, tq, dsdx, dtdx;
   2033     int32_t premainder = xc & ((1<<SPAN_BITS)-1);
   2034     uint32_t numSpans = xc >> SPAN_BITS;
   2035     if (c->shade.dwdx == 0) {
   2036         // XXX: we could choose to do this if the error is small enough
   2037         numSpans = 0;
   2038         premainder = xc;
   2039         goto no_perspective;
   2040     }
   2041 
   2042     if (premainder) {
   2043         w += c->shade.dwdx   * premainder;
   2044         iw = gglRecipQ(w, 30);
   2045 no_perspective:
   2046         s += tmu.shade.idsdx * premainder;
   2047         t += tmu.shade.idtdx * premainder;
   2048         sq = gglMulx(s, iw, iwscale);
   2049         tq = gglMulx(t, iw, iwscale);
   2050         dsdx = (sq - s0) / premainder;
   2051         dtdx = (tq - t0) / premainder;
   2052         c->iterators.xl = xl;
   2053         c->iterators.xr = xl = xl + premainder;
   2054         goto finish;
   2055     }
   2056 
   2057     while (numSpans--) {
   2058         w += c->shade.dwdx   << SPAN_BITS;
   2059         s += tmu.shade.idsdx << SPAN_BITS;
   2060         t += tmu.shade.idtdx << SPAN_BITS;
   2061         iw = gglRecipQ(w, 30);
   2062         sq = gglMulx(s, iw, iwscale);
   2063         tq = gglMulx(t, iw, iwscale);
   2064         dsdx = (sq - s0) >> SPAN_BITS;
   2065         dtdx = (tq - t0) >> SPAN_BITS;
   2066         c->iterators.xl = xl;
   2067         c->iterators.xr = xl = xl + (1<<SPAN_BITS);
   2068 finish:
   2069         if (sscale >= 0) {
   2070             ti.ydsdy = s0   << sscale;
   2071             ti.dsdx  = dsdx << sscale;
   2072         } else {
   2073             ti.ydsdy = s0   >>-sscale;
   2074             ti.dsdx  = dsdx >>-sscale;
   2075         }
   2076         if (tscale >= 0) {
   2077             ti.ydtdy = t0   << tscale;
   2078             ti.dtdx  = dtdx << tscale;
   2079         } else {
   2080             ti.ydtdy = t0   >>-tscale;
   2081             ti.dtdx  = dtdx >>-tscale;
   2082         }
   2083         s0 = sq;
   2084         t0 = tq;
   2085         gen.dsdx = ti.dsdx;
   2086         gen.dtdx = ti.dtdx;
   2087         c->span(c);
   2088     }
   2089 }
   2090 
   2091 // ----------------------------------------------------------------------------
   2092 
   2093 void scanline_col32cb16blend(context_t* c)
   2094 {
   2095     int32_t x = c->iterators.xl;
   2096     size_t ct = c->iterators.xr - x;
   2097     int32_t y = c->iterators.y;
   2098     surface_t* cb = &(c->state.buffers.color);
   2099     union {
   2100         uint16_t* dst;
   2101         uint32_t* dst32;
   2102     };
   2103     dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2104 
   2105 #if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__))
   2106 #if defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2107     scanline_col32cb16blend_neon(dst, &(c->packed8888), ct);
   2108 #else  // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2109     scanline_col32cb16blend_arm(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
   2110 #endif // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2111 #elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__aarch64__))
   2112     scanline_col32cb16blend_arm64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
   2113 #elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__mips__) && defined(__LP64__)))
   2114     scanline_col32cb16blend_mips64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
   2115 #else
   2116     uint32_t s = GGL_RGBA_TO_HOST(c->packed8888);
   2117     int sA = (s>>24);
   2118     int f = 0x100 - (sA + (sA>>7));
   2119     while (ct--) {
   2120         uint16_t d = *dst;
   2121         int dR = (d>>11)&0x1f;
   2122         int dG = (d>>5)&0x3f;
   2123         int dB = (d)&0x1f;
   2124         int sR = (s >> (   3))&0x1F;
   2125         int sG = (s >> ( 8+2))&0x3F;
   2126         int sB = (s >> (16+3))&0x1F;
   2127         sR += (f*dR)>>8;
   2128         sG += (f*dG)>>8;
   2129         sB += (f*dB)>>8;
   2130         *dst++ = uint16_t((sR<<11)|(sG<<5)|sB);
   2131     }
   2132 #endif
   2133 
   2134 }
   2135 
   2136 void scanline_t32cb16(context_t* c)
   2137 {
   2138     int32_t x = c->iterators.xl;
   2139     size_t ct = c->iterators.xr - x;
   2140     int32_t y = c->iterators.y;
   2141     surface_t* cb = &(c->state.buffers.color);
   2142     union {
   2143         uint16_t* dst;
   2144         uint32_t* dst32;
   2145     };
   2146     dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2147 
   2148     surface_t* tex = &(c->state.texture[0].surface);
   2149     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2150     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2151     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
   2152     int sR, sG, sB;
   2153     uint32_t s, d;
   2154 
   2155     if (ct==1 || uintptr_t(dst)&2) {
   2156 last_one:
   2157         s = GGL_RGBA_TO_HOST( *src++ );
   2158         *dst++ = convertAbgr8888ToRgb565(s);
   2159         ct--;
   2160     }
   2161 
   2162     while (ct >= 2) {
   2163 #if BYTE_ORDER == BIG_ENDIAN
   2164         s = GGL_RGBA_TO_HOST( *src++ );
   2165         d = convertAbgr8888ToRgb565_hi16(s);
   2166 
   2167         s = GGL_RGBA_TO_HOST( *src++ );
   2168         d |= convertAbgr8888ToRgb565(s);
   2169 #else
   2170         s = GGL_RGBA_TO_HOST( *src++ );
   2171         d = convertAbgr8888ToRgb565(s);
   2172 
   2173         s = GGL_RGBA_TO_HOST( *src++ );
   2174         d |= convertAbgr8888ToRgb565(s) << 16;
   2175 #endif
   2176         *dst32++ = d;
   2177         ct -= 2;
   2178     }
   2179 
   2180     if (ct > 0) {
   2181         goto last_one;
   2182     }
   2183 }
   2184 
   2185 void scanline_t32cb16blend(context_t* c)
   2186 {
   2187 #if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__arm__) || defined(__aarch64__) || \
   2188     (defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__)))))
   2189     int32_t x = c->iterators.xl;
   2190     size_t ct = c->iterators.xr - x;
   2191     int32_t y = c->iterators.y;
   2192     surface_t* cb = &(c->state.buffers.color);
   2193     uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2194 
   2195     surface_t* tex = &(c->state.texture[0].surface);
   2196     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2197     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2198     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
   2199 
   2200 #ifdef __arm__
   2201     scanline_t32cb16blend_arm(dst, src, ct);
   2202 #elif defined(__aarch64__)
   2203     scanline_t32cb16blend_arm64(dst, src, ct);
   2204 #elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
   2205     scanline_t32cb16blend_mips(dst, src, ct);
   2206 #elif defined(__mips__) && defined(__LP64__)
   2207     scanline_t32cb16blend_mips64(dst, src, ct);
   2208 #endif
   2209 #else
   2210     dst_iterator16  di(c);
   2211     horz_iterator32  hi(c);
   2212     blender_32to16  bl(c);
   2213     while (di.count--) {
   2214         uint32_t s = hi.get_pixel32();
   2215         bl.write(s, di.dst);
   2216         di.dst++;
   2217     }
   2218 #endif
   2219 }
   2220 
   2221 void scanline_t32cb16blend_srca(context_t* c)
   2222 {
   2223     dst_iterator16  di(c);
   2224     horz_iterator32  hi(c);
   2225     blender_32to16_srcA  blender(c);
   2226 
   2227     while (di.count--) {
   2228         uint32_t s = hi.get_pixel32();
   2229         blender.write(s,di.dst);
   2230         di.dst++;
   2231     }
   2232 }
   2233 
   2234 void scanline_t16cb16blend_clamp_mod(context_t* c)
   2235 {
   2236     const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
   2237     if (a == 0) {
   2238         return;
   2239     }
   2240 
   2241     if (a == 255) {
   2242         scanline_t16cb16_clamp(c);
   2243         return;
   2244     }
   2245 
   2246     dst_iterator16  di(c);
   2247     blender_16to16_modulate  blender(c);
   2248     clamp_iterator  ci(c);
   2249 
   2250     while (di.count--) {
   2251         uint16_t s = ci.get_pixel16();
   2252         blender.write(s, di.dst);
   2253         di.dst++;
   2254     }
   2255 }
   2256 
   2257 void scanline_memcpy(context_t* c)
   2258 {
   2259     int32_t x = c->iterators.xl;
   2260     size_t ct = c->iterators.xr - x;
   2261     int32_t y = c->iterators.y;
   2262     surface_t* cb = &(c->state.buffers.color);
   2263     const GGLFormat* fp = &(c->formats[cb->format]);
   2264     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2265                             (x + (cb->stride * y)) * fp->size;
   2266 
   2267     surface_t* tex = &(c->state.texture[0].surface);
   2268     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2269     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2270     uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
   2271                             (u + (tex->stride * v)) * fp->size;
   2272 
   2273     const size_t size = ct * fp->size;
   2274     memcpy(dst, src, size);
   2275 }
   2276 
   2277 void scanline_memset8(context_t* c)
   2278 {
   2279     int32_t x = c->iterators.xl;
   2280     size_t ct = c->iterators.xr - x;
   2281     int32_t y = c->iterators.y;
   2282     surface_t* cb = &(c->state.buffers.color);
   2283     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) + (x+(cb->stride*y));
   2284     uint32_t packed = c->packed;
   2285     memset(dst, packed, ct);
   2286 }
   2287 
   2288 void scanline_memset16(context_t* c)
   2289 {
   2290     int32_t x = c->iterators.xl;
   2291     size_t ct = c->iterators.xr - x;
   2292     int32_t y = c->iterators.y;
   2293     surface_t* cb = &(c->state.buffers.color);
   2294     uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2295     uint32_t packed = c->packed;
   2296     android_memset16(dst, packed, ct*2);
   2297 }
   2298 
   2299 void scanline_memset32(context_t* c)
   2300 {
   2301     int32_t x = c->iterators.xl;
   2302     size_t ct = c->iterators.xr - x;
   2303     int32_t y = c->iterators.y;
   2304     surface_t* cb = &(c->state.buffers.color);
   2305     uint32_t* dst = reinterpret_cast<uint32_t*>(cb->data) + (x+(cb->stride*y));
   2306     uint32_t packed = GGL_HOST_TO_RGBA(c->packed);
   2307     android_memset32(dst, packed, ct*4);
   2308 }
   2309 
   2310 void scanline_clear(context_t* c)
   2311 {
   2312     int32_t x = c->iterators.xl;
   2313     size_t ct = c->iterators.xr - x;
   2314     int32_t y = c->iterators.y;
   2315     surface_t* cb = &(c->state.buffers.color);
   2316     const GGLFormat* fp = &(c->formats[cb->format]);
   2317     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2318                             (x + (cb->stride * y)) * fp->size;
   2319     const size_t size = ct * fp->size;
   2320     memset(dst, 0, size);
   2321 }
   2322 
   2323 void scanline_set(context_t* c)
   2324 {
   2325     int32_t x = c->iterators.xl;
   2326     size_t ct = c->iterators.xr - x;
   2327     int32_t y = c->iterators.y;
   2328     surface_t* cb = &(c->state.buffers.color);
   2329     const GGLFormat* fp = &(c->formats[cb->format]);
   2330     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2331                             (x + (cb->stride * y)) * fp->size;
   2332     const size_t size = ct * fp->size;
   2333     memset(dst, 0xFF, size);
   2334 }
   2335 
   2336 void scanline_noop(context_t* /*c*/)
   2337 {
   2338 }
   2339 
   2340 void rect_generic(context_t* c, size_t yc)
   2341 {
   2342     do {
   2343         c->scanline(c);
   2344         c->step_y(c);
   2345     } while (--yc);
   2346 }
   2347 
   2348 void rect_memcpy(context_t* c, size_t yc)
   2349 {
   2350     int32_t x = c->iterators.xl;
   2351     size_t ct = c->iterators.xr - x;
   2352     int32_t y = c->iterators.y;
   2353     surface_t* cb = &(c->state.buffers.color);
   2354     const GGLFormat* fp = &(c->formats[cb->format]);
   2355     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2356                             (x + (cb->stride * y)) * fp->size;
   2357 
   2358     surface_t* tex = &(c->state.texture[0].surface);
   2359     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2360     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2361     uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
   2362                             (u + (tex->stride * v)) * fp->size;
   2363 
   2364     if (cb->stride == tex->stride && ct == size_t(cb->stride)) {
   2365         memcpy(dst, src, ct * fp->size * yc);
   2366     } else {
   2367         const size_t size = ct * fp->size;
   2368         const size_t dbpr = cb->stride  * fp->size;
   2369         const size_t sbpr = tex->stride * fp->size;
   2370         do {
   2371             memcpy(dst, src, size);
   2372             dst += dbpr;
   2373             src += sbpr;
   2374         } while (--yc);
   2375     }
   2376 }
   2377 // ----------------------------------------------------------------------------
   2378 }; // namespace android
   2379 
   2380