Home | History | Annotate | Download | only in libpixelflinger
      1 /* libs/pixelflinger/scanline.cpp
      2 **
      3 ** Copyright 2006-2011, The Android Open Source Project
      4 **
      5 ** Licensed under the Apache License, Version 2.0 (the "License");
      6 ** you may not use this file except in compliance with the License.
      7 ** You may obtain a copy of the License at
      8 **
      9 **     http://www.apache.org/licenses/LICENSE-2.0
     10 **
     11 ** Unless required by applicable law or agreed to in writing, software
     12 ** distributed under the License is distributed on an "AS IS" BASIS,
     13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 ** See the License for the specific language governing permissions and
     15 ** limitations under the License.
     16 */
     17 
     18 #define LOG_TAG "pixelflinger"
     19 
     20 #include <assert.h>
     21 #include <stdio.h>
     22 #include <stdlib.h>
     23 #include <string.h>
     24 
     25 #include <cutils/memory.h>
     26 #include <log/log.h>
     27 
     28 #include "buffer.h"
     29 #include "scanline.h"
     30 
     31 #include "codeflinger/CodeCache.h"
     32 #include "codeflinger/GGLAssembler.h"
     33 #if defined(__arm__)
     34 #include "codeflinger/ARMAssembler.h"
     35 #elif defined(__aarch64__)
     36 #include "codeflinger/Arm64Assembler.h"
     37 #elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
     38 #include "codeflinger/MIPSAssembler.h"
     39 #elif defined(__mips__) && defined(__LP64__)
     40 #include "codeflinger/MIPS64Assembler.h"
     41 #endif
     42 //#include "codeflinger/ARMAssemblerOptimizer.h"
     43 
     44 // ----------------------------------------------------------------------------
     45 
// Possible values for ANDROID_CODEGEN, which is selected just below.
#define ANDROID_CODEGEN_GENERIC     0   // force generic pixel pipeline
#define ANDROID_CODEGEN_C           1   // hand-written C, fallback generic
#define ANDROID_CODEGEN_ASM         2   // hand-written asm, fallback generic
#define ANDROID_CODEGEN_GENERATED   3   // hand-written asm, fallback codegen

// Release (NDEBUG) and debug builds currently select the same
// ANDROID_CODEGEN value; the two branches differ only in defining
// ANDROID_RELEASE versus ANDROID_DEBUG.
#ifdef NDEBUG
#   define ANDROID_RELEASE
#   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
#else
#   define ANDROID_DEBUG
#   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
#endif

// Despite its name, ANDROID_ARM_CODEGEN is 1 on arm, aarch64 and the
// supported mips variants (32-bit with ISA revision < 6, or any 64-bit),
// and 0 everywhere else. It gates the code cache and the code-generation
// path of pick_scanline().
#if defined(__arm__) || (defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))) || defined(__aarch64__)
#   define ANDROID_ARM_CODEGEN  1
#else
#   define ANDROID_ARM_CODEGEN  0
#endif

// When non-zero, pick_scanline() compiles out its memcpy/memset/shortcut
// lookups and always falls through to the generated (or generic) pipeline.
#define DEBUG__CODEGEN_ONLY     0

/* Set to 1 to dump to the log the states that need a new
 * code-generated scanline callback, i.e. those that don't
 * have a corresponding shortcut function.
 */
#define DEBUG_NEEDS  0

// Size argument handed to every new ScanlineAssembly created by
// pick_scanline(); the value depends on the target architecture.
#if defined( __mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))
#define ASSEMBLY_SCRATCH_SIZE   4096
#elif defined(__aarch64__)
#define ASSEMBLY_SCRATCH_SIZE   8192
#else
#define ASSEMBLY_SCRATCH_SIZE   2048
#endif
     80 
     81 // ----------------------------------------------------------------------------
     82 namespace android {
     83 // ----------------------------------------------------------------------------
     84 
// Forward declarations for the file-local callbacks.

// init_y callbacks: installed in context_t::init_y by pick_scanline().
static void init_y(context_t*, int32_t);
static void init_y_noop(context_t*, int32_t);
static void init_y_packed(context_t*, int32_t);
static void init_y_error(context_t*, int32_t);

// step_y callbacks: installed in context_t::step_y.
static void step_y__generic(context_t* c);
static void step_y__nop(context_t*);
static void step_y__smooth(context_t* c);
static void step_y__tmu(context_t* c);
static void step_y__w(context_t* c);

// scanline callbacks: the generic pipeline, the perspective wrappers and
// the specialised routines referenced by the shortcut table and by
// pick_scanline().
static void scanline(context_t* c);
static void scanline_perspective(context_t* c);
static void scanline_perspective_single(context_t* c);
static void scanline_t32cb16blend(context_t* c);
static void scanline_t32cb16blend_dither(context_t* c);
static void scanline_t32cb16blend_srca(context_t* c);
static void scanline_t32cb16blend_clamp(context_t* c);
static void scanline_t32cb16blend_clamp_dither(context_t* c);
static void scanline_t32cb16blend_clamp_mod(context_t* c);
static void scanline_x32cb16blend_clamp_mod(context_t* c);
static void scanline_t32cb16blend_clamp_mod_dither(context_t* c);
static void scanline_x32cb16blend_clamp_mod_dither(context_t* c);
static void scanline_t32cb16(context_t* c);
static void scanline_t32cb16_dither(context_t* c);
static void scanline_t32cb16_clamp(context_t* c);
static void scanline_t32cb16_clamp_dither(context_t* c);
static void scanline_col32cb16blend(context_t* c);
static void scanline_t16cb16_clamp(context_t* c);
static void scanline_t16cb16blend_clamp_mod(context_t* c);
static void scanline_memcpy(context_t* c);
static void scanline_memset8(context_t* c);
static void scanline_memset16(context_t* c);
static void scanline_memset32(context_t* c);
static void scanline_noop(context_t* c);
static void scanline_set(context_t* c);
static void scanline_clear(context_t* c);

// rect callbacks; the second argument is presumably a row count
// (TODO confirm against the definitions).
static void rect_generic(context_t* c, size_t yc);
static void rect_memcpy(context_t* c, size_t yc);
    125 
// Per-architecture scanline routines with C linkage. They are only declared
// here; presumably they are implemented in hand-written assembly sources
// (TODO confirm). Note that the NEON colour-blend variant takes a pointer to
// the colour while the other colour-blend variants take it by value.
#if defined( __arm__)
extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
extern "C" void scanline_col32cb16blend_arm(uint16_t *dst, uint32_t col, size_t ct);
#elif defined(__aarch64__)
extern "C" void scanline_t32cb16blend_arm64(uint16_t*, uint32_t*, size_t);
extern "C" void scanline_col32cb16blend_arm64(uint16_t *dst, uint32_t col, size_t ct);
#elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
extern "C" void scanline_t32cb16blend_mips(uint16_t*, uint32_t*, size_t);
#elif defined(__mips__) && defined(__LP64__)
extern "C" void scanline_t32cb16blend_mips64(uint16_t*, uint32_t*, size_t);
extern "C" void scanline_col32cb16blend_mips64(uint16_t *dst, uint32_t col, size_t ct);
#endif
    140 
    141 // ----------------------------------------------------------------------------
    142 
    143 static inline uint16_t  convertAbgr8888ToRgb565(uint32_t  pix)
    144 {
    145     return uint16_t( ((pix << 8) & 0xf800) |
    146                       ((pix >> 5) & 0x07e0) |
    147                       ((pix >> 19) & 0x001f) );
    148 }
    149 
// One entry of the shortcut table: a needs filter together with the
// hand-written callbacks pick_scanline() installs when the current needs
// match that filter.
struct shortcut_t {
    needs_filter_t  filter;                         // tested with c->state.needs.match()
    const char*     desc;                           // human-readable description of the case
    void            (*scanline)(context_t*);        // copied into c->scanline on a match
    void            (*init_y)(context_t*, int32_t); // copied into c->init_y on a match
};
    156 
    157 // Keep in sync with needs
    158 
    159 /* To understand the values here, have a look at:
    160  *     system/core/include/private/pixelflinger/ggl_context.h
    161  *
    162  * Especially the lines defining and using GGL_RESERVE_NEEDS
    163  *
    164  * Quick reminders:
    165  *   - the last nibble of the first value is the destination buffer format.
    166  *   - the last nibble of the third value is the source texture format
    167  *   - formats: 4=rgb565 1=abgr8888 2=xbgr8888
    168  *
    169  * In the descriptions below:
    170  *
    171  *   SRC      means we copy the source pixels to the destination
    172  *
    173  *   SRC_OVER means we blend the source pixels to the destination
    174  *            with dstFactor = 1-srcA, srcFactor=1  (premultiplied source).
    175  *            This mode is otherwise called 'blend'.
    176  *
 *   SRCA_OVER means we blend the source pixels to the destination
 *             with dstFactor=1-srcA, srcFactor=srcA (non-premul source).
 *             This mode is otherwise called 'blend_srca'
    180  *
    181  *   clamp    means we fetch source pixels from a texture with u/v clamping
    182  *
    183  *   mod      means the source pixels are modulated (multiplied) by the
    184  *            a/r/g/b of the current context's color. Typically used for
    185  *            fade-in / fade-out.
    186  *
    187  *   dither   means we dither 32 bit values to 16 bits
    188  */
// Table of specialised scanline routines. pick_scanline() walks it from the
// top and installs the callbacks of the first entry whose filter matches the
// current needs, so the order of the entries matters.
// Each filter is written as two needs-shaped groups; presumably the first is
// the value to compare against and the second the mask of significant bits
// (TODO confirm against needs_filter_t).
static shortcut_t shortcuts[] = {
    { { { 0x03515104, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend SRC_OVER", scanline_t32cb16blend, init_y_noop },
    { { { 0x03010104, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC", scanline_t32cb16, init_y_noop  },
    /* same as first entry, but with dithering */
    { { { 0x03515104, 0x00000177, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend SRC_OVER dither", scanline_t32cb16blend_dither, init_y_noop },
    /* same as second entry, but with dithering */
    { { { 0x03010104, 0x00000177, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC dither", scanline_t32cb16_dither, init_y_noop  },
    /* this is used during the boot animation - CHEAT: ignore dithering */
    { { { 0x03545404, 0x00000077, { 0x00000A01, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFEFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, blend dst:ONE_MINUS_SRCA src:SRCA", scanline_t32cb16blend_srca, init_y_noop },
    /* special case for arbitrary texture coordinates (think scaling) */
    { { { 0x03515104, 0x00000077, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp", scanline_t32cb16blend_clamp, init_y },
    { { { 0x03515104, 0x00000177, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp dither", scanline_t32cb16blend_clamp_dither, init_y },
    /* another case used during emulation */
    { { { 0x03515104, 0x00000077, { 0x00001001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp modulate", scanline_t32cb16blend_clamp_mod, init_y },
    /* and this */
    { { { 0x03515104, 0x00000077, { 0x00001002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC_OVER clamp modulate", scanline_x32cb16blend_clamp_mod, init_y },
    { { { 0x03515104, 0x00000177, { 0x00001001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC_OVER clamp modulate dither", scanline_t32cb16blend_clamp_mod_dither, init_y },
    { { { 0x03515104, 0x00000177, { 0x00001002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC_OVER clamp modulate dither", scanline_x32cb16blend_clamp_mod_dither, init_y },
    { { { 0x03010104, 0x00000077, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
    { { { 0x03010104, 0x00000077, { 0x00000002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
    { { { 0x03010104, 0x00000177, { 0x00000001, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 8888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
    { { { 0x03010104, 0x00000177, { 0x00000002, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, x888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
    { { { 0x03010104, 0x00000077, { 0x00000004, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 565 tx, SRC clamp", scanline_t16cb16_clamp, init_y  },
    // NOTE(review): the texture word below (0x00001004) has the same 0x1000
    // bit as the "clamp modulate" entries above and the routine is named
    // *_clamp_mod, yet the description omits "modulate" - confirm.
    { { { 0x03515104, 0x00000077, { 0x00001004, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
        "565 fb, 565 tx, SRC_OVER clamp", scanline_t16cb16blend_clamp_mod, init_y  },
    { { { 0x03515104, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0xFFFFFFFF } } },
        "565 fb, 8888 fixed color", scanline_col32cb16blend, init_y_packed  },
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x00000000, 0x00000007, { 0x00000000, 0x00000000 } } },
        "(nop) alpha test", scanline_noop, init_y_noop },
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x00000000, 0x00000070, { 0x00000000, 0x00000000 } } },
        "(nop) depth test", scanline_noop, init_y_noop },
    { { { 0x05000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x0F000000, 0x00000080, { 0x00000000, 0x00000000 } } },
        "(nop) logic_op", scanline_noop, init_y_noop },
    { { { 0xF0000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0xF0000000, 0x00000080, { 0x00000000, 0x00000000 } } },
        "(nop) color mask", scanline_noop, init_y_noop },
    { { { 0x0F000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(set) logic_op", scanline_set, init_y_noop },
    { { { 0x00000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(clear) logic_op", scanline_clear, init_y_noop },
    { { { 0x03000000, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFF00, 0x000000F7, { 0x00000000, 0x00000000 } } },
        "(clear) blending 0/0", scanline_clear, init_y_noop },
    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
        { 0x0000003F, 0x00000000, { 0x00000000, 0x00000000 } } },
        "(error) invalid color-buffer format", scanline_noop, init_y_error },
};
// Filter for an unblended 1:1 copy. pick_scanline() tests it only after
// checking that texture 0 and the colour buffer share the same format, and
// selects scanline_memcpy when it matches and the format's components are
// RGB, RGBA, LUMINANCE or LUMINANCE_ALPHA.
static const needs_filter_t noblend1to1 = {
        // (disregard dithering, see below)
        { 0x03010100, 0x00000077, { 0x00000A00, 0x00000000 } },
        { 0xFFFFFFC0, 0xFFFFFEFF, { 0xFFFFFFC0, 0x0000003F } }
};
// Filter for an unblended fill. When it matches, pick_scanline() installs
// init_y_packed and one of scanline_memset8/16/32, chosen from the
// colour-buffer format's size (1, 2 or 4) - so, despite the name, it is not
// limited to 16-bit buffers.
static  const needs_filter_t fill16noblend = {
        { 0x03010100, 0x00000077, { 0x00000000, 0x00000000 } },
        { 0xFFFFFFC0, 0xFFFFFFFF, { 0x0000003F, 0x0000003F } }
};
    284 
    285 // ----------------------------------------------------------------------------
    286 
    287 #if ANDROID_ARM_CODEGEN
    288 
// File-wide cache of generated scanline assemblies; pick_scanline() looks
// entries up by their needs key and adds newly generated ones. The
// constructor argument varies per architecture and is presumably the cache
// capacity in bytes (TODO confirm against CodeCache.h).
#if defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))
static CodeCache gCodeCache(32 * 1024);
#elif defined(__aarch64__)
static CodeCache gCodeCache(48 * 1024);
#else
static CodeCache gCodeCache(12 * 1024);
#endif
    296 
    297 class ScanlineAssembly : public Assembly {
    298     AssemblyKey<needs_t> mKey;
    299 public:
    300     ScanlineAssembly(needs_t needs, size_t size)
    301         : Assembly(size), mKey(needs) { }
    302     const AssemblyKey<needs_t>& key() const { return mKey; }
    303 };
    304 #endif
    305 
    306 // ----------------------------------------------------------------------------
    307 
    308 void ggl_init_scanline(context_t* c)
    309 {
    310     c->init_y = init_y;
    311     c->step_y = step_y__generic;
    312     c->scanline = scanline;
    313 }
    314 
    315 void ggl_uninit_scanline(context_t* c)
    316 {
    317     if (c->state.buffers.coverage)
    318         free(c->state.buffers.coverage);
    319 #if ANDROID_ARM_CODEGEN
    320     if (c->scanline_as)
    321         c->scanline_as->decStrong(c);
    322 #endif
    323 }
    324 
    325 // ----------------------------------------------------------------------------
    326 
// Chooses the scanline / init_y / step_y callbacks for the context's current
// needs (c->state.needs). Candidates are tried in this order:
//   1. scanline_memcpy, for an unblended 1:1 copy between identical formats;
//   2. scanline_memset8/16/32, for an unblended fill;
//   3. the first matching entry of the shortcuts[] table;
//   4. code generated for these needs (cached in gCodeCache) when
//      ANDROID_ARM_CODEGEN is set, otherwise the generic C scanline().
// Steps 1-3 are compiled out when DEBUG__CODEGEN_ONLY is non-zero.
static void pick_scanline(context_t* c)
{
#if (!defined(DEBUG__CODEGEN_ONLY) || (DEBUG__CODEGEN_ONLY == 0))

#if ANDROID_CODEGEN == ANDROID_CODEGEN_GENERIC
    // The generic pipeline is forced: install it and skip everything else.
    c->init_y = init_y;
    c->step_y = step_y__generic;
    c->scanline = scanline;
    return;
#endif

    //printf("*** needs [%08lx:%08lx:%08lx:%08lx]\n",
    //    c->state.needs.n, c->state.needs.p,
    //    c->state.needs.t[0], c->state.needs.t[1]);

    // first handle the special case that we cannot test with a filter
    const uint32_t cb_format = GGL_READ_NEEDS(CB_FORMAT, c->state.needs.n);
    if (GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0]) == cb_format) {
        if (c->state.needs.match(noblend1to1)) {
            // this will match regardless of dithering state, since both
            // src and dest have the same format anyway, there is no dithering
            // to be done.
            const GGLFormat* f =
                &(c->formats[GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0])]);
            if ((f->components == GGL_RGB) ||
                (f->components == GGL_RGBA) ||
                (f->components == GGL_LUMINANCE) ||
                (f->components == GGL_LUMINANCE_ALPHA))
            {
                // format must have all of RGB components
                // (so the current color doesn't show through)
                c->scanline = scanline_memcpy;
                c->init_y = init_y_noop;
                return;
            }
        }
    }

    // Unblended fill: pick the memset variant for the pixel size. If the
    // size is not 1, 2 or 4 we fall through; c->init_y is then overwritten
    // by whichever path below ends up being selected.
    if (c->state.needs.match(fill16noblend)) {
        c->init_y = init_y_packed;
        switch (c->formats[cb_format].size) {
        case 1: c->scanline = scanline_memset8;  return;
        case 2: c->scanline = scanline_memset16; return;
        case 4: c->scanline = scanline_memset32; return;
        }
    }

    // Hand-written shortcuts: the first matching entry wins.
    const int numFilters = sizeof(shortcuts)/sizeof(shortcut_t);
    for (int i=0 ; i<numFilters ; i++) {
        if (c->state.needs.match(shortcuts[i].filter)) {
            c->scanline = shortcuts[i].scanline;
            c->init_y = shortcuts[i].init_y;
            return;
        }
    }

#if DEBUG_NEEDS
    ALOGI("Needs: n=0x%08x p=0x%08x t0=0x%08x t1=0x%08x",
         c->state.needs.n, c->state.needs.p,
         c->state.needs.t[0], c->state.needs.t[1]);
#endif

#endif // DEBUG__CODEGEN_ONLY

    // No shortcut applies: use the generic per-line setup and stepping with
    // either generated code or the generic C scanline.
    c->init_y = init_y;
    c->step_y = step_y__generic;

#if ANDROID_ARM_CODEGEN
    // we're going to have to generate some code...
    // here, generate code for our pixel pipeline
    const AssemblyKey<needs_t> key(c->state.needs);
    sp<Assembly> assembly = gCodeCache.lookup(key);
    if (assembly == 0) {
        // create a new assembly region
        sp<ScanlineAssembly> a = new ScanlineAssembly(c->state.needs,
                ASSEMBLY_SCRATCH_SIZE);
        // initialize our assembler
#if defined(__arm__)
        GGLAssembler assembler( new ARMAssembler(a) );
        //GGLAssembler assembler(
        //        new ARMAssemblerOptimizer(new ARMAssembler(a)) );
#endif
#if defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
        GGLAssembler assembler( new ArmToMipsAssembler(a) );
#elif defined(__mips__) && defined(__LP64__)
        GGLAssembler assembler( new ArmToMips64Assembler(a) );
#elif defined(__aarch64__)
        GGLAssembler assembler( new ArmToArm64Assembler(a) );
#endif
        // generate the scanline code for the given needs
        bool err = assembler.scanline(c->state.needs, c) != 0;
        if (ggl_likely(!err)) {
            // finally, cache this assembly
            err = gCodeCache.cache(a->key(), a) < 0;
        }
        if (ggl_unlikely(err)) {
            // Generation or caching failed: install no-op callbacks. The
            // previously held assembly (c->scanline_as) is left untouched.
            ALOGE("error generating or caching assembly. Reverting to NOP.");
            c->scanline = scanline_noop;
            c->init_y = init_y_noop;
            c->step_y = step_y__nop;
            return;
        }
        assembly = a;
    }

    // release the previous assembly
    // (the local sp<> `assembly` still holds its own reference to the new
    // one, so this is safe even if both are the same object)
    if (c->scanline_as) {
        c->scanline_as->decStrong(c);
    }

    //ALOGI("using generated pixel-pipeline");
    c->scanline_as = assembly.get();
    c->scanline_as->incStrong(c); //  hold on to assembly
    // The generated code is entered through the assembly's base address.
    c->scanline = (void(*)(context_t* c))assembly->base();
#else
//    ALOGW("using generic (slow) pixel-pipeline");
    c->scanline = scanline;
#endif
}
    446 
    447 void ggl_pick_scanline(context_t* c)
    448 {
    449     pick_scanline(c);
    450     if ((c->state.enables & GGL_ENABLE_W) &&
    451         (c->state.enables & GGL_ENABLE_TMUS))
    452     {
    453         c->span = c->scanline;
    454         c->scanline = scanline_perspective;
    455         if (!(c->state.enabled_tmu & (c->state.enabled_tmu - 1))) {
    456             // only one TMU enabled
    457             c->scanline = scanline_perspective_single;
    458         }
    459     }
    460 }
    461 
    462 // ----------------------------------------------------------------------------
    463 
// Helpers of the generic C pipeline. Their definitions live in the #else
// branch below, so they are only compiled when the generic pipeline is.
static void blending(context_t* c, pixel_t* fragment, pixel_t* fb);
static void blend_factor(context_t* c, pixel_t* r, uint32_t factor,
        const pixel_t* src, const pixel_t* dst);
static void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv);
    468 
    469 #if ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
    470 
// Empty stand-in for the generic C pipeline. In this configuration (code
// generation available and ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
// the real implementation in the #else branch is not compiled, but the
// symbol must still exist because it is declared above and assigned in
// ggl_init_scanline().
void scanline(context_t*)
{
}
    475 
    476 #else
    477 
    478 void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv)
    479 {
    480     if (su && sv) {
    481         if (su > sv) {
    482             v = ggl_expand(v, sv, su);
    483             sv = su;
    484         } else if (su < sv) {
    485             u = ggl_expand(u, su, sv);
    486             su = sv;
    487         }
    488     }
    489 }
    490 
    491 void blending(context_t* c, pixel_t* fragment, pixel_t* fb)
    492 {
    493     rescale(fragment->c[0], fragment->s[0], fb->c[0], fb->s[0]);
    494     rescale(fragment->c[1], fragment->s[1], fb->c[1], fb->s[1]);
    495     rescale(fragment->c[2], fragment->s[2], fb->c[2], fb->s[2]);
    496     rescale(fragment->c[3], fragment->s[3], fb->c[3], fb->s[3]);
    497 
    498     pixel_t sf, df;
    499     blend_factor(c, &sf, c->state.blend.src, fragment, fb);
    500     blend_factor(c, &df, c->state.blend.dst, fragment, fb);
    501 
    502     fragment->c[1] =
    503             gglMulAddx(fragment->c[1], sf.c[1], gglMulx(fb->c[1], df.c[1]));
    504     fragment->c[2] =
    505             gglMulAddx(fragment->c[2], sf.c[2], gglMulx(fb->c[2], df.c[2]));
    506     fragment->c[3] =
    507             gglMulAddx(fragment->c[3], sf.c[3], gglMulx(fb->c[3], df.c[3]));
    508 
    509     if (c->state.blend.alpha_separate) {
    510         blend_factor(c, &sf, c->state.blend.src_alpha, fragment, fb);
    511         blend_factor(c, &df, c->state.blend.dst_alpha, fragment, fb);
    512     }
    513 
    514     fragment->c[0] =
    515             gglMulAddx(fragment->c[0], sf.c[0], gglMulx(fb->c[0], df.c[0]));
    516 
    517     // clamp to 1.0
    518     if (fragment->c[0] >= (1LU<<fragment->s[0]))
    519         fragment->c[0] = (1<<fragment->s[0])-1;
    520     if (fragment->c[1] >= (1LU<<fragment->s[1]))
    521         fragment->c[1] = (1<<fragment->s[1])-1;
    522     if (fragment->c[2] >= (1LU<<fragment->s[2]))
    523         fragment->c[2] = (1<<fragment->s[2])-1;
    524     if (fragment->c[3] >= (1LU<<fragment->s[3]))
    525         fragment->c[3] = (1<<fragment->s[3])-1;
    526 }
    527 
    528 static inline int blendfactor(uint32_t x, uint32_t size, uint32_t def = 0)
    529 {
    530     if (!size)
    531         return def;
    532 
    533     // scale to 16 bits
    534     if (size > 16) {
    535         x >>= (size - 16);
    536     } else if (size < 16) {
    537         x = ggl_expand(x, size, 16);
    538     }
    539     x += x >> 15;
    540     return x;
    541 }
    542 
    543 void blend_factor(context_t* /*c*/, pixel_t* r,
    544         uint32_t factor, const pixel_t* src, const pixel_t* dst)
    545 {
    546     switch (factor) {
    547         case GGL_ZERO:
    548             r->c[1] =
    549             r->c[2] =
    550             r->c[3] =
    551             r->c[0] = 0;
    552             break;
    553         case GGL_ONE:
    554             r->c[1] =
    555             r->c[2] =
    556             r->c[3] =
    557             r->c[0] = FIXED_ONE;
    558             break;
    559         case GGL_DST_COLOR:
    560             r->c[1] = blendfactor(dst->c[1], dst->s[1]);
    561             r->c[2] = blendfactor(dst->c[2], dst->s[2]);
    562             r->c[3] = blendfactor(dst->c[3], dst->s[3]);
    563             r->c[0] = blendfactor(dst->c[0], dst->s[0]);
    564             break;
    565         case GGL_SRC_COLOR:
    566             r->c[1] = blendfactor(src->c[1], src->s[1]);
    567             r->c[2] = blendfactor(src->c[2], src->s[2]);
    568             r->c[3] = blendfactor(src->c[3], src->s[3]);
    569             r->c[0] = blendfactor(src->c[0], src->s[0]);
    570             break;
    571         case GGL_ONE_MINUS_DST_COLOR:
    572             r->c[1] = FIXED_ONE - blendfactor(dst->c[1], dst->s[1]);
    573             r->c[2] = FIXED_ONE - blendfactor(dst->c[2], dst->s[2]);
    574             r->c[3] = FIXED_ONE - blendfactor(dst->c[3], dst->s[3]);
    575             r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0]);
    576             break;
    577         case GGL_ONE_MINUS_SRC_COLOR:
    578             r->c[1] = FIXED_ONE - blendfactor(src->c[1], src->s[1]);
    579             r->c[2] = FIXED_ONE - blendfactor(src->c[2], src->s[2]);
    580             r->c[3] = FIXED_ONE - blendfactor(src->c[3], src->s[3]);
    581             r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0]);
    582             break;
    583         case GGL_SRC_ALPHA:
    584             r->c[1] =
    585             r->c[2] =
    586             r->c[3] =
    587             r->c[0] = blendfactor(src->c[0], src->s[0], FIXED_ONE);
    588             break;
    589         case GGL_ONE_MINUS_SRC_ALPHA:
    590             r->c[1] =
    591             r->c[2] =
    592             r->c[3] =
    593             r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0], FIXED_ONE);
    594             break;
    595         case GGL_DST_ALPHA:
    596             r->c[1] =
    597             r->c[2] =
    598             r->c[3] =
    599             r->c[0] = blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
    600             break;
    601         case GGL_ONE_MINUS_DST_ALPHA:
    602             r->c[1] =
    603             r->c[2] =
    604             r->c[3] =
    605             r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
    606             break;
    607         case GGL_SRC_ALPHA_SATURATE:
    608             // XXX: GGL_SRC_ALPHA_SATURATE
    609             break;
    610     }
    611 }
    612 
    613 static GGLfixed wrapping(int32_t coord, uint32_t size, int tx_wrap)
    614 {
    615     GGLfixed d;
    616     if (tx_wrap == GGL_REPEAT) {
    617         d = (uint32_t(coord)>>16) * size;
    618     } else if (tx_wrap == GGL_CLAMP) { // CLAMP_TO_EDGE semantics
    619         const GGLfixed clamp_min = FIXED_HALF;
    620         const GGLfixed clamp_max = (size << 16) - FIXED_HALF;
    621         if (coord < clamp_min)     coord = clamp_min;
    622         if (coord > clamp_max)     coord = clamp_max;
    623         d = coord;
    624     } else { // 1:1
    625         const GGLfixed clamp_min = 0;
    626         const GGLfixed clamp_max = (size << 16);
    627         if (coord < clamp_min)     coord = clamp_min;
    628         if (coord > clamp_max)     coord = clamp_max;
    629         d = coord;
    630     }
    631     return d;
    632 }
    633 
    634 static inline
    635 GGLcolor ADJUST_COLOR_ITERATOR(GGLcolor v, GGLcolor dvdx, int len)
    636 {
    637     const int32_t end = dvdx * (len-1) + v;
    638     if (end < 0)
    639         v -= end;
    640     v &= ~(v>>31);
    641     return v;
    642 }
    643 
    644 void scanline(context_t* c)
    645 {
    646     const uint32_t enables = c->state.enables;
    647     const int xs = c->iterators.xl;
    648     const int x1 = c->iterators.xr;
    649 	int xc = x1 - xs;
    650     const int16_t* covPtr = c->state.buffers.coverage + xs;
    651 
    652     // All iterated values are sampled at the pixel center
    653 
    654     // reset iterators for that scanline...
    655     GGLcolor r, g, b, a;
    656     iterators_t& ci = c->iterators;
    657     if (enables & GGL_ENABLE_SMOOTH) {
    658         r = (xs * c->shade.drdx) + ci.ydrdy;
    659         g = (xs * c->shade.dgdx) + ci.ydgdy;
    660         b = (xs * c->shade.dbdx) + ci.ydbdy;
    661         a = (xs * c->shade.dadx) + ci.ydady;
    662         r = ADJUST_COLOR_ITERATOR(r, c->shade.drdx, xc);
    663         g = ADJUST_COLOR_ITERATOR(g, c->shade.dgdx, xc);
    664         b = ADJUST_COLOR_ITERATOR(b, c->shade.dbdx, xc);
    665         a = ADJUST_COLOR_ITERATOR(a, c->shade.dadx, xc);
    666     } else {
    667         r = ci.ydrdy;
    668         g = ci.ydgdy;
    669         b = ci.ydbdy;
    670         a = ci.ydady;
    671     }
    672 
    673     // z iterators are 1.31
    674     GGLfixed z = (xs * c->shade.dzdx) + ci.ydzdy;
    675     GGLfixed f = (xs * c->shade.dfdx) + ci.ydfdy;
    676 
    677     struct {
    678         GGLfixed s, t;
    679     } tc[GGL_TEXTURE_UNIT_COUNT];
    680     if (enables & GGL_ENABLE_TMUS) {
    681         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
    682             if (c->state.texture[i].enable) {
    683                 texture_iterators_t& ti = c->state.texture[i].iterators;
    684                 if (enables & GGL_ENABLE_W) {
    685                     tc[i].s = ti.ydsdy;
    686                     tc[i].t = ti.ydtdy;
    687                 } else {
    688                     tc[i].s = (xs * ti.dsdx) + ti.ydsdy;
    689                     tc[i].t = (xs * ti.dtdx) + ti.ydtdy;
    690                 }
    691             }
    692         }
    693     }
    694 
    695     pixel_t fragment;
    696     pixel_t texel;
    697     pixel_t fb;
    698 
    699 	uint32_t x = xs;
    700 	uint32_t y = c->iterators.y;
    701 
    702 	while (xc--) {
    703 
    704         { // just a scope
    705 
    706 		// read color (convert to 8 bits by keeping only the integer part)
    707         fragment.s[1] = fragment.s[2] =
    708         fragment.s[3] = fragment.s[0] = 8;
    709         fragment.c[1] = r >> (GGL_COLOR_BITS-8);
    710         fragment.c[2] = g >> (GGL_COLOR_BITS-8);
    711         fragment.c[3] = b >> (GGL_COLOR_BITS-8);
    712         fragment.c[0] = a >> (GGL_COLOR_BITS-8);
    713 
    714 		// texturing
    715         if (enables & GGL_ENABLE_TMUS) {
    716             for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
    717                 texture_t& tx = c->state.texture[i];
    718                 if (!tx.enable)
    719                     continue;
    720                 texture_iterators_t& ti = tx.iterators;
    721                 int32_t u, v;
    722 
    723                 // s-coordinate
    724                 if (tx.s_coord != GGL_ONE_TO_ONE) {
    725                     const int w = tx.surface.width;
    726                     u = wrapping(tc[i].s, w, tx.s_wrap);
    727                     tc[i].s += ti.dsdx;
    728                 } else {
    729                     u = (((tx.shade.is0>>16) + x)<<16) + FIXED_HALF;
    730                 }
    731 
    732                 // t-coordinate
    733                 if (tx.t_coord != GGL_ONE_TO_ONE) {
    734                     const int h = tx.surface.height;
    735                     v = wrapping(tc[i].t, h, tx.t_wrap);
    736                     tc[i].t += ti.dtdx;
    737                 } else {
    738                     v = (((tx.shade.it0>>16) + y)<<16) + FIXED_HALF;
    739                 }
    740 
    741                 // read texture
    742                 if (tx.mag_filter == GGL_NEAREST &&
    743                     tx.min_filter == GGL_NEAREST)
    744                 {
    745                     u >>= 16;
    746                     v >>= 16;
    747                     tx.surface.read(&tx.surface, c, u, v, &texel);
    748                 } else {
    749                     const int w = tx.surface.width;
    750                     const int h = tx.surface.height;
    751                     u -= FIXED_HALF;
    752                     v -= FIXED_HALF;
    753                     int u0 = u >> 16;
    754                     int v0 = v >> 16;
    755                     int u1 = u0 + 1;
    756                     int v1 = v0 + 1;
    757                     if (tx.s_wrap == GGL_REPEAT) {
    758                         if (u0<0)  u0 += w;
    759                         if (u1<0)  u1 += w;
    760                         if (u0>=w) u0 -= w;
    761                         if (u1>=w) u1 -= w;
    762                     } else {
    763                         if (u0<0)  u0 = 0;
    764                         if (u1<0)  u1 = 0;
    765                         if (u0>=w) u0 = w-1;
    766                         if (u1>=w) u1 = w-1;
    767                     }
    768                     if (tx.t_wrap == GGL_REPEAT) {
    769                         if (v0<0)  v0 += h;
    770                         if (v1<0)  v1 += h;
    771                         if (v0>=h) v0 -= h;
    772                         if (v1>=h) v1 -= h;
    773                     } else {
    774                         if (v0<0)  v0 = 0;
    775                         if (v1<0)  v1 = 0;
    776                         if (v0>=h) v0 = h-1;
    777                         if (v1>=h) v1 = h-1;
    778                     }
    779                     pixel_t texels[4];
    780                     uint32_t mm[4];
    781                     tx.surface.read(&tx.surface, c, u0, v0, &texels[0]);
    782                     tx.surface.read(&tx.surface, c, u0, v1, &texels[1]);
    783                     tx.surface.read(&tx.surface, c, u1, v0, &texels[2]);
    784                     tx.surface.read(&tx.surface, c, u1, v1, &texels[3]);
    785                     u = (u >> 12) & 0xF;
    786                     v = (v >> 12) & 0xF;
    787                     u += u>>3;
    788                     v += v>>3;
    789                     mm[0] = (0x10 - u) * (0x10 - v);
    790                     mm[1] = (0x10 - u) * v;
    791                     mm[2] = u * (0x10 - v);
    792                     mm[3] = 0x100 - (mm[0] + mm[1] + mm[2]);
    793                     for (int j=0 ; j<4 ; j++) {
    794                         texel.s[j] = texels[0].s[j];
    795                         if (!texel.s[j]) continue;
    796                         texel.s[j] += 8;
    797                         texel.c[j] =    texels[0].c[j]*mm[0] +
    798                                         texels[1].c[j]*mm[1] +
    799                                         texels[2].c[j]*mm[2] +
    800                                         texels[3].c[j]*mm[3] ;
    801                     }
    802                 }
    803 
    804                 // Texture environnement...
    805                 for (int j=0 ; j<4 ; j++) {
    806                     uint32_t& Cf = fragment.c[j];
    807                     uint32_t& Ct = texel.c[j];
    808                     uint8_t& sf  = fragment.s[j];
    809                     uint8_t& st  = texel.s[j];
    810                     uint32_t At = texel.c[0];
    811                     uint8_t sat = texel.s[0];
    812                     switch (tx.env) {
    813                     case GGL_REPLACE:
    814                         if (st) {
    815                             Cf = Ct;
    816                             sf = st;
    817                         }
    818                         break;
    819                     case GGL_MODULATE:
    820                         if (st) {
    821                             uint32_t factor = Ct + (Ct>>(st-1));
    822                             Cf = (Cf * factor) >> st;
    823                         }
    824                         break;
    825                     case GGL_DECAL:
    826                         if (sat) {
    827                             rescale(Cf, sf, Ct, st);
    828                             Cf += ((Ct - Cf) * (At + (At>>(sat-1)))) >> sat;
    829                         }
    830                         break;
    831                     case GGL_BLEND:
    832                         if (st) {
    833                             uint32_t Cc = tx.env_color[i];
    834                             if (sf>8)       Cc = (Cc * ((1<<sf)-1))>>8;
    835                             else if (sf<8)  Cc = (Cc - (Cc>>(8-sf)))>>(8-sf);
    836                             uint32_t factor = Ct + (Ct>>(st-1));
    837                             Cf = ((((1<<st) - factor) * Cf) + Ct*Cc)>>st;
    838                         }
    839                         break;
    840                     case GGL_ADD:
    841                         if (st) {
    842                             rescale(Cf, sf, Ct, st);
    843                             Cf += Ct;
    844                         }
    845                         break;
    846                     }
    847                 }
    848             }
    849 		}
    850 
    851         // coverage application
    852         if (enables & GGL_ENABLE_AA) {
    853             int16_t cf = *covPtr++;
    854             fragment.c[0] = (int64_t(fragment.c[0]) * cf) >> 15;
    855         }
    856 
    857         // alpha-test
    858         if (enables & GGL_ENABLE_ALPHA_TEST) {
    859             GGLcolor ref = c->state.alpha_test.ref;
    860             GGLcolor alpha = (uint64_t(fragment.c[0]) *
    861                     ((1<<GGL_COLOR_BITS)-1)) / ((1<<fragment.s[0])-1);
    862             switch (c->state.alpha_test.func) {
    863             case GGL_NEVER:     goto discard;
    864             case GGL_LESS:      if (alpha<ref)  break; goto discard;
    865             case GGL_EQUAL:     if (alpha==ref) break; goto discard;
    866             case GGL_LEQUAL:    if (alpha<=ref) break; goto discard;
    867             case GGL_GREATER:   if (alpha>ref)  break; goto discard;
    868             case GGL_NOTEQUAL:  if (alpha!=ref) break; goto discard;
    869             case GGL_GEQUAL:    if (alpha>=ref) break; goto discard;
    870             }
    871         }
    872 
    873         // depth test
    874         if (c->state.buffers.depth.format) {
    875             if (enables & GGL_ENABLE_DEPTH_TEST) {
    876                 surface_t* cb = &(c->state.buffers.depth);
    877                 uint16_t* p = (uint16_t*)(cb->data)+(x+(cb->stride*y));
    878                 uint16_t zz = uint32_t(z)>>(16);
    879                 uint16_t depth = *p;
    880                 switch (c->state.depth_test.func) {
    881                 case GGL_NEVER:     goto discard;
    882                 case GGL_LESS:      if (zz<depth)    break; goto discard;
    883                 case GGL_EQUAL:     if (zz==depth)   break; goto discard;
    884                 case GGL_LEQUAL:    if (zz<=depth)   break; goto discard;
    885                 case GGL_GREATER:   if (zz>depth)    break; goto discard;
    886                 case GGL_NOTEQUAL:  if (zz!=depth)   break; goto discard;
    887                 case GGL_GEQUAL:    if (zz>=depth)   break; goto discard;
    888                 }
    889                 // depth buffer is not enabled, if depth-test is not enabled
    890 /*
    891         fragment.s[1] = fragment.s[2] =
    892         fragment.s[3] = fragment.s[0] = 8;
    893         fragment.c[1] =
    894         fragment.c[2] =
    895         fragment.c[3] =
    896         fragment.c[0] = 255 - (zz>>8);
    897 */
    898                 if (c->state.mask.depth) {
    899                     *p = zz;
    900                 }
    901             }
    902         }
    903 
    904         // fog
    905         if (enables & GGL_ENABLE_FOG) {
    906             for (int i=1 ; i<=3 ; i++) {
    907                 GGLfixed fc = (c->state.fog.color[i] * 0x10000) / 0xFF;
    908                 uint32_t& c = fragment.c[i];
    909                 uint8_t& s  = fragment.s[i];
    910                 c = (c * 0x10000) / ((1<<s)-1);
    911                 c = gglMulAddx(c, f, gglMulx(fc, 0x10000 - f));
    912                 s = 16;
    913             }
    914         }
    915 
    916         // blending
    917         if (enables & GGL_ENABLE_BLENDING) {
    918             fb.c[1] = fb.c[2] = fb.c[3] = fb.c[0] = 0; // placate valgrind
    919             fb.s[1] = fb.s[2] = fb.s[3] = fb.s[0] = 0;
    920             c->state.buffers.color.read(
    921                     &(c->state.buffers.color), c, x, y, &fb);
    922             blending( c, &fragment, &fb );
    923         }
    924 
    925 		// write
    926         c->state.buffers.color.write(
    927                 &(c->state.buffers.color), c, x, y, &fragment);
    928         }
    929 
    930 discard:
    931 		// iterate...
    932         x += 1;
    933         if (enables & GGL_ENABLE_SMOOTH) {
    934             r += c->shade.drdx;
    935             g += c->shade.dgdx;
    936             b += c->shade.dbdx;
    937             a += c->shade.dadx;
    938         }
    939         z += c->shade.dzdx;
    940         f += c->shade.dfdx;
    941 	}
    942 }
    943 
    944 #endif // ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
    945 
    946 // ----------------------------------------------------------------------------
    947 #if 0
    948 #pragma mark -
    949 #pragma mark Scanline
    950 #endif
    951 
    952 /* Used to parse a 32-bit source texture linearly. Usage is:
    953  *
    954  * horz_iterator32  hi(context);
    955  * while (...) {
    956  *    uint32_t  src_pixel = hi.get_pixel32();
    957  *    ...
    958  * }
    959  *
    960  * Use only for one-to-one texture mapping.
    961  */
    962 struct horz_iterator32 {
    963     explicit horz_iterator32(context_t* c) {
    964         const int x = c->iterators.xl;
    965         const int y = c->iterators.y;
    966         texture_t& tx = c->state.texture[0];
    967         const int32_t u = (tx.shade.is0>>16) + x;
    968         const int32_t v = (tx.shade.it0>>16) + y;
    969         m_src = reinterpret_cast<uint32_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
    970     }
    971     uint32_t  get_pixel32() {
    972         return *m_src++;
    973     }
    974 protected:
    975     uint32_t* m_src;
    976 };
    977 
    978 /* A variant for 16-bit source textures. */
    979 struct horz_iterator16 {
    980     explicit horz_iterator16(context_t* c) {
    981         const int x = c->iterators.xl;
    982         const int y = c->iterators.y;
    983         texture_t& tx = c->state.texture[0];
    984         const int32_t u = (tx.shade.is0>>16) + x;
    985         const int32_t v = (tx.shade.it0>>16) + y;
    986         m_src = reinterpret_cast<uint16_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
    987     }
    988     uint16_t  get_pixel16() {
    989         return *m_src++;
    990     }
    991 protected:
    992     uint16_t* m_src;
    993 };
    994 
    995 /* A clamp iterator is used to iterate inside a texture with GGL_CLAMP.
    996  * After initialization, call get_src16() or get_src32() to get the current
    997  * texture pixel value.
    998  */
    999 struct clamp_iterator {
   1000     explicit clamp_iterator(context_t* c) {
   1001         const int xs = c->iterators.xl;
   1002         texture_t& tx = c->state.texture[0];
   1003         texture_iterators_t& ti = tx.iterators;
   1004         m_s = (xs * ti.dsdx) + ti.ydsdy;
   1005         m_t = (xs * ti.dtdx) + ti.ydtdy;
   1006         m_ds = ti.dsdx;
   1007         m_dt = ti.dtdx;
   1008         m_width_m1 = tx.surface.width - 1;
   1009         m_height_m1 = tx.surface.height - 1;
   1010         m_data = tx.surface.data;
   1011         m_stride = tx.surface.stride;
   1012     }
   1013     uint16_t get_pixel16() {
   1014         int  u, v;
   1015         get_uv(u, v);
   1016         uint16_t* src = reinterpret_cast<uint16_t*>(m_data) + (u + (m_stride*v));
   1017         return src[0];
   1018     }
   1019     uint32_t get_pixel32() {
   1020         int  u, v;
   1021         get_uv(u, v);
   1022         uint32_t* src = reinterpret_cast<uint32_t*>(m_data) + (u + (m_stride*v));
   1023         return src[0];
   1024     }
   1025 private:
   1026     void   get_uv(int& u, int& v) {
   1027         int  uu = m_s >> 16;
   1028         int  vv = m_t >> 16;
   1029         if (uu < 0)
   1030             uu = 0;
   1031         if (uu > m_width_m1)
   1032             uu = m_width_m1;
   1033         if (vv < 0)
   1034             vv = 0;
   1035         if (vv > m_height_m1)
   1036             vv = m_height_m1;
   1037         u = uu;
   1038         v = vv;
   1039         m_s += m_ds;
   1040         m_t += m_dt;
   1041     }
   1042 
   1043     GGLfixed  m_s, m_t;
   1044     GGLfixed  m_ds, m_dt;
   1045     int       m_width_m1, m_height_m1;
   1046     uint8_t*  m_data;
   1047     int       m_stride;
   1048 };
   1049 
   1050 /*
   1051  * The 'horizontal clamp iterator' variant corresponds to the case where
   1052  * the 'v' coordinate doesn't change. This is useful to avoid one mult and
   1053  * extra adds / checks per pixels, if the blending/processing operation after
   1054  * this is very fast.
   1055  */
   1056 static int is_context_horizontal(const context_t* c) {
   1057     return (c->state.texture[0].iterators.dtdx == 0);
   1058 }
   1059 
   1060 struct horz_clamp_iterator {
   1061     uint16_t  get_pixel16() {
   1062         int  u = m_s >> 16;
   1063         m_s += m_ds;
   1064         if (u < 0)
   1065             u = 0;
   1066         if (u > m_width_m1)
   1067             u = m_width_m1;
   1068         const uint16_t* src = reinterpret_cast<const uint16_t*>(m_data);
   1069         return src[u];
   1070     }
   1071     uint32_t  get_pixel32() {
   1072         int  u = m_s >> 16;
   1073         m_s += m_ds;
   1074         if (u < 0)
   1075             u = 0;
   1076         if (u > m_width_m1)
   1077             u = m_width_m1;
   1078         const uint32_t* src = reinterpret_cast<const uint32_t*>(m_data);
   1079         return src[u];
   1080     }
   1081 protected:
   1082     void init(const context_t* c, int shift);
   1083     GGLfixed       m_s;
   1084     GGLfixed       m_ds;
   1085     int            m_width_m1;
   1086     const uint8_t* m_data;
   1087 };
   1088 
   1089 void horz_clamp_iterator::init(const context_t* c, int shift)
   1090 {
   1091     const int xs = c->iterators.xl;
   1092     const texture_t& tx = c->state.texture[0];
   1093     const texture_iterators_t& ti = tx.iterators;
   1094     m_s = (xs * ti.dsdx) + ti.ydsdy;
   1095     m_ds = ti.dsdx;
   1096     m_width_m1 = tx.surface.width-1;
   1097     m_data = tx.surface.data;
   1098 
   1099     GGLfixed t = (xs * ti.dtdx) + ti.ydtdy;
   1100     int      v = t >> 16;
   1101     if (v < 0)
   1102         v = 0;
   1103     else if (v >= (int)tx.surface.height)
   1104         v = (int)tx.surface.height-1;
   1105 
   1106     m_data += (tx.surface.stride*v) << shift;
   1107 }
   1108 
   1109 struct horz_clamp_iterator16 : horz_clamp_iterator {
   1110     explicit horz_clamp_iterator16(const context_t* c) {
   1111         init(c,1);
   1112     };
   1113 };
   1114 
   1115 struct horz_clamp_iterator32 : horz_clamp_iterator {
   1116     explicit horz_clamp_iterator32(context_t* c) {
   1117         init(c,2);
   1118     };
   1119 };
   1120 
   1121 /* This is used to perform dithering operations.
   1122  */
   1123 struct ditherer {
   1124     explicit ditherer(const context_t* c) {
   1125         const int x = c->iterators.xl;
   1126         const int y = c->iterators.y;
   1127         m_line = &c->ditherMatrix[ ((y & GGL_DITHER_MASK)<<GGL_DITHER_ORDER_SHIFT) ];
   1128         m_index = x & GGL_DITHER_MASK;
   1129     }
   1130     void step(void) {
   1131         m_index++;
   1132     }
   1133     int  get_value(void) {
   1134         int ret = m_line[m_index & GGL_DITHER_MASK];
   1135         m_index++;
   1136         return ret;
   1137     }
   1138     uint16_t abgr8888ToRgb565(uint32_t s) {
   1139         uint32_t r = s & 0xff;
   1140         uint32_t g = (s >> 8) & 0xff;
   1141         uint32_t b = (s >> 16) & 0xff;
   1142         return rgb888ToRgb565(r,g,b);
   1143     }
   1144     /* The following assumes that r/g/b are in the 0..255 range each */
   1145     uint16_t rgb888ToRgb565(uint32_t& r, uint32_t& g, uint32_t &b) {
   1146         int threshold = get_value();
   1147         /* dither in on GGL_DITHER_BITS, and each of r, g, b is on 8 bits */
   1148         r += (threshold >> (GGL_DITHER_BITS-8 +5));
   1149         g += (threshold >> (GGL_DITHER_BITS-8 +6));
   1150         b += (threshold >> (GGL_DITHER_BITS-8 +5));
   1151         if (r > 0xff)
   1152             r = 0xff;
   1153         if (g > 0xff)
   1154             g = 0xff;
   1155         if (b > 0xff)
   1156             b = 0xff;
   1157         return uint16_t(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
   1158     }
   1159 protected:
   1160     const uint8_t* m_line;
   1161     int            m_index;
   1162 };
   1163 
   1164 /* This structure is used to blend (SRC_OVER) 32-bit source pixels
   1165  * onto 16-bit destination ones. Usage is simply:
   1166  *
   1167  *   blender.blend(<32-bit-src-pixel-value>,<ptr-to-16-bit-dest-pixel>)
   1168  */
   1169 struct blender_32to16 {
   1170     explicit blender_32to16(context_t* /*c*/) { }
   1171     void write(uint32_t s, uint16_t* dst) {
   1172         if (s == 0)
   1173             return;
   1174         s = GGL_RGBA_TO_HOST(s);
   1175         int sA = (s>>24);
   1176         if (sA == 0xff) {
   1177             *dst = convertAbgr8888ToRgb565(s);
   1178         } else {
   1179             int f = 0x100 - (sA + (sA>>7));
   1180             int sR = (s >> (   3))&0x1F;
   1181             int sG = (s >> ( 8+2))&0x3F;
   1182             int sB = (s >> (16+3))&0x1F;
   1183             uint16_t d = *dst;
   1184             int dR = (d>>11)&0x1f;
   1185             int dG = (d>>5)&0x3f;
   1186             int dB = (d)&0x1f;
   1187             sR += (f*dR)>>8;
   1188             sG += (f*dG)>>8;
   1189             sB += (f*dB)>>8;
   1190             *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1191         }
   1192     }
   1193     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1194         if (s == 0) {
   1195             di.step();
   1196             return;
   1197         }
   1198         s = GGL_RGBA_TO_HOST(s);
   1199         int sA = (s>>24);
   1200         if (sA == 0xff) {
   1201             *dst = di.abgr8888ToRgb565(s);
   1202         } else {
   1203             int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1204             int f = 0x100 - (sA + (sA>>7));
   1205             int sR = (s >> (   3))&0x1F;
   1206             int sG = (s >> ( 8+2))&0x3F;
   1207             int sB = (s >> (16+3))&0x1F;
   1208             uint16_t d = *dst;
   1209             int dR = (d>>11)&0x1f;
   1210             int dG = (d>>5)&0x3f;
   1211             int dB = (d)&0x1f;
   1212             sR = ((sR << 8) + f*dR + threshold)>>8;
   1213             sG = ((sG << 8) + f*dG + threshold)>>8;
   1214             sB = ((sB << 8) + f*dB + threshold)>>8;
   1215             if (sR > 0x1f) sR = 0x1f;
   1216             if (sG > 0x3f) sG = 0x3f;
   1217             if (sB > 0x1f) sB = 0x1f;
   1218             *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1219         }
   1220     }
   1221 };
   1222 
   1223 /* This blender does the same for the 'blend_srca' operation.
   1224  * where dstFactor=srcA*(1-srcA) srcFactor=srcA
   1225  */
   1226 struct blender_32to16_srcA {
   1227     explicit blender_32to16_srcA(const context_t* /*c*/) { }
   1228     void write(uint32_t s, uint16_t* dst) {
   1229         if (!s) {
   1230             return;
   1231         }
   1232         uint16_t d = *dst;
   1233         s = GGL_RGBA_TO_HOST(s);
   1234         int sR = (s >> (   3))&0x1F;
   1235         int sG = (s >> ( 8+2))&0x3F;
   1236         int sB = (s >> (16+3))&0x1F;
   1237         int sA = (s>>24);
   1238         int f1 = (sA + (sA>>7));
   1239         int f2 = 0x100-f1;
   1240         int dR = (d>>11)&0x1f;
   1241         int dG = (d>>5)&0x3f;
   1242         int dB = (d)&0x1f;
   1243         sR = (f1*sR + f2*dR)>>8;
   1244         sG = (f1*sG + f2*dG)>>8;
   1245         sB = (f1*sB + f2*dB)>>8;
   1246         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1247     }
   1248 };
   1249 
   1250 /* Common init code the modulating blenders */
   1251 struct blender_modulate {
   1252     void init(const context_t* c) {
   1253         const int r = c->iterators.ydrdy >> (GGL_COLOR_BITS-8);
   1254         const int g = c->iterators.ydgdy >> (GGL_COLOR_BITS-8);
   1255         const int b = c->iterators.ydbdy >> (GGL_COLOR_BITS-8);
   1256         const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
   1257         m_r = r + (r >> 7);
   1258         m_g = g + (g >> 7);
   1259         m_b = b + (b >> 7);
   1260         m_a = a + (a >> 7);
   1261     }
   1262 protected:
   1263     int m_r, m_g, m_b, m_a;
   1264 };
   1265 
   1266 /* This blender does a normal blend after modulation.
   1267  */
   1268 struct blender_32to16_modulate : blender_modulate {
   1269     explicit blender_32to16_modulate(const context_t* c) {
   1270         init(c);
   1271     }
   1272     void write(uint32_t s, uint16_t* dst) {
   1273         // blend source and destination
   1274         if (!s) {
   1275             return;
   1276         }
   1277         s = GGL_RGBA_TO_HOST(s);
   1278 
   1279         /* We need to modulate s */
   1280         uint32_t  sA = (s >> 24);
   1281         uint32_t  sB = (s >> 16) & 0xff;
   1282         uint32_t  sG = (s >> 8) & 0xff;
   1283         uint32_t  sR = s & 0xff;
   1284 
   1285         sA = (sA*m_a) >> 8;
   1286         /* Keep R/G/B scaled to 5.8 or 6.8 fixed float format */
   1287         sR = (sR*m_r) >> (8 - 5);
   1288         sG = (sG*m_g) >> (8 - 6);
   1289         sB = (sB*m_b) >> (8 - 5);
   1290 
   1291         /* Now do a normal blend */
   1292         int f = 0x100 - (sA + (sA>>7));
   1293         uint16_t d = *dst;
   1294         int dR = (d>>11)&0x1f;
   1295         int dG = (d>>5)&0x3f;
   1296         int dB = (d)&0x1f;
   1297         sR = (sR + f*dR)>>8;
   1298         sG = (sG + f*dG)>>8;
   1299         sB = (sB + f*dB)>>8;
   1300         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1301     }
   1302     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1303         // blend source and destination
   1304         if (!s) {
   1305             di.step();
   1306             return;
   1307         }
   1308         s = GGL_RGBA_TO_HOST(s);
   1309 
   1310         /* We need to modulate s */
   1311         uint32_t  sA = (s >> 24);
   1312         uint32_t  sB = (s >> 16) & 0xff;
   1313         uint32_t  sG = (s >> 8) & 0xff;
   1314         uint32_t  sR = s & 0xff;
   1315 
   1316         sA = (sA*m_a) >> 8;
   1317         /* keep R/G/B scaled to 5.8 or 6.8 fixed float format */
   1318         sR = (sR*m_r) >> (8 - 5);
   1319         sG = (sG*m_g) >> (8 - 6);
   1320         sB = (sB*m_b) >> (8 - 5);
   1321 
   1322         /* Scale threshold to 0.8 fixed float format */
   1323         int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1324         int f = 0x100 - (sA + (sA>>7));
   1325         uint16_t d = *dst;
   1326         int dR = (d>>11)&0x1f;
   1327         int dG = (d>>5)&0x3f;
   1328         int dB = (d)&0x1f;
   1329         sR = (sR + f*dR + threshold)>>8;
   1330         sG = (sG + f*dG + threshold)>>8;
   1331         sB = (sB + f*dB + threshold)>>8;
   1332         if (sR > 0x1f) sR = 0x1f;
   1333         if (sG > 0x3f) sG = 0x3f;
   1334         if (sB > 0x1f) sB = 0x1f;
   1335         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1336     }
   1337 };
   1338 
   1339 /* same as 32to16_modulate, except that the input is xRGB, instead of ARGB */
   1340 struct blender_x32to16_modulate : blender_modulate {
   1341     explicit blender_x32to16_modulate(const context_t* c) {
   1342         init(c);
   1343     }
   1344     void write(uint32_t s, uint16_t* dst) {
   1345         s = GGL_RGBA_TO_HOST(s);
   1346 
   1347         uint32_t  sB = (s >> 16) & 0xff;
   1348         uint32_t  sG = (s >> 8) & 0xff;
   1349         uint32_t  sR = s & 0xff;
   1350 
   1351         /* Keep R/G/B in 5.8 or 6.8 format */
   1352         sR = (sR*m_r) >> (8 - 5);
   1353         sG = (sG*m_g) >> (8 - 6);
   1354         sB = (sB*m_b) >> (8 - 5);
   1355 
   1356         int f = 0x100 - m_a;
   1357         uint16_t d = *dst;
   1358         int dR = (d>>11)&0x1f;
   1359         int dG = (d>>5)&0x3f;
   1360         int dB = (d)&0x1f;
   1361         sR = (sR + f*dR)>>8;
   1362         sG = (sG + f*dG)>>8;
   1363         sB = (sB + f*dB)>>8;
   1364         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1365     }
   1366     void write(uint32_t s, uint16_t* dst, ditherer& di) {
   1367         s = GGL_RGBA_TO_HOST(s);
   1368 
   1369         uint32_t  sB = (s >> 16) & 0xff;
   1370         uint32_t  sG = (s >> 8) & 0xff;
   1371         uint32_t  sR = s & 0xff;
   1372 
   1373         sR = (sR*m_r) >> (8 - 5);
   1374         sG = (sG*m_g) >> (8 - 6);
   1375         sB = (sB*m_b) >> (8 - 5);
   1376 
   1377         /* Now do a normal blend */
   1378         int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
   1379         int f = 0x100 - m_a;
   1380         uint16_t d = *dst;
   1381         int dR = (d>>11)&0x1f;
   1382         int dG = (d>>5)&0x3f;
   1383         int dB = (d)&0x1f;
   1384         sR = (sR + f*dR + threshold)>>8;
   1385         sG = (sG + f*dG + threshold)>>8;
   1386         sB = (sB + f*dB + threshold)>>8;
   1387         if (sR > 0x1f) sR = 0x1f;
   1388         if (sG > 0x3f) sG = 0x3f;
   1389         if (sB > 0x1f) sB = 0x1f;
   1390         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1391     }
   1392 };
   1393 
   1394 /* Same as above, but source is 16bit rgb565 */
   1395 struct blender_16to16_modulate : blender_modulate {
   1396     explicit blender_16to16_modulate(const context_t* c) {
   1397         init(c);
   1398     }
   1399     void write(uint16_t s16, uint16_t* dst) {
   1400         uint32_t  s = s16;
   1401 
   1402         uint32_t  sR = s >> 11;
   1403         uint32_t  sG = (s >> 5) & 0x3f;
   1404         uint32_t  sB = s & 0x1f;
   1405 
   1406         sR = (sR*m_r);
   1407         sG = (sG*m_g);
   1408         sB = (sB*m_b);
   1409 
   1410         int f = 0x100 - m_a;
   1411         uint16_t d = *dst;
   1412         int dR = (d>>11)&0x1f;
   1413         int dG = (d>>5)&0x3f;
   1414         int dB = (d)&0x1f;
   1415         sR = (sR + f*dR)>>8;
   1416         sG = (sG + f*dG)>>8;
   1417         sB = (sB + f*dB)>>8;
   1418         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
   1419     }
   1420 };
   1421 
   1422 /* This is used to iterate over a 16-bit destination color buffer.
   1423  * Usage is:
   1424  *
   1425  *   dst_iterator16  di(context);
   1426  *   while (di.count--) {
   1427  *       <do stuff with dest pixel at di.dst>
   1428  *       di.dst++;
   1429  *   }
   1430  */
   1431 struct dst_iterator16 {
   1432     explicit dst_iterator16(const context_t* c) {
   1433         const int x = c->iterators.xl;
   1434         const int width = c->iterators.xr - x;
   1435         const int32_t y = c->iterators.y;
   1436         const surface_t* cb = &(c->state.buffers.color);
   1437         count = width;
   1438         dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   1439     }
   1440     int        count;
   1441     uint16_t*  dst;
   1442 };
   1443 
   1444 
   1445 static void scanline_t32cb16_clamp(context_t* c)
   1446 {
   1447     dst_iterator16  di(c);
   1448 
   1449     if (is_context_horizontal(c)) {
   1450         /* Special case for simple horizontal scaling */
   1451         horz_clamp_iterator32 ci(c);
   1452         while (di.count--) {
   1453             uint32_t s = ci.get_pixel32();
   1454             *di.dst++ = convertAbgr8888ToRgb565(s);
   1455         }
   1456     } else {
   1457         /* General case */
   1458         clamp_iterator ci(c);
   1459         while (di.count--) {
   1460             uint32_t s = ci.get_pixel32();
   1461             *di.dst++ = convertAbgr8888ToRgb565(s);
   1462         }
   1463     }
   1464 }
   1465 
   1466 static void scanline_t32cb16_dither(context_t* c)
   1467 {
   1468     horz_iterator32 si(c);
   1469     dst_iterator16  di(c);
   1470     ditherer        dither(c);
   1471 
   1472     while (di.count--) {
   1473         uint32_t s = si.get_pixel32();
   1474         *di.dst++ = dither.abgr8888ToRgb565(s);
   1475     }
   1476 }
   1477 
   1478 static void scanline_t32cb16_clamp_dither(context_t* c)
   1479 {
   1480     dst_iterator16  di(c);
   1481     ditherer        dither(c);
   1482 
   1483     if (is_context_horizontal(c)) {
   1484         /* Special case for simple horizontal scaling */
   1485         horz_clamp_iterator32 ci(c);
   1486         while (di.count--) {
   1487             uint32_t s = ci.get_pixel32();
   1488             *di.dst++ = dither.abgr8888ToRgb565(s);
   1489         }
   1490     } else {
   1491         /* General case */
   1492         clamp_iterator ci(c);
   1493         while (di.count--) {
   1494             uint32_t s = ci.get_pixel32();
   1495             *di.dst++ = dither.abgr8888ToRgb565(s);
   1496         }
   1497     }
   1498 }
   1499 
   1500 static void scanline_t32cb16blend_dither(context_t* c)
   1501 {
   1502     dst_iterator16 di(c);
   1503     ditherer       dither(c);
   1504     blender_32to16 bl(c);
   1505     horz_iterator32  hi(c);
   1506     while (di.count--) {
   1507         uint32_t s = hi.get_pixel32();
   1508         bl.write(s, di.dst, dither);
   1509         di.dst++;
   1510     }
   1511 }
   1512 
   1513 static void scanline_t32cb16blend_clamp(context_t* c)
   1514 {
   1515     dst_iterator16  di(c);
   1516     blender_32to16  bl(c);
   1517 
   1518     if (is_context_horizontal(c)) {
   1519         horz_clamp_iterator32 ci(c);
   1520         while (di.count--) {
   1521             uint32_t s = ci.get_pixel32();
   1522             bl.write(s, di.dst);
   1523             di.dst++;
   1524         }
   1525     } else {
   1526         clamp_iterator ci(c);
   1527         while (di.count--) {
   1528             uint32_t s = ci.get_pixel32();
   1529             bl.write(s, di.dst);
   1530             di.dst++;
   1531         }
   1532     }
   1533 }
   1534 
   1535 static void scanline_t32cb16blend_clamp_dither(context_t* c)
   1536 {
   1537     dst_iterator16 di(c);
   1538     ditherer       dither(c);
   1539     blender_32to16 bl(c);
   1540 
   1541     clamp_iterator ci(c);
   1542     while (di.count--) {
   1543         uint32_t s = ci.get_pixel32();
   1544         bl.write(s, di.dst, dither);
   1545         di.dst++;
   1546     }
   1547 }
   1548 
   1549 void scanline_t32cb16blend_clamp_mod(context_t* c)
   1550 {
   1551     dst_iterator16 di(c);
   1552     blender_32to16_modulate bl(c);
   1553 
   1554     clamp_iterator ci(c);
   1555     while (di.count--) {
   1556         uint32_t s = ci.get_pixel32();
   1557         bl.write(s, di.dst);
   1558         di.dst++;
   1559     }
   1560 }
   1561 
   1562 void scanline_t32cb16blend_clamp_mod_dither(context_t* c)
   1563 {
   1564     dst_iterator16 di(c);
   1565     blender_32to16_modulate bl(c);
   1566     ditherer dither(c);
   1567 
   1568     clamp_iterator ci(c);
   1569     while (di.count--) {
   1570         uint32_t s = ci.get_pixel32();
   1571         bl.write(s, di.dst, dither);
   1572         di.dst++;
   1573     }
   1574 }
   1575 
   1576 /* Variant of scanline_t32cb16blend_clamp_mod with a xRGB texture */
   1577 void scanline_x32cb16blend_clamp_mod(context_t* c)
   1578 {
   1579     dst_iterator16 di(c);
   1580     blender_x32to16_modulate  bl(c);
   1581 
   1582     clamp_iterator ci(c);
   1583     while (di.count--) {
   1584         uint32_t s = ci.get_pixel32();
   1585         bl.write(s, di.dst);
   1586         di.dst++;
   1587     }
   1588 }
   1589 
   1590 void scanline_x32cb16blend_clamp_mod_dither(context_t* c)
   1591 {
   1592     dst_iterator16 di(c);
   1593     blender_x32to16_modulate  bl(c);
   1594     ditherer dither(c);
   1595 
   1596     clamp_iterator ci(c);
   1597     while (di.count--) {
   1598         uint32_t s = ci.get_pixel32();
   1599         bl.write(s, di.dst, dither);
   1600         di.dst++;
   1601     }
   1602 }
   1603 
   1604 void scanline_t16cb16_clamp(context_t* c)
   1605 {
   1606     dst_iterator16  di(c);
   1607 
   1608     /* Special case for simple horizontal scaling */
   1609     if (is_context_horizontal(c)) {
   1610         horz_clamp_iterator16 ci(c);
   1611         while (di.count--) {
   1612             *di.dst++ = ci.get_pixel16();
   1613         }
   1614     } else {
   1615         clamp_iterator ci(c);
   1616         while (di.count--) {
   1617             *di.dst++ = ci.get_pixel16();
   1618         }
   1619     }
   1620 }
   1621 
   1622 
   1623 
   1624 template <typename T, typename U>
   1625 static inline __attribute__((const))
   1626 T interpolate(int y, T v0, U dvdx, U dvdy) {
   1627     // interpolates in pixel's centers
   1628     // v = v0 + (y + 0.5) * dvdy + (0.5 * dvdx)
   1629     return (y * dvdy) + (v0 + ((dvdy + dvdx) >> 1));
   1630 }
   1631 
   1632 // ----------------------------------------------------------------------------
   1633 #if 0
   1634 #pragma mark -
   1635 #endif
   1636 
   1637 void init_y(context_t* c, int32_t ys)
   1638 {
   1639     const uint32_t enables = c->state.enables;
   1640 
   1641     // compute iterators...
   1642     iterators_t& ci = c->iterators;
   1643 
   1644     // sample in the center
   1645     ci.y = ys;
   1646 
   1647     if (enables & (GGL_ENABLE_DEPTH_TEST|GGL_ENABLE_W|GGL_ENABLE_FOG)) {
   1648         ci.ydzdy = interpolate(ys, c->shade.z0, c->shade.dzdx, c->shade.dzdy);
   1649         ci.ydwdy = interpolate(ys, c->shade.w0, c->shade.dwdx, c->shade.dwdy);
   1650         ci.ydfdy = interpolate(ys, c->shade.f0, c->shade.dfdx, c->shade.dfdy);
   1651     }
   1652 
   1653     if (ggl_unlikely(enables & GGL_ENABLE_SMOOTH)) {
   1654         ci.ydrdy = interpolate(ys, c->shade.r0, c->shade.drdx, c->shade.drdy);
   1655         ci.ydgdy = interpolate(ys, c->shade.g0, c->shade.dgdx, c->shade.dgdy);
   1656         ci.ydbdy = interpolate(ys, c->shade.b0, c->shade.dbdx, c->shade.dbdy);
   1657         ci.ydady = interpolate(ys, c->shade.a0, c->shade.dadx, c->shade.dady);
   1658         c->step_y = step_y__smooth;
   1659     } else {
   1660         ci.ydrdy = c->shade.r0;
   1661         ci.ydgdy = c->shade.g0;
   1662         ci.ydbdy = c->shade.b0;
   1663         ci.ydady = c->shade.a0;
   1664         // XXX: do only if needed, or make sure this is fast
   1665         c->packed = ggl_pack_color(c, c->state.buffers.color.format,
   1666                 ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
   1667         c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
   1668                 ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
   1669     }
   1670 
   1671     // initialize the variables we need in the shader
   1672     generated_vars_t& gen = c->generated_vars;
   1673     gen.argb[GGLFormat::ALPHA].c  = ci.ydady;
   1674     gen.argb[GGLFormat::ALPHA].dx = c->shade.dadx;
   1675     gen.argb[GGLFormat::RED  ].c  = ci.ydrdy;
   1676     gen.argb[GGLFormat::RED  ].dx = c->shade.drdx;
   1677     gen.argb[GGLFormat::GREEN].c  = ci.ydgdy;
   1678     gen.argb[GGLFormat::GREEN].dx = c->shade.dgdx;
   1679     gen.argb[GGLFormat::BLUE ].c  = ci.ydbdy;
   1680     gen.argb[GGLFormat::BLUE ].dx = c->shade.dbdx;
   1681     gen.dzdx = c->shade.dzdx;
   1682     gen.f    = ci.ydfdy;
   1683     gen.dfdx = c->shade.dfdx;
   1684 
   1685     if (enables & GGL_ENABLE_TMUS) {
   1686         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1687             texture_t& t = c->state.texture[i];
   1688             if (!t.enable) continue;
   1689 
   1690             texture_iterators_t& ti = t.iterators;
   1691             if (t.s_coord == GGL_ONE_TO_ONE && t.t_coord == GGL_ONE_TO_ONE) {
   1692                 // we need to set all of these to 0 because in some cases
   1693                 // step_y__generic() or step_y__tmu() will be used and
   1694                 // therefore will update dtdy, however, in 1:1 mode
   1695                 // this is always done by the scanline rasterizer.
   1696                 ti.dsdx = ti.dsdy = ti.dtdx = ti.dtdy = 0;
   1697                 ti.ydsdy = t.shade.is0;
   1698                 ti.ydtdy = t.shade.it0;
   1699             } else {
   1700                 const int adjustSWrap = ((t.s_wrap==GGL_CLAMP)?0:16);
   1701                 const int adjustTWrap = ((t.t_wrap==GGL_CLAMP)?0:16);
   1702                 ti.sscale = t.shade.sscale + adjustSWrap;
   1703                 ti.tscale = t.shade.tscale + adjustTWrap;
   1704                 if (!(enables & GGL_ENABLE_W)) {
   1705                     // S coordinate
   1706                     const int32_t sscale = ti.sscale;
   1707                     const int32_t sy = interpolate(ys,
   1708                             t.shade.is0, t.shade.idsdx, t.shade.idsdy);
   1709                     if (sscale>=0) {
   1710                         ti.ydsdy= sy            << sscale;
   1711                         ti.dsdx = t.shade.idsdx << sscale;
   1712                         ti.dsdy = t.shade.idsdy << sscale;
   1713                     } else {
   1714                         ti.ydsdy= sy            >> -sscale;
   1715                         ti.dsdx = t.shade.idsdx >> -sscale;
   1716                         ti.dsdy = t.shade.idsdy >> -sscale;
   1717                     }
   1718                     // T coordinate
   1719                     const int32_t tscale = ti.tscale;
   1720                     const int32_t ty = interpolate(ys,
   1721                             t.shade.it0, t.shade.idtdx, t.shade.idtdy);
   1722                     if (tscale>=0) {
   1723                         ti.ydtdy= ty            << tscale;
   1724                         ti.dtdx = t.shade.idtdx << tscale;
   1725                         ti.dtdy = t.shade.idtdy << tscale;
   1726                     } else {
   1727                         ti.ydtdy= ty            >> -tscale;
   1728                         ti.dtdx = t.shade.idtdx >> -tscale;
   1729                         ti.dtdy = t.shade.idtdy >> -tscale;
   1730                     }
   1731                 }
   1732             }
   1733             // mirror for generated code...
   1734             generated_tex_vars_t& gen = c->generated_vars.texture[i];
   1735             gen.width   = t.surface.width;
   1736             gen.height  = t.surface.height;
   1737             gen.stride  = t.surface.stride;
   1738             gen.data    = uintptr_t(t.surface.data);
   1739             gen.dsdx = ti.dsdx;
   1740             gen.dtdx = ti.dtdx;
   1741         }
   1742     }
   1743 
   1744     // choose the y-stepper
   1745     c->step_y = step_y__nop;
   1746     if (enables & GGL_ENABLE_FOG) {
   1747         c->step_y = step_y__generic;
   1748     } else if (enables & GGL_ENABLE_TMUS) {
   1749         if (enables & GGL_ENABLE_SMOOTH) {
   1750             c->step_y = step_y__generic;
   1751         } else if (enables & GGL_ENABLE_W) {
   1752             c->step_y = step_y__w;
   1753         } else {
   1754             c->step_y = step_y__tmu;
   1755         }
   1756     } else {
   1757         if (enables & GGL_ENABLE_SMOOTH) {
   1758             c->step_y = step_y__smooth;
   1759         }
   1760     }
   1761 
   1762     // choose the rectangle blitter
   1763     c->rect = rect_generic;
   1764     if ((c->step_y == step_y__nop) &&
   1765         (c->scanline == scanline_memcpy))
   1766     {
   1767         c->rect = rect_memcpy;
   1768     }
   1769 }
   1770 
   1771 void init_y_packed(context_t* c, int32_t y0)
   1772 {
   1773     uint8_t f = c->state.buffers.color.format;
   1774     c->packed = ggl_pack_color(c, f,
   1775             c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
   1776     c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
   1777             c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
   1778     c->iterators.y = y0;
   1779     c->step_y = step_y__nop;
   1780     // choose the rectangle blitter
   1781     c->rect = rect_generic;
   1782     if (c->scanline == scanline_memcpy) {
   1783         c->rect = rect_memcpy;
   1784     }
   1785 }
   1786 
   1787 void init_y_noop(context_t* c, int32_t y0)
   1788 {
   1789     c->iterators.y = y0;
   1790     c->step_y = step_y__nop;
   1791     // choose the rectangle blitter
   1792     c->rect = rect_generic;
   1793     if (c->scanline == scanline_memcpy) {
   1794         c->rect = rect_memcpy;
   1795     }
   1796 }
   1797 
   1798 void init_y_error(context_t* c, int32_t y0)
   1799 {
   1800     // woooops, shoud never happen,
   1801     // fail gracefully (don't display anything)
   1802     init_y_noop(c, y0);
   1803     ALOGE("color-buffer has an invalid format!");
   1804 }
   1805 
   1806 // ----------------------------------------------------------------------------
   1807 #if 0
   1808 #pragma mark -
   1809 #endif
   1810 
   1811 void step_y__generic(context_t* c)
   1812 {
   1813     const uint32_t enables = c->state.enables;
   1814 
   1815     // iterate...
   1816     iterators_t& ci = c->iterators;
   1817     ci.y += 1;
   1818 
   1819     if (enables & GGL_ENABLE_SMOOTH) {
   1820         ci.ydrdy += c->shade.drdy;
   1821         ci.ydgdy += c->shade.dgdy;
   1822         ci.ydbdy += c->shade.dbdy;
   1823         ci.ydady += c->shade.dady;
   1824     }
   1825 
   1826     const uint32_t mask =
   1827             GGL_ENABLE_DEPTH_TEST |
   1828             GGL_ENABLE_W |
   1829             GGL_ENABLE_FOG;
   1830     if (enables & mask) {
   1831         ci.ydzdy += c->shade.dzdy;
   1832         ci.ydwdy += c->shade.dwdy;
   1833         ci.ydfdy += c->shade.dfdy;
   1834     }
   1835 
   1836     if ((enables & GGL_ENABLE_TMUS) && (!(enables & GGL_ENABLE_W))) {
   1837         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1838             if (c->state.texture[i].enable) {
   1839                 texture_iterators_t& ti = c->state.texture[i].iterators;
   1840                 ti.ydsdy += ti.dsdy;
   1841                 ti.ydtdy += ti.dtdy;
   1842             }
   1843         }
   1844     }
   1845 }
   1846 
   1847 void step_y__nop(context_t* c)
   1848 {
   1849     c->iterators.y += 1;
   1850     c->iterators.ydzdy += c->shade.dzdy;
   1851 }
   1852 
   1853 void step_y__smooth(context_t* c)
   1854 {
   1855     iterators_t& ci = c->iterators;
   1856     ci.y += 1;
   1857     ci.ydrdy += c->shade.drdy;
   1858     ci.ydgdy += c->shade.dgdy;
   1859     ci.ydbdy += c->shade.dbdy;
   1860     ci.ydady += c->shade.dady;
   1861     ci.ydzdy += c->shade.dzdy;
   1862 }
   1863 
   1864 void step_y__w(context_t* c)
   1865 {
   1866     iterators_t& ci = c->iterators;
   1867     ci.y += 1;
   1868     ci.ydzdy += c->shade.dzdy;
   1869     ci.ydwdy += c->shade.dwdy;
   1870 }
   1871 
   1872 void step_y__tmu(context_t* c)
   1873 {
   1874     iterators_t& ci = c->iterators;
   1875     ci.y += 1;
   1876     ci.ydzdy += c->shade.dzdy;
   1877     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
   1878         if (c->state.texture[i].enable) {
   1879             texture_iterators_t& ti = c->state.texture[i].iterators;
   1880             ti.ydsdy += ti.dsdy;
   1881             ti.ydtdy += ti.dtdy;
   1882         }
   1883     }
   1884 }
   1885 
   1886 // ----------------------------------------------------------------------------
   1887 #if 0
   1888 #pragma mark -
   1889 #endif
   1890 
/*
 * Renders the current scanline [xl, xr) in sub-spans of at most
 * (1 << SPAN_BITS) pixels. For each sub-span the texture coordinates of
 * every enabled texture unit are evaluated at both ends using
 * gglRecipQ() of the interpolated w, the resulting start value and
 * per-pixel step are stored in the unit's iterators, and c->span(c) is
 * called with c->iterators.xl/xr narrowed to that sub-span.
 *
 * NOTE(review): c->iterators.xl/xr are overwritten per sub-span and are
 * not restored before returning; confirm callers do not rely on them.
 */
void scanline_perspective(context_t* c)
{
    // Per-unit working values. The union exposes the same four words
    // either by name (s, sq, t, tq) or as st[0] = {s, sq}, st[1] = {t, tq}
    // so that the loop below can handle s and t with the same code.
    struct {
        union {
            struct {
                int32_t s, sq;
                int32_t t, tq;
            } sqtq;
            struct {
                int32_t v, q;
            } st[2];
        };
    } tc[GGL_TEXTURE_UNIT_COUNT] __attribute__((aligned(16)));

    // XXX: we should have a special case when dwdx = 0

    // 32 pixels spans works okay. 16 is a lot better,
    // but hey, it's a software renderer...
    const uint32_t SPAN_BITS = 5;
    const uint32_t ys = c->iterators.y;
    const uint32_t xs = c->iterators.xl;
    const uint32_t x1 = c->iterators.xr;
	const uint32_t xc = x1 - xs;
    // pixels left over after the full spans, and the number of full spans
    uint32_t remainder = xc & ((1<<SPAN_BITS)-1);
    uint32_t numSpans = xc >> SPAN_BITS;

    const iterators_t& ci = c->iterators;
    // w at the left edge of the scanline and the value gglRecipQ() derives
    // from it (presumably its fixed-point reciprocal -- TODO confirm)
    int32_t w0 = (xs * c->shade.dwdx) + ci.ydwdy;
    int32_t q0 = gglRecipQ(w0, 30);
    // computed once from the first q0 and reused for the whole scanline
    const int iwscale = 32 - gglClz(q0);

    // w step across one full span
    const int32_t dwdx = c->shade.dwdx << SPAN_BITS;
    int32_t xl = c->iterators.xl;

    // We process s & t with a loop to reduce the code size
    // (and i-cache pressure).

    // Evaluate s and t at the first pixel of the scanline (the
    // (dx + dy) >> 1 term is the same half-step used by interpolate())
    // and their gglMulx() products with q0.
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
        const texture_t& tmu = c->state.texture[i];
        if (!tmu.enable) continue;
        int32_t s =   tmu.shade.is0 +
                     (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
                     ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
        int32_t t =   tmu.shade.it0 +
                     (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
                     ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
        tc[i].sqtq.s  = s;
        tc[i].sqtq.t  = t;
        tc[i].sqtq.sq = gglMulx(s, q0, iwscale);
        tc[i].sqtq.tq = gglMulx(t, q0, iwscale);
    }

    // span == 0 means "full span of (1 << SPAN_BITS) pixels"; it becomes
    // the remainder width for the final, partial span.
    int32_t span = 0;
    do {
        int32_t w1;
        if (ggl_likely(numSpans)) {
            w1 = w0 + dwdx;
        } else {
            if (remainder) {
                // finish off the scanline...
                span = remainder;
                w1 = (c->shade.dwdx * span) + w0;
            } else {
                break;
            }
        }
        // value at the right edge of this sub-span
        int32_t q1 = gglRecipQ(w1, 30);
        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
            texture_t& tmu = c->state.texture[i];
            if (!tmu.enable) continue;
            texture_iterators_t& ti = tmu.iterators;

            // j == 0 handles s, j == 1 handles t
            for (int j=0 ; j<2 ; j++) {
                // advance the linear coordinate to the end of the sub-span
                int32_t v = tc[i].st[j].v;
                if (span)   v += (tmu.shade.st[j].dx)*span;
                else        v += (tmu.shade.st[j].dx)<<SPAN_BITS;
                // v0/v1: gglMulx() products at the start/end of the sub-span
                const int32_t v0 = tc[i].st[j].q;
                const int32_t v1 = gglMulx(v, q1, iwscale);
                // per-pixel step across the sub-span
                int32_t dvdx = v1 - v0;
                if (span)   dvdx /= span;
                else        dvdx >>= SPAN_BITS;
                // the end of this sub-span is the start of the next one
                tc[i].st[j].v = v;
                tc[i].st[j].q = v1;

                const int scale = ti.st[j].scale + (iwscale - 30);
                if (scale >= 0) {
                    ti.st[j].ydvdy = v0   << scale;
                    ti.st[j].dvdx  = dvdx << scale;
                } else {
                    ti.st[j].ydvdy = v0   >> -scale;
                    ti.st[j].dvdx  = dvdx >> -scale;
                }
            }
            // mirror the steps for the generated code
            generated_tex_vars_t& gen = c->generated_vars.texture[i];
            gen.dsdx = ti.st[0].dvdx;
            gen.dtdx = ti.st[1].dvdx;
        }
        // narrow the iterators to this sub-span and draw it
        c->iterators.xl = xl;
        c->iterators.xr = xl = xl + (span ? span : (1<<SPAN_BITS));
        w0 = w1;
        q0 = q1;
        c->span(c);
    } while(numSpans--);
}
   1995 
   1996 void scanline_perspective_single(context_t* c)
   1997 {
   1998     // 32 pixels spans works okay. 16 is a lot better,
   1999     // but hey, it's a software renderer...
   2000     const uint32_t SPAN_BITS = 5;
   2001     const uint32_t ys = c->iterators.y;
   2002     const uint32_t xs = c->iterators.xl;
   2003     const uint32_t x1 = c->iterators.xr;
   2004 	const uint32_t xc = x1 - xs;
   2005 
   2006     const iterators_t& ci = c->iterators;
   2007     int32_t w = (xs * c->shade.dwdx) + ci.ydwdy;
   2008     int32_t iw = gglRecipQ(w, 30);
   2009     const int iwscale = 32 - gglClz(iw);
   2010 
   2011     const int i = 31 - gglClz(c->state.enabled_tmu);
   2012     generated_tex_vars_t& gen = c->generated_vars.texture[i];
   2013     texture_t& tmu = c->state.texture[i];
   2014     texture_iterators_t& ti = tmu.iterators;
   2015     const int sscale = ti.sscale + (iwscale - 30);
   2016     const int tscale = ti.tscale + (iwscale - 30);
   2017     int32_t s =   tmu.shade.is0 +
   2018                  (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
   2019                  ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
   2020     int32_t t =   tmu.shade.it0 +
   2021                  (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
   2022                  ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
   2023     int32_t s0 = gglMulx(s, iw, iwscale);
   2024     int32_t t0 = gglMulx(t, iw, iwscale);
   2025     int32_t xl = c->iterators.xl;
   2026 
   2027     int32_t sq, tq, dsdx, dtdx;
   2028     int32_t premainder = xc & ((1<<SPAN_BITS)-1);
   2029     uint32_t numSpans = xc >> SPAN_BITS;
   2030     if (c->shade.dwdx == 0) {
   2031         // XXX: we could choose to do this if the error is small enough
   2032         numSpans = 0;
   2033         premainder = xc;
   2034         goto no_perspective;
   2035     }
   2036 
   2037     if (premainder) {
   2038         w += c->shade.dwdx   * premainder;
   2039         iw = gglRecipQ(w, 30);
   2040 no_perspective:
   2041         s += tmu.shade.idsdx * premainder;
   2042         t += tmu.shade.idtdx * premainder;
   2043         sq = gglMulx(s, iw, iwscale);
   2044         tq = gglMulx(t, iw, iwscale);
   2045         dsdx = (sq - s0) / premainder;
   2046         dtdx = (tq - t0) / premainder;
   2047         c->iterators.xl = xl;
   2048         c->iterators.xr = xl = xl + premainder;
   2049         goto finish;
   2050     }
   2051 
   2052     while (numSpans--) {
   2053         w += c->shade.dwdx   << SPAN_BITS;
   2054         s += tmu.shade.idsdx << SPAN_BITS;
   2055         t += tmu.shade.idtdx << SPAN_BITS;
   2056         iw = gglRecipQ(w, 30);
   2057         sq = gglMulx(s, iw, iwscale);
   2058         tq = gglMulx(t, iw, iwscale);
   2059         dsdx = (sq - s0) >> SPAN_BITS;
   2060         dtdx = (tq - t0) >> SPAN_BITS;
   2061         c->iterators.xl = xl;
   2062         c->iterators.xr = xl = xl + (1<<SPAN_BITS);
   2063 finish:
   2064         if (sscale >= 0) {
   2065             ti.ydsdy = s0   << sscale;
   2066             ti.dsdx  = dsdx << sscale;
   2067         } else {
   2068             ti.ydsdy = s0   >>-sscale;
   2069             ti.dsdx  = dsdx >>-sscale;
   2070         }
   2071         if (tscale >= 0) {
   2072             ti.ydtdy = t0   << tscale;
   2073             ti.dtdx  = dtdx << tscale;
   2074         } else {
   2075             ti.ydtdy = t0   >>-tscale;
   2076             ti.dtdx  = dtdx >>-tscale;
   2077         }
   2078         s0 = sq;
   2079         t0 = tq;
   2080         gen.dsdx = ti.dsdx;
   2081         gen.dtdx = ti.dtdx;
   2082         c->span(c);
   2083     }
   2084 }
   2085 
   2086 // ----------------------------------------------------------------------------
   2087 
   2088 void scanline_col32cb16blend(context_t* c)
   2089 {
   2090     int32_t x = c->iterators.xl;
   2091     size_t ct = c->iterators.xr - x;
   2092     int32_t y = c->iterators.y;
   2093     surface_t* cb = &(c->state.buffers.color);
   2094     union {
   2095         uint16_t* dst;
   2096         uint32_t* dst32;
   2097     };
   2098     dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2099 
   2100 #if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__))
   2101 #if defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2102     scanline_col32cb16blend_neon(dst, &(c->packed8888), ct);
   2103 #else  // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2104     scanline_col32cb16blend_arm(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
   2105 #endif // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
   2106 #elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__aarch64__))
   2107     scanline_col32cb16blend_arm64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
   2108 #elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__mips__) && defined(__LP64__)))
   2109     scanline_col32cb16blend_mips64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
   2110 #else
   2111     uint32_t s = GGL_RGBA_TO_HOST(c->packed8888);
   2112     int sA = (s>>24);
   2113     int f = 0x100 - (sA + (sA>>7));
   2114     while (ct--) {
   2115         uint16_t d = *dst;
   2116         int dR = (d>>11)&0x1f;
   2117         int dG = (d>>5)&0x3f;
   2118         int dB = (d)&0x1f;
   2119         int sR = (s >> (   3))&0x1F;
   2120         int sG = (s >> ( 8+2))&0x3F;
   2121         int sB = (s >> (16+3))&0x1F;
   2122         sR += (f*dR)>>8;
   2123         sG += (f*dG)>>8;
   2124         sB += (f*dB)>>8;
   2125         *dst++ = uint16_t((sR<<11)|(sG<<5)|sB);
   2126     }
   2127 #endif
   2128 
   2129 }
   2130 
   2131 void scanline_t32cb16(context_t* c)
   2132 {
   2133     int32_t x = c->iterators.xl;
   2134     size_t ct = c->iterators.xr - x;
   2135     int32_t y = c->iterators.y;
   2136     surface_t* cb = &(c->state.buffers.color);
   2137     union {
   2138         uint16_t* dst;
   2139         uint32_t* dst32;
   2140     };
   2141     dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2142 
   2143     surface_t* tex = &(c->state.texture[0].surface);
   2144     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2145     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2146     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
   2147     int sR, sG, sB;
   2148     uint32_t s, d;
   2149 
   2150     if (ct==1 || uintptr_t(dst)&2) {
   2151 last_one:
   2152         s = GGL_RGBA_TO_HOST( *src++ );
   2153         *dst++ = convertAbgr8888ToRgb565(s);
   2154         ct--;
   2155     }
   2156 
   2157     while (ct >= 2) {
   2158 #if BYTE_ORDER == BIG_ENDIAN
   2159         s = GGL_RGBA_TO_HOST( *src++ );
   2160         d = convertAbgr8888ToRgb565_hi16(s);
   2161 
   2162         s = GGL_RGBA_TO_HOST( *src++ );
   2163         d |= convertAbgr8888ToRgb565(s);
   2164 #else
   2165         s = GGL_RGBA_TO_HOST( *src++ );
   2166         d = convertAbgr8888ToRgb565(s);
   2167 
   2168         s = GGL_RGBA_TO_HOST( *src++ );
   2169         d |= convertAbgr8888ToRgb565(s) << 16;
   2170 #endif
   2171         *dst32++ = d;
   2172         ct -= 2;
   2173     }
   2174 
   2175     if (ct > 0) {
   2176         goto last_one;
   2177     }
   2178 }
   2179 
   2180 void scanline_t32cb16blend(context_t* c)
   2181 {
   2182 #if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__arm__) || defined(__aarch64__) || \
   2183     (defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__)))))
   2184     int32_t x = c->iterators.xl;
   2185     size_t ct = c->iterators.xr - x;
   2186     int32_t y = c->iterators.y;
   2187     surface_t* cb = &(c->state.buffers.color);
   2188     uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2189 
   2190     surface_t* tex = &(c->state.texture[0].surface);
   2191     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2192     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2193     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
   2194 
   2195 #ifdef __arm__
   2196     scanline_t32cb16blend_arm(dst, src, ct);
   2197 #elif defined(__aarch64__)
   2198     scanline_t32cb16blend_arm64(dst, src, ct);
   2199 #elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
   2200     scanline_t32cb16blend_mips(dst, src, ct);
   2201 #elif defined(__mips__) && defined(__LP64__)
   2202     scanline_t32cb16blend_mips64(dst, src, ct);
   2203 #endif
   2204 #else
   2205     dst_iterator16  di(c);
   2206     horz_iterator32  hi(c);
   2207     blender_32to16  bl(c);
   2208     while (di.count--) {
   2209         uint32_t s = hi.get_pixel32();
   2210         bl.write(s, di.dst);
   2211         di.dst++;
   2212     }
   2213 #endif
   2214 }
   2215 
   2216 void scanline_t32cb16blend_srca(context_t* c)
   2217 {
   2218     dst_iterator16  di(c);
   2219     horz_iterator32  hi(c);
   2220     blender_32to16_srcA  blender(c);
   2221 
   2222     while (di.count--) {
   2223         uint32_t s = hi.get_pixel32();
   2224         blender.write(s,di.dst);
   2225         di.dst++;
   2226     }
   2227 }
   2228 
   2229 void scanline_t16cb16blend_clamp_mod(context_t* c)
   2230 {
   2231     const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
   2232     if (a == 0) {
   2233         return;
   2234     }
   2235 
   2236     if (a == 255) {
   2237         scanline_t16cb16_clamp(c);
   2238         return;
   2239     }
   2240 
   2241     dst_iterator16  di(c);
   2242     blender_16to16_modulate  blender(c);
   2243     clamp_iterator  ci(c);
   2244 
   2245     while (di.count--) {
   2246         uint16_t s = ci.get_pixel16();
   2247         blender.write(s, di.dst);
   2248         di.dst++;
   2249     }
   2250 }
   2251 
   2252 void scanline_memcpy(context_t* c)
   2253 {
   2254     int32_t x = c->iterators.xl;
   2255     size_t ct = c->iterators.xr - x;
   2256     int32_t y = c->iterators.y;
   2257     surface_t* cb = &(c->state.buffers.color);
   2258     const GGLFormat* fp = &(c->formats[cb->format]);
   2259     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2260                             (x + (cb->stride * y)) * fp->size;
   2261 
   2262     surface_t* tex = &(c->state.texture[0].surface);
   2263     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2264     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2265     uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
   2266                             (u + (tex->stride * v)) * fp->size;
   2267 
   2268     const size_t size = ct * fp->size;
   2269     memcpy(dst, src, size);
   2270 }
   2271 
   2272 void scanline_memset8(context_t* c)
   2273 {
   2274     int32_t x = c->iterators.xl;
   2275     size_t ct = c->iterators.xr - x;
   2276     int32_t y = c->iterators.y;
   2277     surface_t* cb = &(c->state.buffers.color);
   2278     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) + (x+(cb->stride*y));
   2279     uint32_t packed = c->packed;
   2280     memset(dst, packed, ct);
   2281 }
   2282 
   2283 void scanline_memset16(context_t* c)
   2284 {
   2285     int32_t x = c->iterators.xl;
   2286     size_t ct = c->iterators.xr - x;
   2287     int32_t y = c->iterators.y;
   2288     surface_t* cb = &(c->state.buffers.color);
   2289     uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
   2290     uint32_t packed = c->packed;
   2291     android_memset16(dst, packed, ct*2);
   2292 }
   2293 
   2294 void scanline_memset32(context_t* c)
   2295 {
   2296     int32_t x = c->iterators.xl;
   2297     size_t ct = c->iterators.xr - x;
   2298     int32_t y = c->iterators.y;
   2299     surface_t* cb = &(c->state.buffers.color);
   2300     uint32_t* dst = reinterpret_cast<uint32_t*>(cb->data) + (x+(cb->stride*y));
   2301     uint32_t packed = GGL_HOST_TO_RGBA(c->packed);
   2302     android_memset32(dst, packed, ct*4);
   2303 }
   2304 
   2305 void scanline_clear(context_t* c)
   2306 {
   2307     int32_t x = c->iterators.xl;
   2308     size_t ct = c->iterators.xr - x;
   2309     int32_t y = c->iterators.y;
   2310     surface_t* cb = &(c->state.buffers.color);
   2311     const GGLFormat* fp = &(c->formats[cb->format]);
   2312     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2313                             (x + (cb->stride * y)) * fp->size;
   2314     const size_t size = ct * fp->size;
   2315     memset(dst, 0, size);
   2316 }
   2317 
   2318 void scanline_set(context_t* c)
   2319 {
   2320     int32_t x = c->iterators.xl;
   2321     size_t ct = c->iterators.xr - x;
   2322     int32_t y = c->iterators.y;
   2323     surface_t* cb = &(c->state.buffers.color);
   2324     const GGLFormat* fp = &(c->formats[cb->format]);
   2325     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2326                             (x + (cb->stride * y)) * fp->size;
   2327     const size_t size = ct * fp->size;
   2328     memset(dst, 0xFF, size);
   2329 }
   2330 
   2331 void scanline_noop(context_t* /*c*/)
   2332 {
   2333 }
   2334 
   2335 void rect_generic(context_t* c, size_t yc)
   2336 {
   2337     do {
   2338         c->scanline(c);
   2339         c->step_y(c);
   2340     } while (--yc);
   2341 }
   2342 
   2343 void rect_memcpy(context_t* c, size_t yc)
   2344 {
   2345     int32_t x = c->iterators.xl;
   2346     size_t ct = c->iterators.xr - x;
   2347     int32_t y = c->iterators.y;
   2348     surface_t* cb = &(c->state.buffers.color);
   2349     const GGLFormat* fp = &(c->formats[cb->format]);
   2350     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
   2351                             (x + (cb->stride * y)) * fp->size;
   2352 
   2353     surface_t* tex = &(c->state.texture[0].surface);
   2354     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
   2355     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
   2356     uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
   2357                             (u + (tex->stride * v)) * fp->size;
   2358 
   2359     if (cb->stride == tex->stride && ct == size_t(cb->stride)) {
   2360         memcpy(dst, src, ct * fp->size * yc);
   2361     } else {
   2362         const size_t size = ct * fp->size;
   2363         const size_t dbpr = cb->stride  * fp->size;
   2364         const size_t sbpr = tex->stride * fp->size;
   2365         do {
   2366             memcpy(dst, src, size);
   2367             dst += dbpr;
   2368             src += sbpr;
   2369         } while (--yc);
   2370     }
   2371 }
   2372 // ----------------------------------------------------------------------------
   2373 }; // namespace android
   2374 
   2375