/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
        vsrc = vld4_u8((uint8_t*)src);

        // Convert src to 565
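        // RGB565 keeps the top bits of each channel:
        // (R >> 3) << 11 | (G >> 2) << 5 | (B >> 3).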
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        src += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

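        // The loop below is software-pipelined: each iteration loads the next
        // group of eight pixels before storing the previous group's result,
        // with keep_dst tracking where the pending store belongs.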
        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8-pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8-pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. The enabled branch below uses rounding right shifts (vrshr) for
     *     the final stage, which brings results closer to a floating point implementation;
     *     see the #if/#else there for ARM's original truncating alternative.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */
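
    /* In scalar terms, with src_scale = alpha + 1 (range 1..256), each
     * 565-range channel below is computed as:
     *   dst_scale = 255 - ((sa * src_scale) >> 8)
     *   result    = (src * src_scale + dst * dst_scale) >> 8
     */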

#if 1
    /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count8 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
    // trying for a better match with SkDiv255Round(a)
    // C alg is:  a+=128; (a+a>>8)>>8
    // we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#else
    // arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
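/* e.g. for (y&3)==1 and (x&3)==2 the 8-byte load starts at index 1*12 + 2
 * and picks up the dither values {7,3,6,2,7,3,6,2}. */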
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask

        do {

            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            vsrc_r = d0; vsrc_b = d2;
#endif
            }

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
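            // i.e. the scalar SkDITHER_R32To565(c, d) = (c + d - (c >> 5)) >> 3,
            // with green using d>>1 and shifts of 6 and 2.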

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);
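
    // vtbl1 with {3,3,3,3,7,7,7,7} broadcasts pixel 0's alpha into lanes 0-3
    // and pixel 1's alpha into lanes 4-7. Per byte lane, each pass computes
    // dst = src + ((dst * (256 - src_alpha)) >> 8).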

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
    return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
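    /* With alpha in the top byte, c >= ALPHA_OPAQ iff alpha == 255 and
     * c <= ALPHA_TRANS iff alpha == 0, so whole-word compares classify pixels. */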

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254,
         * it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /* fall-thru */

ALPHA_0:

    /* In this state, we know the current alpha is 0 and
     * we optimize for the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    // Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go to the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

    /* run them N at a time through the NEON unit */
    /* note that each one is 4 bytes, each treated exactly the same,
     * so we can work under that guise. We *do* know that the src&dst
     * will be 32-bit aligned quantities, so we can specify that on
     * the load/store ops and do a neon 'reinterpret' to get us to
     * byte-sized (pun intended) pieces that we widen/multiply/shift.
     * We're limited to 128 bits in the wide ops, which is 8x16 bits
     * or a pair of 32-bit src/dsts.
     */
    /* we *could* manually unroll this loop so that we load 128 bits
     * (as a pair of 64s) from each of src and dst, processing them
     * in pieces. This might give us a little better management of
     * the memory latency, but my initial attempts here did not
     * produce an instruction stream that looked all that nice.
     */
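    /* In scalar terms, each byte lane below computes
     *   result = (src * src_scale + dst * dst_scale) >> 8
     * with the dst multiply and the add fused into one vmla. */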
#define    UNROLL    2
    while (count >= UNROLL) {
        uint8x8_t  src_raw, dst_raw, dst_final;
        uint16x8_t  src_wide, dst_wide;

        /* get 64 bits of src, widen it, multiply by src_scale */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_wide = vmovl_u8(src_raw);
        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

        /* ditto with dst */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_wide = vmovl_u8(dst_raw);

        /* combine add with dst multiply into mul-accumulate */
        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

        dst_final = vshrn_n_u16(dst_wide, 8);
        vst1_u32(dst, vreinterpret_u32_u8(dst_final));

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
    /* RBE: well, I don't like how gcc manages src/dst across the above
     * loop: it's constantly calculating src+bias, dst+bias and it only
     * adjusts the real ones when we leave the loop. Not sure why
     * it's "hoisting down" (hoisting implies above in my lexicon ;))
     * the adjustments to src/dst/count, but it does...
     * (might be SSA-style internal logic...)
     */

#if    UNROLL == 2
    if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    }
#else
    if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
    }
#endif

#undef    UNROLL
    }
}

void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);
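
    // Per pixel: dst_scale = 256 - ((src_alpha * alpha256) >> 8), and each
    // byte lane gets ((src * alpha256) >> 8) + ((dst * dst_scale) >> 8).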

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {
    uint8x8_t dbase;

#if    defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
    int noisy = 0;
#endif

    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if    defined(DEBUG_OPAQUE_DITHER)
    /* calculate 8 elements worth into a temp buffer */
    {
      int my_y = y;
      int my_x = x;
      SkPMColor* my_src = (SkPMColor*)src;
      uint16_t* my_dst = dst;
      int i;

          DITHER_565_SCAN(my_y);
          for(i=0;i<UNROLL;i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
        tdv[i] = DITHER_VALUE(my_x);
        ta[i] = a;
        tap[i] = SkAlpha255To256(a);
        td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
        td[i] = d;

            } else {
        tmpbuf[i] = *my_dst;
        ta[i] = tdv[i] = td[i] = 0xbeef;
        }
        in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
          }
    }
#endif

        /* source is in ABGR */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
                    );
            sr = d0; sg = d1; sb = d2; sa = d3;
        }

        /* calculate 'd', which will be 0..7 */
        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
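        /* i.e. d[i] = (dbase[i] * scale) >> 8, where scale is sa+1 (or
         * sa + (sa>>7)), matching SkAlphaMul(dither, SkAlpha255To256(a)). */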
#if defined(SK_BUILD_FOR_ANDROID)
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */

        /* sr = sr - (sr>>5) + d */
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe  as long as we do ((sr-sr>>5) + d) */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */

        /* blend */
#if 1
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        /* originally 255-sa + 1 */
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
        /* combine the addq and mul, save 3 insns */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
        /* known correct, but +3 insns over above */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmulq_u16(dst_b, scale8);
        dst_g = vmulq_u16(dst_g, scale8);
        dst_r = vmulq_u16(dst_r, scale8);

        /* combine */
        /* NB: vshll widens, need to preserve those bits */
        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

        /* repack to store */
        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

#if    defined(DEBUG_OPAQUE_DITHER)
        /* verify my 8 elements match the temp buffer */
    {
       int i, bad=0;
       static int invocation;

       for (i=0;i<UNROLL;i++)
        if (tmpbuf[i] != dst[i]) bad=1;
       if (bad) {
        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
            invocation, offset);
        SkDebugf("  alpha 0x%x\n", alpha);
        for (i=0;i<UNROLL;i++)
            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

        showme16("alpha8", &alpha8, sizeof(alpha8));
        showme16("scale8", &scale8, sizeof(scale8));
        showme8("d", &d, sizeof(d));
        showme16("dst8", &dst8, sizeof(dst8));
        showme16("dst_b", &dst_b, sizeof(dst_b));
        showme16("dst_g", &dst_g, sizeof(dst_g));
        showme16("dst_r", &dst_r, sizeof(dst_r));
        showme8("sb", &sb, sizeof(sb));
        showme8("sg", &sg, sizeof(sg));
        showme8("sr", &sr, sizeof(sr));

        /* cop out */
        return;
       }
       offset += UNROLL;
       invocation++;
    }
#endif

            dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef    UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work around
                // an ICE in debug builds.
   1179                 unsigned dither = DITHER_VALUE(x);
   1180                 unsigned alpha = SkAlpha255To256(a);
   1181                 int d = SkAlphaMul(dither, alpha);
   1182 
   1183                 unsigned sr = SkGetPackedR32(c);
   1184                 unsigned sg = SkGetPackedG32(c);
   1185                 unsigned sb = SkGetPackedB32(c);
   1186                 sr = SkDITHER_R32_FOR_565(sr, d);
   1187                 sg = SkDITHER_G32_FOR_565(sg, d);
   1188                 sb = SkDITHER_B32_FOR_565(sb, d);
   1189 
   1190                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1191                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
   1192                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1193                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1194                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1195             }
   1196             dst += 1;
   1197             DITHER_INC_X(x);
   1198         } while (--count != 0);
   1199     }
   1200 }
   1201 
   1202 ///////////////////////////////////////////////////////////////////////////////
   1203 
   1204 #undef    DEBUG_S32_OPAQUE_DITHER
   1205 
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;

            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                sg = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                sr = d2; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                sr = d0; sb = d2;
#endif
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(); the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            // sr = sr - (sr>>5) + d
            // (the vector form of SkDITHER_R32_FOR_565; subtracting sr>>5
            // first guarantees that adding the dither value cannot overflow
            // a byte. See the scalar sketch after this function.)
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack the high bits of each channel into 565 format (r:5 g:6 b:5,
            // b in the lsbs); vsli shifts the new field left and inserts it
            // above the bits already packed.
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i = 0; i < UNROLL; i++) {
                    // the '!' in the asm block above post-incremented src by
                    // the 8 pixels it reads, hence the -8 bias here.
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL;        // probably superfluous
        }
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

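// For reference, a scalar sketch of what one lane of the unrolled NEON loop
// above computes (illustrative only: this helper is hypothetical and unused;
// the NEON code widens these sums to 16 bits before packing).
static inline uint16_t S32_D565_Dither_lane_sketch(SkPMColor c, unsigned d) {
    unsigned sr = SkGetPackedR32(c);
    unsigned sg = SkGetPackedG32(c);
    unsigned sb = SkGetPackedB32(c);
    sr = sr - (sr >> 5) + d;         // SkDITHER_R32_FOR_565(sr, d)
    sg = sg - (sg >> 6) + (d >> 1);  // SkDITHER_G32_FOR_565(sg, d)
    sb = sb - (sb >> 5) + d;         // SkDITHER_B32_FOR_565(sb, d)
    // keep the high bits of each channel: r:5 g:6 b:5, b in the lsbs
    return (uint16_t)(((sr >> 3) << 11) | ((sg >> 2) << 5) | (sb >> 3));
}
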
void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // At the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 == x, it will be
            // -8 + x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale is in the interval [0, 255], so load it as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, four 64-bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128-bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128-bit registers containing the 16-bit
                // scaled values back down to 8 bits, narrowing the
                // results into 64-bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // add back the color, using 128-bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (two 128-bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
            );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there are no pixels left to process;
            // if the value is -7, there is one pixel left to process;
            // ...
            // And'ing it with 7 gives the number of pixels left to process.
            count = count & 0x7;
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

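// For reference, one pixel of the NEON loop above written as scalar code
// (illustrative only: this helper is hypothetical and unused).
static inline SkPMColor Color32_pixel_sketch(SkPMColor s, SkPMColor color,
                                             unsigned scale) {
    // Scale every byte of s by scale/256 -- what vmull.u8 followed by
    // vshrn.i16 #8 does lane by lane.
    SkPMColor scaled = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        scaled |= (((s >> shift) & 0xFF) * scale >> 8) << shift;
    }
    // Add the constant color -- what vadd.i8 does. The per-byte sums cannot
    // carry into the next byte because color is premultiplied and
    // scale == 256 - SkAlpha255To256(alpha), so this matches the scalar
    // tail's color + SkAlphaMulQ(*src, scale).
    return scaled + color;
}
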
///////////////////////////////////////////////////////////////////////////////

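/*
 * The proc tables below are indexed by the SkBlitRow flag bits declared in
 * SkBlitRow.h (global alpha, per-pixel alpha, and -- for 565 -- dither), so
 * the entries run: no flags, global alpha, per-pixel alpha, both, and for
 * the 565 table the same four combinations again with dither. A NULL entry
 * means there is no NEON version and the caller falls back to the portable
 * default.
 */
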
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the S32_D565_Blend function below, we don't have a special
    //       version that assumes that each source pixel is opaque. But our
    //       S32A is still faster than the default, so use it.
    S32_D565_Opaque_neon,
    S32A_D565_Blend_neon,   // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices of S32A_Opaque procs. The first reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon        // S32A_Blend
};

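/*
 * A sketch of how these tables are typically consumed (assuming the stock
 * SkBlitRow_opts_arm glue; hook names per SkBlitRow.h):
 *
 *     SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
 *         return sk_blitrow_platform_565_procs_arm_neon[flags];
 *     }
 *
 *     SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
 *         return sk_blitrow_platform_32_procs_arm_neon[flags];
 *     }
 */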