/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"

#include <arm_neon.h>

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"
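                      // Scalar sketch of the expansion above (illustrative only;
                      // p is one RGB565 pixel, rrrrrggggggbbbbb):
                      //   d4 lanes = (p & 0x1f)        << 3;   // blue
                      //   d5 lanes = ((p >> 5) & 0x3f) << 2;   // green
                      //   d6 lanes = (p >> 11)         << 3;   // red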

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // NB: this blend block is duplicated in the 8-pixel and 4/2/1-pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // NB: this blend block is duplicated in the 8-pixel and 4/2/1-pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These
     *     mismatches never exceed 1. The rounding right shifts (vrshr) used for the
     *     final stage (the '#if 1' variant further down) bring the results closer to
     *     a floating point implementation; the truncating '#else' variant matches the
     *     original code more often but tracks a floating point implementation less
     *     closely.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour
     *     channel to affect another.
     */
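    /* For reference, scalar sketches (illustrative only) of the two
     * divide-by-255 flavours discussed above:
     *
     *   unsigned div255_round(unsigned x) { x += 128; return (x + (x >> 8)) >> 8; }  // SkDiv255Round
     *   unsigned div256_trunc(unsigned x) { return x >> 8; }                         // truncate /256
     *
     * The vrshr.u16 #8 used below computes (x + 128) >> 8: rounded, but
     * dividing by 256 rather than 255.
     */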

#if 1
        /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count8 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
    // trying for a better match with SkDiv255Round(a)
    // C alg is:  a+=128; (a+a>>8)>>8
    // we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#else
    // arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
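
/* For example, with y == 2 and a run starting at x == 1, the loads below
 * begin at &gDitherMatrix_Neon[2*12 + 1]: the third row's dither sequence
 * rotated by one pixel. The three-fold repetition of each 4-value row is
 * what makes these offset 8-byte loads safe (illustrative note). */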

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
                  "1:                                                 \n\t"
                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // subtract dst from src and widen
                  "vsubl.s8       q0, d22, d16                    \n\t"   // red: src - dst, widened
                  "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst, widened
                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue: src - dst, widened
                  // multiply diffs by scale and shift
                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red diff by scale
                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green diff by scale
                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue diff by scale
                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bgt            1b                              \n\t"   // loop if count > 0
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    while ((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);
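    /* vtbl1_u8 with the {3,3,3,3,7,7,7,7} mask above replicates source
     * byte 3 (the alpha of the first pixel) into lanes 0..3 and byte 7
     * (the alpha of the second pixel) into lanes 4..7, lining each colour
     * byte up with its own pixel's alpha. */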

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd pixels of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th pixels of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
        alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
        alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}


/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;
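        /* Per pixel, the NEON loop and the scalar tail below compute the
         * same thing: dst = (src * src_scale + dst * dst_scale) >> 8 in
         * each byte lane (SkAlphaMulQ applies that formula two channels
         * at a time). */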

    /* run them N at a time through the NEON unit */
    /* note that each element is 4 bytes, each treated exactly the same,
     * so we can work under that guise. We *do* know that the src&dst
     * will be 32-bit aligned quantities, so we can specify that on
     * the load/store ops and do a neon 'reinterpret' to get us to
     * byte-sized (pun intended) pieces that we widen/multiply/shift.
     * We're limited to 128 bits in the wide ops, which is 8x16 bits
     * or a pair of 32-bit src/dsts.
     */
    /* we *could* manually unroll this loop so that we load 128 bits
     * (as a pair of 64s) from each of src and dst, processing them
     * in pieces. This might give us a little better management of
     * the memory latency, but my initial attempts here did not
     * produce an instruction stream that looked all that nice.
     */
#define    UNROLL    2
    while (count >= UNROLL) {
        uint8x8_t  src_raw, dst_raw, dst_final;
        uint16x8_t  src_wide, dst_wide;

        /* get 64 bits of src, widen it, multiply by src_scale */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_wide = vmovl_u8(src_raw);
        /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
        src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

        /* ditto with dst */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_wide = vmovl_u8(dst_raw);

        /* combine add with dst multiply into mul-accumulate */
        dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

        dst_final = vshrn_n_u16(dst_wide, 8);
        vst1_u32(dst, vreinterpret_u32_u8(dst_final));

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
    /* RBE: well, i don't like how gcc manages src/dst across the above
     * loop: it's constantly calculating src+bias and dst+bias, and only
     * adjusts the real ones when we leave the loop. Not sure why
     * it's "hoisting down" (hoisting implies above in my lexicon ;))
     * the adjustments to src/dst/count, but it does...
     * (might be SSA-style internal logic...)
     */

#if    UNROLL == 2
    if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    }
#else
    if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
    }
#endif

#undef    UNROLL
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {
    uint8x8_t dbase;

#if    defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
    int noisy = 0;
#endif

    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if    defined(DEBUG_OPAQUE_DITHER)
    /* calculate 8 elements worth into a temp buffer */
    {
      int my_y = y;
      int my_x = x;
      SkPMColor* my_src = (SkPMColor*)src;
      uint16_t* my_dst = dst;
      int i;

          DITHER_565_SCAN(my_y);
          for(i=0;i<UNROLL;i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
        tdv[i] = DITHER_VALUE(my_x);
        ta[i] = a;
        tap[i] = SkAlpha255To256(a);
        td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
        td[i] = d;

            } else {
        tmpbuf[i] = *my_dst;
        ta[i] = tdv[i] = td[i] = 0xbeef;
        }
        in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
          }
    }
#endif

        /* source is in ABGR */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
                    );
            sr = d0; sg = d1; sb = d2; sa = d3;
        }

        /* calculate 'd', which will be 0..7 */
        /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if defined(SK_BUILD_FOR_ANDROID)
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
        alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
        alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
        d = vshrn_n_u16(alpha8, 8);    /* narrowing too */

        /* sr = sr - (sr>>5) + d */
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe as long as we do ((sr-sr>>5) + d) */
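        /* worked worst case (illustrative): sr = 255 gives sr>>5 = 7, so
         * sr - 7 + d <= 248 + 7 = 255; the byte lane never wraps. */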
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
        dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
        dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */

        /* blend */
#if 1
        /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
        /* originally 255-sa + 1 */
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
        scale8 = vsubw_u8(vdupq_n_u16(255), sa);
        scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
        /* combine the addq and mul, save 3 insns */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
        /* known correct, but +3 insns over above */
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmulq_u16(dst_b, scale8);
        dst_g = vmulq_u16(dst_g, scale8);
        dst_r = vmulq_u16(dst_r, scale8);

        /* combine */
        /* NB: vshll widens, need to preserve those bits */
        dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
        dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
        dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

        /* repack to store */
        dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

#if    defined(DEBUG_OPAQUE_DITHER)
        /* verify my 8 elements match the temp buffer */
    {
       int i, bad=0;
       static int invocation;

       for (i=0;i<UNROLL;i++)
        if (tmpbuf[i] != dst[i]) bad=1;
       if (bad) {
        SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
            invocation, offset);
        SkDebugf("  alpha 0x%x\n", alpha);
        for (i=0;i<UNROLL;i++)
            SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
            i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
            dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

        showme16("alpha8", &alpha8, sizeof(alpha8));
        showme16("scale8", &scale8, sizeof(scale8));
        showme8("d", &d, sizeof(d));
        showme16("dst8", &dst8, sizeof(dst8));
        showme16("dst_b", &dst_b, sizeof(dst_b));
        showme16("dst_g", &dst_g, sizeof(dst_g));
        showme16("dst_r", &dst_r, sizeof(dst_r));
        showme8("sb", &sb, sizeof(sb));
        showme8("sg", &sg, sizeof(sg));
        showme8("sr", &sr, sizeof(sr));

        /* cop out */
        return;
       }
       offset += UNROLL;
       invocation++;
    }
#endif

            dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef    UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested, but the ARM version is 26 insns/iteration and
 * this NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
 * i.e. roughly a tenth of the native per-element instruction count.
 * That's pure instruction counts, not accounting for any instruction
 * or memory latencies.
 */

#undef    DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb, sa;
        uint16x8_t dr, dg, db, da;
        uint16x8_t dst8;

        /* source is in ABGR ordering (R == lsb) */
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
            : "r" (src)
                    );
            sr = d0; sg = d1; sb = d2; sa = d3;
        }
        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        /* sr = sr - (sr>>5) + d */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        /* sb = sb - (sb>>5) + d */
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d,1));
        /* XXX: check that the "d>>1" here is hoisted */

        /* pack high bits of each into 565 format  (rgb, b is lsb) */
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

        /* store it */
        vst1q_u16(dst, dst8);

#if    defined(DEBUG_S32_OPAQUE_DITHER)
        /* always good to know if we generated good results */
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i=0;i<UNROLL;i++) {
            SkPMColor c = src[i];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        src += UNROLL;
        count -= UNROLL;
        x += UNROLL;        /* probably superfluous */
    }
    }
#undef    UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);
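        /* Each output pixel below is color + ((src * scale) >> 8) per byte
         * lane; the NEON loop and the scalar tail compute the same thing. */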

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 + x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
                          );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the two functions below, we don't have a special version
    //       that assumes that each source pixel is opaque. But our S32A is
    //       still faster than the default, so use it.
    S32A_D565_Opaque_neon,  // really S32_D565_Opaque
    S32A_D565_Blend_neon,   // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc sk_blitrow_platform_4444_procs_arm_neon[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,       // S32_Blend,
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
    S32A_Blend_BlitRow32_arm        // S32A_Blend
};