Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2012 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkBlitRow_opts_arm_neon.h"
      9 
     10 #include "SkBlitMask.h"
     11 #include "SkBlitRow.h"
     12 #include "SkColorPriv.h"
     13 #include "SkDither.h"
     14 #include "SkMathPriv.h"
     15 #include "SkUtils.h"
     16 
     17 #include "SkColor_opts_neon.h"
     18 #include <arm_neon.h>
     19 
     20 #ifdef SK_CPU_ARM64
     21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
     22     uint8x8x4_t vsrc;
     23     uint8x8_t vsrc_0, vsrc_1, vsrc_2;
     24 
     25     asm (
     26         "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
     27         "mov    %[vsrc0].8b, v0.8b             \t\n"
     28         "mov    %[vsrc1].8b, v1.8b             \t\n"
     29         "mov    %[vsrc2].8b, v2.8b             \t\n"
     30         : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
     31           [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
     32         : : "v0", "v1", "v2", "v3"
     33     );
     34 
     35     vsrc.val[0] = vsrc_0;
     36     vsrc.val[1] = vsrc_1;
     37     vsrc.val[2] = vsrc_2;
     38 
     39     return vsrc;
     40 }
     41 
     42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
     43     uint8x8x4_t vsrc;
     44     uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
     45 
     46     asm (
     47         "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
     48         "mov    %[vsrc0].8b, v0.8b             \t\n"
     49         "mov    %[vsrc1].8b, v1.8b             \t\n"
     50         "mov    %[vsrc2].8b, v2.8b             \t\n"
     51         "mov    %[vsrc3].8b, v3.8b             \t\n"
     52         : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
     53           [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
     54           [src] "+&r" (src)
     55         : : "v0", "v1", "v2", "v3"
     56     );
     57 
     58     vsrc.val[0] = vsrc_0;
     59     vsrc.val[1] = vsrc_1;
     60     vsrc.val[2] = vsrc_2;
     61     vsrc.val[3] = vsrc_3;
     62 
     63     return vsrc;
     64 }
     65 #endif
     66 
     67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
     68                            const SkPMColor* SK_RESTRICT src, int count,
     69                            U8CPU alpha, int /*x*/, int /*y*/) {
     70     SkASSERT(255 == alpha);
     71 
     72     while (count >= 8) {
     73         uint8x8x4_t vsrc;
     74         uint16x8_t vdst;
     75 
     76         // Load
     77 #ifdef SK_CPU_ARM64
     78         vsrc = sk_vld4_u8_arm64_3(src);
     79 #else
     80         vsrc = vld4_u8((uint8_t*)src);
     81         src += 8;
     82 #endif
     83 
     84         // Convert src to 565
     85         vdst = SkPixel32ToPixel16_neon8(vsrc);
     86 
     87         // Store
     88         vst1q_u16(dst, vdst);
     89 
     90         // Prepare next iteration
     91         dst += 8;
     92         count -= 8;
     93     };
     94 
     95     // Leftovers
     96     while (count > 0) {
     97         SkPMColor c = *src++;
     98         SkPMColorAssert(c);
     99         *dst = SkPixel32ToPixel16_ToU16(c);
    100         dst++;
    101         count--;
    102     };
    103 }
    104 
    105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
    106                           const SkPMColor* SK_RESTRICT src, int count,
    107                           U8CPU alpha, int /*x*/, int /*y*/) {
    108     SkASSERT(255 > alpha);
    109 
    110     uint16x8_t vmask_blue, vscale;
    111 
    112     // prepare constants
    113     vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    114     vmask_blue = vmovq_n_u16(0x1F);
    115 
    116     while (count >= 8) {
    117         uint8x8x4_t vsrc;
    118         uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
    119         uint16x8_t vres_r, vres_g, vres_b;
    120 
    121         // Load src
    122 #ifdef SK_CPU_ARM64
    123         vsrc = sk_vld4_u8_arm64_3(src);
    124 #else
    125         {
    126         register uint8x8_t d0 asm("d0");
    127         register uint8x8_t d1 asm("d1");
    128         register uint8x8_t d2 asm("d2");
    129         register uint8x8_t d3 asm("d3");
    130 
    131         asm (
    132             "vld4.8    {d0-d3},[%[src]]!"
    133             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
    134             :
    135         );
    136         vsrc.val[0] = d0;
    137         vsrc.val[1] = d1;
    138         vsrc.val[2] = d2;
    139         }
    140 #endif
    141 
    142         // Load and unpack dst
    143         vdst = vld1q_u16(dst);
    144         vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
    145         vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
    146         vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
    147         vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
    148 
    149         // Shift src to 565 range
    150         vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
    151         vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
    152         vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
    153 
    154         // Scale src - dst
    155         vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
    156         vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
    157         vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
    158 
    159         vres_r = vshrq_n_u16(vres_r * vscale, 8);
    160         vres_g = vshrq_n_u16(vres_g * vscale, 8);
    161         vres_b = vshrq_n_u16(vres_b * vscale, 8);
    162 
    163         vres_r += vdst_r;
    164         vres_g += vdst_g;
    165         vres_b += vdst_b;
    166 
    167         // Combine
    168         vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
    169         vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
    170 
    171         // Store
    172         vst1q_u16(dst, vres_b);
    173         dst += 8;
    174         count -= 8;
    175     }
    176     if (count > 0) {
    177         int scale = SkAlpha255To256(alpha);
    178         do {
    179             SkPMColor c = *src++;
    180             SkPMColorAssert(c);
    181             uint16_t d = *dst;
    182             *dst++ = SkPackRGB16(
    183                     SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
    184                     SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
    185                     SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
    186         } while (--count != 0);
    187     }
    188 }
    189 
    190 #ifdef SK_CPU_ARM32
    191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
    192                            const SkPMColor* SK_RESTRICT src, int count,
    193                            U8CPU alpha, int /*x*/, int /*y*/) {
    194     SkASSERT(255 == alpha);
    195 
    196     if (count >= 8) {
    197         uint16_t* SK_RESTRICT keep_dst = 0;
    198 
    199         asm volatile (
    200                       "ands       ip, %[count], #7            \n\t"
    201                       "vmov.u8    d31, #1<<7                  \n\t"
    202                       "vld1.16    {q12}, [%[dst]]             \n\t"
    203                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
    204                       // Thumb does not support the standard ARM conditional
    205                       // instructions but instead requires the 'it' instruction
    206                       // to signal conditional execution
    207                       "it eq                                  \n\t"
    208                       "moveq      ip, #8                      \n\t"
    209                       "mov        %[keep_dst], %[dst]         \n\t"
    210 
    211                       "add        %[src], %[src], ip, LSL#2   \n\t"
    212                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
    213                       "subs       %[count], %[count], ip      \n\t"
    214                       "b          9f                          \n\t"
    215                       // LOOP
    216                       "2:                                         \n\t"
    217 
    218                       "vld1.16    {q12}, [%[dst]]!            \n\t"
    219                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
    220                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
    221                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
    222                       "subs       %[count], %[count], #8      \n\t"
    223                       "9:                                         \n\t"
    224                       "pld        [%[dst],#32]                \n\t"
    225                       // expand 0565 q12 to 8888 {d4-d7}
    226                       "vmovn.u16  d4, q12                     \n\t"
    227                       "vshr.u16   q11, q12, #5                \n\t"
    228                       "vshr.u16   q10, q12, #6+5              \n\t"
    229                       "vmovn.u16  d5, q11                     \n\t"
    230                       "vmovn.u16  d6, q10                     \n\t"
    231                       "vshl.u8    d4, d4, #3                  \n\t"
    232                       "vshl.u8    d5, d5, #2                  \n\t"
    233                       "vshl.u8    d6, d6, #3                  \n\t"
    234 
    235                       "vmovl.u8   q14, d31                    \n\t"
    236                       "vmovl.u8   q13, d31                    \n\t"
    237                       "vmovl.u8   q12, d31                    \n\t"
    238 
    239                       // duplicate in 4/2/1 & 8pix vsns
    240                       "vmvn.8     d30, d3                     \n\t"
    241                       "vmlal.u8   q14, d30, d6                \n\t"
    242                       "vmlal.u8   q13, d30, d5                \n\t"
    243                       "vmlal.u8   q12, d30, d4                \n\t"
    244                       "vshr.u16   q8, q14, #5                 \n\t"
    245                       "vshr.u16   q9, q13, #6                 \n\t"
    246                       "vaddhn.u16 d6, q14, q8                 \n\t"
    247                       "vshr.u16   q8, q12, #5                 \n\t"
    248                       "vaddhn.u16 d5, q13, q9                 \n\t"
    249                       "vaddhn.u16 d4, q12, q8                 \n\t"
    250                       // intentionally don't calculate alpha
    251                       // result in d4-d6
    252 
    253             #ifdef SK_PMCOLOR_IS_RGBA
    254                       "vqadd.u8   d6, d6, d0                  \n\t"
    255                       "vqadd.u8   d5, d5, d1                  \n\t"
    256                       "vqadd.u8   d4, d4, d2                  \n\t"
    257             #else
    258                       "vqadd.u8   d6, d6, d2                  \n\t"
    259                       "vqadd.u8   d5, d5, d1                  \n\t"
    260                       "vqadd.u8   d4, d4, d0                  \n\t"
    261             #endif
    262 
    263                       // pack 8888 {d4-d6} to 0565 q10
    264                       "vshll.u8   q10, d6, #8                 \n\t"
    265                       "vshll.u8   q3, d5, #8                  \n\t"
    266                       "vshll.u8   q2, d4, #8                  \n\t"
    267                       "vsri.u16   q10, q3, #5                 \n\t"
    268                       "vsri.u16   q10, q2, #11                \n\t"
    269 
    270                       "bne        2b                          \n\t"
    271 
    272                       "1:                                         \n\t"
    273                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
    274                       : [count] "+r" (count)
    275                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
    276                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
    277                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
    278                       "d30","d31"
    279                       );
    280     }
    281     else
    282     {   // handle count < 8
    283         uint16_t* SK_RESTRICT keep_dst = 0;
    284 
    285         asm volatile (
    286                       "vmov.u8    d31, #1<<7                  \n\t"
    287                       "mov        %[keep_dst], %[dst]         \n\t"
    288 
    289                       "tst        %[count], #4                \n\t"
    290                       "beq        14f                         \n\t"
    291                       "vld1.16    {d25}, [%[dst]]!            \n\t"
    292                       "vld1.32    {q1}, [%[src]]!             \n\t"
    293 
    294                       "14:                                        \n\t"
    295                       "tst        %[count], #2                \n\t"
    296                       "beq        12f                         \n\t"
    297                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
    298                       "vld1.32    {d1}, [%[src]]!             \n\t"
    299 
    300                       "12:                                        \n\t"
    301                       "tst        %[count], #1                \n\t"
    302                       "beq        11f                         \n\t"
    303                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
    304                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
    305 
    306                       "11:                                        \n\t"
    307                       // unzips achieve the same as a vld4 operation
    308                       "vuzp.u16   q0, q1                      \n\t"
    309                       "vuzp.u8    d0, d1                      \n\t"
    310                       "vuzp.u8    d2, d3                      \n\t"
    311                       // expand 0565 q12 to 8888 {d4-d7}
    312                       "vmovn.u16  d4, q12                     \n\t"
    313                       "vshr.u16   q11, q12, #5                \n\t"
    314                       "vshr.u16   q10, q12, #6+5              \n\t"
    315                       "vmovn.u16  d5, q11                     \n\t"
    316                       "vmovn.u16  d6, q10                     \n\t"
    317                       "vshl.u8    d4, d4, #3                  \n\t"
    318                       "vshl.u8    d5, d5, #2                  \n\t"
    319                       "vshl.u8    d6, d6, #3                  \n\t"
    320 
    321                       "vmovl.u8   q14, d31                    \n\t"
    322                       "vmovl.u8   q13, d31                    \n\t"
    323                       "vmovl.u8   q12, d31                    \n\t"
    324 
    325                       // duplicate in 4/2/1 & 8pix vsns
    326                       "vmvn.8     d30, d3                     \n\t"
    327                       "vmlal.u8   q14, d30, d6                \n\t"
    328                       "vmlal.u8   q13, d30, d5                \n\t"
    329                       "vmlal.u8   q12, d30, d4                \n\t"
    330                       "vshr.u16   q8, q14, #5                 \n\t"
    331                       "vshr.u16   q9, q13, #6                 \n\t"
    332                       "vaddhn.u16 d6, q14, q8                 \n\t"
    333                       "vshr.u16   q8, q12, #5                 \n\t"
    334                       "vaddhn.u16 d5, q13, q9                 \n\t"
    335                       "vaddhn.u16 d4, q12, q8                 \n\t"
    336                       // intentionally don't calculate alpha
    337                       // result in d4-d6
    338 
    339             #ifdef SK_PMCOLOR_IS_RGBA
    340                       "vqadd.u8   d6, d6, d0                  \n\t"
    341                       "vqadd.u8   d5, d5, d1                  \n\t"
    342                       "vqadd.u8   d4, d4, d2                  \n\t"
    343             #else
    344                       "vqadd.u8   d6, d6, d2                  \n\t"
    345                       "vqadd.u8   d5, d5, d1                  \n\t"
    346                       "vqadd.u8   d4, d4, d0                  \n\t"
    347             #endif
    348 
    349                       // pack 8888 {d4-d6} to 0565 q10
    350                       "vshll.u8   q10, d6, #8                 \n\t"
    351                       "vshll.u8   q3, d5, #8                  \n\t"
    352                       "vshll.u8   q2, d4, #8                  \n\t"
    353                       "vsri.u16   q10, q3, #5                 \n\t"
    354                       "vsri.u16   q10, q2, #11                \n\t"
    355 
    356                       // store
    357                       "tst        %[count], #4                \n\t"
    358                       "beq        24f                         \n\t"
    359                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
    360 
    361                       "24:                                        \n\t"
    362                       "tst        %[count], #2                \n\t"
    363                       "beq        22f                         \n\t"
    364                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
    365 
    366                       "22:                                        \n\t"
    367                       "tst        %[count], #1                \n\t"
    368                       "beq        21f                         \n\t"
    369                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
    370 
    371                       "21:                                        \n\t"
    372                       : [count] "+r" (count)
    373                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
    374                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
    375                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
    376                       "d30","d31"
    377                       );
    378     }
    379 }
    380 
    381 #else // #ifdef SK_CPU_ARM32
    382 
    383 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
    384                            const SkPMColor* SK_RESTRICT src, int count,
    385                            U8CPU alpha, int /*x*/, int /*y*/) {
    386     SkASSERT(255 == alpha);
    387 
    388     if (count >= 16) {
    389         asm (
    390             "movi    v4.8h, #0x80                   \t\n"
    391 
    392             "1:                                     \t\n"
    393             "sub     %w[count], %w[count], #16      \t\n"
    394             "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
    395             "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
    396             "prfm    pldl1keep, [%[src],#512]       \t\n"
    397             "prfm    pldl1keep, [%[dst],#256]       \t\n"
    398             "ushr    v20.8h, v17.8h, #5             \t\n"
    399             "ushr    v31.8h, v16.8h, #5             \t\n"
    400             "xtn     v6.8b, v31.8h                  \t\n"
    401             "xtn2    v6.16b, v20.8h                 \t\n"
    402             "ushr    v20.8h, v17.8h, #11            \t\n"
    403             "shl     v19.16b, v6.16b, #2            \t\n"
    404             "ushr    v31.8h, v16.8h, #11            \t\n"
    405             "xtn     v22.8b, v31.8h                 \t\n"
    406             "xtn2    v22.16b, v20.8h                \t\n"
    407             "shl     v18.16b, v22.16b, #3           \t\n"
    408             "mvn     v3.16b, v3.16b                 \t\n"
    409             "xtn     v16.8b, v16.8h                 \t\n"
    410             "mov     v7.16b, v4.16b                 \t\n"
    411             "xtn2    v16.16b, v17.8h                \t\n"
    412             "umlal   v7.8h, v3.8b, v19.8b           \t\n"
    413             "shl     v16.16b, v16.16b, #3           \t\n"
    414             "mov     v22.16b, v4.16b                \t\n"
    415             "ushr    v24.8h, v7.8h, #6              \t\n"
    416             "umlal   v22.8h, v3.8b, v18.8b          \t\n"
    417             "ushr    v20.8h, v22.8h, #5             \t\n"
    418             "addhn   v20.8b, v22.8h, v20.8h         \t\n"
    419             "cmp     %w[count], #16                 \t\n"
    420             "mov     v6.16b, v4.16b                 \t\n"
    421             "mov     v5.16b, v4.16b                 \t\n"
    422             "umlal   v6.8h, v3.8b, v16.8b           \t\n"
    423             "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
    424             "mov     v17.16b, v4.16b                \t\n"
    425             "ushr    v19.8h, v6.8h, #5              \t\n"
    426             "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
    427             "addhn   v7.8b, v7.8h, v24.8h           \t\n"
    428             "ushr    v18.8h, v5.8h, #6              \t\n"
    429             "ushr    v21.8h, v17.8h, #5             \t\n"
    430             "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
    431             "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
    432             "mov     v22.16b, v4.16b                \t\n"
    433             "addhn   v6.8b, v6.8h, v19.8h           \t\n"
    434             "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
    435             "ushr    v5.8h, v22.8h, #5              \t\n"
    436             "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
    437             "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
    438 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
    439             "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
    440             "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
    441 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
    442             "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
    443             "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
    444 #else
    445 #error "This function only supports BGRA and RGBA."
    446 #endif
    447             "shll    v22.8h, v20.8b, #8             \t\n"
    448             "shll    v5.8h, v7.8b, #8               \t\n"
    449             "sri     v22.8h, v5.8h, #5              \t\n"
    450             "shll    v17.8h, v6.8b, #8              \t\n"
    451             "shll2   v23.8h, v20.16b, #8            \t\n"
    452             "shll2   v7.8h, v7.16b, #8              \t\n"
    453             "sri     v22.8h, v17.8h, #11            \t\n"
    454             "sri     v23.8h, v7.8h, #5              \t\n"
    455             "shll2   v6.8h, v6.16b, #8              \t\n"
    456             "st1     {v22.8h}, [%[dst]], #16        \t\n"
    457             "sri     v23.8h, v6.8h, #11             \t\n"
    458             "st1     {v23.8h}, [%[dst]], #16        \t\n"
    459             "b.ge    1b                             \t\n"
    460             : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
    461             :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    462                "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
    463                "v31"
    464         );
    465     }
    466         // Leftovers
    467     if (count > 0) {
    468         do {
    469             SkPMColor c = *src++;
    470             SkPMColorAssert(c);
    471             if (c) {
    472                 *dst = SkSrcOver32To16(c, *dst);
    473             }
    474             dst += 1;
    475         } while (--count != 0);
    476     }
    477 }
    478 #endif // #ifdef SK_CPU_ARM32
    479 
    480 static uint32_t pmcolor_to_expand16(SkPMColor c) {
    481     unsigned r = SkGetPackedR32(c);
    482     unsigned g = SkGetPackedG32(c);
    483     unsigned b = SkGetPackedB32(c);
    484     return (g << 24) | (r << 13) | (b << 2);
    485 }
    486 
    487 void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    488     uint32_t src_expand;
    489     unsigned scale;
    490     uint16x8_t vmask_blue;
    491 
    492     if (count <= 0) return;
    493     SkASSERT(((size_t)dst & 0x01) == 0);
    494 
    495     /*
    496      * This preamble code is in order to make dst aligned to 8 bytes
    497      * in the next mutiple bytes read & write access.
    498      */
    499     src_expand = pmcolor_to_expand16(src);
    500     scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
    501 
    502 #define DST_ALIGN 8
    503 
    504     /*
    505      * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time.
    506      */
    507     int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
    508 
    509     for (int i = 0; i < preamble_size; i+=2, dst++) {
    510         uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
    511         *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
    512         if (--count == 0)
    513             break;
    514     }
    515 
    516     int count16 = 0;
    517     count16 = count >> 4;
    518     vmask_blue = vmovq_n_u16(SK_B16_MASK);
    519 
    520     if (count16) {
    521         uint16x8_t wide_sr;
    522         uint16x8_t wide_sg;
    523         uint16x8_t wide_sb;
    524         uint16x8_t wide_256_sa;
    525 
    526         unsigned sr = SkGetPackedR32(src);
    527         unsigned sg = SkGetPackedG32(src);
    528         unsigned sb = SkGetPackedB32(src);
    529         unsigned sa = SkGetPackedA32(src);
    530 
    531         // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
    532         // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
    533         //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
    534         wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
    535 
    536         // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
    537         //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
    538         wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
    539 
    540         // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
    541         //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
    542         wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
    543 
    544         wide_256_sa =
    545             vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
    546 
    547         while (count16-- > 0) {
    548             uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
    549             uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
    550             vdst1 = vld1q_u16(dst);
    551             dst += 8;
    552             vdst2 = vld1q_u16(dst);
    553             dst -= 8;    //to store dst again.
    554 
    555             vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
    556             vdst1_b = vdst1 & vmask_blue;                              // extract blue
    557             vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
    558             vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
    559 
    560             vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
    561             vdst2_b = vdst2 & vmask_blue;                              // extract blue
    562             vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
    563             vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
    564 
    565             vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
    566             vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
    567             vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1
    568 
    569             vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
    570             vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
    571             vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2
    572 
    573             vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
    574             vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
    575             vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue
    576 
    577             vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
    578             vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue
    579 
    580             vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
    581             vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
    582             vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue
    583 
    584             vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
    585             vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue
    586 
    587             vst1q_u16(dst, vdst1);
    588             dst += 8;
    589             vst1q_u16(dst, vdst2);
    590             dst += 8;
    591         }
    592     }
    593 
    594     count &= 0xF;
    595     if (count > 0) {
    596         do {
    597             uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
    598             *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
    599             dst += 1;
    600         } while (--count != 0);
    601     }
    602 }
    603 
    604 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    605     prod += vdupq_n_u16(128);
    606     prod += vshrq_n_u16(prod, 8);
    607     return vshrq_n_u16(prod, 8);
    608 }
    609 
    610 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
    611                           const SkPMColor* SK_RESTRICT src, int count,
    612                           U8CPU alpha, int /*x*/, int /*y*/) {
    613    SkASSERT(255 > alpha);
    614 
    615     /* This code implements a Neon version of S32A_D565_Blend. The results have
    616      * a few mismatches compared to the original code. These mismatches never
    617      * exceed 1.
    618      */
    619 
    620     if (count >= 8) {
    621         uint16x8_t valpha_max, vmask_blue;
    622         uint8x8_t valpha;
    623 
    624         // prepare constants
    625         valpha_max = vmovq_n_u16(255);
    626         valpha = vdup_n_u8(alpha);
    627         vmask_blue = vmovq_n_u16(SK_B16_MASK);
    628 
    629         do {
    630             uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
    631             uint16x8_t vres_a, vres_r, vres_g, vres_b;
    632             uint8x8x4_t vsrc;
    633 
    634             // load pixels
    635             vdst = vld1q_u16(dst);
    636 #ifdef SK_CPU_ARM64
    637             vsrc = sk_vld4_u8_arm64_4(src);
    638 #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
    639             asm (
    640                 "vld4.u8 %h[vsrc], [%[src]]!"
    641                 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
    642                 : :
    643             );
    644 #else
    645             register uint8x8_t d0 asm("d0");
    646             register uint8x8_t d1 asm("d1");
    647             register uint8x8_t d2 asm("d2");
    648             register uint8x8_t d3 asm("d3");
    649 
    650             asm volatile (
    651                 "vld4.u8    {d0-d3},[%[src]]!;"
    652                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
    653                   [src] "+&r" (src)
    654                 : :
    655             );
    656             vsrc.val[0] = d0;
    657             vsrc.val[1] = d1;
    658             vsrc.val[2] = d2;
    659             vsrc.val[3] = d3;
    660 #endif
    661 
    662 
    663             // deinterleave dst
    664             vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
    665             vdst_b = vdst & vmask_blue;                     // extract blue
    666             vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
    667             vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
    668 
    669             // shift src to 565
    670             vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
    671             vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
    672             vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
    673 
    674             // calc src * src_scale
    675             vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
    676             vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
    677             vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
    678             vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
    679 
    680             // prepare dst_scale
    681             vres_a = SkDiv255Round_neon8(vres_a);
    682             vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
    683 
    684             // add dst * dst_scale to previous result
    685             vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
    686             vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
    687             vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
    688 
    689 #ifdef S32A_D565_BLEND_EXACT
    690             // It is possible to get exact results with this but it is slow,
    691             // even slower than C code in some cases
    692             vres_r = SkDiv255Round_neon8(vres_r);
    693             vres_g = SkDiv255Round_neon8(vres_g);
    694             vres_b = SkDiv255Round_neon8(vres_b);
    695 #else
    696             vres_r = vrshrq_n_u16(vres_r, 8);
    697             vres_g = vrshrq_n_u16(vres_g, 8);
    698             vres_b = vrshrq_n_u16(vres_b, 8);
    699 #endif
    700             // pack result
    701             vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
    702             vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
    703 
    704             // store
    705             vst1q_u16(dst, vres_b);
    706             dst += 8;
    707             count -= 8;
    708         } while (count >= 8);
    709     }
    710 
    711     // leftovers
    712     while (count-- > 0) {
    713         SkPMColor sc = *src++;
    714         if (sc) {
    715             uint16_t dc = *dst;
    716             unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
    717             unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
    718             unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
    719             unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
    720             *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
    721         }
    722         dst += 1;
    723     }
    724 }
    725 
    726 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
    727  * each dither value is spaced out into byte lanes, and repeated
    728  * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
    729  * start of each row.
    730  */
    731 static const uint8_t gDitherMatrix_Neon[48] = {
    732     0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    733     6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    734     1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    735     7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
    736 
    737 };
    738 
    739 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
    740                                 int count, U8CPU alpha, int x, int y)
    741 {
    742 
    743     SkASSERT(255 > alpha);
    744 
    745     // rescale alpha to range 1 - 256
    746     int scale = SkAlpha255To256(alpha);
    747 
    748     if (count >= 8) {
    749         /* select row and offset for dither array */
    750         const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    751 
    752         uint8x8_t vdither = vld1_u8(dstart);         // load dither values
    753         uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
    754 
    755         int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
    756         uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
    757 
    758         do {
    759 
    760             uint8x8x4_t vsrc;
    761             uint8x8_t vsrc_r, vsrc_g, vsrc_b;
    762             uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
    763             uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
    764             uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
    765             uint16x8_t vdst;
    766             uint16x8_t vdst_r, vdst_g, vdst_b;
    767             int16x8_t vres_r, vres_g, vres_b;
    768             int8x8_t vres8_r, vres8_g, vres8_b;
    769 
    770             // Load source and add dither
    771 #ifdef SK_CPU_ARM64
    772             vsrc = sk_vld4_u8_arm64_3(src);
    773 #else
    774             {
    775             register uint8x8_t d0 asm("d0");
    776             register uint8x8_t d1 asm("d1");
    777             register uint8x8_t d2 asm("d2");
    778             register uint8x8_t d3 asm("d3");
    779 
    780             asm (
    781                 "vld4.8    {d0-d3},[%[src]]! "
    782                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
    783                 :
    784             );
    785             vsrc.val[0] = d0;
    786             vsrc.val[1] = d1;
    787             vsrc.val[2] = d2;
    788             }
    789 #endif
    790             vsrc_r = vsrc.val[NEON_R];
    791             vsrc_g = vsrc.val[NEON_G];
    792             vsrc_b = vsrc.val[NEON_B];
    793 
    794             vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
    795             vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
    796             vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
    797 
    798             vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
    799             vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
    800             vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
    801 
    802             vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
    803             vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
    804             vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
    805 
    806             vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
    807             vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
    808             vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
    809 
    810             // Load dst and unpack
    811             vdst = vld1q_u16(dst);
    812             vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
    813             vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
    814             vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
    815 
    816             // subtract dst from src and widen
    817             vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
    818             vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
    819             vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
    820 
    821             // multiply diffs by scale and shift
    822             vres_r = vmulq_s16(vres_r, vscale);
    823             vres_g = vmulq_s16(vres_g, vscale);
    824             vres_b = vmulq_s16(vres_b, vscale);
    825 
    826             vres8_r = vshrn_n_s16(vres_r, 8);
    827             vres8_g = vshrn_n_s16(vres_g, 8);
    828             vres8_b = vshrn_n_s16(vres_b, 8);
    829 
    830             // add dst to result
    831             vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
    832             vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
    833             vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
    834 
    835             // put result into 565 format
    836             vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
    837             vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
    838 
    839             // Store result
    840             vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
    841 
    842             // Next iteration
    843             dst += 8;
    844             count -= 8;
    845 
    846         } while (count >= 8);
    847     }
    848 
    849     // Leftovers
    850     if (count > 0) {
    851         int scale = SkAlpha255To256(alpha);
    852         DITHER_565_SCAN(y);
    853         do {
    854             SkPMColor c = *src++;
    855             SkPMColorAssert(c);
    856 
    857             int dither = DITHER_VALUE(x);
    858             int sr = SkGetPackedR32(c);
    859             int sg = SkGetPackedG32(c);
    860             int sb = SkGetPackedB32(c);
    861             sr = SkDITHER_R32To565(sr, dither);
    862             sg = SkDITHER_G32To565(sg, dither);
    863             sb = SkDITHER_B32To565(sb, dither);
    864 
    865             uint16_t d = *dst;
    866             *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
    867                                  SkAlphaBlend(sg, SkGetPackedG16(d), scale),
    868                                  SkAlphaBlend(sb, SkGetPackedB16(d), scale));
    869             DITHER_INC_X(x);
    870         } while (--count != 0);
    871     }
    872 }
    873 
    874 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
    875                                 const SkPMColor* SK_RESTRICT src,
    876                                 int count, U8CPU alpha) {
    877 
    878     SkASSERT(255 == alpha);
    879     if (count > 0) {
    880 
    881 
    882     uint8x8_t alpha_mask;
    883 
    884     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    885     alpha_mask = vld1_u8(alpha_mask_setup);
    886 
    887     /* do the NEON unrolled code */
    888 #define    UNROLL    4
    889     while (count >= UNROLL) {
    890         uint8x8_t src_raw, dst_raw, dst_final;
    891         uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    892 
    893         /* The two prefetches below may make the code slighlty
    894          * slower for small values of count but are worth having
    895          * in the general case.
    896          */
    897         __builtin_prefetch(src+32);
    898         __builtin_prefetch(dst+32);
    899 
    900         /* get the source */
    901         src_raw = vreinterpret_u8_u32(vld1_u32(src));
    902 #if    UNROLL > 2
    903         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
    904 #endif
    905 
    906         /* get and hold the dst too */
    907         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
    908 #if    UNROLL > 2
    909         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
    910 #endif
    911 
    912     /* 1st and 2nd bits of the unrolling */
    913     {
    914         uint8x8_t dst_cooked;
    915         uint16x8_t dst_wide;
    916         uint8x8_t alpha_narrow;
    917         uint16x8_t alpha_wide;
    918 
    919         /* get the alphas spread out properly */
    920         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
    921         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
    922 
    923         /* spread the dest */
    924         dst_wide = vmovl_u8(dst_raw);
    925 
    926         /* alpha mul the dest */
    927         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
    928         dst_cooked = vshrn_n_u16(dst_wide, 8);
    929 
    930         /* sum -- ignoring any byte lane overflows */
    931         dst_final = vadd_u8(src_raw, dst_cooked);
    932     }
    933 
    934 #if    UNROLL > 2
    935     /* the 3rd and 4th bits of our unrolling */
    936     {
    937         uint8x8_t dst_cooked;
    938         uint16x8_t dst_wide;
    939         uint8x8_t alpha_narrow;
    940         uint16x8_t alpha_wide;
    941 
    942         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
    943         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
    944 
    945         /* spread the dest */
    946         dst_wide = vmovl_u8(dst_raw_2);
    947 
    948         /* alpha mul the dest */
    949         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
    950         dst_cooked = vshrn_n_u16(dst_wide, 8);
    951 
    952         /* sum -- ignoring any byte lane overflows */
    953         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    954     }
    955 #endif
    956 
    957         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
    958 #if    UNROLL > 2
    959         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
    960 #endif
    961 
    962         src += UNROLL;
    963         dst += UNROLL;
    964         count -= UNROLL;
    965     }
    966 #undef    UNROLL
    967 
    968     /* do any residual iterations */
    969         while (--count >= 0) {
    970             *dst = SkPMSrcOver(*src, *dst);
    971             src += 1;
    972             dst += 1;
    973         }
    974     }
    975 }
    976 
    977 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
    978                                 const SkPMColor* SK_RESTRICT src,
    979                                 int count, U8CPU alpha) {
    980     SkASSERT(255 == alpha);
    981 
    982     if (count <= 0)
    983     return;
    984 
    985     /* Use these to check if src is transparent or opaque */
    986     const unsigned int ALPHA_OPAQ  = 0xFF000000;
    987     const unsigned int ALPHA_TRANS = 0x00FFFFFF;
    988 
    989 #define UNROLL  4
    990     const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    991     const SkPMColor* SK_RESTRICT src_temp = src;
    992 
    993     /* set up the NEON variables */
    994     uint8x8_t alpha_mask;
    995     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    996     alpha_mask = vld1_u8(alpha_mask_setup);
    997 
    998     uint8x8_t src_raw, dst_raw, dst_final;
    999     uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
   1000     uint8x8_t dst_cooked;
   1001     uint16x8_t dst_wide;
   1002     uint8x8_t alpha_narrow;
   1003     uint16x8_t alpha_wide;
   1004 
   1005     /* choose the first processing type */
   1006     if( src >= src_end)
   1007         goto TAIL;
   1008     if(*src <= ALPHA_TRANS)
   1009         goto ALPHA_0;
   1010     if(*src >= ALPHA_OPAQ)
   1011         goto ALPHA_255;
   1012     /* fall-thru */
   1013 
   1014 ALPHA_1_TO_254:
   1015     do {
   1016 
   1017         /* get the source */
   1018         src_raw = vreinterpret_u8_u32(vld1_u32(src));
   1019         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
   1020 
   1021         /* get and hold the dst too */
   1022         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
   1023         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
   1024 
   1025 
   1026         /* get the alphas spread out properly */
   1027         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
   1028         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
   1029         /* we collapsed (255-a)+1 ... */
   1030         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
   1031 
   1032         /* spread the dest */
   1033         dst_wide = vmovl_u8(dst_raw);
   1034 
   1035         /* alpha mul the dest */
   1036         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
   1037         dst_cooked = vshrn_n_u16(dst_wide, 8);
   1038 
   1039         /* sum -- ignoring any byte lane overflows */
   1040         dst_final = vadd_u8(src_raw, dst_cooked);
   1041 
   1042         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
   1043         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
   1044         /* we collapsed (255-a)+1 ... */
   1045         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
   1046 
   1047         /* spread the dest */
   1048         dst_wide = vmovl_u8(dst_raw_2);
   1049 
   1050         /* alpha mul the dest */
   1051         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
   1052         dst_cooked = vshrn_n_u16(dst_wide, 8);
   1053 
   1054         /* sum -- ignoring any byte lane overflows */
   1055         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
   1056 
   1057         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
   1058         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
   1059 
   1060         src += UNROLL;
   1061         dst += UNROLL;
   1062 
   1063         /* if 2 of the next pixels aren't between 1 and 254
   1064         it might make sense to go to the optimized loops */
   1065         if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
   1066             break;
   1067 
   1068     } while(src < src_end);
   1069 
   1070     if (src >= src_end)
   1071         goto TAIL;
   1072 
   1073     if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
   1074         goto ALPHA_255;
   1075 
   1076     /*fall-thru*/
   1077 
   1078 ALPHA_0:
   1079 
   1080     /*In this state, we know the current alpha is 0 and
   1081      we optimize for the next alpha also being zero. */
   1082     src_temp = src;  //so we don't have to increment dst every time
   1083     do {
   1084         if(*(++src) > ALPHA_TRANS)
   1085             break;
   1086         if(*(++src) > ALPHA_TRANS)
   1087             break;
   1088         if(*(++src) > ALPHA_TRANS)
   1089             break;
   1090         if(*(++src) > ALPHA_TRANS)
   1091             break;
   1092     } while(src < src_end);
   1093 
   1094     dst += (src - src_temp);
   1095 
   1096     /* no longer alpha 0, so determine where to go next. */
   1097     if( src >= src_end)
   1098         goto TAIL;
   1099     if(*src >= ALPHA_OPAQ)
   1100         goto ALPHA_255;
   1101     else
   1102         goto ALPHA_1_TO_254;
   1103 
   1104 ALPHA_255:
   1105     while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
   1106         dst[0]=src[0];
   1107         dst[1]=src[1];
   1108         dst[2]=src[2];
   1109         dst[3]=src[3];
   1110         src+=UNROLL;
   1111         dst+=UNROLL;
   1112         if(src >= src_end)
   1113             goto TAIL;
   1114     }
   1115 
   1116     //Handle remainder.
   1117     if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
   1118         if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
   1119             if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
   1120         }
   1121     }
   1122 
   1123     if( src >= src_end)
   1124         goto TAIL;
   1125     if(*src <= ALPHA_TRANS)
   1126         goto ALPHA_0;
   1127     else
   1128         goto ALPHA_1_TO_254;
   1129 
   1130 TAIL:
   1131     /* do any residual iterations */
   1132     src_end += UNROLL + 1;  //goto the real end
   1133     while(src != src_end) {
   1134         if( *src != 0 ) {
   1135             if( *src >= ALPHA_OPAQ ) {
   1136                 *dst = *src;
   1137             }
   1138             else {
   1139                 *dst = SkPMSrcOver(*src, *dst);
   1140             }
   1141         }
   1142         src++;
   1143         dst++;
   1144     }
   1145 
   1146 #undef    UNROLL
   1147     return;
   1148 }
   1149 
   1150 /* Neon version of S32_Blend_BlitRow32()
   1151  * portable version is in src/core/SkBlitRow_D32.cpp
   1152  */
   1153 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
   1154                               const SkPMColor* SK_RESTRICT src,
   1155                               int count, U8CPU alpha) {
   1156     SkASSERT(alpha <= 255);
   1157 
   1158     if (count <= 0) {
   1159         return;
   1160     }
   1161 
   1162     uint16_t src_scale = SkAlpha255To256(alpha);
   1163     uint16_t dst_scale = 256 - src_scale;
   1164 
   1165     while (count >= 2) {
   1166         uint8x8_t vsrc, vdst, vres;
   1167         uint16x8_t vsrc_wide, vdst_wide;
   1168 
   1169         /* These commented prefetches are a big win for count
   1170          * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
   1171          * They also hurt a little (<5%) on an A15
   1172          */
   1173         //__builtin_prefetch(src+32);
   1174         //__builtin_prefetch(dst+32);
   1175 
   1176         // Load
   1177         vsrc = vreinterpret_u8_u32(vld1_u32(src));
   1178         vdst = vreinterpret_u8_u32(vld1_u32(dst));
   1179 
   1180         // Process src
   1181         vsrc_wide = vmovl_u8(vsrc);
   1182         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
   1183 
   1184         // Process dst
   1185         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
   1186 
   1187         // Combine
   1188         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
   1189 
   1190         // Store
   1191         vst1_u32(dst, vreinterpret_u32_u8(vres));
   1192 
   1193         src += 2;
   1194         dst += 2;
   1195         count -= 2;
   1196     }
   1197 
   1198     if (count == 1) {
   1199         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
   1200         uint16x8_t vsrc_wide, vdst_wide;
   1201 
   1202         // Load
   1203         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
   1204         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
   1205 
   1206         // Process
   1207         vsrc_wide = vmovl_u8(vsrc);
   1208         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
   1209         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
   1210         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
   1211 
   1212         // Store
   1213         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
   1214     }
   1215 }
   1216 
   1217 #ifdef SK_CPU_ARM32
   1218 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
   1219                          const SkPMColor* SK_RESTRICT src,
   1220                          int count, U8CPU alpha) {
   1221 
   1222     SkASSERT(255 >= alpha);
   1223 
   1224     if (count <= 0) {
   1225         return;
   1226     }
   1227 
   1228     unsigned alpha256 = SkAlpha255To256(alpha);
   1229 
   1230     // First deal with odd counts
   1231     if (count & 1) {
   1232         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
   1233         uint16x8_t vdst_wide, vsrc_wide;
   1234         unsigned dst_scale;
   1235 
   1236         // Load
   1237         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
   1238         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
   1239 
   1240         // Calc dst_scale
   1241         dst_scale = vget_lane_u8(vsrc, 3);
   1242         dst_scale *= alpha256;
   1243         dst_scale >>= 8;
   1244         dst_scale = 256 - dst_scale;
   1245 
   1246         // Process src
   1247         vsrc_wide = vmovl_u8(vsrc);
   1248         vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
   1249 
   1250         // Process dst
   1251         vdst_wide = vmovl_u8(vdst);
   1252         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
   1253 
   1254         // Combine
   1255         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
   1256 
   1257         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
   1258         dst++;
   1259         src++;
   1260         count--;
   1261     }
   1262 
   1263     if (count) {
   1264         uint8x8_t alpha_mask;
   1265         static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
   1266         alpha_mask = vld1_u8(alpha_mask_setup);
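                /* Note: vtbl1_u8 with the {3,3,3,3,7,7,7,7} table above copies
                 * byte 3 (the first pixel's alpha) into lanes 0-3 and byte 7
                 * (the second pixel's alpha) into lanes 4-7, broadcasting each
                 * alpha across its pixel's four lanes -- assuming alpha sits in
                 * the top byte of a little-endian SkPMColor (SK_A32_SHIFT == 24).
                 */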
   1267 
   1268         do {
   1269 
   1270             uint8x8_t vsrc, vdst, vres, vsrc_alphas;
   1271             uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
   1272 
   1273             __builtin_prefetch(src+32);
   1274             __builtin_prefetch(dst+32);
   1275 
   1276             // Load
   1277             vsrc = vreinterpret_u8_u32(vld1_u32(src));
   1278             vdst = vreinterpret_u8_u32(vld1_u32(dst));
   1279 
   1280             // Prepare src_scale
   1281             vsrc_scale = vdupq_n_u16(alpha256);
   1282 
   1283             // Calc dst_scale
   1284             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
   1285             vdst_scale = vmovl_u8(vsrc_alphas);
   1286             vdst_scale *= vsrc_scale;
   1287             vdst_scale = vshrq_n_u16(vdst_scale, 8);
   1288             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
   1289 
   1290             // Process src
   1291             vsrc_wide = vmovl_u8(vsrc);
   1292             vsrc_wide *= vsrc_scale;
   1293 
   1294             // Process dst
   1295             vdst_wide = vmovl_u8(vdst);
   1296             vdst_wide *= vdst_scale;
   1297 
   1298             // Combine
   1299             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
   1300 
   1301             vst1_u32(dst, vreinterpret_u32_u8(vres));
   1302 
   1303             src += 2;
   1304             dst += 2;
   1305             count -= 2;
   1306         } while(count);
   1307     }
   1308 }
   1309 
   1310 ///////////////////////////////////////////////////////////////////////////////
   1311 
   1312 #endif // #ifdef SK_CPU_ARM32
   1313 
   1314 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
   1315                                    const SkPMColor* SK_RESTRICT src,
   1316                                    int count, U8CPU alpha, int x, int y) {
   1317     SkASSERT(255 == alpha);
   1318 
   1319 #define    UNROLL    8
   1320 
   1321     if (count >= UNROLL) {
   1322 
   1323     uint8x8_t dbase;
   1324     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1325     dbase = vld1_u8(dstart);
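            /* Note: gDitherMatrix_Neon stores each row of the 4x4 dither matrix
             * three times over (12 bytes per row), so starting at any (x & 3)
             * offset there are always 8 in-bounds dither values for the 8
             * pixels handled per pass.
             */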
   1326 
   1327         do {
   1328         uint8x8x4_t vsrc;
   1329         uint8x8_t sr, sg, sb, sa, d;
   1330         uint16x8_t dst8, scale8, alpha8;
   1331         uint16x8_t dst_r, dst_g, dst_b;
   1332 
   1333 #ifdef SK_CPU_ARM64
   1334         vsrc = sk_vld4_u8_arm64_4(src);
   1335 #else
   1336         {
   1337         register uint8x8_t d0 asm("d0");
   1338         register uint8x8_t d1 asm("d1");
   1339         register uint8x8_t d2 asm("d2");
   1340         register uint8x8_t d3 asm("d3");
   1341 
   1342         asm ("vld4.8    {d0-d3},[%[src]]! "
   1343             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
   1344             :
   1345         );
   1346         vsrc.val[0] = d0;
   1347         vsrc.val[1] = d1;
   1348         vsrc.val[2] = d2;
   1349         vsrc.val[3] = d3;
   1350         }
   1351 #endif
   1352         sa = vsrc.val[NEON_A];
   1353         sr = vsrc.val[NEON_R];
   1354         sg = vsrc.val[NEON_G];
   1355         sb = vsrc.val[NEON_B];
   1356 
   1357         /* calculate 'd', which will be 0..7
   1358          * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
   1359          */
   1360         alpha8 = vmovl_u8(dbase);
   1361         alpha8 = vmlal_u8(alpha8, sa, dbase);
   1362         d = vshrn_n_u16(alpha8, 8);    // narrowing too
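                /* Note: the vmovl + vmlal above compute dbase * (sa + 1), so after
                 * the >> 8, 'd' matches the scalar tail below, which uses
                 * SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)).
                 */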
   1363 
   1364         // sr = sr - (sr>>5) + d
   1365         /* watch for 8-bit overflow: d is 0..7; the risky range of
   1366          * sr is >248, and then (sr>>5) is 7, which offsets 'd';
   1367          * safe as long as we compute ((sr - (sr>>5)) + d)
   1368          */
   1369         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
   1370         sr = vadd_u8(sr, d);
   1371 
   1372         // sb = sb - (sb>>5) + d
   1373         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
   1374         sb = vadd_u8(sb, d);
   1375 
   1376         // sg = sg - (sg>>6) + d>>1; similar logic for overflows
   1377         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
   1378         sg = vadd_u8(sg, vshr_n_u8(d,1));
   1379 
   1380         // need to pick up 8 dst's -- at 16 bits each, 128 bits
   1381         dst8 = vld1q_u16(dst);
   1382         dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
   1383         dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
   1384         dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
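                /* Note: dst_b, dst_g and dst_r now hold the raw 5-, 6- and 5-bit
                 * destination fields, zero-extended to 16 bits per lane.
                 */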
   1385 
   1386         // blend
   1387         scale8 = vsubw_u8(vdupq_n_u16(256), sa);
   1388 
   1389         // combine the addq and mul, save 3 insns
   1390         scale8 = vshrq_n_u16(scale8, 3);
   1391         dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
   1392         dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
   1393         dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
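                /* Worked form of the blend above (illustrative, with dst_b5 the
                 * 5-bit destination field): scale8 = (256 - sa) >> 3, and blue,
                 * for example, becomes
                 *
                 *     dst_b  = (sb << 2) + dst_b5 * scale8;   // vmlaq_u16
                 *     out_b5 = dst_b >> 5;                    // repacked below
                 *
                 * which is roughly (sb >> 3) + ((dst_b5 * (256 - sa)) >> 8),
                 * i.e. the usual src + dst * (256 - alpha) / 256 blend in 565
                 * space with the two shifts folded into one >> 5.
                 */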
   1394 
   1395         // repack to store
   1396         dst8 = vshrq_n_u16(dst_b, 5);
   1397         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
   1398         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
   1399 
   1400         vst1q_u16(dst, dst8);
   1401 
   1402         dst += UNROLL;
   1403         count -= UNROLL;
   1404         // skip x += UNROLL: x is only used mod 4, and UNROLL (8) leaves that unchanged
   1405         } while (count >= UNROLL);
   1406     }
   1407 #undef    UNROLL
   1408 
   1409     // residuals
   1410     if (count > 0) {
   1411         DITHER_565_SCAN(y);
   1412         do {
   1413             SkPMColor c = *src++;
   1414             SkPMColorAssert(c);
   1415             if (c) {
   1416                 unsigned a = SkGetPackedA32(c);
   1417 
   1418                 // dither and alpha are just temporary variables used to work around
   1419                 // an internal compiler error (ICE) in debug builds.
   1420                 unsigned dither = DITHER_VALUE(x);
   1421                 unsigned alpha = SkAlpha255To256(a);
   1422                 int d = SkAlphaMul(dither, alpha);
   1423 
   1424                 unsigned sr = SkGetPackedR32(c);
   1425                 unsigned sg = SkGetPackedG32(c);
   1426                 unsigned sb = SkGetPackedB32(c);
   1427                 sr = SkDITHER_R32_FOR_565(sr, d);
   1428                 sg = SkDITHER_G32_FOR_565(sg, d);
   1429                 sb = SkDITHER_B32_FOR_565(sb, d);
   1430 
   1431                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1432                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
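                        /* Note: SkExpand_rgb_16 spreads the 565 fields apart with
                         * spare bits between them, so the single multiply by the
                         * small (0..32) scale below blends all three channels at
                         * once; SkCompact_rgb_16 then packs the result back to 565.
                         */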
   1433                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1434                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1435                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1436             }
   1437             dst += 1;
   1438             DITHER_INC_X(x);
   1439         } while (--count != 0);
   1440     }
   1441 }
   1442 
   1443 ///////////////////////////////////////////////////////////////////////////////
   1444 
   1445 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
   1446                                  const SkPMColor* SK_RESTRICT src,
   1447                                  int count, U8CPU alpha, int x, int y) {
   1448     SkASSERT(255 == alpha);
   1449 
   1450 #define    UNROLL    8
   1451     if (count >= UNROLL) {
   1452     uint8x8_t d;
   1453     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1454     d = vld1_u8(dstart);
   1455 
   1456     while (count >= UNROLL) {
   1457         uint8x8_t sr, sg, sb;
   1458         uint16x8_t dr, dg, db;
   1459         uint16x8_t dst8;
   1460         uint8x8x4_t vsrc;
   1461 
   1462 #ifdef SK_CPU_ARM64
   1463         vsrc = sk_vld4_u8_arm64_3(src);
   1464 #else
   1465         {
   1466         register uint8x8_t d0 asm("d0");
   1467         register uint8x8_t d1 asm("d1");
   1468         register uint8x8_t d2 asm("d2");
   1469         register uint8x8_t d3 asm("d3");
   1470 
   1471         asm (
   1472             "vld4.8    {d0-d3},[%[src]]! "
   1473             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
   1474             :
   1475         );
   1476         vsrc.val[0] = d0;
   1477         vsrc.val[1] = d1;
   1478         vsrc.val[2] = d2;
   1479         }
   1480 #endif
   1481         sr = vsrc.val[NEON_R];
   1482         sg = vsrc.val[NEON_G];
   1483         sb = vsrc.val[NEON_B];
   1484 
   1485         /* XXX: if we want to prefetch, hide it in the above asm();
   1486          * with gcc's __builtin_prefetch() the prefetch will
   1487          * fall to the bottom of the loop -- it won't end up
   1488          * at the top of the loop, just after the vld4.
   1489          */
   1490 
   1491         // sr = sr - (sr>>5) + d
   1492         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
   1493         dr = vaddl_u8(sr, d);
   1494 
   1495         // sb = sb - (sb>>5) + d
   1496         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
   1497         db = vaddl_u8(sb, d);
   1498 
   1499         // sg = sg - (sg>>6) + d>>1; similar logic for overflows
   1500         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
   1501         dg = vaddl_u8(sg, vshr_n_u8(d, 1));
   1502 
   1503         // pack high bits of each into 565 format  (rgb, b is lsb)
   1504         dst8 = vshrq_n_u16(db, 3);
   1505         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
   1506         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
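                /* Note: the three lines above assemble RGB565 as
                 *     dst8 = ((dr >> 3) << 11) | ((dg >> 2) << 5) | (db >> 3)
                 * using vsli (shift left and insert) so each field lands in place
                 * without extra masking.
                 */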
   1507 
   1508         // store it
   1509         vst1q_u16(dst, dst8);
   1510 
   1511         dst += UNROLL;
   1512         // we don't need to increment src as the asm above has already done it
   1513         count -= UNROLL;
   1514         x += UNROLL;        // probably superfluous
   1515     }
   1516     }
   1517 #undef    UNROLL
   1518 
   1519     // residuals
   1520     if (count > 0) {
   1521         DITHER_565_SCAN(y);
   1522         do {
   1523             SkPMColor c = *src++;
   1524             SkPMColorAssert(c);
   1525             SkASSERT(SkGetPackedA32(c) == 255);
   1526 
   1527             unsigned dither = DITHER_VALUE(x);
   1528             *dst++ = SkDitherRGB32To565(c, dither);
   1529             DITHER_INC_X(x);
   1530         } while (--count != 0);
   1531     }
   1532 }
   1533 
   1534 ///////////////////////////////////////////////////////////////////////////////
   1535 
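        /* These tables are indexed by SkBlitRow's flag combinations (blend with
         * a global alpha, per-pixel source alpha, and dither where the table has
         * dither entries); see the per-entry comments.  A nullptr entry means no
         * NEON routine is provided for that case, and Skia falls back to its
         * portable implementation.
         */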
   1536 const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
   1537     // no dither
   1538     S32_D565_Opaque_neon,
   1539     S32_D565_Blend_neon,
   1540     S32A_D565_Opaque_neon,
   1541 #if 0
   1542     S32A_D565_Blend_neon,
   1543 #else
   1544     nullptr,   // https://code.google.com/p/skia/issues/detail?id=2797
   1545 #endif
   1546 
   1547     // dither
   1548     S32_D565_Opaque_Dither_neon,
   1549     S32_D565_Blend_Dither_neon,
   1550     S32A_D565_Opaque_Dither_neon,
   1551     nullptr,   // S32A_D565_Blend_Dither
   1552 };
   1553 
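        /* All four constant-color cases reuse Color32A_D565_neon: the opaque
         * entries are just the alpha == 255 special case, and no dithering
         * variant is provided, so the dither slots fall back to the same proc.
         */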
   1554 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
   1555     Color32A_D565_neon,    // Color32_D565,
   1556     Color32A_D565_neon,    // Color32A_D565,
   1557     Color32A_D565_neon,    // Color32_D565_Dither,
   1558     Color32A_D565_neon,    // Color32A_D565_Dither
   1559 };
   1560 
   1561 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
   1562     nullptr,   // S32_Opaque,
   1563     S32_Blend_BlitRow32_neon,        // S32_Blend,
   1564     /*
   1565      * We have two choices for S32A_Opaque procs. One reads the src alpha
   1566      * value and attempts to optimize accordingly.  The optimization is
   1567      * sensitive to the source content and is not a win in all cases. For
   1568      * example, if there are a lot of transitions between the alpha states,
   1569      * the performance will almost certainly be worse.  However, for many
   1570      * common cases the performance is equivalent to or better than the standard
   1571      * case, where we do not inspect the src alpha.
   1572      */
   1573 #if SK_A32_SHIFT == 24
   1574     // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
   1575     S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
   1576 #else
   1577     S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
   1578 #endif
   1579 #ifdef SK_CPU_ARM32
   1580     S32A_Blend_BlitRow32_neon        // S32A_Blend
   1581 #else
   1582     nullptr
   1583 #endif
   1584 };
   1585