/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif
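
/* The ARM64 helpers above pin the post-incrementing form of ld4; apart from
 * that they match the plain intrinsic sequence used by the 32-bit path
 * below (a sketch for reference, not a replacement):
 *
 *     uint8x8x4_t vsrc = vld4_u8((const uint8_t*)src); // deinterleave 8 pixels
 *     src += 8;                                        // advance by hand
 *
 * sk_vld4_u8_arm64_3 deliberately leaves val[3] unset, since its callers
 * only need the R, G and B planes.
 */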
     66 
     67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
     68                            const SkPMColor* SK_RESTRICT src, int count,
     69                            U8CPU alpha, int /*x*/, int /*y*/) {
     70     SkASSERT(255 == alpha);
     71 
     72     while (count >= 8) {
     73         uint8x8x4_t vsrc;
     74         uint16x8_t vdst;
     75 
     76         // Load
     77 #ifdef SK_CPU_ARM64
     78         vsrc = sk_vld4_u8_arm64_3(src);
     79 #else
     80         vsrc = vld4_u8((uint8_t*)src);
     81         src += 8;
     82 #endif
     83 
     84         // Convert src to 565
     85         vdst = SkPixel32ToPixel16_neon8(vsrc);
     86 
     87         // Store
     88         vst1q_u16(dst, vdst);
     89 
     90         // Prepare next iteration
     91         dst += 8;
     92         count -= 8;
     93     };
     94 
     95     // Leftovers
     96     while (count > 0) {
     97         SkPMColor c = *src++;
     98         SkPMColorAssert(c);
     99         *dst = SkPixel32ToPixel16_ToU16(c);
    100         dst++;
    101         count--;
    102     };
    103 }
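
/* For reference, the 565 packing used above matches the scalar leftover
 * path: each pixel becomes ((r8 >> 3) << 11) | ((g8 >> 2) << 5) | (b8 >> 3),
 * e.g. opaque white 0xFFFFFFFF packs to 0xFFFF.
 */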

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}
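
/* Per channel, the vector loop above computes the same expression as the
 * scalar SkAlphaBlend() calls in its leftover path:
 *
 *     res = dst + (((src565 - dst) * scale) >> 8),   scale in [1, 256]
 *
 * Worked example: src = 31, dst = 0, alpha = 127 gives scale = 128 and
 * res = (31 * 128) >> 8 = 15.
 */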

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8 pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8 pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
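
/* Per pixel, both asm paths above are the 565 form of src-over, roughly
 * dst = src565 + dst565 * (255 - srcA) / 255, with the division by 255
 * approximated by the vshr/vaddhn pairs at 5- and 6-bit channel precision;
 * the saturating vqadd keeps the final per-channel add from wrapping.
 */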

#else // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 16) {
        asm (
            "movi    v4.8h, #0x80                   \t\n"

            "1:                                     \t\n"
            "sub     %[count], %[count], #16        \t\n"
            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm    pldl1keep, [%[src],#512]       \t\n"
            "prfm    pldl1keep, [%[dst],#256]       \t\n"
            "ushr    v20.8h, v17.8h, #5             \t\n"
            "ushr    v31.8h, v16.8h, #5             \t\n"
            "xtn     v6.8b, v31.8h                  \t\n"
            "xtn2    v6.16b, v20.8h                 \t\n"
            "ushr    v20.8h, v17.8h, #11            \t\n"
            "shl     v19.16b, v6.16b, #2            \t\n"
            "ushr    v31.8h, v16.8h, #11            \t\n"
            "xtn     v22.8b, v31.8h                 \t\n"
            "xtn2    v22.16b, v20.8h                \t\n"
            "shl     v18.16b, v22.16b, #3           \t\n"
            "mvn     v3.16b, v3.16b                 \t\n"
            "xtn     v16.8b, v16.8h                 \t\n"
            "mov     v7.16b, v4.16b                 \t\n"
            "xtn2    v16.16b, v17.8h                \t\n"
            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
            "shl     v16.16b, v16.16b, #3           \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "ushr    v24.8h, v7.8h, #6              \t\n"
            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
            "ushr    v20.8h, v22.8h, #5             \t\n"
            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
            "cmp     %[count], #16                  \t\n"
            "mov     v6.16b, v4.16b                 \t\n"
            "mov     v5.16b, v4.16b                 \t\n"
            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
            "mov     v17.16b, v4.16b                \t\n"
            "ushr    v19.8h, v6.8h, #5              \t\n"
            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
            "ushr    v18.8h, v5.8h, #6              \t\n"
            "ushr    v21.8h, v17.8h, #5             \t\n"
            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }
    // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}
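
/* SkDiv255Round_neon8 applies the standard rounding-division identity
 * ((x + 128) + ((x + 128) >> 8)) >> 8 == round(x / 255), exact for all
 * x in [0, 255*255]. Example: x = 255*255 = 65025 gives
 * (65153 + (65153 >> 8)) >> 8 = (65153 + 254) >> 8 = 255.
 */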

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
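
/* Indexing example for the matrix above: for a span starting at (x, y),
 * &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)] points into row y & 3 at
 * offset x & 3, so an 8-byte vld1_u8 yields the dither values for
 * x, x+1, ..., x+7, repeating with period 4 just like DITHER_VALUE(x).
 */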

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);  // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask

        do {
            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
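
/* The dither arithmetic above mirrors the scalar SkDITHER_*32To565 macros:
 * each channel becomes (c + dither - (c >> bits)) >> (8 - bits), with bits
 * = 5 for red/blue and 6 for green (green uses dither >> 1). The vaddl_u8
 * widening makes 8-bit overflow a non-issue here; the 8-bit variant in
 * S32A_D565_Opaque_Dither_neon below instead relies on subtracting
 * c >> bits before adding the dither.
 */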

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd pixels of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th pixels of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}
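
/* Per pixel this is SkPMSrcOver(): dst = src + ((dst * (256 - srcA)) >> 8),
 * two pixels per 64-bit register. The vtbl1_u8 with {3,3,3,3,7,7,7,7}
 * broadcasts each pixel's alpha byte (byte 3 of a packed SkPMColor) across
 * its four lanes so a single vmulq_u16 scales all channels at once.
 */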

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if (src >= src_end)
        goto TAIL;
    if (*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if (*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {
        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
         * it might make sense to go to the optimized loops */
        if ((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while (src < src_end);

    if (src >= src_end)
        goto TAIL;

    if (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /* fall-thru */

ALPHA_0:
    /* In this state, we know the current alpha is 0 and
     * we optimize for the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
    } while (src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if (src >= src_end)
        goto TAIL;
    if (*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while ((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
        src += UNROLL;
        dst += UNROLL;
        if (src >= src_end)
            goto TAIL;
    }

    // Handle remainder.
    if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if (*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if (src >= src_end)
        goto TAIL;
    if (*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go back to the real end
    while (src != src_end) {
        if (*src != 0) {
            if (*src >= ALPHA_OPAQ) {
                *dst = *src;
            } else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}
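
/* The routine above is a three-state specialization of
 * S32A_Opaque_BlitRow32_neon: runs of transparent pixels (ALPHA_0) only
 * advance the pointers, runs of opaque pixels (ALPHA_255) are plain copies,
 * and only the 1..254 range pays for the full blend. The ALPHA_TRANS /
 * ALPHA_OPAQ comparisons work because alpha occupies the most significant
 * byte of a packed SkPMColor.
 */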

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}
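
/* Worked example for the loop above: alpha = 127 gives src_scale = 128 and
 * dst_scale = 128, so each output byte is (src * 128 >> 8) + (dst * 128 >> 8),
 * i.e. a truncating 50/50 average of the two rows.
 */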

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {
    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while (count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf, "%8s:", str);
    for (i = 0; i < len; i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf, "%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for (i = 0; i < len; i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32
   1210 
   1211 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
   1212                                    const SkPMColor* SK_RESTRICT src,
   1213                                    int count, U8CPU alpha, int x, int y) {
   1214     SkASSERT(255 == alpha);
   1215 
   1216 #define    UNROLL    8
   1217 
   1218     if (count >= UNROLL) {
   1219 
   1220 #if defined(DEBUG_OPAQUE_DITHER)
   1221     uint16_t tmpbuf[UNROLL];
   1222     int td[UNROLL];
   1223     int tdv[UNROLL];
   1224     int ta[UNROLL];
   1225     int tap[UNROLL];
   1226     uint16_t in_dst[UNROLL];
   1227     int offset = 0;
   1228     int noisy = 0;
   1229 #endif
   1230 
   1231     uint8x8_t dbase;
   1232     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1233     dbase = vld1_u8(dstart);
   1234 
   1235         do {
   1236         uint8x8x4_t vsrc;
   1237         uint8x8_t sr, sg, sb, sa, d;
   1238         uint16x8_t dst8, scale8, alpha8;
   1239         uint16x8_t dst_r, dst_g, dst_b;
   1240 
   1241 #if defined(DEBUG_OPAQUE_DITHER)
   1242         // calculate 8 elements worth into a temp buffer
   1243         {
   1244         int my_y = y;
   1245         int my_x = x;
   1246         SkPMColor* my_src = (SkPMColor*)src;
   1247         uint16_t* my_dst = dst;
   1248         int i;
   1249 
   1250         DITHER_565_SCAN(my_y);
   1251         for(i = 0; i < UNROLL; i++) {
   1252             SkPMColor c = *my_src++;
   1253             SkPMColorAssert(c);
   1254             if (c) {
   1255                 unsigned a = SkGetPackedA32(c);
   1256 
   1257                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
   1258                 tdv[i] = DITHER_VALUE(my_x);
   1259                 ta[i] = a;
   1260                 tap[i] = SkAlpha255To256(a);
   1261                 td[i] = d;
   1262 
   1263                 unsigned sr = SkGetPackedR32(c);
   1264                 unsigned sg = SkGetPackedG32(c);
   1265                 unsigned sb = SkGetPackedB32(c);
   1266                 sr = SkDITHER_R32_FOR_565(sr, d);
   1267                 sg = SkDITHER_G32_FOR_565(sg, d);
   1268                 sb = SkDITHER_B32_FOR_565(sb, d);
   1269 
   1270                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1271                 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
   1272                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1273                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1274                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1275                 td[i] = d;
   1276             } else {
   1277                 tmpbuf[i] = *my_dst;
   1278                 ta[i] = tdv[i] = td[i] = 0xbeef;
   1279             }
   1280             in_dst[i] = *my_dst;
   1281             my_dst += 1;
   1282             DITHER_INC_X(my_x);
   1283         }
   1284         }
   1285 #endif

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_4(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        vsrc.val[3] = d3;
        }
#endif
        sa = vsrc.val[NEON_A];
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too
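        /* Editor's sketch of the scalar equivalent (illustrative only): the
         * vmovl + vmlal pair computes dbase*(1 + sa), so after the narrowing
         * shift each lane holds
         *
         *   d[i] = (dbase[i] * (1 + sa[i])) >> 8;   // 0..7
         *
         * which matches SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)) in
         * the scalar residual loop below.
         */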

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow: d is 0..7, and the risky range of
         * sr is >248, but then (sr>>5) is 7, which offsets 'd'; this is
         * safe as long as we compute ((sr - (sr>>5)) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + (d>>1); similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d, 1));
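        /* Editor's worked example (illustrative): at the extreme sr = 255
         * with maximum dither d = 7, the adjusted value is
         *
         *   255 - (255 >> 5) + 7  =  255 - 7 + 7  =  255
         *
         * so the subtract-then-add order never wraps an 8-bit lane.
         */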

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
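        /* Editor's note: with the usual 565 layout (SK_R16_BITS == 5,
         * SK_B16_BITS == 5, SK_R16_SHIFT == 11), this unpack is, per lane:
         *
         *   dst_b = dst8 & 0x1f;          // low 5 bits
         *   dst_g = (dst8 << 5) >> 10;    // middle 6 bits, isolated by the
         *                                 // shift-up/shift-down pair
         *   dst_r = dst8 >> 11;           // top 5 bits
         */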

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);

        // combine the add and the multiply into one vmla per channel, saving 3 insns
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb, 2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg, 3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr, 2), dst_r, scale8);
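        /* Editor's sketch of the per-lane arithmetic (illustrative): with
         * scale = (256 - sa) >> 3, and folding in the final >>5 from the
         * repack below, e.g. for blue:
         *
         *   out_b = ((sb << 2) + dst_b * scale) >> 5
         *         ~= (sb >> 3) + dst_b * (256 - sa) / 256
         *
         * i.e. the usual src-over blend of a 5-bit dst channel with an 8-bit
         * src channel. Green uses << 3 instead, keeping an extra bit of
         * precision since green has 6 bits in 565.
         */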

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r, 5), 11);
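        /* Editor's note: vsli (shift left and insert) shifts its second
         * operand left and copies it into dst8 without disturbing the bits
         * below the shift point, so the three channels are packed as
         *
         *   dst8 = (b >> 5) | ((g >> 5) << 5) | ((r >> 5) << 11);
         *
         * in one instruction per channel instead of shift+or pairs.
         */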

        vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
        // verify my 8 elements match the temp buffer
        {
        int i, bad = 0;
        static int invocation;

        for (i = 0; i < UNROLL; i++) {
            if (tmpbuf[i] != dst[i]) {
                bad = 1;
            }
        }
        if (bad) {
            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                     invocation, offset);
            SkDebugf("  alpha 0x%x\n", alpha);
            for (i = 0; i < UNROLL; i++)
                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

            showme16("alpha8", &alpha8, sizeof(alpha8));
            showme16("scale8", &scale8, sizeof(scale8));
            showme8("d", &d, sizeof(d));
            showme16("dst8", &dst8, sizeof(dst8));
            showme16("dst_b", &dst_b, sizeof(dst_b));
            showme16("dst_g", &dst_g, sizeof(dst_g));
            showme16("dst_r", &dst_r, sizeof(dst_r));
            showme8("sb", &sb, sizeof(sb));
            showme8("sg", &sg, sizeof(sg));
            showme8("sr", &sr, sizeof(sr));

            return;
        }
        offset += UNROLL;
        invocation++;
        }
#endif
        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work around
                // an internal compiler error (ICE) in debug builds.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
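                /* Editor's note on the expand/compact trick (illustrative):
                 * SkExpand_rgb_16 spreads the three 565 fields into a 32-bit
                 * word with gaps (the g:11 r:10 x:1 b:10 layout noted above),
                 * so a single 32-bit multiply by a 5-bit scale (0..31) scales
                 * all three channels at once without any field overflowing
                 * into its neighbor. SkCompact_rgb_16 packs them back to 565.
                 */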
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;
        uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* XXX: if we want to prefetch, hide it inside the asm() above rather
         * than using gcc's __builtin_prefetch(): with the builtin, the
         * prefetch falls to the bottom of the loop instead of sitting at the
         * top, just after the vld4.
         */

        // sr = sr - (sr>>5) + d
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + (d>>1); similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

        // pack high bits of each into 565 format (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
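        /* Editor's note (illustrative): vaddl widens to 16 bits, so the
         * dithered channels here are still full 8-bit values; taking the
         * top bits
         *
         *   b5 = db >> 3;   g6 = dg >> 2;   r5 = dr >> 3;
         *
         * and inserting with vsli yields the r5:g6:b5 layout. The debug block
         * below checks this against the scalar SkDitherRGB32To565, which is
         * expected to produce identical results.
         */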

        // store it
        vst1q_u16(dst, dst8);

#if    defined(DEBUG_S32_OPAQUE_DITHER)
        // always good to know if we generated good results
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i = 0; i < UNROLL; i++) {
            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
            SkPMColor c = src[i-8];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
                SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
                         c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
        return;
    }

    unsigned scale = 256 - SkAlpha255To256(colorA);
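    /* Editor's note (illustrative): every path below computes the standard
     * src-over of a constant premultiplied color over each pixel:
     *
     *   dst[i] = color + SkAlphaMulQ(src[i], scale);   // scale = 256 - (a+1)
     *
     * where SkAlphaMulQ multiplies all four 8-bit components by scale/256.
     * The NEON loop is simply this formula applied to 8 pixels at a time.
     */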

    if (count >= 8) {
        uint32x4_t vcolor;
        uint8x8_t vscale;

        vcolor = vdupq_n_u32(color);

        // scale is in the interval [0, 255], so it fits in 8 bits
        vscale = vdup_n_u8(scale);

        do {
            // load src color, 8 pixels, four 64-bit registers
            // (and increment src).
            uint32x2x4_t vsrc;
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vld1.32    %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+r" (src)
                : :
            );
#else // 64-bit targets and Clang
            vsrc.val[0] = vld1_u32(src);
            vsrc.val[1] = vld1_u32(src+2);
            vsrc.val[2] = vld1_u32(src+4);
            vsrc.val[3] = vld1_u32(src+6);
            src += 8;
#endif

            // multiply long by scale, 64 bits at a time,
            // destination into a 128-bit register.
            uint16x8x4_t vtmp;
            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);

            // shift the 128-bit registers holding the 16-bit scaled values
            // back down to 8 bits, narrowing the results into 64-bit registers.
            uint8x16x2_t vres;
            vres.val[0] = vcombine_u8(
                            vshrn_n_u16(vtmp.val[0], 8),
                            vshrn_n_u16(vtmp.val[1], 8));
            vres.val[1] = vcombine_u8(
                            vshrn_n_u16(vtmp.val[2], 8),
                            vshrn_n_u16(vtmp.val[3], 8));

            // add back the color, using 128-bit registers.
            uint32x4x2_t vdst;
            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
                                               vreinterpretq_u8_u32(vcolor));
            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
                                               vreinterpretq_u8_u32(vcolor));
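            /* Editor's note (illustrative): the plain 8-bit add cannot wrap,
             * because with valid premultiplied colors each component satisfies
             *
             *   color_c + src_c * (255 - colorA) / 256  <  255
             *
             * (src_c <= 255 and color_c <= colorA), matching the scalar
             * `color + SkAlphaMulQ(*src, scale)` in the tail loop.
             */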

            // store back the 8 computed pixels (two 128-bit registers),
            // and increment dst.
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vst1.32    %h[vdst], [%[dst]]!"
                : [dst] "+r" (dst)
                : [vdst] "w" (vdst)
                : "memory"
            );
#else // 64-bit targets and Clang
            vst1q_u32(dst, vdst.val[0]);
            vst1q_u32(dst+4, vdst.val[1]);
            dst += 8;
#endif
            count -= 8;

        } while (count >= 8);
    }

    while (count > 0) {
        *dst = color + SkAlphaMulQ(*src, scale);
        src += 1;
        dst += 1;
        count--;
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    NULL,   // https://code.google.com/p/skia/issues/detail?id=2845
            // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};
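/* Editor's note (an assumption about the caller, not stated here): SkBlitRow
 * indexes this table with its flag bits, so the entry order is taken to be
 * the (kGlobalAlpha_Flag | kSrcPixelAlpha_Flag | kDither_Flag) combinations
 * from 0 to 7, and a NULL entry makes the caller fall back to the portable
 * implementation for that combination.
 */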

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for the S32A_Opaque proc. The first reads the src
     * alpha value and attempts to optimize accordingly. That optimization is
     * sensitive to the source content and is not a win in all cases: for
     * example, if there are many transitions between alpha states, performance
     * will almost certainly be worse. However, for many common cases the
     * performance is equivalent to or better than the standard case, where we
     * do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    NULL
#endif
};