/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

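/* These wrappers hand-roll a single ld4 structure load in inline assembly
 * and return the de-interleaved lanes as a uint8x8x4_t, advancing src by
 * 8 pixels. (Presumably this works around weak vld4_u8 code generation on
 * early ARM64 toolchains; the file itself does not say.)
 */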
#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

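        // RGB565 packs a pixel as RRRRRGGG GGGBBBBB, so the conversion keeps
        // only the top bits of each 8-bit channel: red >> 3, green >> 2,
        // blue >> 3.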
        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src, int count,
                         U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

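    // Per channel this computes d + (((s - d) * scale) >> 8), with
    // scale = SkAlpha255To256(alpha) in [1, 256]; eight pixels are
    // blended per iteration, entirely in 16-bit lanes.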
    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

#ifdef SK_CPU_ARM32
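/* A scalar sketch of what the assembly below computes per pixel (the
 * leftover loops elsewhere in this file use SkSrcOver32To16 for the same
 * operation): expand each 565 dst channel to 8 bits, scale it by the
 * inverted source alpha with an approximate divide-by-255, saturating-add
 * the 8-bit src channel, and repack to 565. Destination alpha is never
 * computed.
 */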
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}

#else // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 16) {
        asm (
            "movi    v4.8h, #0x80                   \t\n"

            "1:                                     \t\n"
            "sub     %[count], %[count], #16        \t\n"
            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm    pldl1keep, [%[src],#512]       \t\n"
            "prfm    pldl1keep, [%[dst],#256]       \t\n"
            "ushr    v20.8h, v17.8h, #5             \t\n"
            "ushr    v31.8h, v16.8h, #5             \t\n"
            "xtn     v6.8b, v31.8h                  \t\n"
            "xtn2    v6.16b, v20.8h                 \t\n"
            "ushr    v20.8h, v17.8h, #11            \t\n"
            "shl     v19.16b, v6.16b, #2            \t\n"
            "ushr    v31.8h, v16.8h, #11            \t\n"
            "xtn     v22.8b, v31.8h                 \t\n"
            "xtn2    v22.16b, v20.8h                \t\n"
            "shl     v18.16b, v22.16b, #3           \t\n"
            "mvn     v3.16b, v3.16b                 \t\n"
            "xtn     v16.8b, v16.8h                 \t\n"
            "mov     v7.16b, v4.16b                 \t\n"
            "xtn2    v16.16b, v17.8h                \t\n"
            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
            "shl     v16.16b, v16.16b, #3           \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "ushr    v24.8h, v7.8h, #6              \t\n"
            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
            "ushr    v20.8h, v22.8h, #5             \t\n"
            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
            "cmp     %[count], #16                  \t\n"
            "mov     v6.16b, v4.16b                 \t\n"
            "mov     v5.16b, v4.16b                 \t\n"
            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
            "mov     v17.16b, v4.16b                \t\n"
            "ushr    v19.8h, v6.8h, #5              \t\n"
            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
            "ushr    v18.8h, v5.8h, #6              \t\n"
            "ushr    v21.8h, v17.8h, #5             \t\n"
            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }

    // Leftovers
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

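/* Position the 8-bit source channels so that, after adding the scaled
 * SkExpand_rgb_16(dst) value and shifting right by 5, each channel's top
 * bits land where SkCompact_rgb_16 expects them:
 *   (g << 24) >> 5 -> bits 19..26 (green), (r << 13) >> 5 -> bits 8..15
 *   (red, top 5 bits at 11..15), (b << 2) >> 5 -> bits 0..4 (blue).
 */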
static uint32_t pmcolor_to_expand16(SkPMColor c) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    return (g << 24) | (r << 13) | (b << 2);
}

void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    uint32_t src_expand;
    unsigned scale;
    uint16x8_t vmask_blue;

    if (count <= 0) return;
    SkASSERT(((size_t)dst & 0x01) == 0);

    /*
     * This preamble aligns dst to an 8-byte boundary for the multi-byte
     * reads and writes that follow.
     */
    src_expand = pmcolor_to_expand16(src);
    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

#define DST_ALIGN 8

    /*
     * preamble_size is in bytes; this loop advances dst 2 bytes (one 565
     * pixel) at a time.
     */
    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);

    for (int i = 0; i < preamble_size; i += 2, dst++) {
        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
        if (--count == 0)
            break;
    }

    int count16 = count >> 4;
    vmask_blue = vmovq_n_u16(SK_B16_MASK);

    if (count16) {
        uint16x8_t wide_sr;
        uint16x8_t wide_sg;
        uint16x8_t wide_sb;
        uint16x8_t wide_256_sa;

        unsigned sr = SkGetPackedR32(src);
        unsigned sg = SkGetPackedG32(src);
        unsigned sb = SkGetPackedB32(src);
        unsigned sa = SkGetPackedA32(src);

        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) * dst_rgb
        // sr is 8-bit and dr is 5-bit, and dr * ((256 - sa) >> 3) is left-
        // shifted by 5; thus sr gets a 2-bit left shift so the MSBs line up:
        // 8 + 2 = 5 + 5.
        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift

        // sg is 8-bit and dg is 6-bit, and dg * ((256 - sa) >> 3) is left-
        // shifted by 5; thus sg gets a 3-bit left shift: 8 + 3 = 6 + 5.
        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift

        // sb is 8-bit and db is 5-bit, and db * ((256 - sa) >> 3) is left-
        // shifted by 5; thus sb gets a 2-bit left shift: 8 + 2 = 5 + 5.
        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift

        wide_256_sa =
            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3

        while (count16-- > 0) {
            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
            vdst1 = vld1q_u16(dst);
            dst += 8;
            vdst2 = vld1q_u16(dst);
            dst -= 8;    // rewind so both results can be stored below

            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
            vdst1_b = vdst1 & vmask_blue;                              // extract blue
            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
            vdst2_b = vdst2 & vmask_blue;                              // extract blue
            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) * dr1
            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) * dg1
            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) * db1

            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) * dr2
            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) * dg2
            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) * db2

            vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
            vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
            vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue

            vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
            vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
            vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue

            vst1q_u16(dst, vdst1);
            dst += 8;
            vst1q_u16(dst, vdst2);
            dst += 8;
        }
    }

    count &= 0xF;
    if (count > 0) {
        do {
            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
            dst += 1;
        } while (--count != 0);
    }
}

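/* Rounded division by 255 on eight 16-bit lanes: for x in [0, 255*255],
 * ((x + 128) + ((x + 128) >> 8)) >> 8 equals SkDiv255Round(x), i.e.
 * x / 255 rounded to nearest, using only adds and shifts.
 */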
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

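        // Per channel: result = (s565 * alpha + d565 * dst_scale) >> 8, where
        // dst_scale = 255 - SkDiv255Round(sa * alpha); the rounding shift
        // stands in for an exact /255, which is the source of the off-by-one
        // mismatches noted above.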
        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {
            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);
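    /* vtbl1_u8 with this index vector copies byte 3 (pixel 0's alpha) into
     * lanes 0-3 and byte 7 (pixel 1's alpha) into lanes 4-7, so one table
     * lookup broadcasts each pixel's alpha across its own RGBA lanes.
     */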

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

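    /* The code below is a small state machine over runs of source alpha:
     * ALPHA_1_TO_254 does the full blend, ALPHA_0 skips writes entirely,
     * and ALPHA_255 degenerates to a straight copy. Each state falls
     * through to the others when the run it is optimized for ends.
     */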
    /* choose the first processing type */
    if (src >= src_end)
        goto TAIL;
    if (*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if (*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {
        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if ((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;
    } while (src < src_end);

    if (src >= src_end)
        goto TAIL;

    if (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /* fall-thru */

ALPHA_0:
    /* In this state, we know the current alpha is 0 and
       we optimize for the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
        if (*(++src) > ALPHA_TRANS)
            break;
    } while (src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if (src >= src_end)
        goto TAIL;
    if (*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while ((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
        src += UNROLL;
        dst += UNROLL;
        if (src >= src_end)
            goto TAIL;
    }

    // Handle remainder.
    if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if (*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if (*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if (src >= src_end)
        goto TAIL;
    if (*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go to the real end
    while (src != src_end) {
        if (*src != 0) {
            if (*src >= ALPHA_OPAQ) {
                *dst = *src;
            } else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

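    // Each byte lane computes (src * src_scale + dst * dst_scale) >> 8,
    // with src_scale = alpha + 1 and dst_scale = 256 - src_scale, two
    // pixels per iteration.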
    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {
    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);
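    // Per pixel (for the odd leading pixel and for each pair below):
    //   result = (src * alpha256 + dst * dst_scale) >> 8,
    // where dst_scale = 256 - ((src_alpha * alpha256) >> 8).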
   1231 
   1232     // First deal with odd counts
   1233     if (count & 1) {
   1234         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
   1235         uint16x8_t vdst_wide, vsrc_wide;
   1236         unsigned dst_scale;
   1237 
   1238         // Load
   1239         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
   1240         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
   1241 
   1242         // Calc dst_scale
   1243         dst_scale = vget_lane_u8(vsrc, 3);
   1244         dst_scale *= alpha256;
   1245         dst_scale >>= 8;
   1246         dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);
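
        // The table indices replicate byte 3 (alpha of pixel 0) across the
        // first four lanes and byte 7 (alpha of pixel 1) across the last
        // four, so vtbl1_u8 below broadcasts each pixel's alpha to all of
        // its channels. This assumes alpha sits in the most significant
        // byte of each SkPMColor (SK_A32_SHIFT == 24) on this little-endian
        // target.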

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    unsigned char *pc = (unsigned char*) p;    // unsigned, so %02x isn't sign-extended
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
    int noisy = 0;
#endif

    uint8x8_t dbase;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);
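    // Each matrix row is stored with a 12-byte stride, i.e. the 4-entry
    // dither row presumably repeated three times, so an 8-byte load from
    // offset (x&3) yields the dither values for 8 consecutive x positions
    // (wrapping mod 4).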

        do {
        uint8x8x4_t vsrc;
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
        // calculate 8 elements worth into a temp buffer
        {
        int my_y = y;
        int my_x = x;
        SkPMColor* my_src = (SkPMColor*)src;
        uint16_t* my_dst = dst;
        int i;

        DITHER_565_SCAN(my_y);
        for(i = 0; i < UNROLL; i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                tdv[i] = DITHER_VALUE(my_x);
                ta[i] = a;
                tap[i] = SkAlpha255To256(a);
                td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                td[i] = d;
            } else {
                tmpbuf[i] = *my_dst;
                ta[i] = tdv[i] = td[i] = 0xbeef;
            }
            in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
        }
        }
#endif

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_4(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        vsrc.val[3] = d3;
        }
#endif
        sa = vsrc.val[NEON_A];
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too
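        // i.e. d = (dbase * (sa + 1)) >> 8, which matches the scalar
        // SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)) in the residual
        // loop below.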

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow. d is 0..7; the risky range of
         * sr is >248, but then (sr>>5) is 7, which offsets 'd'; this is
         * safe as long as we compute ((sr - (sr>>5)) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
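        // Unpacking 565: blue is masked in place; green is extracted by
        // shifting left to drop the red field, then right past blue; red
        // needs only the right shift, since it occupies the top bits.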

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);

        // combine the addq and mul, save 3 insns
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
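        // Per channel this is, e.g. for blue:
        //     dst_b = (sb << 2) + dst_b * ((256 - sa) >> 3)
        // The vshll pre-scales the 8-bit source so that the single >> 5 at
        // repack time divides both the source term and the dst*scale product.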

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
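        // vsliq_n_u16 (shift left and insert) deposits each shifted field
        // above the bits already in dst8, rebuilding the 565 word without
        // any extra masking.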

        vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
        // verify my 8 elements match the temp buffer
        {
        int i, bad=0;
        static int invocation;

        for (i = 0; i < UNROLL; i++) {
            if (tmpbuf[i] != dst[i]) {
                bad=1;
            }
        }
        if (bad) {
            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                     invocation, offset);
            SkDebugf("  alpha 0x%x\n", alpha);
            for (i = 0; i < UNROLL; i++)
                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

            showme16("alpha8", &alpha8, sizeof(alpha8));
            showme16("scale8", &scale8, sizeof(scale8));
            showme8("d", &d, sizeof(d));
            showme16("dst8", &dst8, sizeof(dst8));
            showme16("dst_b", &dst_b, sizeof(dst_b));
            showme16("dst_g", &dst_g, sizeof(dst_g));
            showme16("dst_r", &dst_r, sizeof(dst_r));
            showme8("sb", &sb, sizeof(sb));
            showme8("sg", &sg, sizeof(sg));
            showme8("sr", &sr, sizeof(sr));

            return;
        }
        offset += UNROLL;
        invocation++;
        }
#endif
        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables used to work
                // around an ICE in debug builds.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;
        uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        // d3 (the alpha lanes) is deliberately not copied; this opaque
        // blit never reads vsrc.val[3].
        }
#endif
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* XXX: if we want to prefetch, hide it in the above asm(): with
         * gcc's __builtin_prefetch() the prefetch falls to the bottom of
         * the loop -- it won't stay up at the top of the loop, just after
         * the vld4, where it belongs.
         */

        // sr = sr - (sr>>5) + d
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
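
        // Unlike the S32A version above, the +d here uses the widening
        // vaddl_u8 into 16-bit lanes, so there is no 8-bit overflow to
        // reason about before packing.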

        // pack high bits of each into 565 format  (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

#if    defined(DEBUG_S32_OPAQUE_DITHER)
        // always good to know if we generated good results
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i=0;i<UNROLL;i++) {
            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
            SkPMColor c = src[i-8];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
                SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
                         c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    NULL,   // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};
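
// The eight entries above presumably line up with SkBlitRow's flag bits:
// bit 0 selects global-alpha blending, bit 1 per-pixel (src) alpha, and
// bit 2 dithering, giving opaque/blend x no-alpha/alpha x no-dither/dither.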

const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
    Color32A_D565_neon,    // Color32_D565,
    Color32A_D565_neon,    // Color32A_D565,
    Color32A_D565_neon,    // Color32_D565_Dither,
    Color32A_D565_neon,    // Color32A_D565_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One of them reads the src
     * alpha value and attempts to optimize accordingly. That optimization
     * is sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent to or better than the
     * standard case, where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    NULL
#endif
};