/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif
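
// For reference, an intrinsic sketch of what the helpers above do. The inline
// asm folds the 32-byte pointer post-increment into the ld4 itself, which the
// vld4_u8() intrinsic cannot express:
//
//   uint8x8x4_t vsrc = vld4_u8((const uint8_t*)src);  // de-interleave 8 pixels
//   src += 8;                                         // advance separately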

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

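/* The vector path above packs via SkPixel32ToPixel16_neon8(); for clarity,
 * the same per-pixel conversion in scalar form, assuming the standard RGB565
 * layout (red in bits 15..11, green 10..5, blue 4..0):
 *
 *   uint16_t to565(unsigned r8, unsigned g8, unsigned b8) {
 *       return (uint16_t)(((r8 >> 3) << 11) | ((g8 >> 2) << 5) | (b8 >> 3));
 *   }
 */
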
void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src, int count,
                         U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

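    // Per 565 channel the loop below computes
    //     res = dst + (((src - dst) * scale) >> 8)
    // with scale = alpha + 1 in [1, 256]; this is the same math as the scalar
    // SkAlphaBlend() used for the leftovers at the bottom of this function.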
    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

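        // Shape of the asm below: the first pass covers count % 8 pixels (or
        // a full 8 when count is a multiple of 8); after that the loop runs
        // in groups of 8, each store trailing one iteration behind through
        // keep_dst, with the final group stored after the loop exits.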
        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    } else {
        // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

            #ifdef SK_PMCOLOR_IS_RGBA
                      "vqadd.u8   d6, d6, d0                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
            #else
                      "vqadd.u8   d6, d6, d2                  \n\t"
                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d0                  \n\t"
            #endif

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}

#else // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 16) {
        asm (
            "movi    v4.8h, #0x80                   \t\n"

            "1:                                     \t\n"
            "sub     %w[count], %w[count], #16      \t\n"
            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
            "prfm    pldl1keep, [%[src],#512]       \t\n"
            "prfm    pldl1keep, [%[dst],#256]       \t\n"
            "ushr    v20.8h, v17.8h, #5             \t\n"
            "ushr    v31.8h, v16.8h, #5             \t\n"
            "xtn     v6.8b, v31.8h                  \t\n"
            "xtn2    v6.16b, v20.8h                 \t\n"
            "ushr    v20.8h, v17.8h, #11            \t\n"
            "shl     v19.16b, v6.16b, #2            \t\n"
            "ushr    v31.8h, v16.8h, #11            \t\n"
            "xtn     v22.8b, v31.8h                 \t\n"
            "xtn2    v22.16b, v20.8h                \t\n"
            "shl     v18.16b, v22.16b, #3           \t\n"
            "mvn     v3.16b, v3.16b                 \t\n"
            "xtn     v16.8b, v16.8h                 \t\n"
            "mov     v7.16b, v4.16b                 \t\n"
            "xtn2    v16.16b, v17.8h                \t\n"
            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
            "shl     v16.16b, v16.16b, #3           \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "ushr    v24.8h, v7.8h, #6              \t\n"
            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
            "ushr    v20.8h, v22.8h, #5             \t\n"
            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
            "cmp     %w[count], #16                 \t\n"
            "mov     v6.16b, v4.16b                 \t\n"
            "mov     v5.16b, v4.16b                 \t\n"
            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
            "mov     v17.16b, v4.16b                \t\n"
            "ushr    v19.8h, v6.8h, #5              \t\n"
            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
            "ushr    v18.8h, v5.8h, #6              \t\n"
            "ushr    v21.8h, v17.8h, #5             \t\n"
            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
            "mov     v22.16b, v4.16b                \t\n"
            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
            "ushr    v5.8h, v22.8h, #5              \t\n"
            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
#else
#error "This function only supports BGRA and RGBA."
#endif
            "shll    v22.8h, v20.8b, #8             \t\n"
            "shll    v5.8h, v7.8b, #8               \t\n"
            "sri     v22.8h, v5.8h, #5              \t\n"
            "shll    v17.8h, v6.8b, #8              \t\n"
            "shll2   v23.8h, v20.16b, #8            \t\n"
            "shll2   v7.8h, v7.16b, #8              \t\n"
            "sri     v22.8h, v17.8h, #11            \t\n"
            "sri     v23.8h, v7.8h, #5              \t\n"
            "shll2   v6.8h, v6.16b, #8              \t\n"
            "st1     {v22.8h}, [%[dst]], #16        \t\n"
            "sri     v23.8h, v6.8h, #11             \t\n"
            "st1     {v23.8h}, [%[dst]], #16        \t\n"
            "b.ge    1b                             \t\n"
            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
               "v31"
        );
    }

    // Leftovers
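    // (SkSrcOver32To16() is the scalar src-over at 565 precision: per channel,
    // roughly dst' = src + dst * (255 - srcA) / 255.)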
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}
#endif // #ifdef SK_CPU_ARM32

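/* pmcolor_to_expand16() builds src in the "expanded" layout used by
 * SkExpand_rgb_16()/SkCompact_rgb_16() -- green in the high halfword, red and
 * blue in the low one (the g:11 r:10 x:1 b:10 layout noted later in this
 * file) -- already multiplied by 32, so that
 * (src_expand + dst_expand * scale) >> 5 blends correctly with a dst scale in
 * [0, 32]. E.g. red: r8 << 13 == (r8 >> 3) << 16, i.e. the 5-bit red sits at
 * the expanded red position (bit 11) plus the 5-bit pre-shift.
 */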
static uint32_t pmcolor_to_expand16(SkPMColor c) {
    unsigned r = SkGetPackedR32(c);
    unsigned g = SkGetPackedG32(c);
    unsigned b = SkGetPackedB32(c);
    return (g << 24) | (r << 13) | (b << 2);
}

void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    uint32_t src_expand;
    unsigned scale;
    uint16x8_t vmask_blue;

    if (count <= 0) return;
    SkASSERT(((size_t)dst & 0x01) == 0);

    /*
     * This preamble blends single pixels until dst is 8-byte aligned, so the
     * vector loads and stores that follow are aligned.
     */
    src_expand = pmcolor_to_expand16(src);
    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

#define DST_ALIGN 8

    /*
     * preamble_size is in bytes; the loop below consumes 2 bytes (one 565
     * pixel) per iteration.
     */
    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
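    // e.g. a dst address ending in ...2 gives preamble_size == 6, i.e. three
    // single pixels are blended before the vector loop takes over.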

    for (int i = 0; i < preamble_size; i+=2, dst++) {
        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
        if (--count == 0)
            break;
    }

    int count16 = count >> 4;
    vmask_blue = vmovq_n_u16(SK_B16_MASK);

    if (count16) {
        uint16x8_t wide_sr;
        uint16x8_t wide_sg;
        uint16x8_t wide_sb;
        uint16x8_t wide_256_sa;

        unsigned sr = SkGetPackedR32(src);
        unsigned sg = SkGetPackedG32(src);
        unsigned sb = SkGetPackedB32(src);
        unsigned sa = SkGetPackedA32(src);

        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
        // sr is 8-bit, dr is 5-bit; dr x ((256-sa)>>3) is shifted up by 5 bits,
        // so shift sr left by 2 to line up the MSBs: 8 + 2 == 5 + 5.
        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift

        // sg is 8-bit, dg is 6-bit; dg x ((256-sa)>>3) is shifted up by 5 bits,
        // so shift sg left by 3 to line up the MSBs: 8 + 3 == 6 + 5.
        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift

        // sb is 8-bit, db is 5-bit; db x ((256-sa)>>3) is shifted up by 5 bits,
        // so shift sb left by 2 to line up the MSBs: 8 + 2 == 5 + 5.
        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift

        wide_256_sa =
            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3

        while (count16-- > 0) {
            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
            vdst1 = vld1q_u16(dst);
            dst += 8;
            vdst2 = vld1q_u16(dst);
            dst -= 8;    // rewind; results are stored from here below

            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
            vdst1_b = vdst1 & vmask_blue;                              // extract blue
            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
            vdst2_b = vdst2 & vmask_blue;                              // extract blue
            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1

            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2

            vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
            vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
            vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue

            vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
            vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
            vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue

            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue

            vst1q_u16(dst, vdst1);
            dst += 8;
            vst1q_u16(dst, vdst2);
            dst += 8;
        }
    }

    count &= 0xF;
    if (count > 0) {
        do {
            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
            dst += 1;
        } while (--count != 0);
    }
}

static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}
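
// Scalar equivalent, the standard rounding division by 255:
//   x += 128; return (x + (x >> 8)) >> 8;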

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
            unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
            unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
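
// For a scanline starting at (x, y), the eight dither values are loaded from
// &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)]; each 12-byte row repeats the
// 4-entry pattern so that any of the 4 offsets yields 8 valid bytes.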

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {
            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
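            // This mirrors the scalar SkDITHER_*32To565() used for the
            // leftovers below; red, for instance, becomes
            //   r5 = (r8 + dither - (r8 >> 5)) >> 3.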

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;
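    // Per channel: res = (src * src_scale + dst * dst_scale) >> 8, with
    // src_scale + dst_scale == 256 so a fully opaque src stays opaque.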

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {
    SkASSERT(255 > alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
            // A 16-bit lane would overflow if we used 0xFFFF here,
            // so use an approximation with 0xFF00 that is off by 1,
            // and add back 1 after to get the correct value.
            // This is valid if alpha256 <= 255.
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
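            // Worked example: sa = 255, alpha256 = 128:
            //   0xFF00 - 255*128 = 0x7F80; += (0x7F80 >> 8) -> 0x7FFF;
            //   1 + (0x7FFF >> 8) = 128, matching SkAlphaMulInv256(255, 128).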

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while (count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

    uint8x8_t dbase;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8x4_t vsrc;
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_4(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        vsrc.val[3] = d3;
        }
#endif
        sa = vsrc.val[NEON_A];
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too
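        // i.e. d = (dbase * (sa + 1)) >> 8 = SkAlphaMul(dither, SkAlpha255To256(sa)),
        // the same value the scalar residual path computes below.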

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe  as long as we do ((sr-sr>>5) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);

        // combine the addq and mul, save 3 insns
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;
        uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        // sr = sr - (sr>>5) + d
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

        // pack high bits of each into 565 format  (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
    S32A_D565_Opaque_neon,
#if 0
    S32A_D565_Blend_neon,
#else
    nullptr,   // https://code.google.com/p/skia/issues/detail?id=2797
#endif

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    nullptr,   // S32A_D565_Blend_Dither
};

const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
    Color32A_D565_neon,    // Color32_D565,
    Color32A_D565_neon,    // Color32A_D565,
    Color32A_D565_neon,    // Color32_D565_Dither,
    Color32A_D565_neon,    // Color32A_D565_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    nullptr,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    nullptr,   // Ported to SkOpts
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    nullptr
#endif
};