/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

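/* The AArch64 helpers below issue an explicit ld4 with post-increment
 * writeback, then move the results out of the fixed v0-v3 registers the
 * instruction is written against; a plain vld4_u8() would be the portable
 * spelling, but the asm form also advances 'src' by 32 bytes in the same
 * instruction (presumably beating what compilers generated at the time).
 * The _3 variant deliberately leaves val[3] (alpha) unset for callers that
 * ignore it.
 */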
#ifdef SK_CPU_ARM64
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \n\t"
        "mov    %[vsrc0].8b, v0.8b             \n\t"
        "mov    %[vsrc1].8b, v1.8b             \n\t"
        "mov    %[vsrc2].8b, v2.8b             \n\t"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;

    return vsrc;
}

static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \n\t"
        "mov    %[vsrc0].8b, v0.8b             \n\t"
        "mov    %[vsrc1].8b, v1.8b             \n\t"
        "mov    %[vsrc2].8b, v2.8b             \n\t"
        "mov    %[vsrc3].8b, v3.8b             \n\t"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        : : "v0", "v1", "v2", "v3"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
#endif

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        vsrc = vld4_u8((uint8_t*)src);
        src += 8;
#endif

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
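        // RGB565 lane layout per uint16_t: rrrrrggggggbbbbb,
        // e.g. 0x1234 -> r = 0x1234 >> 11 = 2, g = (0x1234 >> 5) & 0x3F = 17,
        // b = 0x1234 & 0x1F = 20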

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
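        // vsli shifts the inserted lanes left and keeps the low bits of the
        // destination, so two inserts rebuild rrrrrggggggbbbbb in one register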

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

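        /* The asm below first runs one full 8-pixel computation that consumes
         * only count%8 pixels (8 when count is a multiple of 8); the loop then
         * re-reads the overlapped tail with identical inputs, since each
         * iteration's loads happen before the previous result is stored to
         * keep_dst, which trails dst by one group. The last group is stored
         * at label 1.
         */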
        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8-pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8-pixel versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
#endif

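/* NEON counterpart of SkDiv255Round(): for 16-bit x,
 * ((x + 128) + ((x + 128) >> 8)) >> 8 == round(x / 255); e.g.
 * x = 255*255 = 65025: 65153 + (65153 >> 8 = 254) = 65407, 65407 >> 8 = 255.
 */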
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_4(src);
#else
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
#endif // #ifdef SK_CPU_ARM64

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
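            // at this point vres = src*alpha + dst*(255 - sa*alpha/255);
            // both terms still carry the ~255 factor divided out below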

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
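/* Rows are indexed as gDitherMatrix_Neon[(y&3)*12 + (x&3)]: (y&3) selects the
 * row, (x&3) the phase, and the 12-byte rows keep an 8-byte vld1 from any
 * phase 0..3 inside the row.
 */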

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{

    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {

            uint8x8x4_t vsrc;
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
#ifdef SK_CPU_ARM64
            vsrc = sk_vld4_u8_arm64_3(src);
#else
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]! "
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            }
#endif
            vsrc_r = vsrc.val[NEON_R];
            vsrc_g = vsrc.val[NEON_G];
            vsrc_b = vsrc.val[NEON_B];

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract green
            vdst_r = vshrq_n_u16(vdst, 5+6);                 // shift down to get red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);
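    /* vtbl1_u8 with {3,3,3,3,7,7,7,7} broadcasts byte 3 (the alpha of the
     * first pixel) across lanes 0-3 and byte 7 (the alpha of the second)
     * across lanes 4-7, given the usual 8888 layout with alpha in the top
     * byte of each little-endian 32-bit pixel.
     */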

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
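        /* per SkPMSrcOver(): result = src + ((dst * (256 - sa)) >> 8), where
         * 256 - sa matches SkAlpha255To256(255 - sa) = (255 - sa) + 1
         */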

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

    /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
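    /* An SkPMColor carries alpha in the top byte, so plain unsigned compares
     * classify whole pixels: c >= ALPHA_OPAQ iff alpha == 255, and
     * c <= ALPHA_TRANS iff alpha == 0.
     */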

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
        it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
     we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;
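    // Each pixel becomes (src * src_scale + dst * dst_scale) >> 8,
    // with the two scales summing to 256.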

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
#endif // #ifdef SK_CPU_ARM32

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
    int noisy = 0;
#endif

    uint8x8_t dbase;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8x4_t vsrc;
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
        // calculate 8 elements worth into a temp buffer
        {
        int my_y = y;
        int my_x = x;
        SkPMColor* my_src = (SkPMColor*)src;
        uint16_t* my_dst = dst;
        int i;

        DITHER_565_SCAN(my_y);
        for(i = 0; i < UNROLL; i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                tdv[i] = DITHER_VALUE(my_x);
                ta[i] = a;
                tap[i] = SkAlpha255To256(a);
                td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                td[i] = d;
            } else {
                tmpbuf[i] = *my_dst;
                ta[i] = tdv[i] = td[i] = 0xbeef;
            }
            in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
        }
        }
#endif

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_4(src);
#else
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        vsrc.val[3] = d3;
        }
#endif
        sa = vsrc.val[NEON_A];
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow.  d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe as long as we do ((sr-sr>>5) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);

        // combine the addq and mul, save 3 insns
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
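        /* e.g. for blue: ((sb << 2) + dst_b * ((256 - sa) >> 3)) >> 5 is,
         * up to the 3 low bits truncated from the scale, the same as
         * (sb >> 3) + ((dst_b * (256 - sa)) >> 8): a src-over blend done
         * directly in the 5-bit channel.
         */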

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r, 5), 11);

        vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
        // verify my 8 elements match the temp buffer
        {
        int i, bad=0;
        static int invocation;

        for (i = 0; i < UNROLL; i++) {
            if (tmpbuf[i] != dst[i]) {
                bad=1;
            }
        }
        if (bad) {
            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                     invocation, offset);
            SkDebugf("  alpha 0x%x\n", alpha);
            for (i = 0; i < UNROLL; i++)
                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

            showme16("alpha8", &alpha8, sizeof(alpha8));
            showme16("scale8", &scale8, sizeof(scale8));
            showme8("d", &d, sizeof(d));
            showme16("dst8", &dst8, sizeof(dst8));
            showme16("dst_b", &dst_b, sizeof(dst_b));
            showme16("dst_g", &dst_g, sizeof(dst_g));
            showme16("dst_r", &dst_r, sizeof(dst_r));
            showme8("sb", &sb, sizeof(sb));
            showme8("sg", &sg, sizeof(sg));
            showme8("sr", &sr, sizeof(sr));

            return;
        }
        offset += UNROLL;
        invocation++;
        }
#endif
        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
   1335 
   1336 ///////////////////////////////////////////////////////////////////////////////
   1337 
   1338 #undef    DEBUG_S32_OPAQUE_DITHER
   1339 
   1340 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
   1341                                  const SkPMColor* SK_RESTRICT src,
   1342                                  int count, U8CPU alpha, int x, int y) {
   1343     SkASSERT(255 == alpha);
   1344 
   1345 #define    UNROLL    8
   1346     if (count >= UNROLL) {
   1347     uint8x8_t d;
   1348     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1349     d = vld1_u8(dstart);
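    // gDitherMatrix_Neon repeats each 4-entry row of the 4x4 dither matrix
    // three times (12 bytes per row), so the 8-byte load above yields correct
    // dither values for 8 consecutive pixels from any x & 3 starting offset.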

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;
        uint8x8x4_t vsrc;

#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
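        // vld4.8 de-interleaves 8 pixels into four 8-byte channel planes
        // (d0-d3), and the '!' post-increments src by the 32 bytes read;
        // binding d0-d3 explicitly lets the asm body name those registers.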
        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]! "
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        vsrc.val[0] = d0;
        vsrc.val[1] = d1;
        vsrc.val[2] = d2;
        }
#endif
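        // NEON_R/NEON_G/NEON_B (SkColor_opts_neon.h) map the de-interleaved
        // planes to color channels according to the platform's
        // SK_R32_SHIFT/SK_G32_SHIFT/SK_B32_SHIFT byte order.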
        sr = vsrc.val[NEON_R];
        sg = vsrc.val[NEON_G];
        sb = vsrc.val[NEON_B];

        /* XXX: if we want to prefetch, hide it in the above asm() using the
         * gcc __builtin_prefetch(); the prefetch will fall to the bottom of
         * the loop -- it won't stick up at the top of the loop, just after
         * the vld4.
         */

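        // For a channel keeping n bits in 565, c - (c >> n) rescales 0..255
        // just far enough that adding the dither value (0..7 for r/b, 0..3
        // for g) cannot overflow 8 bits; vaddl then widens the sum to 16
        // bits for the packing shifts below.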
        // sr = sr - (sr>>5) + d
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + (d>>1); similar overflow logic as above
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

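        // vsliq_n_u16(a, b, n) shifts b left by n and inserts it into a,
        // preserving a's low n bits: blue ends up in bits 0-4, green is
        // inserted at bits 5-10, red at bits 11-15.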
        // pack high bits of each into 565 format  (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
        // always good to know if we generated good results
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i = 0; i < UNROLL; i++) {
            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
            SkPMColor c = src[i-8];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
                SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                         c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
        return;
    }

    unsigned scale = 256 - SkAlpha255To256(colorA);
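    // Each pixel now becomes dst = color + src * scale / 256, i.e. a
    // premultiplied src-over of 'color' onto the row; the NEON loop below
    // computes exactly what SkAlphaMulQ does in the scalar tail, eight
    // pixels at a time.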

    if (count >= 8) {
        uint32x4_t vcolor;
        uint8x8_t vscale;

        vcolor = vdupq_n_u32(color);

        // scale is in the interval [0, 255], so it can be loaded as 8 bits
        vscale = vdup_n_u8(scale);

        do {
            // load src color, 8 pixels, 4 64 bit registers
            // (and increment src).
            uint32x2x4_t vsrc;
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vld1.32    %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+r" (src)
                : :
            );
#else // 64bit targets and Clang
            vsrc.val[0] = vld1_u32(src);
            vsrc.val[1] = vld1_u32(src+2);
            vsrc.val[2] = vld1_u32(src+4);
            vsrc.val[3] = vld1_u32(src+6);
            src += 8;
#endif

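            // Treating each pair of pixels as 8 bytes lets a single vmull_u8
            // scale all four channels at once; the >> 8 that follows is the
            // divide by 256 matching SkAlphaMulQ in the scalar tail.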
            // multiply long by scale, 64 bits at a time,
            // destination into a 128 bit register.
            uint16x8x4_t vtmp;
            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);

            // shift the 16-bit scaled values in the 128 bit registers
            // back down to 8 bits, narrowing the results into 64 bit
            // registers.
            uint8x16x2_t vres;
            vres.val[0] = vcombine_u8(
                            vshrn_n_u16(vtmp.val[0], 8),
                            vshrn_n_u16(vtmp.val[1], 8));
            vres.val[1] = vcombine_u8(
                            vshrn_n_u16(vtmp.val[2], 8),
                            vshrn_n_u16(vtmp.val[3], 8));

            // adding back the color, using 128 bit registers.
            uint32x4x2_t vdst;
            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
                                               vreinterpretq_u8_u32(vcolor));
            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
                                               vreinterpretq_u8_u32(vcolor));

            // store back the 8 calculated pixels (2 128 bit
            // registers), and increment dst.
#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
            asm (
                "vst1.32    %h[vdst], [%[dst]]!"
                : [dst] "+r" (dst)
                : [vdst] "w" (vdst)
                : "memory"
            );
#else // 64bit targets and Clang
            vst1q_u32(dst, vdst.val[0]);
            vst1q_u32(dst+4, vdst.val[1]);
            dst += 8;
#endif
            count -= 8;

        } while (count >= 8);
    }

    while (count > 0) {
        *dst = color + SkAlphaMulQ(*src, scale);
        src += 1;
        dst += 1;
        count--;
    }
}

///////////////////////////////////////////////////////////////////////////////

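// Both tables are indexed by the SkBlitRow flag bits (global alpha,
// per-pixel alpha, dither); a NULL entry tells the caller to fall back to
// the portable implementation for that combination.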
const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    S32_D565_Opaque_neon,
    S32_D565_Blend_neon,
#ifdef SK_CPU_ARM32
    S32A_D565_Opaque_neon,
#else
    NULL,
#endif
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices of S32A_Opaque proc. One reads the src alpha
     * value and attempts to optimize accordingly. That optimization is
     * sensitive to the source content and is not a win in all cases: for
     * example, if there are many transitions between alpha states, the
     * performance will almost certainly be worse. However, for many common
     * cases the performance is equivalent to or better than the standard
     * case, where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    NULL
#endif
};