      1 /*
      2  * Copyright 2012 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkBlitRow_opts_arm_neon.h"
      9 
     10 #include "SkBlitMask.h"
     11 #include "SkBlitRow.h"
     12 #include "SkColorPriv.h"
     13 #include "SkDither.h"
     14 #include "SkMathPriv.h"
     15 #include "SkUtils.h"
     16 
     17 #include "SkCachePreload_arm.h"
     18 
     19 #include <arm_neon.h>
     20 
     21 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
     22                            const SkPMColor* SK_RESTRICT src, int count,
     23                            U8CPU alpha, int /*x*/, int /*y*/) {
     24     SkASSERT(255 == alpha);
     25 
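          /* Overview (a rough scalar sketch of what the NEON block below does,
           * not the exact bit-level arithmetic): for each pixel,
           *     dst8  = expand each 565 dst channel to ~8 bits (<<3 or <<2)
           *     blend = dst8 * (255 - src_alpha) / 255, approximated with shifts
           *     out   = saturating_add(src_channel, blend)   // src is premultiplied
           *     *dst  = repack out as RGB565
           */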
     26     if (count >= 8) {
     27         uint16_t* SK_RESTRICT keep_dst = 0;
     28 
     29         asm volatile (
     30                       "ands       ip, %[count], #7            \n\t"
     31                       "vmov.u8    d31, #1<<7                  \n\t"
     32                       "vld1.16    {q12}, [%[dst]]             \n\t"
     33                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
     34                       // Thumb does not support the standard ARM conditional
     35                       // instructions but instead requires the 'it' instruction
     36                       // to signal conditional execution
     37                       "it eq                                  \n\t"
     38                       "moveq      ip, #8                      \n\t"
     39                       "mov        %[keep_dst], %[dst]         \n\t"
     40 
     41                       "add        %[src], %[src], ip, LSL#2   \n\t"
     42                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
     43                       "subs       %[count], %[count], ip      \n\t"
     44                       "b          9f                          \n\t"
     45                       // LOOP
     46                       "2:                                         \n\t"
     47 
     48                       "vld1.16    {q12}, [%[dst]]!            \n\t"
     49                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
     50                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
     51                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
     52                       "subs       %[count], %[count], #8      \n\t"
     53                       "9:                                         \n\t"
     54                       "pld        [%[dst],#32]                \n\t"
     55                       // expand 0565 q12 to 8888 {d4-d7}
     56                       "vmovn.u16  d4, q12                     \n\t"
     57                       "vshr.u16   q11, q12, #5                \n\t"
     58                       "vshr.u16   q10, q12, #6+5              \n\t"
     59                       "vmovn.u16  d5, q11                     \n\t"
     60                       "vmovn.u16  d6, q10                     \n\t"
     61                       "vshl.u8    d4, d4, #3                  \n\t"
     62                       "vshl.u8    d5, d5, #2                  \n\t"
     63                       "vshl.u8    d6, d6, #3                  \n\t"
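                      /* at this point d4 = blue<<3, d5 = green<<2, d6 = red<<3,
                       * i.e. each 565 channel promoted towards 8 bits (low bits zero) */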
     64 
     65                       "vmovl.u8   q14, d31                    \n\t"
     66                       "vmovl.u8   q13, d31                    \n\t"
     67                       "vmovl.u8   q12, d31                    \n\t"
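                      /* q12/q13/q14 now hold 128 in every 16-bit lane (d31 = 1<<7),
                       * serving as the rounding bias for the >>8 done further down */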
     68 
     69                       // duplicate in 4/2/1 & 8pix vsns
     70                       "vmvn.8     d30, d3                     \n\t"
     71                       "vmlal.u8   q14, d30, d6                \n\t"
     72                       "vmlal.u8   q13, d30, d5                \n\t"
     73                       "vmlal.u8   q12, d30, d4                \n\t"
     74                       "vshr.u16   q8, q14, #5                 \n\t"
     75                       "vshr.u16   q9, q13, #6                 \n\t"
     76                       "vaddhn.u16 d6, q14, q8                 \n\t"
     77                       "vshr.u16   q8, q12, #5                 \n\t"
     78                       "vaddhn.u16 d5, q13, q9                 \n\t"
     79                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
     80                       "vaddhn.u16 d4, q12, q8                 \n\t"
     81                       // intentionally don't calculate alpha
     82                       // result in d4-d6
     83 
     84                       "vqadd.u8   d5, d5, d1                  \n\t"
     85                       "vqadd.u8   d4, d4, d2                  \n\t"
     86 
     87                       // pack 8888 {d4-d6} to 0565 q10
     88                       "vshll.u8   q10, d6, #8                 \n\t"
     89                       "vshll.u8   q3, d5, #8                  \n\t"
     90                       "vshll.u8   q2, d4, #8                  \n\t"
     91                       "vsri.u16   q10, q3, #5                 \n\t"
     92                       "vsri.u16   q10, q2, #11                \n\t"
     93 
     94                       "bne        2b                          \n\t"
     95 
     96                       "1:                                         \n\t"
     97                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
     98                       : [count] "+r" (count)
     99                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
    100                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
    101                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
    102                       "d30","d31"
    103                       );
    104     }
    105     else
    106     {   // handle count < 8
    107         uint16_t* SK_RESTRICT keep_dst = 0;
    108 
    109         asm volatile (
    110                       "vmov.u8    d31, #1<<7                  \n\t"
    111                       "mov        %[keep_dst], %[dst]         \n\t"
    112 
    113                       "tst        %[count], #4                \n\t"
    114                       "beq        14f                         \n\t"
    115                       "vld1.16    {d25}, [%[dst]]!            \n\t"
    116                       "vld1.32    {q1}, [%[src]]!             \n\t"
    117 
    118                       "14:                                        \n\t"
    119                       "tst        %[count], #2                \n\t"
    120                       "beq        12f                         \n\t"
    121                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
    122                       "vld1.32    {d1}, [%[src]]!             \n\t"
    123 
    124                       "12:                                        \n\t"
    125                       "tst        %[count], #1                \n\t"
    126                       "beq        11f                         \n\t"
    127                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
    128                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
    129 
    130                       "11:                                        \n\t"
    131                       // unzips achieve the same as a vld4 operation
    132                       "vuzpq.u16  q0, q1                      \n\t"
    133                       "vuzp.u8    d0, d1                      \n\t"
    134                       "vuzp.u8    d2, d3                      \n\t"
    135                       // expand 0565 q12 to 8888 {d4-d7}
    136                       "vmovn.u16  d4, q12                     \n\t"
    137                       "vshr.u16   q11, q12, #5                \n\t"
    138                       "vshr.u16   q10, q12, #6+5              \n\t"
    139                       "vmovn.u16  d5, q11                     \n\t"
    140                       "vmovn.u16  d6, q10                     \n\t"
    141                       "vshl.u8    d4, d4, #3                  \n\t"
    142                       "vshl.u8    d5, d5, #2                  \n\t"
    143                       "vshl.u8    d6, d6, #3                  \n\t"
    144 
    145                       "vmovl.u8   q14, d31                    \n\t"
    146                       "vmovl.u8   q13, d31                    \n\t"
    147                       "vmovl.u8   q12, d31                    \n\t"
    148 
    149                       // duplicate in 4/2/1 & 8pix vsns
    150                       "vmvn.8     d30, d3                     \n\t"
    151                       "vmlal.u8   q14, d30, d6                \n\t"
    152                       "vmlal.u8   q13, d30, d5                \n\t"
    153                       "vmlal.u8   q12, d30, d4                \n\t"
    154                       "vshr.u16   q8, q14, #5                 \n\t"
    155                       "vshr.u16   q9, q13, #6                 \n\t"
    156                       "vaddhn.u16 d6, q14, q8                 \n\t"
    157                       "vshr.u16   q8, q12, #5                 \n\t"
    158                       "vaddhn.u16 d5, q13, q9                 \n\t"
    159                       "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
    160                       "vaddhn.u16 d4, q12, q8                 \n\t"
    161                       // intentionally don't calculate alpha
    162                       // result in d4-d6
    163 
    164                       "vqadd.u8   d5, d5, d1                  \n\t"
    165                       "vqadd.u8   d4, d4, d2                  \n\t"
    166 
    167                       // pack 8888 {d4-d6} to 0565 q10
    168                       "vshll.u8   q10, d6, #8                 \n\t"
    169                       "vshll.u8   q3, d5, #8                  \n\t"
    170                       "vshll.u8   q2, d4, #8                  \n\t"
    171                       "vsri.u16   q10, q3, #5                 \n\t"
    172                       "vsri.u16   q10, q2, #11                \n\t"
    173 
    174                       // store
    175                       "tst        %[count], #4                \n\t"
    176                       "beq        24f                         \n\t"
    177                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
    178 
    179                       "24:                                        \n\t"
    180                       "tst        %[count], #2                \n\t"
    181                       "beq        22f                         \n\t"
    182                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
    183 
    184                       "22:                                        \n\t"
    185                       "tst        %[count], #1                \n\t"
    186                       "beq        21f                         \n\t"
    187                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
    188 
    189                       "21:                                        \n\t"
    190                       : [count] "+r" (count)
    191                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
    192                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
    193                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
    194                       "d30","d31"
    195                       );
    196     }
    197 }
    198 
    199 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
    200                           const SkPMColor* SK_RESTRICT src, int count,
    201                           U8CPU alpha, int /*x*/, int /*y*/) {
    202 
    203     U8CPU alpha_for_asm = alpha;
    204 
    205     asm volatile (
    206     /* This code implements a Neon version of S32A_D565_Blend. The output differs from
    207      * the original in two respects:
    208      *  1. The results have a few mismatches compared to the original code. These mismatches
    209      *     never exceed 1. It's possible to improve accuracy vs. a floating point
    210      *     implementation by introducing rounding right shifts (vrshr) for the final stage.
    211      *     Rounding is not present in the code below, because although results would be closer
    212      *     to a floating point implementation, the number of mismatches compared to the
    213      *     original code would be far greater.
    214      *  2. On certain inputs, the original code can overflow, causing colour channels to
    215      *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
    216      *     to affect another.
    217      */
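     /* In scalar terms, the per-channel math below is roughly (names here are
      * only illustrative):
      *     src_scale = alpha + 1;                         // 0..256
      *     dst_scale = 255 - ((src_alpha * src_scale) >> 8);
      *     out = (src565 * src_scale + dst565 * dst_scale) >> 8;   // rounding shift
      * where src565/dst565 are the channels already reduced to 5/6/5-bit widths.
      */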
    218 
    219 #if 1
    220         /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
    221                   "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
    222 #else
    223                   "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
    224 #endif
    225                   "vmov.u16   q3, #255                        \n\t"   // set up constant
    226                   "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
    227                   "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
    228                   "beq        2f                              \n\t"   // if count8 == 0, exit
    229                   "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask
    230 
    231                   "1:                                             \n\t"
    232                   "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
    233                   "subs       r4, r4, #1                      \n\t"   // decrement loop counter
    234                   "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
    235                   //  and deinterleave
    236 
    237                   "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
    238                   "vand       q10, q0, q15                    \n\t"   // extract blue
    239                   "vshr.u16   q8, q0, #11                     \n\t"   // extract red
    240                   "vshr.u16   q9, q9, #10                     \n\t"   // extract green
    241                   // dstrgb = {q8, q9, q10}
    242 
    243                   "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
    244                   "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
    245                   "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range
    246 
    247                   "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
    248                   "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
    249                   "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
    250                   "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
    251                   // srcrgba = {q11, q12, q13, q14}
    252 
    253                   "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
    254                   "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
    255                   "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
    256                   "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale
    257 
    258                   "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
    259                   "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
    260                   // dst_scale = q2
    261 
    262                   "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
    263                   "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
    264                   "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale
    265 
    266 #if 1
    267     // trying for a better match with SkDiv255Round(a)
     268     // C alg is:  a += 128; a = (a + (a >> 8)) >> 8
    269     // we'll use just a rounding shift [q2 is available for scratch]
    270                   "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
    271                   "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
    272                   "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
    273 #else
    274     // arm's original "truncating divide by 256"
    275                   "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
    276                   "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
    277                   "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
    278 #endif
    279 
    280                   "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
    281                   "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
    282                   "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr
    283 
    284                   "bne        1b                              \n\t"   // if counter != 0, loop
    285                   "2:                                             \n\t"   // exit
    286 
    287                   : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
    288                   :
    289                   : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
    290                   );
    291 
    292     count &= 7;
    293     if (count > 0) {
    294         do {
    295             SkPMColor sc = *src++;
    296             if (sc) {
    297                 uint16_t dc = *dst;
    298                 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
    299                 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
    300                 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
    301                 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
    302                 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
    303             }
    304             dst += 1;
    305         } while (--count != 0);
    306     }
    307 }
    308 
    309 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
    310  * each dither value is spaced out into byte lanes, and repeated
    311  * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
    312  * start of each row.
    313  */
    314 static const uint8_t gDitherMatrix_Neon[48] = {
    315     0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    316     6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    317     1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    318     7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
    319 
    320 };
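/* Usage example (illustrative): for a span starting at (x, y), row (y & 3) is
 * selected and an 8-byte load at column (x & 3) yields the dither values for 8
 * consecutive pixels; e.g. y = 1, x = 2 loads {7, 3, 6, 2, 7, 3, 6, 2} from
 * &gDitherMatrix_Neon[1*12 + 2].
 */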
    321 
    322 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
    323                                 int count, U8CPU alpha, int x, int y)
    324 {
    325     /* select row and offset for dither array */
    326     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    327 
    328     /* rescale alpha to range 0 - 256 */
    329     int scale = SkAlpha255To256(alpha);
    330 
    331     asm volatile (
    332                   "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
    333                   "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
    334                   "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
    335                   "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
    336                   "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
    337                   "1:                                                 \n\t"
    338                   "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
    339                   "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
    340                   "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
    341                   "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
    342                   "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
    343                   "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
    344                   "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
    345                   "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
    346                   "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
    347                   "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
    348                   "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
    349                   "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
    350                   "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
    351                   // load 8 pixels from dst, extract rgb
    352                   "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
    353                   "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
    354                   "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
    355                   "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
    356                   "vand           d17, d17, d29                   \n\t"   // and green with green mask
    357                   "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
    358                   "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
    359                   // src = {d22 (r), d23 (g), d24 (b)}
    360                   // dst = {d16 (r), d17 (g), d18 (b)}
    361                   // subtract dst from src and widen
     362                   "vsubl.s8       q0, d22, d16                    \n\t"   // red: src - dst, widened
     363                   "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst, widened
     364                   "vsubl.s8       q2, d24, d18                    \n\t"   // blue: src - dst, widened
    365                   // multiply diffs by scale and shift
    366                   "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
     367                   "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
     368                   "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
    369                   "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
    370                   "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
    371                   "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
    372                   "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
    373                   // add dst to result
    374                   "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
    375                   "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
    376                   "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
    377                   // put result into 565 format
    378                   "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
    379                   "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
    380                   "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
    381                   "bgt            1b                              \n\t"   // loop if count > 0
    382                   : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
    383                   : [dstart] "r" (dstart), [scale] "r" (scale)
    384                   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
    385                   );
    386 
    387     DITHER_565_SCAN(y);
    388 
    389     while((count & 7) > 0)
    390     {
    391         SkPMColor c = *src++;
    392 
    393         int dither = DITHER_VALUE(x);
    394         int sr = SkGetPackedR32(c);
    395         int sg = SkGetPackedG32(c);
    396         int sb = SkGetPackedB32(c);
    397         sr = SkDITHER_R32To565(sr, dither);
    398         sg = SkDITHER_G32To565(sg, dither);
    399         sb = SkDITHER_B32To565(sb, dither);
    400 
    401         uint16_t d = *dst;
    402         *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
    403                              SkAlphaBlend(sg, SkGetPackedG16(d), scale),
    404                              SkAlphaBlend(sb, SkGetPackedB16(d), scale));
    405         DITHER_INC_X(x);
    406         count--;
    407     }
    408 }
    409 
    410 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
    411                                 const SkPMColor* SK_RESTRICT src,
    412                                 int count, U8CPU alpha) {
    413 
    414     SkASSERT(255 == alpha);
    415     if (count > 0) {
    416 
    417 
    418     uint8x8_t alpha_mask;
    419 
    420     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    421     alpha_mask = vld1_u8(alpha_mask_setup);
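    /* alpha_mask = {3,3,3,3,7,7,7,7}: with vtbl1 this broadcasts byte 3 of pixel 0
     * and byte 7 of pixel 1 (the alpha bytes, assuming alpha lives in the top byte
     * of each little-endian SkPMColor) across that pixel's four lanes. */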
    422 
    423     /* do the NEON unrolled code */
    424 #define    UNROLL    4
    425     while (count >= UNROLL) {
    426         uint8x8_t src_raw, dst_raw, dst_final;
    427         uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    428 
     429         /* The two prefetches below may make the code slightly
    430          * slower for small values of count but are worth having
    431          * in the general case.
    432          */
    433         __builtin_prefetch(src+32);
    434         __builtin_prefetch(dst+32);
    435 
    436         /* get the source */
    437         src_raw = vreinterpret_u8_u32(vld1_u32(src));
    438 #if    UNROLL > 2
    439         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
    440 #endif
    441 
    442         /* get and hold the dst too */
    443         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
    444 #if    UNROLL > 2
    445         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
    446 #endif
    447 
    448     /* 1st and 2nd bits of the unrolling */
    449     {
    450         uint8x8_t dst_cooked;
    451         uint16x8_t dst_wide;
    452         uint8x8_t alpha_narrow;
    453         uint16x8_t alpha_wide;
    454 
    455         /* get the alphas spread out properly */
    456         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
    457         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
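        /* per lane: 256 - src_alpha, i.e. the SkAlpha255To256() form of (255 - a) + 1 */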
    458 
    459         /* spread the dest */
    460         dst_wide = vmovl_u8(dst_raw);
    461 
    462         /* alpha mul the dest */
    463         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
    464         dst_cooked = vshrn_n_u16(dst_wide, 8);
    465 
    466         /* sum -- ignoring any byte lane overflows */
    467         dst_final = vadd_u8(src_raw, dst_cooked);
    468     }
    469 
    470 #if    UNROLL > 2
    471     /* the 3rd and 4th bits of our unrolling */
    472     {
    473         uint8x8_t dst_cooked;
    474         uint16x8_t dst_wide;
    475         uint8x8_t alpha_narrow;
    476         uint16x8_t alpha_wide;
    477 
    478         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
    479         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
    480 
    481         /* spread the dest */
    482         dst_wide = vmovl_u8(dst_raw_2);
    483 
    484         /* alpha mul the dest */
    485         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
    486         dst_cooked = vshrn_n_u16(dst_wide, 8);
    487 
    488         /* sum -- ignoring any byte lane overflows */
    489         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    490     }
    491 #endif
    492 
    493         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
    494 #if    UNROLL > 2
    495         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
    496 #endif
    497 
    498         src += UNROLL;
    499         dst += UNROLL;
    500         count -= UNROLL;
    501     }
    502 #undef    UNROLL
    503 
    504     /* do any residual iterations */
    505         while (--count >= 0) {
    506             *dst = SkPMSrcOver(*src, *dst);
    507             src += 1;
    508             dst += 1;
    509         }
    510     }
    511 }
    512 
    513 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
    514                                 const SkPMColor* SK_RESTRICT src,
    515                                 int count, U8CPU alpha) {
    516     SkASSERT(255 == alpha);
    517 
    518     if (count <= 0)
    519     return;
    520 
    521     /* Use these to check if src is transparent or opaque */
    522     const unsigned int ALPHA_OPAQ  = 0xFF000000;
    523     const unsigned int ALPHA_TRANS = 0x00FFFFFF;
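    /* With alpha in the top byte of a premultiplied pixel, the unsigned compares
     * below read as:  pixel >= ALPHA_OPAQ  <=>  alpha == 255 (opaque), and
     * pixel <= ALPHA_TRANS  <=>  alpha == 0 (fully transparent). */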
    524 
    525 #define UNROLL  4
    526     const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    527     const SkPMColor* SK_RESTRICT src_temp = src;
    528 
    529     /* set up the NEON variables */
    530     uint8x8_t alpha_mask;
    531     static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    532     alpha_mask = vld1_u8(alpha_mask_setup);
    533 
    534     uint8x8_t src_raw, dst_raw, dst_final;
    535     uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    536     uint8x8_t dst_cooked;
    537     uint16x8_t dst_wide;
    538     uint8x8_t alpha_narrow;
    539     uint16x8_t alpha_wide;
    540 
    541     /* choose the first processing type */
    542     if( src >= src_end)
    543         goto TAIL;
    544     if(*src <= ALPHA_TRANS)
    545         goto ALPHA_0;
    546     if(*src >= ALPHA_OPAQ)
    547         goto ALPHA_255;
    548     /* fall-thru */
    549 
    550 ALPHA_1_TO_254:
    551     do {
    552 
    553         /* get the source */
    554         src_raw = vreinterpret_u8_u32(vld1_u32(src));
    555         src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
    556 
    557         /* get and hold the dst too */
    558         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
    559         dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
    560 
    561 
    562         /* get the alphas spread out properly */
    563         alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
    564         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
    565         /* we collapsed (255-a)+1 ... */
    566         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
    567 
    568         /* spread the dest */
    569         dst_wide = vmovl_u8(dst_raw);
    570 
    571         /* alpha mul the dest */
    572         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
    573         dst_cooked = vshrn_n_u16(dst_wide, 8);
    574 
    575         /* sum -- ignoring any byte lane overflows */
    576         dst_final = vadd_u8(src_raw, dst_cooked);
    577 
    578         alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
    579         /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
    580         /* we collapsed (255-a)+1 ... */
    581         alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
    582 
    583         /* spread the dest */
    584         dst_wide = vmovl_u8(dst_raw_2);
    585 
    586         /* alpha mul the dest */
    587         dst_wide = vmulq_u16 (dst_wide, alpha_wide);
    588         dst_cooked = vshrn_n_u16(dst_wide, 8);
    589 
    590         /* sum -- ignoring any byte lane overflows */
    591         dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    592 
    593         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
    594         vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
    595 
    596         src += UNROLL;
    597         dst += UNROLL;
    598 
     599         /* If the next two pixels are both transparent or both opaque,
     600          * it may pay to jump to one of the specialized loops below. */
    601         if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
    602             break;
    603 
    604     } while(src < src_end);
    605 
    606     if (src >= src_end)
    607         goto TAIL;
    608 
    609     if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
    610         goto ALPHA_255;
    611 
    612     /*fall-thru*/
    613 
    614 ALPHA_0:
    615 
     616     /* In this state, we know the current alpha is 0 and
     617      * we optimize for the next alpha also being zero. */
    618     src_temp = src;  //so we don't have to increment dst every time
    619     do {
    620         if(*(++src) > ALPHA_TRANS)
    621             break;
    622         if(*(++src) > ALPHA_TRANS)
    623             break;
    624         if(*(++src) > ALPHA_TRANS)
    625             break;
    626         if(*(++src) > ALPHA_TRANS)
    627             break;
    628     } while(src < src_end);
    629 
    630     dst += (src - src_temp);
    631 
    632     /* no longer alpha 0, so determine where to go next. */
    633     if( src >= src_end)
    634         goto TAIL;
    635     if(*src >= ALPHA_OPAQ)
    636         goto ALPHA_255;
    637     else
    638         goto ALPHA_1_TO_254;
    639 
    640 ALPHA_255:
    641     while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
    642         dst[0]=src[0];
    643         dst[1]=src[1];
    644         dst[2]=src[2];
    645         dst[3]=src[3];
    646         src+=UNROLL;
    647         dst+=UNROLL;
    648         if(src >= src_end)
    649             goto TAIL;
    650     }
    651 
    652     //Handle remainder.
    653     if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
    654         if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
    655             if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
    656         }
    657     }
    658 
    659     if( src >= src_end)
    660         goto TAIL;
    661     if(*src <= ALPHA_TRANS)
    662         goto ALPHA_0;
    663     else
    664         goto ALPHA_1_TO_254;
    665 
    666 TAIL:
    667     /* do any residual iterations */
    668     src_end += UNROLL + 1;  //goto the real end
    669     while(src != src_end) {
    670         if( *src != 0 ) {
    671             if( *src >= ALPHA_OPAQ ) {
    672                 *dst = *src;
    673             }
    674             else {
    675                 *dst = SkPMSrcOver(*src, *dst);
    676             }
    677         }
    678         src++;
    679         dst++;
    680     }
    681 
    682 #undef    UNROLL
    683     return;
    684 }
    685 
    686 /* Neon version of S32_Blend_BlitRow32()
    687  * portable version is in src/core/SkBlitRow_D32.cpp
    688  */
    689 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
    690                               const SkPMColor* SK_RESTRICT src,
    691                               int count, U8CPU alpha) {
    692     SkASSERT(alpha <= 255);
    693     if (count > 0) {
    694         uint16_t src_scale = SkAlpha255To256(alpha);
    695         uint16_t dst_scale = 256 - src_scale;
    696 
    697     /* run them N at a time through the NEON unit */
     698     /* note that each pixel is 4 bytes, and every byte is treated exactly the
     699      * same, so we can work under that guise. We *do* know that src and dst
     700      * will be 32-bit aligned quantities, so we can specify that on the
     701      * load/store ops and do a NEON 'reinterpret' to get us to byte-sized
     702      * (pun intended) pieces that we widen/multiply/shift. We're limited to
     703      * 128 bits in the wide ops, which is 8x16 bits or a pair of 32-bit
     704      * src/dst pixels.
     705      */
    706     /* we *could* manually unroll this loop so that we load 128 bits
    707      * (as a pair of 64s) from each of src and dst, processing them
    708      * in pieces. This might give us a little better management of
    709      * the memory latency, but my initial attempts here did not
    710      * produce an instruction stream that looked all that nice.
    711      */
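    /* Scalar equivalent of one unrolled iteration (a sketch): each byte lane b of
     * the two pixels becomes
     *     out_b = (src_b * src_scale + dst_b * dst_scale) >> 8
     * which matches the SkAlphaMulQ-based tail handling below.
     */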
    712 #define    UNROLL    2
    713     while (count >= UNROLL) {
    714         uint8x8_t  src_raw, dst_raw, dst_final;
    715         uint16x8_t  src_wide, dst_wide;
    716 
    717         /* get 64 bits of src, widen it, multiply by src_scale */
    718         src_raw = vreinterpret_u8_u32(vld1_u32(src));
    719         src_wide = vmovl_u8(src_raw);
    720         /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
    721         src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
    722 
    723         /* ditto with dst */
    724         dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
    725         dst_wide = vmovl_u8(dst_raw);
    726 
    727         /* combine add with dst multiply into mul-accumulate */
    728         dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
    729 
    730         dst_final = vshrn_n_u16(dst_wide, 8);
    731         vst1_u32(dst, vreinterpret_u32_u8(dst_final));
    732 
    733         src += UNROLL;
    734         dst += UNROLL;
    735         count -= UNROLL;
    736     }
     737     /* RBE: well, I don't like how gcc manages src/dst across the above
     738      * loop: it's constantly calculating src+bias and dst+bias and only
     739      * adjusts the real ones when we leave the loop. Not sure why
     740      * it's "hoisting down" (hoisting implies above in my lexicon ;))
     741      * the adjustments to src/dst/count, but it does...
     742      * (might be SSA-style internal logic).
     743      */
    744 
    745 #if    UNROLL == 2
    746     if (count == 1) {
    747             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    748     }
    749 #else
    750     if (count > 0) {
    751             do {
    752                 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    753                 src += 1;
    754                 dst += 1;
    755             } while (--count > 0);
    756     }
    757 #endif
    758 
    759 #undef    UNROLL
    760     }
    761 }
    762 
    763 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
    764                          const SkPMColor* SK_RESTRICT src,
    765                          int count, U8CPU alpha) {
    766 
    767     SkASSERT(255 >= alpha);
    768 
    769     if (count <= 0) {
    770         return;
    771     }
    772 
    773     unsigned alpha256 = SkAlpha255To256(alpha);
    774 
    775     // First deal with odd counts
    776     if (count & 1) {
    777         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
    778         uint16x8_t vdst_wide, vsrc_wide;
    779         unsigned dst_scale;
    780 
    781         // Load
    782         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
    783         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
    784 
    785         // Calc dst_scale
    786         dst_scale = vget_lane_u8(vsrc, 3);
    787         dst_scale *= alpha256;
    788         dst_scale >>= 8;
    789         dst_scale = 256 - dst_scale;
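        // i.e. dst_scale = 256 - ((src_alpha * alpha256) >> 8): the weight left for
        // the existing dst pixel once the scaled source has been accounted for.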
    790 
    791         // Process src
    792         vsrc_wide = vmovl_u8(vsrc);
    793         vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
    794 
    795         // Process dst
    796         vdst_wide = vmovl_u8(vdst);
    797         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
    798 
    799         // Combine
    800         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
    801 
    802         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    803         dst++;
    804         src++;
    805         count--;
    806     }
    807 
    808     if (count) {
    809         uint8x8_t alpha_mask;
    810         static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    811         alpha_mask = vld1_u8(alpha_mask_setup);
    812 
    813         do {
    814 
    815             uint8x8_t vsrc, vdst, vres, vsrc_alphas;
    816             uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
    817 
    818             __builtin_prefetch(src+32);
    819             __builtin_prefetch(dst+32);
    820 
    821             // Load
    822             vsrc = vreinterpret_u8_u32(vld1_u32(src));
    823             vdst = vreinterpret_u8_u32(vld1_u32(dst));
    824 
    825             // Prepare src_scale
    826             vsrc_scale = vdupq_n_u16(alpha256);
    827 
    828             // Calc dst_scale
    829             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
    830             vdst_scale = vmovl_u8(vsrc_alphas);
    831             vdst_scale *= vsrc_scale;
    832             vdst_scale = vshrq_n_u16(vdst_scale, 8);
    833             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
    834 
    835             // Process src
    836             vsrc_wide = vmovl_u8(vsrc);
    837             vsrc_wide *= vsrc_scale;
    838 
    839             // Process dst
    840             vdst_wide = vmovl_u8(vdst);
    841             vdst_wide *= vdst_scale;
    842 
    843             // Combine
    844             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
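            // per byte lane: ((dst * dst_scale) >> 8) + ((src * src_scale) >> 8),
            // the same formula as the single-pixel case above.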
    845 
    846             vst1_u32(dst, vreinterpret_u32_u8(vres));
    847 
    848             src += 2;
    849             dst += 2;
    850             count -= 2;
    851         } while(count);
    852     }
    853 }
    854 
    855 ///////////////////////////////////////////////////////////////////////////////
    856 
    857 #undef    DEBUG_OPAQUE_DITHER
    858 
    859 #if    defined(DEBUG_OPAQUE_DITHER)
    860 static void showme8(char *str, void *p, int len)
    861 {
    862     static char buf[256];
    863     char tbuf[32];
    864     int i;
    865     char *pc = (char*) p;
    866     sprintf(buf,"%8s:", str);
    867     for(i=0;i<len;i++) {
    868         sprintf(tbuf, "   %02x", pc[i]);
    869         strcat(buf, tbuf);
    870     }
    871     SkDebugf("%s\n", buf);
    872 }
    873 static void showme16(char *str, void *p, int len)
    874 {
    875     static char buf[256];
    876     char tbuf[32];
    877     int i;
    878     uint16_t *pc = (uint16_t*) p;
    879     sprintf(buf,"%8s:", str);
    880     len = (len / sizeof(uint16_t));    /* passed as bytes */
    881     for(i=0;i<len;i++) {
    882         sprintf(tbuf, " %04x", pc[i]);
    883         strcat(buf, tbuf);
    884     }
    885     SkDebugf("%s\n", buf);
    886 }
    887 #endif
    888 
    889 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
    890                                    const SkPMColor* SK_RESTRICT src,
    891                                    int count, U8CPU alpha, int x, int y) {
    892     SkASSERT(255 == alpha);
    893 
    894 #define    UNROLL    8
    895 
    896     if (count >= UNROLL) {
    897     uint8x8_t dbase;
    898 
    899 #if    defined(DEBUG_OPAQUE_DITHER)
    900     uint16_t tmpbuf[UNROLL];
    901     int td[UNROLL];
    902     int tdv[UNROLL];
    903     int ta[UNROLL];
    904     int tap[UNROLL];
    905     uint16_t in_dst[UNROLL];
    906     int offset = 0;
    907     int noisy = 0;
    908 #endif
    909 
    910     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    911     dbase = vld1_u8(dstart);
    912 
    913         do {
    914         uint8x8_t sr, sg, sb, sa, d;
    915         uint16x8_t dst8, scale8, alpha8;
    916         uint16x8_t dst_r, dst_g, dst_b;
    917 
    918 #if    defined(DEBUG_OPAQUE_DITHER)
    919     /* calculate 8 elements worth into a temp buffer */
    920     {
    921       int my_y = y;
    922       int my_x = x;
    923       SkPMColor* my_src = (SkPMColor*)src;
    924       uint16_t* my_dst = dst;
    925       int i;
    926 
    927           DITHER_565_SCAN(my_y);
    928           for(i=0;i<UNROLL;i++) {
    929             SkPMColor c = *my_src++;
    930             SkPMColorAssert(c);
    931             if (c) {
    932                 unsigned a = SkGetPackedA32(c);
    933 
    934                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
    935         tdv[i] = DITHER_VALUE(my_x);
    936         ta[i] = a;
    937         tap[i] = SkAlpha255To256(a);
    938         td[i] = d;
    939 
    940                 unsigned sr = SkGetPackedR32(c);
    941                 unsigned sg = SkGetPackedG32(c);
    942                 unsigned sb = SkGetPackedB32(c);
    943                 sr = SkDITHER_R32_FOR_565(sr, d);
    944                 sg = SkDITHER_G32_FOR_565(sg, d);
    945                 sb = SkDITHER_B32_FOR_565(sb, d);
    946 
    947                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
    948                 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
    949                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
    950                 // now src and dst expanded are in g:11 r:10 x:1 b:10
    951                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
    952         td[i] = d;
    953 
    954             } else {
    955         tmpbuf[i] = *my_dst;
    956         ta[i] = tdv[i] = td[i] = 0xbeef;
    957         }
    958         in_dst[i] = *my_dst;
    959             my_dst += 1;
    960             DITHER_INC_X(my_x);
    961           }
    962     }
    963 #endif
    964 
    965         /* source is in ABGR */
    966         {
    967         register uint8x8_t d0 asm("d0");
    968         register uint8x8_t d1 asm("d1");
    969         register uint8x8_t d2 asm("d2");
    970         register uint8x8_t d3 asm("d3");
    971 
    972         asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
    973             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
    974             : "r" (src)
    975                     );
    976             sr = d0; sg = d1; sb = d2; sa = d3;
    977         }
    978 
    979         /* calculate 'd', which will be 0..7 */
    980         /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
    981 #if defined(SK_BUILD_FOR_ANDROID)
    982         /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
    983         alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
    984 #else
    985         alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
    986 #endif
    987         alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
    988         d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
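        /* per lane this is d[i] = (dbase[i] * SkAlpha255To256(a)) >> 8, the vector
         * form of SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)) used in the
         * residual loop; values stay in 0..7 */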
    989 
    990         /* sr = sr - (sr>>5) + d */
     991         /* watching for 8-bit overflow. d is 0..7; the risky range of
     992          * sr is >248, and there (sr>>5) is 7, which offsets 'd';
     993          * safe as long as we compute ((sr - (sr>>5)) + d) */
    994         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
    995         sr = vadd_u8(sr, d);
    996 
    997         /* sb = sb - (sb>>5) + d */
    998         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
    999         sb = vadd_u8(sb, d);
   1000 
   1001         /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
   1002         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
   1003         sg = vadd_u8(sg, vshr_n_u8(d,1));
   1004 
   1005         /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
   1006         dst8 = vld1q_u16(dst);
   1007         dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
   1008         dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
   1009         dst_r = vshrq_n_u16(dst8,11);    /* clearing hi bits */
   1010 
   1011         /* blend */
   1012 #if 1
   1013         /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
   1014         /* originally 255-sa + 1 */
   1015         scale8 = vsubw_u8(vdupq_n_u16(256), sa);
   1016 #else
   1017         scale8 = vsubw_u8(vdupq_n_u16(255), sa);
   1018         scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
   1019 #endif
   1020 
   1021 #if 1
   1022         /* combine the addq and mul, save 3 insns */
   1023         scale8 = vshrq_n_u16(scale8, 3);
   1024         dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
   1025         dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
   1026         dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
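        /* scale8 was pre-shifted right by 3 and the repack below shifts right by 5,
         * so dst*scale is effectively >>8; likewise sb<<2, sg<<3, sr<<2 come out as
         * sb>>3, sg>>2, sr>>3 after the repack, i.e. 8 bits down to 5/6/5 */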
   1027 #else
   1028         /* known correct, but +3 insns over above */
   1029         scale8 = vshrq_n_u16(scale8, 3);
   1030         dst_b = vmulq_u16(dst_b, scale8);
   1031         dst_g = vmulq_u16(dst_g, scale8);
   1032         dst_r = vmulq_u16(dst_r, scale8);
   1033 
   1034         /* combine */
   1035         /* NB: vshll widens, need to preserve those bits */
   1036         dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
   1037         dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
   1038         dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
   1039 #endif
   1040 
   1041         /* repack to store */
   1042         dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
   1043         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
   1044         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
   1045 
   1046         vst1q_u16(dst, dst8);
   1047 
   1048 #if    defined(DEBUG_OPAQUE_DITHER)
   1049         /* verify my 8 elements match the temp buffer */
   1050     {
   1051        int i, bad=0;
   1052        static int invocation;
   1053 
   1054        for (i=0;i<UNROLL;i++)
   1055         if (tmpbuf[i] != dst[i]) bad=1;
   1056        if (bad) {
   1057         SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
   1058             invocation, offset);
   1059         SkDebugf("  alpha 0x%x\n", alpha);
   1060         for (i=0;i<UNROLL;i++)
   1061             SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
   1062             i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
   1063             dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
   1064 
   1065         showme16("alpha8", &alpha8, sizeof(alpha8));
   1066         showme16("scale8", &scale8, sizeof(scale8));
   1067         showme8("d", &d, sizeof(d));
   1068         showme16("dst8", &dst8, sizeof(dst8));
   1069         showme16("dst_b", &dst_b, sizeof(dst_b));
   1070         showme16("dst_g", &dst_g, sizeof(dst_g));
   1071         showme16("dst_r", &dst_r, sizeof(dst_r));
   1072         showme8("sb", &sb, sizeof(sb));
   1073         showme8("sg", &sg, sizeof(sg));
   1074         showme8("sr", &sr, sizeof(sr));
   1075 
   1076         /* cop out */
   1077         return;
   1078        }
   1079        offset += UNROLL;
   1080        invocation++;
   1081     }
   1082 #endif
   1083 
   1084             dst += UNROLL;
   1085         src += UNROLL;
   1086         count -= UNROLL;
   1087         /* skip x += UNROLL, since it's unchanged mod-4 */
   1088         } while (count >= UNROLL);
   1089     }
   1090 #undef    UNROLL
   1091 
   1092     /* residuals */
   1093     if (count > 0) {
   1094         DITHER_565_SCAN(y);
   1095         do {
   1096             SkPMColor c = *src++;
   1097             SkPMColorAssert(c);
   1098             if (c) {
   1099                 unsigned a = SkGetPackedA32(c);
   1100 
   1101                 // dither and alpha are just temporary variables to work-around
   1102                 // an ICE in debug.
   1103                 unsigned dither = DITHER_VALUE(x);
   1104                 unsigned alpha = SkAlpha255To256(a);
   1105                 int d = SkAlphaMul(dither, alpha);
   1106 
   1107                 unsigned sr = SkGetPackedR32(c);
   1108                 unsigned sg = SkGetPackedG32(c);
   1109                 unsigned sb = SkGetPackedB32(c);
   1110                 sr = SkDITHER_R32_FOR_565(sr, d);
   1111                 sg = SkDITHER_G32_FOR_565(sg, d);
   1112                 sb = SkDITHER_B32_FOR_565(sb, d);
   1113 
   1114                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1115                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
   1116                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1117                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1118                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1119             }
   1120             dst += 1;
   1121             DITHER_INC_X(x);
   1122         } while (--count != 0);
   1123     }
   1124 }
   1125 
   1126 ///////////////////////////////////////////////////////////////////////////////
   1127 
    1128 /* 2009/10/27: RBE says "a work in progress"; debugging says ok;
    1129  * speedup untested, but the ARM version is 26 insns/iteration and
    1130  * this NEON version is 21 insns per iteration of 8 (2.62 insns/element),
    1131  * roughly 10x fewer instructions per element than the native version;
    1132  * that's pure instruction counts, not accounting for any instruction or
    1133  * memory latencies.
    1134  */
   1134 
   1135 #undef    DEBUG_S32_OPAQUE_DITHER
   1136 
   1137 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
   1138                                  const SkPMColor* SK_RESTRICT src,
   1139                                  int count, U8CPU alpha, int x, int y) {
   1140     SkASSERT(255 == alpha);
   1141 
   1142 #define    UNROLL    8
   1143     if (count >= UNROLL) {
   1144     uint8x8_t d;
   1145     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1146     d = vld1_u8(dstart);
   1147 
   1148     while (count >= UNROLL) {
   1149         uint8x8_t sr, sg, sb;
   1150         uint16x8_t dr, dg, db;
   1151         uint16x8_t dst8;
   1152 
   1153         /* source is in ABGR ordering (R == lsb) */
   1154         {
   1155         register uint8x8_t d0 asm("d0");
   1156         register uint8x8_t d1 asm("d1");
   1157         register uint8x8_t d2 asm("d2");
   1158         register uint8x8_t d3 asm("d3");
   1159 
   1160         asm ("vld4.8    {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
   1161             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
   1162             : "r" (src)
   1163                     );
   1164             sr = d0; sg = d1; sb = d2;
   1165         }
   1166         /* XXX: if we want to prefetch, hide it in the above asm()
   1167          * using the gcc __builtin_prefetch(), the prefetch will
   1168          * fall to the bottom of the loop -- it won't stick up
   1169          * at the top of the loop, just after the vld4.
   1170          */
   1171 
   1172         /* sr = sr - (sr>>5) + d */
   1173         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
   1174         dr = vaddl_u8(sr, d);
   1175 
   1176         /* sb = sb - (sb>>5) + d */
   1177         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
   1178         db = vaddl_u8(sb, d);
   1179 
   1180         /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
   1181         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
   1182         dg = vaddl_u8(sg, vshr_n_u8(d,1));
   1183         /* XXX: check that the "d>>1" here is hoisted */
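                 /* Roughly, and assuming the standard SkDither.h macros, the three
                  * steps above compute per 8-bit channel:
                  *
                  *     r' = r + d        - (r >> 5)    // SkDITHER_R32_FOR_565(r, d)
                  *     b' = b + d        - (b >> 5)    // SkDITHER_B32_FOR_565(b, d)
                  *     g' = g + (d >> 1) - (g >> 6)    // SkDITHER_G32_FOR_565(g, d)
                  *
                  * The subtraction keeps the 8-bit add from overflowing, and the
                  * widening vaddl_u8 yields the 16-bit lanes used for packing below.
                  */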
   1184 
   1185         /* pack high bits of each into 565 format  (rgb, b is lsb) */
   1186         dst8 = vshrq_n_u16(db, 3);
   1187         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
    1188         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
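                 /* vsli ("shift left and insert") shifts its second operand left and
                  * inserts it into the destination, preserving the bits below the
                  * shift amount, so each 16-bit lane of dst8 ends up as
                  *     ((r' >> 3) << 11) | ((g' >> 2) << 5) | (b' >> 3)
                  */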
   1189 
   1190         /* store it */
   1191         vst1q_u16(dst, dst8);
   1192 
   1193 #if    defined(DEBUG_S32_OPAQUE_DITHER)
   1194         /* always good to know if we generated good results */
   1195         {
   1196         int i, myx = x, myy = y;
   1197         DITHER_565_SCAN(myy);
   1198         for (i=0;i<UNROLL;i++) {
   1199             SkPMColor c = src[i];
   1200             unsigned dither = DITHER_VALUE(myx);
   1201             uint16_t val = SkDitherRGB32To565(c, dither);
   1202             if (val != dst[i]) {
    1203             SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
    1204                 c, dither, val, dst[i], dstart[i]);
   1205             }
   1206             DITHER_INC_X(myx);
   1207         }
   1208         }
   1209 #endif
   1210 
   1211         dst += UNROLL;
   1212         src += UNROLL;
   1213         count -= UNROLL;
    1214         x += UNROLL;        /* superfluous for the dither phase: UNROLL is a multiple of 4, so x & 3 is unchanged */
   1215     }
   1216     }
   1217 #undef    UNROLL
   1218 
   1219     /* residuals */
   1220     if (count > 0) {
   1221         DITHER_565_SCAN(y);
   1222         do {
   1223             SkPMColor c = *src++;
   1224             SkPMColorAssert(c);
   1225             SkASSERT(SkGetPackedA32(c) == 255);
   1226 
   1227             unsigned dither = DITHER_VALUE(x);
   1228             *dst++ = SkDitherRGB32To565(c, dither);
   1229             DITHER_INC_X(x);
   1230         } while (--count != 0);
   1231     }
   1232 }
   1233 
   1234 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
   1235                       SkPMColor color) {
   1236     if (count <= 0) {
   1237         return;
   1238     }
   1239 
   1240     if (0 == color) {
   1241         if (src != dst) {
   1242             memcpy(dst, src, count * sizeof(SkPMColor));
   1243         }
   1244         return;
   1245     }
   1246 
   1247     unsigned colorA = SkGetPackedA32(color);
   1248     if (255 == colorA) {
   1249         sk_memset32(dst, color, count);
   1250     } else {
   1251         unsigned scale = 256 - SkAlpha255To256(colorA);
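                 // scale is effectively 255 - colorA here (assuming the usual
                 // "alpha + 1" definition of SkAlpha255To256), so it fits in 8 bits
                 // and can be broadcast with vdup.8 in the assembly below.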
   1252 
   1253         if (count >= 8) {
   1254             // at the end of this assembly, count will have been decremented
   1255             // to a negative value. That is, if count mod 8 = x, it will be
   1256             // -8 +x coming out.
   1257             asm volatile (
   1258                 PLD128(src, 0)
   1259 
   1260                 "vdup.32    q0, %[color]                \n\t"
   1261 
   1262                 PLD128(src, 128)
   1263 
    1264                 // scale is in [0, 255], so load it as 8 bits
   1265                 "vdup.8     d2, %[scale]                \n\t"
   1266 
   1267                 PLD128(src, 256)
   1268 
   1269                 "subs       %[count], %[count], #8      \n\t"
   1270 
   1271                 PLD128(src, 384)
   1272 
   1273                 "Loop_Color32:                          \n\t"
   1274 
    1275                 // load src color, 8 pixels, into four 64-bit registers
    1276                 // (and increment src).
   1277                 "vld1.32    {d4-d7}, [%[src]]!          \n\t"
   1278 
   1279                 PLD128(src, 384)
   1280 
   1281                 // multiply long by scale, 64 bits at a time,
   1282                 // destination into a 128 bit register.
   1283                 "vmull.u8   q4, d4, d2                  \n\t"
   1284                 "vmull.u8   q5, d5, d2                  \n\t"
   1285                 "vmull.u8   q6, d6, d2                  \n\t"
   1286                 "vmull.u8   q7, d7, d2                  \n\t"
    1287                 // shift the 128-bit registers, containing the 16-bit
    1288                 // scaled values, back down to 8 bits, narrowing the
    1289                 // results into 64-bit registers.
   1290                 // results to 64 bit registers.
   1291                 "vshrn.i16  d8, q4, #8                  \n\t"
   1292                 "vshrn.i16  d9, q5, #8                  \n\t"
   1293                 "vshrn.i16  d10, q6, #8                 \n\t"
   1294                 "vshrn.i16  d11, q7, #8                 \n\t"
   1295 
   1296                 // adding back the color, using 128 bit registers.
   1297                 "vadd.i8    q6, q4, q0                  \n\t"
   1298                 "vadd.i8    q7, q5, q0                  \n\t"
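                         // Net result per byte lane so far:
                         //     dst = color + ((src * scale) >> 8),
                         // matching the scalar SkAlphaMulQ tail loop below.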
   1299 
   1300                 // store back the 8 calculated pixels (2 128 bit
   1301                 // registers), and increment dst.
   1302                 "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
   1303 
   1304                 "subs       %[count], %[count], #8      \n\t"
   1305                 "bge        Loop_Color32                \n\t"
   1306                 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
   1307                 : [color] "r" (color), [scale] "r" (scale)
   1308                 : "cc", "memory",
   1309                   "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
   1310                   "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
   1311                           );
   1312             // At this point, if we went through the inline assembly, count is
   1313             // a negative value:
   1314             // if the value is -8, there is no pixel left to process.
   1315             // if the value is -7, there is one pixel left to process
   1316             // ...
   1317             // And'ing it with 7 will give us the number of pixels
   1318             // left to process.
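                     // For example, a final count of -5 gives -5 & 7 == 3, so three
                     // pixels remain for the scalar loop below.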
   1319             count = count & 0x7;
   1320         }
   1321 
   1322         while (count > 0) {
   1323             *dst = color + SkAlphaMulQ(*src, scale);
   1324             src += 1;
   1325             dst += 1;
   1326             count--;
   1327         }
   1328     }
   1329 }
   1330 
   1331 ///////////////////////////////////////////////////////////////////////////////
   1332 
   1333 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
   1334     // no dither
    1335     // NOTE: For the first two entries below we don't have a special version
    1336     //       that assumes each source pixel is opaque, but our S32A procs are
    1337     //       still faster than the default, so use them.
   1338     S32A_D565_Opaque_neon,  // really S32_D565_Opaque
   1339     S32A_D565_Blend_neon,   // really S32_D565_Blend
   1340     S32A_D565_Opaque_neon,
   1341     S32A_D565_Blend_neon,
   1342 
   1343     // dither
   1344     S32_D565_Opaque_Dither_neon,
   1345     S32_D565_Blend_Dither_neon,
   1346     S32A_D565_Opaque_Dither_neon,
   1347     NULL,   // S32A_D565_Blend_Dither
   1348 };
   1349 
   1350 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
   1351     NULL,   // S32_Opaque,
   1352     S32_Blend_BlitRow32_neon,        // S32_Blend,
   1353     /*
    1354      * We have two choices for S32A_Opaque procs. One reads the src alpha
   1355      * value and attempts to optimize accordingly.  The optimization is
   1356      * sensitive to the source content and is not a win in all cases. For
   1357      * example, if there are a lot of transitions between the alpha states,
   1358      * the performance will almost certainly be worse.  However, for many
   1359      * common cases the performance is equivalent or better than the standard
   1360      * case where we do not inspect the src alpha.
   1361      */
   1362 #if SK_A32_SHIFT == 24
    1363     // This proc assumes the alpha value occupies bits 24-31 (the top byte) of each SkPMColor
   1364     S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
   1365 #else
   1366     S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
   1367 #endif
   1368     S32A_Blend_BlitRow32_neon        // S32A_Blend
   1369 };
   1370