/*
 **
 ** Copyright 2009, The Android Open Source Project
 **
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 **
 **     http://www.apache.org/licenses/LICENSE-2.0
 **
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 */

#ifdef ANDROID
    #include <machine/cpu-features.h>
#endif

#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"

#if defined(__ARM_HAVE_NEON)
#include <arm_neon.h>
#endif

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src, int count,
                                  U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count),
                        [keep_dst] "=r" (keep_dst)
                      :
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count),
                        [keep_dst] "=r" (keep_dst)
                      :
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
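
/* For reference, a scalar sketch of the per-pixel math the NEON above
 * implements (illustrative pseudocode, not a Skia routine; dst_r5/g6/b5 are
 * the unpacked 565 channels of the destination):
 *
 *     r8 = dst_r5 << 3;  g8 = dst_g6 << 2;  b8 = dst_b5 << 3;  // expand 565
 *     t  = 0x80 + (255 - src_a) * r8;
 *     r  = sat_u8(src_r + ((t + (t >> 5)) >> 8));   // cheap approx. of /255
 *     // green is analogous with (t + (t >> 6)); blue matches red
 *     result = ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3);  // repack
 *
 * The result's alpha is, as the asm comments say, intentionally not computed.
 */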

static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count,
                                 U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. It's possible to improve accuracy vs. a floating point
     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
     *     Rounding is not present in the code below, because although results would be closer
     *     to a floating point implementation, the number of mismatches compared to the
     *     original code would be far greater.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */

#if 1
                  /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */
                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count8 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
                  // trying for a better match with SkDiv255Round(a)
                  // C alg is:  a += 128; a = (a + (a >> 8)) >> 8
                  // we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#else
                  // arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}
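
/* Summarising the NEON loop above as scalar math (a sketch; the residual C
 * loop at the end of the function is the exact reference):
 *
 *     src_scale = alpha + 1;                            // 0..256
 *     dst_scale = 255 - ((src_a * src_scale) >> 8);
 *     ch_565    = (src_ch * src_scale + dst_ch * dst_scale + 128) >> 8;
 *
 * where src_ch is the source channel already reduced to 565 width, and the
 * "+128" stands in for the vrshr rounding shift selected by the #if 1 above.
 */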

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
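
/* Usage sketch: for a scanline starting at pixel (x, y), one unaligned
 * 8-byte load picks up a dither value per pixel, which is exactly how the
 * routines below use this table:
 *
 *     const uint8_t* dstart = &gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)];
 *     uint8x8_t dvals = vld1_u8(dstart);   // dither values for pixels 0..7
 */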

static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                       int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
                  "movs           r4, %[count], lsr #3            \n\t"   // calc. count>>3; guards against count < 8
                  "beq            2f                              \n\t"   // if count8 == 0, skip to residuals
                  "1:                                                 \n\t"
                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // subtract dst from src and widen
                  "vsubl.s8       q0, d22, d16                    \n\t"   // red: src - dst
                  "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst
                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue: src - dst
                  // multiply diffs by scale and shift
                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
                  "subs           r4, r4, #1                      \n\t"   // decrement loop counter
                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bne            1b                              \n\t"   // loop while eight-pixel groups remain
                  "2:                                                 \n\t"   // residuals handled in C below
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    while ((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}
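
/* Scalar sketch of the per-channel math in the NEON loop above (the residual
 * C loop uses the exact Skia macros; this is just the expanded form):
 *
 *     sr5 = (sr + d - (sr >> 5)) >> 3;           // dither 8-bit red to 5 bits
 *     sg6 = (sg + (d >> 1) - (sg >> 6)) >> 2;    // green gets half the dither
 *     out = dr5 + (((sr5 - dr5) * scale) >> 8);  // SkAlphaBlend(sr5, dr5, scale)
 */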

#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
#define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
#define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
#else
#define S32A_D565_Opaque_PROC       NULL
#define S32A_D565_Blend_PROC        NULL
#define S32_D565_Blend_Dither_PROC  NULL
#endif

/* Don't have a special version that assumes each src is opaque, but our S32A
   is still faster than the default, so use it here
 */
#define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
#define S32_D565_Blend_PROC     S32A_D565_Blend_PROC

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

	uint8x8_t alpha_mask;

	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
	alpha_mask = vld1_u8(alpha_mask_setup);
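	/* vtbl1_u8 with this mask copies byte 3 (pixel 0's alpha -- the high
	 * byte of each little-endian SkPMColor) into lanes 0-3 and byte 7
	 * (pixel 1's alpha) into lanes 4-7, i.e. it broadcasts each pixel's
	 * alpha across that pixel's four colour lanes. */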

	/* do the NEON unrolled code */
#define	UNROLL	4
	while (count >= UNROLL) {
	    uint8x8_t src_raw, dst_raw, dst_final;
	    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

	    /* get the source */
	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if	UNROLL > 2
	    src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

	    /* get and hold the dst too */
	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if	UNROLL > 2
	    dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

	/* 1st and 2nd bits of the unrolling */
	{
	    uint8x8_t dst_cooked;
	    uint16x8_t dst_wide;
	    uint8x8_t alpha_narrow;
	    uint16x8_t alpha_wide;

	    /* get the alphas spread out properly */
	    alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
	    /* we collapsed (255-a)+1 ... */
	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

	    /* spread the dest */
	    dst_wide = vmovl_u8(dst_raw);

	    /* alpha mul the dest */
	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
	    dst_cooked = vshrn_n_u16(dst_wide, 8);

	    /* sum -- ignoring any byte lane overflows */
	    dst_final = vadd_u8(src_raw, dst_cooked);
	}

#if	UNROLL > 2
	/* the 3rd and 4th bits of our unrolling */
	{
	    uint8x8_t dst_cooked;
	    uint16x8_t dst_wide;
	    uint8x8_t alpha_narrow;
	    uint16x8_t alpha_wide;

	    alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
	    /* we collapsed (255-a)+1 ... */
	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

	    /* spread the dest */
	    dst_wide = vmovl_u8(dst_raw_2);

	    /* alpha mul the dest */
	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
	    dst_cooked = vshrn_n_u16(dst_wide, 8);

	    /* sum -- ignoring any byte lane overflows */
	    dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
	}
#endif

	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if	UNROLL > 2
	    vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

	    src += UNROLL;
	    dst += UNROLL;
	    count -= UNROLL;
	}
#undef	UNROLL

	/* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}
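
/* Per byte lane, the unrolled body above computes (a scalar sketch of the
 * math, not Skia's SkPMSrcOver macro itself):
 *
 *     result = src + ((dst * (256 - src_alpha)) >> 8);
 *
 * i.e. src-over with SkAlpha255To256()'s (255 - a) + 1 collapsed into a
 * single 256 - a, as the #if 1 comments above note.
 */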

#define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_neon
#else
#define	S32A_Opaque_BlitRow32_PROC	NULL
#endif

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count > 0) {
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

	/* run them N at a time through the NEON unit */
	/* note that each pixel is 4 bytes, each byte treated exactly the
	 * same, so we can work under that guise. We *do* know that the
	 * src & dst will be 32-bit aligned quantities, so we can specify
	 * that on the load/store ops and do a neon 'reinterpret' to get us
	 * to byte-sized (pun intended) pieces that we widen/multiply/shift.
	 * We're limited to 128 bits in the wide ops, which is 8x16 bits
	 * or a pair of 32-bit src/dsts.
	 */
	/* we *could* manually unroll this loop so that we load 128 bits
	 * (as a pair of 64s) from each of src and dst, processing them
	 * in pieces. This might give us a little better management of
	 * the memory latency, but my initial attempts here did not
	 * produce an instruction stream that looked all that nice.
	 */
#define	UNROLL	2
	while (count >= UNROLL) {
	    uint8x8_t  src_raw, dst_raw, dst_final;
	    uint16x8_t  src_wide, dst_wide;

	    /* get 64 bits of src, widen it, multiply by src_scale */
	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
	    src_wide = vmovl_u8(src_raw);
	    /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
	    src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));

	    /* ditto with dst */
	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
	    dst_wide = vmovl_u8(dst_raw);

	    /* combine add with dst multiply into mul-accumulate */
	    dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

	    dst_final = vshrn_n_u16(dst_wide, 8);
	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));

	    src += UNROLL;
	    dst += UNROLL;
	    count -= UNROLL;
	}
	/* RBE: I don't like how gcc manages src/dst across the above loop;
	 * it's constantly calculating src+bias and dst+bias, and it only
	 * adjusts the real ones when we leave the loop. Not sure why
	 * it's "hoisting down" (hoisting implies above in my lexicon ;))
	 * the adjustments to src/dst/count, but it does...
	 * (might be SSA-style internal logic)
	 */

#if	UNROLL == 2
	if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
	}
#else
	if (count > 0) {
            do {
                *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
                src += 1;
                dst += 1;
            } while (--count > 0);
	}
#endif

#undef	UNROLL
    }
}
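
/* Per byte lane, the loop above computes (a scalar sketch):
 *
 *     result = (src * src_scale + dst * dst_scale) >> 8;
 *
 * with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale,
 * matching the SkAlphaMulQ() pair used for the leftover pixel(s).
 */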

#define	S32_Blend_BlitRow32_PROC	S32_Blend_BlitRow32_neon
#else
#define	S32_Blend_BlitRow32_PROC	NULL
#endif

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

#undef	DEBUG_OPAQUE_DITHER

#if	defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
	static char buf[256];
	char tbuf[32];
	int i;
	char *pc = (char*) p;
	sprintf(buf, "%8s:", str);
	for (i = 0; i < len; i++) {
	    sprintf(tbuf, "   %02x", pc[i]);
	    strcat(buf, tbuf);
	}
	SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
	static char buf[256];
	char tbuf[32];
	int i;
	uint16_t *pc = (uint16_t*) p;
	sprintf(buf, "%8s:", str);
	len = (len / sizeof(uint16_t));	/* passed as bytes */
	for (i = 0; i < len; i++) {
	    sprintf(tbuf, " %04x", pc[i]);
	    strcat(buf, tbuf);
	}
	SkDebugf("%s\n", buf);
}
#endif

static void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                      const SkPMColor* SK_RESTRICT src,
                                      int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define	UNROLL	8

    if (count >= UNROLL) {
	uint8x8_t dbase;

#if	defined(DEBUG_OPAQUE_DITHER)
	uint16_t tmpbuf[UNROLL];
	int td[UNROLL];
	int tdv[UNROLL];
	int ta[UNROLL];
	int tap[UNROLL];
	uint16_t in_dst[UNROLL];
	int offset = 0;
	int noisy = 0;
#endif

	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
	dbase = vld1_u8(dstart);

        do {
	    uint8x8_t sr, sg, sb, sa, d;
	    uint16x8_t dst8, scale8, alpha8;
	    uint16x8_t dst_r, dst_g, dst_b;

#if	defined(DEBUG_OPAQUE_DITHER)
	/* calculate 8 elements worth into a temp buffer */
	{
	  int my_y = y;
	  int my_x = x;
	  SkPMColor* my_src = (SkPMColor*)src;
	  uint16_t* my_dst = dst;
	  int i;

          DITHER_565_SCAN(my_y);
          for (i = 0; i < UNROLL; i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
		tdv[i] = DITHER_VALUE(my_x);
		ta[i] = a;
		tap[i] = SkAlpha255To256(a);
		td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
		td[i] = d;

            } else {
		tmpbuf[i] = *my_dst;
		ta[i] = tdv[i] = td[i] = 0xbeef;
	    }
	    in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
          }
	}
#endif

	    /* source is in ABGR */
	    {
		register uint8x8_t d0 asm("d0");
		register uint8x8_t d1 asm("d1");
		register uint8x8_t d2 asm("d2");
		register uint8x8_t d3 asm("d3");

		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
		    : "r" (src)
                    );
		    sr = d0; sg = d1; sb = d2; sa = d3;
	    }

	    /* calculate 'd', which will be 0..7 */
	    /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if 1
	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
	    alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
	    alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
	    alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
	    d = vshrn_n_u16(alpha8, 8);	/* narrowing too */
	    /* sr = sr - (sr>>5) + d */
	    /* watching for 8-bit overflow. d is 0..7; the risky range of
	     * sr is >248, but then (sr>>5) is 7, which offsets 'd';
	     * safe as long as we compute ((sr - (sr>>5)) + d) */
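	    /* worked example: sr = 255 gives 255 - (255 >> 5) = 248, and
	     * 248 + d <= 255 for d in 0..7, so the byte lane cannot wrap. */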
	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
	    sr = vadd_u8(sr, d);

	    /* sb = sb - (sb>>5) + d */
	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
	    sb = vadd_u8(sb, d);

	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
	    sg = vadd_u8(sg, vshr_n_u8(d,1));

	    /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
	    dst8 = vld1q_u16(dst);
	    dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
	    dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
	    dst_r = vshrq_n_u16(dst8,11);	/* clearing hi bits */

	    /* blend */
#if 1
	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
	    /* originally 255-sa + 1 */
	    scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
	    scale8 = vsubw_u8(vdupq_n_u16(255), sa);
	    scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif

#if 1
	    /* combine the addq and mul, save 3 insns */
	    scale8 = vshrq_n_u16(scale8, 3);
	    dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
	    dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
	    dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
#else
	    /* known correct, but +3 insns over above */
	    scale8 = vshrq_n_u16(scale8, 3);
	    dst_b = vmulq_u16(dst_b, scale8);
	    dst_g = vmulq_u16(dst_g, scale8);
	    dst_r = vmulq_u16(dst_r, scale8);

	    /* combine */
	    /* NB: vshll widens, need to preserve those bits */
	    dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
	    dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
	    dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
#endif

	    /* repack to store */
	    dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

	    vst1q_u16(dst, dst8);

#if	defined(DEBUG_OPAQUE_DITHER)
	    /* verify my 8 elements match the temp buffer */
	{
	   int i, bad = 0;
	   static int invocation;

	   for (i = 0; i < UNROLL; i++)
		if (tmpbuf[i] != dst[i]) bad = 1;
	   if (bad) {
		SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
			invocation, offset);
		SkDebugf("  alpha 0x%x\n", alpha);
		for (i = 0; i < UNROLL; i++)
		    SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
			i, ((tmpbuf[i] != dst[i]) ? "BAD" : "got"),
			dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

		showme16("alpha8", &alpha8, sizeof(alpha8));
		showme16("scale8", &scale8, sizeof(scale8));
		showme8("d", &d, sizeof(d));
		showme16("dst8", &dst8, sizeof(dst8));
		showme16("dst_b", &dst_b, sizeof(dst_b));
		showme16("dst_g", &dst_g, sizeof(dst_g));
		showme16("dst_r", &dst_r, sizeof(dst_r));
		showme8("sb", &sb, sizeof(sb));
		showme8("sg", &sg, sizeof(sg));
		showme8("sr", &sr, sizeof(sr));

		/* cop out */
		return;
	   }
	   offset += UNROLL;
	   invocation++;
	}
#endif

            dst += UNROLL;
	    src += UNROLL;
	    count -= UNROLL;
	    /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef	UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define	S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
#else
#define	S32A_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////

#if	defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
/* 2009/10/27: RBE says "a work in progress"; debugging says ok;
 * speedup untested. The ARM version is 26 insns per element, while this
 * NEON version is 21 insns per iteration-of-8 (2.62 insns/element),
 * roughly a tenth of the native version's per-element instruction count;
 * that's pure instruction counts, not accounting for any instruction or
 * memory latencies.
 */

#undef	DEBUG_S32_OPAQUE_DITHER

static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                     const SkPMColor* SK_RESTRICT src,
                                     int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define	UNROLL	8
    if (count >= UNROLL) {
	uint8x8_t d;
	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
	d = vld1_u8(dstart);

	while (count >= UNROLL) {
	    uint8x8_t sr, sg, sb, sa;
	    uint16x8_t dr, dg, db, da;
	    uint16x8_t dst8;

	    /* source is in ABGR ordering (R == lsb) */
	    {
		register uint8x8_t d0 asm("d0");
		register uint8x8_t d1 asm("d1");
		register uint8x8_t d2 asm("d2");
		register uint8x8_t d3 asm("d3");

		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
		    : "r" (src)
                    );
		    sr = d0; sg = d1; sb = d2; sa = d3;
	    }
	    /* XXX: if we want to prefetch, hide it in the above asm()
	     * using the gcc __builtin_prefetch(), the prefetch will
	     * fall to the bottom of the loop -- it won't stick up
	     * at the top of the loop, just after the vld4.
	     */

	    /* sr = sr - (sr>>5) + d */
	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
	    dr = vaddl_u8(sr, d);

	    /* sb = sb - (sb>>5) + d */
	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
	    db = vaddl_u8(sb, d);

	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
	    dg = vaddl_u8(sg, vshr_n_u8(d,1));
	    /* XXX: check that the "d>>1" here is hoisted */

	    /* pack high bits of each into 565 format  (rgb, b is lsb) */
	    dst8 = vshrq_n_u16(db, 3);
	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);

	    /* store it */
	    vst1q_u16(dst, dst8);

#if	defined(DEBUG_S32_OPAQUE_DITHER)
	    /* always good to know if we generated good results */
	    {
		int i, myx = x, myy = y;
		DITHER_565_SCAN(myy);
		for (i = 0; i < UNROLL; i++) {
		    SkPMColor c = src[i];
		    unsigned dither = DITHER_VALUE(myx);
		    uint16_t val = SkDitherRGB32To565(c, dither);
		    if (val != dst[i]) {
			SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
			    c, dither, val, dst[i], dstart[i]);
		    }
		    DITHER_INC_X(myx);
		}
	    }
#endif

	    dst += UNROLL;
	    src += UNROLL;
	    count -= UNROLL;
	    x += UNROLL;		/* probably superfluous */
	}
    }
#undef	UNROLL

    /* residuals */
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

#define	S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
#else
#define	S32_D565_Opaque_Dither_PROC NULL
#endif

///////////////////////////////////////////////////////////////////////////////

static const SkBlitRow::Proc platform_565_procs[] = {
    // no dither
    S32_D565_Opaque_PROC,
    S32_D565_Blend_PROC,
    S32A_D565_Opaque_PROC,
    S32A_D565_Blend_PROC,

    // dither
    S32_D565_Opaque_Dither_PROC,
    S32_D565_Blend_Dither_PROC,
    S32A_D565_Opaque_Dither_PROC,
    NULL,   // S32A_D565_Blend_Dither
};

static const SkBlitRow::Proc platform_4444_procs[] = {
    // no dither
    NULL,   // S32_D4444_Opaque,
    NULL,   // S32_D4444_Blend,
    NULL,   // S32A_D4444_Opaque,
    NULL,   // S32A_D4444_Blend,

    // dither
    NULL,   // S32_D4444_Opaque_Dither,
    NULL,   // S32_D4444_Blend_Dither,
    NULL,   // S32A_D4444_Opaque_Dither,
    NULL,   // S32A_D4444_Blend_Dither
};

static const SkBlitRow::Proc32 platform_32_procs[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_PROC,		// S32_Blend,
    S32A_Opaque_BlitRow32_PROC,		// S32A_Opaque,
    NULL,   // S32A_Blend,
};
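
/* The index into the tables above is the SkBlitRow flags word. As an
 * assumption (based on the SkBlitRow.h of this period, not restated here):
 * kGlobalAlpha_Flag = 0x01, kSrcPixelAlpha_Flag = 0x02, kDither_Flag = 0x04,
 * which is why each 565 table runs opaque, blend, src-alpha, src-alpha+blend,
 * and then the same four again with dither.
 */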

SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
    return platform_4444_procs[flags];
}

SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
    return platform_565_procs[flags];
}

SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
    return platform_32_procs[flags];
}