/*
 * Copyright 2009 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */


#include "SkBlitRow.h"
#include "SkBlitMask.h"
#include "SkColorPriv.h"
#include "SkDither.h"

#if defined(__ARM_HAVE_NEON)
#include <arm_neon.h>
#endif

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src, int count,
                                  U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src),
                        [keep_dst] "=&r" (keep_dst)
                      :
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix vsns
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count), [dst] "+r" (dst), [src] "+r" (src),
                        [keep_dst] "=&r" (keep_dst)
                      :
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
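
/* Conceptually, each pixel handled by the loops above is the standard src-over
 * blend into 565. A rough scalar sketch (hedged: the NEON code approximates the
 * /255 and the 565<->8888 expansion with shift-and-add tricks, so low bits can
 * differ slightly from this):
 *
 *     unsigned a  = SkGetPackedA32(*src);
 *     unsigned dr = SkGetPackedR16(*dst) << 3;              // ~5 -> 8 bits
 *     unsigned dg = SkGetPackedG16(*dst) << 2;              // ~6 -> 8 bits
 *     unsigned db = SkGetPackedB16(*dst) << 3;              // ~5 -> 8 bits
 *     unsigned r  = SkGetPackedR32(*src) + dr * (255 - a) / 255;
 *     unsigned g  = SkGetPackedG32(*src) + dg * (255 - a) / 255;
 *     unsigned b  = SkGetPackedB32(*src) + db * (255 - a) / 255;
 *     *dst = SkPackRGB16(r >> 3, g >> 2, b >> 3);           // vqadd saturates first
 */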

static void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src, int count,
                                 U8CPU alpha, int /*x*/, int /*y*/) {

    U8CPU alpha_for_asm = alpha;

    asm volatile (
    /* This code implements a Neon version of S32A_D565_Blend. The output differs from
     * the original in two respects:
     *  1. The results have a few mismatches compared to the original code. These mismatches
     *     never exceed 1. It's possible to improve accuracy vs. a floating point
     *     implementation by introducing rounding right shifts (vrshr) for the final stage.
     *     Rounding is not present in the code below, because although results would be closer
     *     to a floating point implementation, the number of mismatches compared to the
     *     original code would be far greater.
     *  2. On certain inputs, the original code can overflow, causing colour channels to
     *     mix. Although the Neon code can also overflow, it doesn't allow one colour channel
     *     to affect another.
     */

#if 1
                  /* reflects SkAlpha255To256()'s change from a + (a >> 7) to a + 1 */
                  "add        %[alpha], %[alpha], #1         \n\t"   // adjust range of alpha 0-256
#else
                  "add        %[alpha], %[alpha], %[alpha], lsr #7    \n\t"   // adjust range of alpha 0-256
#endif
                  "vmov.u16   q3, #255                        \n\t"   // set up constant
                  "movs       r4, %[count], lsr #3            \n\t"   // calc. count>>3
                  "vmov.u16   d2[0], %[alpha]                 \n\t"   // move alpha to Neon
                  "beq        2f                              \n\t"   // if count8 == 0, exit
                  "vmov.u16   q15, #0x1f                      \n\t"   // set up blue mask

                  "1:                                             \n\t"
                  "vld1.u16   {d0, d1}, [%[dst]]              \n\t"   // load eight dst RGB565 pixels
                  "subs       r4, r4, #1                      \n\t"   // decrement loop counter
                  "vld4.u8    {d24, d25, d26, d27}, [%[src]]! \n\t"   // load eight src ABGR32 pixels
                  //  and deinterleave

                  "vshl.u16   q9, q0, #5                      \n\t"   // shift green to top of lanes
                  "vand       q10, q0, q15                    \n\t"   // extract blue
                  "vshr.u16   q8, q0, #11                     \n\t"   // extract red
                  "vshr.u16   q9, q9, #10                     \n\t"   // extract green
                  // dstrgb = {q8, q9, q10}

                  "vshr.u8    d24, d24, #3                    \n\t"   // shift red to 565 range
                  "vshr.u8    d25, d25, #2                    \n\t"   // shift green to 565 range
                  "vshr.u8    d26, d26, #3                    \n\t"   // shift blue to 565 range

                  "vmovl.u8   q11, d24                        \n\t"   // widen red to 16 bits
                  "vmovl.u8   q12, d25                        \n\t"   // widen green to 16 bits
                  "vmovl.u8   q14, d27                        \n\t"   // widen alpha to 16 bits
                  "vmovl.u8   q13, d26                        \n\t"   // widen blue to 16 bits
                  // srcrgba = {q11, q12, q13, q14}

                  "vmul.u16   q2, q14, d2[0]                  \n\t"   // sa * src_scale
                  "vmul.u16   q11, q11, d2[0]                 \n\t"   // red result = src_red * src_scale
                  "vmul.u16   q12, q12, d2[0]                 \n\t"   // grn result = src_grn * src_scale
                  "vmul.u16   q13, q13, d2[0]                 \n\t"   // blu result = src_blu * src_scale

                  "vshr.u16   q2, q2, #8                      \n\t"   // sa * src_scale >> 8
                  "vsub.u16   q2, q3, q2                      \n\t"   // 255 - (sa * src_scale >> 8)
                  // dst_scale = q2

                  "vmla.u16   q11, q8, q2                     \n\t"   // red result += dst_red * dst_scale
                  "vmla.u16   q12, q9, q2                     \n\t"   // grn result += dst_grn * dst_scale
                  "vmla.u16   q13, q10, q2                    \n\t"   // blu result += dst_blu * dst_scale

#if 1
	// trying for a better match with SkDiv255Round(a)
	// C algorithm is:  a += 128; a = (a + (a >> 8)) >> 8
	// we'll use just a rounding shift [q2 is available for scratch]
                  "vrshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vrshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vrshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#else
	// arm's original "truncating divide by 256"
                  "vshr.u16   q11, q11, #8                    \n\t"   // shift down red
                  "vshr.u16   q12, q12, #8                    \n\t"   // shift down green
                  "vshr.u16   q13, q13, #8                    \n\t"   // shift down blue
#endif

                  "vsli.u16   q13, q12, #5                    \n\t"   // insert green into blue
                  "vsli.u16   q13, q11, #11                   \n\t"   // insert red into green/blue
                  "vst1.16    {d26, d27}, [%[dst]]!           \n\t"   // write pixel back to dst, update ptr

                  "bne        1b                              \n\t"   // if counter != 0, loop
                  "2:                                             \n\t"   // exit

                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm)
                  :
                  : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
                  );

    count &= 7;
    if (count > 0) {
        do {
            SkPMColor sc = *src++;
            if (sc) {
                uint16_t dc = *dst;
                unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
                unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
                unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
                unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
                *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
            }
            dst += 1;
        } while (--count != 0);
    }
}
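
/* Per pixel, the vector loop above evaluates roughly the following (hedged
 * sketch; the scalar tail loop inside the function is the exact reference,
 * and sr/dr etc. denote the channels already shifted into 565 range):
 *
 *     unsigned src_scale = alpha + 1;                        // 0..256
 *     unsigned dst_scale = 255 - ((sa * src_scale) >> 8);
 *     r = (sr * src_scale + dr * dst_scale + 128) >> 8;      // vrshr rounding, #if 1 path
 *     ... likewise for g and b, then repacked into 565 with vsli.
 */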

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};

static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                       int count, U8CPU alpha, int x, int y)
{
    /* select row and offset for dither array */
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

    /* rescale alpha to range 0 - 256 */
    int scale = SkAlpha255To256(alpha);

    asm volatile (
                  "vld1.8         {d31}, [%[dstart]]              \n\t"   // load dither values
                  "vshr.u8        d30, d31, #1                    \n\t"   // calc. green dither values
                  "vdup.16        d6, %[scale]                    \n\t"   // duplicate scale into neon reg
                  "vmov.i8        d29, #0x3f                      \n\t"   // set up green mask
                  "vmov.i8        d28, #0x1f                      \n\t"   // set up blue mask
                  "1:                                                 \n\t"
                  "vld4.8         {d0, d1, d2, d3}, [%[src]]!     \n\t"   // load 8 pixels and split into argb
                  "vshr.u8        d22, d0, #5                     \n\t"   // calc. red >> 5
                  "vshr.u8        d23, d1, #6                     \n\t"   // calc. green >> 6
                  "vshr.u8        d24, d2, #5                     \n\t"   // calc. blue >> 5
                  "vaddl.u8       q8, d0, d31                     \n\t"   // add in dither to red and widen
                  "vaddl.u8       q9, d1, d30                     \n\t"   // add in dither to green and widen
                  "vaddl.u8       q10, d2, d31                    \n\t"   // add in dither to blue and widen
                  "vsubw.u8       q8, q8, d22                     \n\t"   // sub shifted red from result
                  "vsubw.u8       q9, q9, d23                     \n\t"   // sub shifted green from result
                  "vsubw.u8       q10, q10, d24                   \n\t"   // sub shifted blue from result
                  "vshrn.i16      d22, q8, #3                     \n\t"   // shift right and narrow to 5 bits
                  "vshrn.i16      d23, q9, #2                     \n\t"   // shift right and narrow to 6 bits
                  "vshrn.i16      d24, q10, #3                    \n\t"   // shift right and narrow to 5 bits
                  // load 8 pixels from dst, extract rgb
                  "vld1.16        {d0, d1}, [%[dst]]              \n\t"   // load 8 pixels
                  "vshrn.i16      d17, q0, #5                     \n\t"   // shift green down to bottom 6 bits
                  "vmovn.i16      d18, q0                         \n\t"   // narrow to get blue as bytes
                  "vshr.u16       q0, q0, #11                     \n\t"   // shift down to extract red
                  "vand           d17, d17, d29                   \n\t"   // and green with green mask
                  "vand           d18, d18, d28                   \n\t"   // and blue with blue mask
                  "vmovn.i16      d16, q0                         \n\t"   // narrow to get red as bytes
                  // src = {d22 (r), d23 (g), d24 (b)}
                  // dst = {d16 (r), d17 (g), d18 (b)}
                  // subtract dst from src and widen
                  "vsubl.s8       q0, d22, d16                    \n\t"   // red:   src - dst, widened
                  "vsubl.s8       q1, d23, d17                    \n\t"   // green: src - dst, widened
                  "vsubl.s8       q2, d24, d18                    \n\t"   // blue:  src - dst, widened
                  // multiply diffs by scale and shift
                  "vmul.i16       q0, q0, d6[0]                   \n\t"   // multiply red by scale
                  "vmul.i16       q1, q1, d6[0]                   \n\t"   // multiply green by scale
                  "vmul.i16       q2, q2, d6[0]                   \n\t"   // multiply blue by scale
                  "subs           %[count], %[count], #8          \n\t"   // decrement loop counter
                  "vshrn.i16      d0, q0, #8                      \n\t"   // shift down red by 8 and narrow
                  "vshrn.i16      d2, q1, #8                      \n\t"   // shift down green by 8 and narrow
                  "vshrn.i16      d4, q2, #8                      \n\t"   // shift down blue by 8 and narrow
                  // add dst to result
                  "vaddl.s8       q0, d0, d16                     \n\t"   // add dst to red
                  "vaddl.s8       q1, d2, d17                     \n\t"   // add dst to green
                  "vaddl.s8       q2, d4, d18                     \n\t"   // add dst to blue
                  // put result into 565 format
                  "vsli.i16       q2, q1, #5                      \n\t"   // shift up green and insert into blue
                  "vsli.i16       q2, q0, #11                     \n\t"   // shift up red and insert into blue
                  "vst1.16        {d4, d5}, [%[dst]]!             \n\t"   // store result
                  "bgt            1b                              \n\t"   // loop if count > 0
                  : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                  : [dstart] "r" (dstart), [scale] "r" (scale)
                  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d28", "d29", "d30", "d31"
                  );

    DITHER_565_SCAN(y);

    while((count & 7) > 0)
    {
        SkPMColor c = *src++;

        int dither = DITHER_VALUE(x);
        int sr = SkGetPackedR32(c);
        int sg = SkGetPackedG32(c);
        int sb = SkGetPackedB32(c);
        sr = SkDITHER_R32To565(sr, dither);
        sg = SkDITHER_G32To565(sg, dither);
        sb = SkDITHER_B32To565(sb, dither);

        uint16_t d = *dst;
        *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                             SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                             SkAlphaBlend(sb, SkGetPackedB16(d), scale));
        DITHER_INC_X(x);
        count--;
    }
}
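
/* The dither step in the vector loop above mirrors the SkDither.h macros used
 * by the scalar tail; roughly (hedged sketch, with r8/g8/b8 the 8-bit source
 * channels and d the dither value 0..7):
 *
 *     r5 = (r8 + d        - (r8 >> 5)) >> 3;    // cf. SkDITHER_R32To565
 *     g6 = (g8 + (d >> 1) - (g8 >> 6)) >> 2;    // cf. SkDITHER_G32To565
 *     b5 = (b8 + d        - (b8 >> 5)) >> 3;    // cf. SkDITHER_B32To565
 */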

#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_neon
#define S32A_D565_Blend_PROC        S32A_D565_Blend_neon
#define S32_D565_Blend_Dither_PROC  S32_D565_Blend_Dither_neon
#elif __ARM_ARCH__ >= 7 && !defined(SK_CPU_BENDIAN)
static void S32A_D565_Opaque_v7(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src, int count,
                                  U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    asm volatile (
                  "1:                                   \n\t"
                  "ldr     r3, [%[src]], #4             \n\t"
                  "cmp     r3, #0xff000000              \n\t"
                  "blo     2f                           \n\t"
                  "and     r4, r3, #0x0000f8            \n\t"
                  "and     r5, r3, #0x00fc00            \n\t"
                  "and     r6, r3, #0xf80000            \n\t"
                  "pld     [r1, #32]                    \n\t"
                  "lsl     r3, r4, #8                   \n\t"
                  "orr     r3, r3, r5, lsr #5           \n\t"
                  "orr     r3, r3, r6, lsr #19          \n\t"
                  "subs    %[count], %[count], #1       \n\t"
                  "strh    r3, [%[dst]], #2             \n\t"
                  "bne     1b                           \n\t"
                  "b       4f                           \n\t"
                  "2:                                   \n\t"
                  "lsrs    r7, r3, #24                  \n\t"
                  "beq     3f                           \n\t"
                  "ldrh    r4, [%[dst]]                 \n\t"
                  "rsb     r7, r7, #255                 \n\t"
                  "and     r6, r4, #0x001f              \n\t"
                  "ubfx    r5, r4, #5, #6               \n\t"
                  "pld     [r0, #16]                    \n\t"
                  "lsr     r4, r4, #11                  \n\t"
                  "smulbb  r6, r6, r7                   \n\t"
                  "smulbb  r5, r5, r7                   \n\t"
                  "smulbb  r4, r4, r7                   \n\t"
                  "ubfx    r7, r3, #16, #8              \n\t"
                  "ubfx    ip, r3, #8, #8               \n\t"
                  "and     r3, r3, #0xff                \n\t"
                  "add     r6, r6, #16                  \n\t"
                  "add     r5, r5, #32                  \n\t"
                  "add     r4, r4, #16                  \n\t"
                  "add     r6, r6, r6, lsr #5           \n\t"
                  "add     r5, r5, r5, lsr #6           \n\t"
                  "add     r4, r4, r4, lsr #5           \n\t"
                  "add     r6, r7, r6, lsr #5           \n\t"
                  "add     r5, ip, r5, lsr #6           \n\t"
                  "add     r4, r3, r4, lsr #5           \n\t"
                  "lsr     r6, r6, #3                   \n\t"
                  "and     r5, r5, #0xfc                \n\t"
                  "and     r4, r4, #0xf8                \n\t"
                  "orr     r6, r6, r5, lsl #3           \n\t"
                  "orr     r4, r6, r4, lsl #8           \n\t"
                  "strh    r4, [%[dst]], #2             \n\t"
                  "pld     [r1, #32]                    \n\t"
                  "subs    %[count], %[count], #1       \n\t"
                  "bne     1b                           \n\t"
                  "b       4f                           \n\t"
                  "3:                                   \n\t"
                  "subs    %[count], %[count], #1       \n\t"
                  "add     %[dst], %[dst], #2           \n\t"
                  "bne     1b                           \n\t"
                  "4:                                   \n\t"
                  : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
                  :
                  : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
                  );
}
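
/* The fully-opaque fast path at label 1 above is just an 8888 -> 565 repack.
 * As a C sketch (hedged, assuming the same channel layout the asm extracts):
 *
 *     uint32_t c = *src++;
 *     *dst++ = (uint16_t)(((c & 0x0000f8) << 8) |
 *                         ((c & 0x00fc00) >> 5) |
 *                         ((c & 0xf80000) >> 19));
 */
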
#define S32A_D565_Opaque_PROC       S32A_D565_Opaque_v7
#define S32A_D565_Blend_PROC        NULL
#define S32_D565_Blend_Dither_PROC  NULL
#else
#define S32A_D565_Opaque_PROC       NULL
#define S32A_D565_Blend_PROC        NULL
#define S32_D565_Blend_Dither_PROC  NULL
#endif

/* Don't have a special version that assumes each src is opaque, but our S32A
    is still faster than the default, so use it here
 */
#define S32_D565_Opaque_PROC    S32A_D565_Opaque_PROC
#define S32_D565_Blend_PROC     S32A_D565_Blend_PROC

///////////////////////////////////////////////////////////////////////////////

#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) && defined(TEST_SRC_ALPHA)

static void S32A_Opaque_BlitRow32_neon_test_alpha(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {
	SkASSERT(255 == alpha);
	if (count <= 0)
		return;

	/* Use these to check if src is transparent or opaque */
	const unsigned int ALPHA_OPAQ  = 0xFF000000;
	const unsigned int ALPHA_TRANS = 0x00FFFFFF;

#define UNROLL  4
	const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
	const SkPMColor* SK_RESTRICT src_temp = src;

	/* set up the NEON variables */
	uint8x8_t alpha_mask;
	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
	alpha_mask = vld1_u8(alpha_mask_setup);

	uint8x8_t src_raw, dst_raw, dst_final;
	uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
	uint8x8_t dst_cooked;
	uint16x8_t dst_wide;
	uint8x8_t alpha_narrow;
	uint16x8_t alpha_wide;

	/* choose the first processing type */
	if( src >= src_end)
		goto TAIL;
	if(*src <= ALPHA_TRANS)
		goto ALPHA_0;
	if(*src >= ALPHA_OPAQ)
		goto ALPHA_255;
	/* fall-thru */

ALPHA_1_TO_254:
	do {

		/* get the source */
		src_raw = vreinterpret_u8_u32(vld1_u32(src));
		src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

		/* get and hold the dst too */
		dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
		dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));


		/* get the alphas spread out properly */
		alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
		/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
		/* we collapsed (255-a)+1 ... */
		alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

		/* spread the dest */
		dst_wide = vmovl_u8(dst_raw);

		/* alpha mul the dest */
		dst_wide = vmulq_u16 (dst_wide, alpha_wide);
		dst_cooked = vshrn_n_u16(dst_wide, 8);

		/* sum -- ignoring any byte lane overflows */
		dst_final = vadd_u8(src_raw, dst_cooked);

		alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
		/* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
		/* we collapsed (255-a)+1 ... */
		alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

		/* spread the dest */
		dst_wide = vmovl_u8(dst_raw_2);

		/* alpha mul the dest */
		dst_wide = vmulq_u16 (dst_wide, alpha_wide);
		dst_cooked = vshrn_n_u16(dst_wide, 8);

		/* sum -- ignoring any byte lane overflows */
		dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

		vst1_u32(dst, vreinterpret_u32_u8(dst_final));
		vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

		src += UNROLL;
		dst += UNROLL;

		/* if 2 of the next pixels aren't between 1 and 254
		it might make sense to go to the optimized loops */
		if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
			break;

	} while(src < src_end);

	if (src >= src_end)
		goto TAIL;

	if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
		goto ALPHA_255;

	/*fall-thru*/

ALPHA_0:

	/*In this state, we know the current alpha is 0 and
	 we optimize for the next alpha also being zero. */
	src_temp = src;  //so we don't have to increment dst every time
	do {
		if(*(++src) > ALPHA_TRANS)
			break;
		if(*(++src) > ALPHA_TRANS)
			break;
		if(*(++src) > ALPHA_TRANS)
			break;
		if(*(++src) > ALPHA_TRANS)
			break;
	} while(src < src_end);

	dst += (src - src_temp);

	/* no longer alpha 0, so determine where to go next. */
	if( src >= src_end)
		goto TAIL;
	if(*src >= ALPHA_OPAQ)
		goto ALPHA_255;
	else
		goto ALPHA_1_TO_254;

ALPHA_255:
	while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
		dst[0]=src[0];
		dst[1]=src[1];
		dst[2]=src[2];
		dst[3]=src[3];
		src+=UNROLL;
		dst+=UNROLL;
		if(src >= src_end)
			goto TAIL;
	}

	//Handle remainder.
	if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
		if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
			if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
		}
	}

	if( src >= src_end)
		goto TAIL;
	if(*src <= ALPHA_TRANS)
		goto ALPHA_0;
	else
		goto ALPHA_1_TO_254;

TAIL:
	/* do any residual iterations */
	src_end += UNROLL + 1;  //go to the real end
	while(src != src_end) {
		if( *src != 0 ) {
			if( *src >= ALPHA_OPAQ ) {
				*dst = *src;
			}
			else {
				*dst = SkPMSrcOver(*src, *dst);
			}
		}
		src++;
		dst++;
	}
	return;
}
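
/* Per channel, the intrinsic blocks above (and in the non-TEST_SRC_ALPHA
 * version below) compute, roughly (hedged sketch):
 *
 *     unsigned scale = 256 - SkGetPackedA32(src);    // SkAlpha255To256(255 - a)
 *     dst_channel = src_channel + ((dst_channel * scale) >> 8);
 *
 * i.e. essentially the SkPMSrcOver() math, ignoring the byte-lane overflow
 * already noted in the inline comments.
 */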

#define S32A_Opaque_BlitRow32_PROC  S32A_Opaque_BlitRow32_neon_test_alpha

#elif defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)

static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {


	uint8x8_t alpha_mask;

	static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
	alpha_mask = vld1_u8(alpha_mask_setup);

	/* do the NEON unrolled code */
#define	UNROLL	4
	while (count >= UNROLL) {
	    uint8x8_t src_raw, dst_raw, dst_final;
	    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

	    /* get the source */
	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if	UNROLL > 2
	    src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

	    /* get and hold the dst too */
	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if	UNROLL > 2
	    dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

	/* 1st and 2nd bits of the unrolling */
	{
	    uint8x8_t dst_cooked;
	    uint16x8_t dst_wide;
	    uint8x8_t alpha_narrow;
	    uint16x8_t alpha_wide;

	    /* get the alphas spread out properly */
	    alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
#if 1
	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
	    /* we collapsed (255-a)+1 ... */
	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

	    /* spread the dest */
	    dst_wide = vmovl_u8(dst_raw);

	    /* alpha mul the dest */
	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
	    dst_cooked = vshrn_n_u16(dst_wide, 8);

	    /* sum -- ignoring any byte lane overflows */
	    dst_final = vadd_u8(src_raw, dst_cooked);
	}

#if	UNROLL > 2
	/* the 3rd and 4th bits of our unrolling */
	{
	    uint8x8_t dst_cooked;
	    uint16x8_t dst_wide;
	    uint8x8_t alpha_narrow;
	    uint16x8_t alpha_wide;

	    alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
#if 1
	    /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
	    /* we collapsed (255-a)+1 ... */
	    alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
#else
	    alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow);
	    alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7));
#endif

	    /* spread the dest */
	    dst_wide = vmovl_u8(dst_raw_2);

	    /* alpha mul the dest */
	    dst_wide = vmulq_u16 (dst_wide, alpha_wide);
	    dst_cooked = vshrn_n_u16(dst_wide, 8);

	    /* sum -- ignoring any byte lane overflows */
	    dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
	}
#endif

	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if	UNROLL > 2
	    vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

	    src += UNROLL;
	    dst += UNROLL;
	    count -= UNROLL;
	}
#undef	UNROLL

	/* do any residual iterations */
        while (--count >= 0) {
#ifdef TEST_SRC_ALPHA
            SkPMColor sc = *src;
            if (sc) {
                unsigned srcA = SkGetPackedA32(sc);
                SkPMColor result = sc;
                if (srcA != 255) {
                    result = SkPMSrcOver(sc, *dst);
                }
                *dst = result;
            }
#else
            *dst = SkPMSrcOver(*src, *dst);
#endif
            src += 1;
            dst += 1;
        }
    }
}

#define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_neon

#elif defined (__ARM_ARCH__) /* #if defined(__ARM_HAVE_NEON) && defined... */

#if defined(TEST_SRC_ALPHA)

static void __attribute__((naked)) S32A_Opaque_BlitRow32_arm_test_alpha
                                        (SkPMColor* SK_RESTRICT dst,
                                         const SkPMColor* SK_RESTRICT src,
                                         int count, U8CPU alpha) {

/* Optimizes for alpha == 0, alpha == 255, and 1 < alpha < 255 cases individually */
/* Predicts that the next pixel will have the same alpha type as the current pixel */

asm volatile (

    "\tSTMDB  r13!, {r4-r12, r14}        \n" /* saving r4-r12, lr on the stack */
                                             /* we should not save r0-r3 according to ABI */

    "\tCMP    r2, #0                     \n" /* if (count == 0) */
    "\tBEQ    9f                         \n" /* go to EXIT */

    "\tMOV    r12, #0xff                 \n" /* load the 0xff mask in r12 */
    "\tORR    r12, r12, r12, LSL #16     \n" /* convert it to 0xff00ff in r12 */

    "\tMOV    r14, #255                  \n" /* r14 = 255 */
                                             /* will be used later for left-side comparison */

    "\tADD    r2, %[src], r2, LSL #2     \n" /* r2 points to last array element which can be used */
    "\tSUB    r2, r2, #16                \n" /* as a base for 4-way processing algorithm */

    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer is bigger than */
    "\tBGT    8f                         \n" /* calculated marker for 4-way -> */
                                             /* use simple one-by-one processing */

    /* START OF DISPATCHING BLOCK */

    "\t0:                                \n"

    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */

    "\tLSR    r7, r3, #24                \n" /* if not all src alphas of 4-way block are equal -> */
    "\tCMP    r7, r4, LSR #24            \n"
    "\tCMPEQ  r7, r5, LSR #24            \n"
    "\tCMPEQ  r7, r6, LSR #24            \n"
    "\tBNE    1f                         \n" /* -> go to general 4-way processing routine */

    "\tCMP    r14, r7                    \n" /* if all src alphas are equal to 255 */
    "\tBEQ    3f                         \n" /* go to alpha == 255 optimized routine */

    "\tCMP    r7,  #0                    \n" /* if all src alphas are equal to 0 */
    "\tBEQ    6f                         \n" /* go to alpha == 0 optimized routine */

    /* END OF DISPATCHING BLOCK */

    /* START OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */

    "\t1:                                \n"
                                             /* we do not have enough registers to make */
                                             /* 4-way [dst] loading -> we are using 2 * 2-way */

    "\tLDM    %[dst], {r7, r8}           \n" /* 1st 2-way loading of dst values to r7-r8 */

    /* PROCESSING BLOCK 1 */
    /* r3 = src, r7 = dst */

    "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source and storing to r11 */
    "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
    "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 256 -> r11 = scale */
    "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
    "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
    "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
    "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
    "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
    "\tORR    r7,  r9,  r10              \n" /* br | ag */
    "\tADD    r7,  r3,  r7               \n" /* dst = src + calculated dest (r7) */

    /* PROCESSING BLOCK 2 */
    /* r4 = src, r8 = dst */

    "\tLSR    r11, r4,  #24              \n" /* see PROCESSING BLOCK 1 */
    "\tAND    r9,  r12, r8               \n"
    "\tRSB    r11, r11, #256             \n"
    "\tAND    r10, r12, r8, LSR #8       \n"
    "\tMUL    r9,  r9,  r11              \n"
    "\tAND    r9,  r12, r9, LSR #8       \n"
    "\tMUL    r10, r10, r11              \n"
    "\tAND    r10, r10, r12, LSL #8      \n"
    "\tORR    r8,  r9,  r10              \n"
    "\tADD    r8,  r4,  r8               \n"

    "\tSTM    %[dst]!, {r7, r8}          \n" /* 1st 2-way storing of processed dst values */

    "\tLDM    %[dst], {r9, r10}          \n" /* 2nd 2-way loading of dst values to r9-r10 */

    /* PROCESSING BLOCK 3 */
    /* r5 = src, r9 = dst */

    "\tLSR    r11, r5,  #24              \n" /* see PROCESSING BLOCK 1 */
    "\tAND    r7,  r12, r9               \n"
    "\tRSB    r11, r11, #256             \n"
    "\tAND    r8,  r12, r9, LSR #8       \n"
    "\tMUL    r7,  r7,  r11              \n"
    "\tAND    r7,  r12, r7, LSR #8       \n"
    "\tMUL    r8,  r8,  r11              \n"
    "\tAND    r8,  r8,  r12, LSL #8      \n"
    "\tORR    r9,  r7,  r8               \n"
    "\tADD    r9,  r5,  r9               \n"

    /* PROCESSING BLOCK 4 */
    /* r6 = src, r10 = dst */

    "\tLSR    r11, r6,  #24              \n" /* see PROCESSING BLOCK 1 */
    "\tAND    r7,  r12, r10              \n"
    "\tRSB    r11, r11, #256             \n"
    "\tAND    r8,  r12, r10, LSR #8      \n"
    "\tMUL    r7,  r7,  r11              \n"
    "\tAND    r7,  r12, r7, LSR #8       \n"
    "\tMUL    r8,  r8,  r11              \n"
    "\tAND    r8,  r8,  r12, LSL #8      \n"
    "\tORR    r10, r7,  r8               \n"
    "\tADD    r10, r6,  r10              \n"

    "\tSTM    %[dst]!, {r9, r10}         \n" /* 2nd 2-way storing of processed dst values */

    "\tCMP    %[src], r2                 \n" /* if our current [src] pointer <= calculated marker */
    "\tBLE    0b                         \n" /* we could run 4-way processing -> go to dispatcher */
    "\tBGT    8f                         \n" /* else -> use simple one-by-one processing */

    /* END OF BLOCK OPTIMIZED FOR 0 < ALPHA < 255 */

    /* START OF BLOCK OPTIMIZED FOR ALPHA == 255 */

    "\t2:                                \n" /* ENTRY 1: LOADING [src] to registers */

    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */

    "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
    "\tAND    r8, r5, r6                 \n"
    "\tAND    r9, r7, r8                 \n"
    "\tCMP    r14, r9, LSR #24           \n"
    "\tBNE    4f                         \n" /* -> go to alpha == 0 check */

    "\t3:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */

    "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */

    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
    "\tBLE    2b                         \n" /* we could run 4-way processing */
                                             /* because now we're in ALPHA == 255 state */
                                             /* run next cycle with priority alpha == 255 checks */

    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
                                             /* use simple one-by-one processing */

    "\t4:                                \n"

    "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
    "\tORR    r8, r5, r6                 \n"
    "\tORR    r9, r7, r8                 \n"
    "\tLSRS   r9, #24                    \n"
    "\tBNE    1b                         \n" /* -> go to general processing mode */
                                             /* (we already checked for alpha == 255) */

    "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */

    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
    "\tBLE    5f                         \n" /* we could run 4-way processing one more time */
                                             /* because now we're in ALPHA == 0 state */
                                             /* run next cycle with priority alpha == 0 checks */

    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
                                             /* use simple one-by-one processing */

    /* END OF BLOCK OPTIMIZED FOR ALPHA == 255 */

    /* START OF BLOCK OPTIMIZED FOR ALPHA == 0 */

    "\t5:                                \n" /* ENTRY 1: LOADING [src] to registers */

    "\tLDM    %[src]!, {r3, r4, r5, r6}  \n" /* 4-way loading of source values to r3-r6 */

    "\tORR    r7, r3, r4                 \n" /* if not all alphas == 0 -> */
    "\tORR    r8, r5, r6                 \n"
    "\tORR    r9, r7, r8                 \n"
    "\tLSRS   r9, #24                    \n"
    "\tBNE    7f                         \n" /* -> go to alpha == 255 check */

    "\t6:                                \n" /* ENTRY 2: [src] already loaded by DISPATCHER */

    "\tADD    %[dst], %[dst], #16        \n" /* all src alphas == 0 -> do not change dst values */

    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
    "\tBLE    5b                         \n" /* we could run 4-way processing one more time */
                                             /* because now we're in ALPHA == 0 state */
                                             /* run next cycle with priority alpha == 0 checks */

    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
                                             /* use simple one-by-one processing */
    "\t7:                                \n"

    "\tAND    r7, r3, r4                 \n" /* if not all alphas == 255 -> */
    "\tAND    r8, r5, r6                 \n"
    "\tAND    r9, r7, r8                 \n"
    "\tCMP    r14, r9, LSR #24           \n"
    "\tBNE    1b                         \n" /* -> go to general processing mode */
                                             /* (we already checked for alpha == 0) */

    "\tSTM    %[dst]!, {r3, r4, r5, r6}  \n" /* all alphas == 255 -> 4-way copy [src] to [dst] */

    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer <= marker */
    "\tBLE    2b                         \n" /* we could run 4-way processing one more time */
                                             /* because now we're in ALPHA == 255 state */
                                             /* run next cycle with priority alpha == 255 checks */

    "\tBGT    8f                         \n" /* if our current [src] array pointer > marker */
                                             /* use simple one-by-one processing */

    /* END OF BLOCK OPTIMIZED FOR ALPHA == 0 */

    /* START OF TAIL BLOCK */
    /* (used when array is too small to be processed with 4-way algorithm)*/

    "\t8:                                \n"

    "\tADD    r2, r2, #16                \n" /* now r2 points to the element just after array */
                                             /* we've done r2 = r2 - 16 at procedure start */

    "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
    "\tBEQ    9f                         \n" /* goto EXIT */

    /* TAIL PROCESSING BLOCK 1 */

    "\tLDR    r3, [%[src]], #4           \n" /* r3 = *src, src++ */
    "\tLDR    r7, [%[dst]]               \n" /* r7 = *dst */

    "\tLSR    r11, r3,  #24              \n" /* extracting alpha from source */
    "\tAND    r9,  r12, r7               \n" /* r9 = br masked by r12 (0xff00ff) */
    "\tRSB    r11, r11, #256             \n" /* subtracting the alpha from 256 -> r11 = scale */
    "\tAND    r10, r12, r7, LSR #8       \n" /* r10 = ag masked by r12 (0xff00ff) */
    "\tMUL    r9,  r9,  r11              \n" /* br = br * scale */
    "\tAND    r9,  r12, r9, LSR #8       \n" /* lsr br by 8 and mask it */
    "\tMUL    r10, r10, r11              \n" /* ag = ag * scale */
    "\tAND    r10, r10, r12, LSL #8      \n" /* mask ag with reverse mask */
    "\tORR    r7,  r9,  r10              \n" /* br | ag */
    "\tADD    r7,  r3,  r7               \n" /* dst = src + calculated dest (r7) */
   1034 
   1035     "\tSTR    r7, [%[dst]], #4           \n" /* *dst = r7; dst++ */
   1036 
   1037     "\tCMP    %[src], r2                 \n" /* if our current [src] array pointer > final marker */
   1038     "\tBEQ    9f                         \n" /* goto EXIT */
   1039 
   1040     /* TAIL PROCESSING BLOCK 2 */
   1041 
   1042     "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
   1043     "\tLDR    r7, [%[dst]]               \n"
   1044 
   1045     "\tLSR    r11, r3,  #24              \n"
   1046     "\tAND    r9,  r12, r7               \n"
   1047     "\tRSB    r11, r11, #256             \n"
   1048     "\tAND    r10, r12, r7, LSR #8       \n"
   1049     "\tMUL    r9,  r9,  r11              \n"
   1050     "\tAND    r9,  r12, r9, LSR #8       \n"
   1051     "\tMUL    r10, r10, r11              \n"
   1052     "\tAND    r10, r10, r12, LSL #8      \n"
   1053     "\tORR    r7,  r9,  r10              \n"
   1054     "\tADD    r7,  r3,  r7               \n"
   1055 
   1056     "\tSTR    r7, [%[dst]], #4           \n"
   1057 
   1058     "\tCMP    %[src], r2                 \n"
   1059     "\tBEQ    9f                         \n"
   1060 
   1061     /* TAIL PROCESSING BLOCK 3 */
   1062 
   1063     "\tLDR    r3, [%[src]], #4           \n" /* see TAIL PROCESSING BLOCK 1 */
   1064     "\tLDR    r7, [%[dst]]               \n"
   1065 
   1066     "\tLSR    r11, r3,  #24              \n"
   1067     "\tAND    r9,  r12, r7               \n"
   1068     "\tRSB    r11, r11, #256             \n"
   1069     "\tAND    r10, r12, r7, LSR #8       \n"
   1070     "\tMUL    r9,  r9,  r11              \n"
   1071     "\tAND    r9,  r12, r9, LSR #8       \n"
   1072     "\tMUL    r10, r10, r11              \n"
   1073     "\tAND    r10, r10, r12, LSL #8      \n"
   1074     "\tORR    r7,  r9,  r10              \n"
   1075     "\tADD    r7,  r3,  r7               \n"
   1076 
   1077     "\tSTR    r7, [%[dst]], #4           \n"
   1078 
   1079     /* END OF TAIL BLOCK */
   1080 
   1081     "\t9:                                \n" /* EXIT */
   1082 
   1083     "\tLDMIA  r13!, {r4-r12, r14}        \n" /* restoring r4-r12, lr from stack */
   1084     "\tBX     lr                         \n" /* return */
   1085 
   1086     : [dst] "+r" (dst), [src] "+r" (src)
   1087     :
   1088     : "cc", "r2", "r3", "memory"
   1089 
   1090     );
   1091 
   1092 }
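
        /* For reference: a C sketch (not part of the original routine; the helper
         * name is illustrative) of the per-pixel operation the assembly above
         * performs when the all-zero / all-255 alpha fast paths fall through to
         * general processing.  The destination is scaled by 256 - src_alpha with
         * the 0x00ff00ff masking trick, then the premultiplied source is added.
         */
        static inline SkPMColor S32A_Opaque_blend_pixel_sketch(SkPMColor src, SkPMColor dst) {
            uint32_t scale = 256 - (src >> 24);                         /* RSB   r11, r11, #256 */
            const uint32_t mask = 0x00FF00FF;                           /* kept in r12 above    */
            uint32_t rb = (((dst & mask) * scale) >> 8) & mask;         /* red/blue pair        */
            uint32_t ag = (((dst >> 8) & mask) * scale) & (mask << 8);  /* alpha/green pair     */
            return src + (rb | ag);                                     /* ADD   r7, r3, r7     */
        }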
   1093 
   1094 #define	S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_arm_test_alpha
   1095 #else /* !defined(TEST_SRC_ALPHA) */
   1096 
   1097 static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
   1098                                   const SkPMColor* SK_RESTRICT src,
   1099                                   int count, U8CPU alpha) {
   1100 
   1101     SkASSERT(255 == alpha);
   1102 
   1103     /* Does not support the TEST_SRC_ALPHA case */
   1104     asm volatile (
   1105                   "cmp    %[count], #0               \n\t" /* comparing count with 0 */
   1106                   "beq    3f                         \n\t" /* if zero exit */
   1107 
   1108                   "mov    ip, #0xff                  \n\t" /* load the 0xff mask in ip */
   1109                   "orr    ip, ip, ip, lsl #16        \n\t" /* convert it to 0xff00ff in ip */
   1110 
   1111                   "cmp    %[count], #2               \n\t" /* compare count with 2 */
   1112                   "blt    2f                         \n\t" /* if less than 2 -> single loop */
   1113 
   1114                   /* Double Loop */
   1115                   "1:                                \n\t" /* <double loop> */
   1116                   "ldm    %[src]!, {r5,r6}           \n\t" /* load the src(s) at r5-r6 */
   1117                   "ldm    %[dst], {r7,r8}            \n\t" /* loading dst(s) into r7-r8 */
   1118                   "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
   1119 
   1120                   /* ----------- */
   1121                   "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
   1122                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
   1123                   "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
   1124 
   1125                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
   1126                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
   1127                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
   1128 
   1129                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
   1130                   "lsr    r4, r6, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
   1131                   "orr    r7, r9, r10                \n\t" /* br | ag*/
   1132 
   1133                   "add    r7, r5, r7                 \n\t" /* dst = src + calc dest(r7) */
   1134                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
   1135 
   1136                   /* ----------- */
   1137                   "and    r9, ip, r8                 \n\t" /* r9 = br masked by ip */
   1138 
   1139                   "and    r10, ip, r8, lsr #8        \n\t" /* r10 = ag masked by ip */
   1140                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
   1141                   "sub    %[count], %[count], #2     \n\t"
   1142                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
   1143 
   1144                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
   1145                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag with reverse mask */
   1146                   "cmp    %[count], #1               \n\t" /* comparing count with 1 */
   1147                   "orr    r8, r9, r10                \n\t" /* br | ag */
   1148 
   1149                   "add    r8, r6, r8                 \n\t" /* dst = src + calc dest(r8) */
   1150 
   1151                   /* ----------------- */
   1152                   "stm    %[dst]!, {r7,r8}           \n\t" /* store r7 and r8 to *dst, increment dst by two (each times 4) */
   1153                   /* ----------------- */
   1154 
   1155                   "bgt    1b                         \n\t" /* if greater than 1 -> reloop */
   1156                   "blt    3f                         \n\t" /* if less than 1 -> exit */
   1157 
   1158                   /* Single Loop */
   1159                   "2:                                \n\t" /* <single loop> */
   1160                   "ldr    r5, [%[src]], #4           \n\t" /* r5 = *src, src++ */
   1161                   "ldr    r7, [%[dst]]               \n\t" /* loading dst into r7 */
   1162                   "lsr    r4, r5, #24                \n\t" /* extracting the alpha from source and storing it to r4 */
   1163 
   1164                   /* ----------- */
   1165                   "and    r9, ip, r7                 \n\t" /* r9 = br masked by ip */
   1166                   "rsb    r4, r4, #256               \n\t" /* subtracting the alpha from 256 -> r4=scale */
   1167 
   1168                   "and    r10, ip, r7, lsr #8        \n\t" /* r10 = ag masked by ip */
   1169                   "mul    r9, r9, r4                 \n\t" /* br = br * scale */
   1170                   "mul    r10, r10, r4               \n\t" /* ag = ag * scale */
   1171                   "and    r9, ip, r9, lsr #8         \n\t" /* lsr br by 8 and mask it */
   1172 
   1173                   "and    r10, r10, ip, lsl #8       \n\t" /* mask ag */
   1174                   "orr    r7, r9, r10                \n\t" /* br | ag */
   1175 
   1176                   "add    r7, r5, r7                 \n\t" /* *dst = src + calc dest(r7) */
   1177 
   1178                   /* ----------------- */
   1179                   "str    r7, [%[dst]], #4           \n\t" /* *dst = r7, increment dst by one (times 4) */
   1180                   /* ----------------- */
   1181 
   1182                   "3:                                \n\t" /* <exit> */
   1183                   : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
   1184                   :
   1185                   : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "ip", "memory"
   1186                   );
   1187 }
   1188 #define	S32A_Opaque_BlitRow32_PROC	S32A_Opaque_BlitRow32_arm
   1189 #endif /* !defined(TEST_SRC_ALPHA) */
   1190 #else /* ... #elif defined (__ARM_ARCH__) */
   1191 #define	S32A_Opaque_BlitRow32_PROC	NULL
   1192 #endif
   1193 
   1194 /*
   1195  * ARM asm version of S32A_Blend_BlitRow32
   1196  */
   1197 static void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
   1198                                  const SkPMColor* SK_RESTRICT src,
   1199                                  int count, U8CPU alpha) {
   1200     asm volatile (
   1201                   "cmp    %[count], #0               \n\t" /* comparing count with 0 */
   1202                   "beq    3f                         \n\t" /* if zero exit */
   1203 
   1204                   "mov    r12, #0xff                 \n\t" /* load the 0xff mask in r12 */
   1205                   "orr    r12, r12, r12, lsl #16     \n\t" /* convert it to 0xff00ff in r12 */
   1206 
   1207                   /* src1,2_scale */
   1208                   "add    %[alpha], %[alpha], #1     \n\t" /* loading %[alpha]=src_scale=alpha+1 */
   1209 
   1210                   "cmp    %[count], #2               \n\t" /* comparing count with 2 */
   1211                   "blt    2f                         \n\t" /* if less than 2 -> single loop */
   1212 
   1213                   /* Double Loop */
   1214                   "1:                                \n\t" /* <double loop> */
   1215                   "ldm    %[src]!, {r5, r6}          \n\t" /* loading two src pixels into r5 and r6 */
   1216                   "ldm    %[dst], {r7, r8}           \n\t" /* loading two dst pixels into r7 and r8 */
   1217 
   1218                   /* dst1_scale and dst2_scale*/
   1219                   "lsr    r9, r5, #24                \n\t" /* src >> 24 */
   1220                   "lsr    r10, r6, #24               \n\t" /* src >> 24 */
   1221                   "smulbb r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
   1222                   "smulbb r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
   1223                   "lsr    r9, r9, #8                 \n\t" /* r9 >> 8 */
   1224                   "lsr    r10, r10, #8               \n\t" /* r10 >> 8 */
   1225                   "rsb    r9, r9, #256               \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
   1226                   "rsb    r10, r10, #256             \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
   1227 
   1228                   /* ---------------------- */
   1229 
   1230                   /* src1, src1_scale */
   1231                   "and    r11, r12, r5, lsr #8       \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
   1232                   "and    r4, r12, r5                \n\t" /* rb = r4 = r5 masked by r12 */
   1233                   "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
   1234                   "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
   1235                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
   1236                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
   1237                   "orr    r5, r11, r4                \n\t" /* r5 = (src1, src_scale) */
   1238 
   1239                   /* dst1, dst1_scale */
   1240                   "and    r11, r12, r7, lsr #8       \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
   1241                   "and    r4, r12, r7                \n\t" /* rb = r4 = r7 masked by r12 */
   1242                   "mul    r11, r11, r9               \n\t" /* ag = r11 times dst_scale (r9) */
   1243                   "mul    r4, r4, r9                 \n\t" /* rb = r4 times dst_scale (r9) */
   1244                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
   1245                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
   1246                   "orr    r9, r11, r4                \n\t" /* r9 = (dst1, dst_scale) */
   1247 
   1248                   /* ---------------------- */
   1249                   "add    r9, r5, r9                 \n\t" /* *dst = src plus dst both scaled */
   1250                   /* ---------------------- */
   1251 
   1252                   /* ====================== */
   1253 
   1254                   /* src2, src2_scale */
   1255                   "and    r11, r12, r6, lsr #8       \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
   1256                   "and    r4, r12, r6                \n\t" /* rb = r4 = r6 masked by r12 */
   1257                   "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
   1258                   "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
   1259                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
   1260                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
   1261                   "orr    r6, r11, r4                \n\t" /* r6 = (src2, src_scale) */
   1262 
   1263                   /* dst2, dst2_scale */
   1264                   "and    r11, r12, r8, lsr #8       \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
   1265                   "and    r4, r12, r8                \n\t" /* rb = r4 = r8 masked by r12 */
   1266                   "mul    r11, r11, r10              \n\t" /* ag = r11 times dst_scale (r10) */
   1267                   "mul    r4, r4, r10                \n\t" /* rb = r4 times dst_scale (r10) */
   1268                   "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
   1269                   "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
   1270                   "orr    r10, r11, r4               \n\t" /* r10 = (dst2, dst_scale) */
   1271 
   1272                   "sub    %[count], %[count], #2     \n\t" /* decrease count by 2 */
   1273                   /* ---------------------- */
   1274                   "add    r10, r6, r10               \n\t" /* *dst = src plus dst both scaled */
   1275                   /* ---------------------- */
   1276                   "cmp    %[count], #1               \n\t" /* compare count with 1 */
   1277                   /* ----------------- */
   1278                   "stm    %[dst]!, {r9, r10}         \n\t" /* store r9 and r10 to *dst, increment dst by two (each times 4) */
   1279                   /* ----------------- */
   1280 
   1281                   "bgt    1b                         \n\t" /* if %[count] greater than 1 reloop */
   1282                   "blt    3f                         \n\t" /* if %[count] less than 1 exit */
   1283                                                            /* else get into the single loop */
   1284                   /* Single Loop */
   1285                   "2:                                \n\t" /* <single loop> */
   1286                   "ldr    r5, [%[src]], #4           \n\t" /* r5 = *src, src++ */
   1287                   "ldr    r7, [%[dst]]               \n\t" /* r7 = *dst */
   1288 
   1289                   "lsr    r6, r5, #24                \n\t" /* src >> 24 */
   1290                   "and    r8, r12, r5, lsr #8        \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
   1291                   "smulbb r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
   1292                   "and    r9, r12, r5                \n\t" /* rb = r9 = r5 masked by r12 */
   1293                   "lsr    r6, r6, #8                 \n\t" /* r6 >> 8 */
   1294                   "mul    r8, r8, %[alpha]           \n\t" /* ag = r8 times scale */
   1295                   "rsb    r6, r6, #256               \n\t" /* r6 = 255 - r6 + 1 */
   1296 
   1297                   /* src, src_scale */
   1298                   "mul    r9, r9, %[alpha]           \n\t" /* rb = r9 times scale */
   1299                   "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
   1300                   "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
   1301                   "orr    r10, r8, r9                \n\t" /* r10 = (src, src_scale) */
   1302 
   1303                   /* dst, dst_scale */
   1304                   "and    r8, r12, r7, lsr #8        \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
   1305                   "and    r9, r12, r7                \n\t" /* rb = r9 = r7 masked by r12 */
   1306                   "mul    r8, r8, r6                 \n\t" /* ag = r8 times scale (r6) */
   1307                   "mul    r9, r9, r6                 \n\t" /* rb = r9 times scale (r6) */
   1308                   "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
   1309                   "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
   1310                   "orr    r7, r8, r9                 \n\t" /* r7 = (dst, dst_scale) */
   1311 
   1312                   "add    r10, r7, r10               \n\t" /* *dst = src plus dst both scaled */
   1313 
   1314                   /* ----------------- */
   1315                   "str    r10, [%[dst]], #4          \n\t" /* *dst = r10, postincrement dst by one (times 4) */
   1316                   /* ----------------- */
   1317 
   1318                   "3:                                \n\t" /* <exit> */
   1319                   : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
   1320                   :
   1321                   : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
   1322                   );
   1323 
   1324 }
   1325 #define	S32A_Blend_BlitRow32_PROC	S32A_Blend_BlitRow32_arm
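
        /* For reference: a C sketch (not part of the original routine; the helper
         * name is illustrative) of the per-pixel math in S32A_Blend_BlitRow32_arm
         * above.  The source is scaled by the global alpha (as alpha + 1, i.e.
         * SkAlpha255To256), the destination by 256 minus the already-scaled source
         * alpha, both via the same 0x00ff00ff masking trick.
         */
        static inline SkPMColor S32A_Blend_blend_pixel_sketch(SkPMColor src, SkPMColor dst,
                                                              U8CPU alpha) {
            const uint32_t mask = 0x00FF00FF;                             /* r12 above              */
            uint32_t src_scale = alpha + 1;                               /* SkAlpha255To256(alpha) */
            uint32_t dst_scale = 256 - (((src >> 24) * src_scale) >> 8);  /* SMULBB + LSR + RSB     */
            uint32_t src_rb = (((src & mask) * src_scale) >> 8) & mask;
            uint32_t src_ag = (((src >> 8) & mask) * src_scale) & (mask << 8);
            uint32_t dst_rb = (((dst & mask) * dst_scale) >> 8) & mask;
            uint32_t dst_ag = (((dst >> 8) & mask) * dst_scale) & (mask << 8);
            return (src_rb | src_ag) + (dst_rb | dst_ag);
        }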
   1326 
   1327 /* Neon version of S32_Blend_BlitRow32()
   1328  * portable version is in src/core/SkBlitRow_D32.cpp
   1329  */
   1330 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
   1331 static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
   1332                                 const SkPMColor* SK_RESTRICT src,
   1333                                 int count, U8CPU alpha) {
   1334     SkASSERT(alpha <= 255);
   1335     if (count > 0) {
   1336         uint16_t src_scale = SkAlpha255To256(alpha);
   1337         uint16_t dst_scale = 256 - src_scale;
   1338 
   1339 	/* run them N at a time through the NEON unit */
   1340 	/* note that each pixel is 4 bytes, each treated exactly the same,
   1341 	 * so we can work under that guise. We *do* know that the src&dst
   1342 	 * will be 32-bit aligned quantities, so we can specify that on
   1343 	 * the load/store ops and do a neon 'reinterpret' to get us to
   1344 	 * byte-sized (pun intended) pieces that we widen/multiply/shift.
   1345 	 * We're limited to 128 bits in the wide ops, which is 8x16 bits
   1346 	 * or a pair of 32-bit src/dsts.
   1347 	 */
   1348 	/* we *could* manually unroll this loop so that we load 128 bits
   1349 	 * (as a pair of 64s) from each of src and dst, processing them
   1350 	 * in pieces. This might give us a little better management of
   1351 	 * the memory latency, but my initial attempts here did not
   1352 	 * produce an instruction stream that looked all that nice.
   1353 	 */
   1354 #define	UNROLL	2
   1355 	while (count >= UNROLL) {
   1356 	    uint8x8_t  src_raw, dst_raw, dst_final;
   1357 	    uint16x8_t  src_wide, dst_wide;
   1358 
   1359 	    /* get 64 bits of src, widen it, multiply by src_scale */
   1360 	    src_raw = vreinterpret_u8_u32(vld1_u32(src));
   1361 	    src_wide = vmovl_u8(src_raw);
   1362 	    /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
   1363 	    src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale));
   1364 
   1365 	    /* ditto with dst */
   1366 	    dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
   1367 	    dst_wide = vmovl_u8(dst_raw);
   1368 
   1369 	    /* combine add with dst multiply into mul-accumulate */
   1370 	    dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));
   1371 
   1372 	    dst_final = vshrn_n_u16(dst_wide, 8);
   1373 	    vst1_u32(dst, vreinterpret_u32_u8(dst_final));
   1374 
   1375 	    src += UNROLL;
   1376 	    dst += UNROLL;
   1377 	    count -= UNROLL;
   1378 	}
   1379 	/* RBE: well, I don't like how gcc manages src/dst across the above
   1380 	 * loop; it's constantly calculating src+bias, dst+bias and it only
   1381 	 * adjusts the real ones when we leave the loop. Not sure why
   1382 	 * it's "hoisting down" (hoisting implies above in my lexicon ;))
   1383 	 * the adjustments to src/dst/count, but it does...
   1384 	 * (might be SSA-style internal logic...)
   1385 	 */
   1386 
   1387 #if	UNROLL == 2
   1388 	if (count == 1) {
   1389             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
   1390 	}
   1391 #else
   1392 	if (count > 0) {
   1393             do {
   1394                 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
   1395                 src += 1;
   1396                 dst += 1;
   1397             } while (--count > 0);
   1398 	}
   1399 #endif
   1400 
   1401 #undef	UNROLL
   1402     }
   1403 }
   1404 
   1405 #define	S32_Blend_BlitRow32_PROC	S32_Blend_BlitRow32_neon
   1406 #else
   1407 #define	S32_Blend_BlitRow32_PROC	NULL
   1408 #endif
   1409 
   1410 ///////////////////////////////////////////////////////////////////////////////
   1411 
   1412 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
   1413 
   1414 #undef	DEBUG_OPAQUE_DITHER
   1415 
   1416 #if	defined(DEBUG_OPAQUE_DITHER)
   1417 static void showme8(char *str, void *p, int len)
   1418 {
   1419 	static char buf[256];
   1420 	char tbuf[32];
   1421 	int i;
   1422 	unsigned char *pc = (unsigned char*) p;
   1423 	sprintf(buf,"%8s:", str);
   1424 	for(i=0;i<len;i++) {
   1425 	    sprintf(tbuf, "   %02x", pc[i]);
   1426 	    strcat(buf, tbuf);
   1427 	}
   1428 	SkDebugf("%s\n", buf);
   1429 }
   1430 static void showme16(char *str, void *p, int len)
   1431 {
   1432 	static char buf[256];
   1433 	char tbuf[32];
   1434 	int i;
   1435 	uint16_t *pc = (uint16_t*) p;
   1436 	sprintf(buf,"%8s:", str);
   1437 	len = (len / sizeof(uint16_t));	/* passed as bytes */
   1438 	for(i=0;i<len;i++) {
   1439 	    sprintf(tbuf, " %04x", pc[i]);
   1440 	    strcat(buf, tbuf);
   1441 	}
   1442 	SkDebugf("%s\n", buf);
   1443 }
   1444 #endif
   1445 
   1446 static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
   1447                                       const SkPMColor* SK_RESTRICT src,
   1448                                       int count, U8CPU alpha, int x, int y) {
   1449     SkASSERT(255 == alpha);
   1450 
   1451 #define	UNROLL	8
   1452 
   1453     if (count >= UNROLL) {
   1454 	uint8x8_t dbase;
   1455 
   1456 #if	defined(DEBUG_OPAQUE_DITHER)
   1457 	uint16_t tmpbuf[UNROLL];
   1458 	int td[UNROLL];
   1459 	int tdv[UNROLL];
   1460 	int ta[UNROLL];
   1461 	int tap[UNROLL];
   1462 	uint16_t in_dst[UNROLL];
   1463 	int offset = 0;
   1464 	int noisy = 0;
   1465 #endif
   1466 
   1467 	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1468 	dbase = vld1_u8(dstart);
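        	/* (note: the row stride of 12 used to index gDitherMatrix_Neon means the
        	 * 8-byte vld1_u8 above can start at any x&3 offset without running past
        	 * the row; presumably each 4-entry dither row is stored with repeated
        	 * values for exactly this purpose) */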
   1469 
   1470         do {
   1471 	    uint8x8_t sr, sg, sb, sa, d;
   1472 	    uint16x8_t dst8, scale8, alpha8;
   1473 	    uint16x8_t dst_r, dst_g, dst_b;
   1474 
   1475 #if	defined(DEBUG_OPAQUE_DITHER)
   1476 	/* calculate 8 elements worth into a temp buffer */
   1477 	{
   1478 	  int my_y = y;
   1479 	  int my_x = x;
   1480 	  SkPMColor* my_src = (SkPMColor*)src;
   1481 	  uint16_t* my_dst = dst;
   1482 	  int i;
   1483 
   1484           DITHER_565_SCAN(my_y);
   1485           for(i=0;i<UNROLL;i++) {
   1486             SkPMColor c = *my_src++;
   1487             SkPMColorAssert(c);
   1488             if (c) {
   1489                 unsigned a = SkGetPackedA32(c);
   1490 
   1491                 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
   1492 		tdv[i] = DITHER_VALUE(my_x);
   1493 		ta[i] = a;
   1494 		tap[i] = SkAlpha255To256(a);
   1495 		td[i] = d;
   1496 
   1497                 unsigned sr = SkGetPackedR32(c);
   1498                 unsigned sg = SkGetPackedG32(c);
   1499                 unsigned sb = SkGetPackedB32(c);
   1500                 sr = SkDITHER_R32_FOR_565(sr, d);
   1501                 sg = SkDITHER_G32_FOR_565(sg, d);
   1502                 sb = SkDITHER_B32_FOR_565(sb, d);
   1503 
   1504                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1505                 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
   1506                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1507                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1508                 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1509 		td[i] = d;
   1510 
   1511             } else {
   1512 		tmpbuf[i] = *my_dst;
   1513 		ta[i] = tdv[i] = td[i] = 0xbeef;
   1514 	    }
   1515 	    in_dst[i] = *my_dst;
   1516             my_dst += 1;
   1517             DITHER_INC_X(my_x);
   1518           }
   1519 	}
   1520 #endif
   1521 
   1522 	    /* source is in ABGR */
   1523 	    {
   1524 		register uint8x8_t d0 asm("d0");
   1525 		register uint8x8_t d1 asm("d1");
   1526 		register uint8x8_t d2 asm("d2");
   1527 		register uint8x8_t d3 asm("d3");
   1528 
   1529 		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
   1530 		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
   1531 		    : "r" (src)
   1532                     );
   1533 		    sr = d0; sg = d1; sb = d2; sa = d3;
   1534 	    }
   1535 
   1536 	    /* calculate 'd', which will be 0..7 */
   1537 	    /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
   1538 #if defined(SK_BUILD_FOR_ANDROID)
   1539 	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
   1540 	    alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
   1541 #else
   1542 	    alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
   1543 #endif
   1544 	    alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
   1545 	    d = vshrn_n_u16(alpha8, 8);	/* narrowing too */
   1546 
   1547 	    /* sr = sr - (sr>>5) + d */
   1548 	    /* watching for 8-bit overflow.  d is 0..7; risky range of
   1549 	     * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
   1550 	     * safe  as long as we do ((sr-sr>>5) + d) */
   1551 	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
   1552 	    sr = vadd_u8(sr, d);
   1553 
   1554 	    /* sb = sb - (sb>>5) + d */
   1555 	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
   1556 	    sb = vadd_u8(sb, d);
   1557 
   1558 	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
   1559 	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
   1560 	    sg = vadd_u8(sg, vshr_n_u8(d,1));
   1561 
   1562 	    /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
   1563 	    dst8 = vld1q_u16(dst);
   1564 	    dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
   1565 	    dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F));
   1566 	    dst_r = vshrq_n_u16(dst8,11);	/* clearing hi bits */
   1567 
   1568 	    /* blend */
   1569 #if 1
   1570 	    /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
   1571 	    /* originally 255-sa + 1 */
   1572 	    scale8 = vsubw_u8(vdupq_n_u16(256), sa);
   1573 #else
   1574 	    scale8 = vsubw_u8(vdupq_n_u16(255), sa);
   1575 	    scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
   1576 #endif
   1577 
   1578 #if 1
   1579 	    /* combine the addq and mul, save 3 insns */
   1580 	    scale8 = vshrq_n_u16(scale8, 3);
   1581 	    dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
   1582 	    dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
   1583 	    dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
   1584 #else
   1585 	    /* known correct, but +3 insns over above */
   1586 	    scale8 = vshrq_n_u16(scale8, 3);
   1587 	    dst_b = vmulq_u16(dst_b, scale8);
   1588 	    dst_g = vmulq_u16(dst_g, scale8);
   1589 	    dst_r = vmulq_u16(dst_r, scale8);
   1590 
   1591 	    /* combine */
   1592 	    /* NB: vshll widens, need to preserve those bits */
   1593 	    dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2));
   1594 	    dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3));
   1595 	    dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2));
   1596 #endif
   1597 
   1598 	    /* repack to store */
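        	    /* (VSLI shifts the new field left by n and inserts it above the low n
        	     * bits already packed, so these three steps build r<<11 | g<<5 | b) */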
   1599 	    dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
   1600 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
   1601 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
   1602 
   1603 	    vst1q_u16(dst, dst8);
   1604 
   1605 #if	defined(DEBUG_OPAQUE_DITHER)
   1606 	    /* verify my 8 elements match the temp buffer */
   1607 	{
   1608 	   int i, bad=0;
   1609 	   static int invocation;
   1610 
   1611 	   for (i=0;i<UNROLL;i++)
   1612 		if (tmpbuf[i] != dst[i]) bad=1;
   1613 	   if (bad) {
   1614 		SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
   1615 			invocation, offset);
   1616 		SkDebugf("  alpha 0x%x\n", alpha);
   1617 		for (i=0;i<UNROLL;i++)
   1618 		    SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
   1619 			i, ((tmpbuf[i] != dst[i])?"BAD":"got"),
   1620 			dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);
   1621 
   1622 		showme16("alpha8", &alpha8, sizeof(alpha8));
   1623 		showme16("scale8", &scale8, sizeof(scale8));
   1624 		showme8("d", &d, sizeof(d));
   1625 		showme16("dst8", &dst8, sizeof(dst8));
   1626 		showme16("dst_b", &dst_b, sizeof(dst_b));
   1627 		showme16("dst_g", &dst_g, sizeof(dst_g));
   1628 		showme16("dst_r", &dst_r, sizeof(dst_r));
   1629 		showme8("sb", &sb, sizeof(sb));
   1630 		showme8("sg", &sg, sizeof(sg));
   1631 		showme8("sr", &sr, sizeof(sr));
   1632 
   1633 		/* cop out */
   1634 		return;
   1635 	   }
   1636 	   offset += UNROLL;
   1637 	   invocation++;
   1638 	}
   1639 #endif
   1640 
   1641             dst += UNROLL;
   1642 	    src += UNROLL;
   1643 	    count -= UNROLL;
   1644 	    /* skip x += UNROLL, since it's unchanged mod-4 */
   1645         } while (count >= UNROLL);
   1646     }
   1647 #undef	UNROLL
   1648 
   1649     /* residuals */
   1650     if (count > 0) {
   1651         DITHER_565_SCAN(y);
   1652         do {
   1653             SkPMColor c = *src++;
   1654             SkPMColorAssert(c);
   1655             if (c) {
   1656                 unsigned a = SkGetPackedA32(c);
   1657 
   1658                 // dither and alpha are just temporary variables to work-around
   1659                 // an ICE in debug.
   1660                 unsigned dither = DITHER_VALUE(x);
   1661                 unsigned alpha = SkAlpha255To256(a);
   1662                 int d = SkAlphaMul(dither, alpha);
   1663 
   1664                 unsigned sr = SkGetPackedR32(c);
   1665                 unsigned sg = SkGetPackedG32(c);
   1666                 unsigned sb = SkGetPackedB32(c);
   1667                 sr = SkDITHER_R32_FOR_565(sr, d);
   1668                 sg = SkDITHER_G32_FOR_565(sg, d);
   1669                 sb = SkDITHER_B32_FOR_565(sb, d);
   1670 
   1671                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1672                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
   1673                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1674                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1675                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1676             }
   1677             dst += 1;
   1678             DITHER_INC_X(x);
   1679         } while (--count != 0);
   1680     }
   1681 }
   1682 
   1683 #define	S32A_D565_Opaque_Dither_PROC S32A_D565_Opaque_Dither_neon
   1684 #else
   1685 #define	S32A_D565_Opaque_Dither_PROC NULL
   1686 #endif
   1687 
   1688 ///////////////////////////////////////////////////////////////////////////////
   1689 
   1690 #if	defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
   1691 /* 2009/10/27: RBE says "a work in progress"; debugging says ok;
   1692  * speedup untested, but ARM version is 26 insns/iteration and
   1693  * this NEON version is 21 insns/iteration-of-8 (2.62 insns/element),
   1694  * roughly 10x fewer per element than the native version; that's pure instruction counts,
   1695  * not accounting for any instruction or memory latencies.
   1696  */
   1697 
   1698 #undef	DEBUG_S32_OPAQUE_DITHER
   1699 
   1700 static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
   1701                                      const SkPMColor* SK_RESTRICT src,
   1702                                      int count, U8CPU alpha, int x, int y) {
   1703     SkASSERT(255 == alpha);
   1704 
   1705 #define	UNROLL	8
   1706     if (count >= UNROLL) {
   1707 	uint8x8_t d;
   1708 	const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1709 	d = vld1_u8(dstart);
   1710 
   1711 	while (count >= UNROLL) {
   1712 	    uint8x8_t sr, sg, sb, sa;
   1713 	    uint16x8_t dr, dg, db, da;
   1714 	    uint16x8_t dst8;
   1715 
   1716 	    /* source is in ABGR ordering (R == lsb) */
   1717 	    {
   1718 		register uint8x8_t d0 asm("d0");
   1719 		register uint8x8_t d1 asm("d1");
   1720 		register uint8x8_t d2 asm("d2");
   1721 		register uint8x8_t d3 asm("d3");
   1722 
   1723 		asm ("vld4.8	{d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
   1724 		    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
   1725 		    : "r" (src)
   1726                     );
   1727 		    sr = d0; sg = d1; sb = d2; sa = d3;
   1728 	    }
   1729 	    /* XXX: if we want to prefetch, hide it in the above asm()
   1730 	     * using the gcc __builtin_prefetch(), the prefetch will
   1731 	     * fall to the bottom of the loop -- it won't stick up
   1732 	     * at the top of the loop, just after the vld4.
   1733 	     */
   1734 
   1735 	    /* sr = sr - (sr>>5) + d */
   1736 	    sr = vsub_u8(sr, vshr_n_u8(sr, 5));
   1737 	    dr = vaddl_u8(sr, d);
   1738 
   1739 	    /* sb = sb - (sb>>5) + d */
   1740 	    sb = vsub_u8(sb, vshr_n_u8(sb, 5));
   1741 	    db = vaddl_u8(sb, d);
   1742 
   1743 	    /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
   1744 	    sg = vsub_u8(sg, vshr_n_u8(sg, 6));
   1745 	    dg = vaddl_u8(sg, vshr_n_u8(d,1));
   1746 	    /* XXX: check that the "d>>1" here is hoisted */
   1747 
   1748 	    /* pack high bits of each into 565 format  (rgb, b is lsb) */
   1749 	    dst8 = vshrq_n_u16(db, 3);
   1750 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
   1751 	    dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11);
   1752 
   1753 	    /* store it */
   1754 	    vst1q_u16(dst, dst8);
   1755 
   1756 #if	defined(DEBUG_S32_OPAQUE_DITHER)
   1757 	    /* always good to know if we generated good results */
   1758 	    {
   1759 		int i, myx = x, myy = y;
   1760 		DITHER_565_SCAN(myy);
   1761 		for (i=0;i<UNROLL;i++) {
   1762 		    SkPMColor c = src[i];
   1763 		    unsigned dither = DITHER_VALUE(myx);
   1764 		    uint16_t val = SkDitherRGB32To565(c, dither);
   1765 		    if (val != dst[i]) {
   1766 			SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
   1767 			    c, dither, val, dst[i], dstart[i]);
   1768 		    }
   1769 		    DITHER_INC_X(myx);
   1770 		}
   1771 	    }
   1772 #endif
   1773 
   1774 	    dst += UNROLL;
   1775 	    src += UNROLL;
   1776 	    count -= UNROLL;
   1777 	    x += UNROLL;		/* probably superfluous */
   1778 	}
   1779     }
   1780 #undef	UNROLL
   1781 
   1782     /* residuals */
   1783     if (count > 0) {
   1784         DITHER_565_SCAN(y);
   1785         do {
   1786             SkPMColor c = *src++;
   1787             SkPMColorAssert(c);
   1788             SkASSERT(SkGetPackedA32(c) == 255);
   1789 
   1790             unsigned dither = DITHER_VALUE(x);
   1791             *dst++ = SkDitherRGB32To565(c, dither);
   1792             DITHER_INC_X(x);
   1793         } while (--count != 0);
   1794     }
   1795 }
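
        /* For reference: a scalar sketch (not part of the original routine; the
         * helper name is illustrative) of the per-pixel math the NEON loop above
         * performs.  'd' is the dither value in 0..7 taken from gDitherMatrix_Neon
         * for the current (x, y); the result should agree with the
         * SkDitherRGB32To565() used on the residual path at the end of the function.
         */
        static inline uint16_t dither_8888_to_565_sketch(SkPMColor c, unsigned d) {
            unsigned r = SkGetPackedR32(c);
            unsigned g = SkGetPackedG32(c);
            unsigned b = SkGetPackedB32(c);
            r = (r - (r >> 5) + d) >> 3;         /* sr = sr - (sr>>5) + d,    keep top 5 bits */
            g = (g - (g >> 6) + (d >> 1)) >> 2;  /* sg = sg - (sg>>6) + d>>1, keep top 6 bits */
            b = (b - (b >> 5) + d) >> 3;
            return (uint16_t)((r << 11) | (g << 5) | b);
        }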
   1796 
   1797 #define	S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon
   1798 #else
   1799 #define	S32_D565_Opaque_Dither_PROC NULL
   1800 #endif
   1801 
   1802 ///////////////////////////////////////////////////////////////////////////////
   1803 
   1804 static const SkBlitRow::Proc platform_565_procs[] = {
   1805     // no dither
   1806     S32_D565_Opaque_PROC,
   1807     S32_D565_Blend_PROC,
   1808     S32A_D565_Opaque_PROC,
   1809     S32A_D565_Blend_PROC,
   1810 
   1811     // dither
   1812     S32_D565_Opaque_Dither_PROC,
   1813     S32_D565_Blend_Dither_PROC,
   1814     S32A_D565_Opaque_Dither_PROC,
   1815     NULL,   // S32A_D565_Blend_Dither
   1816 };
   1817 
   1818 static const SkBlitRow::Proc platform_4444_procs[] = {
   1819     // no dither
   1820     NULL,   // S32_D4444_Opaque,
   1821     NULL,   // S32_D4444_Blend,
   1822     NULL,   // S32A_D4444_Opaque,
   1823     NULL,   // S32A_D4444_Blend,
   1824 
   1825     // dither
   1826     NULL,   // S32_D4444_Opaque_Dither,
   1827     NULL,   // S32_D4444_Blend_Dither,
   1828     NULL,   // S32A_D4444_Opaque_Dither,
   1829     NULL,   // S32A_D4444_Blend_Dither
   1830 };
   1831 
   1832 static const SkBlitRow::Proc32 platform_32_procs[] = {
   1833     NULL,   // S32_Opaque,
   1834     S32_Blend_BlitRow32_PROC,		// S32_Blend,
   1835     S32A_Opaque_BlitRow32_PROC,		// S32A_Opaque,
   1836     S32A_Blend_BlitRow32_PROC		// S32A_Blend
   1837 };
   1838 
   1839 SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) {
   1840     return platform_4444_procs[flags];
   1841 }
   1842 
   1843 SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
   1844     return platform_565_procs[flags];
   1845 }
   1846 
   1847 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
   1848     return platform_32_procs[flags];
   1849 }
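
        /* A minimal usage sketch (not part of this file; the particular flag
         * combination and helper name are illustrative).  Callers normally obtain a
         * proc via SkBlitRow::Factory(), but the lookup amounts to indexing the
         * tables above with the SkBlitRow flag bits and falling back to the
         * portable routines when the entry is NULL.
         */
        static inline void example_blit_row_565(uint16_t* dst, const SkPMColor* src,
                                                int count, int x, int y) {
            unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag;   /* per-pixel alpha, no dither */
            SkBlitRow::Proc proc = SkBlitRow::PlatformProcs565(flags);
            if (NULL == proc) {
                return;   /* no platform-specific routine for this case */
            }
            proc(dst, src, count, 0xFF, x, y);                 /* 0xFF == opaque global alpha */
        }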
   1850 
   1851 SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
   1852     return NULL;
   1853 }
   1854 
   1855 ///////////////////////////////////////////////////////////////////////////////
   1856 
   1857 SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
   1858                                                      SkMask::Format maskFormat,
   1859                                                      SkColor color) {
   1860     return NULL;
   1861 }
   1862 
   1863 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
   1864     return NULL;
   1865 }
   1866 
   1867 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
   1868                                                  SkMask::Format maskFormat,
   1869                                                  RowFlags flags) {
   1870     return NULL;
   1871 }
   1872