Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2012 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBitmapProcState_opts_SSE2.h"
     10 #include "SkBlitRow_opts_SSE2.h"
     11 #include "SkColorPriv.h"
     12 #include "SkColor_opts_SSE2.h"
     13 #include "SkDither.h"
     14 #include "SkMSAN.h"
     15 #include "SkUtils.h"
     16 
     17 /* SSE2 version of S32_Blend_BlitRow32()
     18  * portable version is in core/SkBlitRow_D32.cpp
     19  */
     20 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     21                               const SkPMColor* SK_RESTRICT src,
     22                               int count, U8CPU alpha) {
     23     SkASSERT(alpha <= 255);
     24     if (count <= 0) {
     25         return;
     26     }
     27 
     28     uint32_t src_scale = SkAlpha255To256(alpha);
     29 
     30     if (count >= 4) {
     31         SkASSERT(((size_t)dst & 0x03) == 0);
     32         while (((size_t)dst & 0x0F) != 0) {
     33             *dst = SkPMLerp(*src, *dst, src_scale);
     34             src++;
     35             dst++;
     36             count--;
     37         }
     38 
     39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     40         __m128i *d = reinterpret_cast<__m128i*>(dst);
     41 
     42         while (count >= 4) {
     43             // Load 4 pixels each of src and dest.
     44             __m128i src_pixel = _mm_loadu_si128(s);
     45             __m128i dst_pixel = _mm_load_si128(d);
     46 
     47             __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
     48             _mm_store_si128(d, result);
     49             s++;
     50             d++;
     51             count -= 4;
     52         }
     53         src = reinterpret_cast<const SkPMColor*>(s);
     54         dst = reinterpret_cast<SkPMColor*>(d);
     55     }
     56 
     57     while (count > 0) {
     58         *dst = SkPMLerp(*src, *dst, src_scale);
     59         src++;
     60         dst++;
     61         count--;
     62     }
     63 }
     64 
     65 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     66                                const SkPMColor* SK_RESTRICT src,
     67                                int count, U8CPU alpha) {
     68     SkASSERT(alpha <= 255);
     69     if (count <= 0) {
     70         return;
     71     }
     72 
     73     if (count >= 4) {
     74         while (((size_t)dst & 0x0F) != 0) {
     75             *dst = SkBlendARGB32(*src, *dst, alpha);
     76             src++;
     77             dst++;
     78             count--;
     79         }
     80 
     81         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     82         __m128i *d = reinterpret_cast<__m128i*>(dst);
     83         while (count >= 4) {
     84             // Load 4 pixels each of src and dest.
     85             __m128i src_pixel = _mm_loadu_si128(s);
     86             __m128i dst_pixel = _mm_load_si128(d);
     87 
     88             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
     89             _mm_store_si128(d, result);
     90             s++;
     91             d++;
     92             count -= 4;
     93         }
     94         src = reinterpret_cast<const SkPMColor*>(s);
     95         dst = reinterpret_cast<SkPMColor*>(d);
     96     }
     97 
     98     while (count > 0) {
     99         *dst = SkBlendARGB32(*src, *dst, alpha);
    100         src++;
    101         dst++;
    102         count--;
    103     }
    104 }
    105 
    106 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    107     SkASSERT(count > 0);
    108 
    109     uint32_t src_expand = (SkGetPackedG32(src) << 24) |
    110                           (SkGetPackedR32(src) << 13) |
    111                           (SkGetPackedB32(src) << 2);
    112     unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
    113 
    114     // Check if we have enough pixels to run SIMD
    115     if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
    116         __m128i* dst_wide;
    117         const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
    118         const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
    119         const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
    120         const __m128i scale_wide = _mm_set1_epi16(scale);
    121         const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
    122         const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
    123 
    124         // Align dst to an even 16 byte address (0-7 pixels)
    125         while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
    126             *dst = SkBlend32_RGB16(src_expand, *dst, scale);
    127             dst += 1;
    128             count--;
    129         }
    130 
    131         dst_wide = reinterpret_cast<__m128i*>(dst);
    132         do {
    133             // Load eight RGB565 pixels
    134             __m128i pixels = _mm_load_si128(dst_wide);
    135 
    136             // Mask out sub-pixels
    137             __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
    138             __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
    139             pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
    140             __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
    141 
    142             // Scale with alpha
    143             pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
    144             pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
    145             pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
    146 
    147             // Add src_X_wide and shift down again
    148             pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
    149             pixel_R = _mm_srli_epi16(pixel_R, 5);
    150             pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
    151             pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
    152             pixel_B = _mm_srli_epi16(pixel_B, 5);
    153 
    154             // Combine into RGB565 and store
    155             pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
    156             pixel_G = _mm_and_si128(pixel_G, mask_green);
    157             pixels = _mm_or_si128(pixel_R, pixel_G);
    158             pixels = _mm_or_si128(pixels, pixel_B);
    159             _mm_store_si128(dst_wide, pixels);
    160             count -= 8;
    161             dst_wide++;
    162         } while (count >= 8);
    163 
    164         dst = reinterpret_cast<uint16_t*>(dst_wide);
    165     }
    166 
    167     // Small loop to handle remaining pixels.
    168     while (count > 0) {
    169         *dst = SkBlend32_RGB16(src_expand, *dst, scale);
    170         dst += 1;
    171         count--;
    172     }
    173 }
    174 
    175 // The following (left) shifts cause the top 5 bits of the mask components to
    176 // line up with the corresponding components in an SkPMColor.
    177 // Note that the mask's RGB16 order may differ from the SkPMColor order.
    178 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    179 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    180 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
    181 
    182 #if SK_R16x5_R32x5_SHIFT == 0
    183     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
    184 #elif SK_R16x5_R32x5_SHIFT > 0
    185     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
    186 #else
    187     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
    188 #endif
    189 
    190 #if SK_G16x5_G32x5_SHIFT == 0
    191     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
    192 #elif SK_G16x5_G32x5_SHIFT > 0
    193     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
    194 #else
    195     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
    196 #endif
    197 
    198 #if SK_B16x5_B32x5_SHIFT == 0
    199     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
    200 #elif SK_B16x5_B32x5_SHIFT > 0
    201     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
    202 #else
    203     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
    204 #endif
    205 
    206 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
    207                                  __m128i &mask, __m128i &srcA) {
    208     // In the following comments, the components of src, dst and mask are
    209     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    210     // by an R, G, B, or A suffix. Components of one of the four pixels that
    211     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    212     // example is the blue channel of the second destination pixel. Memory
    213     // layout is shown for an ARGB byte order in a color value.
    214 
    215     // src and srcA store 8-bit values interleaved with zeros.
    216     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    217     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    218     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    219     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    220     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    221     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    222     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    223 
    224     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    225     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    226     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    227                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    228 
    229     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    230     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    231                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    232 
    233     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    234     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    235                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    236 
    237     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    238     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    239     // 8-bit position
    240     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    241     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    242     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    243 
    244     // Interleave R,G,B into the lower byte of word.
    245     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    246     // 16-bit values, padded by zero.
    247     __m128i maskLo, maskHi;
    248     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    249     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    250     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    251     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    252 
    253     // Upscale from 0..31 to 0..32
    254     // (allows to replace division by left-shift further down)
    255     // Left-shift each component by 4 and add the result back to that component,
    256     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    257     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    258     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    259 
    260     // Multiply each component of maskLo and maskHi by srcA
    261     maskLo = _mm_mullo_epi16(maskLo, srcA);
    262     maskHi = _mm_mullo_epi16(maskHi, srcA);
    263 
    264     // Left shift mask components by 8 (divide by 256)
    265     maskLo = _mm_srli_epi16(maskLo, 8);
    266     maskHi = _mm_srli_epi16(maskHi, 8);
    267 
    268     // Interleave R,G,B into the lower byte of the word
    269     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    270     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    271     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    272     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    273 
    274     // mask = (src - dst) * mask
    275     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    276     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    277 
    278     // mask = (src - dst) * mask >> 5
    279     maskLo = _mm_srai_epi16(maskLo, 5);
    280     maskHi = _mm_srai_epi16(maskHi, 5);
    281 
    282     // Add two pixels into result.
    283     // result = dst + ((src - dst) * mask >> 5)
    284     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    285     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    286 
    287     // Pack into 4 32bit dst pixels.
    288     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    289     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    290     // clamping to 255 if necessary.
    291     return _mm_packus_epi16(resultLo, resultHi);
    292 }
    293 
    294 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
    295                                        __m128i &mask) {
    296     // In the following comments, the components of src, dst and mask are
    297     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    298     // by an R, G, B, or A suffix. Components of one of the four pixels that
    299     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    300     // example is the blue channel of the second destination pixel. Memory
    301     // layout is shown for an ARGB byte order in a color value.
    302 
    303     // src and srcA store 8-bit values interleaved with zeros.
    304     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    305     // mask stores 16-bit values (shown as high and low bytes) interleaved with
    306     // zeros
    307     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    308     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    309 
    310     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    311     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    312     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    313                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    314 
    315     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    316     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    317                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    318 
    319     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    320     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    321                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    322 
    323     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    324     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    325     // 8-bit position
    326     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    327     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    328     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    329 
    330     // Interleave R,G,B into the lower byte of word.
    331     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    332     // 16-bit values, padded by zero.
    333     __m128i maskLo, maskHi;
    334     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    335     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    336     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    337     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    338 
    339     // Upscale from 0..31 to 0..32
    340     // (allows to replace division by left-shift further down)
    341     // Left-shift each component by 4 and add the result back to that component,
    342     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    343     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    344     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    345 
    346     // Interleave R,G,B into the lower byte of the word
    347     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    348     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    349     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    350     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    351 
    352     // mask = (src - dst) * mask
    353     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    354     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    355 
    356     // mask = (src - dst) * mask >> 5
    357     maskLo = _mm_srai_epi16(maskLo, 5);
    358     maskHi = _mm_srai_epi16(maskHi, 5);
    359 
    360     // Add two pixels into result.
    361     // result = dst + ((src - dst) * mask >> 5)
    362     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    363     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    364 
    365     // Pack into 4 32bit dst pixels and force opaque.
    366     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    367     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    368     // clamping to 255 if necessary. Set alpha components to 0xFF.
    369     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
    370                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
    371 }
    372 
    373 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
    374                          SkColor src, int width, SkPMColor) {
    375     if (width <= 0) {
    376         return;
    377     }
    378 
    379     int srcA = SkColorGetA(src);
    380     int srcR = SkColorGetR(src);
    381     int srcG = SkColorGetG(src);
    382     int srcB = SkColorGetB(src);
    383 
    384     srcA = SkAlpha255To256(srcA);
    385 
    386     if (width >= 4) {
    387         SkASSERT(((size_t)dst & 0x03) == 0);
    388         while (((size_t)dst & 0x0F) != 0) {
    389             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    390             mask++;
    391             dst++;
    392             width--;
    393         }
    394 
    395         __m128i *d = reinterpret_cast<__m128i*>(dst);
    396         // Set alpha to 0xFF and replicate source four times in SSE register.
    397         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    398         // Interleave with zeros to get two sets of four 16-bit values.
    399         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    400         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    401         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    402         __m128i srcA_sse = _mm_set1_epi16(srcA);
    403         while (width >= 4) {
    404             // Load four destination pixels into dst_sse.
    405             __m128i dst_sse = _mm_load_si128(d);
    406             // Load four 16-bit masks into lower half of mask_sse.
    407             __m128i mask_sse = _mm_loadl_epi64(
    408                                    reinterpret_cast<const __m128i*>(mask));
    409 
    410             // Check whether masks are equal to 0 and get the highest bit
    411             // of each byte of result, if masks are all zero, we will get
    412             // pack_cmp to 0xFFFF
    413             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    414                                              _mm_setzero_si128()));
    415 
    416             // if mask pixels are not all zero, we will blend the dst pixels
    417             if (pack_cmp != 0xFFFF) {
    418                 // Unpack 4 16bit mask pixels to
    419                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    420                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    421                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    422                                               _mm_setzero_si128());
    423 
    424                 // Process 4 32bit dst pixels
    425                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
    426                                                    mask_sse, srcA_sse);
    427                 _mm_store_si128(d, result);
    428             }
    429 
    430             d++;
    431             mask += 4;
    432             width -= 4;
    433         }
    434 
    435         dst = reinterpret_cast<SkPMColor*>(d);
    436     }
    437 
    438     while (width > 0) {
    439         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    440         mask++;
    441         dst++;
    442         width--;
    443     }
    444 }
    445 
    446 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
    447                                SkColor src, int width, SkPMColor opaqueDst) {
    448     if (width <= 0) {
    449         return;
    450     }
    451 
    452     int srcR = SkColorGetR(src);
    453     int srcG = SkColorGetG(src);
    454     int srcB = SkColorGetB(src);
    455 
    456     if (width >= 4) {
    457         SkASSERT(((size_t)dst & 0x03) == 0);
    458         while (((size_t)dst & 0x0F) != 0) {
    459             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    460             mask++;
    461             dst++;
    462             width--;
    463         }
    464 
    465         __m128i *d = reinterpret_cast<__m128i*>(dst);
    466         // Set alpha to 0xFF and replicate source four times in SSE register.
    467         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    468         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    469         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    470         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    471         while (width >= 4) {
    472             // Load four destination pixels into dst_sse.
    473             __m128i dst_sse = _mm_load_si128(d);
    474             // Load four 16-bit masks into lower half of mask_sse.
    475             __m128i mask_sse = _mm_loadl_epi64(
    476                                    reinterpret_cast<const __m128i*>(mask));
    477 
    478             // Check whether masks are equal to 0 and get the highest bit
    479             // of each byte of result, if masks are all zero, we will get
    480             // pack_cmp to 0xFFFF
    481             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    482                                              _mm_setzero_si128()));
    483 
    484             // if mask pixels are not all zero, we will blend the dst pixels
    485             if (pack_cmp != 0xFFFF) {
    486                 // Unpack 4 16bit mask pixels to
    487                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    488                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    489                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    490                                               _mm_setzero_si128());
    491 
    492                 // Process 4 32bit dst pixels
    493                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
    494                                                          mask_sse);
    495                 _mm_store_si128(d, result);
    496             }
    497 
    498             d++;
    499             mask += 4;
    500             width -= 4;
    501         }
    502 
    503         dst = reinterpret_cast<SkPMColor*>(d);
    504     }
    505 
    506     while (width > 0) {
    507         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    508         mask++;
    509         dst++;
    510         width--;
    511     }
    512 }
    513 
    514 /* SSE2 version of S32_D565_Opaque()
    515  * portable version is in core/SkBlitRow_D16.cpp
    516  */
    517 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
    518                           const SkPMColor* SK_RESTRICT src, int count,
    519                           U8CPU alpha, int /*x*/, int /*y*/) {
    520     SkASSERT(255 == alpha);
    521 
    522     if (count <= 0) {
    523         return;
    524     }
    525 
    526     if (count >= 8) {
    527         while (((size_t)dst & 0x0F) != 0) {
    528             SkPMColor c = *src++;
    529             SkPMColorAssert(c);
    530 
    531             *dst++ = SkPixel32ToPixel16_ToU16(c);
    532             count--;
    533         }
    534 
    535         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    536         __m128i* d = reinterpret_cast<__m128i*>(dst);
    537 
    538         while (count >= 8) {
    539             // Load 8 pixels of src.
    540             __m128i src_pixel1 = _mm_loadu_si128(s++);
    541             __m128i src_pixel2 = _mm_loadu_si128(s++);
    542 
    543             __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
    544             _mm_store_si128(d++, d_pixel);
    545             count -= 8;
    546         }
    547         src = reinterpret_cast<const SkPMColor*>(s);
    548         dst = reinterpret_cast<uint16_t*>(d);
    549     }
    550 
    551     if (count > 0) {
    552         do {
    553             SkPMColor c = *src++;
    554             SkPMColorAssert(c);
    555             *dst++ = SkPixel32ToPixel16_ToU16(c);
    556         } while (--count != 0);
    557     }
    558 }
    559 
    560 /* SSE2 version of S32A_D565_Opaque()
    561  * portable version is in core/SkBlitRow_D16.cpp
    562  */
    563 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
    564                            const SkPMColor* SK_RESTRICT src,
    565                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
    566     SkASSERT(255 == alpha);
    567 
    568     if (count <= 0) {
    569         return;
    570     }
    571 
    572     if (count >= 8) {
    573         // Make dst 16 bytes alignment
    574         while (((size_t)dst & 0x0F) != 0) {
    575             SkPMColor c = *src++;
    576             if (c) {
    577               *dst = SkSrcOver32To16(c, *dst);
    578             }
    579             dst += 1;
    580             count--;
    581         }
    582 
    583         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    584         __m128i* d = reinterpret_cast<__m128i*>(dst);
    585         __m128i var255 = _mm_set1_epi16(255);
    586         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
    587         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
    588         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
    589 
    590         while (count >= 8) {
    591             // Load 8 pixels of src.
    592             __m128i src_pixel1 = _mm_loadu_si128(s++);
    593             __m128i src_pixel2 = _mm_loadu_si128(s++);
    594 
    595             // Check whether src pixels are equal to 0 and get the highest bit
    596             // of each byte of result, if src pixels are all zero, src_cmp1 and
    597             // src_cmp2 will be 0xFFFF.
    598             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
    599                                              _mm_setzero_si128()));
    600             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
    601                                              _mm_setzero_si128()));
    602             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
    603                 d++;
    604                 count -= 8;
    605                 continue;
    606             }
    607 
    608             // Load 8 pixels of dst.
    609             __m128i dst_pixel = _mm_load_si128(d);
    610 
    611             // Extract A from src.
    612             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
    613             sa1 = _mm_srli_epi32(sa1, 24);
    614             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
    615             sa2 = _mm_srli_epi32(sa2, 24);
    616             __m128i sa = _mm_packs_epi32(sa1, sa2);
    617 
    618             // Extract R from src.
    619             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
    620             sr1 = _mm_srli_epi32(sr1, 24);
    621             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
    622             sr2 = _mm_srli_epi32(sr2, 24);
    623             __m128i sr = _mm_packs_epi32(sr1, sr2);
    624 
    625             // Extract G from src.
    626             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
    627             sg1 = _mm_srli_epi32(sg1, 24);
    628             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
    629             sg2 = _mm_srli_epi32(sg2, 24);
    630             __m128i sg = _mm_packs_epi32(sg1, sg2);
    631 
    632             // Extract B from src.
    633             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
    634             sb1 = _mm_srli_epi32(sb1, 24);
    635             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
    636             sb2 = _mm_srli_epi32(sb2, 24);
    637             __m128i sb = _mm_packs_epi32(sb1, sb2);
    638 
    639             // Extract R G B from dst.
    640             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
    641             dr = _mm_and_si128(dr, r16_mask);
    642             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
    643             dg = _mm_and_si128(dg, g16_mask);
    644             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
    645             db = _mm_and_si128(db, b16_mask);
    646 
    647             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
    648 
    649             // Calculate R G B of result.
    650             // Original algorithm is in SkSrcOver32To16().
    651             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
    652             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
    653             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
    654             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
    655             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
    656             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
    657 
    658             // Pack R G B into 16-bit color.
    659             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
    660 
    661             // Store 8 16-bit colors in dst.
    662             _mm_store_si128(d++, d_pixel);
    663             count -= 8;
    664         }
    665 
    666         src = reinterpret_cast<const SkPMColor*>(s);
    667         dst = reinterpret_cast<uint16_t*>(d);
    668     }
    669 
    670     if (count > 0) {
    671         do {
    672             SkPMColor c = *src++;
    673             SkPMColorAssert(c);
    674             if (c) {
    675                 *dst = SkSrcOver32To16(c, *dst);
    676             }
    677             dst += 1;
    678         } while (--count != 0);
    679     }
    680 }
    681 
    682 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
    683                                  const SkPMColor* SK_RESTRICT src,
    684                                  int count, U8CPU alpha, int x, int y) {
            // SSE2 row routine: converts `count` 32-bit SkPMColor pixels from src
            // into dithered 16-bit pixels written to dst.
            //   alpha - must be 255 (asserted below); not used otherwise.
            //   x, y  - pixel position used to pick the dither value; the vector
            //           path indexes the dither row with (y & 3) and the column
            //           with (x & 3).
            // The vector loop reads only the R, G and B channels of src.
    685     SkASSERT(255 == alpha);
    686
    687     if (count <= 0) {
    688         return;
    689     }
    690
    691     if (count >= 8) {
                // Scalar lead-in: convert one pixel at a time until dst is 16-byte
                // aligned, because the vector loop stores with _mm_store_si128.
                // NOTE(review): DITHER_565_SCAN / DITHER_VALUE / DITHER_INC_X are
                // macros defined outside this chunk; presumably SCAN sets up the
                // dither row for y that VALUE(x) reads -- confirm in SkDither.h.
    692         while (((size_t)dst & 0x0F) != 0) {
    693             DITHER_565_SCAN(y);
    694             SkPMColor c = *src++;
    695             SkPMColorAssert(c);
    696
    697             unsigned dither = DITHER_VALUE(x);
    698             *dst++ = SkDitherRGB32To565(c, dither);
    699             DITHER_INC_X(x);
    700             count--;
    701         }
    702
                // Build the dither values for 8 consecutive pixels starting at x.
                // Only (x + i) & 3 is used, so lanes 4..7 repeat lanes 0..3.
    703         unsigned short dither_value[8];
    704         __m128i dither;
    705 #ifdef ENABLE_DITHER_MATRIX_4X4
    706         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
    707         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
    708         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
    709         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
    710         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
    711 #else
                // Here one row is a 16-bit word holding four 4-bit entries; column
                // k is extracted with a shift of 4 * k and a mask of 0xF.
    712         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
    713         dither_value[0] = dither_value[4] = (dither_scan
    714                                              >> (((x) & 3) << 2)) & 0xF;
    715         dither_value[1] = dither_value[5] = (dither_scan
    716                                              >> (((x + 1) & 3) << 2)) & 0xF;
    717         dither_value[2] = dither_value[6] = (dither_scan
    718                                              >> (((x + 2) & 3) << 2)) & 0xF;
    719         dither_value[3] = dither_value[7] = (dither_scan
    720                                              >> (((x + 3) & 3) << 2)) & 0xF;
    721 #endif
                // Unaligned load: dither_value is a plain stack array.
    722         dither = _mm_loadu_si128((__m128i*) dither_value);
    723
    724         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    725         __m128i* d = reinterpret_cast<__m128i*>(dst);
    726
                // Main loop: 8 pixels per iteration (two 4-pixel source loads, one
                // 8-pixel destination store). The dither vector is computed once
                // above and reused: x advances by 8 each pass, so (x + i) & 3 does
                // not change.
    727         while (count >= 8) {
    728             // Load 8 pixels of src.
    729             __m128i src_pixel1 = _mm_loadu_si128(s++);
    730             __m128i src_pixel2 = _mm_loadu_si128(s++);
    731
    732             // Extract R from src.
                    // Shifting left then right by 24 isolates the 8-bit channel in
                    // each 32-bit lane; the pack merges both registers into eight
                    // 16-bit lanes (values are <= 255, so it never saturates).
    733             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
    734             sr1 = _mm_srli_epi32(sr1, 24);
    735             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
    736             sr2 = _mm_srli_epi32(sr2, 24);
    737             __m128i sr = _mm_packs_epi32(sr1, sr2);
    738
    739             // SkDITHER_R32To565(sr, dither)
                    // i.e. (sr + dither - (sr >> 5)), then drop the low
                    // (SK_R32_BITS - SK_R16_BITS) bits.
    740             __m128i sr_offset = _mm_srli_epi16(sr, 5);
    741             sr = _mm_add_epi16(sr, dither);
    742             sr = _mm_sub_epi16(sr, sr_offset);
    743             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
    744
    745             // Extract G from src.
    746             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
    747             sg1 = _mm_srli_epi32(sg1, 24);
    748             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
    749             sg2 = _mm_srli_epi32(sg2, 24);
    750             __m128i sg = _mm_packs_epi32(sg1, sg2);
    751
    752             // Green analogue of SkDITHER_R32To565: uses (dither >> 1) and (sg >> 6).
    753             __m128i sg_offset = _mm_srli_epi16(sg, 6);
    754             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
    755             sg = _mm_sub_epi16(sg, sg_offset);
    756             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
    757
    758             // Extract B from src.
    759             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
    760             sb1 = _mm_srli_epi32(sb1, 24);
    761             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
    762             sb2 = _mm_srli_epi32(sb2, 24);
    763             __m128i sb = _mm_packs_epi32(sb1, sb2);
    764
    765             // Blue analogue of SkDITHER_R32To565: uses dither and (sb >> 5).
    766             __m128i sb_offset = _mm_srli_epi16(sb, 5);
    767             sb = _mm_add_epi16(sb, dither);
    768             sb = _mm_sub_epi16(sb, sb_offset);
    769             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
    770
    771             // Pack and store 16-bit dst pixel.
                    // Aligned store: dst was aligned to 16 bytes by the lead-in loop.
    772             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
    773             _mm_store_si128(d++, d_pixel);
    774
    775             count -= 8;
    776             x += 8;
    777         }
    778
                // Hand the advanced positions back to the scalar tail below.
    779         src = reinterpret_cast<const SkPMColor*>(s);
    780         dst = reinterpret_cast<uint16_t*>(d);
    781     }
    782
            // Scalar tail: the remaining pixels (fewer than 8 after the vector
            // loop, or the whole row when count < 8 on entry). Unlike the lead-in
            // loop, DITHER_565_SCAN(y) is issued once, outside the loop.
    783     if (count > 0) {
    784         DITHER_565_SCAN(y);
    785         do {
    786             SkPMColor c = *src++;
    787             SkPMColorAssert(c);
    788
    789             unsigned dither = DITHER_VALUE(x);
    790             *dst++ = SkDitherRGB32To565(c, dither);
    791             DITHER_INC_X(x);
    792         } while (--count != 0);
    793     }
    794 }
    795 
    796 /* SSE2 version of S32A_D565_Opaque_Dither()
    797  * portable version is in core/SkBlitRow_D16.cpp
         *
         * Combines `count` 32-bit SkPMColor source pixels with the existing
         * 16-bit pixels in dst: each source channel is dithered (the dither value
         * is first scaled by the source alpha), and each dst channel is scaled by
         * (256 - a) >> 3 before the two are added.
         *   alpha - must be 255 (asserted); not used otherwise.
         *   x, y  - pixel position used to pick the dither value; the vector path
         *           indexes the dither row with (y & 3) and the column with
         *           (x & 3).
    798  */
    799 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
    800                                   const SkPMColor* SK_RESTRICT src,
    801                                   int count, U8CPU alpha, int x, int y) {
    802     SkASSERT(255 == alpha);
    803
    804     if (count <= 0) {
    805         return;
    806     }
    807
    808     if (count >= 8) {
                // Scalar lead-in: handle one pixel at a time until dst is 16-byte
                // aligned, because the vector loop uses aligned load/store on dst.
                // The body is the same per-pixel computation as the scalar tail at
                // the end of this function.
                // NOTE(review): DITHER_565_SCAN / DITHER_VALUE / DITHER_INC_X are
                // macros defined outside this chunk; presumably SCAN sets up the
                // dither row for y that VALUE(x) reads -- confirm in SkDither.h.
    809         while (((size_t)dst & 0x0F) != 0) {
    810             DITHER_565_SCAN(y);
    811             SkPMColor c = *src++;
    812             SkPMColorAssert(c);
                    // A zero source pixel leaves dst untouched.
    813             if (c) {
    814                 unsigned a = SkGetPackedA32(c);
    815
                        // Dither value scaled by the source alpha.
    816                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
    817
    818                 unsigned sr = SkGetPackedR32(c);
    819                 unsigned sg = SkGetPackedG32(c);
    820                 unsigned sb = SkGetPackedB32(c);
    821                 sr = SkDITHER_R32_FOR_565(sr, d);
    822                 sg = SkDITHER_G32_FOR_565(sg, d);
    823                 sb = SkDITHER_B32_FOR_565(sb, d);
    824
    825                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
    826                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
    827                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
    828                 // now src and dst expanded are in g:11 r:10 x:1 b:10
    829                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
    830             }
    831             dst += 1;
    832             DITHER_INC_X(x);
    833             count--;
    834         }
    835
                // Build the unscaled dither values for 8 consecutive pixels starting
                // at x. Only (x + i) & 3 is used, so lanes 4..7 repeat lanes 0..3.
    836         unsigned short dither_value[8];
    837         __m128i dither, dither_cur;
    838 #ifdef ENABLE_DITHER_MATRIX_4X4
    839         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
    840         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
    841         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
    842         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
    843         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
    844 #else
                // Here one row is a 16-bit word holding four 4-bit entries; column
                // k is extracted with a shift of 4 * k and a mask of 0xF.
    845         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
    846         dither_value[0] = dither_value[4] = (dither_scan
    847                                              >> (((x) & 3) << 2)) & 0xF;
    848         dither_value[1] = dither_value[5] = (dither_scan
    849                                              >> (((x + 1) & 3) << 2)) & 0xF;
    850         dither_value[2] = dither_value[6] = (dither_scan
    851                                              >> (((x + 2) & 3) << 2)) & 0xF;
    852         dither_value[3] = dither_value[7] = (dither_scan
    853                                              >> (((x + 3) & 3) << 2)) & 0xF;
    854 #endif
                // Unaligned load: dither_value is a plain stack array.
    855         dither = _mm_loadu_si128((__m128i*) dither_value);
    856
    857         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    858         __m128i* d = reinterpret_cast<__m128i*>(dst);
    859         __m128i var256 = _mm_set1_epi16(256);
    860         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
    861         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
    862         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
    863
                // Main loop: 8 pixels per iteration. `dither` is computed once above
                // and reused: x advances by 8 each pass, so (x + i) & 3 does not
                // change. There is no per-pixel zero test here; for a pixel with
                // sa == 0 and zero colour channels, dither_cur is 0, isa is 32 and
                // (d * 32) >> 5 reproduces each dst channel unchanged.
    864         while (count >= 8) {
    865             // Load 8 pixels of src and dst.
    866             __m128i src_pixel1 = _mm_loadu_si128(s++);
    867             __m128i src_pixel2 = _mm_loadu_si128(s++);
    868             __m128i dst_pixel = _mm_load_si128(d);
    869
    870             // Extract A from src.
                    // Shifting left then right by 24 isolates the 8-bit channel in
                    // each 32-bit lane; the pack merges both registers into eight
                    // 16-bit lanes (values are <= 255, so it never saturates).
    871             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
    872             sa1 = _mm_srli_epi32(sa1, 24);
    873             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
    874             sa2 = _mm_srli_epi32(sa2, 24);
    875             __m128i sa = _mm_packs_epi32(sa1, sa2);
    876
    877             // Calculate current dither value.
                    // (dither * (sa + 1)) >> 8 per lane; intended to mirror the scalar
                    // SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)).
    878             dither_cur = _mm_mullo_epi16(dither,
    879                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
    880             dither_cur = _mm_srli_epi16(dither_cur, 8);
    881
    882             // Extract R from src.
    883             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
    884             sr1 = _mm_srli_epi32(sr1, 24);
    885             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
    886             sr2 = _mm_srli_epi32(sr2, 24);
    887             __m128i sr = _mm_packs_epi32(sr1, sr2);
    888
    889             // SkDITHER_R32_FOR_565(sr, d)
                    // i.e. sr + dither_cur - (sr >> 5); the result stays 8 bits wide.
    890             __m128i sr_offset = _mm_srli_epi16(sr, 5);
    891             sr = _mm_add_epi16(sr, dither_cur);
    892             sr = _mm_sub_epi16(sr, sr_offset);
    893
    894             // Expand sr.
                    // Scale by 4 so it lines up with the dst channel multiplied by
                    // isa below; both are reduced together by the final >> 5.
    895             sr = _mm_slli_epi16(sr, 2);
    896
    897             // Extract G from src.
    898             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
    899             sg1 = _mm_srli_epi32(sg1, 24);
    900             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
    901             sg2 = _mm_srli_epi32(sg2, 24);
    902             __m128i sg = _mm_packs_epi32(sg1, sg2);
    903
    904             // sg = SkDITHER_G32_FOR_565(sg, d).
                    // Green uses half the dither value and an offset of sg >> 6.
    905             __m128i sg_offset = _mm_srli_epi16(sg, 6);
    906             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
    907             sg = _mm_sub_epi16(sg, sg_offset);
    908
    909             // Expand sg.
                    // Green is scaled by 8 rather than 4.
    910             sg = _mm_slli_epi16(sg, 3);
    911
    912             // Extract B from src.
    913             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
    914             sb1 = _mm_srli_epi32(sb1, 24);
    915             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
    916             sb2 = _mm_srli_epi32(sb2, 24);
    917             __m128i sb = _mm_packs_epi32(sb1, sb2);
    918
    919             // sb = SkDITHER_B32_FOR_565(sb, d).
    920             __m128i sb_offset = _mm_srli_epi16(sb, 5);
    921             sb = _mm_add_epi16(sb, dither_cur);
    922             sb = _mm_sub_epi16(sb, sb_offset);
    923
    924             // Expand sb.
    925             sb = _mm_slli_epi16(sb, 2);
    926
    927             // Extract R G B from dst.
                    // Each 16-bit lane holds one dst pixel; shift the channel down
                    // to bit 0 and mask off the other channels.
    928             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
    929             dr = _mm_and_si128(dr, r16_mask);
    930             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
    931             dg = _mm_and_si128(dg, g16_mask);
    932             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
    933             db = _mm_and_si128(db, b16_mask);
    934
    935             // SkAlpha255To256(255 - a) >> 3
                    // Computed here as (256 - sa) >> 3, a dst scale in the range 0..32.
    936             __m128i isa = _mm_sub_epi16(var256, sa);
    937             isa = _mm_srli_epi16(isa, 3);
    938
                    // Per channel: (dst * isa + expanded src) >> 5.
    939             dr = _mm_mullo_epi16(dr, isa);
    940             dr = _mm_add_epi16(dr, sr);
    941             dr = _mm_srli_epi16(dr, 5);
    942
    943             dg = _mm_mullo_epi16(dg, isa);
    944             dg = _mm_add_epi16(dg, sg);
    945             dg = _mm_srli_epi16(dg, 5);
    946
    947             db = _mm_mullo_epi16(db, isa);
    948             db = _mm_add_epi16(db, sb);
    949             db = _mm_srli_epi16(db, 5);
    950
    951             // Package and store dst pixel.
                    // Aligned store: dst was aligned to 16 bytes by the lead-in loop.
    952             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
    953             _mm_store_si128(d++, d_pixel);
    954
    955             count -= 8;
    956             x += 8;
    957         }
    958
                // Hand the advanced positions back to the scalar tail below.
    959         src = reinterpret_cast<const SkPMColor*>(s);
    960         dst = reinterpret_cast<uint16_t*>(d);
    961     }
    962
            // Scalar tail: the remaining pixels (fewer than 8 after the vector
            // loop, or the whole row when count < 8 on entry). Unlike the lead-in
            // loop, DITHER_565_SCAN(y) is issued once, outside the loop.
    963     if (count > 0) {
    964         DITHER_565_SCAN(y);
    965         do {
    966             SkPMColor c = *src++;
    967             SkPMColorAssert(c);
                    // A zero source pixel leaves dst untouched.
    968             if (c) {
    969                 unsigned a = SkGetPackedA32(c);
    970
                        // Dither value scaled by the source alpha.
    971                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
    972
    973                 unsigned sr = SkGetPackedR32(c);
    974                 unsigned sg = SkGetPackedG32(c);
    975                 unsigned sb = SkGetPackedB32(c);
    976                 sr = SkDITHER_R32_FOR_565(sr, d);
    977                 sg = SkDITHER_G32_FOR_565(sg, d);
    978                 sb = SkDITHER_B32_FOR_565(sb, d);
    979
    980                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
    981                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
    982                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
    983                 // now src and dst expanded are in g:11 r:10 x:1 b:10
    984                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
    985             }
    986             dst += 1;
    987             DITHER_INC_X(x);
    988         } while (--count != 0);
    989     }
    990 }
    991