/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBitmapProcState_opts_SSE2.h"
     10 #include "SkBlitRow_opts_SSE2.h"
     11 #include "SkColorPriv.h"
     12 #include "SkColor_opts_SSE2.h"
     13 #include "SkDither.h"
     14 #include "SkUtils.h"
     15 
     16 /* SSE2 version of S32_Blend_BlitRow32()
     17  * portable version is in core/SkBlitRow_D32.cpp
     18  */
     19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     20                               const SkPMColor* SK_RESTRICT src,
     21                               int count, U8CPU alpha) {
     22     SkASSERT(alpha <= 255);
     23     if (count <= 0) {
     24         return;
     25     }
     26 
     27     uint32_t src_scale = SkAlpha255To256(alpha);
     28     uint32_t dst_scale = 256 - src_scale;
     29 
     30     if (count >= 4) {
     31         SkASSERT(((size_t)dst & 0x03) == 0);
     32         while (((size_t)dst & 0x0F) != 0) {
     33             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     34             src++;
     35             dst++;
     36             count--;
     37         }
     38 
     39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     40         __m128i *d = reinterpret_cast<__m128i*>(dst);
     41 
     42         while (count >= 4) {
     43             // Load 4 pixels each of src and dest.
     44             __m128i src_pixel = _mm_loadu_si128(s);
     45             __m128i dst_pixel = _mm_load_si128(d);
     46 
     47             src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
     48             dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
     49 
     50             // Add result
     51             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     52             _mm_store_si128(d, result);
     53             s++;
     54             d++;
     55             count -= 4;
     56         }
     57         src = reinterpret_cast<const SkPMColor*>(s);
     58         dst = reinterpret_cast<SkPMColor*>(d);
     59     }
     60 
     61     while (count > 0) {
     62         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     63         src++;
     64         dst++;
     65         count--;
     66     }
     67 }
     68 
     69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     70                                 const SkPMColor* SK_RESTRICT src,
     71                                 int count, U8CPU alpha) {
     72     SkASSERT(alpha == 255);
     73     if (count <= 0) {
     74         return;
     75     }
     76 
     77 #ifdef SK_USE_ACCURATE_BLENDING
     78     if (count >= 4) {
     79         SkASSERT(((size_t)dst & 0x03) == 0);
     80         while (((size_t)dst & 0x0F) != 0) {
     81             *dst = SkPMSrcOver(*src, *dst);
     82             src++;
     83             dst++;
     84             count--;
     85         }
     86 
     87         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     88         __m128i *d = reinterpret_cast<__m128i*>(dst);
     89         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     90         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
     91         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
     92         while (count >= 4) {
     93             // Load 4 pixels
     94             __m128i src_pixel = _mm_loadu_si128(s);
     95             __m128i dst_pixel = _mm_load_si128(d);
     96 
     97             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     98             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
     99             // Shift alphas down to lower 8 bits of each quad.
    100             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
    101 
    102             // Copy alpha to upper 3rd byte of each quad
    103             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
    104 
    105             // Subtract alphas from 255, to get 0..255
    106             alpha = _mm_sub_epi16(c_255, alpha);
    107 
    108             // Multiply by red and blue by src alpha.
    109             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    110             // Multiply by alpha and green by src alpha.
    111             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    112 
    113             // dst_rb_low = (dst_rb >> 8)
    114             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
    115             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
    116 
    117             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
    118             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
    119             dst_rb = _mm_add_epi16(dst_rb, c_128);
    120             dst_rb = _mm_srli_epi16(dst_rb, 8);
    121 
    122             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
    123             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
    124             dst_ag = _mm_add_epi16(dst_ag, c_128);
    125             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    126 
    127             // Combine back into RGBA.
    128             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    129 
    130             // Add result
    131             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    132             _mm_store_si128(d, result);
    133             s++;
    134             d++;
    135             count -= 4;
    136         }
    137         src = reinterpret_cast<const SkPMColor*>(s);
    138         dst = reinterpret_cast<SkPMColor*>(d);
    139     }
    140 
    141     while (count > 0) {
    142         *dst = SkPMSrcOver(*src, *dst);
    143         src++;
    144         dst++;
    145         count--;
    146     }
    147 #else
    148     int count16 = count / 16;
    149     __m128i* dst4 = (__m128i*)dst;
    150     const __m128i* src4 = (const __m128i*)src;
    151 
    152     for (int i = 0; i < count16 * 4; i += 4) {
    153         // Load 16 source pixels.
    154         __m128i s0 = _mm_loadu_si128(src4+i+0),
    155                 s1 = _mm_loadu_si128(src4+i+1),
    156                 s2 = _mm_loadu_si128(src4+i+2),
    157                 s3 = _mm_loadu_si128(src4+i+3);
    158 
    159         const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
    160         const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
    161         __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
    162         if (0xffff == _mm_movemask_epi8(cmp)) {
    163             // All 16 source pixels are fully transparent. There's nothing to do!
    164             continue;
    165         }
    166         const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
    167         cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
    168         if (0xffff == _mm_movemask_epi8(cmp)) {
    169             // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
    170             _mm_storeu_si128(dst4+i+0, s0);
    171             _mm_storeu_si128(dst4+i+1, s1);
    172             _mm_storeu_si128(dst4+i+2, s2);
    173             _mm_storeu_si128(dst4+i+3, s3);
    174             continue;
    175         }
    176         // The general slow case: do the blend for all 16 pixels.
    177         _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
    178         _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
    179         _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
    180         _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
    181     }
    182 
    183     // Wrap up the last <= 15 pixels.
    184     SkASSERT(count - (count16*16) <= 15);
    185     for (int i = count16*16; i < count; i++) {
    186         // This check is not really necessarily, but it prevents pointless autovectorization.
    187         if (src[i] & 0xFF000000) {
    188             dst[i] = SkPMSrcOver(src[i], dst[i]);
    189         }
    190     }
    191 #endif
    192 }
    193 
    194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    195                                const SkPMColor* SK_RESTRICT src,
    196                                int count, U8CPU alpha) {
    197     SkASSERT(alpha <= 255);
    198     if (count <= 0) {
    199         return;
    200     }
    201 
    202     if (count >= 4) {
    203         while (((size_t)dst & 0x0F) != 0) {
    204             *dst = SkBlendARGB32(*src, *dst, alpha);
    205             src++;
    206             dst++;
    207             count--;
    208         }
    209 
    210         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    211         __m128i *d = reinterpret_cast<__m128i*>(dst);
    212         while (count >= 4) {
    213             // Load 4 pixels each of src and dest.
    214             __m128i src_pixel = _mm_loadu_si128(s);
    215             __m128i dst_pixel = _mm_load_si128(d);
    216 
    217             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
    218             _mm_store_si128(d, result);
    219             s++;
    220             d++;
    221             count -= 4;
    222         }
    223         src = reinterpret_cast<const SkPMColor*>(s);
    224         dst = reinterpret_cast<SkPMColor*>(d);
    225     }
    226 
    227     while (count > 0) {
    228         *dst = SkBlendARGB32(*src, *dst, alpha);
    229         src++;
    230         dst++;
    231         count--;
    232     }
    233 }
    234 
    235 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    236     SkASSERT(count > 0);
    237 
    238     uint32_t src_expand = (SkGetPackedG32(src) << 24) |
    239                           (SkGetPackedR32(src) << 13) |
    240                           (SkGetPackedB32(src) << 2);
    241     unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
    242 
    243     // Check if we have enough pixels to run SIMD
    244     if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
    245         __m128i* dst_wide;
    246         const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
    247         const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
    248         const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
    249         const __m128i scale_wide = _mm_set1_epi16(scale);
    250         const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
    251         const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
    252 
    253         // Align dst to an even 16 byte address (0-7 pixels)
    254         while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
    255             *dst = SkBlend32_RGB16(src_expand, *dst, scale);
    256             dst += 1;
    257             count--;
    258         }
    259 
    260         dst_wide = reinterpret_cast<__m128i*>(dst);
    261         do {
    262             // Load eight RGB565 pixels
    263             __m128i pixels = _mm_load_si128(dst_wide);
    264 
    265             // Mask out sub-pixels
    266             __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
    267             __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
    268             pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
    269             __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
    270 
    271             // Scale with alpha
    272             pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
    273             pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
    274             pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
    275 
    276             // Add src_X_wide and shift down again
    277             pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
    278             pixel_R = _mm_srli_epi16(pixel_R, 5);
    279             pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
    280             pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
    281             pixel_B = _mm_srli_epi16(pixel_B, 5);
    282 
    283             // Combine into RGB565 and store
    284             pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
    285             pixel_G = _mm_and_si128(pixel_G, mask_green);
    286             pixels = _mm_or_si128(pixel_R, pixel_G);
    287             pixels = _mm_or_si128(pixels, pixel_B);
    288             _mm_store_si128(dst_wide, pixels);
    289             count -= 8;
    290             dst_wide++;
    291         } while (count >= 8);
    292 
    293         dst = reinterpret_cast<uint16_t*>(dst_wide);
    294     }
    295 
    296     // Small loop to handle remaining pixels.
    297     while (count > 0) {
    298         *dst = SkBlend32_RGB16(src_expand, *dst, scale);
    299         dst += 1;
    300         count--;
    301     }
    302 }
    303 
    304 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
    305                                size_t maskRB, SkColor origColor,
    306                                int width, int height) {
    307     SkPMColor color = SkPreMultiplyColor(origColor);
    308     size_t dstOffset = dstRB - (width << 2);
    309     size_t maskOffset = maskRB - width;
    310     SkPMColor* dst = (SkPMColor *)device;
    311     const uint8_t* mask = (const uint8_t*)maskPtr;
    312     do {
    313         int count = width;
    314         if (count >= 4) {
    315             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
    316                 *dst = SkBlendARGB32(color, *dst, *mask);
    317                 mask++;
    318                 dst++;
    319                 count--;
    320             }
    321             __m128i *d = reinterpret_cast<__m128i*>(dst);
    322             __m128i src_pixel = _mm_set1_epi32(color);
    323             while (count >= 4) {
    324                 // Load 4 dst pixels
    325                 __m128i dst_pixel = _mm_load_si128(d);
    326 
    327                 // Set the alpha value
    328                 __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));
    329                 alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());
    330                 alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());
    331 
    332                 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);
    333                 _mm_store_si128(d, result);
    334                 // Load the next 4 dst pixels and alphas
    335                 mask = mask + 4;
    336                 d++;
    337                 count -= 4;
    338             }
    339             dst = reinterpret_cast<SkPMColor*>(d);
    340         }
    341         while (count > 0) {
    342             *dst= SkBlendARGB32(color, *dst, *mask);
    343             dst += 1;
    344             mask++;
    345             count --;
    346         }
    347         dst = (SkPMColor *)((char*)dst + dstOffset);
    348         mask += maskOffset;
    349     } while (--height != 0);
    350 }
    351 
    352 // The following (left) shifts cause the top 5 bits of the mask components to
    353 // line up with the corresponding components in an SkPMColor.
    354 // Note that the mask's RGB16 order may differ from the SkPMColor order.
    355 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    356 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    357 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
    358 
    359 #if SK_R16x5_R32x5_SHIFT == 0
    360     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
    361 #elif SK_R16x5_R32x5_SHIFT > 0
    362     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
    363 #else
    364     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
    365 #endif
    366 
    367 #if SK_G16x5_G32x5_SHIFT == 0
    368     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
    369 #elif SK_G16x5_G32x5_SHIFT > 0
    370     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
    371 #else
    372     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
    373 #endif
    374 
    375 #if SK_B16x5_B32x5_SHIFT == 0
    376     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
    377 #elif SK_B16x5_B32x5_SHIFT > 0
    378     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
    379 #else
    380     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
    381 #endif
    382 
    383 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
    384                                  __m128i &mask, __m128i &srcA) {
    385     // In the following comments, the components of src, dst and mask are
    386     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    387     // by an R, G, B, or A suffix. Components of one of the four pixels that
    388     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    389     // example is the blue channel of the second destination pixel. Memory
    390     // layout is shown for an ARGB byte order in a color value.
    391 
    392     // src and srcA store 8-bit values interleaved with zeros.
    393     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    394     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    395     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    396     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    397     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    398     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    399     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    400 
    401     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    402     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    403     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    404                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    405 
    406     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    407     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    408                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    409 
    410     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    411     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    412                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    413 
    414     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    415     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    416     // 8-bit position
    417     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    418     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    419     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    420 
    421     // Interleave R,G,B into the lower byte of word.
    422     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    423     // 16-bit values, padded by zero.
    424     __m128i maskLo, maskHi;
    425     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    426     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    427     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    428     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    429 
    430     // Upscale from 0..31 to 0..32
    431     // (allows to replace division by left-shift further down)
    432     // Left-shift each component by 4 and add the result back to that component,
    433     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    434     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    435     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    436 
    437     // Multiply each component of maskLo and maskHi by srcA
    438     maskLo = _mm_mullo_epi16(maskLo, srcA);
    439     maskHi = _mm_mullo_epi16(maskHi, srcA);
    440 
    441     // Left shift mask components by 8 (divide by 256)
    442     maskLo = _mm_srli_epi16(maskLo, 8);
    443     maskHi = _mm_srli_epi16(maskHi, 8);
    444 
    445     // Interleave R,G,B into the lower byte of the word
    446     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    447     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    448     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    449     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    450 
    451     // mask = (src - dst) * mask
    452     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    453     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    454 
    455     // mask = (src - dst) * mask >> 5
    456     maskLo = _mm_srai_epi16(maskLo, 5);
    457     maskHi = _mm_srai_epi16(maskHi, 5);
    458 
    459     // Add two pixels into result.
    460     // result = dst + ((src - dst) * mask >> 5)
    461     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    462     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    463 
    464     // Pack into 4 32bit dst pixels.
    465     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    466     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    467     // clamping to 255 if necessary.
    468     return _mm_packus_epi16(resultLo, resultHi);
    469 }
    470 
    471 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
    472                                        __m128i &mask) {
    473     // In the following comments, the components of src, dst and mask are
    474     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    475     // by an R, G, B, or A suffix. Components of one of the four pixels that
    476     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    477     // example is the blue channel of the second destination pixel. Memory
    478     // layout is shown for an ARGB byte order in a color value.
    479 
    480     // src and srcA store 8-bit values interleaved with zeros.
    481     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    482     // mask stores 16-bit values (shown as high and low bytes) interleaved with
    483     // zeros
    484     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    485     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    486 
    487     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    488     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    489     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    490                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    491 
    492     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    493     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    494                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    495 
    496     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    497     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    498                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    499 
    500     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    501     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    502     // 8-bit position
    503     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    504     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    505     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    506 
    507     // Interleave R,G,B into the lower byte of word.
    508     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    509     // 16-bit values, padded by zero.
    510     __m128i maskLo, maskHi;
    511     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    512     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    513     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    514     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    515 
    516     // Upscale from 0..31 to 0..32
    517     // (allows to replace division by left-shift further down)
    518     // Left-shift each component by 4 and add the result back to that component,
    519     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    520     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    521     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    522 
    523     // Interleave R,G,B into the lower byte of the word
    524     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    525     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    526     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    527     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    528 
    529     // mask = (src - dst) * mask
    530     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    531     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    532 
    533     // mask = (src - dst) * mask >> 5
    534     maskLo = _mm_srai_epi16(maskLo, 5);
    535     maskHi = _mm_srai_epi16(maskHi, 5);
    536 
    537     // Add two pixels into result.
    538     // result = dst + ((src - dst) * mask >> 5)
    539     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    540     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    541 
    542     // Pack into 4 32bit dst pixels and force opaque.
    543     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    544     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    545     // clamping to 255 if necessary. Set alpha components to 0xFF.
    546     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
    547                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
    548 }
    549 
    550 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
    551                          SkColor src, int width, SkPMColor) {
    552     if (width <= 0) {
    553         return;
    554     }
    555 
    556     int srcA = SkColorGetA(src);
    557     int srcR = SkColorGetR(src);
    558     int srcG = SkColorGetG(src);
    559     int srcB = SkColorGetB(src);
    560 
    561     srcA = SkAlpha255To256(srcA);
    562 
    563     if (width >= 4) {
    564         SkASSERT(((size_t)dst & 0x03) == 0);
    565         while (((size_t)dst & 0x0F) != 0) {
    566             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    567             mask++;
    568             dst++;
    569             width--;
    570         }
    571 
    572         __m128i *d = reinterpret_cast<__m128i*>(dst);
    573         // Set alpha to 0xFF and replicate source four times in SSE register.
    574         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    575         // Interleave with zeros to get two sets of four 16-bit values.
    576         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    577         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    578         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    579         __m128i srcA_sse = _mm_set1_epi16(srcA);
    580         while (width >= 4) {
    581             // Load four destination pixels into dst_sse.
    582             __m128i dst_sse = _mm_load_si128(d);
    583             // Load four 16-bit masks into lower half of mask_sse.
    584             __m128i mask_sse = _mm_loadl_epi64(
    585                                    reinterpret_cast<const __m128i*>(mask));
    586 
    587             // Check whether masks are equal to 0 and get the highest bit
    588             // of each byte of result, if masks are all zero, we will get
    589             // pack_cmp to 0xFFFF
    590             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    591                                              _mm_setzero_si128()));
    592 
    593             // if mask pixels are not all zero, we will blend the dst pixels
    594             if (pack_cmp != 0xFFFF) {
    595                 // Unpack 4 16bit mask pixels to
    596                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    597                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    598                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    599                                               _mm_setzero_si128());
    600 
    601                 // Process 4 32bit dst pixels
    602                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
    603                                                    mask_sse, srcA_sse);
    604                 _mm_store_si128(d, result);
    605             }
    606 
    607             d++;
    608             mask += 4;
    609             width -= 4;
    610         }
    611 
    612         dst = reinterpret_cast<SkPMColor*>(d);
    613     }
    614 
    615     while (width > 0) {
    616         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    617         mask++;
    618         dst++;
    619         width--;
    620     }
    621 }
    622 
    623 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
    624                                SkColor src, int width, SkPMColor opaqueDst) {
    625     if (width <= 0) {
    626         return;
    627     }
    628 
    629     int srcR = SkColorGetR(src);
    630     int srcG = SkColorGetG(src);
    631     int srcB = SkColorGetB(src);
    632 
    633     if (width >= 4) {
    634         SkASSERT(((size_t)dst & 0x03) == 0);
    635         while (((size_t)dst & 0x0F) != 0) {
    636             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    637             mask++;
    638             dst++;
    639             width--;
    640         }
    641 
    642         __m128i *d = reinterpret_cast<__m128i*>(dst);
    643         // Set alpha to 0xFF and replicate source four times in SSE register.
    644         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    645         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    646         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    647         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    648         while (width >= 4) {
    649             // Load four destination pixels into dst_sse.
    650             __m128i dst_sse = _mm_load_si128(d);
    651             // Load four 16-bit masks into lower half of mask_sse.
    652             __m128i mask_sse = _mm_loadl_epi64(
    653                                    reinterpret_cast<const __m128i*>(mask));
    654 
    655             // Check whether masks are equal to 0 and get the highest bit
    656             // of each byte of result, if masks are all zero, we will get
    657             // pack_cmp to 0xFFFF
    658             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    659                                              _mm_setzero_si128()));
    660 
    661             // if mask pixels are not all zero, we will blend the dst pixels
    662             if (pack_cmp != 0xFFFF) {
    663                 // Unpack 4 16bit mask pixels to
    664                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    665                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    666                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    667                                               _mm_setzero_si128());
    668 
    669                 // Process 4 32bit dst pixels
    670                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
    671                                                          mask_sse);
    672                 _mm_store_si128(d, result);
    673             }
    674 
    675             d++;
    676             mask += 4;
    677             width -= 4;
    678         }
    679 
    680         dst = reinterpret_cast<SkPMColor*>(d);
    681     }
    682 
    683     while (width > 0) {
    684         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    685         mask++;
    686         dst++;
    687         width--;
    688     }
    689 }
    690 
    691 /* SSE2 version of S32_D565_Opaque()
    692  * portable version is in core/SkBlitRow_D16.cpp
    693  */
    694 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
    695                           const SkPMColor* SK_RESTRICT src, int count,
    696                           U8CPU alpha, int /*x*/, int /*y*/) {
    697     SkASSERT(255 == alpha);
    698 
    699     if (count <= 0) {
    700         return;
    701     }
    702 
    703     if (count >= 8) {
    704         while (((size_t)dst & 0x0F) != 0) {
    705             SkPMColor c = *src++;
    706             SkPMColorAssert(c);
    707 
    708             *dst++ = SkPixel32ToPixel16_ToU16(c);
    709             count--;
    710         }
    711 
    712         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    713         __m128i* d = reinterpret_cast<__m128i*>(dst);
    714 
    715         while (count >= 8) {
    716             // Load 8 pixels of src.
    717             __m128i src_pixel1 = _mm_loadu_si128(s++);
    718             __m128i src_pixel2 = _mm_loadu_si128(s++);
    719 
    720             __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
    721             _mm_store_si128(d++, d_pixel);
    722             count -= 8;
    723         }
    724         src = reinterpret_cast<const SkPMColor*>(s);
    725         dst = reinterpret_cast<uint16_t*>(d);
    726     }
    727 
    728     if (count > 0) {
    729         do {
    730             SkPMColor c = *src++;
    731             SkPMColorAssert(c);
    732             *dst++ = SkPixel32ToPixel16_ToU16(c);
    733         } while (--count != 0);
    734     }
    735 }
    736 
    737 /* SSE2 version of S32A_D565_Opaque()
    738  * portable version is in core/SkBlitRow_D16.cpp
    739  */
    740 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
    741                            const SkPMColor* SK_RESTRICT src,
    742                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
    743     SkASSERT(255 == alpha);
    744 
    745     if (count <= 0) {
    746         return;
    747     }
    748 
    749     if (count >= 8) {
    750         // Make dst 16 bytes alignment
    751         while (((size_t)dst & 0x0F) != 0) {
    752             SkPMColor c = *src++;
    753             if (c) {
    754               *dst = SkSrcOver32To16(c, *dst);
    755             }
    756             dst += 1;
    757             count--;
    758         }
    759 
    760         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    761         __m128i* d = reinterpret_cast<__m128i*>(dst);
    762         __m128i var255 = _mm_set1_epi16(255);
    763         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
    764         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
    765         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
    766 
    767         while (count >= 8) {
    768             // Load 8 pixels of src.
    769             __m128i src_pixel1 = _mm_loadu_si128(s++);
    770             __m128i src_pixel2 = _mm_loadu_si128(s++);
    771 
    772             // Check whether src pixels are equal to 0 and get the highest bit
    773             // of each byte of result, if src pixels are all zero, src_cmp1 and
    774             // src_cmp2 will be 0xFFFF.
    775             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
    776                                              _mm_setzero_si128()));
    777             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
    778                                              _mm_setzero_si128()));
    779             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
    780                 d++;
    781                 count -= 8;
    782                 continue;
    783             }
    784 
    785             // Load 8 pixels of dst.
    786             __m128i dst_pixel = _mm_load_si128(d);
    787 
    788             // Extract A from src.
    789             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
    790             sa1 = _mm_srli_epi32(sa1, 24);
    791             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
    792             sa2 = _mm_srli_epi32(sa2, 24);
    793             __m128i sa = _mm_packs_epi32(sa1, sa2);
    794 
    795             // Extract R from src.
    796             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
    797             sr1 = _mm_srli_epi32(sr1, 24);
    798             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
    799             sr2 = _mm_srli_epi32(sr2, 24);
    800             __m128i sr = _mm_packs_epi32(sr1, sr2);
    801 
    802             // Extract G from src.
    803             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
    804             sg1 = _mm_srli_epi32(sg1, 24);
    805             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
    806             sg2 = _mm_srli_epi32(sg2, 24);
    807             __m128i sg = _mm_packs_epi32(sg1, sg2);
    808 
    809             // Extract B from src.
    810             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
    811             sb1 = _mm_srli_epi32(sb1, 24);
    812             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
    813             sb2 = _mm_srli_epi32(sb2, 24);
    814             __m128i sb = _mm_packs_epi32(sb1, sb2);
    815 
    816             // Extract R G B from dst.
    817             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
    818             dr = _mm_and_si128(dr, r16_mask);
    819             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
    820             dg = _mm_and_si128(dg, g16_mask);
    821             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
    822             db = _mm_and_si128(db, b16_mask);
    823 
    824             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
    825 
    826             // Calculate R G B of result.
    827             // Original algorithm is in SkSrcOver32To16().
    828             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
    829             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
    830             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
    831             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
    832             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
    833             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
    834 
    835             // Pack R G B into 16-bit color.
    836             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
    837 
    838             // Store 8 16-bit colors in dst.
    839             _mm_store_si128(d++, d_pixel);
    840             count -= 8;
    841         }
    842 
    843         src = reinterpret_cast<const SkPMColor*>(s);
    844         dst = reinterpret_cast<uint16_t*>(d);
    845     }
    846 
    847     if (count > 0) {
    848         do {
    849             SkPMColor c = *src++;
    850             SkPMColorAssert(c);
    851             if (c) {
    852                 *dst = SkSrcOver32To16(c, *dst);
    853             }
    854             dst += 1;
    855         } while (--count != 0);
    856     }
    857 }
    858 
    859 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
    860                                  const SkPMColor* SK_RESTRICT src,
    861                                  int count, U8CPU alpha, int x, int y) {
    862     SkASSERT(255 == alpha);
    863 
    864     if (count <= 0) {
    865         return;
    866     }
    867 
    868     if (count >= 8) {
    869         while (((size_t)dst & 0x0F) != 0) {
    870             DITHER_565_SCAN(y);
    871             SkPMColor c = *src++;
    872             SkPMColorAssert(c);
    873 
    874             unsigned dither = DITHER_VALUE(x);
    875             *dst++ = SkDitherRGB32To565(c, dither);
    876             DITHER_INC_X(x);
    877             count--;
    878         }
    879 
    880         unsigned short dither_value[8];
    881         __m128i dither;
    882 #ifdef ENABLE_DITHER_MATRIX_4X4
    883         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
    884         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
    885         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
    886         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
    887         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
    888 #else
    889         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
    890         dither_value[0] = dither_value[4] = (dither_scan
    891                                              >> (((x) & 3) << 2)) & 0xF;
    892         dither_value[1] = dither_value[5] = (dither_scan
    893                                              >> (((x + 1) & 3) << 2)) & 0xF;
    894         dither_value[2] = dither_value[6] = (dither_scan
    895                                              >> (((x + 2) & 3) << 2)) & 0xF;
    896         dither_value[3] = dither_value[7] = (dither_scan
    897                                              >> (((x + 3) & 3) << 2)) & 0xF;
    898 #endif
    899         dither = _mm_loadu_si128((__m128i*) dither_value);
    900 
    901         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    902         __m128i* d = reinterpret_cast<__m128i*>(dst);
    903 
    904         while (count >= 8) {
    905             // Load 8 pixels of src.
    906             __m128i src_pixel1 = _mm_loadu_si128(s++);
    907             __m128i src_pixel2 = _mm_loadu_si128(s++);
    908 
    909             // Extract R from src.
    910             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
    911             sr1 = _mm_srli_epi32(sr1, 24);
    912             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
    913             sr2 = _mm_srli_epi32(sr2, 24);
    914             __m128i sr = _mm_packs_epi32(sr1, sr2);
    915 
    916             // SkDITHER_R32To565(sr, dither)
    917             __m128i sr_offset = _mm_srli_epi16(sr, 5);
    918             sr = _mm_add_epi16(sr, dither);
    919             sr = _mm_sub_epi16(sr, sr_offset);
    920             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
    921 
    922             // Extract G from src.
    923             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
    924             sg1 = _mm_srli_epi32(sg1, 24);
    925             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
    926             sg2 = _mm_srli_epi32(sg2, 24);
    927             __m128i sg = _mm_packs_epi32(sg1, sg2);
    928 
    929             // SkDITHER_R32To565(sg, dither)
    930             __m128i sg_offset = _mm_srli_epi16(sg, 6);
    931             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
    932             sg = _mm_sub_epi16(sg, sg_offset);
    933             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
    934 
    935             // Extract B from src.
    936             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
    937             sb1 = _mm_srli_epi32(sb1, 24);
    938             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
    939             sb2 = _mm_srli_epi32(sb2, 24);
    940             __m128i sb = _mm_packs_epi32(sb1, sb2);
    941 
    942             // SkDITHER_R32To565(sb, dither)
    943             __m128i sb_offset = _mm_srli_epi16(sb, 5);
    944             sb = _mm_add_epi16(sb, dither);
    945             sb = _mm_sub_epi16(sb, sb_offset);
    946             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
    947 
    948             // Pack and store 16-bit dst pixel.
    949             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
    950             _mm_store_si128(d++, d_pixel);
    951 
    952             count -= 8;
    953             x += 8;
    954         }
    955 
    956         src = reinterpret_cast<const SkPMColor*>(s);
    957         dst = reinterpret_cast<uint16_t*>(d);
    958     }
    959 
    960     if (count > 0) {
    961         DITHER_565_SCAN(y);
    962         do {
    963             SkPMColor c = *src++;
    964             SkPMColorAssert(c);
    965 
    966             unsigned dither = DITHER_VALUE(x);
    967             *dst++ = SkDitherRGB32To565(c, dither);
    968             DITHER_INC_X(x);
    969         } while (--count != 0);
    970     }
    971 }
    972 
    973 /* SSE2 version of S32A_D565_Opaque_Dither()
    974  * portable version is in core/SkBlitRow_D16.cpp
    975  */
    976 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
    977                                   const SkPMColor* SK_RESTRICT src,
    978                                   int count, U8CPU alpha, int x, int y) {
    979     SkASSERT(255 == alpha);
    980 
    981     if (count <= 0) {
    982         return;
    983     }
    984 
    985     if (count >= 8) {
    986         while (((size_t)dst & 0x0F) != 0) {
    987             DITHER_565_SCAN(y);
    988             SkPMColor c = *src++;
    989             SkPMColorAssert(c);
    990             if (c) {
    991                 unsigned a = SkGetPackedA32(c);
    992 
    993                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
    994 
    995                 unsigned sr = SkGetPackedR32(c);
    996                 unsigned sg = SkGetPackedG32(c);
    997                 unsigned sb = SkGetPackedB32(c);
    998                 sr = SkDITHER_R32_FOR_565(sr, d);
    999                 sg = SkDITHER_G32_FOR_565(sg, d);
   1000                 sb = SkDITHER_B32_FOR_565(sb, d);
   1001 
   1002                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1003                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
   1004                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1005                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1006                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1007             }
   1008             dst += 1;
   1009             DITHER_INC_X(x);
   1010             count--;
   1011         }
   1012 
   1013         unsigned short dither_value[8];
   1014         __m128i dither, dither_cur;
   1015 #ifdef ENABLE_DITHER_MATRIX_4X4
   1016         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
   1017         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
   1018         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
   1019         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
   1020         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
   1021 #else
   1022         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
   1023         dither_value[0] = dither_value[4] = (dither_scan
   1024                                              >> (((x) & 3) << 2)) & 0xF;
   1025         dither_value[1] = dither_value[5] = (dither_scan
   1026                                              >> (((x + 1) & 3) << 2)) & 0xF;
   1027         dither_value[2] = dither_value[6] = (dither_scan
   1028                                              >> (((x + 2) & 3) << 2)) & 0xF;
   1029         dither_value[3] = dither_value[7] = (dither_scan
   1030                                              >> (((x + 3) & 3) << 2)) & 0xF;
   1031 #endif
   1032         dither = _mm_loadu_si128((__m128i*) dither_value);
   1033 
   1034         const __m128i* s = reinterpret_cast<const __m128i*>(src);
   1035         __m128i* d = reinterpret_cast<__m128i*>(dst);
   1036         __m128i var256 = _mm_set1_epi16(256);
   1037         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
   1038         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
   1039         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
   1040 
   1041         while (count >= 8) {
   1042             // Load 8 pixels of src and dst.
   1043             __m128i src_pixel1 = _mm_loadu_si128(s++);
   1044             __m128i src_pixel2 = _mm_loadu_si128(s++);
   1045             __m128i dst_pixel = _mm_load_si128(d);
   1046 
   1047             // Extract A from src.
   1048             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
   1049             sa1 = _mm_srli_epi32(sa1, 24);
   1050             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
   1051             sa2 = _mm_srli_epi32(sa2, 24);
   1052             __m128i sa = _mm_packs_epi32(sa1, sa2);
   1053 
   1054             // Calculate current dither value.
   1055             dither_cur = _mm_mullo_epi16(dither,
   1056                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
   1057             dither_cur = _mm_srli_epi16(dither_cur, 8);
   1058 
   1059             // Extract R from src.
   1060             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
   1061             sr1 = _mm_srli_epi32(sr1, 24);
   1062             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
   1063             sr2 = _mm_srli_epi32(sr2, 24);
   1064             __m128i sr = _mm_packs_epi32(sr1, sr2);
   1065 
   1066             // SkDITHER_R32_FOR_565(sr, d)
   1067             __m128i sr_offset = _mm_srli_epi16(sr, 5);
   1068             sr = _mm_add_epi16(sr, dither_cur);
   1069             sr = _mm_sub_epi16(sr, sr_offset);
   1070 
   1071             // Expand sr.
   1072             sr = _mm_slli_epi16(sr, 2);
   1073 
   1074             // Extract G from src.
   1075             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
   1076             sg1 = _mm_srli_epi32(sg1, 24);
   1077             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
   1078             sg2 = _mm_srli_epi32(sg2, 24);
   1079             __m128i sg = _mm_packs_epi32(sg1, sg2);
   1080 
   1081             // sg = SkDITHER_G32_FOR_565(sg, d).
   1082             __m128i sg_offset = _mm_srli_epi16(sg, 6);
   1083             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
   1084             sg = _mm_sub_epi16(sg, sg_offset);
   1085 
   1086             // Expand sg.
   1087             sg = _mm_slli_epi16(sg, 3);
   1088 
   1089             // Extract B from src.
   1090             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
   1091             sb1 = _mm_srli_epi32(sb1, 24);
   1092             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
   1093             sb2 = _mm_srli_epi32(sb2, 24);
   1094             __m128i sb = _mm_packs_epi32(sb1, sb2);
   1095 
   1096             // sb = SkDITHER_B32_FOR_565(sb, d).
   1097             __m128i sb_offset = _mm_srli_epi16(sb, 5);
   1098             sb = _mm_add_epi16(sb, dither_cur);
   1099             sb = _mm_sub_epi16(sb, sb_offset);
   1100 
   1101             // Expand sb.
   1102             sb = _mm_slli_epi16(sb, 2);
   1103 
   1104             // Extract R G B from dst.
   1105             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
   1106             dr = _mm_and_si128(dr, r16_mask);
   1107             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
   1108             dg = _mm_and_si128(dg, g16_mask);
   1109             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
   1110             db = _mm_and_si128(db, b16_mask);
   1111 
   1112             // SkAlpha255To256(255 - a) >> 3
   1113             __m128i isa = _mm_sub_epi16(var256, sa);
   1114             isa = _mm_srli_epi16(isa, 3);
   1115 
   1116             dr = _mm_mullo_epi16(dr, isa);
   1117             dr = _mm_add_epi16(dr, sr);
   1118             dr = _mm_srli_epi16(dr, 5);
   1119 
   1120             dg = _mm_mullo_epi16(dg, isa);
   1121             dg = _mm_add_epi16(dg, sg);
   1122             dg = _mm_srli_epi16(dg, 5);
   1123 
   1124             db = _mm_mullo_epi16(db, isa);
   1125             db = _mm_add_epi16(db, sb);
   1126             db = _mm_srli_epi16(db, 5);
   1127 
   1128             // Package and store dst pixel.
   1129             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
   1130             _mm_store_si128(d++, d_pixel);
   1131 
   1132             count -= 8;
   1133             x += 8;
   1134         }
   1135 
   1136         src = reinterpret_cast<const SkPMColor*>(s);
   1137         dst = reinterpret_cast<uint16_t*>(d);
   1138     }
   1139 
   1140     if (count > 0) {
   1141         DITHER_565_SCAN(y);
   1142         do {
   1143             SkPMColor c = *src++;
   1144             SkPMColorAssert(c);
   1145             if (c) {
   1146                 unsigned a = SkGetPackedA32(c);
   1147 
   1148                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
   1149 
   1150                 unsigned sr = SkGetPackedR32(c);
   1151                 unsigned sg = SkGetPackedG32(c);
   1152                 unsigned sb = SkGetPackedB32(c);
   1153                 sr = SkDITHER_R32_FOR_565(sr, d);
   1154                 sg = SkDITHER_G32_FOR_565(sg, d);
   1155                 sb = SkDITHER_B32_FOR_565(sb, d);
   1156 
   1157                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1158                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
   1159                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1160                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1161                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1162             }
   1163             dst += 1;
   1164             DITHER_INC_X(x);
   1165         } while (--count != 0);
   1166     }
   1167 }
   1168