Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2012 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBitmapProcState_opts_SSE2.h"
     10 #include "SkBlitRow_opts_SSE2.h"
     11 #include "SkColorPriv.h"
     12 #include "SkColor_opts_SSE2.h"
     13 #include "SkDither.h"
     14 #include "SkMSAN.h"
     15 #include "SkUtils.h"
     16 
     17 /* SSE2 version of S32_Blend_BlitRow32()
     18  * portable version is in core/SkBlitRow_D32.cpp
     19  */
     20 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     21                               const SkPMColor* SK_RESTRICT src,
     22                               int count, U8CPU alpha) {
     23     SkASSERT(alpha <= 255);
     24     if (count <= 0) {
     25         return;
     26     }
     27 
     28     uint32_t src_scale = SkAlpha255To256(alpha);
     29     uint32_t dst_scale = 256 - src_scale;
     30 
     31     if (count >= 4) {
     32         SkASSERT(((size_t)dst & 0x03) == 0);
     33         while (((size_t)dst & 0x0F) != 0) {
     34             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     35             src++;
     36             dst++;
     37             count--;
     38         }
     39 
     40         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     41         __m128i *d = reinterpret_cast<__m128i*>(dst);
     42 
     43         while (count >= 4) {
     44             // Load 4 pixels each of src and dest.
     45             __m128i src_pixel = _mm_loadu_si128(s);
     46             __m128i dst_pixel = _mm_load_si128(d);
     47 
     48             src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
     49             dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
     50 
     51             // Add result
     52             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     53             _mm_store_si128(d, result);
     54             s++;
     55             d++;
     56             count -= 4;
     57         }
     58         src = reinterpret_cast<const SkPMColor*>(s);
     59         dst = reinterpret_cast<SkPMColor*>(d);
     60     }
     61 
     62     while (count > 0) {
     63         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     64         src++;
     65         dst++;
     66         count--;
     67     }
     68 }
     69 
     70 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     71                                 const SkPMColor* SK_RESTRICT src,
     72                                 int count, U8CPU alpha) {
     73     sk_msan_assert_initialized(src, src+count);
     74 
     75     SkASSERT(alpha == 255);
     76     if (count <= 0) {
     77         return;
     78     }
     79 
     80 #ifdef SK_USE_ACCURATE_BLENDING
     81     if (count >= 4) {
     82         SkASSERT(((size_t)dst & 0x03) == 0);
     83         while (((size_t)dst & 0x0F) != 0) {
     84             *dst = SkPMSrcOver(*src, *dst);
     85             src++;
     86             dst++;
     87             count--;
     88         }
     89 
     90         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     91         __m128i *d = reinterpret_cast<__m128i*>(dst);
     92         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     93         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
     94         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
     95         while (count >= 4) {
     96             // Load 4 pixels
     97             __m128i src_pixel = _mm_loadu_si128(s);
     98             __m128i dst_pixel = _mm_load_si128(d);
     99 
    100             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    101             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    102             // Shift alphas down to lower 8 bits of each quad.
    103             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
    104 
    105             // Copy alpha to upper 3rd byte of each quad
    106             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
    107 
    108             // Subtract alphas from 255, to get 0..255
    109             alpha = _mm_sub_epi16(c_255, alpha);
    110 
    111             // Multiply by red and blue by src alpha.
    112             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    113             // Multiply by alpha and green by src alpha.
    114             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    115 
    116             // dst_rb_low = (dst_rb >> 8)
    117             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
    118             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
    119 
    120             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
    121             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
    122             dst_rb = _mm_add_epi16(dst_rb, c_128);
    123             dst_rb = _mm_srli_epi16(dst_rb, 8);
    124 
    125             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
    126             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
    127             dst_ag = _mm_add_epi16(dst_ag, c_128);
    128             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    129 
    130             // Combine back into RGBA.
    131             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    132 
    133             // Add result
    134             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    135             _mm_store_si128(d, result);
    136             s++;
    137             d++;
    138             count -= 4;
    139         }
    140         src = reinterpret_cast<const SkPMColor*>(s);
    141         dst = reinterpret_cast<SkPMColor*>(d);
    142     }
    143 
    144     while (count > 0) {
    145         *dst = SkPMSrcOver(*src, *dst);
    146         src++;
    147         dst++;
    148         count--;
    149     }
    150 #else
    151     int count16 = count / 16;
    152     __m128i* dst4 = (__m128i*)dst;
    153     const __m128i* src4 = (const __m128i*)src;
    154 
    155     for (int i = 0; i < count16 * 4; i += 4) {
    156         // Load 16 source pixels.
    157         __m128i s0 = _mm_loadu_si128(src4+i+0),
    158                 s1 = _mm_loadu_si128(src4+i+1),
    159                 s2 = _mm_loadu_si128(src4+i+2),
    160                 s3 = _mm_loadu_si128(src4+i+3);
    161 
    162         const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
    163         const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
    164         __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
    165         if (0xffff == _mm_movemask_epi8(cmp)) {
    166             // All 16 source pixels are fully transparent. There's nothing to do!
    167             continue;
    168         }
    169         const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
    170         cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
    171         if (0xffff == _mm_movemask_epi8(cmp)) {
    172             // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
    173             _mm_storeu_si128(dst4+i+0, s0);
    174             _mm_storeu_si128(dst4+i+1, s1);
    175             _mm_storeu_si128(dst4+i+2, s2);
    176             _mm_storeu_si128(dst4+i+3, s3);
    177             continue;
    178         }
    179         // The general slow case: do the blend for all 16 pixels.
    180         _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
    181         _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
    182         _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
    183         _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
    184     }
    185 
    186     // Wrap up the last <= 15 pixels.
    187     SkASSERT(count - (count16*16) <= 15);
    188     for (int i = count16*16; i < count; i++) {
    189         // This check is not really necessarily, but it prevents pointless autovectorization.
    190         if (src[i] & 0xFF000000) {
    191             dst[i] = SkPMSrcOver(src[i], dst[i]);
    192         }
    193     }
    194 #endif
    195 }
    196 
    197 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    198                                const SkPMColor* SK_RESTRICT src,
    199                                int count, U8CPU alpha) {
    200     SkASSERT(alpha <= 255);
    201     if (count <= 0) {
    202         return;
    203     }
    204 
    205     if (count >= 4) {
    206         while (((size_t)dst & 0x0F) != 0) {
    207             *dst = SkBlendARGB32(*src, *dst, alpha);
    208             src++;
    209             dst++;
    210             count--;
    211         }
    212 
    213         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    214         __m128i *d = reinterpret_cast<__m128i*>(dst);
    215         while (count >= 4) {
    216             // Load 4 pixels each of src and dest.
    217             __m128i src_pixel = _mm_loadu_si128(s);
    218             __m128i dst_pixel = _mm_load_si128(d);
    219 
    220             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
    221             _mm_store_si128(d, result);
    222             s++;
    223             d++;
    224             count -= 4;
    225         }
    226         src = reinterpret_cast<const SkPMColor*>(s);
    227         dst = reinterpret_cast<SkPMColor*>(d);
    228     }
    229 
    230     while (count > 0) {
    231         *dst = SkBlendARGB32(*src, *dst, alpha);
    232         src++;
    233         dst++;
    234         count--;
    235     }
    236 }
    237 
    238 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    239     SkASSERT(count > 0);
    240 
    241     uint32_t src_expand = (SkGetPackedG32(src) << 24) |
    242                           (SkGetPackedR32(src) << 13) |
    243                           (SkGetPackedB32(src) << 2);
    244     unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
    245 
    246     // Check if we have enough pixels to run SIMD
    247     if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
    248         __m128i* dst_wide;
    249         const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
    250         const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
    251         const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
    252         const __m128i scale_wide = _mm_set1_epi16(scale);
    253         const __m128i mask_blue  = _mm_set1_epi16(SK_B16_MASK);
    254         const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
    255 
    256         // Align dst to an even 16 byte address (0-7 pixels)
    257         while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
    258             *dst = SkBlend32_RGB16(src_expand, *dst, scale);
    259             dst += 1;
    260             count--;
    261         }
    262 
    263         dst_wide = reinterpret_cast<__m128i*>(dst);
    264         do {
    265             // Load eight RGB565 pixels
    266             __m128i pixels = _mm_load_si128(dst_wide);
    267 
    268             // Mask out sub-pixels
    269             __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
    270             __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
    271             pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
    272             __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
    273 
    274             // Scale with alpha
    275             pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
    276             pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
    277             pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
    278 
    279             // Add src_X_wide and shift down again
    280             pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
    281             pixel_R = _mm_srli_epi16(pixel_R, 5);
    282             pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
    283             pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
    284             pixel_B = _mm_srli_epi16(pixel_B, 5);
    285 
    286             // Combine into RGB565 and store
    287             pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
    288             pixel_G = _mm_and_si128(pixel_G, mask_green);
    289             pixels = _mm_or_si128(pixel_R, pixel_G);
    290             pixels = _mm_or_si128(pixels, pixel_B);
    291             _mm_store_si128(dst_wide, pixels);
    292             count -= 8;
    293             dst_wide++;
    294         } while (count >= 8);
    295 
    296         dst = reinterpret_cast<uint16_t*>(dst_wide);
    297     }
    298 
    299     // Small loop to handle remaining pixels.
    300     while (count > 0) {
    301         *dst = SkBlend32_RGB16(src_expand, *dst, scale);
    302         dst += 1;
    303         count--;
    304     }
    305 }
    306 
    307 // The following (left) shifts cause the top 5 bits of the mask components to
    308 // line up with the corresponding components in an SkPMColor.
    309 // Note that the mask's RGB16 order may differ from the SkPMColor order.
    310 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    311 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    312 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
    313 
    314 #if SK_R16x5_R32x5_SHIFT == 0
    315     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
    316 #elif SK_R16x5_R32x5_SHIFT > 0
    317     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
    318 #else
    319     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
    320 #endif
    321 
    322 #if SK_G16x5_G32x5_SHIFT == 0
    323     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
    324 #elif SK_G16x5_G32x5_SHIFT > 0
    325     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
    326 #else
    327     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
    328 #endif
    329 
    330 #if SK_B16x5_B32x5_SHIFT == 0
    331     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
    332 #elif SK_B16x5_B32x5_SHIFT > 0
    333     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
    334 #else
    335     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
    336 #endif
    337 
    338 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
    339                                  __m128i &mask, __m128i &srcA) {
    340     // In the following comments, the components of src, dst and mask are
    341     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    342     // by an R, G, B, or A suffix. Components of one of the four pixels that
    343     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    344     // example is the blue channel of the second destination pixel. Memory
    345     // layout is shown for an ARGB byte order in a color value.
    346 
    347     // src and srcA store 8-bit values interleaved with zeros.
    348     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    349     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    350     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    351     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    352     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    353     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    354     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    355 
    356     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    357     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    358     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    359                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    360 
    361     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    362     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    363                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    364 
    365     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    366     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    367                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    368 
    369     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    370     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    371     // 8-bit position
    372     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    373     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    374     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    375 
    376     // Interleave R,G,B into the lower byte of word.
    377     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    378     // 16-bit values, padded by zero.
    379     __m128i maskLo, maskHi;
    380     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    381     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    382     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    383     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    384 
    385     // Upscale from 0..31 to 0..32
    386     // (allows to replace division by left-shift further down)
    387     // Left-shift each component by 4 and add the result back to that component,
    388     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    389     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    390     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    391 
    392     // Multiply each component of maskLo and maskHi by srcA
    393     maskLo = _mm_mullo_epi16(maskLo, srcA);
    394     maskHi = _mm_mullo_epi16(maskHi, srcA);
    395 
    396     // Left shift mask components by 8 (divide by 256)
    397     maskLo = _mm_srli_epi16(maskLo, 8);
    398     maskHi = _mm_srli_epi16(maskHi, 8);
    399 
    400     // Interleave R,G,B into the lower byte of the word
    401     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    402     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    403     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    404     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    405 
    406     // mask = (src - dst) * mask
    407     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    408     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    409 
    410     // mask = (src - dst) * mask >> 5
    411     maskLo = _mm_srai_epi16(maskLo, 5);
    412     maskHi = _mm_srai_epi16(maskHi, 5);
    413 
    414     // Add two pixels into result.
    415     // result = dst + ((src - dst) * mask >> 5)
    416     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    417     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    418 
    419     // Pack into 4 32bit dst pixels.
    420     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    421     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    422     // clamping to 255 if necessary.
    423     return _mm_packus_epi16(resultLo, resultHi);
    424 }
    425 
    426 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
    427                                        __m128i &mask) {
    428     // In the following comments, the components of src, dst and mask are
    429     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    430     // by an R, G, B, or A suffix. Components of one of the four pixels that
    431     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    432     // example is the blue channel of the second destination pixel. Memory
    433     // layout is shown for an ARGB byte order in a color value.
    434 
    435     // src and srcA store 8-bit values interleaved with zeros.
    436     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    437     // mask stores 16-bit values (shown as high and low bytes) interleaved with
    438     // zeros
    439     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    440     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    441 
    442     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    443     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    444     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    445                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    446 
    447     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    448     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    449                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    450 
    451     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    452     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    453                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    454 
    455     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    456     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    457     // 8-bit position
    458     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    459     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    460     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    461 
    462     // Interleave R,G,B into the lower byte of word.
    463     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    464     // 16-bit values, padded by zero.
    465     __m128i maskLo, maskHi;
    466     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    467     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    468     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    469     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    470 
    471     // Upscale from 0..31 to 0..32
    472     // (allows to replace division by left-shift further down)
    473     // Left-shift each component by 4 and add the result back to that component,
    474     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    475     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    476     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    477 
    478     // Interleave R,G,B into the lower byte of the word
    479     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    480     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    481     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    482     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    483 
    484     // mask = (src - dst) * mask
    485     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    486     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    487 
    488     // mask = (src - dst) * mask >> 5
    489     maskLo = _mm_srai_epi16(maskLo, 5);
    490     maskHi = _mm_srai_epi16(maskHi, 5);
    491 
    492     // Add two pixels into result.
    493     // result = dst + ((src - dst) * mask >> 5)
    494     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    495     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    496 
    497     // Pack into 4 32bit dst pixels and force opaque.
    498     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    499     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    500     // clamping to 255 if necessary. Set alpha components to 0xFF.
    501     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
    502                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
    503 }
    504 
    505 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
    506                          SkColor src, int width, SkPMColor) {
    507     if (width <= 0) {
    508         return;
    509     }
    510 
    511     int srcA = SkColorGetA(src);
    512     int srcR = SkColorGetR(src);
    513     int srcG = SkColorGetG(src);
    514     int srcB = SkColorGetB(src);
    515 
    516     srcA = SkAlpha255To256(srcA);
    517 
    518     if (width >= 4) {
    519         SkASSERT(((size_t)dst & 0x03) == 0);
    520         while (((size_t)dst & 0x0F) != 0) {
    521             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    522             mask++;
    523             dst++;
    524             width--;
    525         }
    526 
    527         __m128i *d = reinterpret_cast<__m128i*>(dst);
    528         // Set alpha to 0xFF and replicate source four times in SSE register.
    529         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    530         // Interleave with zeros to get two sets of four 16-bit values.
    531         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    532         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    533         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    534         __m128i srcA_sse = _mm_set1_epi16(srcA);
    535         while (width >= 4) {
    536             // Load four destination pixels into dst_sse.
    537             __m128i dst_sse = _mm_load_si128(d);
    538             // Load four 16-bit masks into lower half of mask_sse.
    539             __m128i mask_sse = _mm_loadl_epi64(
    540                                    reinterpret_cast<const __m128i*>(mask));
    541 
    542             // Check whether masks are equal to 0 and get the highest bit
    543             // of each byte of result, if masks are all zero, we will get
    544             // pack_cmp to 0xFFFF
    545             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    546                                              _mm_setzero_si128()));
    547 
    548             // if mask pixels are not all zero, we will blend the dst pixels
    549             if (pack_cmp != 0xFFFF) {
    550                 // Unpack 4 16bit mask pixels to
    551                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    552                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    553                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    554                                               _mm_setzero_si128());
    555 
    556                 // Process 4 32bit dst pixels
    557                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
    558                                                    mask_sse, srcA_sse);
    559                 _mm_store_si128(d, result);
    560             }
    561 
    562             d++;
    563             mask += 4;
    564             width -= 4;
    565         }
    566 
    567         dst = reinterpret_cast<SkPMColor*>(d);
    568     }
    569 
    570     while (width > 0) {
    571         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    572         mask++;
    573         dst++;
    574         width--;
    575     }
    576 }
    577 
    578 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
    579                                SkColor src, int width, SkPMColor opaqueDst) {
    580     if (width <= 0) {
    581         return;
    582     }
    583 
    584     int srcR = SkColorGetR(src);
    585     int srcG = SkColorGetG(src);
    586     int srcB = SkColorGetB(src);
    587 
    588     if (width >= 4) {
    589         SkASSERT(((size_t)dst & 0x03) == 0);
    590         while (((size_t)dst & 0x0F) != 0) {
    591             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    592             mask++;
    593             dst++;
    594             width--;
    595         }
    596 
    597         __m128i *d = reinterpret_cast<__m128i*>(dst);
    598         // Set alpha to 0xFF and replicate source four times in SSE register.
    599         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    600         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    601         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    602         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    603         while (width >= 4) {
    604             // Load four destination pixels into dst_sse.
    605             __m128i dst_sse = _mm_load_si128(d);
    606             // Load four 16-bit masks into lower half of mask_sse.
    607             __m128i mask_sse = _mm_loadl_epi64(
    608                                    reinterpret_cast<const __m128i*>(mask));
    609 
    610             // Check whether masks are equal to 0 and get the highest bit
    611             // of each byte of result, if masks are all zero, we will get
    612             // pack_cmp to 0xFFFF
    613             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    614                                              _mm_setzero_si128()));
    615 
    616             // if mask pixels are not all zero, we will blend the dst pixels
    617             if (pack_cmp != 0xFFFF) {
    618                 // Unpack 4 16bit mask pixels to
    619                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    620                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    621                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    622                                               _mm_setzero_si128());
    623 
    624                 // Process 4 32bit dst pixels
    625                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
    626                                                          mask_sse);
    627                 _mm_store_si128(d, result);
    628             }
    629 
    630             d++;
    631             mask += 4;
    632             width -= 4;
    633         }
    634 
    635         dst = reinterpret_cast<SkPMColor*>(d);
    636     }
    637 
    638     while (width > 0) {
    639         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    640         mask++;
    641         dst++;
    642         width--;
    643     }
    644 }
    645 
    646 /* SSE2 version of S32_D565_Opaque()
    647  * portable version is in core/SkBlitRow_D16.cpp
    648  */
    649 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
    650                           const SkPMColor* SK_RESTRICT src, int count,
    651                           U8CPU alpha, int /*x*/, int /*y*/) {
    652     SkASSERT(255 == alpha);
    653 
    654     if (count <= 0) {
    655         return;
    656     }
    657 
    658     if (count >= 8) {
    659         while (((size_t)dst & 0x0F) != 0) {
    660             SkPMColor c = *src++;
    661             SkPMColorAssert(c);
    662 
    663             *dst++ = SkPixel32ToPixel16_ToU16(c);
    664             count--;
    665         }
    666 
    667         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    668         __m128i* d = reinterpret_cast<__m128i*>(dst);
    669 
    670         while (count >= 8) {
    671             // Load 8 pixels of src.
    672             __m128i src_pixel1 = _mm_loadu_si128(s++);
    673             __m128i src_pixel2 = _mm_loadu_si128(s++);
    674 
    675             __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
    676             _mm_store_si128(d++, d_pixel);
    677             count -= 8;
    678         }
    679         src = reinterpret_cast<const SkPMColor*>(s);
    680         dst = reinterpret_cast<uint16_t*>(d);
    681     }
    682 
    683     if (count > 0) {
    684         do {
    685             SkPMColor c = *src++;
    686             SkPMColorAssert(c);
    687             *dst++ = SkPixel32ToPixel16_ToU16(c);
    688         } while (--count != 0);
    689     }
    690 }
    691 
    692 /* SSE2 version of S32A_D565_Opaque()
    693  * portable version is in core/SkBlitRow_D16.cpp
    694  */
    695 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
    696                            const SkPMColor* SK_RESTRICT src,
    697                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
    698     SkASSERT(255 == alpha);
    699 
    700     if (count <= 0) {
    701         return;
    702     }
    703 
    704     if (count >= 8) {
    705         // Make dst 16 bytes alignment
    706         while (((size_t)dst & 0x0F) != 0) {
    707             SkPMColor c = *src++;
    708             if (c) {
    709               *dst = SkSrcOver32To16(c, *dst);
    710             }
    711             dst += 1;
    712             count--;
    713         }
    714 
    715         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    716         __m128i* d = reinterpret_cast<__m128i*>(dst);
    717         __m128i var255 = _mm_set1_epi16(255);
    718         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
    719         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
    720         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
    721 
    722         while (count >= 8) {
    723             // Load 8 pixels of src.
    724             __m128i src_pixel1 = _mm_loadu_si128(s++);
    725             __m128i src_pixel2 = _mm_loadu_si128(s++);
    726 
    727             // Check whether src pixels are equal to 0 and get the highest bit
    728             // of each byte of result, if src pixels are all zero, src_cmp1 and
    729             // src_cmp2 will be 0xFFFF.
    730             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
    731                                              _mm_setzero_si128()));
    732             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
    733                                              _mm_setzero_si128()));
    734             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
    735                 d++;
    736                 count -= 8;
    737                 continue;
    738             }
    739 
    740             // Load 8 pixels of dst.
    741             __m128i dst_pixel = _mm_load_si128(d);
    742 
    743             // Extract A from src.
    744             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
    745             sa1 = _mm_srli_epi32(sa1, 24);
    746             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
    747             sa2 = _mm_srli_epi32(sa2, 24);
    748             __m128i sa = _mm_packs_epi32(sa1, sa2);
    749 
    750             // Extract R from src.
    751             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
    752             sr1 = _mm_srli_epi32(sr1, 24);
    753             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
    754             sr2 = _mm_srli_epi32(sr2, 24);
    755             __m128i sr = _mm_packs_epi32(sr1, sr2);
    756 
    757             // Extract G from src.
    758             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
    759             sg1 = _mm_srli_epi32(sg1, 24);
    760             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
    761             sg2 = _mm_srli_epi32(sg2, 24);
    762             __m128i sg = _mm_packs_epi32(sg1, sg2);
    763 
    764             // Extract B from src.
    765             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
    766             sb1 = _mm_srli_epi32(sb1, 24);
    767             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
    768             sb2 = _mm_srli_epi32(sb2, 24);
    769             __m128i sb = _mm_packs_epi32(sb1, sb2);
    770 
    771             // Extract R G B from dst.
    772             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
    773             dr = _mm_and_si128(dr, r16_mask);
    774             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
    775             dg = _mm_and_si128(dg, g16_mask);
    776             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
    777             db = _mm_and_si128(db, b16_mask);
    778 
    779             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
    780 
    781             // Calculate R G B of result.
    782             // Original algorithm is in SkSrcOver32To16().
    783             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
    784             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
    785             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
    786             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
    787             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
    788             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
    789 
    790             // Pack R G B into 16-bit color.
    791             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
    792 
    793             // Store 8 16-bit colors in dst.
    794             _mm_store_si128(d++, d_pixel);
    795             count -= 8;
    796         }
    797 
    798         src = reinterpret_cast<const SkPMColor*>(s);
    799         dst = reinterpret_cast<uint16_t*>(d);
    800     }
    801 
    802     if (count > 0) {
    803         do {
    804             SkPMColor c = *src++;
    805             SkPMColorAssert(c);
    806             if (c) {
    807                 *dst = SkSrcOver32To16(c, *dst);
    808             }
    809             dst += 1;
    810         } while (--count != 0);
    811     }
    812 }
    813 
    814 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
    815                                  const SkPMColor* SK_RESTRICT src,
    816                                  int count, U8CPU alpha, int x, int y) {
    817     SkASSERT(255 == alpha);
    818 
    819     if (count <= 0) {
    820         return;
    821     }
    822 
    823     if (count >= 8) {
    824         while (((size_t)dst & 0x0F) != 0) {
    825             DITHER_565_SCAN(y);
    826             SkPMColor c = *src++;
    827             SkPMColorAssert(c);
    828 
    829             unsigned dither = DITHER_VALUE(x);
    830             *dst++ = SkDitherRGB32To565(c, dither);
    831             DITHER_INC_X(x);
    832             count--;
    833         }
    834 
    835         unsigned short dither_value[8];
    836         __m128i dither;
    837 #ifdef ENABLE_DITHER_MATRIX_4X4
    838         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
    839         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
    840         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
    841         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
    842         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
    843 #else
    844         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
    845         dither_value[0] = dither_value[4] = (dither_scan
    846                                              >> (((x) & 3) << 2)) & 0xF;
    847         dither_value[1] = dither_value[5] = (dither_scan
    848                                              >> (((x + 1) & 3) << 2)) & 0xF;
    849         dither_value[2] = dither_value[6] = (dither_scan
    850                                              >> (((x + 2) & 3) << 2)) & 0xF;
    851         dither_value[3] = dither_value[7] = (dither_scan
    852                                              >> (((x + 3) & 3) << 2)) & 0xF;
    853 #endif
    854         dither = _mm_loadu_si128((__m128i*) dither_value);
    855 
    856         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    857         __m128i* d = reinterpret_cast<__m128i*>(dst);
    858 
    859         while (count >= 8) {
    860             // Load 8 pixels of src.
    861             __m128i src_pixel1 = _mm_loadu_si128(s++);
    862             __m128i src_pixel2 = _mm_loadu_si128(s++);
    863 
    864             // Extract R from src.
    865             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
    866             sr1 = _mm_srli_epi32(sr1, 24);
    867             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
    868             sr2 = _mm_srli_epi32(sr2, 24);
    869             __m128i sr = _mm_packs_epi32(sr1, sr2);
    870 
    871             // SkDITHER_R32To565(sr, dither)
    872             __m128i sr_offset = _mm_srli_epi16(sr, 5);
    873             sr = _mm_add_epi16(sr, dither);
    874             sr = _mm_sub_epi16(sr, sr_offset);
    875             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
    876 
    877             // Extract G from src.
    878             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
    879             sg1 = _mm_srli_epi32(sg1, 24);
    880             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
    881             sg2 = _mm_srli_epi32(sg2, 24);
    882             __m128i sg = _mm_packs_epi32(sg1, sg2);
    883 
    884             // SkDITHER_R32To565(sg, dither)
    885             __m128i sg_offset = _mm_srli_epi16(sg, 6);
    886             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
    887             sg = _mm_sub_epi16(sg, sg_offset);
    888             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
    889 
    890             // Extract B from src.
    891             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
    892             sb1 = _mm_srli_epi32(sb1, 24);
    893             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
    894             sb2 = _mm_srli_epi32(sb2, 24);
    895             __m128i sb = _mm_packs_epi32(sb1, sb2);
    896 
    897             // SkDITHER_R32To565(sb, dither)
    898             __m128i sb_offset = _mm_srli_epi16(sb, 5);
    899             sb = _mm_add_epi16(sb, dither);
    900             sb = _mm_sub_epi16(sb, sb_offset);
    901             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
    902 
    903             // Pack and store 16-bit dst pixel.
    904             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
    905             _mm_store_si128(d++, d_pixel);
    906 
    907             count -= 8;
    908             x += 8;
    909         }
    910 
    911         src = reinterpret_cast<const SkPMColor*>(s);
    912         dst = reinterpret_cast<uint16_t*>(d);
    913     }
    914 
    915     if (count > 0) {
    916         DITHER_565_SCAN(y);
    917         do {
    918             SkPMColor c = *src++;
    919             SkPMColorAssert(c);
    920 
    921             unsigned dither = DITHER_VALUE(x);
    922             *dst++ = SkDitherRGB32To565(c, dither);
    923             DITHER_INC_X(x);
    924         } while (--count != 0);
    925     }
    926 }
    927 
    928 /* SSE2 version of S32A_D565_Opaque_Dither()
    929  * portable version is in core/SkBlitRow_D16.cpp
    930  */
    931 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
    932                                   const SkPMColor* SK_RESTRICT src,
    933                                   int count, U8CPU alpha, int x, int y) {
    934     SkASSERT(255 == alpha);
    935 
    936     if (count <= 0) {
    937         return;
    938     }
    939 
    940     if (count >= 8) {
    941         while (((size_t)dst & 0x0F) != 0) {
    942             DITHER_565_SCAN(y);
    943             SkPMColor c = *src++;
    944             SkPMColorAssert(c);
    945             if (c) {
    946                 unsigned a = SkGetPackedA32(c);
    947 
    948                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
    949 
    950                 unsigned sr = SkGetPackedR32(c);
    951                 unsigned sg = SkGetPackedG32(c);
    952                 unsigned sb = SkGetPackedB32(c);
    953                 sr = SkDITHER_R32_FOR_565(sr, d);
    954                 sg = SkDITHER_G32_FOR_565(sg, d);
    955                 sb = SkDITHER_B32_FOR_565(sb, d);
    956 
    957                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
    958                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
    959                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
    960                 // now src and dst expanded are in g:11 r:10 x:1 b:10
    961                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
    962             }
    963             dst += 1;
    964             DITHER_INC_X(x);
    965             count--;
    966         }
    967 
    968         unsigned short dither_value[8];
    969         __m128i dither, dither_cur;
    970 #ifdef ENABLE_DITHER_MATRIX_4X4
    971         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
    972         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
    973         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
    974         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
    975         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
    976 #else
    977         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
    978         dither_value[0] = dither_value[4] = (dither_scan
    979                                              >> (((x) & 3) << 2)) & 0xF;
    980         dither_value[1] = dither_value[5] = (dither_scan
    981                                              >> (((x + 1) & 3) << 2)) & 0xF;
    982         dither_value[2] = dither_value[6] = (dither_scan
    983                                              >> (((x + 2) & 3) << 2)) & 0xF;
    984         dither_value[3] = dither_value[7] = (dither_scan
    985                                              >> (((x + 3) & 3) << 2)) & 0xF;
    986 #endif
    987         dither = _mm_loadu_si128((__m128i*) dither_value);
    988 
    989         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    990         __m128i* d = reinterpret_cast<__m128i*>(dst);
    991         __m128i var256 = _mm_set1_epi16(256);
    992         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
    993         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
    994         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
    995 
    996         while (count >= 8) {
    997             // Load 8 pixels of src and dst.
    998             __m128i src_pixel1 = _mm_loadu_si128(s++);
    999             __m128i src_pixel2 = _mm_loadu_si128(s++);
   1000             __m128i dst_pixel = _mm_load_si128(d);
   1001 
   1002             // Extract A from src.
   1003             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
   1004             sa1 = _mm_srli_epi32(sa1, 24);
   1005             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
   1006             sa2 = _mm_srli_epi32(sa2, 24);
   1007             __m128i sa = _mm_packs_epi32(sa1, sa2);
   1008 
   1009             // Calculate current dither value.
   1010             dither_cur = _mm_mullo_epi16(dither,
   1011                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
   1012             dither_cur = _mm_srli_epi16(dither_cur, 8);
   1013 
   1014             // Extract R from src.
   1015             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
   1016             sr1 = _mm_srli_epi32(sr1, 24);
   1017             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
   1018             sr2 = _mm_srli_epi32(sr2, 24);
   1019             __m128i sr = _mm_packs_epi32(sr1, sr2);
   1020 
   1021             // SkDITHER_R32_FOR_565(sr, d)
   1022             __m128i sr_offset = _mm_srli_epi16(sr, 5);
   1023             sr = _mm_add_epi16(sr, dither_cur);
   1024             sr = _mm_sub_epi16(sr, sr_offset);
   1025 
   1026             // Expand sr.
   1027             sr = _mm_slli_epi16(sr, 2);
   1028 
   1029             // Extract G from src.
   1030             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
   1031             sg1 = _mm_srli_epi32(sg1, 24);
   1032             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
   1033             sg2 = _mm_srli_epi32(sg2, 24);
   1034             __m128i sg = _mm_packs_epi32(sg1, sg2);
   1035 
   1036             // sg = SkDITHER_G32_FOR_565(sg, d).
   1037             __m128i sg_offset = _mm_srli_epi16(sg, 6);
   1038             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
   1039             sg = _mm_sub_epi16(sg, sg_offset);
   1040 
   1041             // Expand sg.
   1042             sg = _mm_slli_epi16(sg, 3);
   1043 
   1044             // Extract B from src.
   1045             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
   1046             sb1 = _mm_srli_epi32(sb1, 24);
   1047             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
   1048             sb2 = _mm_srli_epi32(sb2, 24);
   1049             __m128i sb = _mm_packs_epi32(sb1, sb2);
   1050 
   1051             // sb = SkDITHER_B32_FOR_565(sb, d).
   1052             __m128i sb_offset = _mm_srli_epi16(sb, 5);
   1053             sb = _mm_add_epi16(sb, dither_cur);
   1054             sb = _mm_sub_epi16(sb, sb_offset);
   1055 
   1056             // Expand sb.
   1057             sb = _mm_slli_epi16(sb, 2);
   1058 
   1059             // Extract R G B from dst.
   1060             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
   1061             dr = _mm_and_si128(dr, r16_mask);
   1062             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
   1063             dg = _mm_and_si128(dg, g16_mask);
   1064             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
   1065             db = _mm_and_si128(db, b16_mask);
   1066 
   1067             // SkAlpha255To256(255 - a) >> 3
   1068             __m128i isa = _mm_sub_epi16(var256, sa);
   1069             isa = _mm_srli_epi16(isa, 3);
   1070 
   1071             dr = _mm_mullo_epi16(dr, isa);
   1072             dr = _mm_add_epi16(dr, sr);
   1073             dr = _mm_srli_epi16(dr, 5);
   1074 
   1075             dg = _mm_mullo_epi16(dg, isa);
   1076             dg = _mm_add_epi16(dg, sg);
   1077             dg = _mm_srli_epi16(dg, 5);
   1078 
   1079             db = _mm_mullo_epi16(db, isa);
   1080             db = _mm_add_epi16(db, sb);
   1081             db = _mm_srli_epi16(db, 5);
   1082 
   1083             // Package and store dst pixel.
   1084             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
   1085             _mm_store_si128(d++, d_pixel);
   1086 
   1087             count -= 8;
   1088             x += 8;
   1089         }
   1090 
   1091         src = reinterpret_cast<const SkPMColor*>(s);
   1092         dst = reinterpret_cast<uint16_t*>(d);
   1093     }
   1094 
   1095     if (count > 0) {
   1096         DITHER_565_SCAN(y);
   1097         do {
   1098             SkPMColor c = *src++;
   1099             SkPMColorAssert(c);
   1100             if (c) {
   1101                 unsigned a = SkGetPackedA32(c);
   1102 
   1103                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
   1104 
   1105                 unsigned sr = SkGetPackedR32(c);
   1106                 unsigned sg = SkGetPackedG32(c);
   1107                 unsigned sb = SkGetPackedB32(c);
   1108                 sr = SkDITHER_R32_FOR_565(sr, d);
   1109                 sg = SkDITHER_G32_FOR_565(sg, d);
   1110                 sb = SkDITHER_B32_FOR_565(sb, d);
   1111 
   1112                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1113                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
   1114                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1115                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1116                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1117             }
   1118             dst += 1;
   1119             DITHER_INC_X(x);
   1120         } while (--count != 0);
   1121     }
   1122 }
   1123