Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2012 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBitmapProcState_opts_SSE2.h"
     10 #include "SkBlitRow_opts_SSE2.h"
     11 #include "SkColorPriv.h"
     12 #include "SkColor_opts_SSE2.h"
     13 #include "SkDither.h"
     14 #include "SkUtils.h"
     15 
     16 /* SSE2 version of S32_Blend_BlitRow32()
     17  * portable version is in core/SkBlitRow_D32.cpp
     18  */
     19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     20                               const SkPMColor* SK_RESTRICT src,
     21                               int count, U8CPU alpha) {
     22     SkASSERT(alpha <= 255);
     23     if (count <= 0) {
     24         return;
     25     }
     26 
     27     uint32_t src_scale = SkAlpha255To256(alpha);
     28     uint32_t dst_scale = 256 - src_scale;
     29 
     30     if (count >= 4) {
     31         SkASSERT(((size_t)dst & 0x03) == 0);
     32         while (((size_t)dst & 0x0F) != 0) {
     33             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     34             src++;
     35             dst++;
     36             count--;
     37         }
     38 
     39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     40         __m128i *d = reinterpret_cast<__m128i*>(dst);
     41         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     42         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
     43 
     44         // Move scale factors to upper byte of word
     45         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
     46         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
     47         while (count >= 4) {
     48             // Load 4 pixels each of src and dest.
     49             __m128i src_pixel = _mm_loadu_si128(s);
     50             __m128i dst_pixel = _mm_load_si128(d);
     51 
     52             // Interleave Atom port 0/1 operations based on the execution port
     53             // constraints that multiply can only be executed on port 0 (while
     54             // boolean operations can be executed on either port 0 or port 1)
     55             // because GCC currently doesn't do a good job scheduling
     56             // instructions based on these constraints.
     57 
     58             // Get red and blue pixels into lower byte of each word.
     59             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
     60             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
     61 
     62             // Multiply by scale.
     63             // (4 x (0, rs.h, 0, bs.h))
     64             // where rs.h stands for the higher byte of r * scale, and
     65             // bs.h the higher byte of b * scale.
     66             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
     67 
     68             // Get alpha and green pixels into higher byte of each word.
     69             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
     70             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
     71 
     72             // Multiply by scale.
     73             // (4 x (as.h, as.l, gs.h, gs.l))
     74             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
     75 
     76             // Clear the lower byte of the a*scale and g*scale results
     77             // (4 x (as.h, 0, gs.h, 0))
     78             src_ag = _mm_and_si128(src_ag, ag_mask);
     79 
     80             // Operations the destination pixels are the same as on the
     81             // source pixels. See the comments above.
     82             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     83             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
     84             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
     85             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
     86             dst_ag = _mm_and_si128(dst_ag, ag_mask);
     87 
     88             // Combine back into RGBA.
     89             // (4 x (as.h, rs.h, gs.h, bs.h))
     90             src_pixel = _mm_or_si128(src_rb, src_ag);
     91             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
     92 
     93             // Add result
     94             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     95             _mm_store_si128(d, result);
     96             s++;
     97             d++;
     98             count -= 4;
     99         }
    100         src = reinterpret_cast<const SkPMColor*>(s);
    101         dst = reinterpret_cast<SkPMColor*>(d);
    102     }
    103 
    104     while (count > 0) {
    105         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    106         src++;
    107         dst++;
    108         count--;
    109     }
    110 }
    111 
    112 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    113                                 const SkPMColor* SK_RESTRICT src,
    114                                 int count, U8CPU alpha) {
    115     SkASSERT(alpha == 255);
    116     if (count <= 0) {
    117         return;
    118     }
    119 
    120     if (count >= 4) {
    121         SkASSERT(((size_t)dst & 0x03) == 0);
    122         while (((size_t)dst & 0x0F) != 0) {
    123             *dst = SkPMSrcOver(*src, *dst);
    124             src++;
    125             dst++;
    126             count--;
    127         }
    128 
    129         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    130         __m128i *d = reinterpret_cast<__m128i*>(dst);
    131 #ifdef SK_USE_ACCURATE_BLENDING
    132         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    133         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
    134         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
    135         while (count >= 4) {
    136             // Load 4 pixels
    137             __m128i src_pixel = _mm_loadu_si128(s);
    138             __m128i dst_pixel = _mm_load_si128(d);
    139 
    140             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    141             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    142             // Shift alphas down to lower 8 bits of each quad.
    143             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
    144 
    145             // Copy alpha to upper 3rd byte of each quad
    146             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
    147 
    148             // Subtract alphas from 255, to get 0..255
    149             alpha = _mm_sub_epi16(c_255, alpha);
    150 
    151             // Multiply by red and blue by src alpha.
    152             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    153             // Multiply by alpha and green by src alpha.
    154             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    155 
    156             // dst_rb_low = (dst_rb >> 8)
    157             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
    158             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
    159 
    160             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
    161             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
    162             dst_rb = _mm_add_epi16(dst_rb, c_128);
    163             dst_rb = _mm_srli_epi16(dst_rb, 8);
    164 
    165             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
    166             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
    167             dst_ag = _mm_add_epi16(dst_ag, c_128);
    168             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    169 
    170             // Combine back into RGBA.
    171             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    172 
    173             // Add result
    174             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    175             _mm_store_si128(d, result);
    176             s++;
    177             d++;
    178             count -= 4;
    179         }
    180 #else
    181         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    182         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
    183         while (count >= 4) {
    184             // Load 4 pixels
    185             __m128i src_pixel = _mm_loadu_si128(s);
    186             __m128i dst_pixel = _mm_load_si128(d);
    187 
    188             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    189             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    190 
    191             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
    192             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
    193 
    194             // (a0, a0, a1, a1, a2, g2, a3, g3)
    195             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
    196 
    197             // (a0, a0, a1, a1, a2, a2, a3, a3)
    198             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
    199 
    200             // Subtract alphas from 256, to get 1..256
    201             alpha = _mm_sub_epi16(c_256, alpha);
    202 
    203             // Multiply by red and blue by src alpha.
    204             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    205             // Multiply by alpha and green by src alpha.
    206             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    207 
    208             // Divide by 256.
    209             dst_rb = _mm_srli_epi16(dst_rb, 8);
    210 
    211             // Mask out high bits (already in the right place)
    212             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    213 
    214             // Combine back into RGBA.
    215             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    216 
    217             // Add result
    218             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    219             _mm_store_si128(d, result);
    220             s++;
    221             d++;
    222             count -= 4;
    223         }
    224 #endif
    225         src = reinterpret_cast<const SkPMColor*>(s);
    226         dst = reinterpret_cast<SkPMColor*>(d);
    227     }
    228 
    229     while (count > 0) {
    230         *dst = SkPMSrcOver(*src, *dst);
    231         src++;
    232         dst++;
    233         count--;
    234     }
    235 }
    236 
    237 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    238                                const SkPMColor* SK_RESTRICT src,
    239                                int count, U8CPU alpha) {
    240     SkASSERT(alpha <= 255);
    241     if (count <= 0) {
    242         return;
    243     }
    244 
    245     if (count >= 4) {
    246         while (((size_t)dst & 0x0F) != 0) {
    247             *dst = SkBlendARGB32(*src, *dst, alpha);
    248             src++;
    249             dst++;
    250             count--;
    251         }
    252 
    253         uint32_t src_scale = SkAlpha255To256(alpha);
    254 
    255         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    256         __m128i *d = reinterpret_cast<__m128i*>(dst);
    257         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
    258         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    259         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
    260         while (count >= 4) {
    261             // Load 4 pixels each of src and dest.
    262             __m128i src_pixel = _mm_loadu_si128(s);
    263             __m128i dst_pixel = _mm_load_si128(d);
    264 
    265             // Get red and blue pixels into lower byte of each word.
    266             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    267             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    268 
    269             // Get alpha and green into lower byte of each word.
    270             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    271             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    272 
    273             // Put per-pixel alpha in low byte of each word.
    274             // After the following two statements, the dst_alpha looks like
    275             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
    276             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
    277             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
    278 
    279             // dst_alpha = dst_alpha * src_scale
    280             // Because src_scales are in the higher byte of each word and
    281             // we use mulhi here, the resulting alpha values are already
    282             // in the right place and don't need to be divided by 256.
    283             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
    284             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
    285 
    286             // Subtract alphas from 256, to get 1..256
    287             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
    288 
    289             // Multiply red and blue by dst pixel alpha.
    290             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
    291             // Multiply alpha and green by dst pixel alpha.
    292             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
    293 
    294             // Multiply red and blue by global alpha.
    295             // (4 x (0, rs.h, 0, bs.h))
    296             // where rs.h stands for the higher byte of r * src_scale,
    297             // and bs.h the higher byte of b * src_scale.
    298             // Again, because we use mulhi, the resuling red and blue
    299             // values are already in the right place and don't need to
    300             // be divided by 256.
    301             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
    302             // Multiply alpha and green by global alpha.
    303             // (4 x (0, as.h, 0, gs.h))
    304             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
    305 
    306             // Divide by 256.
    307             dst_rb = _mm_srli_epi16(dst_rb, 8);
    308 
    309             // Mask out low bits (goodies already in the right place; no need to divide)
    310             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    311             // Shift alpha and green to higher byte of each word.
    312             // (4 x (as.h, 0, gs.h, 0))
    313             src_ag = _mm_slli_epi16(src_ag, 8);
    314 
    315             // Combine back into RGBA.
    316             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    317             src_pixel = _mm_or_si128(src_rb, src_ag);
    318 
    319             // Add two pixels into result.
    320             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    321             _mm_store_si128(d, result);
    322             s++;
    323             d++;
    324             count -= 4;
    325         }
    326         src = reinterpret_cast<const SkPMColor*>(s);
    327         dst = reinterpret_cast<SkPMColor*>(d);
    328     }
    329 
    330     while (count > 0) {
    331         *dst = SkBlendARGB32(*src, *dst, alpha);
    332         src++;
    333         dst++;
    334         count--;
    335     }
    336 }
    337 
    338 /* SSE2 version of Color32()
    339  * portable version is in core/SkBlitRow_D32.cpp
    340  */
    341 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
    342                   SkPMColor color) {
    343     if (count <= 0) {
    344         return;
    345     }
    346 
    347     if (0 == color) {
    348         if (src != dst) {
    349             memcpy(dst, src, count * sizeof(SkPMColor));
    350         }
    351         return;
    352     }
    353 
    354     unsigned colorA = SkGetPackedA32(color);
    355     if (255 == colorA) {
    356         sk_memset32(dst, color, count);
    357     } else {
    358         unsigned scale = 256 - SkAlpha255To256(colorA);
    359 
    360         if (count >= 4) {
    361             SkASSERT(((size_t)dst & 0x03) == 0);
    362             while (((size_t)dst & 0x0F) != 0) {
    363                 *dst = color + SkAlphaMulQ(*src, scale);
    364                 src++;
    365                 dst++;
    366                 count--;
    367             }
    368 
    369             const __m128i *s = reinterpret_cast<const __m128i*>(src);
    370             __m128i *d = reinterpret_cast<__m128i*>(dst);
    371             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    372             __m128i src_scale_wide = _mm_set1_epi16(scale);
    373             __m128i color_wide = _mm_set1_epi32(color);
    374             while (count >= 4) {
    375                 // Load 4 pixels each of src and dest.
    376                 __m128i src_pixel = _mm_loadu_si128(s);
    377 
    378                 // Get red and blue pixels into lower byte of each word.
    379                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    380 
    381                 // Get alpha and green into lower byte of each word.
    382                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    383 
    384                 // Multiply by scale.
    385                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    386                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    387 
    388                 // Divide by 256.
    389                 src_rb = _mm_srli_epi16(src_rb, 8);
    390                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
    391 
    392                 // Combine back into RGBA.
    393                 src_pixel = _mm_or_si128(src_rb, src_ag);
    394 
    395                 // Add color to result.
    396                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
    397 
    398                 // Store result.
    399                 _mm_store_si128(d, result);
    400                 s++;
    401                 d++;
    402                 count -= 4;
    403             }
    404             src = reinterpret_cast<const SkPMColor*>(s);
    405             dst = reinterpret_cast<SkPMColor*>(d);
    406         }
    407 
    408         while (count > 0) {
    409             *dst = color + SkAlphaMulQ(*src, scale);
    410             src += 1;
    411             dst += 1;
    412             count--;
    413         }
    414     }
    415 }
    416 
    417 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
    418                                size_t maskRB, SkColor origColor,
    419                                int width, int height) {
    420     SkPMColor color = SkPreMultiplyColor(origColor);
    421     size_t dstOffset = dstRB - (width << 2);
    422     size_t maskOffset = maskRB - width;
    423     SkPMColor* dst = (SkPMColor *)device;
    424     const uint8_t* mask = (const uint8_t*)maskPtr;
    425     do {
    426         int count = width;
    427         if (count >= 4) {
    428             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
    429                 *dst = SkBlendARGB32(color, *dst, *mask);
    430                 mask++;
    431                 dst++;
    432                 count--;
    433             }
    434             __m128i *d = reinterpret_cast<__m128i*>(dst);
    435             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    436             __m128i c_256 = _mm_set1_epi16(256);
    437             __m128i c_1 = _mm_set1_epi16(1);
    438             __m128i src_pixel = _mm_set1_epi32(color);
    439             while (count >= 4) {
    440                 // Load 4 pixels each of src and dest.
    441                 __m128i dst_pixel = _mm_load_si128(d);
    442 
    443                 //set the aphla value
    444                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
    445                                 0, *(mask+3),0, \
    446                                 *(mask+2),0, *(mask+2),\
    447                                 0,*(mask+1), 0,*(mask+1),\
    448                                 0, *mask,0,*mask);
    449 
    450                 //call SkAlpha255To256()
    451                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
    452 
    453                 // Get red and blue pixels into lower byte of each word.
    454                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    455                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    456 
    457                 // Get alpha and green into lower byte of each word.
    458                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    459                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    460 
    461                 // Put per-pixel alpha in low byte of each word.
    462                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
    463                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
    464 
    465                 // dst_alpha = dst_alpha * src_scale
    466                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
    467 
    468                 // Divide by 256.
    469                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
    470 
    471                 // Subtract alphas from 256, to get 1..256
    472                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
    473                 // Multiply red and blue by dst pixel alpha.
    474                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
    475                 // Multiply alpha and green by dst pixel alpha.
    476                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
    477 
    478                 // Multiply red and blue by global alpha.
    479                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    480                 // Multiply alpha and green by global alpha.
    481                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    482                 // Divide by 256.
    483                 dst_rb = _mm_srli_epi16(dst_rb, 8);
    484                 src_rb = _mm_srli_epi16(src_rb, 8);
    485 
    486                 // Mask out low bits (goodies already in the right place; no need to divide)
    487                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    488                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
    489 
    490                 // Combine back into RGBA.
    491                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    492                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
    493 
    494                 // Add two pixels into result.
    495                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
    496                 _mm_store_si128(d, result);
    497                 // load the next 4 pixel
    498                 mask = mask + 4;
    499                 d++;
    500                 count -= 4;
    501             }
    502             dst = reinterpret_cast<SkPMColor *>(d);
    503         }
    504         while (count > 0) {
    505             *dst= SkBlendARGB32(color, *dst, *mask);
    506             dst += 1;
    507             mask++;
    508             count --;
    509         }
    510         dst = (SkPMColor *)((char*)dst + dstOffset);
    511         mask += maskOffset;
    512     } while (--height != 0);
    513 }
    514 
    515 // The following (left) shifts cause the top 5 bits of the mask components to
    516 // line up with the corresponding components in an SkPMColor.
    517 // Note that the mask's RGB16 order may differ from the SkPMColor order.
    518 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    519 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    520 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
    521 
    522 #if SK_R16x5_R32x5_SHIFT == 0
    523     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
    524 #elif SK_R16x5_R32x5_SHIFT > 0
    525     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
    526 #else
    527     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
    528 #endif
    529 
    530 #if SK_G16x5_G32x5_SHIFT == 0
    531     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
    532 #elif SK_G16x5_G32x5_SHIFT > 0
    533     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
    534 #else
    535     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
    536 #endif
    537 
    538 #if SK_B16x5_B32x5_SHIFT == 0
    539     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
    540 #elif SK_B16x5_B32x5_SHIFT > 0
    541     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
    542 #else
    543     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
    544 #endif
    545 
    546 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
    547                                  __m128i &mask, __m128i &srcA) {
    548     // In the following comments, the components of src, dst and mask are
    549     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    550     // by an R, G, B, or A suffix. Components of one of the four pixels that
    551     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    552     // example is the blue channel of the second destination pixel. Memory
    553     // layout is shown for an ARGB byte order in a color value.
    554 
    555     // src and srcA store 8-bit values interleaved with zeros.
    556     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    557     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    558     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    559     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    560     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    561     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    562     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    563 
    564     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    565     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    566     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    567                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    568 
    569     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    570     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    571                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    572 
    573     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    574     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    575                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    576 
    577     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    578     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    579     // 8-bit position
    580     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    581     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    582     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    583 
    584     // Interleave R,G,B into the lower byte of word.
    585     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    586     // 16-bit values, padded by zero.
    587     __m128i maskLo, maskHi;
    588     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    589     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    590     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    591     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    592 
    593     // Upscale from 0..31 to 0..32
    594     // (allows to replace division by left-shift further down)
    595     // Left-shift each component by 4 and add the result back to that component,
    596     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    597     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    598     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    599 
    600     // Multiply each component of maskLo and maskHi by srcA
    601     maskLo = _mm_mullo_epi16(maskLo, srcA);
    602     maskHi = _mm_mullo_epi16(maskHi, srcA);
    603 
    604     // Left shift mask components by 8 (divide by 256)
    605     maskLo = _mm_srli_epi16(maskLo, 8);
    606     maskHi = _mm_srli_epi16(maskHi, 8);
    607 
    608     // Interleave R,G,B into the lower byte of the word
    609     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    610     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    611     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    612     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    613 
    614     // mask = (src - dst) * mask
    615     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    616     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    617 
    618     // mask = (src - dst) * mask >> 5
    619     maskLo = _mm_srai_epi16(maskLo, 5);
    620     maskHi = _mm_srai_epi16(maskHi, 5);
    621 
    622     // Add two pixels into result.
    623     // result = dst + ((src - dst) * mask >> 5)
    624     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    625     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    626 
    627     // Pack into 4 32bit dst pixels.
    628     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    629     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    630     // clamping to 255 if necessary.
    631     return _mm_packus_epi16(resultLo, resultHi);
    632 }
    633 
    634 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
    635                                        __m128i &mask) {
    636     // In the following comments, the components of src, dst and mask are
    637     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    638     // by an R, G, B, or A suffix. Components of one of the four pixels that
    639     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    640     // example is the blue channel of the second destination pixel. Memory
    641     // layout is shown for an ARGB byte order in a color value.
    642 
    643     // src and srcA store 8-bit values interleaved with zeros.
    644     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    645     // mask stores 16-bit values (shown as high and low bytes) interleaved with
    646     // zeros
    647     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    648     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    649 
    650     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    651     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    652     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    653                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    654 
    655     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    656     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    657                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    658 
    659     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    660     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    661                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    662 
    663     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    664     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    665     // 8-bit position
    666     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    667     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    668     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    669 
    670     // Interleave R,G,B into the lower byte of word.
    671     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    672     // 16-bit values, padded by zero.
    673     __m128i maskLo, maskHi;
    674     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    675     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    676     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    677     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    678 
    679     // Upscale from 0..31 to 0..32
    680     // (allows to replace division by left-shift further down)
    681     // Left-shift each component by 4 and add the result back to that component,
    682     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    683     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    684     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    685 
    686     // Interleave R,G,B into the lower byte of the word
    687     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    688     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    689     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    690     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    691 
    692     // mask = (src - dst) * mask
    693     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    694     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    695 
    696     // mask = (src - dst) * mask >> 5
    697     maskLo = _mm_srai_epi16(maskLo, 5);
    698     maskHi = _mm_srai_epi16(maskHi, 5);
    699 
    700     // Add two pixels into result.
    701     // result = dst + ((src - dst) * mask >> 5)
    702     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    703     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    704 
    705     // Pack into 4 32bit dst pixels and force opaque.
    706     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    707     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    708     // clamping to 255 if necessary. Set alpha components to 0xFF.
    709     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
    710                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
    711 }
    712 
    713 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
    714                          SkColor src, int width, SkPMColor) {
    715     if (width <= 0) {
    716         return;
    717     }
    718 
    719     int srcA = SkColorGetA(src);
    720     int srcR = SkColorGetR(src);
    721     int srcG = SkColorGetG(src);
    722     int srcB = SkColorGetB(src);
    723 
    724     srcA = SkAlpha255To256(srcA);
    725 
    726     if (width >= 4) {
    727         SkASSERT(((size_t)dst & 0x03) == 0);
    728         while (((size_t)dst & 0x0F) != 0) {
    729             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    730             mask++;
    731             dst++;
    732             width--;
    733         }
    734 
    735         __m128i *d = reinterpret_cast<__m128i*>(dst);
    736         // Set alpha to 0xFF and replicate source four times in SSE register.
    737         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    738         // Interleave with zeros to get two sets of four 16-bit values.
    739         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    740         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    741         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    742         __m128i srcA_sse = _mm_set1_epi16(srcA);
    743         while (width >= 4) {
    744             // Load four destination pixels into dst_sse.
    745             __m128i dst_sse = _mm_load_si128(d);
    746             // Load four 16-bit masks into lower half of mask_sse.
    747             __m128i mask_sse = _mm_loadl_epi64(
    748                                    reinterpret_cast<const __m128i*>(mask));
    749 
    750             // Check whether masks are equal to 0 and get the highest bit
    751             // of each byte of result, if masks are all zero, we will get
    752             // pack_cmp to 0xFFFF
    753             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    754                                              _mm_setzero_si128()));
    755 
    756             // if mask pixels are not all zero, we will blend the dst pixels
    757             if (pack_cmp != 0xFFFF) {
    758                 // Unpack 4 16bit mask pixels to
    759                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    760                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    761                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    762                                               _mm_setzero_si128());
    763 
    764                 // Process 4 32bit dst pixels
    765                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
    766                                                    mask_sse, srcA_sse);
    767                 _mm_store_si128(d, result);
    768             }
    769 
    770             d++;
    771             mask += 4;
    772             width -= 4;
    773         }
    774 
    775         dst = reinterpret_cast<SkPMColor*>(d);
    776     }
    777 
    778     while (width > 0) {
    779         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    780         mask++;
    781         dst++;
    782         width--;
    783     }
    784 }
    785 
    786 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
    787                                SkColor src, int width, SkPMColor opaqueDst) {
    788     if (width <= 0) {
    789         return;
    790     }
    791 
    792     int srcR = SkColorGetR(src);
    793     int srcG = SkColorGetG(src);
    794     int srcB = SkColorGetB(src);
    795 
    796     if (width >= 4) {
    797         SkASSERT(((size_t)dst & 0x03) == 0);
    798         while (((size_t)dst & 0x0F) != 0) {
    799             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    800             mask++;
    801             dst++;
    802             width--;
    803         }
    804 
    805         __m128i *d = reinterpret_cast<__m128i*>(dst);
    806         // Set alpha to 0xFF and replicate source four times in SSE register.
    807         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    808         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    809         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    810         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    811         while (width >= 4) {
    812             // Load four destination pixels into dst_sse.
    813             __m128i dst_sse = _mm_load_si128(d);
    814             // Load four 16-bit masks into lower half of mask_sse.
    815             __m128i mask_sse = _mm_loadl_epi64(
    816                                    reinterpret_cast<const __m128i*>(mask));
    817 
    818             // Check whether masks are equal to 0 and get the highest bit
    819             // of each byte of result, if masks are all zero, we will get
    820             // pack_cmp to 0xFFFF
    821             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    822                                              _mm_setzero_si128()));
    823 
    824             // if mask pixels are not all zero, we will blend the dst pixels
    825             if (pack_cmp != 0xFFFF) {
    826                 // Unpack 4 16bit mask pixels to
    827                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    828                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    829                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    830                                               _mm_setzero_si128());
    831 
    832                 // Process 4 32bit dst pixels
    833                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
    834                                                          mask_sse);
    835                 _mm_store_si128(d, result);
    836             }
    837 
    838             d++;
    839             mask += 4;
    840             width -= 4;
    841         }
    842 
    843         dst = reinterpret_cast<SkPMColor*>(d);
    844     }
    845 
    846     while (width > 0) {
    847         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    848         mask++;
    849         dst++;
    850         width--;
    851     }
    852 }
    853 
    854 /* SSE2 version of S32_D565_Opaque()
    855  * portable version is in core/SkBlitRow_D16.cpp
    856  */
    857 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
    858                           const SkPMColor* SK_RESTRICT src, int count,
    859                           U8CPU alpha, int /*x*/, int /*y*/) {
    860     SkASSERT(255 == alpha);
    861 
    862     if (count <= 0) {
    863         return;
    864     }
    865 
    866     if (count >= 8) {
    867         while (((size_t)dst & 0x0F) != 0) {
    868             SkPMColor c = *src++;
    869             SkPMColorAssert(c);
    870 
    871             *dst++ = SkPixel32ToPixel16_ToU16(c);
    872             count--;
    873         }
    874 
    875         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    876         __m128i* d = reinterpret_cast<__m128i*>(dst);
    877         __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
    878         __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
    879         __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
    880 
    881         while (count >= 8) {
    882             // Load 8 pixels of src.
    883             __m128i src_pixel1 = _mm_loadu_si128(s++);
    884             __m128i src_pixel2 = _mm_loadu_si128(s++);
    885 
    886             // Calculate result r.
    887             __m128i r1 = _mm_srli_epi32(src_pixel1,
    888                                         SK_R32_SHIFT + (8 - SK_R16_BITS));
    889             r1 = _mm_and_si128(r1, r16_mask);
    890             __m128i r2 = _mm_srli_epi32(src_pixel2,
    891                                         SK_R32_SHIFT + (8 - SK_R16_BITS));
    892             r2 = _mm_and_si128(r2, r16_mask);
    893             __m128i r = _mm_packs_epi32(r1, r2);
    894 
    895             // Calculate result g.
    896             __m128i g1 = _mm_srli_epi32(src_pixel1,
    897                                         SK_G32_SHIFT + (8 - SK_G16_BITS));
    898             g1 = _mm_and_si128(g1, g16_mask);
    899             __m128i g2 = _mm_srli_epi32(src_pixel2,
    900                                         SK_G32_SHIFT + (8 - SK_G16_BITS));
    901             g2 = _mm_and_si128(g2, g16_mask);
    902             __m128i g = _mm_packs_epi32(g1, g2);
    903 
    904             // Calculate result b.
    905             __m128i b1 = _mm_srli_epi32(src_pixel1,
    906                                         SK_B32_SHIFT + (8 - SK_B16_BITS));
    907             b1 = _mm_and_si128(b1, b16_mask);
    908             __m128i b2 = _mm_srli_epi32(src_pixel2,
    909                                         SK_B32_SHIFT + (8 - SK_B16_BITS));
    910             b2 = _mm_and_si128(b2, b16_mask);
    911             __m128i b = _mm_packs_epi32(b1, b2);
    912 
    913             // Store 8 16-bit colors in dst.
    914             __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
    915             _mm_store_si128(d++, d_pixel);
    916             count -= 8;
    917         }
    918         src = reinterpret_cast<const SkPMColor*>(s);
    919         dst = reinterpret_cast<uint16_t*>(d);
    920     }
    921 
    922     if (count > 0) {
    923         do {
    924             SkPMColor c = *src++;
    925             SkPMColorAssert(c);
    926             *dst++ = SkPixel32ToPixel16_ToU16(c);
    927         } while (--count != 0);
    928     }
    929 }
    930 
    931 /* SSE2 version of S32A_D565_Opaque()
    932  * portable version is in core/SkBlitRow_D16.cpp
    933  */
    934 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
    935                            const SkPMColor* SK_RESTRICT src,
    936                            int count, U8CPU alpha, int /*x*/, int /*y*/) {
    937     SkASSERT(255 == alpha);
    938 
    939     if (count <= 0) {
    940         return;
    941     }
    942 
    943     if (count >= 8) {
    944         // Make dst 16 bytes alignment
    945         while (((size_t)dst & 0x0F) != 0) {
    946             SkPMColor c = *src++;
    947             if (c) {
    948               *dst = SkSrcOver32To16(c, *dst);
    949             }
    950             dst += 1;
    951             count--;
    952         }
    953 
    954         const __m128i* s = reinterpret_cast<const __m128i*>(src);
    955         __m128i* d = reinterpret_cast<__m128i*>(dst);
    956         __m128i var255 = _mm_set1_epi16(255);
    957         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
    958         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
    959         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
    960 
    961         while (count >= 8) {
    962             // Load 8 pixels of src.
    963             __m128i src_pixel1 = _mm_loadu_si128(s++);
    964             __m128i src_pixel2 = _mm_loadu_si128(s++);
    965 
    966             // Check whether src pixels are equal to 0 and get the highest bit
    967             // of each byte of result, if src pixels are all zero, src_cmp1 and
    968             // src_cmp2 will be 0xFFFF.
    969             int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
    970                                              _mm_setzero_si128()));
    971             int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
    972                                              _mm_setzero_si128()));
    973             if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
    974                 d++;
    975                 count -= 8;
    976                 continue;
    977             }
    978 
    979             // Load 8 pixels of dst.
    980             __m128i dst_pixel = _mm_load_si128(d);
    981 
    982             // Extract A from src.
    983             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
    984             sa1 = _mm_srli_epi32(sa1, 24);
    985             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
    986             sa2 = _mm_srli_epi32(sa2, 24);
    987             __m128i sa = _mm_packs_epi32(sa1, sa2);
    988 
    989             // Extract R from src.
    990             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
    991             sr1 = _mm_srli_epi32(sr1, 24);
    992             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
    993             sr2 = _mm_srli_epi32(sr2, 24);
    994             __m128i sr = _mm_packs_epi32(sr1, sr2);
    995 
    996             // Extract G from src.
    997             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
    998             sg1 = _mm_srli_epi32(sg1, 24);
    999             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
   1000             sg2 = _mm_srli_epi32(sg2, 24);
   1001             __m128i sg = _mm_packs_epi32(sg1, sg2);
   1002 
   1003             // Extract B from src.
   1004             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
   1005             sb1 = _mm_srli_epi32(sb1, 24);
   1006             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
   1007             sb2 = _mm_srli_epi32(sb2, 24);
   1008             __m128i sb = _mm_packs_epi32(sb1, sb2);
   1009 
   1010             // Extract R G B from dst.
   1011             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
   1012             dr = _mm_and_si128(dr, r16_mask);
   1013             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
   1014             dg = _mm_and_si128(dg, g16_mask);
   1015             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
   1016             db = _mm_and_si128(db, b16_mask);
   1017 
   1018             __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
   1019 
   1020             // Calculate R G B of result.
   1021             // Original algorithm is in SkSrcOver32To16().
   1022             dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
   1023             dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
   1024             dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
   1025             dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
   1026             db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
   1027             db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
   1028 
   1029             // Pack R G B into 16-bit color.
   1030             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
   1031 
   1032             // Store 8 16-bit colors in dst.
   1033             _mm_store_si128(d++, d_pixel);
   1034             count -= 8;
   1035         }
   1036 
   1037         src = reinterpret_cast<const SkPMColor*>(s);
   1038         dst = reinterpret_cast<uint16_t*>(d);
   1039     }
   1040 
   1041     if (count > 0) {
   1042         do {
   1043             SkPMColor c = *src++;
   1044             SkPMColorAssert(c);
   1045             if (c) {
   1046                 *dst = SkSrcOver32To16(c, *dst);
   1047             }
   1048             dst += 1;
   1049         } while (--count != 0);
   1050     }
   1051 }
   1052 
   1053 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
   1054                                  const SkPMColor* SK_RESTRICT src,
   1055                                  int count, U8CPU alpha, int x, int y) {
   1056     SkASSERT(255 == alpha);
   1057 
   1058     if (count <= 0) {
   1059         return;
   1060     }
   1061 
   1062     if (count >= 8) {
   1063         while (((size_t)dst & 0x0F) != 0) {
   1064             DITHER_565_SCAN(y);
   1065             SkPMColor c = *src++;
   1066             SkPMColorAssert(c);
   1067 
   1068             unsigned dither = DITHER_VALUE(x);
   1069             *dst++ = SkDitherRGB32To565(c, dither);
   1070             DITHER_INC_X(x);
   1071             count--;
   1072         }
   1073 
   1074         unsigned short dither_value[8];
   1075         __m128i dither;
   1076 #ifdef ENABLE_DITHER_MATRIX_4X4
   1077         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
   1078         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
   1079         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
   1080         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
   1081         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
   1082 #else
   1083         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
   1084         dither_value[0] = dither_value[4] = (dither_scan
   1085                                              >> (((x) & 3) << 2)) & 0xF;
   1086         dither_value[1] = dither_value[5] = (dither_scan
   1087                                              >> (((x + 1) & 3) << 2)) & 0xF;
   1088         dither_value[2] = dither_value[6] = (dither_scan
   1089                                              >> (((x + 2) & 3) << 2)) & 0xF;
   1090         dither_value[3] = dither_value[7] = (dither_scan
   1091                                              >> (((x + 3) & 3) << 2)) & 0xF;
   1092 #endif
   1093         dither = _mm_loadu_si128((__m128i*) dither_value);
   1094 
   1095         const __m128i* s = reinterpret_cast<const __m128i*>(src);
   1096         __m128i* d = reinterpret_cast<__m128i*>(dst);
   1097 
   1098         while (count >= 8) {
   1099             // Load 8 pixels of src.
   1100             __m128i src_pixel1 = _mm_loadu_si128(s++);
   1101             __m128i src_pixel2 = _mm_loadu_si128(s++);
   1102 
   1103             // Extract R from src.
   1104             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
   1105             sr1 = _mm_srli_epi32(sr1, 24);
   1106             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
   1107             sr2 = _mm_srli_epi32(sr2, 24);
   1108             __m128i sr = _mm_packs_epi32(sr1, sr2);
   1109 
   1110             // SkDITHER_R32To565(sr, dither)
   1111             __m128i sr_offset = _mm_srli_epi16(sr, 5);
   1112             sr = _mm_add_epi16(sr, dither);
   1113             sr = _mm_sub_epi16(sr, sr_offset);
   1114             sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
   1115 
   1116             // Extract G from src.
   1117             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
   1118             sg1 = _mm_srli_epi32(sg1, 24);
   1119             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
   1120             sg2 = _mm_srli_epi32(sg2, 24);
   1121             __m128i sg = _mm_packs_epi32(sg1, sg2);
   1122 
   1123             // SkDITHER_R32To565(sg, dither)
   1124             __m128i sg_offset = _mm_srli_epi16(sg, 6);
   1125             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
   1126             sg = _mm_sub_epi16(sg, sg_offset);
   1127             sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
   1128 
   1129             // Extract B from src.
   1130             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
   1131             sb1 = _mm_srli_epi32(sb1, 24);
   1132             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
   1133             sb2 = _mm_srli_epi32(sb2, 24);
   1134             __m128i sb = _mm_packs_epi32(sb1, sb2);
   1135 
   1136             // SkDITHER_R32To565(sb, dither)
   1137             __m128i sb_offset = _mm_srli_epi16(sb, 5);
   1138             sb = _mm_add_epi16(sb, dither);
   1139             sb = _mm_sub_epi16(sb, sb_offset);
   1140             sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
   1141 
   1142             // Pack and store 16-bit dst pixel.
   1143             __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
   1144             _mm_store_si128(d++, d_pixel);
   1145 
   1146             count -= 8;
   1147             x += 8;
   1148         }
   1149 
   1150         src = reinterpret_cast<const SkPMColor*>(s);
   1151         dst = reinterpret_cast<uint16_t*>(d);
   1152     }
   1153 
   1154     if (count > 0) {
   1155         DITHER_565_SCAN(y);
   1156         do {
   1157             SkPMColor c = *src++;
   1158             SkPMColorAssert(c);
   1159 
   1160             unsigned dither = DITHER_VALUE(x);
   1161             *dst++ = SkDitherRGB32To565(c, dither);
   1162             DITHER_INC_X(x);
   1163         } while (--count != 0);
   1164     }
   1165 }
   1166 
   1167 /* SSE2 version of S32A_D565_Opaque_Dither()
   1168  * portable version is in core/SkBlitRow_D16.cpp
   1169  */
   1170 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
   1171                                   const SkPMColor* SK_RESTRICT src,
   1172                                   int count, U8CPU alpha, int x, int y) {
   1173     SkASSERT(255 == alpha);
   1174 
   1175     if (count <= 0) {
   1176         return;
   1177     }
   1178 
   1179     if (count >= 8) {
   1180         while (((size_t)dst & 0x0F) != 0) {
   1181             DITHER_565_SCAN(y);
   1182             SkPMColor c = *src++;
   1183             SkPMColorAssert(c);
   1184             if (c) {
   1185                 unsigned a = SkGetPackedA32(c);
   1186 
   1187                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
   1188 
   1189                 unsigned sr = SkGetPackedR32(c);
   1190                 unsigned sg = SkGetPackedG32(c);
   1191                 unsigned sb = SkGetPackedB32(c);
   1192                 sr = SkDITHER_R32_FOR_565(sr, d);
   1193                 sg = SkDITHER_G32_FOR_565(sg, d);
   1194                 sb = SkDITHER_B32_FOR_565(sb, d);
   1195 
   1196                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1197                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
   1198                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1199                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1200                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1201             }
   1202             dst += 1;
   1203             DITHER_INC_X(x);
   1204             count--;
   1205         }
   1206 
   1207         unsigned short dither_value[8];
   1208         __m128i dither, dither_cur;
   1209 #ifdef ENABLE_DITHER_MATRIX_4X4
   1210         const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
   1211         dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
   1212         dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
   1213         dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
   1214         dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
   1215 #else
   1216         const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
   1217         dither_value[0] = dither_value[4] = (dither_scan
   1218                                              >> (((x) & 3) << 2)) & 0xF;
   1219         dither_value[1] = dither_value[5] = (dither_scan
   1220                                              >> (((x + 1) & 3) << 2)) & 0xF;
   1221         dither_value[2] = dither_value[6] = (dither_scan
   1222                                              >> (((x + 2) & 3) << 2)) & 0xF;
   1223         dither_value[3] = dither_value[7] = (dither_scan
   1224                                              >> (((x + 3) & 3) << 2)) & 0xF;
   1225 #endif
   1226         dither = _mm_loadu_si128((__m128i*) dither_value);
   1227 
   1228         const __m128i* s = reinterpret_cast<const __m128i*>(src);
   1229         __m128i* d = reinterpret_cast<__m128i*>(dst);
   1230         __m128i var256 = _mm_set1_epi16(256);
   1231         __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
   1232         __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
   1233         __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
   1234 
   1235         while (count >= 8) {
   1236             // Load 8 pixels of src and dst.
   1237             __m128i src_pixel1 = _mm_loadu_si128(s++);
   1238             __m128i src_pixel2 = _mm_loadu_si128(s++);
   1239             __m128i dst_pixel = _mm_load_si128(d);
   1240 
   1241             // Extract A from src.
   1242             __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
   1243             sa1 = _mm_srli_epi32(sa1, 24);
   1244             __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
   1245             sa2 = _mm_srli_epi32(sa2, 24);
   1246             __m128i sa = _mm_packs_epi32(sa1, sa2);
   1247 
   1248             // Calculate current dither value.
   1249             dither_cur = _mm_mullo_epi16(dither,
   1250                                          _mm_add_epi16(sa, _mm_set1_epi16(1)));
   1251             dither_cur = _mm_srli_epi16(dither_cur, 8);
   1252 
   1253             // Extract R from src.
   1254             __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
   1255             sr1 = _mm_srli_epi32(sr1, 24);
   1256             __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
   1257             sr2 = _mm_srli_epi32(sr2, 24);
   1258             __m128i sr = _mm_packs_epi32(sr1, sr2);
   1259 
   1260             // SkDITHER_R32_FOR_565(sr, d)
   1261             __m128i sr_offset = _mm_srli_epi16(sr, 5);
   1262             sr = _mm_add_epi16(sr, dither_cur);
   1263             sr = _mm_sub_epi16(sr, sr_offset);
   1264 
   1265             // Expand sr.
   1266             sr = _mm_slli_epi16(sr, 2);
   1267 
   1268             // Extract G from src.
   1269             __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
   1270             sg1 = _mm_srli_epi32(sg1, 24);
   1271             __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
   1272             sg2 = _mm_srli_epi32(sg2, 24);
   1273             __m128i sg = _mm_packs_epi32(sg1, sg2);
   1274 
   1275             // sg = SkDITHER_G32_FOR_565(sg, d).
   1276             __m128i sg_offset = _mm_srli_epi16(sg, 6);
   1277             sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
   1278             sg = _mm_sub_epi16(sg, sg_offset);
   1279 
   1280             // Expand sg.
   1281             sg = _mm_slli_epi16(sg, 3);
   1282 
   1283             // Extract B from src.
   1284             __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
   1285             sb1 = _mm_srli_epi32(sb1, 24);
   1286             __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
   1287             sb2 = _mm_srli_epi32(sb2, 24);
   1288             __m128i sb = _mm_packs_epi32(sb1, sb2);
   1289 
   1290             // sb = SkDITHER_B32_FOR_565(sb, d).
   1291             __m128i sb_offset = _mm_srli_epi16(sb, 5);
   1292             sb = _mm_add_epi16(sb, dither_cur);
   1293             sb = _mm_sub_epi16(sb, sb_offset);
   1294 
   1295             // Expand sb.
   1296             sb = _mm_slli_epi16(sb, 2);
   1297 
   1298             // Extract R G B from dst.
   1299             __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
   1300             dr = _mm_and_si128(dr, r16_mask);
   1301             __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
   1302             dg = _mm_and_si128(dg, g16_mask);
   1303             __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
   1304             db = _mm_and_si128(db, b16_mask);
   1305 
   1306             // SkAlpha255To256(255 - a) >> 3
   1307             __m128i isa = _mm_sub_epi16(var256, sa);
   1308             isa = _mm_srli_epi16(isa, 3);
   1309 
   1310             dr = _mm_mullo_epi16(dr, isa);
   1311             dr = _mm_add_epi16(dr, sr);
   1312             dr = _mm_srli_epi16(dr, 5);
   1313 
   1314             dg = _mm_mullo_epi16(dg, isa);
   1315             dg = _mm_add_epi16(dg, sg);
   1316             dg = _mm_srli_epi16(dg, 5);
   1317 
   1318             db = _mm_mullo_epi16(db, isa);
   1319             db = _mm_add_epi16(db, sb);
   1320             db = _mm_srli_epi16(db, 5);
   1321 
   1322             // Package and store dst pixel.
   1323             __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
   1324             _mm_store_si128(d++, d_pixel);
   1325 
   1326             count -= 8;
   1327             x += 8;
   1328         }
   1329 
   1330         src = reinterpret_cast<const SkPMColor*>(s);
   1331         dst = reinterpret_cast<uint16_t*>(d);
   1332     }
   1333 
   1334     if (count > 0) {
   1335         DITHER_565_SCAN(y);
   1336         do {
   1337             SkPMColor c = *src++;
   1338             SkPMColorAssert(c);
   1339             if (c) {
   1340                 unsigned a = SkGetPackedA32(c);
   1341 
   1342                 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
   1343 
   1344                 unsigned sr = SkGetPackedR32(c);
   1345                 unsigned sg = SkGetPackedG32(c);
   1346                 unsigned sb = SkGetPackedB32(c);
   1347                 sr = SkDITHER_R32_FOR_565(sr, d);
   1348                 sg = SkDITHER_G32_FOR_565(sg, d);
   1349                 sb = SkDITHER_B32_FOR_565(sb, d);
   1350 
   1351                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
   1352                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
   1353                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
   1354                 // now src and dst expanded are in g:11 r:10 x:1 b:10
   1355                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
   1356             }
   1357             dst += 1;
   1358             DITHER_INC_X(x);
   1359         } while (--count != 0);
   1360     }
   1361 }
   1362