Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2012 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 
      9 #include "SkBlitRow_opts_SSE2.h"
     10 #include "SkColorPriv.h"
     11 #include "SkUtils.h"
     12 
     13 #include <emmintrin.h>
     14 
     15 /* SSE2 version of S32_Blend_BlitRow32()
     16  * portable version is in core/SkBlitRow_D32.cpp
     17  */
     18 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     19                               const SkPMColor* SK_RESTRICT src,
     20                               int count, U8CPU alpha) {
     21     SkASSERT(alpha <= 255);
     22     if (count <= 0) {
     23         return;
     24     }
     25 
     26     uint32_t src_scale = SkAlpha255To256(alpha);
     27     uint32_t dst_scale = 256 - src_scale;
     28 
     29     if (count >= 4) {
     30         SkASSERT(((size_t)dst & 0x03) == 0);
     31         while (((size_t)dst & 0x0F) != 0) {
     32             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     33             src++;
     34             dst++;
     35             count--;
     36         }
     37 
     38         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     39         __m128i *d = reinterpret_cast<__m128i*>(dst);
     40         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     41         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
     42 
     43         // Move scale factors to upper byte of word
     44         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
     45         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
     46         while (count >= 4) {
     47             // Load 4 pixels each of src and dest.
     48             __m128i src_pixel = _mm_loadu_si128(s);
     49             __m128i dst_pixel = _mm_load_si128(d);
     50 
     51             // Interleave Atom port 0/1 operations based on the execution port
     52             // constraints that multiply can only be executed on port 0 (while
     53             // boolean operations can be executed on either port 0 or port 1)
     54             // because GCC currently doesn't do a good job scheduling
     55             // instructions based on these constraints.
     56 
     57             // Get red and blue pixels into lower byte of each word.
     58             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
     59             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
     60 
     61             // Multiply by scale.
     62             // (4 x (0, rs.h, 0, bs.h))
     63             // where rs.h stands for the higher byte of r * scale, and
     64             // bs.h the higher byte of b * scale.
     65             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
     66 
     67             // Get alpha and green pixels into higher byte of each word.
     68             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
     69             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
     70 
     71             // Multiply by scale.
     72             // (4 x (as.h, as.l, gs.h, gs.l))
     73             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
     74 
     75             // Clear the lower byte of the a*scale and g*scale results
     76             // (4 x (as.h, 0, gs.h, 0))
     77             src_ag = _mm_and_si128(src_ag, ag_mask);
     78 
     79             // Operations the destination pixels are the same as on the
     80             // source pixels. See the comments above.
     81             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     82             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
     83             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
     84             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
     85             dst_ag = _mm_and_si128(dst_ag, ag_mask);
     86 
     87             // Combine back into RGBA.
     88             // (4 x (as.h, rs.h, gs.h, bs.h))
     89             src_pixel = _mm_or_si128(src_rb, src_ag);
     90             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
     91 
     92             // Add result
     93             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     94             _mm_store_si128(d, result);
     95             s++;
     96             d++;
     97             count -= 4;
     98         }
     99         src = reinterpret_cast<const SkPMColor*>(s);
    100         dst = reinterpret_cast<SkPMColor*>(d);
    101     }
    102 
    103     while (count > 0) {
    104         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    105         src++;
    106         dst++;
    107         count--;
    108     }
    109 }
    110 
    111 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    112                                 const SkPMColor* SK_RESTRICT src,
    113                                 int count, U8CPU alpha) {
    114     SkASSERT(alpha == 255);
    115     if (count <= 0) {
    116         return;
    117     }
    118 
    119     if (count >= 4) {
    120         SkASSERT(((size_t)dst & 0x03) == 0);
    121         while (((size_t)dst & 0x0F) != 0) {
    122             *dst = SkPMSrcOver(*src, *dst);
    123             src++;
    124             dst++;
    125             count--;
    126         }
    127 
    128         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    129         __m128i *d = reinterpret_cast<__m128i*>(dst);
    130 #ifdef SK_USE_ACCURATE_BLENDING
    131         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    132         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
    133         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
    134         while (count >= 4) {
    135             // Load 4 pixels
    136             __m128i src_pixel = _mm_loadu_si128(s);
    137             __m128i dst_pixel = _mm_load_si128(d);
    138 
    139             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    140             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    141             // Shift alphas down to lower 8 bits of each quad.
    142             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
    143 
    144             // Copy alpha to upper 3rd byte of each quad
    145             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
    146 
    147             // Subtract alphas from 255, to get 0..255
    148             alpha = _mm_sub_epi16(c_255, alpha);
    149 
    150             // Multiply by red and blue by src alpha.
    151             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    152             // Multiply by alpha and green by src alpha.
    153             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    154 
    155             // dst_rb_low = (dst_rb >> 8)
    156             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
    157             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
    158 
    159             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
    160             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
    161             dst_rb = _mm_add_epi16(dst_rb, c_128);
    162             dst_rb = _mm_srli_epi16(dst_rb, 8);
    163 
    164             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
    165             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
    166             dst_ag = _mm_add_epi16(dst_ag, c_128);
    167             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    168 
    169             // Combine back into RGBA.
    170             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    171 
    172             // Add result
    173             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    174             _mm_store_si128(d, result);
    175             s++;
    176             d++;
    177             count -= 4;
    178         }
    179     #else
    180         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    181         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
    182         while (count >= 4) {
    183             // Load 4 pixels
    184             __m128i src_pixel = _mm_loadu_si128(s);
    185             __m128i dst_pixel = _mm_load_si128(d);
    186 
    187             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    188             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    189 
    190             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
    191             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
    192 
    193             // (a0, a0, a1, a1, a2, g2, a3, g3)
    194             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
    195 
    196             // (a0, a0, a1, a1, a2, a2, a3, a3)
    197             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
    198 
    199             // Subtract alphas from 256, to get 1..256
    200             alpha = _mm_sub_epi16(c_256, alpha);
    201 
    202             // Multiply by red and blue by src alpha.
    203             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    204             // Multiply by alpha and green by src alpha.
    205             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    206 
    207             // Divide by 256.
    208             dst_rb = _mm_srli_epi16(dst_rb, 8);
    209 
    210             // Mask out high bits (already in the right place)
    211             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    212 
    213             // Combine back into RGBA.
    214             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    215 
    216             // Add result
    217             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    218             _mm_store_si128(d, result);
    219             s++;
    220             d++;
    221             count -= 4;
    222         }
    223 #endif
    224         src = reinterpret_cast<const SkPMColor*>(s);
    225         dst = reinterpret_cast<SkPMColor*>(d);
    226     }
    227 
    228     while (count > 0) {
    229         *dst = SkPMSrcOver(*src, *dst);
    230         src++;
    231         dst++;
    232         count--;
    233     }
    234 }
    235 
    236 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    237                                const SkPMColor* SK_RESTRICT src,
    238                                int count, U8CPU alpha) {
    239     SkASSERT(alpha <= 255);
    240     if (count <= 0) {
    241         return;
    242     }
    243 
    244     if (count >= 4) {
    245         while (((size_t)dst & 0x0F) != 0) {
    246             *dst = SkBlendARGB32(*src, *dst, alpha);
    247             src++;
    248             dst++;
    249             count--;
    250         }
    251 
    252         uint32_t src_scale = SkAlpha255To256(alpha);
    253 
    254         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    255         __m128i *d = reinterpret_cast<__m128i*>(dst);
    256         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
    257         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    258         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
    259         while (count >= 4) {
    260             // Load 4 pixels each of src and dest.
    261             __m128i src_pixel = _mm_loadu_si128(s);
    262             __m128i dst_pixel = _mm_load_si128(d);
    263 
    264             // Get red and blue pixels into lower byte of each word.
    265             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    266             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    267 
    268             // Get alpha and green into lower byte of each word.
    269             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    270             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    271 
    272             // Put per-pixel alpha in low byte of each word.
    273             // After the following two statements, the dst_alpha looks like
    274             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
    275             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
    276             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
    277 
    278             // dst_alpha = dst_alpha * src_scale
    279             // Because src_scales are in the higher byte of each word and
    280             // we use mulhi here, the resulting alpha values are already
    281             // in the right place and don't need to be divided by 256.
    282             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
    283             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
    284 
    285             // Subtract alphas from 256, to get 1..256
    286             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
    287 
    288             // Multiply red and blue by dst pixel alpha.
    289             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
    290             // Multiply alpha and green by dst pixel alpha.
    291             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
    292 
    293             // Multiply red and blue by global alpha.
    294             // (4 x (0, rs.h, 0, bs.h))
    295             // where rs.h stands for the higher byte of r * src_scale,
    296             // and bs.h the higher byte of b * src_scale.
    297             // Again, because we use mulhi, the resuling red and blue
    298             // values are already in the right place and don't need to
    299             // be divided by 256.
    300             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
    301             // Multiply alpha and green by global alpha.
    302             // (4 x (0, as.h, 0, gs.h))
    303             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
    304 
    305             // Divide by 256.
    306             dst_rb = _mm_srli_epi16(dst_rb, 8);
    307 
    308             // Mask out low bits (goodies already in the right place; no need to divide)
    309             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    310             // Shift alpha and green to higher byte of each word.
    311             // (4 x (as.h, 0, gs.h, 0))
    312             src_ag = _mm_slli_epi16(src_ag, 8);
    313 
    314             // Combine back into RGBA.
    315             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    316             src_pixel = _mm_or_si128(src_rb, src_ag);
    317 
    318             // Add two pixels into result.
    319             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    320             _mm_store_si128(d, result);
    321             s++;
    322             d++;
    323             count -= 4;
    324         }
    325         src = reinterpret_cast<const SkPMColor*>(s);
    326         dst = reinterpret_cast<SkPMColor*>(d);
    327     }
    328 
    329     while (count > 0) {
    330         *dst = SkBlendARGB32(*src, *dst, alpha);
    331         src++;
    332         dst++;
    333         count--;
    334     }
    335 }
    336 
    337 /* SSE2 version of Color32()
    338  * portable version is in core/SkBlitRow_D32.cpp
    339  */
    340 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
    341                   SkPMColor color) {
    342 
    343     if (count <= 0) {
    344         return;
    345     }
    346 
    347     if (0 == color) {
    348         if (src != dst) {
    349             memcpy(dst, src, count * sizeof(SkPMColor));
    350         }
    351         return;
    352     }
    353 
    354     unsigned colorA = SkGetPackedA32(color);
    355     if (255 == colorA) {
    356         sk_memset32(dst, color, count);
    357     } else {
    358         unsigned scale = 256 - SkAlpha255To256(colorA);
    359 
    360         if (count >= 4) {
    361             SkASSERT(((size_t)dst & 0x03) == 0);
    362             while (((size_t)dst & 0x0F) != 0) {
    363                 *dst = color + SkAlphaMulQ(*src, scale);
    364                 src++;
    365                 dst++;
    366                 count--;
    367             }
    368 
    369             const __m128i *s = reinterpret_cast<const __m128i*>(src);
    370             __m128i *d = reinterpret_cast<__m128i*>(dst);
    371             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    372             __m128i src_scale_wide = _mm_set1_epi16(scale);
    373             __m128i color_wide = _mm_set1_epi32(color);
    374             while (count >= 4) {
    375                 // Load 4 pixels each of src and dest.
    376                 __m128i src_pixel = _mm_loadu_si128(s);
    377 
    378                 // Get red and blue pixels into lower byte of each word.
    379                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    380 
    381                 // Get alpha and green into lower byte of each word.
    382                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    383 
    384                 // Multiply by scale.
    385                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    386                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    387 
    388                 // Divide by 256.
    389                 src_rb = _mm_srli_epi16(src_rb, 8);
    390                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
    391 
    392                 // Combine back into RGBA.
    393                 src_pixel = _mm_or_si128(src_rb, src_ag);
    394 
    395                 // Add color to result.
    396                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
    397 
    398                 // Store result.
    399                 _mm_store_si128(d, result);
    400                 s++;
    401                 d++;
    402                 count -= 4;
    403             }
    404             src = reinterpret_cast<const SkPMColor*>(s);
    405             dst = reinterpret_cast<SkPMColor*>(d);
    406          }
    407 
    408         while (count > 0) {
    409             *dst = color + SkAlphaMulQ(*src, scale);
    410             src += 1;
    411             dst += 1;
    412             count--;
    413         }
    414     }
    415 }
    416 
    417 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
    418                                size_t maskRB, SkColor origColor,
    419                                int width, int height) {
    420     SkPMColor color = SkPreMultiplyColor(origColor);
    421     size_t dstOffset = dstRB - (width << 2);
    422     size_t maskOffset = maskRB - width;
    423     SkPMColor* dst = (SkPMColor *)device;
    424     const uint8_t* mask = (const uint8_t*)maskPtr;
    425     do {
    426         int count = width;
    427         if (count >= 4) {
    428             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
    429                 *dst = SkBlendARGB32(color, *dst, *mask);
    430                 mask++;
    431                 dst++;
    432                 count--;
    433             }
    434             __m128i *d = reinterpret_cast<__m128i*>(dst);
    435             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    436             __m128i c_256 = _mm_set1_epi16(256);
    437             __m128i c_1 = _mm_set1_epi16(1);
    438             __m128i src_pixel = _mm_set1_epi32(color);
    439             while (count >= 4) {
    440                 // Load 4 pixels each of src and dest.
    441                 __m128i dst_pixel = _mm_load_si128(d);
    442 
    443                 //set the aphla value
    444                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
    445                                 0, *(mask+3),0, \
    446                                 *(mask+2),0, *(mask+2),\
    447                                 0,*(mask+1), 0,*(mask+1),\
    448                                 0, *mask,0,*mask);
    449 
    450                 //call SkAlpha255To256()
    451                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
    452 
    453                 // Get red and blue pixels into lower byte of each word.
    454                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    455                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    456 
    457                 // Get alpha and green into lower byte of each word.
    458                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    459                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    460 
    461                 // Put per-pixel alpha in low byte of each word.
    462                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
    463                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
    464 
    465                 // dst_alpha = dst_alpha * src_scale
    466                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
    467 
    468                 // Divide by 256.
    469                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
    470 
    471                 // Subtract alphas from 256, to get 1..256
    472                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
    473                 // Multiply red and blue by dst pixel alpha.
    474                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
    475                 // Multiply alpha and green by dst pixel alpha.
    476                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
    477 
    478                 // Multiply red and blue by global alpha.
    479                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    480                 // Multiply alpha and green by global alpha.
    481                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    482                 // Divide by 256.
    483                 dst_rb = _mm_srli_epi16(dst_rb, 8);
    484                 src_rb = _mm_srli_epi16(src_rb, 8);
    485 
    486                 // Mask out low bits (goodies already in the right place; no need to divide)
    487                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    488                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
    489 
    490                 // Combine back into RGBA.
    491                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    492                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
    493 
    494                 // Add two pixels into result.
    495                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
    496                 _mm_store_si128(d, result);
    497                 // load the next 4 pixel
    498                 mask = mask + 4;
    499                 d++;
    500                 count -= 4;
    501             }
    502             dst = reinterpret_cast<SkPMColor *>(d);
    503         }
    504         while(count > 0) {
    505             *dst= SkBlendARGB32(color, *dst, *mask);
    506             dst += 1;
    507             mask++;
    508             count --;
    509         }
    510         dst = (SkPMColor *)((char*)dst + dstOffset);
    511         mask += maskOffset;
    512     } while (--height != 0);
    513 }
    514 
    515 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
    516                                  __m128i &mask, __m128i &scale) {
    517     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    518     __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
    519                               16-SK_R16_SHIFT-(SK_R16_BITS-5)),
    520                               _mm_set1_epi32(0x001F0000));
    521 
    522     __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
    523                               8-SK_G16_SHIFT-(SK_G16_BITS-5)),
    524                               _mm_set1_epi32(0x00001F00));
    525 
    526     __m128i b = _mm_and_si128(_mm_slli_epi32(mask,
    527                               SK_B16_BITS-5),
    528                               _mm_set1_epi32(0x0000001F));
    529 
    530     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    531     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    532 
    533     // Interleave R,G,B into the lower byte of word.
    534     __m128i maskLo, maskHi;
    535     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    536     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    537 
    538     // Upscale to 0..32
    539     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    540     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    541 
    542     maskLo = _mm_mullo_epi16(maskLo, scale);
    543     maskHi = _mm_mullo_epi16(maskHi, scale);
    544 
    545     maskLo = _mm_srli_epi16(maskLo, 8);
    546     maskHi = _mm_srli_epi16(maskHi, 8);
    547 
    548     // Interleave R,G,B into the lower byte of the word.
    549     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    550     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    551 
    552     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
    553     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
    554 
    555     maskLo = _mm_srai_epi16(maskLo, 5);
    556     maskHi = _mm_srai_epi16(maskHi, 5);
    557 
    558     // Add two pixels into result.
    559     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    560     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    561 
    562     // Pack into 4 32bit dst pixels
    563     return _mm_packus_epi16(resultLo, resultHi);
    564 }
    565 
    566 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
    567                                        __m128i &mask) {
    568     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    569     __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
    570                               16-SK_R16_SHIFT-(SK_R16_BITS-5)),
    571                               _mm_set1_epi32(0x001F0000));
    572 
    573     __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
    574                               8-SK_G16_SHIFT-(SK_G16_BITS-5)),
    575                               _mm_set1_epi32(0x00001F00));
    576 
    577     __m128i b = _mm_and_si128(_mm_slli_epi32(mask, SK_B16_BITS-5),
    578                               _mm_set1_epi32(0x0000001F));
    579 
    580     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    581     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    582 
    583     // Interleave R,G,B into the lower byte of word.
    584     __m128i maskLo, maskHi;
    585     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    586     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    587 
    588     // Upscale to 0..32
    589     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    590     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    591 
    592     // Interleave R,G,B into the lower byte of the word.
    593     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    594     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    595 
    596     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
    597     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
    598 
    599     maskLo = _mm_srai_epi16(maskLo, 5);
    600     maskHi = _mm_srai_epi16(maskHi, 5);
    601 
    602     // Add two pixels into result.
    603     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    604     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    605 
    606     // Pack into 4 32bit dst pixels
    607     return _mm_packus_epi16(resultLo, resultHi);
    608 }
    609 
    610 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
    611                          SkColor color, int width, SkPMColor) {
    612     if (width <= 0) {
    613         return;
    614     }
    615 
    616     int srcA = SkColorGetA(color);
    617     int srcR = SkColorGetR(color);
    618     int srcG = SkColorGetG(color);
    619     int srcB = SkColorGetB(color);
    620 
    621     srcA = SkAlpha255To256(srcA);
    622 
    623     if (width >= 4) {
    624         SkASSERT(((size_t)dst & 0x03) == 0);
    625         while (((size_t)dst & 0x0F) != 0) {
    626             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
    627             src++;
    628             dst++;
    629             width--;
    630         }
    631 
    632         __m128i *d = reinterpret_cast<__m128i*>(dst);
    633         __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    634         srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
    635         __m128i scale = _mm_set1_epi16(srcA);
    636         while (width >= 4) {
    637             __m128i dst_pixel = _mm_load_si128(d);
    638             __m128i mask_pixel = _mm_loadl_epi64(
    639                                      reinterpret_cast<const __m128i*>(src));
    640 
    641             // Check whether mask_pixels are equal to 0 and get the highest bit
    642             // of each byte of result, if mask pixes are all zero, we will get
    643             // pack_cmp to 0xFFFF
    644             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
    645                                              _mm_setzero_si128()));
    646 
    647             // if mask pixels are not all zero, we will blend the dst pixels
    648             if (pack_cmp != 0xFFFF) {
    649                 // Unpack 4 16bit mask pixels to
    650                 // (p0, 0, p1, 0, p2, 0, p3, 0)
    651                 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
    652                                                 _mm_setzero_si128());
    653 
    654                 // Process 4 32bit dst pixels
    655                 __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
    656                                                    mask_pixel, scale);
    657                 _mm_store_si128(d, result);
    658             }
    659 
    660             d++;
    661             src += 4;
    662             width -= 4;
    663         }
    664 
    665         dst = reinterpret_cast<SkPMColor*>(d);
    666     }
    667 
    668     while (width > 0) {
    669         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
    670         src++;
    671         dst++;
    672         width--;
    673     }
    674 }
    675 
    676 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
    677                                SkColor color, int width, SkPMColor opaqueDst) {
    678     if (width <= 0) {
    679         return;
    680     }
    681 
    682     int srcR = SkColorGetR(color);
    683     int srcG = SkColorGetG(color);
    684     int srcB = SkColorGetB(color);
    685 
    686     if (width >= 4) {
    687         SkASSERT(((size_t)dst & 0x03) == 0);
    688         while (((size_t)dst & 0x0F) != 0) {
    689             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
    690             src++;
    691             dst++;
    692             width--;
    693         }
    694 
    695         __m128i *d = reinterpret_cast<__m128i*>(dst);
    696         __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    697         srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
    698         while (width >= 4) {
    699             __m128i dst_pixel = _mm_load_si128(d);
    700             __m128i mask_pixel = _mm_loadl_epi64(
    701                                      reinterpret_cast<const __m128i*>(src));
    702 
    703             // Check whether mask_pixels are equal to 0 and get the highest bit
    704             // of each byte of result, if mask pixes are all zero, we will get
    705             // pack_cmp to 0xFFFF
    706             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
    707                                              _mm_setzero_si128()));
    708 
    709             // if mask pixels are not all zero, we will blend the dst pixels
    710             if (pack_cmp != 0xFFFF) {
    711                 // Unpack 4 16bit mask pixels to
    712                 // (p0, 0, p1, 0, p2, 0, p3, 0)
    713                 mask_pixel = _mm_unpacklo_epi16(mask_pixel,
    714                                                 _mm_setzero_si128());
    715 
    716                 // Process 4 32bit dst pixels
    717                 __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
    718                                                          mask_pixel);
    719                 _mm_store_si128(d, result);
    720             }
    721 
    722             d++;
    723             src += 4;
    724             width -= 4;
    725         }
    726 
    727         dst = reinterpret_cast<SkPMColor*>(d);
    728     }
    729 
    730     while (width > 0) {
    731         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
    732         src++;
    733         dst++;
    734         width--;
    735     }
    736 }
    737