Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2012 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 
      9 #include "SkBlitRow_opts_SSE2.h"
     10 #include "SkBitmapProcState_opts_SSE2.h"
     11 #include "SkColorPriv.h"
     12 #include "SkUtils.h"
     13 
     14 #include <emmintrin.h>
     15 
     16 /* SSE2 version of S32_Blend_BlitRow32()
     17  * portable version is in core/SkBlitRow_D32.cpp
     18  */
     19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     20                               const SkPMColor* SK_RESTRICT src,
     21                               int count, U8CPU alpha) {
     22     SkASSERT(alpha <= 255);
     23     if (count <= 0) {
     24         return;
     25     }
     26 
     27     uint32_t src_scale = SkAlpha255To256(alpha);
     28     uint32_t dst_scale = 256 - src_scale;
     29 
     30     if (count >= 4) {
     31         SkASSERT(((size_t)dst & 0x03) == 0);
     32         while (((size_t)dst & 0x0F) != 0) {
     33             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     34             src++;
     35             dst++;
     36             count--;
     37         }
     38 
     39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     40         __m128i *d = reinterpret_cast<__m128i*>(dst);
     41         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     42         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
     43 
     44         // Move scale factors to upper byte of word
     45         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
     46         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
     47         while (count >= 4) {
     48             // Load 4 pixels each of src and dest.
     49             __m128i src_pixel = _mm_loadu_si128(s);
     50             __m128i dst_pixel = _mm_load_si128(d);
     51 
     52             // Interleave Atom port 0/1 operations based on the execution port
     53             // constraints that multiply can only be executed on port 0 (while
     54             // boolean operations can be executed on either port 0 or port 1)
     55             // because GCC currently doesn't do a good job scheduling
     56             // instructions based on these constraints.
     57 
     58             // Get red and blue pixels into lower byte of each word.
     59             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
     60             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
     61 
     62             // Multiply by scale.
     63             // (4 x (0, rs.h, 0, bs.h))
     64             // where rs.h stands for the higher byte of r * scale, and
     65             // bs.h the higher byte of b * scale.
     66             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
     67 
     68             // Get alpha and green pixels into higher byte of each word.
     69             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
     70             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
     71 
     72             // Multiply by scale.
     73             // (4 x (as.h, as.l, gs.h, gs.l))
     74             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
     75 
     76             // Clear the lower byte of the a*scale and g*scale results
     77             // (4 x (as.h, 0, gs.h, 0))
     78             src_ag = _mm_and_si128(src_ag, ag_mask);
     79 
     80             // Operations the destination pixels are the same as on the
     81             // source pixels. See the comments above.
     82             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     83             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
     84             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
     85             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
     86             dst_ag = _mm_and_si128(dst_ag, ag_mask);
     87 
     88             // Combine back into RGBA.
     89             // (4 x (as.h, rs.h, gs.h, bs.h))
     90             src_pixel = _mm_or_si128(src_rb, src_ag);
     91             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
     92 
     93             // Add result
     94             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     95             _mm_store_si128(d, result);
     96             s++;
     97             d++;
     98             count -= 4;
     99         }
    100         src = reinterpret_cast<const SkPMColor*>(s);
    101         dst = reinterpret_cast<SkPMColor*>(d);
    102     }
    103 
    104     while (count > 0) {
    105         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
    106         src++;
    107         dst++;
    108         count--;
    109     }
    110 }
    111 
    112 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    113                                 const SkPMColor* SK_RESTRICT src,
    114                                 int count, U8CPU alpha) {
    115     SkASSERT(alpha == 255);
    116     if (count <= 0) {
    117         return;
    118     }
    119 
    120     if (count >= 4) {
    121         SkASSERT(((size_t)dst & 0x03) == 0);
    122         while (((size_t)dst & 0x0F) != 0) {
    123             *dst = SkPMSrcOver(*src, *dst);
    124             src++;
    125             dst++;
    126             count--;
    127         }
    128 
    129         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    130         __m128i *d = reinterpret_cast<__m128i*>(dst);
    131 #ifdef SK_USE_ACCURATE_BLENDING
    132         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    133         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
    134         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
    135         while (count >= 4) {
    136             // Load 4 pixels
    137             __m128i src_pixel = _mm_loadu_si128(s);
    138             __m128i dst_pixel = _mm_load_si128(d);
    139 
    140             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    141             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    142             // Shift alphas down to lower 8 bits of each quad.
    143             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
    144 
    145             // Copy alpha to upper 3rd byte of each quad
    146             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
    147 
    148             // Subtract alphas from 255, to get 0..255
    149             alpha = _mm_sub_epi16(c_255, alpha);
    150 
    151             // Multiply by red and blue by src alpha.
    152             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    153             // Multiply by alpha and green by src alpha.
    154             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    155 
    156             // dst_rb_low = (dst_rb >> 8)
    157             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
    158             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
    159 
    160             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
    161             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
    162             dst_rb = _mm_add_epi16(dst_rb, c_128);
    163             dst_rb = _mm_srli_epi16(dst_rb, 8);
    164 
    165             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
    166             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
    167             dst_ag = _mm_add_epi16(dst_ag, c_128);
    168             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    169 
    170             // Combine back into RGBA.
    171             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    172 
    173             // Add result
    174             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    175             _mm_store_si128(d, result);
    176             s++;
    177             d++;
    178             count -= 4;
    179         }
    180     #else
    181         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    182         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
    183         while (count >= 4) {
    184             // Load 4 pixels
    185             __m128i src_pixel = _mm_loadu_si128(s);
    186             __m128i dst_pixel = _mm_load_si128(d);
    187 
    188             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    189             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    190 
    191             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
    192             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
    193 
    194             // (a0, a0, a1, a1, a2, g2, a3, g3)
    195             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
    196 
    197             // (a0, a0, a1, a1, a2, a2, a3, a3)
    198             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
    199 
    200             // Subtract alphas from 256, to get 1..256
    201             alpha = _mm_sub_epi16(c_256, alpha);
    202 
    203             // Multiply by red and blue by src alpha.
    204             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    205             // Multiply by alpha and green by src alpha.
    206             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    207 
    208             // Divide by 256.
    209             dst_rb = _mm_srli_epi16(dst_rb, 8);
    210 
    211             // Mask out high bits (already in the right place)
    212             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    213 
    214             // Combine back into RGBA.
    215             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    216 
    217             // Add result
    218             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    219             _mm_store_si128(d, result);
    220             s++;
    221             d++;
    222             count -= 4;
    223         }
    224 #endif
    225         src = reinterpret_cast<const SkPMColor*>(s);
    226         dst = reinterpret_cast<SkPMColor*>(d);
    227     }
    228 
    229     while (count > 0) {
    230         *dst = SkPMSrcOver(*src, *dst);
    231         src++;
    232         dst++;
    233         count--;
    234     }
    235 }
    236 
    237 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    238                                const SkPMColor* SK_RESTRICT src,
    239                                int count, U8CPU alpha) {
    240     SkASSERT(alpha <= 255);
    241     if (count <= 0) {
    242         return;
    243     }
    244 
    245     if (count >= 4) {
    246         while (((size_t)dst & 0x0F) != 0) {
    247             *dst = SkBlendARGB32(*src, *dst, alpha);
    248             src++;
    249             dst++;
    250             count--;
    251         }
    252 
    253         uint32_t src_scale = SkAlpha255To256(alpha);
    254 
    255         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    256         __m128i *d = reinterpret_cast<__m128i*>(dst);
    257         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
    258         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    259         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
    260         while (count >= 4) {
    261             // Load 4 pixels each of src and dest.
    262             __m128i src_pixel = _mm_loadu_si128(s);
    263             __m128i dst_pixel = _mm_load_si128(d);
    264 
    265             // Get red and blue pixels into lower byte of each word.
    266             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    267             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    268 
    269             // Get alpha and green into lower byte of each word.
    270             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    271             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    272 
    273             // Put per-pixel alpha in low byte of each word.
    274             // After the following two statements, the dst_alpha looks like
    275             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
    276             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
    277             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
    278 
    279             // dst_alpha = dst_alpha * src_scale
    280             // Because src_scales are in the higher byte of each word and
    281             // we use mulhi here, the resulting alpha values are already
    282             // in the right place and don't need to be divided by 256.
    283             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
    284             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
    285 
    286             // Subtract alphas from 256, to get 1..256
    287             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
    288 
    289             // Multiply red and blue by dst pixel alpha.
    290             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
    291             // Multiply alpha and green by dst pixel alpha.
    292             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
    293 
    294             // Multiply red and blue by global alpha.
    295             // (4 x (0, rs.h, 0, bs.h))
    296             // where rs.h stands for the higher byte of r * src_scale,
    297             // and bs.h the higher byte of b * src_scale.
    298             // Again, because we use mulhi, the resuling red and blue
    299             // values are already in the right place and don't need to
    300             // be divided by 256.
    301             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
    302             // Multiply alpha and green by global alpha.
    303             // (4 x (0, as.h, 0, gs.h))
    304             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
    305 
    306             // Divide by 256.
    307             dst_rb = _mm_srli_epi16(dst_rb, 8);
    308 
    309             // Mask out low bits (goodies already in the right place; no need to divide)
    310             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    311             // Shift alpha and green to higher byte of each word.
    312             // (4 x (as.h, 0, gs.h, 0))
    313             src_ag = _mm_slli_epi16(src_ag, 8);
    314 
    315             // Combine back into RGBA.
    316             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    317             src_pixel = _mm_or_si128(src_rb, src_ag);
    318 
    319             // Add two pixels into result.
    320             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    321             _mm_store_si128(d, result);
    322             s++;
    323             d++;
    324             count -= 4;
    325         }
    326         src = reinterpret_cast<const SkPMColor*>(s);
    327         dst = reinterpret_cast<SkPMColor*>(d);
    328     }
    329 
    330     while (count > 0) {
    331         *dst = SkBlendARGB32(*src, *dst, alpha);
    332         src++;
    333         dst++;
    334         count--;
    335     }
    336 }
    337 
    338 /* SSE2 version of Color32()
    339  * portable version is in core/SkBlitRow_D32.cpp
    340  */
    341 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
    342                   SkPMColor color) {
    343 
    344     if (count <= 0) {
    345         return;
    346     }
    347 
    348     if (0 == color) {
    349         if (src != dst) {
    350             memcpy(dst, src, count * sizeof(SkPMColor));
    351         }
    352         return;
    353     }
    354 
    355     unsigned colorA = SkGetPackedA32(color);
    356     if (255 == colorA) {
    357         sk_memset32(dst, color, count);
    358     } else {
    359         unsigned scale = 256 - SkAlpha255To256(colorA);
    360 
    361         if (count >= 4) {
    362             SkASSERT(((size_t)dst & 0x03) == 0);
    363             while (((size_t)dst & 0x0F) != 0) {
    364                 *dst = color + SkAlphaMulQ(*src, scale);
    365                 src++;
    366                 dst++;
    367                 count--;
    368             }
    369 
    370             const __m128i *s = reinterpret_cast<const __m128i*>(src);
    371             __m128i *d = reinterpret_cast<__m128i*>(dst);
    372             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    373             __m128i src_scale_wide = _mm_set1_epi16(scale);
    374             __m128i color_wide = _mm_set1_epi32(color);
    375             while (count >= 4) {
    376                 // Load 4 pixels each of src and dest.
    377                 __m128i src_pixel = _mm_loadu_si128(s);
    378 
    379                 // Get red and blue pixels into lower byte of each word.
    380                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    381 
    382                 // Get alpha and green into lower byte of each word.
    383                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    384 
    385                 // Multiply by scale.
    386                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    387                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    388 
    389                 // Divide by 256.
    390                 src_rb = _mm_srli_epi16(src_rb, 8);
    391                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
    392 
    393                 // Combine back into RGBA.
    394                 src_pixel = _mm_or_si128(src_rb, src_ag);
    395 
    396                 // Add color to result.
    397                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
    398 
    399                 // Store result.
    400                 _mm_store_si128(d, result);
    401                 s++;
    402                 d++;
    403                 count -= 4;
    404             }
    405             src = reinterpret_cast<const SkPMColor*>(s);
    406             dst = reinterpret_cast<SkPMColor*>(d);
    407          }
    408 
    409         while (count > 0) {
    410             *dst = color + SkAlphaMulQ(*src, scale);
    411             src += 1;
    412             dst += 1;
    413             count--;
    414         }
    415     }
    416 }
    417 
    418 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
    419                                size_t maskRB, SkColor origColor,
    420                                int width, int height) {
    421     SkPMColor color = SkPreMultiplyColor(origColor);
    422     size_t dstOffset = dstRB - (width << 2);
    423     size_t maskOffset = maskRB - width;
    424     SkPMColor* dst = (SkPMColor *)device;
    425     const uint8_t* mask = (const uint8_t*)maskPtr;
    426     do {
    427         int count = width;
    428         if (count >= 4) {
    429             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
    430                 *dst = SkBlendARGB32(color, *dst, *mask);
    431                 mask++;
    432                 dst++;
    433                 count--;
    434             }
    435             __m128i *d = reinterpret_cast<__m128i*>(dst);
    436             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    437             __m128i c_256 = _mm_set1_epi16(256);
    438             __m128i c_1 = _mm_set1_epi16(1);
    439             __m128i src_pixel = _mm_set1_epi32(color);
    440             while (count >= 4) {
    441                 // Load 4 pixels each of src and dest.
    442                 __m128i dst_pixel = _mm_load_si128(d);
    443 
    444                 //set the aphla value
    445                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
    446                                 0, *(mask+3),0, \
    447                                 *(mask+2),0, *(mask+2),\
    448                                 0,*(mask+1), 0,*(mask+1),\
    449                                 0, *mask,0,*mask);
    450 
    451                 //call SkAlpha255To256()
    452                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
    453 
    454                 // Get red and blue pixels into lower byte of each word.
    455                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    456                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    457 
    458                 // Get alpha and green into lower byte of each word.
    459                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    460                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    461 
    462                 // Put per-pixel alpha in low byte of each word.
    463                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
    464                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
    465 
    466                 // dst_alpha = dst_alpha * src_scale
    467                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
    468 
    469                 // Divide by 256.
    470                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
    471 
    472                 // Subtract alphas from 256, to get 1..256
    473                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
    474                 // Multiply red and blue by dst pixel alpha.
    475                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
    476                 // Multiply alpha and green by dst pixel alpha.
    477                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
    478 
    479                 // Multiply red and blue by global alpha.
    480                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    481                 // Multiply alpha and green by global alpha.
    482                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    483                 // Divide by 256.
    484                 dst_rb = _mm_srli_epi16(dst_rb, 8);
    485                 src_rb = _mm_srli_epi16(src_rb, 8);
    486 
    487                 // Mask out low bits (goodies already in the right place; no need to divide)
    488                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    489                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
    490 
    491                 // Combine back into RGBA.
    492                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    493                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
    494 
    495                 // Add two pixels into result.
    496                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
    497                 _mm_store_si128(d, result);
    498                 // load the next 4 pixel
    499                 mask = mask + 4;
    500                 d++;
    501                 count -= 4;
    502             }
    503             dst = reinterpret_cast<SkPMColor *>(d);
    504         }
    505         while(count > 0) {
    506             *dst= SkBlendARGB32(color, *dst, *mask);
    507             dst += 1;
    508             mask++;
    509             count --;
    510         }
    511         dst = (SkPMColor *)((char*)dst + dstOffset);
    512         mask += maskOffset;
    513     } while (--height != 0);
    514 }
    515 
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
// Each value is a signed compile-time constant: positive means the component
// has to move left, negative means it has to move right, and zero means it
// is already in place.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// SkPacked{R,G,B}16x5ToUnmasked{R,G,B}32x5_SSE2(x) apply the shift above to
// every 32-bit lane of the __m128i x. The shift direction is selected by the
// preprocessor because the intrinsics take a non-negative count. As the
// "Unmasked" in the name says, the result still contains the other
// components; callers are expected to mask it.

// Red.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

// Green.
#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

// Blue.
#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
    546 
    547 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
    548                                  __m128i &mask, __m128i &srcA) {
    549     // In the following comments, the components of src, dst and mask are
    550     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    551     // by an R, G, B, or A suffix. Components of one of the four pixels that
    552     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    553     // example is the blue channel of the second destination pixel. Memory
    554     // layout is shown for an ARGB byte order in a color value.
    555 
    556     // src and srcA store 8-bit values interleaved with zeros.
    557     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    558     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    559     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    560     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    561     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    562     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    563     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    564 
    565     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    566     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    567     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    568                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    569 
    570     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    571     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    572                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    573 
    574     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    575     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    576                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    577 
    578     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    579     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    580     // 8-bit position
    581     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    582     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    583     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    584 
    585     // Interleave R,G,B into the lower byte of word.
    586     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    587     // 16-bit values, padded by zero.
    588     __m128i maskLo, maskHi;
    589     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    590     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    591     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    592     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    593 
    594     // Upscale from 0..31 to 0..32
    595     // (allows to replace division by left-shift further down)
    596     // Left-shift each component by 4 and add the result back to that component,
    597     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    598     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    599     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    600 
    601     // Multiply each component of maskLo and maskHi by srcA
    602     maskLo = _mm_mullo_epi16(maskLo, srcA);
    603     maskHi = _mm_mullo_epi16(maskHi, srcA);
    604 
    605     // Left shift mask components by 8 (divide by 256)
    606     maskLo = _mm_srli_epi16(maskLo, 8);
    607     maskHi = _mm_srli_epi16(maskHi, 8);
    608 
    609     // Interleave R,G,B into the lower byte of the word
    610     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    611     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    612     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    613     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    614 
    615     // mask = (src - dst) * mask
    616     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    617     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    618 
    619     // mask = (src - dst) * mask >> 5
    620     maskLo = _mm_srai_epi16(maskLo, 5);
    621     maskHi = _mm_srai_epi16(maskHi, 5);
    622 
    623     // Add two pixels into result.
    624     // result = dst + ((src - dst) * mask >> 5)
    625     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    626     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    627 
    628     // Pack into 4 32bit dst pixels.
    629     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    630     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    631     // clamping to 255 if necessary.
    632     return _mm_packus_epi16(resultLo, resultHi);
    633 }
    634 
    635 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
    636                                        __m128i &mask) {
    637     // In the following comments, the components of src, dst and mask are
    638     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    639     // by an R, G, B, or A suffix. Components of one of the four pixels that
    640     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    641     // example is the blue channel of the second destination pixel. Memory
    642     // layout is shown for an ARGB byte order in a color value.
    643 
    644     // src and srcA store 8-bit values interleaved with zeros.
    645     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    646     // mask stores 16-bit values (shown as high and low bytes) interleaved with
    647     // zeros
    648     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    649     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    650 
    651     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    652     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    653     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    654                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    655 
    656     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    657     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    658                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    659 
    660     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    661     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    662                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    663 
    664     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    665     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    666     // 8-bit position
    667     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    668     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    669     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    670 
    671     // Interleave R,G,B into the lower byte of word.
    672     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    673     // 16-bit values, padded by zero.
    674     __m128i maskLo, maskHi;
    675     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    676     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    677     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    678     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    679 
    680     // Upscale from 0..31 to 0..32
    681     // (allows to replace division by left-shift further down)
    682     // Left-shift each component by 4 and add the result back to that component,
    683     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    684     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    685     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    686 
    687     // Interleave R,G,B into the lower byte of the word
    688     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    689     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    690     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    691     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    692 
    693     // mask = (src - dst) * mask
    694     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    695     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    696 
    697     // mask = (src - dst) * mask >> 5
    698     maskLo = _mm_srai_epi16(maskLo, 5);
    699     maskHi = _mm_srai_epi16(maskHi, 5);
    700 
    701     // Add two pixels into result.
    702     // result = dst + ((src - dst) * mask >> 5)
    703     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    704     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    705 
    706     // Pack into 4 32bit dst pixels and force opaque.
    707     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    708     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    709     // clamping to 255 if necessary. Set alpha components to 0xFF.
    710     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
    711                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
    712 }
    713 
    714 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
    715                          SkColor src, int width, SkPMColor) {
    716     if (width <= 0) {
    717         return;
    718     }
    719 
    720     int srcA = SkColorGetA(src);
    721     int srcR = SkColorGetR(src);
    722     int srcG = SkColorGetG(src);
    723     int srcB = SkColorGetB(src);
    724 
    725     srcA = SkAlpha255To256(srcA);
    726 
    727     if (width >= 4) {
    728         SkASSERT(((size_t)dst & 0x03) == 0);
    729         while (((size_t)dst & 0x0F) != 0) {
    730             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    731             mask++;
    732             dst++;
    733             width--;
    734         }
    735 
    736         __m128i *d = reinterpret_cast<__m128i*>(dst);
    737         // Set alpha to 0xFF and replicate source four times in SSE register.
    738         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    739         // Interleave with zeros to get two sets of four 16-bit values.
    740         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    741         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    742         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    743         __m128i srcA_sse = _mm_set1_epi16(srcA);
    744         while (width >= 4) {
    745             // Load four destination pixels into dst_sse.
    746             __m128i dst_sse = _mm_load_si128(d);
    747             // Load four 16-bit masks into lower half of mask_sse.
    748             __m128i mask_sse = _mm_loadl_epi64(
    749                                    reinterpret_cast<const __m128i*>(mask));
    750 
    751             // Check whether masks are equal to 0 and get the highest bit
    752             // of each byte of result, if masks are all zero, we will get
    753             // pack_cmp to 0xFFFF
    754             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    755                                              _mm_setzero_si128()));
    756 
    757             // if mask pixels are not all zero, we will blend the dst pixels
    758             if (pack_cmp != 0xFFFF) {
    759                 // Unpack 4 16bit mask pixels to
    760                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    761                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    762                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    763                                               _mm_setzero_si128());
    764 
    765                 // Process 4 32bit dst pixels
    766                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
    767                                                    mask_sse, srcA_sse);
    768                 _mm_store_si128(d, result);
    769             }
    770 
    771             d++;
    772             mask += 4;
    773             width -= 4;
    774         }
    775 
    776         dst = reinterpret_cast<SkPMColor*>(d);
    777     }
    778 
    779     while (width > 0) {
    780         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    781         mask++;
    782         dst++;
    783         width--;
    784     }
    785 }
    786 
    787 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
    788                                SkColor src, int width, SkPMColor opaqueDst) {
    789     if (width <= 0) {
    790         return;
    791     }
    792 
    793     int srcR = SkColorGetR(src);
    794     int srcG = SkColorGetG(src);
    795     int srcB = SkColorGetB(src);
    796 
    797     if (width >= 4) {
    798         SkASSERT(((size_t)dst & 0x03) == 0);
    799         while (((size_t)dst & 0x0F) != 0) {
    800             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    801             mask++;
    802             dst++;
    803             width--;
    804         }
    805 
    806         __m128i *d = reinterpret_cast<__m128i*>(dst);
    807         // Set alpha to 0xFF and replicate source four times in SSE register.
    808         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    809         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    810         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    811         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    812         while (width >= 4) {
    813             // Load four destination pixels into dst_sse.
    814             __m128i dst_sse = _mm_load_si128(d);
    815             // Load four 16-bit masks into lower half of mask_sse.
    816             __m128i mask_sse = _mm_loadl_epi64(
    817                                    reinterpret_cast<const __m128i*>(mask));
    818 
    819             // Check whether masks are equal to 0 and get the highest bit
    820             // of each byte of result, if masks are all zero, we will get
    821             // pack_cmp to 0xFFFF
    822             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    823                                              _mm_setzero_si128()));
    824 
    825             // if mask pixels are not all zero, we will blend the dst pixels
    826             if (pack_cmp != 0xFFFF) {
    827                 // Unpack 4 16bit mask pixels to
    828                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    829                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    830                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    831                                               _mm_setzero_si128());
    832 
    833                 // Process 4 32bit dst pixels
    834                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
    835                                                          mask_sse);
    836                 _mm_store_si128(d, result);
    837             }
    838 
    839             d++;
    840             mask += 4;
    841             width -= 4;
    842         }
    843 
    844         dst = reinterpret_cast<SkPMColor*>(d);
    845     }
    846 
    847     while (width > 0) {
    848         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    849         mask++;
    850         dst++;
    851         width--;
    852     }
    853 }
    854