Home | History | Annotate | Download | only in opts
      1 /*
      2  **
      3  ** Copyright 2009, The Android Open Source Project
      4  **
      5  ** Licensed under the Apache License, Version 2.0 (the "License");
      6  ** you may not use this file except in compliance with the License.
      7  ** You may obtain a copy of the License at
      8  **
      9  **     http://www.apache.org/licenses/LICENSE-2.0
     10  **
     11  ** Unless required by applicable law or agreed to in writing, software
     12  ** distributed under the License is distributed on an "AS IS" BASIS,
     13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  ** See the License for the specific language governing permissions and
     15  ** limitations under the License.
     16  */
     17 
     18 #include "SkBlitRow_opts_SSE2.h"
     19 #include "SkColorPriv.h"
     20 #include "SkUtils.h"
     21 
     22 #include <emmintrin.h>
     23 
     24 /* SSE2 version of S32_Blend_BlitRow32()
     25  * portable version is in core/SkBlitRow_D32.cpp
     26  */
     27 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     28                               const SkPMColor* SK_RESTRICT src,
     29                               int count, U8CPU alpha) {
     30     SkASSERT(alpha <= 255);
     31     if (count <= 0) {
     32         return;
     33     }
     34 
     35     uint32_t src_scale = SkAlpha255To256(alpha);
     36     uint32_t dst_scale = 256 - src_scale;
     37 
     38     if (count >= 4) {
     39         SkASSERT(((size_t)dst & 0x03) == 0);
     40         while (((size_t)dst & 0x0F) != 0) {
     41             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     42             src++;
     43             dst++;
     44             count--;
     45         }
     46 
     47         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     48         __m128i *d = reinterpret_cast<__m128i*>(dst);
     49         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
     50         __m128i src_scale_wide = _mm_set1_epi16(src_scale);
     51         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
     52         while (count >= 4) {
     53             // Load 4 pixels each of src and dest.
     54             __m128i src_pixel = _mm_loadu_si128(s);
     55             __m128i dst_pixel = _mm_load_si128(d);
     56 
     57             // Get red and blue pixels into lower byte of each word.
     58             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
     59             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
     60 
     61             // Get alpha and green into lower byte of each word.
     62             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
     63             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
     64 
     65             // Multiply by scale.
     66             src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
     67             src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
     68             dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
     69             dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
     70 
     71             // Divide by 256.
     72             src_rb = _mm_srli_epi16(src_rb, 8);
     73             dst_rb = _mm_srli_epi16(dst_rb, 8);
     74             src_ag = _mm_andnot_si128(rb_mask, src_ag);
     75             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
     76 
     77             // Combine back into RGBA.
     78             src_pixel = _mm_or_si128(src_rb, src_ag);
     79             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
     80 
     81             // Add result
     82             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
     83             _mm_store_si128(d, result);
     84             s++;
     85             d++;
     86             count -= 4;
     87         }
     88         src = reinterpret_cast<const SkPMColor*>(s);
     89         dst = reinterpret_cast<SkPMColor*>(d);
     90     }
     91 
     92     while (count > 0) {
     93         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
     94         src++;
     95         dst++;
     96         count--;
     97     }
     98 }
     99 
    100 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    101                                 const SkPMColor* SK_RESTRICT src,
    102                                 int count, U8CPU alpha) {
    103     SkASSERT(alpha == 255);
    104     if (count <= 0) {
    105         return;
    106     }
    107 
    108     if (count >= 4) {
    109         SkASSERT(((size_t)dst & 0x03) == 0);
    110         while (((size_t)dst & 0x0F) != 0) {
    111             *dst = SkPMSrcOver(*src, *dst);
    112             src++;
    113             dst++;
    114             count--;
    115         }
    116 
    117         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    118         __m128i *d = reinterpret_cast<__m128i*>(dst);
    119 #ifdef SK_USE_ACCURATE_BLENDING
    120         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    121         __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
    122         __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
    123         while (count >= 4) {
    124             // Load 4 pixels
    125             __m128i src_pixel = _mm_loadu_si128(s);
    126             __m128i dst_pixel = _mm_load_si128(d);
    127 
    128             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    129             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    130             // Shift alphas down to lower 8 bits of each quad.
    131             __m128i alpha = _mm_srli_epi32(src_pixel, 24);
    132 
    133             // Copy alpha to upper 3rd byte of each quad
    134             alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
    135 
    136             // Subtract alphas from 255, to get 0..255
    137             alpha = _mm_sub_epi16(c_255, alpha);
    138 
    139             // Multiply by red and blue by src alpha.
    140             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    141             // Multiply by alpha and green by src alpha.
    142             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    143 
    144             // dst_rb_low = (dst_rb >> 8)
    145             __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
    146             __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
    147 
    148             // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
    149             dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
    150             dst_rb = _mm_add_epi16(dst_rb, c_128);
    151             dst_rb = _mm_srli_epi16(dst_rb, 8);
    152 
    153             // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
    154             dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
    155             dst_ag = _mm_add_epi16(dst_ag, c_128);
    156             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    157 
    158             // Combine back into RGBA.
    159             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    160 
    161             // Add result
    162             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    163             _mm_store_si128(d, result);
    164             s++;
    165             d++;
    166             count -= 4;
    167         }
    168     #else
    169         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    170         __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
    171         while (count >= 4) {
    172             // Load 4 pixels
    173             __m128i src_pixel = _mm_loadu_si128(s);
    174             __m128i dst_pixel = _mm_load_si128(d);
    175 
    176             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    177             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    178 
    179             // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
    180             __m128i alpha = _mm_srli_epi16(src_pixel, 8);
    181 
    182             // (a0, a0, a1, a1, a2, g2, a3, g3)
    183             alpha = _mm_shufflehi_epi16(alpha, 0xF5);
    184 
    185             // (a0, a0, a1, a1, a2, a2, a3, a3)
    186             alpha = _mm_shufflelo_epi16(alpha, 0xF5);
    187 
    188             // Subtract alphas from 256, to get 1..256
    189             alpha = _mm_sub_epi16(c_256, alpha);
    190 
    191             // Multiply by red and blue by src alpha.
    192             dst_rb = _mm_mullo_epi16(dst_rb, alpha);
    193             // Multiply by alpha and green by src alpha.
    194             dst_ag = _mm_mullo_epi16(dst_ag, alpha);
    195 
    196             // Divide by 256.
    197             dst_rb = _mm_srli_epi16(dst_rb, 8);
    198 
    199             // Mask out high bits (already in the right place)
    200             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    201 
    202             // Combine back into RGBA.
    203             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    204 
    205             // Add result
    206             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    207             _mm_store_si128(d, result);
    208             s++;
    209             d++;
    210             count -= 4;
    211         }
    212 #endif
    213         src = reinterpret_cast<const SkPMColor*>(s);
    214         dst = reinterpret_cast<SkPMColor*>(d);
    215     }
    216 
    217     while (count > 0) {
    218         *dst = SkPMSrcOver(*src, *dst);
    219         src++;
    220         dst++;
    221         count--;
    222     }
    223 }
    224 
    225 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
    226                                const SkPMColor* SK_RESTRICT src,
    227                                int count, U8CPU alpha) {
    228     SkASSERT(alpha <= 255);
    229     if (count <= 0) {
    230         return;
    231     }
    232 
    233     if (count >= 4) {
    234         while (((size_t)dst & 0x0F) != 0) {
    235             *dst = SkBlendARGB32(*src, *dst, alpha);
    236             src++;
    237             dst++;
    238             count--;
    239         }
    240 
    241         uint32_t src_scale = SkAlpha255To256(alpha);
    242 
    243         const __m128i *s = reinterpret_cast<const __m128i*>(src);
    244         __m128i *d = reinterpret_cast<__m128i*>(dst);
    245         __m128i src_scale_wide = _mm_set1_epi16(src_scale);
    246         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    247         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
    248         while (count >= 4) {
    249             // Load 4 pixels each of src and dest.
    250             __m128i src_pixel = _mm_loadu_si128(s);
    251             __m128i dst_pixel = _mm_load_si128(d);
    252 
    253             // Get red and blue pixels into lower byte of each word.
    254             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    255             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    256 
    257             // Get alpha and green into lower byte of each word.
    258             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    259             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    260 
    261             // Put per-pixel alpha in low byte of each word.
    262             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
    263             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
    264 
    265             // dst_alpha = dst_alpha * src_scale
    266             dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
    267 
    268             // Divide by 256.
    269             dst_alpha = _mm_srli_epi16(dst_alpha, 8);
    270 
    271             // Subtract alphas from 256, to get 1..256
    272             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
    273 
    274             // Multiply red and blue by dst pixel alpha.
    275             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
    276             // Multiply alpha and green by dst pixel alpha.
    277             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
    278 
    279             // Multiply red and blue by global alpha.
    280             src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    281             // Multiply alpha and green by global alpha.
    282             src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    283 
    284             // Divide by 256.
    285             dst_rb = _mm_srli_epi16(dst_rb, 8);
    286             src_rb = _mm_srli_epi16(src_rb, 8);
    287 
    288             // Mask out low bits (goodies already in the right place; no need to divide)
    289             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    290             src_ag = _mm_andnot_si128(rb_mask, src_ag);
    291 
    292             // Combine back into RGBA.
    293             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    294             src_pixel = _mm_or_si128(src_rb, src_ag);
    295 
    296             // Add two pixels into result.
    297             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
    298             _mm_store_si128(d, result);
    299             s++;
    300             d++;
    301             count -= 4;
    302         }
    303         src = reinterpret_cast<const SkPMColor*>(s);
    304         dst = reinterpret_cast<SkPMColor*>(d);
    305     }
    306 
    307     while (count > 0) {
    308         *dst = SkBlendARGB32(*src, *dst, alpha);
    309         src++;
    310         dst++;
    311         count--;
    312     }
    313 }
    314 
    315 /* SSE2 version of Color32()
    316  * portable version is in core/SkBlitRow_D32.cpp
    317  */
    318 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
    319                   SkPMColor color) {
    320 
    321     if (count <= 0) {
    322         return;
    323     }
    324 
    325     if (0 == color) {
    326         if (src != dst) {
    327             memcpy(dst, src, count * sizeof(SkPMColor));
    328         }
    329     }
    330 
    331     unsigned colorA = SkGetPackedA32(color);
    332     if (255 == colorA) {
    333         sk_memset32(dst, color, count);
    334     } else {
    335         unsigned scale = 256 - SkAlpha255To256(colorA);
    336 
    337         if (count >= 4) {
    338             SkASSERT(((size_t)dst & 0x03) == 0);
    339             while (((size_t)dst & 0x0F) != 0) {
    340                 *dst = color + SkAlphaMulQ(*src, scale);
    341                 src++;
    342                 dst++;
    343                 count--;
    344             }
    345 
    346             const __m128i *s = reinterpret_cast<const __m128i*>(src);
    347             __m128i *d = reinterpret_cast<__m128i*>(dst);
    348             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    349             __m128i src_scale_wide = _mm_set1_epi16(scale);
    350             __m128i color_wide = _mm_set1_epi32(color);
    351             while (count >= 4) {
    352                 // Load 4 pixels each of src and dest.
    353                 __m128i src_pixel = _mm_loadu_si128(s);
    354 
    355                 // Get red and blue pixels into lower byte of each word.
    356                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    357 
    358                 // Get alpha and green into lower byte of each word.
    359                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    360 
    361                 // Multiply by scale.
    362                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    363                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    364 
    365                 // Divide by 256.
    366                 src_rb = _mm_srli_epi16(src_rb, 8);
    367                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
    368 
    369                 // Combine back into RGBA.
    370                 src_pixel = _mm_or_si128(src_rb, src_ag);
    371 
    372                 // Add color to result.
    373                 __m128i result = _mm_add_epi8(color_wide, src_pixel);
    374 
    375                 // Store result.
    376                 _mm_store_si128(d, result);
    377                 s++;
    378                 d++;
    379                 count -= 4;
    380             }
    381             src = reinterpret_cast<const SkPMColor*>(s);
    382             dst = reinterpret_cast<SkPMColor*>(d);
    383          }
    384 
    385         while (count > 0) {
    386             *dst = color + SkAlphaMulQ(*src, scale);
    387             src += 1;
    388             dst += 1;
    389             count--;
    390         }
    391     }
    392 }
    393 
    394 void SkARGB32_BlitMask_SSE2(void* device, size_t dstRB,
    395                             SkBitmap::Config dstConfig, const uint8_t* mask,
    396                             size_t maskRB, SkColor origColor,
    397                             int width, int height)
    398 {
    399     SkPMColor color = SkPreMultiplyColor(origColor);
    400     size_t dstOffset = dstRB - (width << 2);
    401     size_t maskOffset = maskRB - width;
    402     SkPMColor* dst = (SkPMColor *)device;
    403     do {
    404         int count = width;
    405         if (count >= 4) {
    406             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
    407                 *dst = SkBlendARGB32(color, *dst, *mask);
    408                 mask++;
    409                 dst++;
    410                 count--;
    411             }
    412             __m128i *d = reinterpret_cast<__m128i*>(dst);
    413             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
    414             __m128i c_256 = _mm_set1_epi16(256);
    415             __m128i c_1 = _mm_set1_epi16(1);
    416             __m128i src_pixel = _mm_set1_epi32(color);
    417             while (count >= 4) {
    418                 // Load 4 pixels each of src and dest.
    419                 __m128i dst_pixel = _mm_load_si128(d);
    420 
    421                 //set the aphla value
    422                 __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
    423                                 0, *(mask+3),0, \
    424                                 *(mask+2),0, *(mask+2),\
    425                                 0,*(mask+1), 0,*(mask+1),\
    426                                 0, *mask,0,*mask);
    427 
    428                 //call SkAlpha255To256()
    429                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
    430 
    431                 // Get red and blue pixels into lower byte of each word.
    432                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
    433                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
    434 
    435                 // Get alpha and green into lower byte of each word.
    436                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
    437                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
    438 
    439                 // Put per-pixel alpha in low byte of each word.
    440                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
    441                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
    442 
    443                 // dst_alpha = dst_alpha * src_scale
    444                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
    445 
    446                 // Divide by 256.
    447                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
    448 
    449                 // Subtract alphas from 256, to get 1..256
    450                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
    451                 // Multiply red and blue by dst pixel alpha.
    452                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
    453                 // Multiply alpha and green by dst pixel alpha.
    454                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
    455 
    456                 // Multiply red and blue by global alpha.
    457                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
    458                 // Multiply alpha and green by global alpha.
    459                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
    460                 // Divide by 256.
    461                 dst_rb = _mm_srli_epi16(dst_rb, 8);
    462                 src_rb = _mm_srli_epi16(src_rb, 8);
    463 
    464                 // Mask out low bits (goodies already in the right place; no need to divide)
    465                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
    466                 src_ag = _mm_andnot_si128(rb_mask, src_ag);
    467 
    468                 // Combine back into RGBA.
    469                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
    470                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
    471 
    472                 // Add two pixels into result.
    473                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
    474                 _mm_store_si128(d, result);
    475                 // load the next 4 pixel
    476                 mask = mask + 4;
    477                 d++;
    478                 count -= 4;
    479             }
    480             dst = reinterpret_cast<SkPMColor *>(d);
    481         }
    482         while(count > 0) {
    483             *dst= SkBlendARGB32(color, *dst, *mask);
    484             dst += 1;
    485             mask++;
    486             count --;
    487         }
    488         dst = (SkPMColor *)((char*)dst + dstOffset);
    489         mask += maskOffset;
    490     } while (--height != 0);
    491 }
    492