Home | History | Annotate | Download | only in core
      1 /*
      2  * Copyright 2011 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "Sk4px.h"
      9 #include "SkBlitRow.h"
     10 #include "SkColorData.h"
     11 #include "SkOpts.h"
     12 #include "SkUtils.h"
     13 
     14 // Everyone agrees memcpy() is the best way to do this.
     15 static void blit_row_s32_opaque(SkPMColor* dst,
     16                                 const SkPMColor* src,
     17                                 int count,
     18                                 U8CPU alpha) {
     19     SkASSERT(255 == alpha);
     20     memcpy(dst, src, count * sizeof(SkPMColor));
     21 }
     22 
     23 // We have SSE2, NEON, and portable implementations of
     24 // blit_row_s32_blend() and blit_row_s32a_blend().
     25 
     26 // TODO(mtklein): can we do better in NEON than 2 pixels at a time?
     27 
     28 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
     29     #include <emmintrin.h>
     30 
     31     static inline __m128i SkPMLerp_SSE2(const __m128i& src,
     32                                         const __m128i& dst,
     33                                         const unsigned src_scale) {
     34         // Computes dst + (((src - dst)*src_scale)>>8)
     35         const __m128i mask = _mm_set1_epi32(0x00FF00FF);
     36 
     37         // Unpack the 16x8-bit source into 2 8x16-bit splayed halves.
     38         __m128i src_rb = _mm_and_si128(mask, src);
     39         __m128i src_ag = _mm_srli_epi16(src, 8);
     40         __m128i dst_rb = _mm_and_si128(mask, dst);
     41         __m128i dst_ag = _mm_srli_epi16(dst, 8);
     42 
     43         // Compute scaled differences.
     44         __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
     45         __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
     46         __m128i s = _mm_set1_epi16(src_scale);
     47         diff_rb = _mm_mullo_epi16(diff_rb, s);
     48         diff_ag = _mm_mullo_epi16(diff_ag, s);
     49 
     50         // Pack the differences back together.
     51         diff_rb = _mm_srli_epi16(diff_rb, 8);
     52         diff_ag = _mm_andnot_si128(mask, diff_ag);
     53         __m128i diff = _mm_or_si128(diff_rb, diff_ag);
     54 
     55         // Add difference to destination.
     56         return _mm_add_epi8(dst, diff);
     57     }
     58 
     59 
     60     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
     61         SkASSERT(alpha <= 255);
     62 
     63         auto src4 = (const __m128i*)src;
     64         auto dst4 = (      __m128i*)dst;
     65 
     66         while (count >= 4) {
     67             _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
     68                                                  _mm_loadu_si128(dst4),
     69                                                  SkAlpha255To256(alpha)));
     70             src4++;
     71             dst4++;
     72             count -= 4;
     73         }
     74 
     75         src = (const SkPMColor*)src4;
     76         dst = (      SkPMColor*)dst4;
     77 
     78         while (count --> 0) {
     79             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
     80             src++;
     81             dst++;
     82         }
     83     }
     84 
     85     static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
     86                                              const __m128i& dst,
     87                                              const unsigned aa) {
     88         unsigned alpha = SkAlpha255To256(aa);
     89         __m128i src_scale = _mm_set1_epi16(alpha);
     90         // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
     91         __m128i dst_scale = _mm_srli_epi32(src, 24);
     92         // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
     93         dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
     94         dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
     95         dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
     96         dst_scale = _mm_srli_epi32(dst_scale, 8);
     97         // Duplicate scales into 2x16-bit pattern per pixel.
     98         dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
     99         dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
    100 
    101         const __m128i mask = _mm_set1_epi32(0x00FF00FF);
    102 
    103         // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
    104         __m128i src_rb = _mm_and_si128(mask, src);
    105         __m128i src_ag = _mm_srli_epi16(src, 8);
    106         __m128i dst_rb = _mm_and_si128(mask, dst);
    107         __m128i dst_ag = _mm_srli_epi16(dst, 8);
    108 
    109         // Scale them.
    110         src_rb = _mm_mullo_epi16(src_rb, src_scale);
    111         src_ag = _mm_mullo_epi16(src_ag, src_scale);
    112         dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
    113         dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
    114 
    115         // Add the scaled source and destination.
    116         dst_rb = _mm_add_epi16(src_rb, dst_rb);
    117         dst_ag = _mm_add_epi16(src_ag, dst_ag);
    118 
    119         // Unsplay the halves back together.
    120         dst_rb = _mm_srli_epi16(dst_rb, 8);
    121         dst_ag = _mm_andnot_si128(mask, dst_ag);
    122         return _mm_or_si128(dst_rb, dst_ag);
    123     }
    124 
    125     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    126         SkASSERT(alpha <= 255);
    127 
    128         auto src4 = (const __m128i*)src;
    129         auto dst4 = (      __m128i*)dst;
    130 
    131         while (count >= 4) {
    132             _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
    133                                                       _mm_loadu_si128(dst4),
    134                                                       alpha));
    135             src4++;
    136             dst4++;
    137             count -= 4;
    138         }
    139 
    140         src = (const SkPMColor*)src4;
    141         dst = (      SkPMColor*)dst4;
    142 
    143         while (count --> 0) {
    144             *dst = SkBlendARGB32(*src, *dst, alpha);
    145             src++;
    146             dst++;
    147         }
    148     }
    149 
    150 #elif defined(SK_ARM_HAS_NEON)
    151     #include <arm_neon.h>
    152 
    153     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    154         SkASSERT(alpha <= 255);
    155 
    156         uint16_t src_scale = SkAlpha255To256(alpha);
    157         uint16_t dst_scale = 256 - src_scale;
    158 
    159         while (count >= 2) {
    160             uint8x8_t vsrc, vdst, vres;
    161             uint16x8_t vsrc_wide, vdst_wide;
    162 
    163             vsrc = vreinterpret_u8_u32(vld1_u32(src));
    164             vdst = vreinterpret_u8_u32(vld1_u32(dst));
    165 
    166             vsrc_wide = vmovl_u8(vsrc);
    167             vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
    168 
    169             vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
    170 
    171             vdst_wide += vsrc_wide;
    172             vres = vshrn_n_u16(vdst_wide, 8);
    173 
    174             vst1_u32(dst, vreinterpret_u32_u8(vres));
    175 
    176             src += 2;
    177             dst += 2;
    178             count -= 2;
    179         }
    180 
    181         if (count == 1) {
    182             uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
    183             uint16x8_t vsrc_wide, vdst_wide;
    184 
    185             vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
    186             vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
    187 
    188             vsrc_wide = vmovl_u8(vsrc);
    189             vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
    190             vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
    191             vdst_wide += vsrc_wide;
    192             vres = vshrn_n_u16(vdst_wide, 8);
    193 
    194             vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    195         }
    196     }
    197 
    198     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    199         SkASSERT(alpha < 255);
    200 
    201         unsigned alpha256 = SkAlpha255To256(alpha);
    202 
    203         if (count & 1) {
    204             uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
    205             uint16x8_t vdst_wide, vsrc_wide;
    206             unsigned dst_scale;
    207 
    208             vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
    209             vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
    210 
    211             dst_scale = vget_lane_u8(vsrc, 3);
    212             dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
    213 
    214             vsrc_wide = vmovl_u8(vsrc);
    215             vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
    216 
    217             vdst_wide = vmovl_u8(vdst);
    218             vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
    219 
    220             vdst_wide += vsrc_wide;
    221             vres = vshrn_n_u16(vdst_wide, 8);
    222 
    223             vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    224             dst++;
    225             src++;
    226             count--;
    227         }
    228 
    229         uint8x8_t alpha_mask;
    230         static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    231         alpha_mask = vld1_u8(alpha_mask_setup);
    232 
    233         while (count) {
    234 
    235             uint8x8_t vsrc, vdst, vres, vsrc_alphas;
    236             uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
    237 
    238             __builtin_prefetch(src+32);
    239             __builtin_prefetch(dst+32);
    240 
    241             vsrc = vreinterpret_u8_u32(vld1_u32(src));
    242             vdst = vreinterpret_u8_u32(vld1_u32(dst));
    243 
    244             vsrc_scale = vdupq_n_u16(alpha256);
    245 
    246             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
    247             vdst_scale = vmovl_u8(vsrc_alphas);
    248             // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
    249             // A 16-bit lane would overflow if we used 0xFFFF here,
    250             // so use an approximation with 0xFF00 that is off by 1,
    251             // and add back 1 after to get the correct value.
    252             // This is valid if alpha256 <= 255.
    253             vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
    254             vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
    255             vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
    256 
    257             vsrc_wide = vmovl_u8(vsrc);
    258             vsrc_wide *= vsrc_scale;
    259 
    260             vdst_wide = vmovl_u8(vdst);
    261             vdst_wide *= vdst_scale;
    262 
    263             vdst_wide += vsrc_wide;
    264             vres = vshrn_n_u16(vdst_wide, 8);
    265 
    266             vst1_u32(dst, vreinterpret_u32_u8(vres));
    267 
    268             src += 2;
    269             dst += 2;
    270             count -= 2;
    271         }
    272     }
    273 
    274 #else
    275     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    276         SkASSERT(alpha <= 255);
    277         while (count --> 0) {
    278             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
    279             src++;
    280             dst++;
    281         }
    282     }
    283 
    284     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    285         SkASSERT(alpha <= 255);
    286         while (count --> 0) {
    287             *dst = SkBlendARGB32(*src, *dst, alpha);
    288             src++;
    289             dst++;
    290         }
    291     }
    292 #endif
    293 
    294 SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
    295     static const SkBlitRow::Proc32 kProcs[] = {
    296         blit_row_s32_opaque,
    297         blit_row_s32_blend,
    298         nullptr,  // blit_row_s32a_opaque is in SkOpts
    299         blit_row_s32a_blend
    300     };
    301 
    302     SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
    303     flags &= SK_ARRAY_COUNT(kProcs) - 1;  // just to be safe
    304 
    305     return flags == 2 ? SkOpts::blit_row_s32a_opaque
    306                       : kProcs[flags];
    307 }
    308 
    309 void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
    310     switch (SkGetPackedA32(color)) {
    311         case   0: memmove(dst, src, count * sizeof(SkPMColor)); return;
    312         case 255: sk_memset32(dst, color, count);               return;
    313     }
    314 
    315     unsigned invA = 255 - SkGetPackedA32(color);
    316     invA += invA >> 7;
    317     SkASSERT(invA < 256);  // We've should have already handled alpha == 0 externally.
    318 
    319     Sk16h colorHighAndRound = (Sk4px::DupPMColor(color).widen() << 8) + Sk16h(128);
    320     Sk16b invA_16x(invA);
    321 
    322     Sk4px::MapSrc(count, dst, src, [&](const Sk4px& src4) -> Sk4px {
    323         return (src4 * invA_16x).addNarrowHi(colorHighAndRound);
    324     });
    325 }
    326