Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2018 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #ifndef SkBitmapProcState_opts_DEFINED
      9 #define SkBitmapProcState_opts_DEFINED
     10 
     11 #include "SkBitmapProcState.h"
     12 
     13 // SkBitmapProcState optimized Shader, Sample, or Matrix procs.
     14 //
     15 // Only S32_alpha_D32_filter_DX exploits instructions beyond
     16 // our common baseline SSE2/NEON instruction sets, so that's
     17 // all that lives here.
     18 //
     19 // The rest are scattershot at the moment but I want to get them
     20 // all migrated to be normal code inside SkBitmapProcState.cpp.
     21 
     22 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
     23     #include <immintrin.h>
     24 #elif defined(SK_ARM_HAS_NEON)
     25     #include <arm_neon.h>
     26 #endif
     27 
     28 namespace SK_OPTS_NS {
     29 
     30 // This same basic packing scheme is used throughout the file.
     31 static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
     32     // The top 14 bits are the integer coordinate x0 or y0.
     33     *v0 = packed >> 18;
     34 
     35     // The bottom 14 bits are the integer coordinate x1 or y1.
     36     *v1 = packed & 0x3fff;
     37 
     38     // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
     39     *w = (packed >> 14) & 0xf;
     40 }
     41 
     42 #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     43 
     44     // As above, 4x.
     45     static void decode_packed_coordinates_and_weight(__m128i packed,
     46                                                      int v0[4], int v1[4], __m128i* w) {
     47         _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));
     48         _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));
     49         *w = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));
     50     }
     51 
     52     // This is the crux of the SSSE3 implementation,
     53     // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
     54     static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
     55                                            uint32_t B0, uint32_t B1,
     56                                            const __m128i& interlaced_x_weights) {
     57         // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
     58         //
     59         // It takes two arguments interlaced byte-wise:
     60         //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
     61         //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
     62         // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
     63         //
     64         // That's why we go to all this trouble to make interlaced_x_weights,
     65         // and here we're interlacing A0 with A1, B0 with B1 to match.
     66 
     67         __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
     68                 interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));
     69 
     70         return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
     71                                  interlaced_x_weights);
     72     }
     73 
     74     // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
     75     // Returns two pixels, with each channel in a 16-bit lane of the __m128i.
     76     static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
     77                                                  uint32_t A2, uint32_t A3,
     78                                                  uint32_t B0, uint32_t B1,
     79                                                  uint32_t B2, uint32_t B3,
     80                                                  const __m128i& interlaced_x_weights,
     81                                                  int wy) {
     82         // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
     83         const __m128i wy1 = _mm_set1_epi16(wy),
     84                       wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);
     85 
     86         // First interpolate in X,
     87         // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
     88         __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
     89                 row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);
     90 
     91         // Interpolate in Y across the two rows,
     92         // then scale everything down by the maximum total weight 16x16 = 256.
     93         return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
     94                                             _mm_mullo_epi16(row1, wy1)), 8);
     95     }
     96 
     97     /*not static*/ inline
     98     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
     99                                  const uint32_t* xy, int count, uint32_t* colors) {
    100         SkASSERT(count > 0 && colors != nullptr);
    101         SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    102         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    103 
    104         int alpha = s.fAlphaScale;
    105 
    106         // Return (px * s.fAlphaScale) / 256.   (s.fAlphaScale is in [0,256].)
    107         auto scale_by_alpha = [alpha](const __m128i& px) {
    108             return alpha == 256 ? px
    109                                 : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
    110         };
    111 
    112         // We're in _DX_ mode here, so we're only varying in X.
    113         // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
    114         // All the other entries in xy will be pairs of X coordinates and the X weight.
    115         int y0, y1, wy;
    116         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
    117 
    118         auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
    119              row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());
    120 
    121         while (count >= 4) {
    122             // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
    123             const __m128i xx = _mm_loadu_si128((const __m128i*)xy);
    124 
    125             int x0[4],
    126                 x1[4];
    127             __m128i wx;
    128             decode_packed_coordinates_and_weight(xx, x0, x1, &wx);
    129 
    130             // Splat out each x weight wx four times (one for each pixel channel) as wx1,
    131             // and sixteen minus that as the weight for x0, wx0.
    132             __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
    133                     wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);
    134 
    135             // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
    136             __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
    137                     interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);
    138 
    139             // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
    140             // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
    141             __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
    142                                                 row1[x0[0]], row1[x1[0]],
    143                                                 row0[x0[1]], row0[x1[1]],
    144                                                 row1[x0[1]], row1[x1[1]],
    145                                                 interlaced_x_weights_AB, wy);
    146 
    147             // Once more with the other half of the x-weights for two more pixels C,D.
    148             __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
    149                                                 row1[x0[2]], row1[x1[2]],
    150                                                 row0[x0[3]], row0[x1[3]],
    151                                                 row1[x0[3]], row1[x1[3]],
    152                                                 interlaced_x_weights_CD, wy);
    153 
    154             // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
    155             _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
    156                                                                 scale_by_alpha(CD)));
    157             xy     += 4;
    158             colors += 4;
    159             count  -= 4;
    160         }
    161 
    162         while (count --> 0) {
    163             // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
    164             int x0, x1, wx;
    165             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
    166 
    167             // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
    168             __m128i wx1 = _mm_set1_epi8(wx),     // This splats it out 16 times, but that's fine.
    169                     wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);
    170 
    171             __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);
    172 
    173             __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
    174                                                row1[x0], row1[x1],
    175                                                       0,        0,
    176                                                       0,        0,
    177                                                interlaced_x_weights_A, wy);
    178 
    179             *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
    180         }
    181     }
    182 
    183 
    184 #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    185 
    186     // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
    187 
    188     /*not static*/ inline
    189     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
    190                                  const uint32_t* xy, int count, uint32_t* colors) {
    191         SkASSERT(count > 0 && colors != nullptr);
    192         SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    193         SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    194         SkASSERT(s.fAlphaScale <= 256);
    195 
    196         int y0, y1, wy;
    197         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
    198 
    199         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
    200              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
    201 
    202         // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
    203         // and another in the upper 4 16-bit lanes to line up with 16 - wy.
    204         const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),
    205                                                 _mm_set1_epi16(16-wy));
    206 
    207         while (count --> 0) {
    208             int x0, x1, wx;
    209             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
    210 
    211             // Load the 4 pixels we're interpolating.
    212             const __m128i a00 = _mm_cvtsi32_si128(row0[x0]),
    213                           a01 = _mm_cvtsi32_si128(row0[x1]),
    214                           a10 = _mm_cvtsi32_si128(row1[x0]),
    215                           a11 = _mm_cvtsi32_si128(row1[x1]);
    216 
    217             // Line up low-x pixels a00 and a10 with allY.
    218             __m128i a00a10 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a10, a00),
    219                                                _mm_setzero_si128());
    220 
    221             // Scale by allY and 16-wx.
    222             a00a10 = _mm_mullo_epi16(a00a10, allY);
    223             a00a10 = _mm_mullo_epi16(a00a10, _mm_set1_epi16(16-wx));
    224 
    225 
    226             // Line up high-x pixels a01 and a11 with allY.
    227             __m128i a01a11 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a11, a01),
    228                                                _mm_setzero_si128());
    229 
    230             // Scale by allY and wx.
    231             a01a11 = _mm_mullo_epi16(a01a11, allY);
    232             a01a11 = _mm_mullo_epi16(a01a11, _mm_set1_epi16(wx));
    233 
    234 
    235             // Add the two intermediates, summing across in one direction.
    236             __m128i halves = _mm_add_epi16(a00a10, a01a11);
    237 
    238             // Add the two halves to each other to sum in the other direction.
    239             __m128i sum = _mm_add_epi16(halves, _mm_srli_si128(halves, 8));
    240 
    241             // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
    242             sum = _mm_srli_epi16(sum, 8);
    243 
    244             if (s.fAlphaScale < 256) {
    245                 // Scale by alpha, which is in [0,256].
    246                 sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
    247                 sum = _mm_srli_epi16(sum, 8);
    248             }
    249 
    250             // Pack back into 8-bit values and store.
    251             *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
    252         }
    253     }
    254 
    255 #else
    256 
    257     // The NEON code only actually differs from the portable code in the
    258     // filtering step after we've loaded all four pixels we want to bilerp.
    259 
    260     #if defined(SK_ARM_HAS_NEON)
    261         static void filter_and_scale_by_alpha(unsigned x, unsigned y,
    262                                               SkPMColor a00, SkPMColor a01,
    263                                               SkPMColor a10, SkPMColor a11,
    264                                               SkPMColor *dst,
    265                                               uint16_t scale) {
    266             uint8x8_t vy, vconst16_8, v16_y, vres;
    267             uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
    268             uint32x2_t va0, va1;
    269             uint16x8_t tmp1, tmp2;
    270 
    271             vy = vdup_n_u8(y);                // duplicate y into vy
    272             vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    273             v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y
    274 
    275             va0 = vdup_n_u32(a00);            // duplicate a00
    276             va1 = vdup_n_u32(a10);            // duplicate a10
    277             va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    278             va1 = vset_lane_u32(a11, va1, 1); // set top to a11
    279 
    280             tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    281             tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y
    282 
    283             vx = vdup_n_u16(x);                // duplicate x into vx
    284             vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
    285             v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
    286 
    287             tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
    288             tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
    289             tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
    290             tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
    291 
    292             if (scale < 256) {
    293                 vscale = vdup_n_u16(scale);        // duplicate scale
    294                 tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
    295                 tmp = vmul_u16(tmp, vscale);       // multiply result by scale
    296             }
    297 
    298             vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
    299             vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
    300         }
    301     #else
    302         static void filter_and_scale_by_alpha(unsigned x, unsigned y,
    303                                               SkPMColor a00, SkPMColor a01,
    304                                               SkPMColor a10, SkPMColor a11,
    305                                               SkPMColor* dstColor,
    306                                               unsigned alphaScale) {
    307             SkASSERT((unsigned)x <= 0xF);
    308             SkASSERT((unsigned)y <= 0xF);
    309             SkASSERT(alphaScale <= 256);
    310 
    311             int xy = x * y;
    312             const uint32_t mask = 0xFF00FF;
    313 
    314             int scale = 256 - 16*y - 16*x + xy;
    315             uint32_t lo = (a00 & mask) * scale;
    316             uint32_t hi = ((a00 >> 8) & mask) * scale;
    317 
    318             scale = 16*x - xy;
    319             lo += (a01 & mask) * scale;
    320             hi += ((a01 >> 8) & mask) * scale;
    321 
    322             scale = 16*y - xy;
    323             lo += (a10 & mask) * scale;
    324             hi += ((a10 >> 8) & mask) * scale;
    325 
    326             lo += (a11 & mask) * xy;
    327             hi += ((a11 >> 8) & mask) * xy;
    328 
    329             if (alphaScale < 256) {
    330                 lo = ((lo >> 8) & mask) * alphaScale;
    331                 hi = ((hi >> 8) & mask) * alphaScale;
    332             }
    333 
    334             *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
    335         }
    336     #endif
    337 
    338 
    339     /*not static*/ inline
    340     void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
    341                                  const uint32_t* xy, int count, SkPMColor* colors) {
    342         SkASSERT(count > 0 && colors != nullptr);
    343         SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    344         SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
    345         SkASSERT(s.fAlphaScale <= 256);
    346 
    347         int y0, y1, wy;
    348         decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);
    349 
    350         auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
    351              row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );
    352 
    353         while (count --> 0) {
    354             int x0, x1, wx;
    355             decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);
    356 
    357             filter_and_scale_by_alpha(wx, wy,
    358                                       row0[x0], row0[x1],
    359                                       row1[x0], row1[x1],
    360                                       colors++,
    361                                       s.fAlphaScale);
    362         }
    363     }
    364 
    365 #endif
    366 
    367 }  // namespace SK_OPTS_NS
    368 
    369 #endif
    370