Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2009 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBitmapProcState_opts_SSE2.h"
     10 #include "SkBitmapProcState_utils.h"
     11 #include "SkColorData.h"
     12 #include "SkPaint.h"
     13 #include "SkUtils.h"
     14 
     15 void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
     16                                    const uint32_t* xy,
     17                                    int count, uint32_t* colors) {
     18     SkASSERT(count > 0 && colors != nullptr);
     19     SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
     20     SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
     21     SkASSERT(s.fAlphaScale == 256);
     22 
     23     const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
     24     size_t rb = s.fPixmap.rowBytes();
     25     uint32_t XY = *xy++;
     26     unsigned y0 = XY >> 14;
     27     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
     28     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
     29     unsigned subY = y0 & 0xF;
     30 
     31     // ( 0,  0,  0,  0,  0,  0,  0, 16)
     32     __m128i sixteen = _mm_cvtsi32_si128(16);
     33 
     34     // ( 0,  0,  0,  0, 16, 16, 16, 16)
     35     sixteen = _mm_shufflelo_epi16(sixteen, 0);
     36 
     37     // ( 0,  0,  0,  0,  0,  0,  0,  y)
     38     __m128i allY = _mm_cvtsi32_si128(subY);
     39 
     40     // ( 0,  0,  0,  0,  y,  y,  y,  y)
     41     allY = _mm_shufflelo_epi16(allY, 0);
     42 
     43     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
     44     __m128i negY = _mm_sub_epi16(sixteen, allY);
     45 
     46     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
     47     allY = _mm_unpacklo_epi64(allY, negY);
     48 
     49     // (16, 16, 16, 16, 16, 16, 16, 16 )
     50     sixteen = _mm_shuffle_epi32(sixteen, 0);
     51 
     52     // ( 0,  0,  0,  0,  0,  0,  0,  0)
     53     __m128i zero = _mm_setzero_si128();
     54     do {
     55         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
     56         unsigned x0 = XX >> 18;
     57         unsigned x1 = XX & 0x3FFF;
     58 
     59         // (0, 0, 0, 0, 0, 0, 0, x)
     60         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
     61 
     62         // (0, 0, 0, 0, x, x, x, x)
     63         allX = _mm_shufflelo_epi16(allX, 0);
     64 
     65         // (x, x, x, x, x, x, x, x)
     66         allX = _mm_shuffle_epi32(allX, 0);
     67 
     68         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
     69         __m128i negX = _mm_sub_epi16(sixteen, allX);
     70 
     71         // Load 4 samples (pixels).
     72         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
     73         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
     74         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
     75         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
     76 
     77         // (0, 0, a00, a10)
     78         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
     79 
     80         // Expand to 16 bits per component.
     81         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
     82 
     83         // ((a00 * (16-y)), (a10 * y)).
     84         a00a10 = _mm_mullo_epi16(a00a10, allY);
     85 
     86         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
     87         a00a10 = _mm_mullo_epi16(a00a10, negX);
     88 
     89         // (0, 0, a01, a10)
     90         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
     91 
     92         // Expand to 16 bits per component.
     93         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
     94 
     95         // (a01 * (16-y)), (a11 * y)
     96         a01a11 = _mm_mullo_epi16(a01a11, allY);
     97 
     98         // (a01 * (16-y) * x), (a11 * y * x)
     99         a01a11 = _mm_mullo_epi16(a01a11, allX);
    100 
    101         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
    102         __m128i sum = _mm_add_epi16(a00a10, a01a11);
    103 
    104         // (DC, a00*w00 + a01*w01)
    105         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
    106 
    107         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
    108         sum = _mm_add_epi16(sum, shifted);
    109 
    110         // Divide each 16 bit component by 256.
    111         sum = _mm_srli_epi16(sum, 8);
    112 
    113         // Pack lower 4 16 bit values of sum into lower 4 bytes.
    114         sum = _mm_packus_epi16(sum, zero);
    115 
    116         // Extract low int and store.
    117         *colors++ = _mm_cvtsi128_si32(sum);
    118     } while (--count > 0);
    119 }
    120 
    121 void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
    122                                   const uint32_t* xy,
    123                                   int count, uint32_t* colors) {
    124     SkASSERT(count > 0 && colors != nullptr);
    125     SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    126     SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    127     SkASSERT(s.fAlphaScale < 256);
    128 
    129     const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
    130     size_t rb = s.fPixmap.rowBytes();
    131     uint32_t XY = *xy++;
    132     unsigned y0 = XY >> 14;
    133     const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
    134     const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
    135     unsigned subY = y0 & 0xF;
    136 
    137     // ( 0,  0,  0,  0,  0,  0,  0, 16)
    138     __m128i sixteen = _mm_cvtsi32_si128(16);
    139 
    140     // ( 0,  0,  0,  0, 16, 16, 16, 16)
    141     sixteen = _mm_shufflelo_epi16(sixteen, 0);
    142 
    143     // ( 0,  0,  0,  0,  0,  0,  0,  y)
    144     __m128i allY = _mm_cvtsi32_si128(subY);
    145 
    146     // ( 0,  0,  0,  0,  y,  y,  y,  y)
    147     allY = _mm_shufflelo_epi16(allY, 0);
    148 
    149     // ( 0,  0,  0,  0, 16-y, 16-y, 16-y, 16-y)
    150     __m128i negY = _mm_sub_epi16(sixteen, allY);
    151 
    152     // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
    153     allY = _mm_unpacklo_epi64(allY, negY);
    154 
    155     // (16, 16, 16, 16, 16, 16, 16, 16 )
    156     sixteen = _mm_shuffle_epi32(sixteen, 0);
    157 
    158     // ( 0,  0,  0,  0,  0,  0,  0,  0)
    159     __m128i zero = _mm_setzero_si128();
    160 
    161     // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
    162     __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
    163 
    164     do {
    165         uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
    166         unsigned x0 = XX >> 18;
    167         unsigned x1 = XX & 0x3FFF;
    168 
    169         // (0, 0, 0, 0, 0, 0, 0, x)
    170         __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
    171 
    172         // (0, 0, 0, 0, x, x, x, x)
    173         allX = _mm_shufflelo_epi16(allX, 0);
    174 
    175         // (x, x, x, x, x, x, x, x)
    176         allX = _mm_shuffle_epi32(allX, 0);
    177 
    178         // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
    179         __m128i negX = _mm_sub_epi16(sixteen, allX);
    180 
    181         // Load 4 samples (pixels).
    182         __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
    183         __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
    184         __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
    185         __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
    186 
    187         // (0, 0, a00, a10)
    188         __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
    189 
    190         // Expand to 16 bits per component.
    191         a00a10 = _mm_unpacklo_epi8(a00a10, zero);
    192 
    193         // ((a00 * (16-y)), (a10 * y)).
    194         a00a10 = _mm_mullo_epi16(a00a10, allY);
    195 
    196         // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
    197         a00a10 = _mm_mullo_epi16(a00a10, negX);
    198 
    199         // (0, 0, a01, a10)
    200         __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
    201 
    202         // Expand to 16 bits per component.
    203         a01a11 = _mm_unpacklo_epi8(a01a11, zero);
    204 
    205         // (a01 * (16-y)), (a11 * y)
    206         a01a11 = _mm_mullo_epi16(a01a11, allY);
    207 
    208         // (a01 * (16-y) * x), (a11 * y * x)
    209         a01a11 = _mm_mullo_epi16(a01a11, allX);
    210 
    211         // (a00*w00 + a01*w01, a10*w10 + a11*w11)
    212         __m128i sum = _mm_add_epi16(a00a10, a01a11);
    213 
    214         // (DC, a00*w00 + a01*w01)
    215         __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
    216 
    217         // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
    218         sum = _mm_add_epi16(sum, shifted);
    219 
    220         // Divide each 16 bit component by 256.
    221         sum = _mm_srli_epi16(sum, 8);
    222 
    223         // Multiply by alpha.
    224         sum = _mm_mullo_epi16(sum, alpha);
    225 
    226         // Divide each 16 bit component by 256.
    227         sum = _mm_srli_epi16(sum, 8);
    228 
    229         // Pack lower 4 16 bit values of sum into lower 4 bytes.
    230         sum = _mm_packus_epi16(sum, zero);
    231 
    232         // Extract low int and store.
    233         *colors++ = _mm_cvtsi128_si32(sum);
    234     } while (--count > 0);
    235 }
    236 
    237 static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
    238                                                  SkFixed one) {
    239     unsigned i = SkClampMax(f >> 16, max);
    240     i = (i << 4) | ((f >> 12) & 0xF);
    241     return (i << 14) | SkClampMax((f + one) >> 16, max);
    242 }
    243 
    244 /*  SSE version of ClampX_ClampY_filter_scale()
    245  *  portable version is in core/SkBitmapProcState_matrix.h
    246  */
    247 void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
    248                                      int count, int x, int y) {
    249     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    250                              SkMatrix::kScale_Mask)) == 0);
    251     SkASSERT(s.fInvKy == 0);
    252 
    253     const unsigned maxX = s.fPixmap.width() - 1;
    254     const SkFixed one = s.fFilterOneX;
    255     const SkFixed dx = s.fInvSx;
    256 
    257     const SkBitmapProcStateAutoMapper mapper(s, x, y);
    258     const SkFixed fy = mapper.fixedY();
    259     const unsigned maxY = s.fPixmap.height() - 1;
    260     // compute our two Y values up front
    261     *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
    262     // now initialize fx
    263     SkFixed fx = mapper.fixedX();
    264 
    265     // test if we don't need to apply the tile proc
    266     if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
    267         if (count >= 4) {
    268             // SSE version of decal_filter_scale
    269             while ((size_t(xy) & 0x0F) != 0) {
    270                 SkASSERT((fx >> (16 + 14)) == 0);
    271                 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
    272                 fx += dx;
    273                 count--;
    274             }
    275 
    276             __m128i wide_1    = _mm_set1_epi32(1);
    277             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
    278             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
    279                                               fx + dx, fx);
    280 
    281             while (count >= 4) {
    282                 __m128i wide_out;
    283 
    284                 wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
    285                 wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
    286                                         _mm_srai_epi32(wide_fx, 16), wide_1));
    287 
    288                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
    289 
    290                 xy += 4;
    291                 fx += dx * 4;
    292                 wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
    293                 count -= 4;
    294             } // while count >= 4
    295         } // if count >= 4
    296 
    297         while (count-- > 0) {
    298             SkASSERT((fx >> (16 + 14)) == 0);
    299             *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
    300             fx += dx;
    301         }
    302     } else {
    303         // SSE2 only support 16bit interger max & min, so only process the case
    304         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
    305         // height, there should be rare bitmap whose height will be greater
    306         // than max 16bit interger in the real world.
    307         if ((count >= 4) && (maxX <= 0xFFFF)) {
    308             while (((size_t)xy & 0x0F) != 0) {
    309                 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
    310                 fx += dx;
    311                 count--;
    312             }
    313 
    314             __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
    315                                               fx + dx, fx);
    316             __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
    317             __m128i wide_one  = _mm_set1_epi32(one);
    318             __m128i wide_maxX = _mm_set1_epi32(maxX);
    319             __m128i wide_mask = _mm_set1_epi32(0xF);
    320 
    321              while (count >= 4) {
    322                 __m128i wide_i;
    323                 __m128i wide_lo;
    324                 __m128i wide_fx1;
    325 
    326                 // i = SkClampMax(f>>16,maxX)
    327                 wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
    328                                        _mm_setzero_si128());
    329                 wide_i = _mm_min_epi16(wide_i, wide_maxX);
    330 
    331                 // i<<4 | EXTRACT_LOW_BITS(fx)
    332                 wide_lo = _mm_srli_epi32(wide_fx, 12);
    333                 wide_lo = _mm_and_si128(wide_lo, wide_mask);
    334                 wide_i  = _mm_slli_epi32(wide_i, 4);
    335                 wide_i  = _mm_or_si128(wide_i, wide_lo);
    336 
    337                 // i<<14
    338                 wide_i = _mm_slli_epi32(wide_i, 14);
    339 
    340                 // SkClampMax(((f+one))>>16,max)
    341                 wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
    342                 wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
    343                                                         _mm_setzero_si128());
    344                 wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
    345 
    346                 // final combination
    347                 wide_i = _mm_or_si128(wide_i, wide_fx1);
    348                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
    349 
    350                 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
    351                 fx += dx * 4;
    352                 xy += 4;
    353                 count -= 4;
    354             } // while count >= 4
    355         } // if count >= 4
    356 
    357         while (count-- > 0) {
    358             *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
    359             fx += dx;
    360         }
    361     }
    362 }
    363 
    364 /*  SSE version of ClampX_ClampY_nofilter_scale()
    365  *  portable version is in core/SkBitmapProcState_matrix.h
    366  */
    367 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
    368                                     uint32_t xy[], int count, int x, int y) {
    369     SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    370                              SkMatrix::kScale_Mask)) == 0);
    371 
    372     // we store y, x, x, x, x, x
    373     const unsigned maxX = s.fPixmap.width() - 1;
    374     const SkBitmapProcStateAutoMapper mapper(s, x, y);
    375     const unsigned maxY = s.fPixmap.height() - 1;
    376     *xy++ = SkClampMax(mapper.intY(), maxY);
    377     SkFixed fx = mapper.fixedX();
    378 
    379     if (0 == maxX) {
    380         // all of the following X values must be 0
    381         memset(xy, 0, count * sizeof(uint16_t));
    382         return;
    383     }
    384 
    385     const SkFixed dx = s.fInvSx;
    386 
    387     // test if we don't need to apply the tile proc
    388     if ((unsigned)(fx >> 16) <= maxX &&
    389         (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
    390         // SSE version of decal_nofilter_scale
    391         if (count >= 8) {
    392             while (((size_t)xy & 0x0F) != 0) {
    393                 *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
    394                 fx += 2 * dx;
    395                 count -= 2;
    396             }
    397 
    398             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
    399             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
    400 
    401             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
    402                                              fx + dx, fx);
    403             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
    404 
    405             while (count >= 8) {
    406                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
    407                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
    408 
    409                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
    410                                                       wide_out_high);
    411                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
    412 
    413                 wide_low = _mm_add_epi32(wide_low, wide_dx8);
    414                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
    415 
    416                 xy += 4;
    417                 fx += dx * 8;
    418                 count -= 8;
    419             }
    420         } // if count >= 8
    421 
    422         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
    423         while (count-- > 0) {
    424             *xx++ = SkToU16(fx >> 16);
    425             fx += dx;
    426         }
    427     } else {
    428         // SSE2 only support 16bit interger max & min, so only process the case
    429         // maxX less than the max 16bit interger. Actually maxX is the bitmap's
    430         // height, there should be rare bitmap whose height will be greater
    431         // than max 16bit interger in the real world.
    432         if ((count >= 8) && (maxX <= 0xFFFF)) {
    433             while (((size_t)xy & 0x0F) != 0) {
    434                 *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
    435                                         SkClampMax(fx >> 16, maxX));
    436                 fx += 2 * dx;
    437                 count -= 2;
    438             }
    439 
    440             __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
    441             __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
    442 
    443             __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
    444                                              fx + dx, fx);
    445             __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
    446             __m128i wide_maxX = _mm_set1_epi32(maxX);
    447 
    448             while (count >= 8) {
    449                 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
    450                 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
    451 
    452                 wide_out_low  = _mm_max_epi16(wide_out_low,
    453                                               _mm_setzero_si128());
    454                 wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
    455                 wide_out_high = _mm_max_epi16(wide_out_high,
    456                                               _mm_setzero_si128());
    457                 wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
    458 
    459                 __m128i wide_result = _mm_packs_epi32(wide_out_low,
    460                                                       wide_out_high);
    461                 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
    462 
    463                 wide_low  = _mm_add_epi32(wide_low, wide_dx8);
    464                 wide_high = _mm_add_epi32(wide_high, wide_dx8);
    465 
    466                 xy += 4;
    467                 fx += dx * 8;
    468                 count -= 8;
    469             }
    470         } // if count >= 8
    471 
    472         uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
    473         while (count-- > 0) {
    474             *xx++ = SkClampMax(fx >> 16, maxX);
    475             fx += dx;
    476         }
    477     }
    478 }
    479