Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2013 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkBitmapProcState.h"
      9 #include "SkBitmap.h"
     10 #include "SkColor.h"
     11 #include "SkColorPriv.h"
     12 #include "SkUnPreMultiply.h"
     13 #include "SkShader.h"
     14 #include "SkConvolver.h"
     15 
     16 #include "SkBitmapFilter_opts_SSE2.h"
     17 
     18 #include <emmintrin.h>
     19 
     20 #if 0
     21 static inline void print128i(__m128i value) {
     22     int *v = (int*) &value;
     23     printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);
     24 }
     25 
     26 static inline void print128i_16(__m128i value) {
     27     short *v = (short*) &value;
     28     printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
     29 }
     30 
     31 static inline void print128i_8(__m128i value) {
     32     unsigned char *v = (unsigned char*) &value;
     33     printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
     34            v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
     35            v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]
     36            );
     37 }
     38 
     39 static inline void print128f(__m128 value) {
     40     float *f = (float*) &value;
     41     printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]);
     42 }
     43 #endif
     44 
     45 // because the border is handled specially, this is guaranteed to have all 16 pixels
     46 // available to it without running off the bitmap's edge.
     47 
// Globals with external linkage; nothing in the code visible in this file
// reads them.
// NOTE(review): presumably left over from debugging a specific pixel
// coordinate -- confirm no other translation unit references them before
// removing.
int debug_x = 20;
int debug_y = 255;
     50 
     51 void highQualityFilter_SSE2(const SkBitmapProcState& s, int x, int y,
     52                         SkPMColor* SK_RESTRICT colors, int count) {
     53 
     54     const int maxX = s.fBitmap->width() - 1;
     55     const int maxY = s.fBitmap->height() - 1;
     56 
     57     while (count-- > 0) {
     58         SkPoint srcPt;
     59         s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
     60                     SkIntToScalar(y), &srcPt);
     61         srcPt.fX -= SK_ScalarHalf;
     62         srcPt.fY -= SK_ScalarHalf;
     63 
     64         int sx = SkScalarFloorToInt(srcPt.fX);
     65         int sy = SkScalarFloorToInt(srcPt.fY);
     66 
     67         __m128 weight = _mm_setzero_ps();
     68         __m128 accum = _mm_setzero_ps();
     69 
     70         int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f)));
     71         int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f)));
     72         int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f)));
     73         int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f)));
     74 
     75         for (int src_y = y0; src_y <= y1; src_y++) {
     76             float yweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fY - src_y));
     77 
     78             for (int src_x = x0; src_x <= x1 ; src_x++) {
     79                 float xweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fX - src_x));
     80 
     81                 float combined_weight = xweight * yweight;
     82 
     83                 SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y);
     84 
     85                 __m128i c = _mm_cvtsi32_si128( color );
     86                 c = _mm_unpacklo_epi8(c, _mm_setzero_si128());
     87                 c = _mm_unpacklo_epi16(c, _mm_setzero_si128());
     88 
     89                 __m128 cfloat = _mm_cvtepi32_ps( c );
     90 
     91                 __m128 weightVector = _mm_set1_ps(combined_weight);
     92 
     93                 accum = _mm_add_ps(accum, _mm_mul_ps(cfloat, weightVector));
     94                 weight = _mm_add_ps( weight, weightVector );
     95             }
     96         }
     97 
     98         accum = _mm_div_ps(accum, weight);
     99         accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));
    100 
    101         __m128i accumInt = _mm_cvtps_epi32( accum );
    102 
    103         int localResult[4];
    104         _mm_storeu_si128((__m128i *) (localResult), accumInt);
    105         int a = SkClampMax(localResult[0], 255);
    106         int r = SkClampMax(localResult[1], a);
    107         int g = SkClampMax(localResult[2], a);
    108         int b = SkClampMax(localResult[3], a);
    109 
    110         *colors++ = SkPackARGB32(a, r, g, b);
    111 
    112         x++;
    113     }
    114 }
    115 
    116 void highQualityFilter_ScaleOnly_SSE2(const SkBitmapProcState &s, int x, int y,
    117                              SkPMColor *SK_RESTRICT colors, int count) {
    118     const int maxX = s.fBitmap->width() - 1;
    119     const int maxY = s.fBitmap->height() - 1;
    120 
    121     SkPoint srcPt;
    122     s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
    123                 SkIntToScalar(y), &srcPt);
    124     srcPt.fY -= SK_ScalarHalf;
    125     int sy = SkScalarFloorToInt(srcPt.fY);
    126 
    127     int y0 = SkTMax(0, int(ceil(sy-s.getBitmapFilter()->width() + 0.5f)));
    128     int y1 = SkTMin(maxY, int(floor(sy+s.getBitmapFilter()->width() + 0.5f)));
    129 
    130     while (count-- > 0) {
    131         srcPt.fX -= SK_ScalarHalf;
    132         srcPt.fY -= SK_ScalarHalf;
    133 
    134         int sx = SkScalarFloorToInt(srcPt.fX);
    135 
    136         float weight = 0;
    137         __m128 accum = _mm_setzero_ps();
    138 
    139         int x0 = SkTMax(0, int(ceil(sx-s.getBitmapFilter()->width() + 0.5f)));
    140         int x1 = SkTMin(maxX, int(floor(sx+s.getBitmapFilter()->width() + 0.5f)));
    141 
    142         for (int src_y = y0; src_y <= y1; src_y++) {
    143             float yweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fY - src_y));
    144 
    145             for (int src_x = x0; src_x <= x1 ; src_x++) {
    146                 float xweight = SkScalarToFloat(s.getBitmapFilter()->lookupScalar(srcPt.fX - src_x));
    147 
    148                 float combined_weight = xweight * yweight;
    149 
    150                 SkPMColor color = *s.fBitmap->getAddr32(src_x, src_y);
    151 
    152                 __m128 c = _mm_set_ps((float)SkGetPackedB32(color),
    153                                       (float)SkGetPackedG32(color),
    154                                       (float)SkGetPackedR32(color),
    155                                       (float)SkGetPackedA32(color));
    156 
    157                 __m128 weightVector = _mm_set1_ps(combined_weight);
    158 
    159                 accum = _mm_add_ps(accum, _mm_mul_ps(c, weightVector));
    160                 weight += combined_weight;
    161             }
    162         }
    163 
    164         __m128 totalWeightVector = _mm_set1_ps(weight);
    165         accum = _mm_div_ps(accum, totalWeightVector);
    166         accum = _mm_add_ps(accum, _mm_set1_ps(0.5f));
    167 
    168         float localResult[4];
    169         _mm_storeu_ps(localResult, accum);
    170         int a = SkClampMax(int(localResult[0]), 255);
    171         int r = SkClampMax(int(localResult[1]), a);
    172         int g = SkClampMax(int(localResult[2]), a);
    173         int b = SkClampMax(int(localResult[3]), a);
    174 
    175         *colors++ = SkPackARGB32(a, r, g, b);
    176 
    177         x++;
    178 
    179         s.fInvProc(s.fInvMatrix, SkIntToScalar(x),
    180                     SkIntToScalar(y), &srcPt);
    181 
    182     }
    183 }
    184 
    185 // Convolves horizontally along a single row. The row data is given in
    186 // |src_data| and continues for the num_values() of the filter.
    187 void convolveHorizontally_SSE2(const unsigned char* src_data,
    188                                const SkConvolutionFilter1D& filter,
    189                                unsigned char* out_row,
    190                                bool /*has_alpha*/) {
    191   int num_values = filter.numValues();
    192 
    193   int filter_offset, filter_length;
    194   __m128i zero = _mm_setzero_si128();
    195   __m128i mask[4];
    196   // |mask| will be used to decimate all extra filter coefficients that are
    197   // loaded by SIMD when |filter_length| is not divisible by 4.
    198   // mask[0] is not used in following algorithm.
    199   mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    200   mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    201   mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
    202 
    203   // Output one pixel each iteration, calculating all channels (RGBA) together.
    204   for (int out_x = 0; out_x < num_values; out_x++) {
    205     const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
    206         filter.FilterForValue(out_x, &filter_offset, &filter_length);
    207 
    208     __m128i accum = _mm_setzero_si128();
    209 
    210     // Compute the first pixel in this row that the filter affects. It will
    211     // touch |filter_length| pixels (4 bytes each) after this.
    212     const __m128i* row_to_filter =
    213         reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
    214 
    215     // We will load and accumulate with four coefficients per iteration.
    216     for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
    217 
    218       // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
    219       __m128i coeff, coeff16;
    220       // [16] xx xx xx xx c3 c2 c1 c0
    221       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    222       // [16] xx xx xx xx c1 c1 c0 c0
    223       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    224       // [16] c1 c1 c1 c1 c0 c0 c0 c0
    225       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    226 
    227       // Load four pixels => unpack the first two pixels to 16 bits =>
    228       // multiply with coefficients => accumulate the convolution result.
    229       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    230       __m128i src8 = _mm_loadu_si128(row_to_filter);
    231       // [16] a1 b1 g1 r1 a0 b0 g0 r0
    232       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    233       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    234       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    235       // [32]  a0*c0 b0*c0 g0*c0 r0*c0
    236       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    237       accum = _mm_add_epi32(accum, t);
    238       // [32]  a1*c1 b1*c1 g1*c1 r1*c1
    239       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    240       accum = _mm_add_epi32(accum, t);
    241 
    242       // Duplicate 3rd and 4th coefficients for all channels =>
    243       // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
    244       // => accumulate the convolution results.
    245       // [16] xx xx xx xx c3 c3 c2 c2
    246       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    247       // [16] c3 c3 c3 c3 c2 c2 c2 c2
    248       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    249       // [16] a3 g3 b3 r3 a2 g2 b2 r2
    250       src16 = _mm_unpackhi_epi8(src8, zero);
    251       mul_hi = _mm_mulhi_epi16(src16, coeff16);
    252       mul_lo = _mm_mullo_epi16(src16, coeff16);
    253       // [32]  a2*c2 b2*c2 g2*c2 r2*c2
    254       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    255       accum = _mm_add_epi32(accum, t);
    256       // [32]  a3*c3 b3*c3 g3*c3 r3*c3
    257       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    258       accum = _mm_add_epi32(accum, t);
    259 
    260       // Advance the pixel and coefficients pointers.
    261       row_to_filter += 1;
    262       filter_values += 4;
    263     }
    264 
    265     // When |filter_length| is not divisible by 4, we need to decimate some of
    266     // the filter coefficient that was loaded incorrectly to zero; Other than
    267     // that the algorithm is same with above, exceot that the 4th pixel will be
    268     // always absent.
    269     int r = filter_length&3;
    270     if (r) {
    271       // Note: filter_values must be padded to align_up(filter_offset, 8).
    272       __m128i coeff, coeff16;
    273       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    274       // Mask out extra filter taps.
    275       coeff = _mm_and_si128(coeff, mask[r]);
    276       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    277       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    278 
    279       // Note: line buffer must be padded to align_up(filter_offset, 16).
    280       // We resolve this by use C-version for the last horizontal line.
    281       __m128i src8 = _mm_loadu_si128(row_to_filter);
    282       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    283       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    284       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    285       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    286       accum = _mm_add_epi32(accum, t);
    287       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    288       accum = _mm_add_epi32(accum, t);
    289 
    290       src16 = _mm_unpackhi_epi8(src8, zero);
    291       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    292       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    293       mul_hi = _mm_mulhi_epi16(src16, coeff16);
    294       mul_lo = _mm_mullo_epi16(src16, coeff16);
    295       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    296       accum = _mm_add_epi32(accum, t);
    297     }
    298 
    299     // Shift right for fixed point implementation.
    300     accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
    301 
    302     // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    303     accum = _mm_packs_epi32(accum, zero);
    304     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    305     accum = _mm_packus_epi16(accum, zero);
    306 
    307     // Store the pixel value of 32 bits.
    308     *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
    309     out_row += 4;
    310   }
    311 }
    312 
    313 // Convolves horizontally along four rows. The row data is given in
    314 // |src_data| and continues for the num_values() of the filter.
    315 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
    316 // refer to that function for detailed comments.
    317 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
    318                                     const SkConvolutionFilter1D& filter,
    319                                     unsigned char* out_row[4]) {
    320   int num_values = filter.numValues();
    321 
    322   int filter_offset, filter_length;
    323   __m128i zero = _mm_setzero_si128();
    324   __m128i mask[4];
    325   // |mask| will be used to decimate all extra filter coefficients that are
    326   // loaded by SIMD when |filter_length| is not divisible by 4.
    327   // mask[0] is not used in following algorithm.
    328   mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    329   mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    330   mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
    331 
    332   // Output one pixel each iteration, calculating all channels (RGBA) together.
    333   for (int out_x = 0; out_x < num_values; out_x++) {
    334     const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
    335         filter.FilterForValue(out_x, &filter_offset, &filter_length);
    336 
    337     // four pixels in a column per iteration.
    338     __m128i accum0 = _mm_setzero_si128();
    339     __m128i accum1 = _mm_setzero_si128();
    340     __m128i accum2 = _mm_setzero_si128();
    341     __m128i accum3 = _mm_setzero_si128();
    342     int start = (filter_offset<<2);
    343     // We will load and accumulate with four coefficients per iteration.
    344     for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
    345       __m128i coeff, coeff16lo, coeff16hi;
    346       // [16] xx xx xx xx c3 c2 c1 c0
    347       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    348       // [16] xx xx xx xx c1 c1 c0 c0
    349       coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    350       // [16] c1 c1 c1 c1 c0 c0 c0 c0
    351       coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
    352       // [16] xx xx xx xx c3 c3 c2 c2
    353       coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    354       // [16] c3 c3 c3 c3 c2 c2 c2 c2
    355       coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
    356 
    357       __m128i src8, src16, mul_hi, mul_lo, t;
    358 
    359 #define ITERATION(src, accum)                                          \
    360       src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
    361       src16 = _mm_unpacklo_epi8(src8, zero);                           \
    362       mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
    363       mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
    364       t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
    365       accum = _mm_add_epi32(accum, t);                                 \
    366       t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
    367       accum = _mm_add_epi32(accum, t);                                 \
    368       src16 = _mm_unpackhi_epi8(src8, zero);                           \
    369       mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
    370       mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
    371       t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
    372       accum = _mm_add_epi32(accum, t);                                 \
    373       t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
    374       accum = _mm_add_epi32(accum, t)
    375 
    376       ITERATION(src_data[0] + start, accum0);
    377       ITERATION(src_data[1] + start, accum1);
    378       ITERATION(src_data[2] + start, accum2);
    379       ITERATION(src_data[3] + start, accum3);
    380 
    381       start += 16;
    382       filter_values += 4;
    383     }
    384 
    385     int r = filter_length & 3;
    386     if (r) {
    387       // Note: filter_values must be padded to align_up(filter_offset, 8);
    388       __m128i coeff;
    389       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    390       // Mask out extra filter taps.
    391       coeff = _mm_and_si128(coeff, mask[r]);
    392 
    393       __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    394       /* c1 c1 c1 c1 c0 c0 c0 c0 */
    395       coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
    396       __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    397       coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
    398 
    399       __m128i src8, src16, mul_hi, mul_lo, t;
    400 
    401       ITERATION(src_data[0] + start, accum0);
    402       ITERATION(src_data[1] + start, accum1);
    403       ITERATION(src_data[2] + start, accum2);
    404       ITERATION(src_data[3] + start, accum3);
    405     }
    406 
    407     accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    408     accum0 = _mm_packs_epi32(accum0, zero);
    409     accum0 = _mm_packus_epi16(accum0, zero);
    410     accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    411     accum1 = _mm_packs_epi32(accum1, zero);
    412     accum1 = _mm_packus_epi16(accum1, zero);
    413     accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    414     accum2 = _mm_packs_epi32(accum2, zero);
    415     accum2 = _mm_packus_epi16(accum2, zero);
    416     accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
    417     accum3 = _mm_packs_epi32(accum3, zero);
    418     accum3 = _mm_packus_epi16(accum3, zero);
    419 
    420     *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
    421     *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
    422     *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
    423     *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
    424 
    425     out_row[0] += 4;
    426     out_row[1] += 4;
    427     out_row[2] += 4;
    428     out_row[3] += 4;
    429   }
    430 }
    431 
    432 // Does vertical convolution to produce one output row. The filter values and
    433 // length are given in the first two parameters. These are applied to each
    434 // of the rows pointed to in the |source_data_rows| array, with each row
    435 // being |pixel_width| wide.
    436 //
    437 // The output must have room for |pixel_width * 4| bytes.
    438 template<bool has_alpha>
    439 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
    440                              int filter_length,
    441                              unsigned char* const* source_data_rows,
    442                              int pixel_width,
    443                              unsigned char* out_row) {
    444   int width = pixel_width & ~3;
    445 
    446   __m128i zero = _mm_setzero_si128();
    447   __m128i accum0, accum1, accum2, accum3, coeff16;
    448   const __m128i* src;
    449   // Output four pixels per iteration (16 bytes).
    450   for (int out_x = 0; out_x < width; out_x += 4) {
    451 
    452     // Accumulated result for each pixel. 32 bits per RGBA channel.
    453     accum0 = _mm_setzero_si128();
    454     accum1 = _mm_setzero_si128();
    455     accum2 = _mm_setzero_si128();
    456     accum3 = _mm_setzero_si128();
    457 
    458     // Convolve with one filter coefficient per iteration.
    459     for (int filter_y = 0; filter_y < filter_length; filter_y++) {
    460 
    461       // Duplicate the filter coefficient 8 times.
    462       // [16] cj cj cj cj cj cj cj cj
    463       coeff16 = _mm_set1_epi16(filter_values[filter_y]);
    464 
    465       // Load four pixels (16 bytes) together.
    466       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    467       src = reinterpret_cast<const __m128i*>(
    468           &source_data_rows[filter_y][out_x << 2]);
    469       __m128i src8 = _mm_loadu_si128(src);
    470 
    471       // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
    472       // multiply with current coefficient => accumulate the result.
    473       // [16] a1 b1 g1 r1 a0 b0 g0 r0
    474       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    475       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    476       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    477       // [32] a0 b0 g0 r0
    478       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    479       accum0 = _mm_add_epi32(accum0, t);
    480       // [32] a1 b1 g1 r1
    481       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    482       accum1 = _mm_add_epi32(accum1, t);
    483 
    484       // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
    485       // multiply with current coefficient => accumulate the result.
    486       // [16] a3 b3 g3 r3 a2 b2 g2 r2
    487       src16 = _mm_unpackhi_epi8(src8, zero);
    488       mul_hi = _mm_mulhi_epi16(src16, coeff16);
    489       mul_lo = _mm_mullo_epi16(src16, coeff16);
    490       // [32] a2 b2 g2 r2
    491       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    492       accum2 = _mm_add_epi32(accum2, t);
    493       // [32] a3 b3 g3 r3
    494       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    495       accum3 = _mm_add_epi32(accum3, t);
    496     }
    497 
    498     // Shift right for fixed point implementation.
    499     accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    500     accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    501     accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    502     accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
    503 
    504     // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    505     // [16] a1 b1 g1 r1 a0 b0 g0 r0
    506     accum0 = _mm_packs_epi32(accum0, accum1);
    507     // [16] a3 b3 g3 r3 a2 b2 g2 r2
    508     accum2 = _mm_packs_epi32(accum2, accum3);
    509 
    510     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    511     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    512     accum0 = _mm_packus_epi16(accum0, accum2);
    513 
    514     if (has_alpha) {
    515       // Compute the max(ri, gi, bi) for each pixel.
    516       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    517       __m128i a = _mm_srli_epi32(accum0, 8);
    518       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    519       __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    520       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    521       a = _mm_srli_epi32(accum0, 16);
    522       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    523       b = _mm_max_epu8(a, b);  // Max of r and g and b.
    524       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    525       b = _mm_slli_epi32(b, 24);
    526 
    527       // Make sure the value of alpha channel is always larger than maximum
    528       // value of color channels.
    529       accum0 = _mm_max_epu8(b, accum0);
    530     } else {
    531       // Set value of alpha channels to 0xFF.
    532       __m128i mask = _mm_set1_epi32(0xff000000);
    533       accum0 = _mm_or_si128(accum0, mask);
    534     }
    535 
    536     // Store the convolution result (16 bytes) and advance the pixel pointers.
    537     _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
    538     out_row += 16;
    539   }
    540 
    541   // When the width of the output is not divisible by 4, We need to save one
    542   // pixel (4 bytes) each time. And also the fourth pixel is always absent.
    543   if (pixel_width & 3) {
    544     accum0 = _mm_setzero_si128();
    545     accum1 = _mm_setzero_si128();
    546     accum2 = _mm_setzero_si128();
    547     for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
    548       coeff16 = _mm_set1_epi16(filter_values[filter_y]);
    549       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    550       src = reinterpret_cast<const __m128i*>(
    551           &source_data_rows[filter_y][width<<2]);
    552       __m128i src8 = _mm_loadu_si128(src);
    553       // [16] a1 b1 g1 r1 a0 b0 g0 r0
    554       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    555       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    556       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    557       // [32] a0 b0 g0 r0
    558       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    559       accum0 = _mm_add_epi32(accum0, t);
    560       // [32] a1 b1 g1 r1
    561       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    562       accum1 = _mm_add_epi32(accum1, t);
    563       // [16] a3 b3 g3 r3 a2 b2 g2 r2
    564       src16 = _mm_unpackhi_epi8(src8, zero);
    565       mul_hi = _mm_mulhi_epi16(src16, coeff16);
    566       mul_lo = _mm_mullo_epi16(src16, coeff16);
    567       // [32] a2 b2 g2 r2
    568       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    569       accum2 = _mm_add_epi32(accum2, t);
    570     }
    571 
    572     accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    573     accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    574     accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    575     // [16] a1 b1 g1 r1 a0 b0 g0 r0
    576     accum0 = _mm_packs_epi32(accum0, accum1);
    577     // [16] a3 b3 g3 r3 a2 b2 g2 r2
    578     accum2 = _mm_packs_epi32(accum2, zero);
    579     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    580     accum0 = _mm_packus_epi16(accum0, accum2);
    581     if (has_alpha) {
    582       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    583       __m128i a = _mm_srli_epi32(accum0, 8);
    584       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    585       __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    586       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    587       a = _mm_srli_epi32(accum0, 16);
    588       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    589       b = _mm_max_epu8(a, b);  // Max of r and g and b.
    590       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    591       b = _mm_slli_epi32(b, 24);
    592       accum0 = _mm_max_epu8(b, accum0);
    593     } else {
    594       __m128i mask = _mm_set1_epi32(0xff000000);
    595       accum0 = _mm_or_si128(accum0, mask);
    596     }
    597 
    598     for (int out_x = width; out_x < pixel_width; out_x++) {
    599       *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
    600       accum0 = _mm_srli_si128(accum0, 4);
    601       out_row += 4;
    602     }
    603   }
    604 }
    605 
    606 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
    607                              int filter_length,
    608                              unsigned char* const* source_data_rows,
    609                              int pixel_width,
    610                              unsigned char* out_row,
    611                              bool has_alpha) {
    612   if (has_alpha) {
    613     convolveVertically_SSE2<true>(filter_values,
    614                                   filter_length,
    615                                   source_data_rows,
    616                                   pixel_width,
    617                                   out_row);
    618   } else {
    619     convolveVertically_SSE2<false>(filter_values,
    620                                    filter_length,
    621                                    source_data_rows,
    622                                    pixel_width,
    623                                    out_row);
    624   }
    625 }
    626 
    627 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
    628     // Padding |paddingCount| of more dummy coefficients after the coefficients
    629     // of last filter to prevent SIMD instructions which load 8 or 16 bytes
    630     // together to access invalid memory areas. We are not trying to align the
    631     // coefficients right now due to the opaqueness of <vector> implementation.
    632     // This has to be done after all |AddFilter| calls.
    633     for (int i = 0; i < 8; ++i) {
    634         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
    635     }
    636 }
    637