Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2013 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBitmap.h"
     10 #include "SkBitmapFilter_opts_SSE2.h"
     11 #include "SkBitmapProcState.h"
     12 #include "SkColor.h"
     13 #include "SkColorPriv.h"
     14 #include "SkConvolver.h"
     15 #include "SkShader.h"
     16 #include "SkUnPreMultiply.h"
     17 
     18 #if 0
     19 static inline void print128i(__m128i value) {
     20     int *v = (int*) &value;
     21     printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);
     22 }
     23 
     24 static inline void print128i_16(__m128i value) {
     25     short *v = (short*) &value;
     26     printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
     27 }
     28 
     29 static inline void print128i_8(__m128i value) {
     30     unsigned char *v = (unsigned char*) &value;
     31     printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
     32            v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
     33            v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]
     34            );
     35 }
     36 
     37 static inline void print128f(__m128 value) {
     38     float *f = (float*) &value;
     39     printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]);
     40 }
     41 #endif
     42 
     43 // Convolves horizontally along a single row. The row data is given in
     44 // |src_data| and continues for the num_values() of the filter.
     45 void convolveHorizontally_SSE2(const unsigned char* src_data,
     46                                const SkConvolutionFilter1D& filter,
     47                                unsigned char* out_row,
     48                                bool /*has_alpha*/) {
     49     int num_values = filter.numValues();
     50 
     51     int filter_offset, filter_length;
     52     __m128i zero = _mm_setzero_si128();
     53     __m128i mask[4];
     54     // |mask| will be used to decimate all extra filter coefficients that are
     55     // loaded by SIMD when |filter_length| is not divisible by 4.
     56     // mask[0] is not used in following algorithm.
     57     mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
     58     mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
     59     mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
     60 
     61     // Output one pixel each iteration, calculating all channels (RGBA) together.
     62     for (int out_x = 0; out_x < num_values; out_x++) {
     63         const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
     64             filter.FilterForValue(out_x, &filter_offset, &filter_length);
     65 
     66         __m128i accum = _mm_setzero_si128();
     67 
     68         // Compute the first pixel in this row that the filter affects. It will
     69         // touch |filter_length| pixels (4 bytes each) after this.
     70         const __m128i* row_to_filter =
     71             reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
     72 
     73         // We will load and accumulate with four coefficients per iteration.
     74         for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
     75 
     76             // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
     77             __m128i coeff, coeff16;
     78             // [16] xx xx xx xx c3 c2 c1 c0
     79             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
     80             // [16] xx xx xx xx c1 c1 c0 c0
     81             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
     82             // [16] c1 c1 c1 c1 c0 c0 c0 c0
     83             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
     84 
     85             // Load four pixels => unpack the first two pixels to 16 bits =>
     86             // multiply with coefficients => accumulate the convolution result.
     87             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     88             __m128i src8 = _mm_loadu_si128(row_to_filter);
     89             // [16] a1 b1 g1 r1 a0 b0 g0 r0
     90             __m128i src16 = _mm_unpacklo_epi8(src8, zero);
     91             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
     92             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
     93             // [32]  a0*c0 b0*c0 g0*c0 r0*c0
     94             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     95             accum = _mm_add_epi32(accum, t);
     96             // [32]  a1*c1 b1*c1 g1*c1 r1*c1
     97             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     98             accum = _mm_add_epi32(accum, t);
     99 
    100             // Duplicate 3rd and 4th coefficients for all channels =>
    101             // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
    102             // => accumulate the convolution results.
    103             // [16] xx xx xx xx c3 c3 c2 c2
    104             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    105             // [16] c3 c3 c3 c3 c2 c2 c2 c2
    106             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    107             // [16] a3 g3 b3 r3 a2 g2 b2 r2
    108             src16 = _mm_unpackhi_epi8(src8, zero);
    109             mul_hi = _mm_mulhi_epi16(src16, coeff16);
    110             mul_lo = _mm_mullo_epi16(src16, coeff16);
    111             // [32]  a2*c2 b2*c2 g2*c2 r2*c2
    112             t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    113             accum = _mm_add_epi32(accum, t);
    114             // [32]  a3*c3 b3*c3 g3*c3 r3*c3
    115             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    116             accum = _mm_add_epi32(accum, t);
    117 
    118             // Advance the pixel and coefficients pointers.
    119             row_to_filter += 1;
    120             filter_values += 4;
    121         }
    122 
    123         // When |filter_length| is not divisible by 4, we need to decimate some of
    124         // the filter coefficient that was loaded incorrectly to zero; Other than
    125         // that the algorithm is same with above, exceot that the 4th pixel will be
    126         // always absent.
    127         int r = filter_length&3;
    128         if (r) {
    129             // Note: filter_values must be padded to align_up(filter_offset, 8).
    130             __m128i coeff, coeff16;
    131             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    132             // Mask out extra filter taps.
    133             coeff = _mm_and_si128(coeff, mask[r]);
    134             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    135             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    136 
    137             // Note: line buffer must be padded to align_up(filter_offset, 16).
    138             // We resolve this by use C-version for the last horizontal line.
    139             __m128i src8 = _mm_loadu_si128(row_to_filter);
    140             __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    141             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    142             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    143             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    144             accum = _mm_add_epi32(accum, t);
    145             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    146             accum = _mm_add_epi32(accum, t);
    147 
    148             src16 = _mm_unpackhi_epi8(src8, zero);
    149             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    150             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    151             mul_hi = _mm_mulhi_epi16(src16, coeff16);
    152             mul_lo = _mm_mullo_epi16(src16, coeff16);
    153             t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    154             accum = _mm_add_epi32(accum, t);
    155         }
    156 
    157         // Shift right for fixed point implementation.
    158         accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
    159 
    160         // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    161         accum = _mm_packs_epi32(accum, zero);
    162         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    163         accum = _mm_packus_epi16(accum, zero);
    164 
    165         // Store the pixel value of 32 bits.
    166         *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
    167         out_row += 4;
    168     }
    169 }
    170 
    171 // Convolves horizontally along four rows. The row data is given in
    172 // |src_data| and continues for the num_values() of the filter.
    173 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
    174 // refer to that function for detailed comments.
    175 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
    176                                     const SkConvolutionFilter1D& filter,
    177                                     unsigned char* out_row[4],
    178                                     size_t outRowBytes) {
    179     SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];)
    180 
    181     int num_values = filter.numValues();
    182 
    183     int filter_offset, filter_length;
    184     __m128i zero = _mm_setzero_si128();
    185     __m128i mask[4];
    186     // |mask| will be used to decimate all extra filter coefficients that are
    187     // loaded by SIMD when |filter_length| is not divisible by 4.
    188     // mask[0] is not used in following algorithm.
    189     mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    190     mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    191     mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
    192 
    193     // Output one pixel each iteration, calculating all channels (RGBA) together.
    194     for (int out_x = 0; out_x < num_values; out_x++) {
    195         const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
    196             filter.FilterForValue(out_x, &filter_offset, &filter_length);
    197 
    198         // four pixels in a column per iteration.
    199         __m128i accum0 = _mm_setzero_si128();
    200         __m128i accum1 = _mm_setzero_si128();
    201         __m128i accum2 = _mm_setzero_si128();
    202         __m128i accum3 = _mm_setzero_si128();
    203         int start = (filter_offset<<2);
    204         // We will load and accumulate with four coefficients per iteration.
    205         for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
    206             __m128i coeff, coeff16lo, coeff16hi;
    207             // [16] xx xx xx xx c3 c2 c1 c0
    208             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    209             // [16] xx xx xx xx c1 c1 c0 c0
    210             coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    211             // [16] c1 c1 c1 c1 c0 c0 c0 c0
    212             coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
    213             // [16] xx xx xx xx c3 c3 c2 c2
    214             coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    215             // [16] c3 c3 c3 c3 c2 c2 c2 c2
    216             coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
    217 
    218             __m128i src8, src16, mul_hi, mul_lo, t;
    219 
    220 #define ITERATION(src, accum)                                                \
    221             src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
    222             src16 = _mm_unpacklo_epi8(src8, zero);                           \
    223             mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
    224             mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
    225             t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
    226             accum = _mm_add_epi32(accum, t);                                 \
    227             t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
    228             accum = _mm_add_epi32(accum, t);                                 \
    229             src16 = _mm_unpackhi_epi8(src8, zero);                           \
    230             mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
    231             mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
    232             t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
    233             accum = _mm_add_epi32(accum, t);                                 \
    234             t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
    235             accum = _mm_add_epi32(accum, t)
    236 
    237             ITERATION(src_data[0] + start, accum0);
    238             ITERATION(src_data[1] + start, accum1);
    239             ITERATION(src_data[2] + start, accum2);
    240             ITERATION(src_data[3] + start, accum3);
    241 
    242             start += 16;
    243             filter_values += 4;
    244         }
    245 
    246         int r = filter_length & 3;
    247         if (r) {
    248             // Note: filter_values must be padded to align_up(filter_offset, 8);
    249             __m128i coeff;
    250             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    251             // Mask out extra filter taps.
    252             coeff = _mm_and_si128(coeff, mask[r]);
    253 
    254             __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    255             /* c1 c1 c1 c1 c0 c0 c0 c0 */
    256             coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
    257             __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    258             coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
    259 
    260             __m128i src8, src16, mul_hi, mul_lo, t;
    261 
    262             ITERATION(src_data[0] + start, accum0);
    263             ITERATION(src_data[1] + start, accum1);
    264             ITERATION(src_data[2] + start, accum2);
    265             ITERATION(src_data[3] + start, accum3);
    266         }
    267 
    268         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    269         accum0 = _mm_packs_epi32(accum0, zero);
    270         accum0 = _mm_packus_epi16(accum0, zero);
    271         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    272         accum1 = _mm_packs_epi32(accum1, zero);
    273         accum1 = _mm_packus_epi16(accum1, zero);
    274         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    275         accum2 = _mm_packs_epi32(accum2, zero);
    276         accum2 = _mm_packus_epi16(accum2, zero);
    277         accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
    278         accum3 = _mm_packs_epi32(accum3, zero);
    279         accum3 = _mm_packus_epi16(accum3, zero);
    280 
    281         // We seem to be running off the edge here (chromium:491660).
    282         SkASSERT(((size_t)out_row[0] - (size_t)out_row_0_start) < outRowBytes);
    283 
    284         *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
    285         *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
    286         *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
    287         *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
    288 
    289         out_row[0] += 4;
    290         out_row[1] += 4;
    291         out_row[2] += 4;
    292         out_row[3] += 4;
    293     }
    294 }
    295 
    296 // Does vertical convolution to produce one output row. The filter values and
    297 // length are given in the first two parameters. These are applied to each
    298 // of the rows pointed to in the |source_data_rows| array, with each row
    299 // being |pixel_width| wide.
    300 //
    301 // The output must have room for |pixel_width * 4| bytes.
    302 template<bool has_alpha>
    303 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
    304                              int filter_length,
    305                              unsigned char* const* source_data_rows,
    306                              int pixel_width,
    307                              unsigned char* out_row) {
    308     int width = pixel_width & ~3;
    309 
    310     __m128i zero = _mm_setzero_si128();
    311     __m128i accum0, accum1, accum2, accum3, coeff16;
    312     const __m128i* src;
    313     // Output four pixels per iteration (16 bytes).
    314     for (int out_x = 0; out_x < width; out_x += 4) {
    315 
    316         // Accumulated result for each pixel. 32 bits per RGBA channel.
    317         accum0 = _mm_setzero_si128();
    318         accum1 = _mm_setzero_si128();
    319         accum2 = _mm_setzero_si128();
    320         accum3 = _mm_setzero_si128();
    321 
    322         // Convolve with one filter coefficient per iteration.
    323         for (int filter_y = 0; filter_y < filter_length; filter_y++) {
    324 
    325             // Duplicate the filter coefficient 8 times.
    326             // [16] cj cj cj cj cj cj cj cj
    327             coeff16 = _mm_set1_epi16(filter_values[filter_y]);
    328 
    329             // Load four pixels (16 bytes) together.
    330             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    331             src = reinterpret_cast<const __m128i*>(
    332                 &source_data_rows[filter_y][out_x << 2]);
    333             __m128i src8 = _mm_loadu_si128(src);
    334 
    335             // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
    336             // multiply with current coefficient => accumulate the result.
    337             // [16] a1 b1 g1 r1 a0 b0 g0 r0
    338             __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    339             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    340             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    341             // [32] a0 b0 g0 r0
    342             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    343             accum0 = _mm_add_epi32(accum0, t);
    344             // [32] a1 b1 g1 r1
    345             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    346             accum1 = _mm_add_epi32(accum1, t);
    347 
    348             // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
    349             // multiply with current coefficient => accumulate the result.
    350             // [16] a3 b3 g3 r3 a2 b2 g2 r2
    351             src16 = _mm_unpackhi_epi8(src8, zero);
    352             mul_hi = _mm_mulhi_epi16(src16, coeff16);
    353             mul_lo = _mm_mullo_epi16(src16, coeff16);
    354             // [32] a2 b2 g2 r2
    355             t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    356             accum2 = _mm_add_epi32(accum2, t);
    357             // [32] a3 b3 g3 r3
    358             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    359             accum3 = _mm_add_epi32(accum3, t);
    360         }
    361 
    362         // Shift right for fixed point implementation.
    363         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    364         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    365         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    366         accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
    367 
    368         // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    369         // [16] a1 b1 g1 r1 a0 b0 g0 r0
    370         accum0 = _mm_packs_epi32(accum0, accum1);
    371         // [16] a3 b3 g3 r3 a2 b2 g2 r2
    372         accum2 = _mm_packs_epi32(accum2, accum3);
    373 
    374         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    375         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    376         accum0 = _mm_packus_epi16(accum0, accum2);
    377 
    378         if (has_alpha) {
    379             // Compute the max(ri, gi, bi) for each pixel.
    380             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    381             __m128i a = _mm_srli_epi32(accum0, 8);
    382             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    383             __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    384             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    385             a = _mm_srli_epi32(accum0, 16);
    386             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    387             b = _mm_max_epu8(a, b);  // Max of r and g and b.
    388             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    389             b = _mm_slli_epi32(b, 24);
    390 
    391             // Make sure the value of alpha channel is always larger than maximum
    392             // value of color channels.
    393             accum0 = _mm_max_epu8(b, accum0);
    394         } else {
    395             // Set value of alpha channels to 0xFF.
    396             __m128i mask = _mm_set1_epi32(0xff000000);
    397             accum0 = _mm_or_si128(accum0, mask);
    398         }
    399 
    400         // Store the convolution result (16 bytes) and advance the pixel pointers.
    401         _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
    402         out_row += 16;
    403     }
    404 
    405     // When the width of the output is not divisible by 4, We need to save one
    406     // pixel (4 bytes) each time. And also the fourth pixel is always absent.
    407     if (pixel_width & 3) {
    408         accum0 = _mm_setzero_si128();
    409         accum1 = _mm_setzero_si128();
    410         accum2 = _mm_setzero_si128();
    411         for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
    412             coeff16 = _mm_set1_epi16(filter_values[filter_y]);
    413             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    414             src = reinterpret_cast<const __m128i*>(
    415                 &source_data_rows[filter_y][width<<2]);
    416             __m128i src8 = _mm_loadu_si128(src);
    417             // [16] a1 b1 g1 r1 a0 b0 g0 r0
    418             __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    419             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    420             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    421             // [32] a0 b0 g0 r0
    422             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    423             accum0 = _mm_add_epi32(accum0, t);
    424             // [32] a1 b1 g1 r1
    425             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    426             accum1 = _mm_add_epi32(accum1, t);
    427             // [16] a3 b3 g3 r3 a2 b2 g2 r2
    428             src16 = _mm_unpackhi_epi8(src8, zero);
    429             mul_hi = _mm_mulhi_epi16(src16, coeff16);
    430             mul_lo = _mm_mullo_epi16(src16, coeff16);
    431             // [32] a2 b2 g2 r2
    432             t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    433             accum2 = _mm_add_epi32(accum2, t);
    434         }
    435 
    436         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    437         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    438         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    439         // [16] a1 b1 g1 r1 a0 b0 g0 r0
    440         accum0 = _mm_packs_epi32(accum0, accum1);
    441         // [16] a3 b3 g3 r3 a2 b2 g2 r2
    442         accum2 = _mm_packs_epi32(accum2, zero);
    443         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    444         accum0 = _mm_packus_epi16(accum0, accum2);
    445         if (has_alpha) {
    446             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    447             __m128i a = _mm_srli_epi32(accum0, 8);
    448             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    449             __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    450             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    451             a = _mm_srli_epi32(accum0, 16);
    452             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    453             b = _mm_max_epu8(a, b);  // Max of r and g and b.
    454             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    455             b = _mm_slli_epi32(b, 24);
    456             accum0 = _mm_max_epu8(b, accum0);
    457         } else {
    458             __m128i mask = _mm_set1_epi32(0xff000000);
    459             accum0 = _mm_or_si128(accum0, mask);
    460         }
    461 
    462         for (int out_x = width; out_x < pixel_width; out_x++) {
    463             *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
    464             accum0 = _mm_srli_si128(accum0, 4);
    465             out_row += 4;
    466         }
    467     }
    468 }
    469 
    470 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
    471                              int filter_length,
    472                              unsigned char* const* source_data_rows,
    473                              int pixel_width,
    474                              unsigned char* out_row,
    475                              bool has_alpha) {
    476     if (has_alpha) {
    477         convolveVertically_SSE2<true>(filter_values,
    478                                       filter_length,
    479                                       source_data_rows,
    480                                       pixel_width,
    481                                       out_row);
    482     } else {
    483         convolveVertically_SSE2<false>(filter_values,
    484                                        filter_length,
    485                                        source_data_rows,
    486                                        pixel_width,
    487                                        out_row);
    488     }
    489 }
    490 
    491 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
    492     // Padding |paddingCount| of more dummy coefficients after the coefficients
    493     // of last filter to prevent SIMD instructions which load 8 or 16 bytes
    494     // together to access invalid memory areas. We are not trying to align the
    495     // coefficients right now due to the opaqueness of <vector> implementation.
    496     // This has to be done after all |AddFilter| calls.
    497     for (int i = 0; i < 8; ++i) {
    498         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
    499     }
    500 }
    501