Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2013 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBitmap.h"
     10 #include "SkBitmapFilter_opts_SSE2.h"
     11 #include "SkBitmapProcState.h"
     12 #include "SkColor.h"
     13 #include "SkColorPriv.h"
     14 #include "SkConvolver.h"
     15 #include "SkShader.h"
     16 #include "SkUnPreMultiply.h"
     17 
     18 #if 0
     19 static inline void print128i(__m128i value) {
     20     int *v = (int*) &value;
     21     printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]);
     22 }
     23 
     24 static inline void print128i_16(__m128i value) {
     25     short *v = (short*) &value;
     26     printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
     27 }
     28 
     29 static inline void print128i_8(__m128i value) {
     30     unsigned char *v = (unsigned char*) &value;
     31     printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
     32            v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
     33            v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15]
     34            );
     35 }
     36 
     37 static inline void print128f(__m128 value) {
     38     float *f = (float*) &value;
     39     printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]);
     40 }
     41 #endif
     42 
     43 // Convolves horizontally along a single row. The row data is given in
     44 // |src_data| and continues for the num_values() of the filter.
     45 void convolveHorizontally_SSE2(const unsigned char* src_data,
     46                                const SkConvolutionFilter1D& filter,
     47                                unsigned char* out_row,
     48                                bool /*has_alpha*/) {
     49     int num_values = filter.numValues();
     50 
     51     int filter_offset, filter_length;
     52     __m128i zero = _mm_setzero_si128();
     53     __m128i mask[4];
     54     // |mask| will be used to decimate all extra filter coefficients that are
     55     // loaded by SIMD when |filter_length| is not divisible by 4.
     56     // mask[0] is not used in following algorithm.
     57     mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
     58     mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
     59     mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
     60 
     61     // Output one pixel each iteration, calculating all channels (RGBA) together.
     62     for (int out_x = 0; out_x < num_values; out_x++) {
     63         const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
     64             filter.FilterForValue(out_x, &filter_offset, &filter_length);
     65 
     66         __m128i accum = _mm_setzero_si128();
     67 
     68         // Compute the first pixel in this row that the filter affects. It will
     69         // touch |filter_length| pixels (4 bytes each) after this.
     70         const __m128i* row_to_filter =
     71             reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
     72 
     73         // We will load and accumulate with four coefficients per iteration.
     74         for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
     75 
     76             // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
     77             __m128i coeff, coeff16;
     78             // [16] xx xx xx xx c3 c2 c1 c0
     79             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
     80             // [16] xx xx xx xx c1 c1 c0 c0
     81             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
     82             // [16] c1 c1 c1 c1 c0 c0 c0 c0
     83             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
     84 
     85             // Load four pixels => unpack the first two pixels to 16 bits =>
     86             // multiply with coefficients => accumulate the convolution result.
     87             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     88             __m128i src8 = _mm_loadu_si128(row_to_filter);
     89             // [16] a1 b1 g1 r1 a0 b0 g0 r0
     90             __m128i src16 = _mm_unpacklo_epi8(src8, zero);
     91             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
     92             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
     93             // [32]  a0*c0 b0*c0 g0*c0 r0*c0
     94             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     95             accum = _mm_add_epi32(accum, t);
     96             // [32]  a1*c1 b1*c1 g1*c1 r1*c1
     97             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     98             accum = _mm_add_epi32(accum, t);
     99 
    100             // Duplicate 3rd and 4th coefficients for all channels =>
    101             // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
    102             // => accumulate the convolution results.
    103             // [16] xx xx xx xx c3 c3 c2 c2
    104             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    105             // [16] c3 c3 c3 c3 c2 c2 c2 c2
    106             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    107             // [16] a3 g3 b3 r3 a2 g2 b2 r2
    108             src16 = _mm_unpackhi_epi8(src8, zero);
    109             mul_hi = _mm_mulhi_epi16(src16, coeff16);
    110             mul_lo = _mm_mullo_epi16(src16, coeff16);
    111             // [32]  a2*c2 b2*c2 g2*c2 r2*c2
    112             t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    113             accum = _mm_add_epi32(accum, t);
    114             // [32]  a3*c3 b3*c3 g3*c3 r3*c3
    115             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    116             accum = _mm_add_epi32(accum, t);
    117 
    118             // Advance the pixel and coefficients pointers.
    119             row_to_filter += 1;
    120             filter_values += 4;
    121         }
    122 
    123         // When |filter_length| is not divisible by 4, we need to decimate some of
    124         // the filter coefficient that was loaded incorrectly to zero; Other than
    125         // that the algorithm is same with above, exceot that the 4th pixel will be
    126         // always absent.
    127         int r = filter_length&3;
    128         if (r) {
    129             // Note: filter_values must be padded to align_up(filter_offset, 8).
    130             __m128i coeff, coeff16;
    131             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    132             // Mask out extra filter taps.
    133             coeff = _mm_and_si128(coeff, mask[r]);
    134             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    135             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    136 
    137             // Note: line buffer must be padded to align_up(filter_offset, 16).
    138             // We resolve this by use C-version for the last horizontal line.
    139             __m128i src8 = _mm_loadu_si128(row_to_filter);
    140             __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    141             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    142             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    143             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    144             accum = _mm_add_epi32(accum, t);
    145             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    146             accum = _mm_add_epi32(accum, t);
    147 
    148             src16 = _mm_unpackhi_epi8(src8, zero);
    149             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    150             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    151             mul_hi = _mm_mulhi_epi16(src16, coeff16);
    152             mul_lo = _mm_mullo_epi16(src16, coeff16);
    153             t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    154             accum = _mm_add_epi32(accum, t);
    155         }
    156 
    157         // Shift right for fixed point implementation.
    158         accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
    159 
    160         // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    161         accum = _mm_packs_epi32(accum, zero);
    162         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    163         accum = _mm_packus_epi16(accum, zero);
    164 
    165         // Store the pixel value of 32 bits.
    166         *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
    167         out_row += 4;
    168     }
    169 }
    170 
    171 // Convolves horizontally along four rows. The row data is given in
    172 // |src_data| and continues for the num_values() of the filter.
    173 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
    174 // refer to that function for detailed comments.
    175 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
    176                                     const SkConvolutionFilter1D& filter,
    177                                     unsigned char* out_row[4]) {
    178     int num_values = filter.numValues();
    179 
    180     int filter_offset, filter_length;
    181     __m128i zero = _mm_setzero_si128();
    182     __m128i mask[4];
    183     // |mask| will be used to decimate all extra filter coefficients that are
    184     // loaded by SIMD when |filter_length| is not divisible by 4.
    185     // mask[0] is not used in following algorithm.
    186     mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    187     mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    188     mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
    189 
    190     // Output one pixel each iteration, calculating all channels (RGBA) together.
    191     for (int out_x = 0; out_x < num_values; out_x++) {
    192         const SkConvolutionFilter1D::ConvolutionFixed* filter_values =
    193             filter.FilterForValue(out_x, &filter_offset, &filter_length);
    194 
    195         // four pixels in a column per iteration.
    196         __m128i accum0 = _mm_setzero_si128();
    197         __m128i accum1 = _mm_setzero_si128();
    198         __m128i accum2 = _mm_setzero_si128();
    199         __m128i accum3 = _mm_setzero_si128();
    200         int start = (filter_offset<<2);
    201         // We will load and accumulate with four coefficients per iteration.
    202         for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
    203             __m128i coeff, coeff16lo, coeff16hi;
    204             // [16] xx xx xx xx c3 c2 c1 c0
    205             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    206             // [16] xx xx xx xx c1 c1 c0 c0
    207             coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    208             // [16] c1 c1 c1 c1 c0 c0 c0 c0
    209             coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
    210             // [16] xx xx xx xx c3 c3 c2 c2
    211             coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    212             // [16] c3 c3 c3 c3 c2 c2 c2 c2
    213             coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
    214 
    215             __m128i src8, src16, mul_hi, mul_lo, t;
    216 
    217 #define ITERATION(src, accum)                                                \
    218             src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
    219             src16 = _mm_unpacklo_epi8(src8, zero);                           \
    220             mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
    221             mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
    222             t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
    223             accum = _mm_add_epi32(accum, t);                                 \
    224             t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
    225             accum = _mm_add_epi32(accum, t);                                 \
    226             src16 = _mm_unpackhi_epi8(src8, zero);                           \
    227             mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
    228             mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
    229             t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
    230             accum = _mm_add_epi32(accum, t);                                 \
    231             t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
    232             accum = _mm_add_epi32(accum, t)
    233 
    234             ITERATION(src_data[0] + start, accum0);
    235             ITERATION(src_data[1] + start, accum1);
    236             ITERATION(src_data[2] + start, accum2);
    237             ITERATION(src_data[3] + start, accum3);
    238 
    239             start += 16;
    240             filter_values += 4;
    241         }
    242 
    243         int r = filter_length & 3;
    244         if (r) {
    245             // Note: filter_values must be padded to align_up(filter_offset, 8);
    246             __m128i coeff;
    247             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    248             // Mask out extra filter taps.
    249             coeff = _mm_and_si128(coeff, mask[r]);
    250 
    251             __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    252             /* c1 c1 c1 c1 c0 c0 c0 c0 */
    253             coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
    254             __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    255             coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
    256 
    257             __m128i src8, src16, mul_hi, mul_lo, t;
    258 
    259             ITERATION(src_data[0] + start, accum0);
    260             ITERATION(src_data[1] + start, accum1);
    261             ITERATION(src_data[2] + start, accum2);
    262             ITERATION(src_data[3] + start, accum3);
    263         }
    264 
    265         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    266         accum0 = _mm_packs_epi32(accum0, zero);
    267         accum0 = _mm_packus_epi16(accum0, zero);
    268         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    269         accum1 = _mm_packs_epi32(accum1, zero);
    270         accum1 = _mm_packus_epi16(accum1, zero);
    271         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    272         accum2 = _mm_packs_epi32(accum2, zero);
    273         accum2 = _mm_packus_epi16(accum2, zero);
    274         accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
    275         accum3 = _mm_packs_epi32(accum3, zero);
    276         accum3 = _mm_packus_epi16(accum3, zero);
    277 
    278         *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
    279         *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
    280         *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
    281         *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
    282 
    283         out_row[0] += 4;
    284         out_row[1] += 4;
    285         out_row[2] += 4;
    286         out_row[3] += 4;
    287     }
    288 }
    289 
    290 // Does vertical convolution to produce one output row. The filter values and
    291 // length are given in the first two parameters. These are applied to each
    292 // of the rows pointed to in the |source_data_rows| array, with each row
    293 // being |pixel_width| wide.
    294 //
    295 // The output must have room for |pixel_width * 4| bytes.
    296 template<bool has_alpha>
    297 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
    298                              int filter_length,
    299                              unsigned char* const* source_data_rows,
    300                              int pixel_width,
    301                              unsigned char* out_row) {
    302     int width = pixel_width & ~3;
    303 
    304     __m128i zero = _mm_setzero_si128();
    305     __m128i accum0, accum1, accum2, accum3, coeff16;
    306     const __m128i* src;
    307     // Output four pixels per iteration (16 bytes).
    308     for (int out_x = 0; out_x < width; out_x += 4) {
    309 
    310         // Accumulated result for each pixel. 32 bits per RGBA channel.
    311         accum0 = _mm_setzero_si128();
    312         accum1 = _mm_setzero_si128();
    313         accum2 = _mm_setzero_si128();
    314         accum3 = _mm_setzero_si128();
    315 
    316         // Convolve with one filter coefficient per iteration.
    317         for (int filter_y = 0; filter_y < filter_length; filter_y++) {
    318 
    319             // Duplicate the filter coefficient 8 times.
    320             // [16] cj cj cj cj cj cj cj cj
    321             coeff16 = _mm_set1_epi16(filter_values[filter_y]);
    322 
    323             // Load four pixels (16 bytes) together.
    324             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    325             src = reinterpret_cast<const __m128i*>(
    326                 &source_data_rows[filter_y][out_x << 2]);
    327             __m128i src8 = _mm_loadu_si128(src);
    328 
    329             // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
    330             // multiply with current coefficient => accumulate the result.
    331             // [16] a1 b1 g1 r1 a0 b0 g0 r0
    332             __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    333             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    334             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    335             // [32] a0 b0 g0 r0
    336             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    337             accum0 = _mm_add_epi32(accum0, t);
    338             // [32] a1 b1 g1 r1
    339             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    340             accum1 = _mm_add_epi32(accum1, t);
    341 
    342             // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
    343             // multiply with current coefficient => accumulate the result.
    344             // [16] a3 b3 g3 r3 a2 b2 g2 r2
    345             src16 = _mm_unpackhi_epi8(src8, zero);
    346             mul_hi = _mm_mulhi_epi16(src16, coeff16);
    347             mul_lo = _mm_mullo_epi16(src16, coeff16);
    348             // [32] a2 b2 g2 r2
    349             t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    350             accum2 = _mm_add_epi32(accum2, t);
    351             // [32] a3 b3 g3 r3
    352             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    353             accum3 = _mm_add_epi32(accum3, t);
    354         }
    355 
    356         // Shift right for fixed point implementation.
    357         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    358         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    359         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    360         accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
    361 
    362         // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    363         // [16] a1 b1 g1 r1 a0 b0 g0 r0
    364         accum0 = _mm_packs_epi32(accum0, accum1);
    365         // [16] a3 b3 g3 r3 a2 b2 g2 r2
    366         accum2 = _mm_packs_epi32(accum2, accum3);
    367 
    368         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    369         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    370         accum0 = _mm_packus_epi16(accum0, accum2);
    371 
    372         if (has_alpha) {
    373             // Compute the max(ri, gi, bi) for each pixel.
    374             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    375             __m128i a = _mm_srli_epi32(accum0, 8);
    376             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    377             __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    378             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    379             a = _mm_srli_epi32(accum0, 16);
    380             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    381             b = _mm_max_epu8(a, b);  // Max of r and g and b.
    382             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    383             b = _mm_slli_epi32(b, 24);
    384 
    385             // Make sure the value of alpha channel is always larger than maximum
    386             // value of color channels.
    387             accum0 = _mm_max_epu8(b, accum0);
    388         } else {
    389             // Set value of alpha channels to 0xFF.
    390             __m128i mask = _mm_set1_epi32(0xff000000);
    391             accum0 = _mm_or_si128(accum0, mask);
    392         }
    393 
    394         // Store the convolution result (16 bytes) and advance the pixel pointers.
    395         _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
    396         out_row += 16;
    397     }
    398 
    399     // When the width of the output is not divisible by 4, We need to save one
    400     // pixel (4 bytes) each time. And also the fourth pixel is always absent.
    401     if (pixel_width & 3) {
    402         accum0 = _mm_setzero_si128();
    403         accum1 = _mm_setzero_si128();
    404         accum2 = _mm_setzero_si128();
    405         for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
    406             coeff16 = _mm_set1_epi16(filter_values[filter_y]);
    407             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    408             src = reinterpret_cast<const __m128i*>(
    409                 &source_data_rows[filter_y][width<<2]);
    410             __m128i src8 = _mm_loadu_si128(src);
    411             // [16] a1 b1 g1 r1 a0 b0 g0 r0
    412             __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    413             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    414             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    415             // [32] a0 b0 g0 r0
    416             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    417             accum0 = _mm_add_epi32(accum0, t);
    418             // [32] a1 b1 g1 r1
    419             t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    420             accum1 = _mm_add_epi32(accum1, t);
    421             // [16] a3 b3 g3 r3 a2 b2 g2 r2
    422             src16 = _mm_unpackhi_epi8(src8, zero);
    423             mul_hi = _mm_mulhi_epi16(src16, coeff16);
    424             mul_lo = _mm_mullo_epi16(src16, coeff16);
    425             // [32] a2 b2 g2 r2
    426             t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    427             accum2 = _mm_add_epi32(accum2, t);
    428         }
    429 
    430         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    431         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    432         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    433         // [16] a1 b1 g1 r1 a0 b0 g0 r0
    434         accum0 = _mm_packs_epi32(accum0, accum1);
    435         // [16] a3 b3 g3 r3 a2 b2 g2 r2
    436         accum2 = _mm_packs_epi32(accum2, zero);
    437         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    438         accum0 = _mm_packus_epi16(accum0, accum2);
    439         if (has_alpha) {
    440             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    441             __m128i a = _mm_srli_epi32(accum0, 8);
    442             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    443             __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    444             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    445             a = _mm_srli_epi32(accum0, 16);
    446             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    447             b = _mm_max_epu8(a, b);  // Max of r and g and b.
    448             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    449             b = _mm_slli_epi32(b, 24);
    450             accum0 = _mm_max_epu8(b, accum0);
    451         } else {
    452             __m128i mask = _mm_set1_epi32(0xff000000);
    453             accum0 = _mm_or_si128(accum0, mask);
    454         }
    455 
    456         for (int out_x = width; out_x < pixel_width; out_x++) {
    457             *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
    458             accum0 = _mm_srli_si128(accum0, 4);
    459             out_row += 4;
    460         }
    461     }
    462 }
    463 
    464 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values,
    465                              int filter_length,
    466                              unsigned char* const* source_data_rows,
    467                              int pixel_width,
    468                              unsigned char* out_row,
    469                              bool has_alpha) {
    470     if (has_alpha) {
    471         convolveVertically_SSE2<true>(filter_values,
    472                                       filter_length,
    473                                       source_data_rows,
    474                                       pixel_width,
    475                                       out_row);
    476     } else {
    477         convolveVertically_SSE2<false>(filter_values,
    478                                        filter_length,
    479                                        source_data_rows,
    480                                        pixel_width,
    481                                        out_row);
    482     }
    483 }
    484 
    485 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) {
    486     // Padding |paddingCount| of more dummy coefficients after the coefficients
    487     // of last filter to prevent SIMD instructions which load 8 or 16 bytes
    488     // together to access invalid memory areas. We are not trying to align the
    489     // coefficients right now due to the opaqueness of <vector> implementation.
    490     // This has to be done after all |AddFilter| calls.
    491     for (int i = 0; i < 8; ++i) {
    492         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
    493     }
    494 }
    495