Home | History | Annotate | Download | only in ext
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <algorithm>
      6 
      7 #include "skia/ext/convolver.h"
      8 #include "skia/ext/convolver_SSE2.h"
      9 #include "third_party/skia/include/core/SkTypes.h"
     10 
     11 #include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h
     12 
     13 namespace skia {
     14 
     15 // Convolves horizontally along a single row. The row data is given in
     16 // |src_data| and continues for the num_values() of the filter.
     17 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
     18                                const ConvolutionFilter1D& filter,
     19                                unsigned char* out_row,
     20                                bool /*has_alpha*/) {
     21   int num_values = filter.num_values();
     22 
     23   int filter_offset, filter_length;
     24   __m128i zero = _mm_setzero_si128();
     25   __m128i mask[4];
     26   // |mask| will be used to decimate all extra filter coefficients that are
     27   // loaded by SIMD when |filter_length| is not divisible by 4.
     28   // mask[0] is not used in following algorithm.
     29   mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
     30   mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
     31   mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
     32 
     33   // Output one pixel each iteration, calculating all channels (RGBA) together.
     34   for (int out_x = 0; out_x < num_values; out_x++) {
     35     const ConvolutionFilter1D::Fixed* filter_values =
     36         filter.FilterForValue(out_x, &filter_offset, &filter_length);
     37 
     38     __m128i accum = _mm_setzero_si128();
     39 
     40     // Compute the first pixel in this row that the filter affects. It will
     41     // touch |filter_length| pixels (4 bytes each) after this.
     42     const __m128i* row_to_filter =
     43         reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);
     44 
     45     // We will load and accumulate with four coefficients per iteration.
     46     for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
     47 
     48       // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
     49       __m128i coeff, coeff16;
     50       // [16] xx xx xx xx c3 c2 c1 c0
     51       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
     52       // [16] xx xx xx xx c1 c1 c0 c0
     53       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
     54       // [16] c1 c1 c1 c1 c0 c0 c0 c0
     55       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
     56 
     57       // Load four pixels => unpack the first two pixels to 16 bits =>
     58       // multiply with coefficients => accumulate the convolution result.
     59       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     60       __m128i src8 = _mm_loadu_si128(row_to_filter);
     61       // [16] a1 b1 g1 r1 a0 b0 g0 r0
     62       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
     63       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
     64       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
     65       // [32]  a0*c0 b0*c0 g0*c0 r0*c0
     66       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     67       accum = _mm_add_epi32(accum, t);
     68       // [32]  a1*c1 b1*c1 g1*c1 r1*c1
     69       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     70       accum = _mm_add_epi32(accum, t);
     71 
     72       // Duplicate 3rd and 4th coefficients for all channels =>
     73       // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
     74       // => accumulate the convolution results.
     75       // [16] xx xx xx xx c3 c3 c2 c2
     76       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
     77       // [16] c3 c3 c3 c3 c2 c2 c2 c2
     78       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
     79       // [16] a3 g3 b3 r3 a2 g2 b2 r2
     80       src16 = _mm_unpackhi_epi8(src8, zero);
     81       mul_hi = _mm_mulhi_epi16(src16, coeff16);
     82       mul_lo = _mm_mullo_epi16(src16, coeff16);
     83       // [32]  a2*c2 b2*c2 g2*c2 r2*c2
     84       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     85       accum = _mm_add_epi32(accum, t);
     86       // [32]  a3*c3 b3*c3 g3*c3 r3*c3
     87       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     88       accum = _mm_add_epi32(accum, t);
     89 
     90       // Advance the pixel and coefficients pointers.
     91       row_to_filter += 1;
     92       filter_values += 4;
     93     }
     94 
     95     // When |filter_length| is not divisible by 4, we need to decimate some of
     96     // the filter coefficient that was loaded incorrectly to zero; Other than
     97     // that the algorithm is same with above, exceot that the 4th pixel will be
     98     // always absent.
     99     int r = filter_length&3;
    100     if (r) {
    101       // Note: filter_values must be padded to align_up(filter_offset, 8).
    102       __m128i coeff, coeff16;
    103       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    104       // Mask out extra filter taps.
    105       coeff = _mm_and_si128(coeff, mask[r]);
    106       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    107       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    108 
    109       // Note: line buffer must be padded to align_up(filter_offset, 16).
    110       // We resolve this by use C-version for the last horizontal line.
    111       __m128i src8 = _mm_loadu_si128(row_to_filter);
    112       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    113       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    114       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    115       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    116       accum = _mm_add_epi32(accum, t);
    117       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    118       accum = _mm_add_epi32(accum, t);
    119 
    120       src16 = _mm_unpackhi_epi8(src8, zero);
    121       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    122       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
    123       mul_hi = _mm_mulhi_epi16(src16, coeff16);
    124       mul_lo = _mm_mullo_epi16(src16, coeff16);
    125       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    126       accum = _mm_add_epi32(accum, t);
    127     }
    128 
    129     // Shift right for fixed point implementation.
    130     accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);
    131 
    132     // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    133     accum = _mm_packs_epi32(accum, zero);
    134     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    135     accum = _mm_packus_epi16(accum, zero);
    136 
    137     // Store the pixel value of 32 bits.
    138     *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
    139     out_row += 4;
    140   }
    141 }
    142 
    143 // Convolves horizontally along four rows. The row data is given in
    144 // |src_data| and continues for the num_values() of the filter.
    145 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
    146 // refer to that function for detailed comments.
    147 void Convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
    148                                     const ConvolutionFilter1D& filter,
    149                                     unsigned char* out_row[4]) {
    150   int num_values = filter.num_values();
    151 
    152   int filter_offset, filter_length;
    153   __m128i zero = _mm_setzero_si128();
    154   __m128i mask[4];
    155   // |mask| will be used to decimate all extra filter coefficients that are
    156   // loaded by SIMD when |filter_length| is not divisible by 4.
    157   // mask[0] is not used in following algorithm.
    158   mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
    159   mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
    160   mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
    161 
    162   // Output one pixel each iteration, calculating all channels (RGBA) together.
    163   for (int out_x = 0; out_x < num_values; out_x++) {
    164     const ConvolutionFilter1D::Fixed* filter_values =
    165         filter.FilterForValue(out_x, &filter_offset, &filter_length);
    166 
    167     // four pixels in a column per iteration.
    168     __m128i accum0 = _mm_setzero_si128();
    169     __m128i accum1 = _mm_setzero_si128();
    170     __m128i accum2 = _mm_setzero_si128();
    171     __m128i accum3 = _mm_setzero_si128();
    172     int start = (filter_offset<<2);
    173     // We will load and accumulate with four coefficients per iteration.
    174     for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
    175       __m128i coeff, coeff16lo, coeff16hi;
    176       // [16] xx xx xx xx c3 c2 c1 c0
    177       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    178       // [16] xx xx xx xx c1 c1 c0 c0
    179       coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    180       // [16] c1 c1 c1 c1 c0 c0 c0 c0
    181       coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
    182       // [16] xx xx xx xx c3 c3 c2 c2
    183       coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    184       // [16] c3 c3 c3 c3 c2 c2 c2 c2
    185       coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
    186 
    187       __m128i src8, src16, mul_hi, mul_lo, t;
    188 
    189 #define ITERATION(src, accum)                                          \
    190       src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \
    191       src16 = _mm_unpacklo_epi8(src8, zero);                           \
    192       mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \
    193       mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \
    194       t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
    195       accum = _mm_add_epi32(accum, t);                                 \
    196       t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
    197       accum = _mm_add_epi32(accum, t);                                 \
    198       src16 = _mm_unpackhi_epi8(src8, zero);                           \
    199       mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \
    200       mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \
    201       t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \
    202       accum = _mm_add_epi32(accum, t);                                 \
    203       t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \
    204       accum = _mm_add_epi32(accum, t)
    205 
    206       ITERATION(src_data[0] + start, accum0);
    207       ITERATION(src_data[1] + start, accum1);
    208       ITERATION(src_data[2] + start, accum2);
    209       ITERATION(src_data[3] + start, accum3);
    210 
    211       start += 16;
    212       filter_values += 4;
    213     }
    214 
    215     int r = filter_length & 3;
    216     if (r) {
    217       // Note: filter_values must be padded to align_up(filter_offset, 8);
    218       __m128i coeff;
    219       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
    220       // Mask out extra filter taps.
    221       coeff = _mm_and_si128(coeff, mask[r]);
    222 
    223       __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
    224       /* c1 c1 c1 c1 c0 c0 c0 c0 */
    225       coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
    226       __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
    227       coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);
    228 
    229       __m128i src8, src16, mul_hi, mul_lo, t;
    230 
    231       ITERATION(src_data[0] + start, accum0);
    232       ITERATION(src_data[1] + start, accum1);
    233       ITERATION(src_data[2] + start, accum2);
    234       ITERATION(src_data[3] + start, accum3);
    235     }
    236 
    237     accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    238     accum0 = _mm_packs_epi32(accum0, zero);
    239     accum0 = _mm_packus_epi16(accum0, zero);
    240     accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    241     accum1 = _mm_packs_epi32(accum1, zero);
    242     accum1 = _mm_packus_epi16(accum1, zero);
    243     accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    244     accum2 = _mm_packs_epi32(accum2, zero);
    245     accum2 = _mm_packus_epi16(accum2, zero);
    246     accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
    247     accum3 = _mm_packs_epi32(accum3, zero);
    248     accum3 = _mm_packus_epi16(accum3, zero);
    249 
    250     *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
    251     *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
    252     *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
    253     *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);
    254 
    255     out_row[0] += 4;
    256     out_row[1] += 4;
    257     out_row[2] += 4;
    258     out_row[3] += 4;
    259   }
    260 }
    261 
    262 // Does vertical convolution to produce one output row. The filter values and
    263 // length are given in the first two parameters. These are applied to each
    264 // of the rows pointed to in the |source_data_rows| array, with each row
    265 // being |pixel_width| wide.
    266 //
    267 // The output must have room for |pixel_width * 4| bytes.
    268 template<bool has_alpha>
    269 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
    270                              int filter_length,
    271                              unsigned char* const* source_data_rows,
    272                              int pixel_width,
    273                              unsigned char* out_row) {
    274   int width = pixel_width & ~3;
    275 
    276   __m128i zero = _mm_setzero_si128();
    277   __m128i accum0, accum1, accum2, accum3, coeff16;
    278   const __m128i* src;
    279   // Output four pixels per iteration (16 bytes).
    280   for (int out_x = 0; out_x < width; out_x += 4) {
    281 
    282     // Accumulated result for each pixel. 32 bits per RGBA channel.
    283     accum0 = _mm_setzero_si128();
    284     accum1 = _mm_setzero_si128();
    285     accum2 = _mm_setzero_si128();
    286     accum3 = _mm_setzero_si128();
    287 
    288     // Convolve with one filter coefficient per iteration.
    289     for (int filter_y = 0; filter_y < filter_length; filter_y++) {
    290 
    291       // Duplicate the filter coefficient 8 times.
    292       // [16] cj cj cj cj cj cj cj cj
    293       coeff16 = _mm_set1_epi16(filter_values[filter_y]);
    294 
    295       // Load four pixels (16 bytes) together.
    296       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    297       src = reinterpret_cast<const __m128i*>(
    298           &source_data_rows[filter_y][out_x << 2]);
    299       __m128i src8 = _mm_loadu_si128(src);
    300 
    301       // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
    302       // multiply with current coefficient => accumulate the result.
    303       // [16] a1 b1 g1 r1 a0 b0 g0 r0
    304       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    305       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    306       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    307       // [32] a0 b0 g0 r0
    308       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    309       accum0 = _mm_add_epi32(accum0, t);
    310       // [32] a1 b1 g1 r1
    311       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    312       accum1 = _mm_add_epi32(accum1, t);
    313 
    314       // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
    315       // multiply with current coefficient => accumulate the result.
    316       // [16] a3 b3 g3 r3 a2 b2 g2 r2
    317       src16 = _mm_unpackhi_epi8(src8, zero);
    318       mul_hi = _mm_mulhi_epi16(src16, coeff16);
    319       mul_lo = _mm_mullo_epi16(src16, coeff16);
    320       // [32] a2 b2 g2 r2
    321       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    322       accum2 = _mm_add_epi32(accum2, t);
    323       // [32] a3 b3 g3 r3
    324       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    325       accum3 = _mm_add_epi32(accum3, t);
    326     }
    327 
    328     // Shift right for fixed point implementation.
    329     accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    330     accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    331     accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    332     accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
    333 
    334     // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    335     // [16] a1 b1 g1 r1 a0 b0 g0 r0
    336     accum0 = _mm_packs_epi32(accum0, accum1);
    337     // [16] a3 b3 g3 r3 a2 b2 g2 r2
    338     accum2 = _mm_packs_epi32(accum2, accum3);
    339 
    340     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    341     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    342     accum0 = _mm_packus_epi16(accum0, accum2);
    343 
    344     if (has_alpha) {
    345       // Compute the max(ri, gi, bi) for each pixel.
    346       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    347       __m128i a = _mm_srli_epi32(accum0, 8);
    348       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    349       __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    350       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    351       a = _mm_srli_epi32(accum0, 16);
    352       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    353       b = _mm_max_epu8(a, b);  // Max of r and g and b.
    354       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    355       b = _mm_slli_epi32(b, 24);
    356 
    357       // Make sure the value of alpha channel is always larger than maximum
    358       // value of color channels.
    359       accum0 = _mm_max_epu8(b, accum0);
    360     } else {
    361       // Set value of alpha channels to 0xFF.
    362       __m128i mask = _mm_set1_epi32(0xff000000);
    363       accum0 = _mm_or_si128(accum0, mask);
    364     }
    365 
    366     // Store the convolution result (16 bytes) and advance the pixel pointers.
    367     _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
    368     out_row += 16;
    369   }
    370 
    371   // When the width of the output is not divisible by 4, We need to save one
    372   // pixel (4 bytes) each time. And also the fourth pixel is always absent.
    373   if (pixel_width & 3) {
    374     accum0 = _mm_setzero_si128();
    375     accum1 = _mm_setzero_si128();
    376     accum2 = _mm_setzero_si128();
    377     for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
    378       coeff16 = _mm_set1_epi16(filter_values[filter_y]);
    379       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    380       src = reinterpret_cast<const __m128i*>(
    381           &source_data_rows[filter_y][width<<2]);
    382       __m128i src8 = _mm_loadu_si128(src);
    383       // [16] a1 b1 g1 r1 a0 b0 g0 r0
    384       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    385       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    386       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    387       // [32] a0 b0 g0 r0
    388       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    389       accum0 = _mm_add_epi32(accum0, t);
    390       // [32] a1 b1 g1 r1
    391       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    392       accum1 = _mm_add_epi32(accum1, t);
    393       // [16] a3 b3 g3 r3 a2 b2 g2 r2
    394       src16 = _mm_unpackhi_epi8(src8, zero);
    395       mul_hi = _mm_mulhi_epi16(src16, coeff16);
    396       mul_lo = _mm_mullo_epi16(src16, coeff16);
    397       // [32] a2 b2 g2 r2
    398       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    399       accum2 = _mm_add_epi32(accum2, t);
    400     }
    401 
    402     accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    403     accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    404     accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    405     // [16] a1 b1 g1 r1 a0 b0 g0 r0
    406     accum0 = _mm_packs_epi32(accum0, accum1);
    407     // [16] a3 b3 g3 r3 a2 b2 g2 r2
    408     accum2 = _mm_packs_epi32(accum2, zero);
    409     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    410     accum0 = _mm_packus_epi16(accum0, accum2);
    411     if (has_alpha) {
    412       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    413       __m128i a = _mm_srli_epi32(accum0, 8);
    414       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    415       __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    416       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    417       a = _mm_srli_epi32(accum0, 16);
    418       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    419       b = _mm_max_epu8(a, b);  // Max of r and g and b.
    420       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    421       b = _mm_slli_epi32(b, 24);
    422       accum0 = _mm_max_epu8(b, accum0);
    423     } else {
    424       __m128i mask = _mm_set1_epi32(0xff000000);
    425       accum0 = _mm_or_si128(accum0, mask);
    426     }
    427 
    428     for (int out_x = width; out_x < pixel_width; out_x++) {
    429       *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
    430       accum0 = _mm_srli_si128(accum0, 4);
    431       out_row += 4;
    432     }
    433   }
    434 }
    435 
    436 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
    437                              int filter_length,
    438                              unsigned char* const* source_data_rows,
    439                              int pixel_width,
    440                              unsigned char* out_row,
    441                              bool has_alpha) {
    442   if (has_alpha) {
    443     ConvolveVertically_SSE2<true>(filter_values,
    444                                   filter_length,
    445                                   source_data_rows,
    446                                   pixel_width,
    447                                   out_row);
    448   } else {
    449     ConvolveVertically_SSE2<false>(filter_values,
    450                                    filter_length,
    451                                    source_data_rows,
    452                                    pixel_width,
    453                                    out_row);
    454   }
    455 }
    456 
    457 }  // namespace skia
    458