1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <algorithm> 6 7 #include "skia/ext/convolver.h" 8 #include "skia/ext/convolver_SSE2.h" 9 #include "third_party/skia/include/core/SkTypes.h" 10 11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h 12 13 namespace skia { 14 15 // Convolves horizontally along a single row. The row data is given in 16 // |src_data| and continues for the num_values() of the filter. 17 void ConvolveHorizontally_SSE2(const unsigned char* src_data, 18 const ConvolutionFilter1D& filter, 19 unsigned char* out_row, 20 bool /*has_alpha*/) { 21 int num_values = filter.num_values(); 22 23 int filter_offset, filter_length; 24 __m128i zero = _mm_setzero_si128(); 25 __m128i mask[4]; 26 // |mask| will be used to decimate all extra filter coefficients that are 27 // loaded by SIMD when |filter_length| is not divisible by 4. 28 // mask[0] is not used in following algorithm. 29 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); 30 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); 31 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); 32 33 // Output one pixel each iteration, calculating all channels (RGBA) together. 34 for (int out_x = 0; out_x < num_values; out_x++) { 35 const ConvolutionFilter1D::Fixed* filter_values = 36 filter.FilterForValue(out_x, &filter_offset, &filter_length); 37 38 __m128i accum = _mm_setzero_si128(); 39 40 // Compute the first pixel in this row that the filter affects. It will 41 // touch |filter_length| pixels (4 bytes each) after this. 42 const __m128i* row_to_filter = 43 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); 44 45 // We will load and accumulate with four coefficients per iteration. 46 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { 47 48 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. 49 __m128i coeff, coeff16; 50 // [16] xx xx xx xx c3 c2 c1 c0 51 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); 52 // [16] xx xx xx xx c1 c1 c0 c0 53 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); 54 // [16] c1 c1 c1 c1 c0 c0 c0 c0 55 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); 56 57 // Load four pixels => unpack the first two pixels to 16 bits => 58 // multiply with coefficients => accumulate the convolution result. 59 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 60 __m128i src8 = _mm_loadu_si128(row_to_filter); 61 // [16] a1 b1 g1 r1 a0 b0 g0 r0 62 __m128i src16 = _mm_unpacklo_epi8(src8, zero); 63 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); 64 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); 65 // [32] a0*c0 b0*c0 g0*c0 r0*c0 66 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); 67 accum = _mm_add_epi32(accum, t); 68 // [32] a1*c1 b1*c1 g1*c1 r1*c1 69 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 70 accum = _mm_add_epi32(accum, t); 71 72 // Duplicate 3rd and 4th coefficients for all channels => 73 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients 74 // => accumulate the convolution results. 75 // [16] xx xx xx xx c3 c3 c2 c2 76 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); 77 // [16] c3 c3 c3 c3 c2 c2 c2 c2 78 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); 79 // [16] a3 g3 b3 r3 a2 g2 b2 r2 80 src16 = _mm_unpackhi_epi8(src8, zero); 81 mul_hi = _mm_mulhi_epi16(src16, coeff16); 82 mul_lo = _mm_mullo_epi16(src16, coeff16); 83 // [32] a2*c2 b2*c2 g2*c2 r2*c2 84 t = _mm_unpacklo_epi16(mul_lo, mul_hi); 85 accum = _mm_add_epi32(accum, t); 86 // [32] a3*c3 b3*c3 g3*c3 r3*c3 87 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 88 accum = _mm_add_epi32(accum, t); 89 90 // Advance the pixel and coefficients pointers. 91 row_to_filter += 1; 92 filter_values += 4; 93 } 94 95 // When |filter_length| is not divisible by 4, we need to decimate some of 96 // the filter coefficient that was loaded incorrectly to zero; Other than 97 // that the algorithm is same with above, exceot that the 4th pixel will be 98 // always absent. 99 int r = filter_length&3; 100 if (r) { 101 // Note: filter_values must be padded to align_up(filter_offset, 8). 102 __m128i coeff, coeff16; 103 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); 104 // Mask out extra filter taps. 105 coeff = _mm_and_si128(coeff, mask[r]); 106 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); 107 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); 108 109 // Note: line buffer must be padded to align_up(filter_offset, 16). 110 // We resolve this by use C-version for the last horizontal line. 111 __m128i src8 = _mm_loadu_si128(row_to_filter); 112 __m128i src16 = _mm_unpacklo_epi8(src8, zero); 113 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); 114 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); 115 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); 116 accum = _mm_add_epi32(accum, t); 117 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 118 accum = _mm_add_epi32(accum, t); 119 120 src16 = _mm_unpackhi_epi8(src8, zero); 121 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); 122 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); 123 mul_hi = _mm_mulhi_epi16(src16, coeff16); 124 mul_lo = _mm_mullo_epi16(src16, coeff16); 125 t = _mm_unpacklo_epi16(mul_lo, mul_hi); 126 accum = _mm_add_epi32(accum, t); 127 } 128 129 // Shift right for fixed point implementation. 130 accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits); 131 132 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). 133 accum = _mm_packs_epi32(accum, zero); 134 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). 135 accum = _mm_packus_epi16(accum, zero); 136 137 // Store the pixel value of 32 bits. 138 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); 139 out_row += 4; 140 } 141 } 142 143 // Convolves horizontally along four rows. The row data is given in 144 // |src_data| and continues for the num_values() of the filter. 145 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please 146 // refer to that function for detailed comments. 147 void Convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], 148 const ConvolutionFilter1D& filter, 149 unsigned char* out_row[4]) { 150 int num_values = filter.num_values(); 151 152 int filter_offset, filter_length; 153 __m128i zero = _mm_setzero_si128(); 154 __m128i mask[4]; 155 // |mask| will be used to decimate all extra filter coefficients that are 156 // loaded by SIMD when |filter_length| is not divisible by 4. 157 // mask[0] is not used in following algorithm. 158 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); 159 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); 160 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); 161 162 // Output one pixel each iteration, calculating all channels (RGBA) together. 163 for (int out_x = 0; out_x < num_values; out_x++) { 164 const ConvolutionFilter1D::Fixed* filter_values = 165 filter.FilterForValue(out_x, &filter_offset, &filter_length); 166 167 // four pixels in a column per iteration. 168 __m128i accum0 = _mm_setzero_si128(); 169 __m128i accum1 = _mm_setzero_si128(); 170 __m128i accum2 = _mm_setzero_si128(); 171 __m128i accum3 = _mm_setzero_si128(); 172 int start = (filter_offset<<2); 173 // We will load and accumulate with four coefficients per iteration. 174 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { 175 __m128i coeff, coeff16lo, coeff16hi; 176 // [16] xx xx xx xx c3 c2 c1 c0 177 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); 178 // [16] xx xx xx xx c1 c1 c0 c0 179 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); 180 // [16] c1 c1 c1 c1 c0 c0 c0 c0 181 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); 182 // [16] xx xx xx xx c3 c3 c2 c2 183 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); 184 // [16] c3 c3 c3 c3 c2 c2 c2 c2 185 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); 186 187 __m128i src8, src16, mul_hi, mul_lo, t; 188 189 #define ITERATION(src, accum) \ 190 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ 191 src16 = _mm_unpacklo_epi8(src8, zero); \ 192 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ 193 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ 194 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ 195 accum = _mm_add_epi32(accum, t); \ 196 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ 197 accum = _mm_add_epi32(accum, t); \ 198 src16 = _mm_unpackhi_epi8(src8, zero); \ 199 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ 200 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ 201 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ 202 accum = _mm_add_epi32(accum, t); \ 203 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ 204 accum = _mm_add_epi32(accum, t) 205 206 ITERATION(src_data[0] + start, accum0); 207 ITERATION(src_data[1] + start, accum1); 208 ITERATION(src_data[2] + start, accum2); 209 ITERATION(src_data[3] + start, accum3); 210 211 start += 16; 212 filter_values += 4; 213 } 214 215 int r = filter_length & 3; 216 if (r) { 217 // Note: filter_values must be padded to align_up(filter_offset, 8); 218 __m128i coeff; 219 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); 220 // Mask out extra filter taps. 221 coeff = _mm_and_si128(coeff, mask[r]); 222 223 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); 224 /* c1 c1 c1 c1 c0 c0 c0 c0 */ 225 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); 226 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); 227 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); 228 229 __m128i src8, src16, mul_hi, mul_lo, t; 230 231 ITERATION(src_data[0] + start, accum0); 232 ITERATION(src_data[1] + start, accum1); 233 ITERATION(src_data[2] + start, accum2); 234 ITERATION(src_data[3] + start, accum3); 235 } 236 237 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); 238 accum0 = _mm_packs_epi32(accum0, zero); 239 accum0 = _mm_packus_epi16(accum0, zero); 240 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); 241 accum1 = _mm_packs_epi32(accum1, zero); 242 accum1 = _mm_packus_epi16(accum1, zero); 243 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); 244 accum2 = _mm_packs_epi32(accum2, zero); 245 accum2 = _mm_packus_epi16(accum2, zero); 246 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); 247 accum3 = _mm_packs_epi32(accum3, zero); 248 accum3 = _mm_packus_epi16(accum3, zero); 249 250 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); 251 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); 252 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); 253 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); 254 255 out_row[0] += 4; 256 out_row[1] += 4; 257 out_row[2] += 4; 258 out_row[3] += 4; 259 } 260 } 261 262 // Does vertical convolution to produce one output row. The filter values and 263 // length are given in the first two parameters. These are applied to each 264 // of the rows pointed to in the |source_data_rows| array, with each row 265 // being |pixel_width| wide. 266 // 267 // The output must have room for |pixel_width * 4| bytes. 268 template<bool has_alpha> 269 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values, 270 int filter_length, 271 unsigned char* const* source_data_rows, 272 int pixel_width, 273 unsigned char* out_row) { 274 int width = pixel_width & ~3; 275 276 __m128i zero = _mm_setzero_si128(); 277 __m128i accum0, accum1, accum2, accum3, coeff16; 278 const __m128i* src; 279 // Output four pixels per iteration (16 bytes). 280 for (int out_x = 0; out_x < width; out_x += 4) { 281 282 // Accumulated result for each pixel. 32 bits per RGBA channel. 283 accum0 = _mm_setzero_si128(); 284 accum1 = _mm_setzero_si128(); 285 accum2 = _mm_setzero_si128(); 286 accum3 = _mm_setzero_si128(); 287 288 // Convolve with one filter coefficient per iteration. 289 for (int filter_y = 0; filter_y < filter_length; filter_y++) { 290 291 // Duplicate the filter coefficient 8 times. 292 // [16] cj cj cj cj cj cj cj cj 293 coeff16 = _mm_set1_epi16(filter_values[filter_y]); 294 295 // Load four pixels (16 bytes) together. 296 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 297 src = reinterpret_cast<const __m128i*>( 298 &source_data_rows[filter_y][out_x << 2]); 299 __m128i src8 = _mm_loadu_si128(src); 300 301 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels => 302 // multiply with current coefficient => accumulate the result. 303 // [16] a1 b1 g1 r1 a0 b0 g0 r0 304 __m128i src16 = _mm_unpacklo_epi8(src8, zero); 305 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); 306 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); 307 // [32] a0 b0 g0 r0 308 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); 309 accum0 = _mm_add_epi32(accum0, t); 310 // [32] a1 b1 g1 r1 311 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 312 accum1 = _mm_add_epi32(accum1, t); 313 314 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels => 315 // multiply with current coefficient => accumulate the result. 316 // [16] a3 b3 g3 r3 a2 b2 g2 r2 317 src16 = _mm_unpackhi_epi8(src8, zero); 318 mul_hi = _mm_mulhi_epi16(src16, coeff16); 319 mul_lo = _mm_mullo_epi16(src16, coeff16); 320 // [32] a2 b2 g2 r2 321 t = _mm_unpacklo_epi16(mul_lo, mul_hi); 322 accum2 = _mm_add_epi32(accum2, t); 323 // [32] a3 b3 g3 r3 324 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 325 accum3 = _mm_add_epi32(accum3, t); 326 } 327 328 // Shift right for fixed point implementation. 329 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); 330 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); 331 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); 332 accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits); 333 334 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). 335 // [16] a1 b1 g1 r1 a0 b0 g0 r0 336 accum0 = _mm_packs_epi32(accum0, accum1); 337 // [16] a3 b3 g3 r3 a2 b2 g2 r2 338 accum2 = _mm_packs_epi32(accum2, accum3); 339 340 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). 341 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 342 accum0 = _mm_packus_epi16(accum0, accum2); 343 344 if (has_alpha) { 345 // Compute the max(ri, gi, bi) for each pixel. 346 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 347 __m128i a = _mm_srli_epi32(accum0, 8); 348 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 349 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. 350 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 351 a = _mm_srli_epi32(accum0, 16); 352 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 353 b = _mm_max_epu8(a, b); // Max of r and g and b. 354 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 355 b = _mm_slli_epi32(b, 24); 356 357 // Make sure the value of alpha channel is always larger than maximum 358 // value of color channels. 359 accum0 = _mm_max_epu8(b, accum0); 360 } else { 361 // Set value of alpha channels to 0xFF. 362 __m128i mask = _mm_set1_epi32(0xff000000); 363 accum0 = _mm_or_si128(accum0, mask); 364 } 365 366 // Store the convolution result (16 bytes) and advance the pixel pointers. 367 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); 368 out_row += 16; 369 } 370 371 // When the width of the output is not divisible by 4, We need to save one 372 // pixel (4 bytes) each time. And also the fourth pixel is always absent. 373 if (pixel_width & 3) { 374 accum0 = _mm_setzero_si128(); 375 accum1 = _mm_setzero_si128(); 376 accum2 = _mm_setzero_si128(); 377 for (int filter_y = 0; filter_y < filter_length; ++filter_y) { 378 coeff16 = _mm_set1_epi16(filter_values[filter_y]); 379 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 380 src = reinterpret_cast<const __m128i*>( 381 &source_data_rows[filter_y][width<<2]); 382 __m128i src8 = _mm_loadu_si128(src); 383 // [16] a1 b1 g1 r1 a0 b0 g0 r0 384 __m128i src16 = _mm_unpacklo_epi8(src8, zero); 385 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); 386 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); 387 // [32] a0 b0 g0 r0 388 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); 389 accum0 = _mm_add_epi32(accum0, t); 390 // [32] a1 b1 g1 r1 391 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 392 accum1 = _mm_add_epi32(accum1, t); 393 // [16] a3 b3 g3 r3 a2 b2 g2 r2 394 src16 = _mm_unpackhi_epi8(src8, zero); 395 mul_hi = _mm_mulhi_epi16(src16, coeff16); 396 mul_lo = _mm_mullo_epi16(src16, coeff16); 397 // [32] a2 b2 g2 r2 398 t = _mm_unpacklo_epi16(mul_lo, mul_hi); 399 accum2 = _mm_add_epi32(accum2, t); 400 } 401 402 accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits); 403 accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits); 404 accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits); 405 // [16] a1 b1 g1 r1 a0 b0 g0 r0 406 accum0 = _mm_packs_epi32(accum0, accum1); 407 // [16] a3 b3 g3 r3 a2 b2 g2 r2 408 accum2 = _mm_packs_epi32(accum2, zero); 409 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 410 accum0 = _mm_packus_epi16(accum0, accum2); 411 if (has_alpha) { 412 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 413 __m128i a = _mm_srli_epi32(accum0, 8); 414 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 415 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. 416 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 417 a = _mm_srli_epi32(accum0, 16); 418 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 419 b = _mm_max_epu8(a, b); // Max of r and g and b. 420 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 421 b = _mm_slli_epi32(b, 24); 422 accum0 = _mm_max_epu8(b, accum0); 423 } else { 424 __m128i mask = _mm_set1_epi32(0xff000000); 425 accum0 = _mm_or_si128(accum0, mask); 426 } 427 428 for (int out_x = width; out_x < pixel_width; out_x++) { 429 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); 430 accum0 = _mm_srli_si128(accum0, 4); 431 out_row += 4; 432 } 433 } 434 } 435 436 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values, 437 int filter_length, 438 unsigned char* const* source_data_rows, 439 int pixel_width, 440 unsigned char* out_row, 441 bool has_alpha) { 442 if (has_alpha) { 443 ConvolveVertically_SSE2<true>(filter_values, 444 filter_length, 445 source_data_rows, 446 pixel_width, 447 out_row); 448 } else { 449 ConvolveVertically_SSE2<false>(filter_values, 450 filter_length, 451 source_data_rows, 452 pixel_width, 453 out_row); 454 } 455 } 456 457 } // namespace skia 458