Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2012 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "SkBitmapProcState.h"
      9 #include "SkBitmapProcState_filter.h"
     10 #include "SkColorPriv.h"
     11 #include "SkFilterProc.h"
     12 #include "SkPaint.h"
     13 #include "SkShader.h"   // for tilemodes
     14 #include "SkUtilsArm.h"
     15 
     16 // Required to ensure the table is part of the final binary.
     17 extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];
     18 
     19 #define   NAME_WRAP(x)  x ## _neon
     20 #include "SkBitmapProcState_filter_neon.h"
     21 #include "SkBitmapProcState_procs.h"
     22 
     23 const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
     24     S32_opaque_D32_nofilter_DXDY_neon,
     25     S32_alpha_D32_nofilter_DXDY_neon,
     26     S32_opaque_D32_nofilter_DX_neon,
     27     S32_alpha_D32_nofilter_DX_neon,
     28     S32_opaque_D32_filter_DXDY_neon,
     29     S32_alpha_D32_filter_DXDY_neon,
     30     S32_opaque_D32_filter_DX_neon,
     31     S32_alpha_D32_filter_DX_neon,
     32 
     33     S16_opaque_D32_nofilter_DXDY_neon,
     34     S16_alpha_D32_nofilter_DXDY_neon,
     35     S16_opaque_D32_nofilter_DX_neon,
     36     S16_alpha_D32_nofilter_DX_neon,
     37     S16_opaque_D32_filter_DXDY_neon,
     38     S16_alpha_D32_filter_DXDY_neon,
     39     S16_opaque_D32_filter_DX_neon,
     40     S16_alpha_D32_filter_DX_neon,
     41 
     42     SI8_opaque_D32_nofilter_DXDY_neon,
     43     SI8_alpha_D32_nofilter_DXDY_neon,
     44     SI8_opaque_D32_nofilter_DX_neon,
     45     SI8_alpha_D32_nofilter_DX_neon,
     46     SI8_opaque_D32_filter_DXDY_neon,
     47     SI8_alpha_D32_filter_DXDY_neon,
     48     SI8_opaque_D32_filter_DX_neon,
     49     SI8_alpha_D32_filter_DX_neon,
     50 
     51     S4444_opaque_D32_nofilter_DXDY_neon,
     52     S4444_alpha_D32_nofilter_DXDY_neon,
     53     S4444_opaque_D32_nofilter_DX_neon,
     54     S4444_alpha_D32_nofilter_DX_neon,
     55     S4444_opaque_D32_filter_DXDY_neon,
     56     S4444_alpha_D32_filter_DXDY_neon,
     57     S4444_opaque_D32_filter_DX_neon,
     58     S4444_alpha_D32_filter_DX_neon,
     59 
     60     // A8 treats alpha/opauqe the same (equally efficient)
     61     SA8_alpha_D32_nofilter_DXDY_neon,
     62     SA8_alpha_D32_nofilter_DXDY_neon,
     63     SA8_alpha_D32_nofilter_DX_neon,
     64     SA8_alpha_D32_nofilter_DX_neon,
     65     SA8_alpha_D32_filter_DXDY_neon,
     66     SA8_alpha_D32_filter_DXDY_neon,
     67     SA8_alpha_D32_filter_DX_neon,
     68     SA8_alpha_D32_filter_DX_neon,
     69 
     70     // todo: possibly specialize on opaqueness
     71     SG8_alpha_D32_nofilter_DXDY_neon,
     72     SG8_alpha_D32_nofilter_DXDY_neon,
     73     SG8_alpha_D32_nofilter_DX_neon,
     74     SG8_alpha_D32_nofilter_DX_neon,
     75     SG8_alpha_D32_filter_DXDY_neon,
     76     SG8_alpha_D32_filter_DXDY_neon,
     77     SG8_alpha_D32_filter_DX_neon,
     78     SG8_alpha_D32_filter_DX_neon,
     79 };
     80 
     81 ///////////////////////////////////////////////////////////////////////////////
     82 
     83 #include <arm_neon.h>
     84 #include "SkConvolver.h"
     85 
     86 // Convolves horizontally along a single row. The row data is given in
     87 // |srcData| and continues for the numValues() of the filter.
     88 void convolveHorizontally_neon(const unsigned char* srcData,
     89                                const SkConvolutionFilter1D& filter,
     90                                unsigned char* outRow,
     91                                bool hasAlpha) {
     92     // Loop over each pixel on this row in the output image.
     93     int numValues = filter.numValues();
     94     for (int outX = 0; outX < numValues; outX++) {
     95         uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
     96         uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
     97         uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
     98         uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
     99         // Get the filter that determines the current output pixel.
    100         int filterOffset, filterLength;
    101         const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
    102             filter.FilterForValue(outX, &filterOffset, &filterLength);
    103 
    104         // Compute the first pixel in this row that the filter affects. It will
    105         // touch |filterLength| pixels (4 bytes each) after this.
    106         const unsigned char* rowToFilter = &srcData[filterOffset * 4];
    107 
    108         // Apply the filter to the row to get the destination pixel in |accum|.
    109         int32x4_t accum = vdupq_n_s32(0);
    110         for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
    111             // Load 4 coefficients
    112             int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
    113             coeffs = vld1_s16(filterValues);
    114             coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
    115             coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
    116             coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
    117             coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
    118 
    119             // Load pixels and calc
    120             uint8x16_t pixels = vld1q_u8(rowToFilter);
    121             int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
    122             int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
    123 
    124             int16x4_t p0_src = vget_low_s16(p01_16);
    125             int16x4_t p1_src = vget_high_s16(p01_16);
    126             int16x4_t p2_src = vget_low_s16(p23_16);
    127             int16x4_t p3_src = vget_high_s16(p23_16);
    128 
    129             int32x4_t p0 = vmull_s16(p0_src, coeff0);
    130             int32x4_t p1 = vmull_s16(p1_src, coeff1);
    131             int32x4_t p2 = vmull_s16(p2_src, coeff2);
    132             int32x4_t p3 = vmull_s16(p3_src, coeff3);
    133 
    134             accum += p0;
    135             accum += p1;
    136             accum += p2;
    137             accum += p3;
    138 
    139             // Advance the pointers
    140             rowToFilter += 16;
    141             filterValues += 4;
    142         }
    143         int r = filterLength & 3;
    144         if (r) {
    145             const uint16_t mask[4][4] = {
    146                 {0, 0, 0, 0},
    147                 {0xFFFF, 0, 0, 0},
    148                 {0xFFFF, 0xFFFF, 0, 0},
    149                 {0xFFFF, 0xFFFF, 0xFFFF, 0}
    150             };
    151             uint16x4_t coeffs;
    152             int16x4_t coeff0, coeff1, coeff2;
    153             coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
    154             coeffs &= vld1_u16(&mask[r][0]);
    155             coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask0));
    156             coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask1));
    157             coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_u16(coeffs), coeff_mask2));
    158 
    159             // Load pixels and calc
    160             uint8x16_t pixels = vld1q_u8(rowToFilter);
    161             int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
    162             int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
    163             int32x4_t p0 = vmull_s16(vget_low_s16(p01_16), coeff0);
    164             int32x4_t p1 = vmull_s16(vget_high_s16(p01_16), coeff1);
    165             int32x4_t p2 = vmull_s16(vget_low_s16(p23_16), coeff2);
    166 
    167             accum += p0;
    168             accum += p1;
    169             accum += p2;
    170         }
    171 
    172         // Bring this value back in range. All of the filter scaling factors
    173         // are in fixed point with kShiftBits bits of fractional part.
    174         accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
    175 
    176         // Pack and store the new pixel.
    177         int16x4_t accum16 = vqmovn_s32(accum);
    178         uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
    179         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
    180         outRow += 4;
    181     }
    182 }
    183 
    184 // Does vertical convolution to produce one output row. The filter values and
    185 // length are given in the first two parameters. These are applied to each
    186 // of the rows pointed to in the |sourceDataRows| array, with each row
    187 // being |pixelWidth| wide.
    188 //
    189 // The output must have room for |pixelWidth * 4| bytes.
    190 template<bool hasAlpha>
    191 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
    192                              int filterLength,
    193                              unsigned char* const* sourceDataRows,
    194                              int pixelWidth,
    195                              unsigned char* outRow) {
    196     int width = pixelWidth & ~3;
    197 
    198     int32x4_t accum0, accum1, accum2, accum3;
    199     int16x4_t coeff16;
    200 
    201     // Output four pixels per iteration (16 bytes).
    202     for (int outX = 0; outX < width; outX += 4) {
    203 
    204         // Accumulated result for each pixel. 32 bits per RGBA channel.
    205         accum0 = accum1 = accum2 = accum3 = vdupq_n_s32(0);
    206 
    207         // Convolve with one filter coefficient per iteration.
    208         for (int filterY = 0; filterY < filterLength; filterY++) {
    209 
    210             // Duplicate the filter coefficient 4 times.
    211             // [16] cj cj cj cj
    212             coeff16 = vdup_n_s16(filterValues[filterY]);
    213 
    214             // Load four pixels (16 bytes) together.
    215             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    216             uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);
    217 
    218             int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
    219             int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));
    220             int16x4_t src16_0 = vget_low_s16(src16_01);
    221             int16x4_t src16_1 = vget_high_s16(src16_01);
    222             int16x4_t src16_2 = vget_low_s16(src16_23);
    223             int16x4_t src16_3 = vget_high_s16(src16_23);
    224 
    225             accum0 += vmull_s16(src16_0, coeff16);
    226             accum1 += vmull_s16(src16_1, coeff16);
    227             accum2 += vmull_s16(src16_2, coeff16);
    228             accum3 += vmull_s16(src16_3, coeff16);
    229         }
    230 
    231         // Shift right for fixed point implementation.
    232         accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
    233         accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
    234         accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
    235         accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);
    236 
    237         // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    238         // [16] a1 b1 g1 r1 a0 b0 g0 r0
    239         int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
    240         // [16] a3 b3 g3 r3 a2 b2 g2 r2
    241         int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));
    242 
    243         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    244         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    245         uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
    246 
    247         if (hasAlpha) {
    248             // Compute the max(ri, gi, bi) for each pixel.
    249             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    250             uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
    251             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    252             uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
    253             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    254             a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
    255             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    256             b = vmaxq_u8(a, b); // Max of r and g and b.
    257             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    258             b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
    259 
    260             // Make sure the value of alpha channel is always larger than maximum
    261             // value of color channels.
    262             accum8 = vmaxq_u8(b, accum8);
    263         } else {
    264             // Set value of alpha channels to 0xFF.
    265             accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));
    266         }
    267 
    268         // Store the convolution result (16 bytes) and advance the pixel pointers.
    269         vst1q_u8(outRow, accum8);
    270         outRow += 16;
    271     }
    272 
    273     // Process the leftovers when the width of the output is not divisible
    274     // by 4, that is at most 3 pixels.
    275     int r = pixelWidth & 3;
    276     if (r) {
    277 
    278         accum0 = accum1 = accum2 = vdupq_n_s32(0);
    279 
    280         for (int filterY = 0; filterY < filterLength; ++filterY) {
    281             coeff16 = vdup_n_s16(filterValues[filterY]);
    282 
    283             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    284             uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);
    285 
    286             int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
    287             int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));
    288             int16x4_t src16_0 = vget_low_s16(src16_01);
    289             int16x4_t src16_1 = vget_high_s16(src16_01);
    290             int16x4_t src16_2 = vget_low_s16(src16_23);
    291 
    292             accum0 += vmull_s16(src16_0, coeff16);
    293             accum1 += vmull_s16(src16_1, coeff16);
    294             accum2 += vmull_s16(src16_2, coeff16);
    295         }
    296 
    297         accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
    298         accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
    299         accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
    300 
    301         int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
    302         int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2));
    303 
    304         uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));
    305 
    306         if (hasAlpha) {
    307             // Compute the max(ri, gi, bi) for each pixel.
    308             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    309             uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8));
    310             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    311             uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g
    312             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    313             a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16));
    314             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    315             b = vmaxq_u8(a, b); // Max of r and g and b.
    316             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    317             b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));
    318 
    319             // Make sure the value of alpha channel is always larger than maximum
    320             // value of color channels.
    321             accum8 = vmaxq_u8(b, accum8);
    322         } else {
    323             // Set value of alpha channels to 0xFF.
    324             accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdupq_n_u32(0xFF000000));
    325         }
    326 
    327         switch(r) {
    328         case 1:
    329             vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpretq_u32_u8(accum8), 0);
    330             break;
    331         case 2:
    332             vst1_u32(reinterpret_cast<uint32_t*>(outRow),
    333                      vreinterpret_u32_u8(vget_low_u8(accum8)));
    334             break;
    335         case 3:
    336             vst1_u32(reinterpret_cast<uint32_t*>(outRow),
    337                      vreinterpret_u32_u8(vget_low_u8(accum8)));
    338             vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpretq_u32_u8(accum8), 2);
    339             break;
    340         }
    341     }
    342 }
    343 
    344 void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
    345                              int filterLength,
    346                              unsigned char* const* sourceDataRows,
    347                              int pixelWidth,
    348                              unsigned char* outRow,
    349                              bool sourceHasAlpha) {
    350     if (sourceHasAlpha) {
    351         convolveVertically_neon<true>(filterValues, filterLength,
    352                                       sourceDataRows, pixelWidth,
    353                                       outRow);
    354     } else {
    355         convolveVertically_neon<false>(filterValues, filterLength,
    356                                        sourceDataRows, pixelWidth,
    357                                        outRow);
    358     }
    359 }
    360 
    361 // Convolves horizontally along four rows. The row data is given in
    362 // |src_data| and continues for the num_values() of the filter.
    363 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
    364 // refer to that function for detailed comments.
    365 void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
    366                                     const SkConvolutionFilter1D& filter,
    367                                     unsigned char* outRow[4],
    368                                     size_t outRowBytes) {
    369 
    370     uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
    371     uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
    372     uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
    373     uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
    374     int num_values = filter.numValues();
    375 
    376     int filterOffset, filterLength;
    377     // |mask| will be used to decimate all extra filter coefficients that are
    378     // loaded by SIMD when |filter_length| is not divisible by 4.
    379     // mask[0] is not used in following algorithm.
    380     const uint16_t mask[4][4] = {
    381         {0, 0, 0, 0},
    382         {0xFFFF, 0, 0, 0},
    383         {0xFFFF, 0xFFFF, 0, 0},
    384         {0xFFFF, 0xFFFF, 0xFFFF, 0}
    385     };
    386 
    387     // Output one pixel each iteration, calculating all channels (RGBA) together.
    388     for (int outX = 0; outX < num_values; outX++) {
    389 
    390         const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
    391         filter.FilterForValue(outX, &filterOffset, &filterLength);
    392 
    393         // four pixels in a column per iteration.
    394         int32x4_t accum0 = vdupq_n_s32(0);
    395         int32x4_t accum1 = vdupq_n_s32(0);
    396         int32x4_t accum2 = vdupq_n_s32(0);
    397         int32x4_t accum3 = vdupq_n_s32(0);
    398 
    399         int start = (filterOffset<<2);
    400 
    401         // We will load and accumulate with four coefficients per iteration.
    402         for (int filter_x = 0; filter_x < (filterLength >> 2); filter_x++) {
    403             int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
    404 
    405             coeffs = vld1_s16(filterValues);
    406             coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
    407             coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
    408             coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
    409             coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
    410 
    411             uint8x16_t pixels;
    412             int16x8_t p01_16, p23_16;
    413             int32x4_t p0, p1, p2, p3;
    414 
    415 
    416 #define ITERATION(src, accum)                                       \
    417     pixels = vld1q_u8(src);                                         \
    418     p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));  \
    419     p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \
    420     p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                   \
    421     p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                  \
    422     p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                   \
    423     p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                  \
    424     accum += p0;                                                    \
    425     accum += p1;                                                    \
    426     accum += p2;                                                    \
    427     accum += p3
    428 
    429             ITERATION(srcData[0] + start, accum0);
    430             ITERATION(srcData[1] + start, accum1);
    431             ITERATION(srcData[2] + start, accum2);
    432             ITERATION(srcData[3] + start, accum3);
    433 
    434             start += 16;
    435             filterValues += 4;
    436         }
    437 
    438         int r = filterLength & 3;
    439         if (r) {
    440             int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3;
    441             coeffs = vld1_s16(filterValues);
    442             coeffs &= vreinterpret_s16_u16(vld1_u16(&mask[r][0]));
    443             coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
    444             coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
    445             coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
    446             coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));
    447 
    448             uint8x16_t pixels;
    449             int16x8_t p01_16, p23_16;
    450             int32x4_t p0, p1, p2, p3;
    451 
    452             ITERATION(srcData[0] + start, accum0);
    453             ITERATION(srcData[1] + start, accum1);
    454             ITERATION(srcData[2] + start, accum2);
    455             ITERATION(srcData[3] + start, accum3);
    456         }
    457 
    458         int16x4_t accum16;
    459         uint8x8_t res0, res1, res2, res3;
    460 
    461 #define PACK_RESULT(accum, res)                                         \
    462         accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);  \
    463         accum16 = vqmovn_s32(accum);                                    \
    464         res = vqmovun_s16(vcombine_s16(accum16, accum16));
    465 
    466         PACK_RESULT(accum0, res0);
    467         PACK_RESULT(accum1, res1);
    468         PACK_RESULT(accum2, res2);
    469         PACK_RESULT(accum3, res3);
    470 
    471         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
    472         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
    473         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
    474         vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
    475         outRow[0] += 4;
    476         outRow[1] += 4;
    477         outRow[2] += 4;
    478         outRow[3] += 4;
    479     }
    480 }
    481 
    482 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
    483     // Padding |paddingCount| of more dummy coefficients after the coefficients
    484     // of last filter to prevent SIMD instructions which load 8 or 16 bytes
    485     // together to access invalid memory areas. We are not trying to align the
    486     // coefficients right now due to the opaqueness of <vector> implementation.
    487     // This has to be done after all |AddFilter| calls.
    488     for (int i = 0; i < 8; ++i) {
    489         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
    490     }
    491 }
    492 
    493 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
    494     procs->fExtraHorizontalReads = 3;
    495     procs->fConvolveVertically = &convolveVertically_neon;
    496     procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
    497     procs->fConvolveHorizontally = &convolveHorizontally_neon;
    498     procs->fApplySIMDPadding = &applySIMDPadding_neon;
    499 }
    500