Home | History | Annotate | Download | only in opts
      1 
      2 /*
      3  * Copyright 2012 Google Inc.
      4  *
      5  * Use of this source code is governed by a BSD-style license that can be
      6  * found in the LICENSE file.
      7  */
      8 #include "SkBitmapProcState.h"
      9 #include "SkBitmapProcState_filter.h"
     10 #include "SkColorPriv.h"
     11 #include "SkFilterProc.h"
     12 #include "SkPaint.h"
     13 #include "SkShader.h"   // for tilemodes
     14 #include "SkUtilsArm.h"
     15 
     // Declaring the tables extern before defining them gives the const arrays
     // below external linkage (a namespace-scope const object would otherwise
     // have internal linkage in C++).
     16 // Required to ensure the table is part of the final binary.
     17 extern const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[];
     18 extern const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[];
     19 
     // NAME_WRAP(x) token-pastes the "_neon" suffix onto identifier x.
     // NOTE(review): the *_neon procs listed in the tables below are not defined
     // in this file; presumably the two headers included next generate them
     // through NAME_WRAP -- confirm against those headers.
     20 #define   NAME_WRAP(x)  x ## _neon
     21 #include "SkBitmapProcState_filter_neon.h"
     22 #include "SkBitmapProcState_procs.h"
     23 
     // NEON sample procs producing 32-bit output (D32).
     //
     // The table holds five groups of eight entries, one group per source
     // format in the order S32, S16, SI8, S4444, SA8. Inside a group the order
     // is: nofilter_DXDY, nofilter_DX, filter_DXDY, filter_DX, each listed as
     // its opaque variant followed by its alpha variant.
     // NOTE(review): entry positions are significant; presumably the table is
     // indexed with the same scheme as the portable table -- confirm against
     // the code that selects a proc from it.
     24 const SkBitmapProcState::SampleProc32 gSkBitmapProcStateSample32_neon[] = {
     25     S32_opaque_D32_nofilter_DXDY_neon,
     26     S32_alpha_D32_nofilter_DXDY_neon,
     27     S32_opaque_D32_nofilter_DX_neon,
     28     S32_alpha_D32_nofilter_DX_neon,
     29     S32_opaque_D32_filter_DXDY_neon,
     30     S32_alpha_D32_filter_DXDY_neon,
     31     S32_opaque_D32_filter_DX_neon,
     32     S32_alpha_D32_filter_DX_neon,
     33 
     34     S16_opaque_D32_nofilter_DXDY_neon,
     35     S16_alpha_D32_nofilter_DXDY_neon,
     36     S16_opaque_D32_nofilter_DX_neon,
     37     S16_alpha_D32_nofilter_DX_neon,
     38     S16_opaque_D32_filter_DXDY_neon,
     39     S16_alpha_D32_filter_DXDY_neon,
     40     S16_opaque_D32_filter_DX_neon,
     41     S16_alpha_D32_filter_DX_neon,
     42 
     43     SI8_opaque_D32_nofilter_DXDY_neon,
     44     SI8_alpha_D32_nofilter_DXDY_neon,
     45     SI8_opaque_D32_nofilter_DX_neon,
     46     SI8_alpha_D32_nofilter_DX_neon,
     47     SI8_opaque_D32_filter_DXDY_neon,
     48     SI8_alpha_D32_filter_DXDY_neon,
     49     SI8_opaque_D32_filter_DX_neon,
     50     SI8_alpha_D32_filter_DX_neon,
     51 
     52     S4444_opaque_D32_nofilter_DXDY_neon,
     53     S4444_alpha_D32_nofilter_DXDY_neon,
     54     S4444_opaque_D32_nofilter_DX_neon,
     55     S4444_alpha_D32_nofilter_DX_neon,
     56     S4444_opaque_D32_filter_DXDY_neon,
     57     S4444_alpha_D32_filter_DXDY_neon,
     58     S4444_opaque_D32_filter_DX_neon,
     59     S4444_alpha_D32_filter_DX_neon,
     60 
     61     // A8 treats alpha/opaque the same (equally efficient), so the alpha
            // proc fills both the opaque and the alpha slot of each pair.
     62     SA8_alpha_D32_nofilter_DXDY_neon,
     63     SA8_alpha_D32_nofilter_DXDY_neon,
     64     SA8_alpha_D32_nofilter_DX_neon,
     65     SA8_alpha_D32_nofilter_DX_neon,
     66     SA8_alpha_D32_filter_DXDY_neon,
     67     SA8_alpha_D32_filter_DXDY_neon,
     68     SA8_alpha_D32_filter_DX_neon,
     69     SA8_alpha_D32_filter_DX_neon
     70 };
     71 
     // NEON sample procs producing 16-bit output (D16).
     //
     // The table holds five groups of four entries, one group per source
     // format in the order S32, S16, SI8, 4444, A8. Inside a group the order
     // is: nofilter_DXDY, nofilter_DX, filter_DXDY, filter_DX. The 4444 and A8
     // groups are filled with NULL because those conversions are not provided.
     // NOTE(review): entry positions are significant; callers presumably have
     // to cope with a NULL entry -- confirm at the lookup site.
     72 const SkBitmapProcState::SampleProc16 gSkBitmapProcStateSample16_neon[] = {
     73     S32_D16_nofilter_DXDY_neon,
     74     S32_D16_nofilter_DX_neon,
     75     S32_D16_filter_DXDY_neon,
     76     S32_D16_filter_DX_neon,
     77 
     78     S16_D16_nofilter_DXDY_neon,
     79     S16_D16_nofilter_DX_neon,
     80     S16_D16_filter_DXDY_neon,
     81     S16_D16_filter_DX_neon,
     82 
     83     SI8_D16_nofilter_DXDY_neon,
     84     SI8_D16_nofilter_DX_neon,
     85     SI8_D16_filter_DXDY_neon,
     86     SI8_D16_filter_DX_neon,
     87 
     88     // Don't support 4444 -> 565
     89     NULL, NULL, NULL, NULL,
     90     // Don't support A8 -> 565
     91     NULL, NULL, NULL, NULL
     92 };
     93 
     94 ///////////////////////////////////////////////////////////////////////////////
     95 
     96 #include <arm_neon.h>
     97 #include "SkConvolver.h"
     98 
     // Convolves horizontally along a single row of 4-byte pixels.
     //
     // For each of the filter's numValues() output pixels, FilterForValue()
     // supplies the first source pixel the filter touches, the number of taps
     // and the fixed-point coefficients. Taps are consumed four at a time; a
     // trailing group of 1-3 taps is handled with masked coefficients. The
     // accumulated channels are shifted right by kShiftBits, saturated to
     // 8 bits and written to |outRow| as one 4-byte pixel.
     //
     // Every step, including the tail, loads a full 4-coefficient vector and
     // 16 source bytes. Up to 3 coefficients and 3 pixels beyond the range
     // the filter actually uses must therefore be readable.
     //
     // |hasAlpha| is not read by this implementation.
     void convolveHorizontally_neon(const unsigned char* srcData,
                                    const SkConvolutionFilter1D& filter,
                                    unsigned char* outRow,
                                    bool hasAlpha) {
         // vtbl1_u8 index patterns: pattern N copies the two bytes of 16-bit
         // coefficient N into all four lanes. They do not depend on the output
         // pixel, so they are built once instead of once per iteration.
         const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
         const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
         const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
         const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

         // Row |r| keeps the first |r| coefficients of a 4-coefficient load and
         // zeroes the remaining ones. Row 0 is never used.
         static const uint16_t kTailMask[4][4] = {
             {0, 0, 0, 0},
             {0xFFFF, 0, 0, 0},
             {0xFFFF, 0xFFFF, 0, 0},
             {0xFFFF, 0xFFFF, 0xFFFF, 0}
         };

         // Loop over each pixel on this row in the output image.
         const int numValues = filter.numValues();
         for (int outX = 0; outX < numValues; outX++) {
             // Get the filter that determines the current output pixel.
             int filterOffset, filterLength;
             const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
                 filter.FilterForValue(outX, &filterOffset, &filterLength);

             // First source byte the filter affects (4 bytes per pixel).
             const unsigned char* rowToFilter = &srcData[filterOffset * 4];

             // One 32-bit accumulator per channel of the destination pixel.
             int32x4_t accum = vdupq_n_s32(0);

             // Whole groups of four taps.
             const int fullGroups = filterLength >> 2;
             for (int filterX = 0; filterX < fullGroups; filterX++) {
                 // Load 4 coefficients and broadcast each one across a vector.
                 const uint8x8_t coeffBytes = vreinterpret_u8_s16(vld1_s16(filterValues));
                 const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask0));
                 const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask1));
                 const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask2));
                 const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask3));

                 // Load 4 pixels and widen every channel to 16 bits.
                 const uint8x16_t pixels = vld1q_u8(rowToFilter);
                 const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
                 const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

                 // accum += pixel * coefficient, widening to 32 bits.
                 accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
                 accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
                 accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
                 accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);

                 // Advance the pointers
                 rowToFilter += 16;
                 filterValues += 4;
             }

             // Remaining 1-3 taps: load a full vector, then zero the
             // coefficients that lie beyond the end of this filter.
             const int r = filterLength & 3;
             if (r) {
                 uint16x4_t coeffs = vld1_u16(reinterpret_cast<const uint16_t*>(filterValues));
                 coeffs = vand_u16(coeffs, vld1_u16(&kTailMask[r][0]));
                 const uint8x8_t coeffBytes = vreinterpret_u8_u16(coeffs);
                 const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask0));
                 const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask1));
                 const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask2));

                 const uint8x16_t pixels = vld1q_u8(rowToFilter);
                 const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
                 const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

                 // At most three taps remain, so the fourth pixel is ignored.
                 accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
                 accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
                 accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
             }

             // Bring this value back in range. All of the filter scaling factors
             // are in fixed point with kShiftBits bits of fractional part.
             accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

             // Saturate to 16 then to unsigned 8 bits and store one pixel.
             const int16x4_t accum16 = vqmovn_s32(accum);
             const uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
             vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
             outRow += 4;
         }
     }
    196 
    // Finishes four vertically convolved pixels: shifts the 32-bit channel
    // sums right by kShiftBits, saturates them to 16 and then to unsigned
    // 8 bits, and fixes up the alpha byte (byte 3 of every pixel).
    //
    // With |hasAlpha| the alpha byte is raised to at least the largest of the
    // pixel's three colour bytes; without it the alpha byte is forced to 0xFF.
    template<bool hasAlpha>
    static inline uint8x16_t packVerticalResult_neon(int32x4_t accum0, int32x4_t accum1,
                                                     int32x4_t accum2, int32x4_t accum3) {
        // Shift right for fixed point implementation.
        accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
        accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
        accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
        accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);

        // 32 -> 16 bits per channel (signed saturation).
        const int16x8_t lo16 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
        const int16x8_t hi16 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));

        // 16 -> 8 bits per channel (unsigned saturation).
        uint8x16_t accum8 = vcombine_u8(vqmovun_s16(lo16), vqmovun_s16(hi16));

        if (hasAlpha) {
            const uint32x4_t pixels32 = vreinterpretq_u32_u8(accum8);
            // Byte 0 of each pixel becomes max(byte 0, byte 1).
            uint8x16_t maxColor =
                vmaxq_u8(vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 8)), accum8);
            // Byte 0 of each pixel becomes max(byte 0, byte 1, byte 2).
            maxColor = vmaxq_u8(vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 16)), maxColor);
            // Move that maximum into the alpha position; other bytes become 0.
            maxColor = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(maxColor), 24));
            // Make sure the value of alpha channel is always larger than maximum
            // value of color channels.
            accum8 = vmaxq_u8(maxColor, accum8);
        } else {
            // Set value of alpha channels to 0xFF.
            accum8 = vreinterpretq_u8_u32(vorrq_u32(vreinterpretq_u32_u8(accum8),
                                                    vdupq_n_u32(0xFF000000)));
        }
        return accum8;
    }

    // Does vertical convolution to produce one output row. The filter values and
    // length are given in the first two parameters. These are applied to each
    // of the rows pointed to in the |sourceDataRows| array, with each row
    // being |pixelWidth| wide.
    //
    // The output must have room for |pixelWidth * 4| bytes; only that many
    // bytes are written. When |pixelWidth| is not a multiple of 4 the final
    // step still loads 16 bytes from every source row, so each row must be
    // readable up to the next multiple of 4 pixels.
    template<bool hasAlpha>
    void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                                 int filterLength,
                                 unsigned char* const* sourceDataRows,
                                 int pixelWidth,
                                 unsigned char* outRow) {
        // Number of pixels covered by whole groups of four.
        const int width = pixelWidth & ~3;

        // Output four pixels per iteration (16 bytes).
        for (int outX = 0; outX < width; outX += 4) {
            // Accumulated result for each pixel. 32 bits per RGBA channel.
            int32x4_t accum0 = vdupq_n_s32(0);
            int32x4_t accum1 = vdupq_n_s32(0);
            int32x4_t accum2 = vdupq_n_s32(0);
            int32x4_t accum3 = vdupq_n_s32(0);

            // Convolve with one filter coefficient per iteration.
            for (int filterY = 0; filterY < filterLength; filterY++) {
                // Duplicate the filter coefficient 4 times.
                const int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

                // Load four pixels (16 bytes) and widen each channel to 16 bits.
                const uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);
                const int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
                const int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

                accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
                accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
                accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
                accum3 = vmlal_s16(accum3, vget_high_s16(src16_23), coeff16);
            }

            // Store the convolution result (16 bytes) and advance the pixel pointers.
            vst1q_u8(outRow, packVerticalResult_neon<hasAlpha>(accum0, accum1, accum2, accum3));
            outRow += 16;
        }

        // Process the leftovers when the width of the output is not divisible
        // by 4, that is at most 3 pixels.
        const int r = pixelWidth & 3;
        if (r) {
            int32x4_t accum0 = vdupq_n_s32(0);
            int32x4_t accum1 = vdupq_n_s32(0);
            int32x4_t accum2 = vdupq_n_s32(0);

            for (int filterY = 0; filterY < filterLength; ++filterY) {
                const int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

                // A full 16-byte load; only the first three pixels are used.
                const uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);
                const int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
                const int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

                accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
                accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
                accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
            }

            // The fourth slot just repeats the third pixel; it is never stored.
            const uint8x16_t accum8 =
                packVerticalResult_neon<hasAlpha>(accum0, accum1, accum2, accum2);

            // Write exactly |r| pixels.
            switch (r) {
            case 1:
                vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow),
                               vreinterpretq_u32_u8(accum8), 0);
                break;
            case 2:
                vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                         vreinterpret_u32_u8(vget_low_u8(accum8)));
                break;
            case 3:
                vst1_u32(reinterpret_cast<uint32_t*>(outRow),
                         vreinterpret_u32_u8(vget_low_u8(accum8)));
                vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow + 8),
                               vreinterpretq_u32_u8(accum8), 2);
                break;
            }
        }
    }
    356 
    // Runtime entry point for vertical convolution: picks the template
    // instantiation matching |sourceHasAlpha| and forwards every other
    // argument to it unchanged.
    void convolveVertically_neon(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                                 int filterLength,
                                 unsigned char* const* sourceDataRows,
                                 int pixelWidth,
                                 unsigned char* outRow,
                                 bool sourceHasAlpha) {
        if (!sourceHasAlpha) {
            convolveVertically_neon<false>(filterValues, filterLength, sourceDataRows,
                                           pixelWidth, outRow);
            return;
        }
        convolveVertically_neon<true>(filterValues, filterLength, sourceDataRows,
                                      pixelWidth, outRow);
    }
    373 
    // Loads the four pixels (16 bytes) at |src|, widens every channel to
    // 16 bits, multiplies pixel N by |coeffN| and adds the widened products
    // to |accum|. Returns the updated accumulator.
    static inline int32x4_t accumulate4Pixels_neon(int32x4_t accum,
                                                   const unsigned char* src,
                                                   int16x4_t coeff0, int16x4_t coeff1,
                                                   int16x4_t coeff2, int16x4_t coeff3) {
        const uint8x16_t pixels = vld1q_u8(src);
        const int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
        const int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));
        accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
        accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
        accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
        accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);
        return accum;
    }

    // Shifts the four channel sums right by kShiftBits and saturates them to
    // unsigned 8 bits. The packed pixel occupies the low 4 bytes of the result.
    static inline uint8x8_t packPixel_neon(int32x4_t accum) {
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);
        const int16x4_t accum16 = vqmovn_s32(accum);
        return vqmovun_s16(vcombine_s16(accum16, accum16));
    }

    // Convolves horizontally along four rows at once. The rows are given in
    // |srcData| and one output pixel per row is produced for each of the
    // filter's numValues() entries.
    //
    // Each pointer in |outRow| is advanced by 4 bytes per output pixel, so the
    // caller's array elements are modified. As in the single-row routine, every
    // step loads 4 coefficients and 16 source bytes, so up to 3 coefficients
    // and 3 pixels past the filter's range must be readable.
    void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
                                        const SkConvolutionFilter1D& filter,
                                        unsigned char* outRow[4]) {
        // vtbl1_u8 index patterns: pattern N copies the two bytes of 16-bit
        // coefficient N into all four lanes.
        const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
        const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
        const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
        const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

        // |mask| zeroes the extra filter coefficients that a 4-wide load picks
        // up when |filterLength| is not divisible by 4. mask[0] is not used.
        static const uint16_t mask[4][4] = {
            {0, 0, 0, 0},
            {0xFFFF, 0, 0, 0},
            {0xFFFF, 0xFFFF, 0, 0},
            {0xFFFF, 0xFFFF, 0xFFFF, 0}
        };

        const int numValues = filter.numValues();

        // Output one pixel per row each iteration, all channels together.
        for (int outX = 0; outX < numValues; outX++) {
            int filterOffset, filterLength;
            const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
                filter.FilterForValue(outX, &filterOffset, &filterLength);

            // One accumulator per source row.
            int32x4_t accum0 = vdupq_n_s32(0);
            int32x4_t accum1 = vdupq_n_s32(0);
            int32x4_t accum2 = vdupq_n_s32(0);
            int32x4_t accum3 = vdupq_n_s32(0);

            // Byte offset of the first source pixel the filter touches.
            int start = (filterOffset << 2);

            // Whole groups of four coefficients.
            const int fullGroups = filterLength >> 2;
            for (int filterX = 0; filterX < fullGroups; filterX++) {
                const uint8x8_t coeffBytes = vreinterpret_u8_s16(vld1_s16(filterValues));
                const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask0));
                const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask1));
                const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask2));
                const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask3));

                accum0 = accumulate4Pixels_neon(accum0, srcData[0] + start,
                                                coeff0, coeff1, coeff2, coeff3);
                accum1 = accumulate4Pixels_neon(accum1, srcData[1] + start,
                                                coeff0, coeff1, coeff2, coeff3);
                accum2 = accumulate4Pixels_neon(accum2, srcData[2] + start,
                                                coeff0, coeff1, coeff2, coeff3);
                accum3 = accumulate4Pixels_neon(accum3, srcData[3] + start,
                                                coeff0, coeff1, coeff2, coeff3);

                start += 16;
                filterValues += 4;
            }

            // Remaining 1-3 coefficients: mask off the ones past the end of the
            // filter so the surplus pixels contribute zero.
            const int r = filterLength & 3;
            if (r) {
                int16x4_t coeffs = vld1_s16(filterValues);
                coeffs = vand_s16(coeffs, vreinterpret_s16_u16(vld1_u16(&mask[r][0])));
                const uint8x8_t coeffBytes = vreinterpret_u8_s16(coeffs);
                const int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask0));
                const int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask1));
                const int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask2));
                const int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(coeffBytes, coeff_mask3));

                accum0 = accumulate4Pixels_neon(accum0, srcData[0] + start,
                                                coeff0, coeff1, coeff2, coeff3);
                accum1 = accumulate4Pixels_neon(accum1, srcData[1] + start,
                                                coeff0, coeff1, coeff2, coeff3);
                accum2 = accumulate4Pixels_neon(accum2, srcData[2] + start,
                                                coeff0, coeff1, coeff2, coeff3);
                accum3 = accumulate4Pixels_neon(accum3, srcData[3] + start,
                                                coeff0, coeff1, coeff2, coeff3);
            }

            // Scale, saturate and store one pixel to each of the four rows.
            const uint8x8_t res0 = packPixel_neon(accum0);
            const uint8x8_t res1 = packPixel_neon(accum1);
            const uint8x8_t res2 = packPixel_neon(accum2);
            const uint8x8_t res3 = packPixel_neon(accum3);

            vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
            vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
            vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
            vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
            outRow[0] += 4;
            outRow[1] += 4;
            outRow[2] += 4;
            outRow[3] += 4;
        }
    }
    493 
    494 void applySIMDPadding_neon(SkConvolutionFilter1D *filter) {
    495     // Padding |paddingCount| of more dummy coefficients after the coefficients
    496     // of last filter to prevent SIMD instructions which load 8 or 16 bytes
    497     // together to access invalid memory areas. We are not trying to align the
    498     // coefficients right now due to the opaqueness of <vector> implementation.
    499     // This has to be done after all |AddFilter| calls.
    500     for (int i = 0; i < 8; ++i) {
    501         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0));
    502     }
    503 }
    504 
    505 void platformConvolutionProcs_arm_neon(SkConvolutionProcs* procs) {
    506     procs->fExtraHorizontalReads = 3;
    507     procs->fConvolveVertically = &convolveVertically_neon;
    508     procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_neon;
    509     procs->fConvolveHorizontally = &convolveHorizontally_neon;
    510     procs->fApplySIMDPadding = &applySIMDPadding_neon;
    511 }
    512