Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2013 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 
      9 #include "SkBitmap.h"
     10 #include "SkColorPriv.h"
     11 #include "SkBlurImage_opts.h"
     12 #include "SkRect.h"
     13 
     14 #include <arm_neon.h>
     15 
     16 namespace {
     17 
     18 enum BlurDirection {
     19     kX, kY
     20 };
     21 
     22 /**
     23  * Helper function to load 2 pixels from diffent rows to a 8x8 NEON register
     24  * and also pre-load pixels for future read
     25  */
     26 template<BlurDirection srcDirection>
     27 inline uint8x8_t load_2_pixels(const SkPMColor* src, int srcStride) {
     28     if (srcDirection == kX) {
     29         uint32x2_t temp = vdup_n_u32(0);
     30         // 10% faster by adding these 2 prefetches
     31         SK_PREFETCH(src + 16);
     32         SK_PREFETCH(src + srcStride + 16);
     33         return vreinterpret_u8_u32(vld1_lane_u32(src + srcStride, vld1_lane_u32(src, temp, 0), 1));
     34      } else {
     35          return vld1_u8((uint8_t*)src);
     36      }
     37 }
     38 
     39 /**
     40  * Helper function to store the low 8-bits from a 16x8 NEON register to 2 rows
     41  */
     42 template<BlurDirection dstDirection>
     43 inline void store_2_pixels(uint16x8_t result16x8, SkPMColor* dst, int dstStride) {
     44     if (dstDirection == kX) {
     45         uint32x2_t temp = vreinterpret_u32_u8(vmovn_u16(result16x8));
     46         vst1_lane_u32(dst, temp, 0);
     47         vst1_lane_u32(dst + dstStride, temp, 1);
     48     } else {
     49         uint8x8_t temp = vmovn_u16(result16x8);
     50         vst1_u8((uint8_t*)dst, temp);
     51     }
     52 }
     53 
     54 /**
     55  * fast path for kernel size less than 128
     56  */
     57 template<BlurDirection srcDirection, BlurDirection dstDirection>
     58 void SkDoubleRowBoxBlur_NEON(const SkPMColor** src, int srcStride, SkPMColor** dst, int kernelSize,
     59                         int leftOffset, int rightOffset, int width, int* height)
     60 {
     61     const int rightBorder = SkMin32(rightOffset + 1, width);
     62     const int srcStrideX = srcDirection == kX ? 1 : srcStride;
     63     const int dstStrideX = dstDirection == kX ? 1 : *height;
     64     const int srcStrideY = srcDirection == kX ? srcStride : 1;
     65     const int dstStrideY = dstDirection == kX ? width : 1;
     66     const uint16x8_t scale = vdupq_n_u16((1 << 15) / kernelSize);
     67 
     68     for (; *height >= 2; *height -= 2) {
     69         uint16x8_t sum = vdupq_n_u16(0);
     70         const SkPMColor* p = *src;
     71         for (int i = 0; i < rightBorder; i++) {
     72             sum = vaddw_u8(sum,
     73                 load_2_pixels<srcDirection>(p, srcStride));
     74             p += srcStrideX;
     75         }
     76 
     77         const SkPMColor* sptr = *src;
     78         SkPMColor* dptr = *dst;
     79         for (int x = 0; x < width; x++) {
     80             // val = (sum * scale * 2 + 0x8000) >> 16
     81             uint16x8_t resultPixels = vreinterpretq_u16_s16(vqrdmulhq_s16(
     82                 vreinterpretq_s16_u16(sum), vreinterpretq_s16_u16(scale)));
     83             store_2_pixels<dstDirection>(resultPixels, dptr, width);
     84 
     85             if (x >= leftOffset) {
     86                 sum = vsubw_u8(sum,
     87                     load_2_pixels<srcDirection>(sptr - leftOffset * srcStrideX, srcStride));
     88             }
     89             if (x + rightOffset + 1 < width) {
     90                 sum = vaddw_u8(sum,
     91                     load_2_pixels<srcDirection>(sptr + (rightOffset + 1) * srcStrideX, srcStride));
     92             }
     93             sptr += srcStrideX;
     94             dptr += dstStrideX;
     95         }
     96         *src += srcStrideY * 2;
     97         *dst += dstStrideY * 2;
     98     }
     99 }
    100 
    101 
    102 /**
    103  * Helper function to spread the components of a 32-bit integer into the
    104  * lower 8 bits of each 16-bit element of a NEON register.
    105  */
    106 
    107 static inline uint16x4_t expand(uint32_t a) {
    108     // ( ARGB ) -> ( ARGB ARGB ) -> ( A R G B A R G B )
    109     uint8x8_t v8 = vreinterpret_u8_u32(vdup_n_u32(a));
    110     // ( A R G B A R G B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B )
    111     return vget_low_u16(vmovl_u8(v8));
    112 }
    113 
    114 template<BlurDirection srcDirection, BlurDirection dstDirection>
    115 void SkBoxBlur_NEON(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
    116                     int leftOffset, int rightOffset, int width, int height)
    117 {
    118     const int rightBorder = SkMin32(rightOffset + 1, width);
    119     const int srcStrideX = srcDirection == kX ? 1 : srcStride;
    120     const int dstStrideX = dstDirection == kX ? 1 : height;
    121     const int srcStrideY = srcDirection == kX ? srcStride : 1;
    122     const int dstStrideY = dstDirection == kX ? width : 1;
    123     const uint32x4_t scale = vdupq_n_u32((1 << 24) / kernelSize);
    124     const uint32x4_t half = vdupq_n_u32(1 << 23);
    125 
    126     if (1 < kernelSize && kernelSize < 128)
    127     {
    128         SkDoubleRowBoxBlur_NEON<srcDirection, dstDirection>(&src, srcStride, &dst, kernelSize,
    129             leftOffset, rightOffset, width, &height);
    130     }
    131 
    132     for (; height > 0; height--) {
    133         uint32x4_t sum = vdupq_n_u32(0);
    134         const SkPMColor* p = src;
    135         for (int i = 0; i < rightBorder; ++i) {
    136             sum = vaddw_u16(sum, expand(*p));
    137             p += srcStrideX;
    138         }
    139 
    140         const SkPMColor* sptr = src;
    141         SkPMColor* dptr = dst;
    142         for (int x = 0; x < width; ++x) {
    143             // ( half+sumA*scale half+sumR*scale half+sumG*scale half+sumB*scale )
    144             uint32x4_t result = vmlaq_u32(half, sum, scale);
    145 
    146             // Saturated conversion to 16-bit.
    147             // ( AAAA RRRR GGGG BBBB ) -> ( 0A 0R 0G 0B )
    148             uint16x4_t result16 = vqshrn_n_u32(result, 16);
    149 
    150             // Saturated conversion to 8-bit.
    151             // ( 0A 0R 0G 0B ) -> ( 0A 0R 0G 0B 0A 0R 0G 0B ) -> ( A R G B A R G B )
    152             uint8x8_t result8 = vqshrn_n_u16(vcombine_u16(result16, result16), 8);
    153 
    154             // ( A R G B A R G B ) -> ( ARGB ARGB ) -> ( ARGB )
    155             // Store low 32 bits to destination.
    156             vst1_lane_u32(dptr, vreinterpret_u32_u8(result8), 0);
    157 
    158             if (x >= leftOffset) {
    159                 const SkPMColor* l = sptr - leftOffset * srcStrideX;
    160                 sum = vsubw_u16(sum, expand(*l));
    161             }
    162             if (x + rightOffset + 1 < width) {
    163                 const SkPMColor* r = sptr + (rightOffset + 1) * srcStrideX;
    164                 sum = vaddw_u16(sum, expand(*r));
    165             }
    166             sptr += srcStrideX;
    167             if (srcDirection == kX) {
    168                 SK_PREFETCH(sptr + (rightOffset + 16) * srcStrideX);
    169             }
    170             dptr += dstStrideX;
    171         }
    172         src += srcStrideY;
    173         dst += dstStrideY;
    174     }
    175 }
    176 
    177 } // namespace
    178 
    179 bool SkBoxBlurGetPlatformProcs_NEON(SkBoxBlurProc* boxBlurX,
    180                                     SkBoxBlurProc* boxBlurXY,
    181                                     SkBoxBlurProc* boxBlurYX) {
    182     *boxBlurX = SkBoxBlur_NEON<kX, kX>;
    183     *boxBlurXY = SkBoxBlur_NEON<kX, kY>;
    184     *boxBlurYX = SkBoxBlur_NEON<kY, kX>;
    185     return true;
    186 }
    187