/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
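/* Per-channel math implemented by the NEON code below (scalar sketch for
 * reference only; SkAlpha255To256(alpha) is alpha + 1):
 *     src_scale = alpha + 1;
 *     dst_scale = 256 - src_scale;
 *     result    = (src * src_scale + dst * dst_scale) >> 8;
 * applied independently to each 8-bit channel of an SkPMColor.
 */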
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
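        // Note: src_scale can be 256, which does not fit in a u8, so the src
        // path widens first and multiplies in 16 bits; dst_scale is at most
        // 255, so a single widening vmull_u8 covers the dst path.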

        // Combine
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

#ifdef SK_CPU_ARM32
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 > alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);
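    // Scalar sketch of the per-pixel math below (reference only, rounding not
    // exact):
    //     srcA      = SkGetPackedA32(*src);
    //     dst_scale = SkAlphaMulInv256(srcA, alpha256);  // roughly 256 - srcA*alpha256/255
    //     result    = (src * alpha256 + dst * dst_scale) >> 8;  // per channel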

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
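        // The indices {3,3,3,3,7,7,7,7} broadcast each pixel's alpha byte
        // across its four lanes, assuming alpha is byte 3 of a little-endian
        // SkPMColor (SK_A32_SHIFT == 24), the same assumption made by
        // vget_lane_u8(vsrc, 3) in the odd-count path above.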
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
            // A 16-bit lane would overflow if we used 0xFFFF here,
            // so use an approximation with 0xFF00 that is off by 1,
            // and add back 1 after to get the correct value.
            // This is valid if alpha256 <= 255.
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
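            // Scalar view of the three instructions above, given that
            // vmlsq_u16(a, b, c) is a - b*c and vsraq_n_u16(a, b, 8) is
            // a + (b >> 8):
            //     prod      = 0xFF00 - srcA * alpha256;
            //     prod     += prod >> 8;
            //     dst_scale = 1 + (prod >> 8);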

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while (count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#endif // #ifdef SK_CPU_ARM32

///////////////////////////////////////////////////////////////////////////////

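// Indexed by SkBlitRow::Flags32 (bit 0: kGlobalAlpha_Flag32, bit 1:
// kSrcPixelAlpha_Flag32; see SkBlitRow.h). nullptr entries fall back to the
// portable or SkOpts implementations.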
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    nullptr,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    nullptr,  // Ported to SkOpts
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    nullptr
#endif
};