
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */


#include <arm_neon.h>
#include "SkColorPriv.h"

/*
 * Filter_32_opaque
 *
 * There is no hard-n-fast rule that the filtering must produce
 * exact results for the color components, but if the 4 incoming colors are
 * all opaque, then the output color must also be opaque. Subsequent parts of
 * the drawing pipeline may rely on this (e.g. which blitrow proc to use).
 *
 */
// Chrome on Android uses -Os, so we need to force these inline. Otherwise,
// calling these functions in the inner loops causes significant overhead on
// some platforms.
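// For reference, a minimal scalar sketch of the blend implemented below (the
// name and this function are illustrative only and compiled out): x and y are
// 4-bit subpixel fractions in [0, 16), so the four bilinear weights sum to
// 16*16 = 256 and the final >> 8 renormalizes each channel. With four opaque
// inputs the alpha channel computes 255 * 256 >> 8 = 255, which is exactly
// the opacity invariant described above.
#if 0
static inline SkPMColor Filter_32_opaque_reference(unsigned x, unsigned y,
                                                   SkPMColor a00, SkPMColor a01,
                                                   SkPMColor a10, SkPMColor a11) {
    SkPMColor result = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        unsigned c00 = (a00 >> shift) & 0xFF;       // one channel of each corner
        unsigned c01 = (a01 >> shift) & 0xFF;
        unsigned c10 = (a10 >> shift) & 0xFF;
        unsigned c11 = (a11 >> shift) & 0xFF;
        unsigned c = (c00 * (16 - x) * (16 - y) +   // bilinear blend; the four
                      c01 * x        * (16 - y) +   // weights total 256
                      c10 * (16 - x) * y        +
                      c11 * x        * y) >> 8;
        result |= (SkPMColor)c << shift;
    }
    return result;
}
#endif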
static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y,
                                                   SkPMColor a00, SkPMColor a01,
                                                   SkPMColor a10, SkPMColor a11,
                                                   SkPMColor *dst) {
    uint8x8_t vy, vconst16_8, v16_y, vres;
    uint16x4_t vx, vconst16_16, v16_x, tmp;
    uint32x2_t va0, va1;
    uint16x8_t tmp1, tmp2;

    vy = vdup_n_u8(y);                // duplicate y into vy
    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

    va0 = vdup_n_u32(a00);            // duplicate a00
    va1 = vdup_n_u32(a10);            // duplicate a10
    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    va1 = vset_lane_u32(a11, va1, 1); // set top to a11

    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

    vx = vdup_n_u16(x);                // duplicate x into vx
    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = (a01 * (16-y)) * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += (a11 * y) * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += (a00 * (16-y)) * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += (a10 * y) * (16-x)

    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down by 8 (total weight is 16*16 = 256)
    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
}

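// Filter_32_alpha_neon computes the same bilinear blend as above, then
// modulates the result by an extra scale factor, i.e. per channel
// dst = ((sum >> 8) * scale) >> 8, where sum is the 256-weighted sum.
// Assumption (not stated in this file): scale is at most 256, so the uint16
// intermediate (sum >> 8) * scale never exceeds 255 * 256 and cannot overflow.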
static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
                                                  SkPMColor a00, SkPMColor a01,
                                                  SkPMColor a10, SkPMColor a11,
                                                  SkPMColor *dst,
                                                  uint16_t scale) {
    uint8x8_t vy, vconst16_8, v16_y, vres;
    uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
    uint32x2_t va0, va1;
    uint16x8_t tmp1, tmp2;

    vy = vdup_n_u8(y);                // duplicate y into vy
    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

    va0 = vdup_n_u32(a00);            // duplicate a00
    va1 = vdup_n_u32(a10);            // duplicate a10
    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    va1 = vset_lane_u32(a11, va1, 1); // set top to a11

    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

    vx = vdup_n_u16(x);                // duplicate x into vx
    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = (a01 * (16-y)) * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += (a11 * y) * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += (a00 * (16-y)) * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += (a10 * y) * (16-x)

    vscale = vdup_n_u16(scale);        // duplicate scale
    tmp = vshr_n_u16(tmp, 8);          // shift down result by 8
    tmp = vmul_u16(tmp, vscale);       // multiply result by scale

    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
}
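
#if 0
// Usage sketch (illustrative only; the pixel variables are hypothetical).
// x and y are the 4-bit subpixel fractions of the sample point; a00 is the
// top-left neighbor, a01 top-right, a10 bottom-left, a11 bottom-right.
SkPMColor dst;
Filter_32_opaque_neon(3, 12, p00, p01, p10, p11, &dst);

// The alpha variant additionally modulates by a scale factor;
// 128 roughly halves each channel.
Filter_32_alpha_neon(3, 12, p00, p01, p10, p11, &dst, 128);
#endif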