Home | History | Annotate | Download | only in opts
      1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola
      2  *
      3  * Use of this source code is governed by a BSD-style license that can be
      4  * found in the LICENSE file.
      5  */
      6 
      7 #include "SkBitmapProcState.h"
      8 #include "SkPerspIter.h"
      9 #include "SkShader.h"
     10 #include "SkUtilsArm.h"
     11 
/*
 * Tables of matrix procs for the two tiling modes handled in this file.
 * They are only declared here; NOTE(review): the definitions presumably come
 * from the template headers included below via MAKENAME() -- confirm.
 */
extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[];
extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[];

/*
 * Forward declarations of the decal helpers defined at the bottom of this
 * file, so that code appearing before those definitions can reference them.
 */
static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count);
     17 
     18 static unsigned SK_USHIFT16(unsigned x) {
     19     return x >> 16;
     20 }
     21 
/*
 * Clamp tiling mode: parameter macros for the template header included at
 * the end of this group. MAKENAME() produces identifiers of the form
 * ClampX_ClampY<suffix>_neon.
 * NOTE(review): the header presumably #undefs these macros when it is done,
 * because they are defined again further down without an #undef -- confirm.
 */
#define MAKENAME(suffix)        ClampX_ClampY ## suffix ## _neon
/* Integer part of the 16.16 coordinate, passed through SkClampMax with max. */
#define TILEX_PROCF(fx, max)    SkClampMax((fx) >> 16, max)
#define TILEY_PROCF(fy, max)    SkClampMax((fy) >> 16, max)
/* Bits 12..15 of the coordinate (top 4 fraction bits); max is not used. */
#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
/* Defined with no value; presumably tested by the included header to enable
 * the decal_*_scale_neon fast paths -- confirm against that header. */
#define CHECK_FOR_DECAL
#include "SkBitmapProcState_matrix_clamp_neon.h"
     29 
/*
 * Repeat tiling mode: parameter macros for the template header included at
 * the end of this group. MAKENAME() produces identifiers of the form
 * RepeatX_RepeatY<suffix>_neon. CHECK_FOR_DECAL is not defined in this group.
 */
#define MAKENAME(suffix)        RepeatX_RepeatY ## suffix ## _neon
/* Keep only the low 16 bits (fraction) of the coordinate, scale by (max + 1)
 * and take the result shifted right by 16 as an unsigned value. */
#define TILEX_PROCF(fx, max)    SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1))
#define TILEY_PROCF(fy, max)    SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1))
/* Bits 12..15 of the same scaled product, i.e. the 4 bits just below the
 * value returned by TILE*_PROCF ('*' binds tighter than '>>' here). */
#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
#include "SkBitmapProcState_matrix_repeat_neon.h"
     36 
     37 
     38 void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
     39 {
     40     int i;
     41 
     42     if (count >= 8) {
     43         /* SkFixed is 16.16 fixed point */
     44         SkFixed dx2 = dx+dx;
     45         SkFixed dx4 = dx2+dx2;
     46         SkFixed dx8 = dx4+dx4;
     47 
     48         /* now build fx/fx+dx/fx+2dx/fx+3dx */
     49         SkFixed fx1, fx2, fx3;
     50         int32x2_t lower, upper;
     51         int32x4_t lbase, hbase;
     52         uint16_t *dst16 = (uint16_t *)dst;
     53 
     54         fx1 = fx+dx;
     55         fx2 = fx1+dx;
     56         fx3 = fx2+dx;
     57 
     58         /* avoid an 'lbase unitialized' warning */
     59         lbase = vdupq_n_s32(fx);
     60         lbase = vsetq_lane_s32(fx1, lbase, 1);
     61         lbase = vsetq_lane_s32(fx2, lbase, 2);
     62         lbase = vsetq_lane_s32(fx3, lbase, 3);
     63         hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
     64 
     65         /* take upper 16 of each, store, and bump everything */
     66         do {
     67             int32x4_t lout, hout;
     68             uint16x8_t hi16;
     69 
     70             lout = lbase;
     71             hout = hbase;
     72             /* gets hi's of all louts then hi's of all houts */
     73             asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
     74             hi16 = vreinterpretq_u16_s32(hout);
     75             vst1q_u16(dst16, hi16);
     76 
     77             /* on to the next */
     78             lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
     79             hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
     80             dst16 += 8;
     81             count -= 8;
     82             fx += dx8;
     83         } while (count >= 8);
     84         dst = (uint32_t *) dst16;
     85     }
     86 
     87     uint16_t* xx = (uint16_t*)dst;
     88     for (i = count; i > 0; --i) {
     89         *xx++ = SkToU16(fx >> 16); fx += dx;
     90     }
     91 }
     92 
     93 void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
     94 {
     95     if (count >= 8) {
     96         int32x4_t wide_fx;
     97         int32x4_t wide_fx2;
     98         int32x4_t wide_dx8 = vdupq_n_s32(dx*8);
     99 
    100         wide_fx = vdupq_n_s32(fx);
    101         wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
    102         wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
    103         wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
    104 
    105         wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx));
    106 
    107         while (count >= 8) {
    108             int32x4_t wide_out;
    109             int32x4_t wide_out2;
    110 
    111             wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14);
    112             wide_out = vorrq_s32(wide_out,
    113             vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1)));
    114 
    115             wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14);
    116             wide_out2 = vorrq_s32(wide_out2,
    117             vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1)));
    118 
    119             vst1q_u32(dst, vreinterpretq_u32_s32(wide_out));
    120             vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2));
    121 
    122             dst += 8;
    123             fx += dx*8;
    124             wide_fx = vaddq_s32(wide_fx, wide_dx8);
    125             wide_fx2 = vaddq_s32(wide_fx2, wide_dx8);
    126             count -= 8;
    127         }
    128     }
    129 
    130     if (count & 1)
    131     {
    132         SkASSERT((fx >> (16 + 14)) == 0);
    133         *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
    134         fx += dx;
    135     }
    136     while ((count -= 2) >= 0)
    137     {
    138         SkASSERT((fx >> (16 + 14)) == 0);
    139         *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
    140         fx += dx;
    141 
    142         *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
    143         fx += dx;
    144     }
    145 }
    146