1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola 2 * 3 * Use of this source code is governed by a BSD-style license that can be 4 * found in the LICENSE file. 5 */ 6 7 #include "SkBitmapProcState.h" 8 #include "SkPerspIter.h" 9 #include "SkShader.h" 10 #include "SkUtilsArm.h" 11 #include "SkBitmapProcState_utils.h" 12 13 extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; 14 extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; 15 16 static void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 17 static void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count); 18 19 #define MAKENAME(suffix) ClampX_ClampY ## suffix ## _neon 20 #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) 21 #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) 22 #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) 23 #define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) 24 #define CHECK_FOR_DECAL 25 #include "SkBitmapProcState_matrix_clamp_neon.h" 26 27 #define MAKENAME(suffix) RepeatX_RepeatY ## suffix ## _neon 28 #define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) 29 #define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) 30 #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 31 #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) 32 #include "SkBitmapProcState_matrix_repeat_neon.h" 33 34 35 36 void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { 37 if (count >= 8) { 38 // SkFixed is 16.16 fixed point 39 SkFixed dx8 = dx * 8; 40 int32x4_t vdx8 = vdupq_n_s32(dx8); 41 42 // setup lbase and hbase 43 int32x4_t lbase, hbase; 44 lbase = vdupq_n_s32(fx); 45 lbase = vsetq_lane_s32(fx + dx, lbase, 1); 46 lbase = vsetq_lane_s32(fx + dx + dx, lbase, 2); 47 lbase = vsetq_lane_s32(fx + dx + dx + dx, lbase, 3); 48 hbase = lbase + vdupq_n_s32(4 * dx); 49 50 do { 51 // store the upper 16 bits 52 vst1q_u32(dst, vreinterpretq_u32_s16( 53 vuzpq_s16(vreinterpretq_s16_s32(lbase), vreinterpretq_s16_s32(hbase)).val[1] 54 )); 55 56 // on to the next group of 8 57 lbase += vdx8; 58 hbase += vdx8; 59 dst += 4; // we did 8 elements but the result is twice smaller 60 count -= 8; 61 fx += dx8; 62 } while (count >= 8); 63 } 64 65 uint16_t* xx = (uint16_t*)dst; 66 for (int i = count; i > 0; --i) { 67 *xx++ = SkToU16(fx >> 16); fx += dx; 68 } 69 } 70 71 void decal_filter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count) { 72 if (count >= 8) { 73 SkFixed dx8 = dx * 8; 74 int32x4_t vdx8 = vdupq_n_s32(dx8); 75 76 int32x4_t wide_fx, wide_fx2; 77 wide_fx = vdupq_n_s32(fx); 78 wide_fx = vsetq_lane_s32(fx + dx, wide_fx, 1); 79 wide_fx = vsetq_lane_s32(fx + dx + dx, wide_fx, 2); 80 wide_fx = vsetq_lane_s32(fx + dx + dx + dx, wide_fx, 3); 81 82 wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(4 * dx)); 83 84 while (count >= 8) { 85 int32x4_t wide_out; 86 int32x4_t wide_out2; 87 88 wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); 89 wide_out = wide_out | (vshrq_n_s32(wide_fx,16) + vdupq_n_s32(1)); 90 91 wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); 92 wide_out2 = wide_out2 | (vshrq_n_s32(wide_fx2,16) + vdupq_n_s32(1)); 93 94 vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); 95 vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); 96 97 dst += 8; 98 fx += dx8; 99 wide_fx += vdx8; 100 wide_fx2 += vdx8; 101 count -= 8; 102 } 103 } 104 105 if (count & 1) 106 { 107 SkASSERT((fx >> (16 + 14)) == 0); 108 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 109 fx += dx; 110 } 111 while ((count -= 2) >= 0) 112 { 113 SkASSERT((fx >> (16 + 14)) == 0); 114 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 115 fx += dx; 116 117 *dst++ = (fx >> 12 << 14) | ((fx >> 16) + 1); 118 fx += dx; 119 } 120 } 121