1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 #include "./vpx_config.h" 13 14 static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( 15 unsigned char *s, 16 int p, 17 const unsigned char *blimit) { 18 uint8_t *sp; 19 uint8x16_t qblimit, q0u8; 20 uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8; 21 int16x8_t q2s16, q3s16, q13s16; 22 int8x8_t d8s8, d9s8; 23 int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8; 24 25 qblimit = vdupq_n_u8(*blimit); 26 27 sp = s - (p << 1); 28 q5u8 = vld1q_u8(sp); 29 sp += p; 30 q6u8 = vld1q_u8(sp); 31 sp += p; 32 q7u8 = vld1q_u8(sp); 33 sp += p; 34 q8u8 = vld1q_u8(sp); 35 36 q15u8 = vabdq_u8(q6u8, q7u8); 37 q14u8 = vabdq_u8(q5u8, q8u8); 38 39 q15u8 = vqaddq_u8(q15u8, q15u8); 40 q14u8 = vshrq_n_u8(q14u8, 1); 41 q0u8 = vdupq_n_u8(0x80); 42 q13s16 = vdupq_n_s16(3); 43 q15u8 = vqaddq_u8(q15u8, q14u8); 44 45 q5u8 = veorq_u8(q5u8, q0u8); 46 q6u8 = veorq_u8(q6u8, q0u8); 47 q7u8 = veorq_u8(q7u8, q0u8); 48 q8u8 = veorq_u8(q8u8, q0u8); 49 50 q15u8 = vcgeq_u8(qblimit, q15u8); 51 52 q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)), 53 vget_low_s8(vreinterpretq_s8_u8(q6u8))); 54 q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)), 55 vget_high_s8(vreinterpretq_s8_u8(q6u8))); 56 57 q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8), 58 vreinterpretq_s8_u8(q8u8)); 59 60 q2s16 = vmulq_s16(q2s16, q13s16); 61 q3s16 = vmulq_s16(q3s16, q13s16); 62 63 q10u8 = vdupq_n_u8(3); 64 q9u8 = vdupq_n_u8(4); 65 66 q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8)); 67 q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8)); 68 69 d8s8 = vqmovn_s16(q2s16); 70 d9s8 = vqmovn_s16(q3s16); 71 q4s8 = vcombine_s8(d8s8, d9s8); 72 73 q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8)); 74 75 q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8)); 76 q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8)); 77 q2s8 = vshrq_n_s8(q2s8, 3); 78 q3s8 = vshrq_n_s8(q3s8, 3); 79 80 q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8); 81 q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8); 82 83 q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); 84 q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); 85 86 vst1q_u8(s, q7u8); 87 s -= p; 88 vst1q_u8(s, q6u8); 89 return; 90 } 91 92 void vp8_loop_filter_bhs_neon( 93 unsigned char *y_ptr, 94 int y_stride, 95 const unsigned char *blimit) { 96 y_ptr += y_stride * 4; 97 vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); 98 y_ptr += y_stride * 4; 99 vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); 100 y_ptr += y_stride * 4; 101 vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); 102 return; 103 } 104 105 void vp8_loop_filter_mbhs_neon( 106 unsigned char *y_ptr, 107 int y_stride, 108 const unsigned char *blimit) { 109 vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit); 110 return; 111 } 112