Home | History | Annotate | Download | only in neon
      1 /*
      2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <arm_neon.h>
     12 #include "./vpx_config.h"
     13 
     14 static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
     15         unsigned char *s,
     16         int p,
     17         const unsigned char *blimit) {
     18     uint8_t *sp;
     19     uint8x16_t qblimit, q0u8;
     20     uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
     21     int16x8_t q2s16, q3s16, q13s16;
     22     int8x8_t d8s8, d9s8;
     23     int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
     24 
     25     qblimit = vdupq_n_u8(*blimit);
     26 
     27     sp = s - (p << 1);
     28     q5u8 = vld1q_u8(sp);
     29     sp += p;
     30     q6u8 = vld1q_u8(sp);
     31     sp += p;
     32     q7u8 = vld1q_u8(sp);
     33     sp += p;
     34     q8u8 = vld1q_u8(sp);
     35 
     36     q15u8 = vabdq_u8(q6u8, q7u8);
     37     q14u8 = vabdq_u8(q5u8, q8u8);
     38 
     39     q15u8 = vqaddq_u8(q15u8, q15u8);
     40     q14u8 = vshrq_n_u8(q14u8, 1);
     41     q0u8 = vdupq_n_u8(0x80);
     42     q13s16 = vdupq_n_s16(3);
     43     q15u8 = vqaddq_u8(q15u8, q14u8);
     44 
     45     q5u8 = veorq_u8(q5u8, q0u8);
     46     q6u8 = veorq_u8(q6u8, q0u8);
     47     q7u8 = veorq_u8(q7u8, q0u8);
     48     q8u8 = veorq_u8(q8u8, q0u8);
     49 
     50     q15u8 = vcgeq_u8(qblimit, q15u8);
     51 
     52     q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
     53                      vget_low_s8(vreinterpretq_s8_u8(q6u8)));
     54     q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
     55                      vget_high_s8(vreinterpretq_s8_u8(q6u8)));
     56 
     57     q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
     58                      vreinterpretq_s8_u8(q8u8));
     59 
     60     q2s16 = vmulq_s16(q2s16, q13s16);
     61     q3s16 = vmulq_s16(q3s16, q13s16);
     62 
     63     q10u8 = vdupq_n_u8(3);
     64     q9u8 = vdupq_n_u8(4);
     65 
     66     q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
     67     q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
     68 
     69     d8s8 = vqmovn_s16(q2s16);
     70     d9s8 = vqmovn_s16(q3s16);
     71     q4s8 = vcombine_s8(d8s8, d9s8);
     72 
     73     q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
     74 
     75     q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
     76     q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
     77     q2s8 = vshrq_n_s8(q2s8, 3);
     78     q3s8 = vshrq_n_s8(q3s8, 3);
     79 
     80     q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
     81     q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
     82 
     83     q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
     84     q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
     85 
     86     vst1q_u8(s, q7u8);
     87     s -= p;
     88     vst1q_u8(s, q6u8);
     89     return;
     90 }
     91 
     92 void vp8_loop_filter_bhs_neon(
     93         unsigned char *y_ptr,
     94         int y_stride,
     95         const unsigned char *blimit) {
     96     y_ptr += y_stride * 4;
     97     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
     98     y_ptr += y_stride * 4;
     99     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
    100     y_ptr += y_stride * 4;
    101     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
    102     return;
    103 }
    104 
    105 void vp8_loop_filter_mbhs_neon(
    106         unsigned char *y_ptr,
    107         int y_stride,
    108         const unsigned char *blimit) {
    109     vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
    110     return;
    111 }
    112