/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_LOOPFILTER_MSA_H_
#define VPX_DSP_LOOPFILTER_MSA_H_

#include "vpx_dsp/mips/macros_msa.h"

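/* Applies the VP9 4-tap loop filter to one edge of 16 pixels held in v16u8
 * vectors. mask and hev are the byte masks produced by LPF_MASK_HEV below.
 * Pixels are biased into signed range (xor 0x80); the clamped filter value
 * is split into (filt + 4) >> 3 for q0 and (filt + 3) >> 3 for p0, and a
 * rounded half of the q0 adjustment is applied to p1/q1 wherever hev is not
 * set. Note that hev is inverted in place by the macro. */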
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \
                           p0_out, q0_out, q1_out)                        \
  {                                                                       \
    v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2;                \
    const v16i8 cnst4b = __msa_ldi_b(4);                                  \
    const v16i8 cnst3b = __msa_ldi_b(3);                                  \
                                                                          \
    p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                              \
    p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                              \
    q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                              \
    q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                              \
                                                                          \
    filt = __msa_subs_s_b(p1_m, q1_m);                                    \
    filt &= hev;                                                          \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                               \
    filt &= mask;                                                         \
    t1 = __msa_adds_s_b(filt, cnst4b);                                    \
    t1 >>= cnst3b;                                                        \
    t2 = __msa_adds_s_b(filt, cnst3b);                                    \
    t2 >>= cnst3b;                                                        \
    q0_m = __msa_subs_s_b(q0_m, t1);                                      \
    q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                             \
    p0_m = __msa_adds_s_b(p0_m, t2);                                      \
    p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                             \
    filt = __msa_srari_b(t1, 1);                                          \
    hev = __msa_xori_b(hev, 0xff);                                        \
    filt &= hev;                                                          \
    q1_m = __msa_subs_s_b(q1_m, filt);                                    \
    q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                             \
    p1_m = __msa_adds_s_b(p1_m, filt);                                    \
    p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                             \
  }

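/* Computes the "flat" mask that selects the 8-tap filter: a lane is flat
 * when |p2 - p0|, |q2 - q0|, |p3 - p0| and |q3 - q0| are all <= 1 and
 * flat_out came in holding max(|p1 - p0|, |q1 - q0|), which is exactly what
 * LPF_MASK_HEV below leaves in its flat_out. Note the macro also reads a
 * variable named `mask` from the calling scope even though it is not a
 * parameter, so flat is only set where the edge is filtered at all. */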
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)    \
  {                                                                      \
    v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
    v16u8 zero_in = { 0 };                                               \
                                                                         \
    tmp_flat4 = __msa_ori_b(zero_in, 1);                                 \
    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                          \
    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                          \
    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                          \
    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                          \
                                                                         \
    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);               \
    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                     \
    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);               \
    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                     \
                                                                         \
    flat_out = (tmp_flat4 < (v16u8)flat_out);                            \
    flat_out = __msa_xori_b(flat_out, 0xff);                             \
    flat_out = flat_out & (mask);                                        \
  }

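/* Extends the flatness test to the outer taps for the 16-wide filter:
 * flat2_out is set where |p4..p7 - p0| and |q4..q7 - q0| are all <= 1,
 * then ANDed with flat_in so it can only be set where the 8-tap flat mask
 * from VP9_FLAT4 is already set. */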
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
                  q6_in, q7_in, flat_in, flat2_out)                       \
  {                                                                       \
    v16u8 tmp_flat5, zero_in = { 0 };                                     \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;             \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;             \
                                                                          \
    tmp_flat5 = __msa_ori_b(zero_in, 1);                                  \
    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                           \
    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                           \
    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                           \
    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                           \
    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                           \
    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                           \
    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                           \
    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                           \
                                                                          \
    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);                \
    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);                  \
    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);                    \
    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);                \
    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);                    \
    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);                \
    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);                    \
                                                                          \
    flat2_out = (tmp_flat5 < (v16u8)flat2_out);                           \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                            \
    flat2_out = flat2_out & flat_in;                                      \
  }

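/* Computes the six filter8 outputs applied where the flat mask is set, on
 * v8u16 lanes that hold zero-extended pixels. Each output is a rounding
 * right shift by 3 of an 8-value window, matching the scalar filter8 path,
 * e.g. p2' = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3; the partial
 * sums tmp_filt8_0..tmp_filt8_2 are shared between outputs to save adds. */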
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
                    q1_filt8_out, q2_filt8_out)                             \
  {                                                                         \
    v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2;                            \
                                                                            \
    tmp_filt8_2 = p2_in + p1_in + p0_in;                                    \
    tmp_filt8_0 = p3_in << 1;                                               \
                                                                            \
    tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in;                        \
    tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in;                              \
    p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
                                                                            \
    tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in;                              \
    p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
                                                                            \
    tmp_filt8_1 = q2_in + q1_in + q0_in;                                    \
    tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1;                                \
    tmp_filt8_0 = tmp_filt8_2 + (p0_in);                                    \
    tmp_filt8_0 = tmp_filt8_0 + (p3_in);                                    \
    p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3);             \
                                                                            \
    tmp_filt8_0 = q2_in + q3_in;                                            \
    tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0;                        \
    tmp_filt8_1 = q3_in + q3_in;                                            \
    tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0;                                \
    q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
                                                                            \
    tmp_filt8_0 = tmp_filt8_2 + q3_in;                                      \
    tmp_filt8_1 = tmp_filt8_0 + q0_in;                                      \
    q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
                                                                            \
    tmp_filt8_1 = tmp_filt8_0 - p2_in;                                      \
    tmp_filt8_0 = q1_in + q3_in;                                            \
    tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1;                                \
    q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
  }

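/* Builds the filter and high-edge-variance masks for a 16-pixel edge:
 * hev_out is set where max(|p1 - p0|, |q1 - q0|) > thresh, and mask_out is
 * set where 2 * |p0 - q0| + |p1 - q1| / 2 <= b_limit and every
 * neighbouring-pixel difference (|p3 - p2| .. |q3 - q2|) is <= limit.
 * flat_out returns max(|p1 - p0|, |q1 - q0|) for reuse by VP9_FLAT4. */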
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
                     flat_out)                                               \
  {                                                                          \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;            \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;            \
                                                                             \
    /* absolute subtraction of pixel values */                               \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                             \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                             \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                             \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                             \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                             \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                             \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                             \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                             \
                                                                             \
    /* calculation of hev */                                                 \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);                    \
    hev_out = thresh_in < (v16u8)flat_out;                                   \
                                                                             \
    /* calculation of mask */                                                \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);               \
    p1_asub_q1_m >>= 1;                                                      \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);               \
                                                                             \
    mask_out = b_limit_in < p0_asub_q0_m;                                    \
    mask_out = __msa_max_u_b(flat_out, mask_out);                            \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);                \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);                        \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);                \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);                        \
                                                                             \
    mask_out = limit_in < (v16u8)mask_out;                                   \
    mask_out = __msa_xori_b(mask_out, 0xff);                                 \
  }

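/* Typical call sequence (a sketch modeled on the 4-tap filters in
 * vpx_dsp/mips/loopfilter_4_msa.c; variable names and the elided
 * load/store steps are illustrative, not part of this header):
 *
 *   v16u8 mask, hev, flat;
 *   v16u8 p3, p2, p1, p0, q0, q1, q2, q3;
 *   v16u8 thresh, b_limit, limit;
 *   v16u8 p1_out, p0_out, q0_out, q1_out;
 *
 *   // load the 8 pixel rows around the edge, splat the filter levels ...
 *   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
 *                hev, mask, flat);
 *   VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out,
 *                      q0_out, q1_out);
 *   // store p1_out, p0_out, q0_out, q1_out back to the frame buffer ...
 */
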
#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */