      1 /*
      2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vp9_rtcd.h"
     12 #include "vp9/common/vp9_onyxc_int.h"
     13 #include "vpx_dsp/mips/macros_msa.h"
     14 
/* Weighted blend of an 8x8 block for MFQE (motion-compensated frame
 * quality enhancement):
 *   dst[i] = round((src[i] * src_weight + dst[i] * dst_weight)
 *                  >> MFQE_PRECISION)
 * where dst_weight = (1 << MFQE_PRECISION) - src_weight, so the two
 * weights always sum to the fixed-point scale 1 << MFQE_PRECISION.
 *
 * src_ptr/src_stride : source block (read-only)
 * dst_ptr/dst_stride : destination block, blended in place
 * src_weight         : source weight in MFQE_PRECISION fixed point
 */
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  uint64_t src0_d, src1_d, dst0_d, dst1_d;
  v16i8 src0 = { 0 };
  v16i8 src1 = { 0 };
  v16i8 dst0 = { 0 };
  v16i8 dst1 = { 0 };
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Splat the two scalar weights across all 8 halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  /* 2 iterations x 4 rows each = 8 rows total. */
  for (row = 2; row--;) {
    /* Rows 0-1: each 8-pixel row is a 64-bit load; pack two rows into
     * one 128-bit vector via INSERT_D2_SB. */
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src0);
    INSERT_D2_SB(dst0_d, dst1_d, dst0);

    /* Rows 2-3: dst_ptr is not advanced until after the stores below,
     * so the second dst pair is loaded at an explicit +2*dst_stride
     * offset. */
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src1);
    INSERT_D2_SB(dst0_d, dst1_d, dst1);

    /* Rows 0-1: widen bytes to halfwords, blend, round-shift back,
     * repack to bytes, store. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst0, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);

    /* Rows 2-3: same blend on the second packed pair. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst1, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);
  }
}
     66 
/* Weighted blend of a 16x16 block for MFQE; same per-pixel formula as
 * the 8x8 variant:
 *   dst[i] = round((src[i] * src_weight + dst[i] * dst_weight)
 *                  >> MFQE_PRECISION)
 * with dst_weight = (1 << MFQE_PRECISION) - src_weight. Each 16-pixel
 * row fills one 128-bit vector, so rows are processed one vector at a
 * time, four rows per loop iteration.
 *
 * src_ptr/src_stride : source block (read-only)
 * dst_ptr/dst_stride : destination block, blended in place
 * src_weight         : source weight in MFQE_PRECISION fixed point
 */
static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
                                      int32_t src_stride, uint8_t *dst_ptr,
                                      int32_t dst_stride, int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Splat the two scalar weights across all 8 halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  /* 4 iterations x 4 rows each = 16 rows total. */
  for (row = 4; row--;) {
    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);

    /* Row 0: widen bytes to halfwords (right/left halves), blend,
     * round-shift by MFQE_PRECISION, repack and store 16 bytes. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 1. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 2. */
    UNPCK_UB_SH(src2, src_r, src_l);
    UNPCK_UB_SH(dst2, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 3. */
    UNPCK_UB_SH(src3, src_r, src_l);
    UNPCK_UB_SH(dst3, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
  }
}
    124 
/* Public RTCD entry point (see vp9_rtcd.h): forwards to the static MSA
 * 8x8 weighted-blend implementation above. */
void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int src_weight) {
  filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
}
    129 
/* Public RTCD entry point (see vp9_rtcd.h): forwards to the static MSA
 * 16x16 weighted-blend implementation above. */
void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   int src_weight) {
  filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
}
    135