/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_dsp/mips/loopfilter_msa.h"

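/* MSA (MIPS SIMD Architecture) versions of the 4-tap loop filters.
 *
 * vpx_lpf_horizontal_4_msa() filters one 8-pixel-wide segment across a
 * horizontal edge.  Rows p3..p0 above the edge and q0..q3 below it are
 * loaded, only p1/p0/q0/q1 are modified, and the low 64 bits of each
 * filtered vector are copied out and stored back with SD4. */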
void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}

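/* Dual (16-pixel-wide) horizontal filter.  Each of the two b_limit/limit/
 * thresh values is splatted across a vector and the pair is packed into the
 * low and high 64-bit halves of a single register with __msa_ilvr_d, so one
 * LPF_MASK_HEV / VP9_LPF_FILTER4_4W pass covers both 8-pixel segments. */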
void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0_ptr,
                                   const uint8_t *limit0_ptr,
                                   const uint8_t *thresh0_ptr,
                                   const uint8_t *b_limit1_ptr,
                                   const uint8_t *limit1_ptr,
                                   const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}

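/* 4-tap filter across a vertical edge (8 rows).  The 8x8 block straddling
 * the edge is transposed so the filter can work on it row-wise; the filtered
 * p1/p0/q0/q1 columns are interleaved back into row order and written out as
 * two groups of 4-byte-wide stores. */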
void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  v16u8 mask, hev, flat, limit, thresh, b_limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v8i16 vec0, vec1, vec2, vec3;

  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
  ILVRL_H2_SH(vec1, vec0, vec2, vec3);

  src -= 2;
  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
  src += 4 * pitch;
  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
}

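/* Dual (16-row) vertical filter.  Sixteen rows are loaded and transposed
 * with TRANSPOSE16x8_UB_UB, the two parameter sets are packed into single
 * vectors as in the horizontal dual case, and the filtered columns are
 * interleaved back and stored as two 4x8 blocks. */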
void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0_ptr,
                                 const uint8_t *limit0_ptr,
                                 const uint8_t *thresh0_ptr,
                                 const uint8_t *b_limit1_ptr,
                                 const uint8_t *limit1_ptr,
                                 const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
         row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

  src -= 2;

  ST4x8_UB(tmp2, tmp3, src, pitch);
  src += (8 * pitch);
  ST4x8_UB(tmp4, tmp5, src, pitch);
}
    147