/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_dsp/mips/loopfilter_msa.h"

void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
  v16i8 zero = { 0 };

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

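  /* broadcast the scalar thresholds to every byte lane */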
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

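  /* only 8 pixels are filtered per call, so keep just the low 64 bits of the
     flat mask */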
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
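    /* no flat columns: only the 4-tap results for p1, p0, q0, q1 change */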
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
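    /* at least one flat column: widen the low 8 pixels to 16 bits and run the
       8-tap filter */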
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* blend: take the filter8 output where flat is set, keep the filter4
       result (or the original p2/q2) elsewhere, then store */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

    src -= 3 * pitch;

    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
    src += (4 * pitch);
    SD(q1_d, src);
    src += pitch;
    SD(q2_d, src);
  }
}

void vpx_lpf_horizontal_8_dual_msa(
    uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
    const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

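  /* build per-lane thresholds: the low 8 lanes take the first block's values,
     the high 8 lanes the second's */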
  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
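    /* no flat columns in either block: store the 4-tap output for all 16
       pixels */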
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

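    /* widen the high 8 pixels to 16 bits and run the 8-tap filter on them as
       well */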
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* blend: take the filter8 output where flat is set, keep the filter4
       result (or the original p2/q2) elsewhere, then store */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;

    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}

void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4;

  /* load vector elements */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

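  /* transpose the 8x8 block so the columns around the vertical edge become
     the p3..q3 vectors */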
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* Store 4 pixels p1 - q1 */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* blend: take the filter8 output where flat is set, keep the filter4
       result (or the original p2/q2) elsewhere */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* Store 6 pixels p2 - q2 */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

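    /* interleave back into 6-byte rows: p2..q0 in vec2/vec3, q1/q2 pairs in
       vec4 */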
    src -= 3;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 4, src + 4, pitch);
  }
}

void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

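  /* load 16 rows of pixels starting 4 columns to the left of the vertical
     edge */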
  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

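  /* per-lane thresholds: the low 8 lanes take the first block's values, the
     high 8 lanes the second's */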
  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
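    /* no flat columns: transpose the 4-tap output back into 4-byte rows and
       store all 16 rows */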
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* blend: take the filter8 output where flat is set, keep the filter4
       result (or the original p2/q2) elsewhere */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

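    /* interleave back into 6-byte rows: p2..q0 in vec3/vec4 (rows 0-7) and
       vec6/vec7 (rows 8-15), q1/q2 pairs in vec2/vec5 */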
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}