/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
                                            uint32_t stride,
                                            uint8_t *frm2_ptr,
                                            int32_t filt_sth,
                                            int32_t filt_wgt,
                                            uint32_t *acc,
                                            uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3;
  v16i8 frm2, frm1 = { 0 };
  v16i8 frm4, frm3 = { 0 };
  v16u8 frm_r, frm_l;
  v8i16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

  /* each iteration filters 4 rows of the 8x8 block */
  for (row = 2; row--;) {
    /* load 4 source rows (8 bytes each) and 4 predictor rows */
    LD4(frm1_ptr, stride, f0, f1, f2, f3);
    frm1_ptr += (4 * stride);

    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 32;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    /* pack source rows 0-1 into frm1 and rows 2-3 into frm3 */
    INSERT_D2_SB(f0, f1, frm1);
    INSERT_D2_SB(f2, f3, frm3);

    /* modifier = (3 * (frm1 - frm2)^2 + round) >> strength */
    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    /* modifier = (modifier < 16) ? 16 - modifier : 0, then scale by weight */
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);

    /* count += modifier */
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    /* accumulator += modifier * predictor pixel */
    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    /* same sequence for source rows 2-3 (frm3/frm4) */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;
  }
}

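/* Both kernels in this file vectorize the same per-pixel update.  The scalar
 * sketch below (kept out of the build with #if 0) spells that update out; it
 * is meant to mirror what vp9_temporal_filter_apply_c computes.  The function
 * name and loop layout here are illustrative assumptions, not part of the
 * library. */
#if 0
static void temporal_filter_apply_scalar_sketch(
    const uint8_t *frm1_ptr, uint32_t stride, const uint8_t *frm2_ptr,
    uint32_t blk_w, uint32_t blk_h, int32_t filt_sth, int32_t filt_wgt,
    uint32_t *acc, uint16_t *cnt) {
  /* SRAR_W4_SW in the vector code is a rounded shift; this is the matching
   * rounding term */
  const int32_t rounding = (filt_sth > 0) ? (1 << (filt_sth - 1)) : 0;
  uint32_t i, j;

  for (i = 0; i < blk_h; ++i) {
    for (j = 0; j < blk_w; ++j) {
      const int32_t pixel = frm2_ptr[i * blk_w + j];
      const int32_t diff = frm1_ptr[i * stride + j] - pixel;
      /* squared difference, scaled by 3 and rounded-shifted by strength */
      int32_t mod = (diff * diff * 3 + rounding) >> filt_sth;

      /* clamp to 16, then invert so small differences get large weights */
      if (mod > 16) mod = 16;
      mod = 16 - mod;
      mod *= filt_wgt;

      cnt[i * blk_w + j] += (uint16_t)mod;
      acc[i * blk_w + j] += (uint32_t)(mod * pixel);
    }
  }
}
#endif
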
static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
                                             uint32_t stride,
                                             uint8_t *frm2_ptr,
                                             int32_t filt_sth,
                                             int32_t filt_wgt,
                                             uint32_t *acc,
                                             uint16_t *cnt) {
  uint32_t row;
  v16i8 frm1, frm2, frm3, frm4;
  v16u8 frm_r, frm_l;
  v16i8 zero = { 0 };
  v8u16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

  /* each iteration filters 2 rows of the 16x16 block */
  for (row = 8; row--;) {
    /* load 2 source rows and 2 predictor rows (16 bytes each) */
    LD_SB2(frm1_ptr, stride, frm1, frm3);
    frm1_ptr += stride;

    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 16;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    /* modifier = (3 * (frm1 - frm2)^2 + round) >> strength */
    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    /* modifier = (modifier < 16) ? 16 - modifier : 0, then scale by weight */
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);

    /* count += modifier */
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    /* accumulator += modifier * predictor pixel */
    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    /* same sequence for the second row (frm3/frm4) */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    frm1_ptr += stride;
    frm2_ptr += 16;
  }
}

void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
                                   uint8_t *frame2_ptr, uint32_t blk_w,
                                   uint32_t blk_h, int32_t strength,
                                   int32_t filt_wgt, uint32_t *accu,
                                   uint16_t *cnt) {
  if (8 == blk_w && 8 == blk_h) {
    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
                                    strength, filt_wgt, accu, cnt);
  } else if (16 == blk_w && 16 == blk_h) {
    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
                                     strength, filt_wgt, accu, cnt);
  } else {
    /* fall back to the C reference for any other block size */
    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
                                strength, filt_wgt, accu, cnt);
  }
}
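
/* A hedged self-check sketch (also kept out of the build): it runs the MSA
 * dispatcher and the C reference on the same 16x16 block and returns the
 * number of mismatching accumulator/count entries.  The helper name, the test
 * pattern, and the strength/weight values (6 and 2) are example assumptions
 * made for illustration only. */
#if 0
static int32_t temporal_filter_msa_selfcheck_sketch(void) {
  uint8_t frame1[16 * 16], frame2[16 * 16];
  uint32_t acc_msa[16 * 16] = { 0 }, acc_c[16 * 16] = { 0 };
  uint16_t cnt_msa[16 * 16] = { 0 }, cnt_c[16 * 16] = { 0 };
  int32_t i, mismatches = 0;

  /* deterministic test pattern, no dependency on stdlib rand() */
  for (i = 0; i < 16 * 16; ++i) {
    frame1[i] = (uint8_t)((i * 7) & 0xff);
    frame2[i] = (uint8_t)((i * 13 + 5) & 0xff);
  }

  vp9_temporal_filter_apply_msa(frame1, 16, frame2, 16, 16, 6, 2,
                                acc_msa, cnt_msa);
  vp9_temporal_filter_apply_c(frame1, 16, frame2, 16, 16, 6, 2,
                              acc_c, cnt_c);

  for (i = 0; i < 16 * 16; ++i) {
    if (acc_msa[i] != acc_c[i] || cnt_msa[i] != cnt_c[i]) ++mismatches;
  }
  return mismatches;
}
#endif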