/* temporal_filter_msa.c — MIPS MSA temporal filter (stray code-viewer header line removed) */
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include "config/av1_rtcd.h"
     13 
     14 #include "aom_dsp/mips/macros_msa.h"
     15 
/* Temporal filter for an 8x8 block using MIPS MSA intrinsics.
 *
 * frm1_ptr  - source frame pixels, rows separated by `stride` bytes.
 * frm2_ptr  - 8x8 predictor block stored contiguously (64 bytes).
 * filt_sth  - filter strength: rounding right-shift applied to 3*diff^2.
 * filt_wgt  - filter weight multiplied into each per-pixel modifier.
 * acc       - 64-entry uint32 accumulator, updated += modifier * predictor.
 * cnt       - 64-entry uint16 weight-count buffer, updated += modifier.
 *
 * Per pixel the code below computes:
 *   mod = round((frm1 - frm2)^2 * 3 >> filt_sth)
 *   mod = (mod < 16) ? (16 - mod) : 0      (done via compare mask & subtract)
 *   mod *= filt_wgt;  cnt += mod;  acc += mod * frm2
 */
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
                                            uint8_t *frm2_ptr, int32_t filt_sth,
                                            int32_t filt_wgt, uint32_t *acc,
                                            uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3;
  v16i8 frm2, frm1 = { 0 };
  v16i8 frm4, frm3 = { 0 };
  v16u8 frm_r, frm_l;
  v8i16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  /* Broadcast scalar parameters/constants into all vector lanes. */
  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

  /* 2 iterations x 4 rows of 8 pixels = the full 8x8 block. */
  for (row = 2; row--;) {
    /* Load four 8-pixel source rows as 64-bit lanes. */
    LD4(frm1_ptr, stride, f0, f1, f2, f3);
    frm1_ptr += (4 * stride);

    /* Load 32 contiguous predictor bytes (4 rows of 8). */
    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 32;

    /* First half: rows 0-1 of this iteration (16 pixels). */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    /* Pack the four source rows into two 16-byte vectors. */
    INSERT_D2_SB(f0, f1, frm1);
    INSERT_D2_SB(f2, f3, frm3);
    /* diff = frm1 - frm2, widened to 16-bit via interleave + hsub. */
    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    /* mod = diff^2 * 3, then rounded arithmetic shift by strength. */
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    /* Compare masks: all-ones where mod < 16, zero elsewhere. */
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    /* mod = (16 - mod), then masked so lanes with mod >= 16 become 0. */
    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
         mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    /* Scale by the filter weight and accumulate into cnt. */
    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    /* acc += mod * predictor pixel (widened to 32-bit lanes). */
    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
         mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    /* Second half: rows 2-3 of this iteration (frm3 vs frm4). */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
         mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;
    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
         mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;
  }
}
    142 
    143 static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
    144                                              uint8_t *frm2_ptr,
    145                                              int32_t filt_sth, int32_t filt_wgt,
    146                                              uint32_t *acc, uint16_t *cnt) {
    147   uint32_t row;
    148   v16i8 frm1, frm2, frm3, frm4;
    149   v16u8 frm_r, frm_l;
    150   v16i8 zero = { 0 };
    151   v8u16 frm2_r, frm2_l;
    152   v8i16 diff0, diff1, mod0_h, mod1_h;
    153   v4i32 cnst3, cnst16, filt_wt, strength;
    154   v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
    155   v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
    156   v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
    157   v4i32 acc0, acc1, acc2, acc3;
    158   v8i16 cnt0, cnt1;
    159 
    160   filt_wt = __msa_fill_w(filt_wgt);
    161   strength = __msa_fill_w(filt_sth);
    162   cnst3 = __msa_ldi_w(3);
    163   cnst16 = __msa_ldi_w(16);
    164 
    165   for (row = 8; row--;) {
    166     LD_SB2(frm1_ptr, stride, frm1, frm3);
    167     frm1_ptr += stride;
    168 
    169     LD_SB2(frm2_ptr, 16, frm2, frm4);
    170     frm2_ptr += 16;
    171 
    172     LD_SW2(acc, 4, acc0, acc1);
    173     LD_SW2(acc, 4, acc2, acc3);
    174     LD_SH2(cnt, 8, cnt0, cnt1);
    175 
    176     ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    177     HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    178     UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    179     UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    180     MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
    181          mod0_w, mod1_w, mod2_w, mod3_w);
    182     MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
    183          mod1_w, mod2_w, mod3_w);
    184     SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    185 
    186     diff0_r = (mod0_w < cnst16);
    187     diff0_l = (mod1_w < cnst16);
    188     diff1_r = (mod2_w < cnst16);
    189     diff1_l = (mod3_w < cnst16);
    190 
    191     SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
    192          mod1_w, mod2_w, mod3_w);
    193 
    194     mod0_w = diff0_r & mod0_w;
    195     mod1_w = diff0_l & mod1_w;
    196     mod2_w = diff1_r & mod2_w;
    197     mod3_w = diff1_l & mod3_w;
    198 
    199     MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
    200          mod0_w, mod1_w, mod2_w, mod3_w);
    201     PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    202     ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    203     ST_SH2(mod0_h, mod1_h, cnt, 8);
    204     cnt += 16;
    205 
    206     ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
    207     UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    208     UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    209     MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
    210          mod0_w, mod1_w, mod2_w, mod3_w);
    211     ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
    212          mod2_w, mod3_w);
    213 
    214     ST_SW2(mod0_w, mod1_w, acc, 4);
    215     acc += 8;
    216     ST_SW2(mod2_w, mod3_w, acc, 4);
    217     acc += 8;
    218 
    219     LD_SW2(acc, 4, acc0, acc1);
    220     LD_SW2(acc + 8, 4, acc2, acc3);
    221     LD_SH2(cnt, 8, cnt0, cnt1);
    222 
    223     ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    224     HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    225     UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    226     UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    227     MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
    228          mod0_w, mod1_w, mod2_w, mod3_w);
    229     MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
    230          mod1_w, mod2_w, mod3_w);
    231     SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    232 
    233     diff0_r = (mod0_w < cnst16);
    234     diff0_l = (mod1_w < cnst16);
    235     diff1_r = (mod2_w < cnst16);
    236     diff1_l = (mod3_w < cnst16);
    237 
    238     SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
    239          mod1_w, mod2_w, mod3_w);
    240 
    241     mod0_w = diff0_r & mod0_w;
    242     mod1_w = diff0_l & mod1_w;
    243     mod2_w = diff1_r & mod2_w;
    244     mod3_w = diff1_l & mod3_w;
    245 
    246     MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
    247          mod0_w, mod1_w, mod2_w, mod3_w);
    248     PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    249     ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    250     ST_SH2(mod0_h, mod1_h, cnt, 8);
    251     cnt += 16;
    252 
    253     ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
    254     UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    255     UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    256     MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
    257          mod0_w, mod1_w, mod2_w, mod3_w);
    258     ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
    259          mod2_w, mod3_w);
    260     ST_SW2(mod0_w, mod1_w, acc, 4);
    261     acc += 8;
    262     ST_SW2(mod2_w, mod3_w, acc, 4);
    263     acc += 8;
    264 
    265     frm1_ptr += stride;
    266     frm2_ptr += 16;
    267   }
    268 }
    269 
// TODO(yunqing) The following optimization is not used since c code changes.
/* Dispatch the temporal-filter apply step to an MSA kernel by block size,
 * falling back to the C implementation for any other size.
 *
 * NOTE(review): for an 8x8 block blk_w * blk_h is 64 (not 8), and for a
 * 16x16 block it is 256 (not 16), so as written both MSA branches appear
 * unreachable and every call takes the C fallback — consistent with the
 * TODO above.  Confirm the intended conditions (e.g. blk_w == 8 &&
 * blk_h == 8) before re-enabling the MSA paths. */
void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
                                   uint8_t *frame2_ptr, uint32_t blk_w,
                                   uint32_t blk_h, int32_t strength,
                                   int32_t filt_wgt, uint32_t *accu,
                                   uint16_t *cnt) {
  if (8 == (blk_w * blk_h)) {
    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
                                    filt_wgt, accu, cnt);
  } else if (16 == (blk_w * blk_h)) {
    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
                                     filt_wgt, accu, cnt);
  } else {
    av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
                                strength, filt_wgt, accu, cnt);
  }
}
    287