/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
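
/* A minimal scalar sketch (illustrative only, hence #if 0) of the per-pixel
 * arithmetic that both MSA kernels below vectorize. The helper name is
 * hypothetical; vp9_temporal_filter_apply_c is the authoritative reference.
 * For each pixel:
 *   diff = frm1 - frm2
 *   mod  = (3 * diff * diff) >> strength, with rounding
 *   mod  = (mod < 16) ? (16 - mod) * filt_wgt : 0
 *   cnt += mod;  acc += mod * frm2
 */
#if 0
static void temporal_filter_apply_scalar_sketch(
    const uint8_t *frm1, uint32_t stride, const uint8_t *frm2, uint32_t blk_w,
    uint32_t blk_h, int32_t strength, int32_t filt_wgt, uint32_t *acc,
    uint16_t *cnt) {
  uint32_t i, j, k = 0;

  for (i = 0; i < blk_h; ++i) {
    for (j = 0; j < blk_w; ++j, ++k) {
      const int32_t pixel = frm2[k];
      int32_t mod = (int32_t)frm1[i * stride + j] - pixel;

      /* 3 * diff^2, then an arithmetic shift with rounding; the vector
       * code does the same with MUL4 followed by SRAR_W4_SW. */
      mod = mod * mod * 3;
      if (strength > 0) mod += 1 << (strength - 1);
      mod >>= strength;

      /* Clamp at 16 and invert so large differences get zero weight; the
       * vector code builds a (mod < 16) mask and ANDs it with (16 - mod). */
      if (mod > 16) mod = 16;
      mod = (16 - mod) * filt_wgt;

      cnt[k] += (uint16_t)mod;
      acc[k] += (uint32_t)(mod * pixel);
    }
  }
}
#endif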

/* Filter an 8x8 block. Each loop iteration consumes four source rows (two
 * 8-byte rows per 16-byte vector) and four rows of the packed predictor. */
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
                                            uint32_t stride,
                                            uint8_t *frm2_ptr,
                                            int32_t filt_sth,
                                            int32_t filt_wgt,
                                            uint32_t *acc,
                                            uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3;
  v16i8 frm2, frm1 = { 0 };
  v16i8 frm4, frm3 = { 0 };
  v16u8 frm_r, frm_l;
  v8i16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

  for (row = 2; row--;) {
    /* Gather four 8-pixel source rows into two vectors. */
    LD4(frm1_ptr, stride, f0, f1, f2, f3);
    frm1_ptr += (4 * stride);

    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 32;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    INSERT_D2_SB(f0, f1, frm1);
    INSERT_D2_SB(f2, f3, frm3);

    /* diff = frm1 - frm2, widened to 32 bits, then
     * mod = (3 * diff^2) >> strength, with rounding. */
    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    /* mod = (mod < 16) ? (16 - mod) : 0, via compare masks. */
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    /* Scale by the filter weight and accumulate into the 16-bit counts. */
    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    /* acc += mod * frm2, in 32-bit precision. */
    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    /* Same steps for the third and fourth rows (frm3/frm4). */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;
  }
}

/* Filter a 16x16 block. Each loop iteration consumes two 16-pixel rows; the
 * per-pixel math mirrors temporal_filter_apply_8size_msa(). */
static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr,
                                             uint32_t stride,
                                             uint8_t *frm2_ptr,
                                             int32_t filt_sth,
                                             int32_t filt_wgt,
                                             uint32_t *acc,
                                             uint16_t *cnt) {
  uint32_t row;
  v16i8 frm1, frm2, frm3, frm4;
  v16u8 frm_r, frm_l;
  v16i8 zero = { 0 };
  v8u16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

  for (row = 8; row--;) {
    LD_SB2(frm1_ptr, stride, frm1, frm3);
    frm1_ptr += stride;

    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 16;

    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    /* Second 16-pixel row (frm3/frm4). */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
         diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w,
         mod0_w, mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
         mod0_w, mod1_w, mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    frm1_ptr += stride;
    frm2_ptr += 16;
  }
}
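
/* A minimal self-check sketch (illustrative only, hence #if 0) comparing the
 * 16x16 MSA kernel against the portable C path on pseudo-random input. The
 * function name is hypothetical and not part of libvpx; since both paths
 * implement the same integer math, the outputs should be bit-identical. */
#if 0
#include <stdlib.h>
#include <string.h>

static int check_16size_against_c(void) {
  uint8_t src[16 * 16], pred[16 * 16];
  uint32_t acc_msa[16 * 16] = { 0 }, acc_c[16 * 16] = { 0 };
  uint16_t cnt_msa[16 * 16] = { 0 }, cnt_c[16 * 16] = { 0 };
  int32_t i;

  for (i = 0; i < 16 * 16; ++i) {
    src[i] = (uint8_t)(rand() & 0xff);
    pred[i] = (uint8_t)(rand() & 0xff);
  }

  temporal_filter_apply_16size_msa(src, 16, pred, 6, 2, acc_msa, cnt_msa);
  vp9_temporal_filter_apply_c(src, 16, pred, 16, 16, 6, 2, acc_c, cnt_c);

  return (0 == memcmp(acc_msa, acc_c, sizeof(acc_msa))) &&
         (0 == memcmp(cnt_msa, cnt_c, sizeof(cnt_msa)));
}
#endif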

/* The MSA kernels above cover the two block sizes the temporal filter uses
 * (e.g., 16x16 luma and 8x8 chroma blocks); other sizes fall back to C. */
void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
                                   uint8_t *frame2_ptr, uint32_t blk_w,
                                   uint32_t blk_h, int32_t strength,
                                   int32_t filt_wgt, uint32_t *accu,
                                   uint16_t *cnt) {
  if (8 == blk_w && 8 == blk_h) {
    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr,
                                    strength, filt_wgt, accu, cnt);
  } else if (16 == blk_w && 16 == blk_h) {
    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
                                     strength, filt_wgt, accu, cnt);
  } else {
    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
                                strength, filt_wgt, accu, cnt);
  }
}
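
/* Usage sketch (illustrative only, hence #if 0): blend one 16x16
 * motion-compensated predictor into the filter's accumulator and count
 * planes. In the encoder the accumulator/count buffers persist across all
 * frames being filtered; locals are used here only to keep the example
 * self-contained. The function name is hypothetical. */
#if 0
static void example_blend_16x16(uint8_t *src, uint32_t src_stride,
                                uint8_t *pred) {
  uint32_t accumulator[16 * 16] = { 0 };
  uint16_t count[16 * 16] = { 0 };
  const int32_t strength = 6;      /* attenuates large pixel differences */
  const int32_t filter_weight = 2; /* typical weight for nearby frames */

  vp9_temporal_filter_apply_msa(src, src_stride, pred, 16, 16, strength,
                                filter_weight, accumulator, count);
}
#endif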