1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include "config/av1_rtcd.h" 13 14 #include "aom_dsp/mips/macros_msa.h" 15 16 static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, 17 uint8_t *frm2_ptr, int32_t filt_sth, 18 int32_t filt_wgt, uint32_t *acc, 19 uint16_t *cnt) { 20 uint32_t row; 21 uint64_t f0, f1, f2, f3; 22 v16i8 frm2, frm1 = { 0 }; 23 v16i8 frm4, frm3 = { 0 }; 24 v16u8 frm_r, frm_l; 25 v8i16 frm2_r, frm2_l; 26 v8i16 diff0, diff1, mod0_h, mod1_h; 27 v4i32 cnst3, cnst16, filt_wt, strength; 28 v4i32 mod0_w, mod1_w, mod2_w, mod3_w; 29 v4i32 diff0_r, diff0_l, diff1_r, diff1_l; 30 v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; 31 v4i32 acc0, acc1, acc2, acc3; 32 v8i16 cnt0, cnt1; 33 34 filt_wt = __msa_fill_w(filt_wgt); 35 strength = __msa_fill_w(filt_sth); 36 cnst3 = __msa_ldi_w(3); 37 cnst16 = __msa_ldi_w(16); 38 39 for (row = 2; row--;) { 40 LD4(frm1_ptr, stride, f0, f1, f2, f3); 41 frm1_ptr += (4 * stride); 42 43 LD_SB2(frm2_ptr, 16, frm2, frm4); 44 frm2_ptr += 32; 45 46 LD_SW2(acc, 4, acc0, acc1); 47 LD_SW2(acc + 8, 4, acc2, acc3); 48 LD_SH2(cnt, 8, cnt0, cnt1); 49 50 INSERT_D2_SB(f0, f1, frm1); 51 INSERT_D2_SB(f2, f3, frm3); 52 ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); 53 HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); 54 UNPCK_SH_SW(diff0, diff0_r, diff0_l); 55 UNPCK_SH_SW(diff1, diff1_r, diff1_l); 56 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, 57 mod0_w, mod1_w, mod2_w, mod3_w); 58 MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, 59 mod1_w, mod2_w, mod3_w); 60 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); 61 62 diff0_r = (mod0_w < cnst16); 63 diff0_l = (mod1_w < cnst16); 64 diff1_r = (mod2_w < cnst16); 65 diff1_l = (mod3_w < cnst16); 66 67 SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, 68 mod1_w, mod2_w, mod3_w); 69 70 mod0_w = diff0_r & mod0_w; 71 mod1_w = diff0_l & mod1_w; 72 mod2_w = diff1_r & mod2_w; 73 mod3_w = diff1_l & mod3_w; 74 75 MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, 76 mod0_w, mod1_w, mod2_w, mod3_w); 77 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); 78 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); 79 ST_SH2(mod0_h, mod1_h, cnt, 8); 80 cnt += 16; 81 82 UNPCK_UB_SH(frm2, frm2_r, frm2_l); 83 UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); 84 UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); 85 MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, 86 mod0_w, mod1_w, mod2_w, mod3_w); 87 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, 88 mod2_w, mod3_w); 89 90 ST_SW2(mod0_w, mod1_w, acc, 4); 91 acc += 8; 92 ST_SW2(mod2_w, mod3_w, acc, 4); 93 acc += 8; 94 95 LD_SW2(acc, 4, acc0, acc1); 96 LD_SW2(acc + 8, 4, acc2, acc3); 97 LD_SH2(cnt, 8, cnt0, cnt1); 98 99 ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); 100 HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); 101 UNPCK_SH_SW(diff0, diff0_r, diff0_l); 102 UNPCK_SH_SW(diff1, diff1_r, diff1_l); 103 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, 104 mod0_w, mod1_w, mod2_w, mod3_w); 105 MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, 106 mod1_w, mod2_w, mod3_w); 107 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); 108 109 diff0_r = (mod0_w < cnst16); 110 diff0_l = (mod1_w < cnst16); 111 diff1_r = (mod2_w < cnst16); 112 diff1_l = (mod3_w < cnst16); 113 114 SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, 115 mod1_w, mod2_w, mod3_w); 116 117 mod0_w = diff0_r & mod0_w; 118 mod1_w = diff0_l & mod1_w; 119 mod2_w = diff1_r & mod2_w; 120 mod3_w = diff1_l & mod3_w; 121 122 MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, 123 mod0_w, mod1_w, mod2_w, mod3_w); 124 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); 125 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); 126 ST_SH2(mod0_h, mod1_h, cnt, 8); 127 cnt += 16; 128 UNPCK_UB_SH(frm4, frm2_r, frm2_l); 129 UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); 130 UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); 131 MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, 132 mod0_w, mod1_w, mod2_w, mod3_w); 133 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, 134 mod2_w, mod3_w); 135 136 ST_SW2(mod0_w, mod1_w, acc, 4); 137 acc += 8; 138 ST_SW2(mod2_w, mod3_w, acc, 4); 139 acc += 8; 140 } 141 } 142 143 static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride, 144 uint8_t *frm2_ptr, 145 int32_t filt_sth, int32_t filt_wgt, 146 uint32_t *acc, uint16_t *cnt) { 147 uint32_t row; 148 v16i8 frm1, frm2, frm3, frm4; 149 v16u8 frm_r, frm_l; 150 v16i8 zero = { 0 }; 151 v8u16 frm2_r, frm2_l; 152 v8i16 diff0, diff1, mod0_h, mod1_h; 153 v4i32 cnst3, cnst16, filt_wt, strength; 154 v4i32 mod0_w, mod1_w, mod2_w, mod3_w; 155 v4i32 diff0_r, diff0_l, diff1_r, diff1_l; 156 v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; 157 v4i32 acc0, acc1, acc2, acc3; 158 v8i16 cnt0, cnt1; 159 160 filt_wt = __msa_fill_w(filt_wgt); 161 strength = __msa_fill_w(filt_sth); 162 cnst3 = __msa_ldi_w(3); 163 cnst16 = __msa_ldi_w(16); 164 165 for (row = 8; row--;) { 166 LD_SB2(frm1_ptr, stride, frm1, frm3); 167 frm1_ptr += stride; 168 169 LD_SB2(frm2_ptr, 16, frm2, frm4); 170 frm2_ptr += 16; 171 172 LD_SW2(acc, 4, acc0, acc1); 173 LD_SW2(acc, 4, acc2, acc3); 174 LD_SH2(cnt, 8, cnt0, cnt1); 175 176 ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); 177 HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); 178 UNPCK_SH_SW(diff0, diff0_r, diff0_l); 179 UNPCK_SH_SW(diff1, diff1_r, diff1_l); 180 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, 181 mod0_w, mod1_w, mod2_w, mod3_w); 182 MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, 183 mod1_w, mod2_w, mod3_w); 184 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); 185 186 diff0_r = (mod0_w < cnst16); 187 diff0_l = (mod1_w < cnst16); 188 diff1_r = (mod2_w < cnst16); 189 diff1_l = (mod3_w < cnst16); 190 191 SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, 192 mod1_w, mod2_w, mod3_w); 193 194 mod0_w = diff0_r & mod0_w; 195 mod1_w = diff0_l & mod1_w; 196 mod2_w = diff1_r & mod2_w; 197 mod3_w = diff1_l & mod3_w; 198 199 MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, 200 mod0_w, mod1_w, mod2_w, mod3_w); 201 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); 202 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); 203 ST_SH2(mod0_h, mod1_h, cnt, 8); 204 cnt += 16; 205 206 ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); 207 UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); 208 UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); 209 MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, 210 mod0_w, mod1_w, mod2_w, mod3_w); 211 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, 212 mod2_w, mod3_w); 213 214 ST_SW2(mod0_w, mod1_w, acc, 4); 215 acc += 8; 216 ST_SW2(mod2_w, mod3_w, acc, 4); 217 acc += 8; 218 219 LD_SW2(acc, 4, acc0, acc1); 220 LD_SW2(acc + 8, 4, acc2, acc3); 221 LD_SH2(cnt, 8, cnt0, cnt1); 222 223 ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); 224 HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); 225 UNPCK_SH_SW(diff0, diff0_r, diff0_l); 226 UNPCK_SH_SW(diff1, diff1_r, diff1_l); 227 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, 228 mod0_w, mod1_w, mod2_w, mod3_w); 229 MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, 230 mod1_w, mod2_w, mod3_w); 231 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); 232 233 diff0_r = (mod0_w < cnst16); 234 diff0_l = (mod1_w < cnst16); 235 diff1_r = (mod2_w < cnst16); 236 diff1_l = (mod3_w < cnst16); 237 238 SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, 239 mod1_w, mod2_w, mod3_w); 240 241 mod0_w = diff0_r & mod0_w; 242 mod1_w = diff0_l & mod1_w; 243 mod2_w = diff1_r & mod2_w; 244 mod3_w = diff1_l & mod3_w; 245 246 MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, 247 mod0_w, mod1_w, mod2_w, mod3_w); 248 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); 249 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); 250 ST_SH2(mod0_h, mod1_h, cnt, 8); 251 cnt += 16; 252 253 ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); 254 UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); 255 UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); 256 MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, 257 mod0_w, mod1_w, mod2_w, mod3_w); 258 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, 259 mod2_w, mod3_w); 260 ST_SW2(mod0_w, mod1_w, acc, 4); 261 acc += 8; 262 ST_SW2(mod2_w, mod3_w, acc, 4); 263 acc += 8; 264 265 frm1_ptr += stride; 266 frm2_ptr += 16; 267 } 268 } 269 270 // TODO(yunqing) The following optimization is not used since c code changes. 271 void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, 272 uint8_t *frame2_ptr, uint32_t blk_w, 273 uint32_t blk_h, int32_t strength, 274 int32_t filt_wgt, uint32_t *accu, 275 uint16_t *cnt) { 276 if (8 == (blk_w * blk_h)) { 277 temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, 278 filt_wgt, accu, cnt); 279 } else if (16 == (blk_w * blk_h)) { 280 temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, 281 filt_wgt, accu, cnt); 282 } else { 283 av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, 284 strength, filt_wgt, accu, cnt); 285 } 286 } 287