/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

/* Simple-filter mask: on for pixels where 2 * |p0 - q0| + |p1 - q1| / 2
 * is within b_limit. */
#define VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask)        \
  {                                                           \
    v16u8 p1_a_sub_q1, p0_a_sub_q0;                           \
                                                              \
    p0_a_sub_q0 = __msa_asub_u_b(p0, q0);                     \
    p1_a_sub_q1 = __msa_asub_u_b(p1, q1);                     \
    p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \
    p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0);   \
    mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1);          \
    mask = ((v16u8)mask <= b_limit);                          \
  }

/* Standard 4-tap loop filter on p1, p0, q0, q1 under mask/hev control;
 * p1 and q1 are only adjusted where hev is not set. */
#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev)      \
  {                                                        \
    v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
    const v16i8 cnst4b = __msa_ldi_b(4);                   \
    const v16i8 cnst3b = __msa_ldi_b(3);                   \
                                                           \
    p1_m = (v16i8)__msa_xori_b(p1, 0x80);                  \
    p0_m = (v16i8)__msa_xori_b(p0, 0x80);                  \
    q0_m = (v16i8)__msa_xori_b(q0, 0x80);                  \
    q1_m = (v16i8)__msa_xori_b(q1, 0x80);                  \
                                                           \
    filt = __msa_subs_s_b(p1_m, q1_m);                     \
    filt &= hev;                                           \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                \
    filt &= mask;                                          \
    t1 = __msa_adds_s_b(filt, cnst4b);                     \
    t1 >>= cnst3b;                                         \
    t2 = __msa_adds_s_b(filt, cnst3b);                     \
    t2 >>= cnst3b;                                         \
    q0_m = __msa_subs_s_b(q0_m, t1);                       \
    q0 = __msa_xori_b((v16u8)q0_m, 0x80);                  \
    p0_m = __msa_adds_s_b(p0_m, t2);                       \
    p0 = __msa_xori_b((v16u8)p0_m, 0x80);                  \
    filt = __msa_srari_b(t1, 1);                           \
    hev = __msa_xori_b(hev, 0xff);                         \
    filt &= hev;                                           \
    q1_m = __msa_subs_s_b(q1_m, filt);                     \
    q1 = __msa_xori_b((v16u8)q1_m, 0x80);                  \
    p1_m = __msa_adds_s_b(p1_m, filt);                     \
    p1 = __msa_xori_b((v16u8)p1_m, 0x80);                  \
  }

/* Simple-filter kernel: same base filter value, but only p0 and q0
 * are adjusted. */
#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \
  {                                                       \
    v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2;     \
    v16i8 q0_sub_p0;                                      \
    const v16i8 cnst4b = __msa_ldi_b(4);                  \
    const v16i8 cnst3b = __msa_ldi_b(3);                  \
                                                          \
    p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);              \
    p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);              \
    q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);              \
    q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);              \
                                                          \
    filt = __msa_subs_s_b(p1_m, q1_m);                    \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);               \
    filt = __msa_adds_s_b(filt, q0_sub_p0);               \
    filt &= mask;                                         \
    filt1 = __msa_adds_s_b(filt, cnst4b);                 \
    filt1 >>= cnst3b;                                     \
    filt2 = __msa_adds_s_b(filt, cnst3b);                 \
    filt2 >>= cnst3b;                                     \
    q0_m = __msa_subs_s_b(q0_m, filt1);                   \
    p0_m = __msa_adds_s_b(p0_m, filt2);                   \
    q0_in = __msa_xori_b((v16u8)q0_m, 0x80);              \
    p0_in = __msa_xori_b((v16u8)p0_m, 0x80);              \
  }
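
/* Macroblock-edge filter across six pixels p2..q2. Pixels are flipped to
 * the signed domain, a base filter value w is derived from p1, p0, q0, q1
 * and clamped by the mask; where hev is set the sharp 4-tap adjustment
 * updates p0/q0, elsewhere the pairs p2/q2, p1/q1 and p0/q0 are corrected
 * by (9 * w + 63) >> 7, (18 * w + 63) >> 7 and (27 * w + 63) >> 7. */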
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
  {                                                     \
    v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;           \
    v16i8 u, filt, t1, t2, filt_sign, q0_sub_p0;        \
    v8i16 filt_r, filt_l, u_r, u_l;                     \
    v8i16 temp0, temp1, temp2, temp3;                   \
    const v16i8 cnst4b = __msa_ldi_b(4);                \
    const v16i8 cnst3b = __msa_ldi_b(3);                \
    const v8i16 cnst9h = __msa_ldi_h(9);                \
    const v8i16 cnst63h = __msa_ldi_h(63);              \
                                                        \
    p2_m = (v16i8)__msa_xori_b(p2, 0x80);               \
    p1_m = (v16i8)__msa_xori_b(p1, 0x80);               \
    p0_m = (v16i8)__msa_xori_b(p0, 0x80);               \
    q0_m = (v16i8)__msa_xori_b(q0, 0x80);               \
    q1_m = (v16i8)__msa_xori_b(q1, 0x80);               \
    q2_m = (v16i8)__msa_xori_b(q2, 0x80);               \
                                                        \
    filt = __msa_subs_s_b(p1_m, q1_m);                  \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);             \
    filt = __msa_adds_s_b(filt, q0_sub_p0);             \
    filt = __msa_adds_s_b(filt, q0_sub_p0);             \
    filt = __msa_adds_s_b(filt, q0_sub_p0);             \
    filt &= mask;                                       \
                                                        \
    t2 = filt & hev;                                    \
    hev = __msa_xori_b(hev, 0xff);                      \
    filt &= hev;                                        \
    t1 = __msa_adds_s_b(t2, cnst4b);                    \
    t1 >>= cnst3b;                                      \
    t2 = __msa_adds_s_b(t2, cnst3b);                    \
    t2 >>= cnst3b;                                      \
    q0_m = __msa_subs_s_b(q0_m, t1);                    \
    p0_m = __msa_adds_s_b(p0_m, t2);                    \
    filt_sign = __msa_clti_s_b(filt, 0);                \
    ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);       \
    temp0 = filt_r * cnst9h;                            \
    temp1 = temp0 + cnst63h;                            \
    temp2 = filt_l * cnst9h;                            \
    temp3 = temp2 + cnst63h;                            \
                                                        \
    u_r = temp1 >> 7;                                   \
    u_r = __msa_sat_s_h(u_r, 7);                        \
    u_l = temp3 >> 7;                                   \
    u_l = __msa_sat_s_h(u_l, 7);                        \
    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);          \
    q2_m = __msa_subs_s_b(q2_m, u);                     \
    p2_m = __msa_adds_s_b(p2_m, u);                     \
    q2 = __msa_xori_b((v16u8)q2_m, 0x80);               \
    p2 = __msa_xori_b((v16u8)p2_m, 0x80);               \
                                                        \
    temp1 += temp0;                                     \
    temp3 += temp2;                                     \
                                                        \
    u_r = temp1 >> 7;                                   \
    u_r = __msa_sat_s_h(u_r, 7);                        \
    u_l = temp3 >> 7;                                   \
    u_l = __msa_sat_s_h(u_l, 7);                        \
    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);          \
    q1_m = __msa_subs_s_b(q1_m, u);                     \
    p1_m = __msa_adds_s_b(p1_m, u);                     \
    q1 = __msa_xori_b((v16u8)q1_m, 0x80);               \
    p1 = __msa_xori_b((v16u8)p1_m, 0x80);               \
                                                        \
    temp1 += temp0;                                     \
    temp3 += temp2;                                     \
                                                        \
    u_r = temp1 >> 7;                                   \
    u_r = __msa_sat_s_h(u_r, 7);                        \
    u_l = temp3 >> 7;                                   \
    u_l = __msa_sat_s_h(u_l, 7);                        \
    u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r);          \
    q0_m = __msa_subs_s_b(q0_m, u);                     \
    p0_m = __msa_adds_s_b(p0_m, u);                     \
    q0 = __msa_xori_b((v16u8)q0_m, 0x80);               \
    p0 = __msa_xori_b((v16u8)p0_m, 0x80);               \
  }
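
/* Computes the filter on/off mask and the high-edge-variance (hev) flag
 * from absolute neighbour differences: hev fires where |p1 - p0| or
 * |q1 - q0| exceeds thresh; the mask is cleared where
 * 2 * |p0 - q0| + |p1 - q1| / 2 exceeds b_limit or any adjacent-pixel
 * difference exceeds limit. */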
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
                     flat_out)                                               \
  {                                                                          \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;            \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;            \
                                                                             \
    p3_asub_p2_m = __msa_asub_u_b((p3_in), (p2_in));                         \
    p2_asub_p1_m = __msa_asub_u_b((p2_in), (p1_in));                         \
    p1_asub_p0_m = __msa_asub_u_b((p1_in), (p0_in));                         \
    q1_asub_q0_m = __msa_asub_u_b((q1_in), (q0_in));                         \
    q2_asub_q1_m = __msa_asub_u_b((q2_in), (q1_in));                         \
    q3_asub_q2_m = __msa_asub_u_b((q3_in), (q2_in));                         \
    p0_asub_q0_m = __msa_asub_u_b((p0_in), (q0_in));                         \
    p1_asub_q1_m = __msa_asub_u_b((p1_in), (q1_in));                         \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);                    \
    hev_out = (thresh_in) < (v16u8)flat_out;                                 \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);               \
    p1_asub_q1_m >>= 1;                                                      \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);               \
    mask_out = (b_limit_in) < p0_asub_q0_m;                                  \
    mask_out = __msa_max_u_b(flat_out, mask_out);                            \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);                \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);                        \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);                \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);                        \
    mask_out = (limit_in) < (v16u8)mask_out;                                 \
    mask_out = __msa_xori_b(mask_out, 0xff);                                 \
  }

#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) \
  {                                                            \
    uint16_t tmp0_h;                                           \
    uint32_t tmp0_w;                                           \
                                                               \
    tmp0_w = __msa_copy_u_w((v4i32)in0, in0_idx);              \
    tmp0_h = __msa_copy_u_h((v8i16)in1, in1_idx);              \
    SW(tmp0_w, pdst);                                          \
    SH(tmp0_h, pdst + stride);                                 \
  }

static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit0_ptr,
                                              const uint8_t *limit0_ptr,
                                              const uint8_t *thresh0_ptr,
                                              const uint8_t *b_limit1_ptr,
                                              const uint8_t *limit1_ptr,
                                              const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
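
/* Vertical counterpart of the dual 4-tap filter: 16 rows around the edge
 * are loaded and transposed so each pixel column becomes a vector, the
 * filter is applied, then the four modified columns are interleaved back
 * and stored. */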
static void loop_filter_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                            const uint8_t *b_limit0_ptr,
                                            const uint8_t *limit0_ptr,
                                            const uint8_t *thresh0_ptr,
                                            const uint8_t *b_limit1_ptr,
                                            const uint8_t *limit1_ptr,
                                            const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
         row14, row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

  src -= 2;
  ST4x8_UB(tmp2, tmp3, src, pitch);
  src += (8 * pitch);
  ST4x8_UB(tmp4, tmp5, src, pitch);
}

static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t b_limit_in,
                                                const uint8_t limit_in,
                                                const uint8_t thresh_in) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);
  temp_src = src - (pitch << 2);
  LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
  temp_src = src - 3 * pitch;
  ST_UB4(p2, p1, p0, q0, temp_src, pitch);
  temp_src += (4 * pitch);
  ST_UB2(q1, q2, temp_src, pitch);
}
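
/* Chroma horizontal macroblock filter: the 8-pixel-wide u and v rows are
 * packed into single 16-byte vectors (u in the low half, v in the high
 * half) so both planes are filtered in one pass, then unpacked on store. */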
static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                                 int32_t pitch,
                                                 const uint8_t b_limit_in,
                                                 const uint8_t limit_in,
                                                 const uint8_t thresh_in) {
  uint8_t *temp_src;
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  temp_src = src_u - (pitch << 2);
  LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  temp_src = src_v - (pitch << 2);
  LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  p2_d = __msa_copy_u_d((v2i64)p2, 0);
  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  q2_d = __msa_copy_u_d((v2i64)q2, 0);
  src_u -= (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
  src_u += 4 * pitch;
  SD(q1_d, src_u);
  src_u += pitch;
  SD(q2_d, src_u);

  p2_d = __msa_copy_u_d((v2i64)p2, 1);
  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  q2_d = __msa_copy_u_d((v2i64)q2, 1);
  src_v -= (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
  src_v += 4 * pitch;
  SD(q1_d, src_v);
  src_v += pitch;
  SD(q2_d, src_v);
}

static void mbloop_filter_vertical_edge_y_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t b_limit_in,
                                              const uint8_t limit_in,
                                              const uint8_t thresh_in) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);
  temp_src = src - 4;
  LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14, row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
  ILVRL_B2_SH(q2, q1, tmp2, tmp5);

  temp_src = src - 3;
  VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
  temp_src += pitch;
  VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
}
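
/* Chroma vertical macroblock filter: u rows fill row0..row7 and v rows fill
 * row8..row15 before the 16x8 transpose, so a single filtering pass covers
 * both planes; results are written back with 6-byte stores per row. */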
static void mbloop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
         row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
  ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
  ILVRL_B2_SH(q2, q1, tmp2, tmp5);

  src_u -= 3;
  VP8_ST6x1_UB(tmp3, 0, tmp2, 0, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 1, tmp2, 1, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 2, tmp2, 2, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp3, 3, tmp2, 3, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 0, tmp2, 4, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 1, tmp2, 5, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 2, tmp2, 6, src_u, 4);
  src_u += pitch;
  VP8_ST6x1_UB(tmp4, 3, tmp2, 7, src_u, 4);

  src_v -= 3;
  VP8_ST6x1_UB(tmp6, 0, tmp5, 0, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 1, tmp5, 1, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 2, tmp5, 2, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp6, 3, tmp5, 3, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 0, tmp5, 4, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 1, tmp5, 5, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 2, tmp5, 6, src_v, 4);
  src_v += pitch;
  VP8_ST6x1_UB(tmp7, 3, tmp5, 7, src_v, 4);
}

void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t *b_limit_ptr) {
  v16u8 p1, p0, q1, q0;
  v16u8 mask, b_limit;

  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
  VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
  ST_UB2(p0, q0, (src - pitch), pitch);
}
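
/* Simple vertical filter: transposes 16 rows of 4 pixels straddling the
 * edge, applies the two-pixel simple filter, and writes the updated p0/q0
 * columns back as 2-byte pairs. */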
void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit_ptr) {
  uint8_t *temp_src;
  v16u8 p1, p0, q1, q0;
  v16u8 mask, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1;

  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  temp_src = src - 2;
  LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14, row15);
  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p1, p0,
                      q0, q1);
  VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
  ILVRL_B2_SH(q0, p0, tmp1, tmp0);

  src -= 1;
  ST2x4_UB(tmp1, 0, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp1, 4, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp0, 0, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp0, 4, src, pitch);
  src += 4 * pitch;
}

static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  src_u = src_u - (pitch << 2);
  LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  src_u += (5 * pitch);
  src_v = src_v - (pitch << 2);
  LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
  src_v += (5 * pitch);

  /* right 8 elements of p3 are u pixels and
     left 8 elements of p3 are v pixels */
  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  SD4(q1_d, q0_d, p0_d, p1_d, src_u, (-pitch));

  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  SD4(q1_d, q0_d, p0_d, p1_d, src_v, (-pitch));
}

static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                             int32_t pitch,
                                             const uint8_t b_limit_in,
                                             const uint8_t limit_in,
                                             const uint8_t thresh_in) {
  uint8_t *temp_src_u, *temp_src_v;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
         row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
  ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
  tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
  tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
  ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);

  temp_src_u = src_u - 2;
  ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
  temp_src_u += 4 * pitch;
  ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);

  temp_src_v = src_v - 2;
  ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
  temp_src_v += 4 * pitch;
  ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
}
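
/* Exported entry points (hooked up via the RTCD table in ./vp8_rtcd.h):
 * each wrapper applies the appropriate edge filter to the y plane; the
 * non-simple variants also filter u/v when src_u is non-NULL. */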
void vp8_loop_filter_mbh_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
                             int32_t pitch_y, int32_t pitch_u_v,
                             loop_filter_info *lpf_info_ptr) {
  mbloop_filter_horizontal_edge_y_msa(src_y, pitch_y, *lpf_info_ptr->mblim,
                                      *lpf_info_ptr->lim,
                                      *lpf_info_ptr->hev_thr);
  if (src_u) {
    mbloop_filter_horizontal_edge_uv_msa(
        src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
        *lpf_info_ptr->hev_thr);
  }
}

void vp8_loop_filter_mbv_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
                             int32_t pitch_y, int32_t pitch_u_v,
                             loop_filter_info *lpf_info_ptr) {
  mbloop_filter_vertical_edge_y_msa(src_y, pitch_y, *lpf_info_ptr->mblim,
                                    *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
  if (src_u) {
    mbloop_filter_vertical_edge_uv_msa(src_u, src_v, pitch_u_v,
                                       *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
                                       *lpf_info_ptr->hev_thr);
  }
}

void vp8_loop_filter_bh_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
                            int32_t pitch_y, int32_t pitch_u_v,
                            loop_filter_info *lpf_info_ptr) {
  loop_filter_horizontal_4_dual_msa(src_y + 4 * pitch_y, pitch_y,
                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
  loop_filter_horizontal_4_dual_msa(src_y + 8 * pitch_y, pitch_y,
                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
  loop_filter_horizontal_4_dual_msa(src_y + 12 * pitch_y, pitch_y,
                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
  if (src_u) {
    loop_filter_horizontal_edge_uv_msa(
        src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v,
        *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
  }
}

void vp8_loop_filter_bv_msa(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
                            int32_t pitch_y, int32_t pitch_u_v,
                            loop_filter_info *lpf_info_ptr) {
  loop_filter_vertical_4_dual_msa(src_y + 4, pitch_y, lpf_info_ptr->blim,
                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
                                  lpf_info_ptr->hev_thr);
  loop_filter_vertical_4_dual_msa(src_y + 8, pitch_y, lpf_info_ptr->blim,
                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
                                  lpf_info_ptr->hev_thr);
  loop_filter_vertical_4_dual_msa(src_y + 12, pitch_y, lpf_info_ptr->blim,
                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
                                  lpf_info_ptr->hev_thr);
  if (src_u) {
    loop_filter_vertical_edge_uv_msa(src_u + 4, src_v + 4, pitch_u_v,
                                     *lpf_info_ptr->blim, *lpf_info_ptr->lim,
                                     *lpf_info_ptr->hev_thr);
  }
}

void vp8_loop_filter_bhs_msa(uint8_t *src_y, int32_t pitch_y,
                             const uint8_t *b_limit_ptr) {
  vp8_loop_filter_simple_horizontal_edge_msa(src_y + (4 * pitch_y), pitch_y,
                                             b_limit_ptr);
  vp8_loop_filter_simple_horizontal_edge_msa(src_y + (8 * pitch_y), pitch_y,
                                             b_limit_ptr);
  vp8_loop_filter_simple_horizontal_edge_msa(src_y + (12 * pitch_y), pitch_y,
                                             b_limit_ptr);
}

void vp8_loop_filter_bvs_msa(uint8_t *src_y, int32_t pitch_y,
                             const uint8_t *b_limit_ptr) {
  vp8_loop_filter_simple_vertical_edge_msa(src_y + 4, pitch_y, b_limit_ptr);
  vp8_loop_filter_simple_vertical_edge_msa(src_y + 8, pitch_y, b_limit_ptr);
  vp8_loop_filter_simple_vertical_edge_msa(src_y + 12, pitch_y, b_limit_ptr);
}