/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/loopfilter_msa.h"

int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

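/* Second stage of the 16-wide horizontal loop filter. The first stage
 * (vpx_hz_lpf_t4_and_t8_16w above) leaves its filter8 output and the flat
 * mask in the 128-byte filter48 scratch buffer: p2, p1, p0, q0, q1, q2 at
 * 16-byte offsets 0..80 and flat at offset 96. This stage applies the
 * 15-tap filter to the pixels selected by the wider flat2 mask and falls
 * back to the stage-one results elsewhere.
 */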
void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

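    /* The 15-tap filter is evaluated as a running sum held in tmp1:
     * p6_out = (p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4.
     * Each later output slides the window by adding the incoming sample and
     * subtracting the outgoing one (the tmp0 updates below).
     */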
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}

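/* Wrappers for the horizontal 16 filter. The dual variant always filters a
 * full 16-byte-wide row via the _16w helpers (its count argument is unused);
 * mb_lpf_horizontal_edge handles count == 1 as an 8-pixel-wide case using
 * 64-bit stores of the low vector halves, and hands anything else to the
 * dual variant.
 */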
static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
                                        const uint8_t *b_limit_ptr,
                                        const uint8_t *limit_ptr,
                                        const uint8_t *thresh_ptr,
                                        int32_t count) {
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
  uint8_t early_exit = 0;

  (void)count;

  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                        limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    vpx_hz_lpf_t16_16w(src, pitch, filter48);
  }
}

static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                                count);
  }
}

void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}

void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
}

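/* The vertical filters below reuse the horizontal kernels: the block
 * straddling the vertical edge is transposed into a contiguous scratch
 * buffer (fixed pitch of 16 bytes), filtered as if the edge were
 * horizontal, and the result is transposed back into the frame.
 */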
static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}

static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate register and 32 instructions */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

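/* Vertical counterpart of vpx_hz_lpf_t4_and_t8_16w for an 8-row edge.
 * src points into the transposed scratch buffer (rows spaced 16 bytes
 * apart), while src_org/pitch_org address the original frame for the
 * early-exit store. Returns 1 when only the 4-tap filter was applied.
 */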
int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                uint8_t *src_org, int32_t pitch_org,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                          uint8_t *filter48) {
  v16i8 zero = { 0 };
  v16u8 filter8, flat, flat2;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 tmp0_r, tmp1_r;
  v8i16 r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;

    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST8x1_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST8x1_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST8x1_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST8x1_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST8x1_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST8x1_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST8x1_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST8x1_UB(q6, src);

    return 0;
  }
}

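/* The 16 * 24 byte scratch below holds the transposed pixel block in its
 * first 16 * 16 bytes; the remaining 16 * 8 bytes serve as the filter48
 * buffer shared by the two filter stages.
 */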
void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);

  early_exit =
      vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
                              pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
                                   &filter48[0]);

    if (0 == early_exit) {
      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
    }
  }
}

int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src_org -= 2;
    ST4x8_UB(vec2, vec3, src_org, pitch);
    src_org += 8 * pitch;
    ST4x8_UB(vec4, vec5, src_org, pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

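/* Vertical counterpart of vpx_hz_lpf_t16_16w: applies the 15-tap filter to
 * 16 rows of the transposed buffer. On early exit the stage-one results
 * from filter48 are written straight back to the frame through narrow
 * column stores, so the caller can skip the final transpose.
 */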
int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);

    return 0;
  }
}

void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
                                  const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);

  early_exit =
      vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
                               pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
                                    &filter48[0]);

    if (0 == early_exit) {
      transpose_16x16(transposed_input, 16, (src - 8), pitch);
    }
  }
}