/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/loopfilter_msa.h"

/* First pass of the 16-wide horizontal wide filter: applies the 4-tap and
 * 8-tap filters.  Returns 1 when the flat mask is all zero (the 4-tap result
 * has already been written back to src); otherwise stores the flat-blended
 * 8-tap rows and the flat mask into the filter48 scratch buffer and returns
 * 0 so that the 16-tap pass can run. */
int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                 uint8_t *filter48,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

/* Second pass: 16-tap (wide) filtering, reusing the 8-tap rows and the flat
 * mask saved in filter48 by vpx_hz_lpf_t4_and_t8_16w. */
void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    /* p6 */
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}

void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr,
                                    int32_t count) {
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
  uint8_t early_exit = 0;

  (void)count;

  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                        limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    vpx_hz_lpf_t16_16w(src, pitch, filter48);
  }
}

void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr,
                               int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                  q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r,
                   q6_r, q7_r);

        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    vpx_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
                                   thresh_ptr, count);
  }
}

static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch,
         p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}

static void transpose_16x16(uint8_t *input, int32_t in_pitch,
                            uint8_t *output, int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch,
         row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p7, p6, p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate registers and 32 instructions */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                uint8_t *src_org, int32_t pitch_org,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                          uint8_t *filter48) {
  v16i8 zero = { 0 };
  v16u8 filter8, flat, flat2;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 tmp0_r, tmp1_r;
  v8i16 r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
               p3_r_in, p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;

    /* p6 */
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST8x1_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST8x1_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST8x1_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST8x1_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST8x1_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST8x1_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST8x1_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST8x1_UB(q6, src);

    return 0;
  }
}

void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);

  early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
                                       &filter48[0], src, pitch, b_limit_ptr,
                                       limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
                                   &filter48[0]);

    if (0 == early_exit) {
      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
    }
  }
}

int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src_org -= 2;
    ST4x8_UB(vec2, vec3, src_org, pitch);
    src_org += 8 * pitch;
    ST4x8_UB(vec4, vec5, src_org, pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
               zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
               p3_r_in, p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    /* p6 */
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);

    return 0;
  }
}

void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
                                  const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);

  early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
                                        &filter48[0], src, pitch, b_limit_ptr,
                                        limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
                                    &filter48[0]);

    if (0 == early_exit) {
      transpose_16x16(transposed_input, 16, (src - 8), pitch);
    }
  }
}