/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_dsp/mips/loopfilter_msa.h"

/*
 * VP9 8-pixel horizontal loop filter (MIPS MSA).
 *
 * Filters one horizontal edge: 4 rows above (p3..p0) and 4 rows below
 * (q0..q3) are loaded, and up to 3 pixels on each side of the edge may be
 * modified. LPF_MASK_HEV / VP9_FLAT4 / VP9_LPF_FILTER4_4W / VP9_FILTER8 are
 * macros from loopfilter_msa.h (not visible here); per their use below they
 * compute the filter mask, high-edge-variance flag, flatness flag, and the
 * 4-tap / 8-tap filter outputs respectively.
 *
 * src        - pointer to the first row BELOW the edge (q0 row).
 * pitch      - stride between rows, in bytes.
 * b_limit_ptr/limit_ptr/thresh_ptr - scalar filter thresholds, each
 *              broadcast to all vector lanes with __msa_fill_b.
 */
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
  v16i8 zero = { 0 };

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* only the low 8 lanes carry pixels here; clear the high half of "flat"
     so the branch test below considers just the 8 active columns */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* no flat columns: only the 4-tap filter output (p1..q1) is stored */
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
    /* widen bytes to 16-bit for the 8-tap filter arithmetic */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* store pixel values: bmnz selects the 8-tap result where "flat" is set,
       the 4-tap result (or original pixel for p2/q2) elsewhere */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

    src -= 3 * pitch;

    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
    src += (4 * pitch);
    SD(q1_d, src);
    src += pitch;
    SD(q2_d, src);
  }
}

/*
 * Dual (16-pixel-wide) variant of vpx_lpf_horizontal_8_msa: filters two
 * adjacent 8-pixel edges in one pass. The two sets of thresholds
 * (b_limit0/limit0/thresh0 and b_limit1/limit1/thresh1) are interleaved
 * into the low/high halves of each threshold vector via __msa_ilvr_d, so
 * lanes 0-7 use set 0 and lanes 8-15 use set 1.
 */
void vpx_lpf_horizontal_8_dual_msa(
    uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
    const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* pack per-edge thresholds: low half = set 0, high half = set 1 */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat columns: store only the 4-tap output */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    /* widen both halves to 16-bit and run the 8-tap filter on each */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-lane select of 8-tap vs 4-tap/original */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;

    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}

/*
 * VP9 8-pixel vertical loop filter (MIPS MSA).
 *
 * Filters one vertical edge: 8 rows of 8 pixels straddling the edge are
 * loaded, transposed so columns become vector lanes, filtered with the same
 * mask/flat/filter4/filter8 pipeline as the horizontal path, and the
 * modified pixels are transposed back and stored with narrow ST4x4/ST2x4
 * stores.
 *
 * src - pointer to the first pixel to the RIGHT of the edge (q0 column).
 */
void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4;

  /* load vector elements */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* rows -> columns so the horizontal filter kernels apply to the edge */
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
                     q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* clear the unused high half of "flat" before the branch test */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* Store 4 pixels p1 - q1: re-interleave back to row order first */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-lane select of 8-tap vs 4-tap/original */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* Store 6 pixels p2 - q2: interleave back to row order, then store
       4 bytes (p2..q0) plus 2 bytes (q1,q2) per row */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src -= 3;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 4, src + 4, pitch);
  }
}

/*
 * Dual (16-row) variant of vpx_lpf_vertical_8_msa: filters two stacked
 * 8-row vertical edges in one pass. Loads a 16x8 block, transposes it to
 * 8x16, and packs the two threshold sets into the low/high halves of each
 * threshold vector (lanes 0-7 use set 0, lanes 8-15 use set 1).
 */
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0, const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

  /* p0..p3/q0..q3 here temporarily hold raw rows until the transpose below */
  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

  /* pack per-edge thresholds (vec0 reused as a scratch byte vector) */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat columns: transpose the 4-tap output back and store 4x16 */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    /* widen both halves to 16-bit and run the 8-tap filter on each */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-lane select of 8-tap vs 4-tap/original */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* transpose the 6 modified columns (p2..q2) back to row order */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    /* store 4 bytes (p2..q0) plus 2 bytes (q1,q2) per row, 16 rows total */
    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}