1 /* 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <stdlib.h> 12 #include "vp8_rtcd.h" 13 #include "vp8/common/onyxc_int.h" 14 15 #if HAVE_DSPR2 16 typedef unsigned char uc; 17 18 /* prefetch data for load */ 19 inline void prefetch_load_lf(unsigned char *src) { 20 __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); 21 } 22 23 /* prefetch data for store */ 24 inline void prefetch_store_lf(unsigned char *dst) { 25 __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); 26 } 27 28 /* processing 4 pixels at the same time 29 * compute hev and mask in the same function 30 */ 31 static __inline void vp8_filter_mask_vec_mips( 32 uint32_t limit, uint32_t flimit, uint32_t p1, uint32_t p0, uint32_t p3, 33 uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3, 34 uint32_t thresh, uint32_t *hev, uint32_t *mask) { 35 uint32_t c, r, r3, r_k; 36 uint32_t s1, s2, s3; 37 uint32_t ones = 0xFFFFFFFF; 38 uint32_t hev1; 39 40 __asm__ __volatile__( 41 /* mask |= (abs(p3 - p2) > limit) */ 42 "subu_s.qb %[c], %[p3], %[p2] \n\t" 43 "subu_s.qb %[r_k], %[p2], %[p3] \n\t" 44 "or %[r_k], %[r_k], %[c] \n\t" 45 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" 46 "or %[r], $0, %[c] \n\t" 47 48 /* mask |= (abs(p2 - p1) > limit) */ 49 "subu_s.qb %[c], %[p2], %[p1] \n\t" 50 "subu_s.qb %[r_k], %[p1], %[p2] \n\t" 51 "or %[r_k], %[r_k], %[c] \n\t" 52 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" 53 "or %[r], %[r], %[c] \n\t" 54 55 /* mask |= (abs(p1 - p0) > limit) 56 * hev |= (abs(p1 - p0) > thresh) 57 */ 58 "subu_s.qb %[c], %[p1], %[p0] \n\t" 59 "subu_s.qb %[r_k], %[p0], %[p1] \n\t" 60 "or %[r_k], %[r_k], %[c] \n\t" 61 "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" 62 "or %[r3], $0, %[c] \n\t" 63 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" 64 "or %[r], %[r], %[c] \n\t" 65 66 /* mask |= (abs(q1 - q0) > limit) 67 * hev |= (abs(q1 - q0) > thresh) 68 */ 69 "subu_s.qb %[c], %[q1], %[q0] \n\t" 70 "subu_s.qb %[r_k], %[q0], %[q1] \n\t" 71 "or %[r_k], %[r_k], %[c] \n\t" 72 "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" 73 "or %[r3], %[r3], %[c] \n\t" 74 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" 75 "or %[r], %[r], %[c] \n\t" 76 77 /* mask |= (abs(q2 - q1) > limit) */ 78 "subu_s.qb %[c], %[q2], %[q1] \n\t" 79 "subu_s.qb %[r_k], %[q1], %[q2] \n\t" 80 "or %[r_k], %[r_k], %[c] \n\t" 81 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" 82 "or %[r], %[r], %[c] \n\t" 83 "sll %[r3], %[r3], 24 \n\t" 84 85 /* mask |= (abs(q3 - q2) > limit) */ 86 "subu_s.qb %[c], %[q3], %[q2] \n\t" 87 "subu_s.qb %[r_k], %[q2], %[q3] \n\t" 88 "or %[r_k], %[r_k], %[c] \n\t" 89 "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" 90 "or %[r], %[r], %[c] \n\t" 91 92 : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) 93 : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), 94 [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), 95 [thresh] "r"(thresh)); 96 97 __asm__ __volatile__( 98 /* abs(p0 - q0) */ 99 "subu_s.qb %[c], %[p0], %[q0] \n\t" 100 "subu_s.qb %[r_k], %[q0], %[p0] \n\t" 101 "wrdsp %[r3] \n\t" 102 "or %[s1], %[r_k], %[c] \n\t" 103 104 /* abs(p1 - q1) */ 105 "subu_s.qb %[c], %[p1], %[q1] \n\t" 106 "addu_s.qb %[s3], %[s1], %[s1] \n\t" 107 "pick.qb 
%[hev1], %[ones], $0 \n\t" 108 "subu_s.qb %[r_k], %[q1], %[p1] \n\t" 109 "or %[s2], %[r_k], %[c] \n\t" 110 111 /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ 112 "shrl.qb %[s2], %[s2], 1 \n\t" 113 "addu_s.qb %[s1], %[s2], %[s3] \n\t" 114 "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" 115 "or %[r], %[r], %[c] \n\t" 116 "sll %[r], %[r], 24 \n\t" 117 118 "wrdsp %[r] \n\t" 119 "pick.qb %[s2], $0, %[ones] \n\t" 120 121 : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), 122 [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) 123 : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), 124 [ones] "r"(ones), [flimit] "r"(flimit)); 125 126 *hev = hev1; 127 *mask = s2; 128 } 129 130 /* inputs & outputs are quad-byte vectors */ 131 static __inline void vp8_filter_mips(uint32_t mask, uint32_t hev, uint32_t *ps1, 132 uint32_t *ps0, uint32_t *qs0, 133 uint32_t *qs1) { 134 int32_t vp8_filter_l, vp8_filter_r; 135 int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; 136 int32_t subr_r, subr_l; 137 uint32_t t1, t2, HWM, t3; 138 uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; 139 140 int32_t vps1, vps0, vqs0, vqs1; 141 int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; 142 uint32_t N128; 143 144 N128 = 0x80808080; 145 t1 = 0x03000300; 146 t2 = 0x04000400; 147 t3 = 0x01000100; 148 HWM = 0xFF00FF00; 149 150 vps0 = (*ps0) ^ N128; 151 vps1 = (*ps1) ^ N128; 152 vqs0 = (*qs0) ^ N128; 153 vqs1 = (*qs1) ^ N128; 154 155 /* use halfword pairs instead quad-bytes because of accuracy */ 156 vps0_l = vps0 & HWM; 157 vps0_r = vps0 << 8; 158 vps0_r = vps0_r & HWM; 159 160 vps1_l = vps1 & HWM; 161 vps1_r = vps1 << 8; 162 vps1_r = vps1_r & HWM; 163 164 vqs0_l = vqs0 & HWM; 165 vqs0_r = vqs0 << 8; 166 vqs0_r = vqs0_r & HWM; 167 168 vqs1_l = vqs1 & HWM; 169 vqs1_r = vqs1 << 8; 170 vqs1_r = vqs1_r & HWM; 171 172 mask_l = mask & HWM; 173 mask_r = mask << 8; 174 mask_r = mask_r & HWM; 175 176 hev_l = hev & HWM; 177 hev_r = hev << 8; 178 hev_r = hev_r & HWM; 179 180 __asm__ __volatile__( 181 /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */ 182 "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t" 183 "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t" 184 185 /* qs0 - ps0 */ 186 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" 187 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" 188 189 /* vp8_filter &= hev; */ 190 "and %[vp8_filter_l], %[vp8_filter_l], %[hev_l] \n\t" 191 "and %[vp8_filter_r], %[vp8_filter_r], %[hev_r] \n\t" 192 193 /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */ 194 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" 195 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" 196 "xor %[invhev_l], %[hev_l], %[HWM] \n\t" 197 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" 198 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" 199 "xor %[invhev_r], %[hev_r], %[HWM] \n\t" 200 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" 201 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" 202 203 /* vp8_filter &= mask; */ 204 "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t" 205 "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t" 206 207 : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=&r"(vp8_filter_r), 208 [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), 209 [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) 210 211 : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), 212 [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), 
213 [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), 214 [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), 215 [HWM] "r"(HWM)); 216 217 /* save bottom 3 bits so that we round one side +4 and the other +3 */ 218 __asm__ __volatile__( 219 /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >>= 3; */ 220 "addq_s.ph %[Filter1_l], %[vp8_filter_l], %[t2] \n\t" 221 "addq_s.ph %[Filter1_r], %[vp8_filter_r], %[t2] \n\t" 222 223 /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >>= 3; */ 224 "addq_s.ph %[Filter2_l], %[vp8_filter_l], %[t1] \n\t" 225 "addq_s.ph %[Filter2_r], %[vp8_filter_r], %[t1] \n\t" 226 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" 227 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" 228 229 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" 230 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" 231 232 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" 233 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" 234 235 /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ 236 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" 237 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" 238 239 /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ 240 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" 241 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" 242 243 : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), 244 [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), 245 [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), 246 [vqs0_r] "+r"(vqs0_r) 247 248 : [t1] "r"(t1), [t2] "r"(t2), [vp8_filter_l] "r"(vp8_filter_l), 249 [vp8_filter_r] "r"(vp8_filter_r), [HWM] "r"(HWM)); 250 251 __asm__ __volatile__( 252 /* (vp8_filter += 1) >>= 1 */ 253 "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" 254 "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" 255 256 /* vp8_filter &= ~hev; */ 257 "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" 258 "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" 259 260 /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */ 261 "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" 262 "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" 263 264 /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */ 265 "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" 266 "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" 267 268 : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), 269 [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), 270 [vqs1_r] "+r"(vqs1_r) 271 272 : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); 273 274 /* Create quad-bytes from halfword pairs */ 275 vqs0_l = vqs0_l & HWM; 276 vqs1_l = vqs1_l & HWM; 277 vps0_l = vps0_l & HWM; 278 vps1_l = vps1_l & HWM; 279 280 __asm__ __volatile__( 281 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" 282 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" 283 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" 284 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" 285 286 : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), 287 [vqs0_r] "+r"(vqs0_r) 288 :); 289 290 vqs0 = vqs0_l | vqs0_r; 291 vqs1 = vqs1_l | vqs1_r; 292 vps0 = vps0_l | vps0_r; 293 vps1 = vps1_l | vps1_r; 294 295 *ps0 = vps0 ^ N128; 296 *ps1 = vps1 ^ N128; 297 *qs0 = vqs0 ^ N128; 298 *qs1 = vqs1 ^ N128; 299 } 300 301 void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p, 302 unsigned int flimit, 303 unsigned int limit, 304 unsigned int thresh, int count) { 305 uint32_t mask; 306 uint32_t hev; 307 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 308 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; 309 
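  /* Illustrative scalar reference for the vector helpers used below.  This
   * is a sketch only (not part of the build), assuming a signed_char_clamp()
   * helper like vp8_signed_char_clamp() in the generic C loop filter.
   * vp8_filter_mips() applies the same per-pixel logic to four pixels at
   * once: each byte is XORed with 0x80 to make it signed, and every 32-bit
   * word is split into two halfword-pair registers so that the saturating
   * halfword instructions (addq_s.ph / subq_s.ph) reproduce signed-char
   * saturation without cross-lane carries.
   *
   *   a = signed_char_clamp(ps1 - qs1);
   *   if (!hev) a = 0;                           corresponds to a &= hev
   *   a = signed_char_clamp(a + 3 * (qs0 - ps0));
   *   if (!mask) a = 0;                          corresponds to a &= mask
   *   Filter1 = signed_char_clamp(a + 4) >> 3;
   *   Filter2 = signed_char_clamp(a + 3) >> 3;
   *   qs0 = signed_char_clamp(qs0 - Filter1);
   *   ps0 = signed_char_clamp(ps0 + Filter2);
   *   a = (Filter1 + 1) >> 1;
   *   if (hev) a = 0;                            outer taps only when !hev
   *   qs1 = signed_char_clamp(qs1 - a);
   *   ps1 = signed_char_clamp(ps1 + a);
   */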
(void)count; 310 311 mask = 0; 312 hev = 0; 313 p1 = 0; 314 p2 = 0; 315 p3 = 0; 316 p4 = 0; 317 318 /* prefetch data for store */ 319 prefetch_store_lf(s); 320 321 /* loop filter designed to work using chars so that we can make maximum use 322 * of 8 bit simd instructions. 323 */ 324 325 sm1 = s - (p << 2); 326 s0 = s - p - p - p; 327 s1 = s - p - p; 328 s2 = s - p; 329 s3 = s; 330 s4 = s + p; 331 s5 = s + p + p; 332 s6 = s + p + p + p; 333 334 /* load quad-byte vectors 335 * memory is 4 byte aligned 336 */ 337 p1 = *((uint32_t *)(s1)); 338 p2 = *((uint32_t *)(s2)); 339 p3 = *((uint32_t *)(s3)); 340 p4 = *((uint32_t *)(s4)); 341 342 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 343 * mask will be zero and filtering is not needed 344 */ 345 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 346 pm1 = *((uint32_t *)(sm1)); 347 p0 = *((uint32_t *)(s0)); 348 p5 = *((uint32_t *)(s5)); 349 p6 = *((uint32_t *)(s6)); 350 351 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 352 thresh, &hev, &mask); 353 354 /* if mask == 0 do filtering is not needed */ 355 if (mask) { 356 /* filtering */ 357 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 358 359 /* unpack processed 4x4 neighborhood */ 360 *((uint32_t *)s1) = p1; 361 *((uint32_t *)s2) = p2; 362 *((uint32_t *)s3) = p3; 363 *((uint32_t *)s4) = p4; 364 } 365 } 366 367 sm1 += 4; 368 s0 += 4; 369 s1 += 4; 370 s2 += 4; 371 s3 += 4; 372 s4 += 4; 373 s5 += 4; 374 s6 += 4; 375 376 /* load quad-byte vectors 377 * memory is 4 byte aligned 378 */ 379 p1 = *((uint32_t *)(s1)); 380 p2 = *((uint32_t *)(s2)); 381 p3 = *((uint32_t *)(s3)); 382 p4 = *((uint32_t *)(s4)); 383 384 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 385 * mask will be zero and filtering is not needed 386 */ 387 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 388 pm1 = *((uint32_t *)(sm1)); 389 p0 = *((uint32_t *)(s0)); 390 p5 = *((uint32_t *)(s5)); 391 p6 = *((uint32_t *)(s6)); 392 393 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 394 thresh, &hev, &mask); 395 396 /* if mask == 0 do filtering is not needed */ 397 if (mask) { 398 /* filtering */ 399 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 400 401 /* unpack processed 4x4 neighborhood */ 402 *((uint32_t *)s1) = p1; 403 *((uint32_t *)s2) = p2; 404 *((uint32_t *)s3) = p3; 405 *((uint32_t *)s4) = p4; 406 } 407 } 408 409 sm1 += 4; 410 s0 += 4; 411 s1 += 4; 412 s2 += 4; 413 s3 += 4; 414 s4 += 4; 415 s5 += 4; 416 s6 += 4; 417 418 /* load quad-byte vectors 419 * memory is 4 byte aligned 420 */ 421 p1 = *((uint32_t *)(s1)); 422 p2 = *((uint32_t *)(s2)); 423 p3 = *((uint32_t *)(s3)); 424 p4 = *((uint32_t *)(s4)); 425 426 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 427 * mask will be zero and filtering is not needed 428 */ 429 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 430 pm1 = *((uint32_t *)(sm1)); 431 p0 = *((uint32_t *)(s0)); 432 p5 = *((uint32_t *)(s5)); 433 p6 = *((uint32_t *)(s6)); 434 435 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 436 thresh, &hev, &mask); 437 438 /* if mask == 0 do filtering is not needed */ 439 if (mask) { 440 /* filtering */ 441 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 442 443 /* unpack processed 4x4 neighborhood */ 444 *((uint32_t *)s1) = p1; 445 *((uint32_t *)s2) = p2; 446 *((uint32_t *)s3) = p3; 447 *((uint32_t *)s4) = p4; 448 } 449 } 450 451 sm1 += 4; 452 s0 += 4; 453 s1 += 4; 454 s2 += 4; 455 s3 += 4; 456 s4 += 4; 457 s5 += 4; 458 s6 += 4; 459 460 /* load quad-byte vectors 461 * memory is 4 byte aligned 462 */ 463 p1 = *((uint32_t *)(s1)); 464 
p2 = *((uint32_t *)(s2)); 465 p3 = *((uint32_t *)(s3)); 466 p4 = *((uint32_t *)(s4)); 467 468 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 469 * mask will be zero and filtering is not needed 470 */ 471 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 472 pm1 = *((uint32_t *)(sm1)); 473 p0 = *((uint32_t *)(s0)); 474 p5 = *((uint32_t *)(s5)); 475 p6 = *((uint32_t *)(s6)); 476 477 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 478 thresh, &hev, &mask); 479 480 /* if mask == 0 do filtering is not needed */ 481 if (mask) { 482 /* filtering */ 483 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 484 485 /* unpack processed 4x4 neighborhood */ 486 *((uint32_t *)s1) = p1; 487 *((uint32_t *)s2) = p2; 488 *((uint32_t *)s3) = p3; 489 *((uint32_t *)s4) = p4; 490 } 491 } 492 } 493 494 void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, 495 unsigned int flimit, 496 unsigned int limit, 497 unsigned int thresh, int count) { 498 uint32_t mask; 499 uint32_t hev; 500 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 501 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; 502 (void)count; 503 504 mask = 0; 505 hev = 0; 506 p1 = 0; 507 p2 = 0; 508 p3 = 0; 509 p4 = 0; 510 511 /* loop filter designed to work using chars so that we can make maximum use 512 * of 8 bit simd instructions. 513 */ 514 515 sm1 = s - (p << 2); 516 s0 = s - p - p - p; 517 s1 = s - p - p; 518 s2 = s - p; 519 s3 = s; 520 s4 = s + p; 521 s5 = s + p + p; 522 s6 = s + p + p + p; 523 524 /* load quad-byte vectors 525 * memory is 4 byte aligned 526 */ 527 p1 = *((uint32_t *)(s1)); 528 p2 = *((uint32_t *)(s2)); 529 p3 = *((uint32_t *)(s3)); 530 p4 = *((uint32_t *)(s4)); 531 532 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 533 * mask will be zero and filtering is not needed 534 */ 535 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 536 pm1 = *((uint32_t *)(sm1)); 537 p0 = *((uint32_t *)(s0)); 538 p5 = *((uint32_t *)(s5)); 539 p6 = *((uint32_t *)(s6)); 540 541 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 542 thresh, &hev, &mask); 543 544 /* if mask == 0 do filtering is not needed */ 545 if (mask) { 546 /* filtering */ 547 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 548 549 /* unpack processed 4x4 neighborhood */ 550 *((uint32_t *)s1) = p1; 551 *((uint32_t *)s2) = p2; 552 *((uint32_t *)s3) = p3; 553 *((uint32_t *)s4) = p4; 554 } 555 } 556 557 sm1 += 4; 558 s0 += 4; 559 s1 += 4; 560 s2 += 4; 561 s3 += 4; 562 s4 += 4; 563 s5 += 4; 564 s6 += 4; 565 566 /* load quad-byte vectors 567 * memory is 4 byte aligned 568 */ 569 p1 = *((uint32_t *)(s1)); 570 p2 = *((uint32_t *)(s2)); 571 p3 = *((uint32_t *)(s3)); 572 p4 = *((uint32_t *)(s4)); 573 574 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 575 * mask will be zero and filtering is not needed 576 */ 577 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 578 pm1 = *((uint32_t *)(sm1)); 579 p0 = *((uint32_t *)(s0)); 580 p5 = *((uint32_t *)(s5)); 581 p6 = *((uint32_t *)(s6)); 582 583 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 584 thresh, &hev, &mask); 585 586 /* if mask == 0 do filtering is not needed */ 587 if (mask) { 588 /* filtering */ 589 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 590 591 /* unpack processed 4x4 neighborhood */ 592 *((uint32_t *)s1) = p1; 593 *((uint32_t *)s2) = p2; 594 *((uint32_t *)s3) = p3; 595 *((uint32_t *)s4) = p4; 596 } 597 } 598 } 599 600 void vp8_loop_filter_vertical_edge_mips(unsigned char *s, int p, 601 const unsigned int flimit, 602 const unsigned int limit, 603 const unsigned int thresh, int 
count) { 604 int i; 605 uint32_t mask, hev; 606 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 607 unsigned char *s1, *s2, *s3, *s4; 608 uint32_t prim1, prim2, sec3, sec4, prim3, prim4; 609 610 hev = 0; 611 mask = 0; 612 i = 0; 613 pm1 = 0; 614 p0 = 0; 615 p1 = 0; 616 p2 = 0; 617 p3 = 0; 618 p4 = 0; 619 p5 = 0; 620 p6 = 0; 621 622 /* loop filter designed to work using chars so that we can make maximum use 623 * of 8 bit simd instructions. 624 */ 625 626 /* apply filter on 4 pixesl at the same time */ 627 do { 628 /* prefetch data for store */ 629 prefetch_store_lf(s + p); 630 631 s1 = s; 632 s2 = s + p; 633 s3 = s2 + p; 634 s4 = s3 + p; 635 s = s4 + p; 636 637 /* load quad-byte vectors 638 * memory is 4 byte aligned 639 */ 640 p2 = *((uint32_t *)(s1 - 4)); 641 p6 = *((uint32_t *)(s1)); 642 p1 = *((uint32_t *)(s2 - 4)); 643 p5 = *((uint32_t *)(s2)); 644 p0 = *((uint32_t *)(s3 - 4)); 645 p4 = *((uint32_t *)(s3)); 646 pm1 = *((uint32_t *)(s4 - 4)); 647 p3 = *((uint32_t *)(s4)); 648 649 /* transpose pm1, p0, p1, p2 */ 650 __asm__ __volatile__( 651 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" 652 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" 653 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" 654 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" 655 656 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" 657 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" 658 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 659 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 660 661 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" 662 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" 663 "append %[p1], %[sec3], 16 \n\t" 664 "append %[pm1], %[sec4], 16 \n\t" 665 666 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 667 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), 668 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 669 :); 670 671 /* transpose p3, p4, p5, p6 */ 672 __asm__ __volatile__( 673 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" 674 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" 675 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" 676 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" 677 678 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" 679 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" 680 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 681 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 682 683 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" 684 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" 685 "append %[p5], %[sec3], 16 \n\t" 686 "append %[p3], %[sec4], 16 \n\t" 687 688 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 689 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), 690 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 691 :); 692 693 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 694 * mask will be zero and filtering is not needed 695 */ 696 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 697 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 698 thresh, &hev, &mask); 699 700 /* if mask == 0 do filtering is not needed */ 701 if (mask) { 702 /* filtering */ 703 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 704 705 /* unpack processed 4x4 neighborhood 706 * don't use transpose on output data 707 * because memory isn't aligned 708 */ 709 __asm__ __volatile__( 710 "sb %[p4], 1(%[s4]) \n\t" 711 "sb %[p3], 0(%[s4]) \n\t" 712 "sb %[p2], -1(%[s4]) \n\t" 713 "sb %[p1], -2(%[s4]) \n\t" 714 : 715 : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), 716 [p1] "r"(p1)); 717 718 __asm__ __volatile__( 719 "srl %[p4], %[p4], 8 \n\t" 720 "srl %[p3], %[p3], 8 \n\t" 721 "srl %[p2], %[p2], 8 
\n\t" 722 "srl %[p1], %[p1], 8 \n\t" 723 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 724 :); 725 726 __asm__ __volatile__( 727 "sb %[p4], 1(%[s3]) \n\t" 728 "sb %[p3], 0(%[s3]) \n\t" 729 "sb %[p2], -1(%[s3]) \n\t" 730 "sb %[p1], -2(%[s3]) \n\t" 731 : [p1] "+r"(p1) 732 : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2)); 733 734 __asm__ __volatile__( 735 "srl %[p4], %[p4], 8 \n\t" 736 "srl %[p3], %[p3], 8 \n\t" 737 "srl %[p2], %[p2], 8 \n\t" 738 "srl %[p1], %[p1], 8 \n\t" 739 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 740 :); 741 742 __asm__ __volatile__( 743 "sb %[p4], 1(%[s2]) \n\t" 744 "sb %[p3], 0(%[s2]) \n\t" 745 "sb %[p2], -1(%[s2]) \n\t" 746 "sb %[p1], -2(%[s2]) \n\t" 747 : 748 : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), 749 [p1] "r"(p1)); 750 751 __asm__ __volatile__( 752 "srl %[p4], %[p4], 8 \n\t" 753 "srl %[p3], %[p3], 8 \n\t" 754 "srl %[p2], %[p2], 8 \n\t" 755 "srl %[p1], %[p1], 8 \n\t" 756 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 757 :); 758 759 __asm__ __volatile__( 760 "sb %[p4], 1(%[s1]) \n\t" 761 "sb %[p3], 0(%[s1]) \n\t" 762 "sb %[p2], -1(%[s1]) \n\t" 763 "sb %[p1], -2(%[s1]) \n\t" 764 : 765 : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), 766 [p1] "r"(p1)); 767 } 768 } 769 770 s1 = s; 771 s2 = s + p; 772 s3 = s2 + p; 773 s4 = s3 + p; 774 s = s4 + p; 775 776 /* load quad-byte vectors 777 * memory is 4 byte aligned 778 */ 779 p2 = *((uint32_t *)(s1 - 4)); 780 p6 = *((uint32_t *)(s1)); 781 p1 = *((uint32_t *)(s2 - 4)); 782 p5 = *((uint32_t *)(s2)); 783 p0 = *((uint32_t *)(s3 - 4)); 784 p4 = *((uint32_t *)(s3)); 785 pm1 = *((uint32_t *)(s4 - 4)); 786 p3 = *((uint32_t *)(s4)); 787 788 /* transpose pm1, p0, p1, p2 */ 789 __asm__ __volatile__( 790 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" 791 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" 792 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" 793 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" 794 795 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" 796 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" 797 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 798 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 799 800 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" 801 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" 802 "append %[p1], %[sec3], 16 \n\t" 803 "append %[pm1], %[sec4], 16 \n\t" 804 805 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 806 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), 807 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 808 :); 809 810 /* transpose p3, p4, p5, p6 */ 811 __asm__ __volatile__( 812 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" 813 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" 814 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" 815 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" 816 817 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" 818 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" 819 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 820 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 821 822 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" 823 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" 824 "append %[p5], %[sec3], 16 \n\t" 825 "append %[p3], %[sec4], 16 \n\t" 826 827 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 828 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), 829 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 830 :); 831 832 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 833 * mask will be zero and filtering is not needed 834 */ 835 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 836 
vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 837 thresh, &hev, &mask); 838 839 /* if mask == 0 do filtering is not needed */ 840 if (mask) { 841 /* filtering */ 842 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 843 844 /* unpack processed 4x4 neighborhood 845 * don't use transpose on output data 846 * because memory isn't aligned 847 */ 848 __asm__ __volatile__( 849 "sb %[p4], 1(%[s4]) \n\t" 850 "sb %[p3], 0(%[s4]) \n\t" 851 "sb %[p2], -1(%[s4]) \n\t" 852 "sb %[p1], -2(%[s4]) \n\t" 853 : 854 : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), 855 [p1] "r"(p1)); 856 857 __asm__ __volatile__( 858 "srl %[p4], %[p4], 8 \n\t" 859 "srl %[p3], %[p3], 8 \n\t" 860 "srl %[p2], %[p2], 8 \n\t" 861 "srl %[p1], %[p1], 8 \n\t" 862 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 863 :); 864 865 __asm__ __volatile__( 866 "sb %[p4], 1(%[s3]) \n\t" 867 "sb %[p3], 0(%[s3]) \n\t" 868 "sb %[p2], -1(%[s3]) \n\t" 869 "sb %[p1], -2(%[s3]) \n\t" 870 : [p1] "+r"(p1) 871 : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2)); 872 873 __asm__ __volatile__( 874 "srl %[p4], %[p4], 8 \n\t" 875 "srl %[p3], %[p3], 8 \n\t" 876 "srl %[p2], %[p2], 8 \n\t" 877 "srl %[p1], %[p1], 8 \n\t" 878 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 879 :); 880 881 __asm__ __volatile__( 882 "sb %[p4], 1(%[s2]) \n\t" 883 "sb %[p3], 0(%[s2]) \n\t" 884 "sb %[p2], -1(%[s2]) \n\t" 885 "sb %[p1], -2(%[s2]) \n\t" 886 : 887 : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), 888 [p1] "r"(p1)); 889 890 __asm__ __volatile__( 891 "srl %[p4], %[p4], 8 \n\t" 892 "srl %[p3], %[p3], 8 \n\t" 893 "srl %[p2], %[p2], 8 \n\t" 894 "srl %[p1], %[p1], 8 \n\t" 895 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 896 :); 897 898 __asm__ __volatile__( 899 "sb %[p4], 1(%[s1]) \n\t" 900 "sb %[p3], 0(%[s1]) \n\t" 901 "sb %[p2], -1(%[s1]) \n\t" 902 "sb %[p1], -2(%[s1]) \n\t" 903 : 904 : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), 905 [p1] "r"(p1)); 906 } 907 } 908 909 i += 8; 910 } 911 912 while (i < count); 913 } 914 915 void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, 916 unsigned int flimit, 917 unsigned int limit, 918 unsigned int thresh, int count) { 919 uint32_t mask, hev; 920 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 921 unsigned char *s1, *s2, *s3, *s4; 922 uint32_t prim1, prim2, sec3, sec4, prim3, prim4; 923 (void)count; 924 925 /* loop filter designed to work using chars so that we can make maximum use 926 * of 8 bit simd instructions. 
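   *
   * Note on the vertical-edge variants (illustrative description): four rows
   * are loaded as 32-bit words and transposed in registers so that each
   * resulting word holds one column of the 4x4 block, i.e. four vertically
   * adjacent pixels end up in a single quad-byte lane and the same
   * quad-byte filter as the horizontal case can be reused.  The transpose
   * is built from DSPR2 pack instructions: precrq.qb.ph gathers the upper
   * byte of every halfword from two rows, precr.qb.ph the lower byte, and
   * precrq.ph.w plus append regroup the halfwords into the final columns.
   * After filtering, results are written back with single byte stores (sb)
   * because the transposed columns are not 4-byte aligned in memory.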
927 */ 928 929 /* apply filter on 4 pixesl at the same time */ 930 931 s1 = s; 932 s2 = s + p; 933 s3 = s2 + p; 934 s4 = s3 + p; 935 936 /* load quad-byte vectors 937 * memory is 4 byte aligned 938 */ 939 p2 = *((uint32_t *)(s1 - 4)); 940 p6 = *((uint32_t *)(s1)); 941 p1 = *((uint32_t *)(s2 - 4)); 942 p5 = *((uint32_t *)(s2)); 943 p0 = *((uint32_t *)(s3 - 4)); 944 p4 = *((uint32_t *)(s3)); 945 pm1 = *((uint32_t *)(s4 - 4)); 946 p3 = *((uint32_t *)(s4)); 947 948 /* transpose pm1, p0, p1, p2 */ 949 __asm__ __volatile__( 950 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" 951 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" 952 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" 953 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" 954 955 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" 956 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" 957 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 958 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 959 960 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" 961 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" 962 "append %[p1], %[sec3], 16 \n\t" 963 "append %[pm1], %[sec4], 16 \n\t" 964 965 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 966 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), 967 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 968 :); 969 970 /* transpose p3, p4, p5, p6 */ 971 __asm__ __volatile__( 972 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" 973 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" 974 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" 975 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" 976 977 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" 978 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" 979 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 980 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 981 982 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" 983 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" 984 "append %[p5], %[sec3], 16 \n\t" 985 "append %[p3], %[sec4], 16 \n\t" 986 987 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 988 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), 989 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 990 :); 991 992 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 993 * mask will be zero and filtering is not needed 994 */ 995 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 996 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 997 thresh, &hev, &mask); 998 999 /* if mask == 0 do filtering is not needed */ 1000 if (mask) { 1001 /* filtering */ 1002 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 1003 1004 /* unpack processed 4x4 neighborhood 1005 * don't use transpose on output data 1006 * because memory isn't aligned 1007 */ 1008 __asm__ __volatile__( 1009 "sb %[p4], 1(%[s4]) \n\t" 1010 "sb %[p3], 0(%[s4]) \n\t" 1011 "sb %[p2], -1(%[s4]) \n\t" 1012 "sb %[p1], -2(%[s4]) \n\t" 1013 : 1014 : 1015 [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1)); 1016 1017 __asm__ __volatile__( 1018 "srl %[p4], %[p4], 8 \n\t" 1019 "srl %[p3], %[p3], 8 \n\t" 1020 "srl %[p2], %[p2], 8 \n\t" 1021 "srl %[p1], %[p1], 8 \n\t" 1022 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 1023 :); 1024 1025 __asm__ __volatile__( 1026 "sb %[p4], 1(%[s3]) \n\t" 1027 "sb %[p3], 0(%[s3]) \n\t" 1028 "sb %[p2], -1(%[s3]) \n\t" 1029 "sb %[p1], -2(%[s3]) \n\t" 1030 : [p1] "+r"(p1) 1031 : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2)); 1032 1033 __asm__ __volatile__( 1034 "srl %[p4], %[p4], 8 \n\t" 1035 "srl %[p3], %[p3], 8 \n\t" 1036 "srl %[p2], %[p2], 8 \n\t" 1037 "srl %[p1], %[p1], 
8 \n\t" 1038 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 1039 :); 1040 1041 __asm__ __volatile__( 1042 "sb %[p4], 1(%[s2]) \n\t" 1043 "sb %[p3], 0(%[s2]) \n\t" 1044 "sb %[p2], -1(%[s2]) \n\t" 1045 "sb %[p1], -2(%[s2]) \n\t" 1046 : 1047 : 1048 [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1)); 1049 1050 __asm__ __volatile__( 1051 "srl %[p4], %[p4], 8 \n\t" 1052 "srl %[p3], %[p3], 8 \n\t" 1053 "srl %[p2], %[p2], 8 \n\t" 1054 "srl %[p1], %[p1], 8 \n\t" 1055 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 1056 :); 1057 1058 __asm__ __volatile__( 1059 "sb %[p4], 1(%[s1]) \n\t" 1060 "sb %[p3], 0(%[s1]) \n\t" 1061 "sb %[p2], -1(%[s1]) \n\t" 1062 "sb %[p1], -2(%[s1]) \n\t" 1063 : 1064 : 1065 [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1)); 1066 } 1067 } 1068 1069 s1 = s4 + p; 1070 s2 = s1 + p; 1071 s3 = s2 + p; 1072 s4 = s3 + p; 1073 1074 /* load quad-byte vectors 1075 * memory is 4 byte aligned 1076 */ 1077 p2 = *((uint32_t *)(s1 - 4)); 1078 p6 = *((uint32_t *)(s1)); 1079 p1 = *((uint32_t *)(s2 - 4)); 1080 p5 = *((uint32_t *)(s2)); 1081 p0 = *((uint32_t *)(s3 - 4)); 1082 p4 = *((uint32_t *)(s3)); 1083 pm1 = *((uint32_t *)(s4 - 4)); 1084 p3 = *((uint32_t *)(s4)); 1085 1086 /* transpose pm1, p0, p1, p2 */ 1087 __asm__ __volatile__( 1088 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" 1089 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" 1090 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" 1091 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" 1092 1093 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" 1094 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" 1095 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 1096 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 1097 1098 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" 1099 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" 1100 "append %[p1], %[sec3], 16 \n\t" 1101 "append %[pm1], %[sec4], 16 \n\t" 1102 1103 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 1104 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), 1105 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 1106 :); 1107 1108 /* transpose p3, p4, p5, p6 */ 1109 __asm__ __volatile__( 1110 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" 1111 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" 1112 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" 1113 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" 1114 1115 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" 1116 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" 1117 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 1118 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 1119 1120 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" 1121 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" 1122 "append %[p5], %[sec3], 16 \n\t" 1123 "append %[p3], %[sec4], 16 \n\t" 1124 1125 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 1126 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), 1127 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 1128 :); 1129 1130 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 1131 * mask will be zero and filtering is not needed 1132 */ 1133 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 1134 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 1135 thresh, &hev, &mask); 1136 1137 /* if mask == 0 do filtering is not needed */ 1138 if (mask) { 1139 /* filtering */ 1140 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4); 1141 1142 /* unpack processed 4x4 neighborhood 1143 * don't use transpose on output data 1144 * because memory isn't aligned 1145 */ 1146 __asm__ __volatile__( 1147 
"sb %[p4], 1(%[s4]) \n\t" 1148 "sb %[p3], 0(%[s4]) \n\t" 1149 "sb %[p2], -1(%[s4]) \n\t" 1150 "sb %[p1], -2(%[s4]) \n\t" 1151 : 1152 : 1153 [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1)); 1154 1155 __asm__ __volatile__( 1156 "srl %[p4], %[p4], 8 \n\t" 1157 "srl %[p3], %[p3], 8 \n\t" 1158 "srl %[p2], %[p2], 8 \n\t" 1159 "srl %[p1], %[p1], 8 \n\t" 1160 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 1161 :); 1162 1163 __asm__ __volatile__( 1164 "sb %[p4], 1(%[s3]) \n\t" 1165 "sb %[p3], 0(%[s3]) \n\t" 1166 "sb %[p2], -1(%[s3]) \n\t" 1167 "sb %[p1], -2(%[s3]) \n\t" 1168 : [p1] "+r"(p1) 1169 : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2)); 1170 1171 __asm__ __volatile__( 1172 "srl %[p4], %[p4], 8 \n\t" 1173 "srl %[p3], %[p3], 8 \n\t" 1174 "srl %[p2], %[p2], 8 \n\t" 1175 "srl %[p1], %[p1], 8 \n\t" 1176 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 1177 :); 1178 1179 __asm__ __volatile__( 1180 "sb %[p4], 1(%[s2]) \n\t" 1181 "sb %[p3], 0(%[s2]) \n\t" 1182 "sb %[p2], -1(%[s2]) \n\t" 1183 "sb %[p1], -2(%[s2]) \n\t" 1184 : 1185 : 1186 [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1)); 1187 1188 __asm__ __volatile__( 1189 "srl %[p4], %[p4], 8 \n\t" 1190 "srl %[p3], %[p3], 8 \n\t" 1191 "srl %[p2], %[p2], 8 \n\t" 1192 "srl %[p1], %[p1], 8 \n\t" 1193 : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) 1194 :); 1195 1196 __asm__ __volatile__( 1197 "sb %[p4], 1(%[s1]) \n\t" 1198 "sb %[p3], 0(%[s1]) \n\t" 1199 "sb %[p2], -1(%[s1]) \n\t" 1200 "sb %[p1], -2(%[s1]) \n\t" 1201 : 1202 : 1203 [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1)); 1204 } 1205 } 1206 } 1207 1208 /* inputs & outputs are quad-byte vectors */ 1209 static __inline void vp8_mbfilter_mips(uint32_t mask, uint32_t hev, 1210 uint32_t *ps2, uint32_t *ps1, 1211 uint32_t *ps0, uint32_t *qs0, 1212 uint32_t *qs1, uint32_t *qs2) { 1213 int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2; 1214 int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l; 1215 int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r; 1216 uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, 1217 subr_r, subr_l; 1218 uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, 1219 invhev_r; 1220 uint32_t N128, R63; 1221 uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r; 1222 1223 R63 = 0x003F003F; 1224 HWM = 0xFF00FF00; 1225 N128 = 0x80808080; 1226 t1 = 0x03000300; 1227 t2 = 0x04000400; 1228 1229 vps0 = (*ps0) ^ N128; 1230 vps1 = (*ps1) ^ N128; 1231 vps2 = (*ps2) ^ N128; 1232 vqs0 = (*qs0) ^ N128; 1233 vqs1 = (*qs1) ^ N128; 1234 vqs2 = (*qs2) ^ N128; 1235 1236 /* use halfword pairs instead quad-bytes because of accuracy */ 1237 vps0_l = vps0 & HWM; 1238 vps0_r = vps0 << 8; 1239 vps0_r = vps0_r & HWM; 1240 1241 vqs0_l = vqs0 & HWM; 1242 vqs0_r = vqs0 << 8; 1243 vqs0_r = vqs0_r & HWM; 1244 1245 vps1_l = vps1 & HWM; 1246 vps1_r = vps1 << 8; 1247 vps1_r = vps1_r & HWM; 1248 1249 vqs1_l = vqs1 & HWM; 1250 vqs1_r = vqs1 << 8; 1251 vqs1_r = vqs1_r & HWM; 1252 1253 vqs2_l = vqs2 & HWM; 1254 vqs2_r = vqs2 << 8; 1255 vqs2_r = vqs2_r & HWM; 1256 1257 __asm__ __volatile__( 1258 /* qs0 - ps0 */ 1259 "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" 1260 "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" 1261 1262 /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */ 1263 "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t" 1264 "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t" 1265 1266 : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=r"(vp8_filter_r), 1267 
[subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r) 1268 : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), 1269 [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), 1270 [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r)); 1271 1272 vps2_l = vps2 & HWM; 1273 vps2_r = vps2 << 8; 1274 vps2_r = vps2_r & HWM; 1275 1276 /* add outer taps if we have high edge variance */ 1277 __asm__ __volatile__( 1278 /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */ 1279 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" 1280 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" 1281 "and %[mask_l], %[HWM], %[mask] \n\t" 1282 "sll %[mask_r], %[mask], 8 \n\t" 1283 "and %[mask_r], %[HWM], %[mask_r] \n\t" 1284 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" 1285 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" 1286 "and %[hev_l], %[HWM], %[hev] \n\t" 1287 "sll %[hev_r], %[hev], 8 \n\t" 1288 "and %[hev_r], %[HWM], %[hev_r] \n\t" 1289 "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t" 1290 "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t" 1291 1292 /* vp8_filter &= mask; */ 1293 "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t" 1294 "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t" 1295 1296 /* Filter2 = vp8_filter & hev; */ 1297 "and %[Filter2_l], %[vp8_filter_l], %[hev_l] \n\t" 1298 "and %[Filter2_r], %[vp8_filter_r], %[hev_r] \n\t" 1299 1300 : [vp8_filter_l] "+r"(vp8_filter_l), [vp8_filter_r] "+r"(vp8_filter_r), 1301 [hev_l] "=&r"(hev_l), [hev_r] "=&r"(hev_r), [mask_l] "=&r"(mask_l), 1302 [mask_r] "=&r"(mask_r), [Filter2_l] "=&r"(Filter2_l), 1303 [Filter2_r] "=&r"(Filter2_r) 1304 : [subr_l] "r"(subr_l), [subr_r] "r"(subr_r), [HWM] "r"(HWM), 1305 [hev] "r"(hev), [mask] "r"(mask)); 1306 1307 /* save bottom 3 bits so that we round one side +4 and the other +3 */ 1308 __asm__ __volatile__( 1309 /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */ 1310 "addq_s.ph %[Filter1_l], %[Filter2_l], %[t2] \n\t" 1311 "xor %[invhev_l], %[hev_l], %[HWM] \n\t" 1312 "addq_s.ph %[Filter1_r], %[Filter2_r], %[t2] \n\t" 1313 1314 /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */ 1315 "addq_s.ph %[Filter2_l], %[Filter2_l], %[t1] \n\t" 1316 "addq_s.ph %[Filter2_r], %[Filter2_r], %[t1] \n\t" 1317 1318 "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" 1319 "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" 1320 1321 "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" 1322 "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" 1323 "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" 1324 "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" 1325 "xor %[invhev_r], %[hev_r], %[HWM] \n\t" 1326 1327 /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */ 1328 "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" 1329 "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" 1330 1331 /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */ 1332 "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" 1333 "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" 1334 1335 : [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r), 1336 [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), 1337 [Filter2_l] "+r"(Filter2_l), [Filter2_r] "+r"(Filter2_r), 1338 [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), 1339 [vqs0_r] "+r"(vqs0_r) 1340 : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), [hev_l] "r"(hev_l), 1341 [hev_r] "r"(hev_r)); 1342 1343 /* only apply wider filter if not high edge variance */ 1344 __asm__ __volatile__( 1345 /* vp8_filter &= ~hev; */ 1346 "and 
%[Filter2_l], %[vp8_filter_l], %[invhev_l] \n\t" 1347 "and %[Filter2_r], %[vp8_filter_r], %[invhev_r] \n\t" 1348 1349 "shra.ph %[Filter2_l], %[Filter2_l], 8 \n\t" 1350 "shra.ph %[Filter2_r], %[Filter2_r], 8 \n\t" 1351 1352 : [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r) 1353 : [vp8_filter_l] "r"(vp8_filter_l), [vp8_filter_r] "r"(vp8_filter_r), 1354 [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); 1355 1356 /* roughly 3/7th difference across boundary */ 1357 __asm__ __volatile__( 1358 "shll.ph %[u3_l], %[Filter2_l], 3 \n\t" 1359 "shll.ph %[u3_r], %[Filter2_r], 3 \n\t" 1360 1361 "addq.ph %[u3_l], %[u3_l], %[Filter2_l] \n\t" 1362 "addq.ph %[u3_r], %[u3_r], %[Filter2_r] \n\t" 1363 1364 "shll.ph %[u2_l], %[u3_l], 1 \n\t" 1365 "shll.ph %[u2_r], %[u3_r], 1 \n\t" 1366 1367 "addq.ph %[u1_l], %[u3_l], %[u2_l] \n\t" 1368 "addq.ph %[u1_r], %[u3_r], %[u2_r] \n\t" 1369 1370 "addq.ph %[u2_l], %[u2_l], %[R63] \n\t" 1371 "addq.ph %[u2_r], %[u2_r], %[R63] \n\t" 1372 1373 "addq.ph %[u3_l], %[u3_l], %[R63] \n\t" 1374 "addq.ph %[u3_r], %[u3_r], %[R63] \n\t" 1375 1376 /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7) 1377 * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7) 1378 */ 1379 "addq.ph %[u1_l], %[u1_l], %[R63] \n\t" 1380 "addq.ph %[u1_r], %[u1_r], %[R63] \n\t" 1381 "shra.ph %[u1_l], %[u1_l], 7 \n\t" 1382 "shra.ph %[u1_r], %[u1_r], 7 \n\t" 1383 "shra.ph %[u2_l], %[u2_l], 7 \n\t" 1384 "shra.ph %[u2_r], %[u2_r], 7 \n\t" 1385 "shll.ph %[u1_l], %[u1_l], 8 \n\t" 1386 "shll.ph %[u1_r], %[u1_r], 8 \n\t" 1387 "shll.ph %[u2_l], %[u2_l], 8 \n\t" 1388 "shll.ph %[u2_r], %[u2_r], 8 \n\t" 1389 1390 /* vqs0 = vp8_signed_char_clamp(qs0 - u); */ 1391 "subq_s.ph %[vqs0_l], %[vqs0_l], %[u1_l] \n\t" 1392 "subq_s.ph %[vqs0_r], %[vqs0_r], %[u1_r] \n\t" 1393 1394 /* vps0 = vp8_signed_char_clamp(ps0 + u); */ 1395 "addq_s.ph %[vps0_l], %[vps0_l], %[u1_l] \n\t" 1396 "addq_s.ph %[vps0_r], %[vps0_r], %[u1_r] \n\t" 1397 1398 : [u1_l] "=&r"(u1_l), [u1_r] "=&r"(u1_r), [u2_l] "=&r"(u2_l), 1399 [u2_r] "=&r"(u2_r), [u3_l] "=&r"(u3_l), [u3_r] "=&r"(u3_r), 1400 [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), 1401 [vqs0_r] "+r"(vqs0_r) 1402 : [R63] "r"(R63), [Filter2_l] "r"(Filter2_l), [Filter2_r] "r"(Filter2_r)); 1403 1404 __asm__ __volatile__( 1405 /* vqs1 = vp8_signed_char_clamp(qs1 - u); */ 1406 "subq_s.ph %[vqs1_l], %[vqs1_l], %[u2_l] \n\t" 1407 "addq_s.ph %[vps1_l], %[vps1_l], %[u2_l] \n\t" 1408 1409 /* vps1 = vp8_signed_char_clamp(ps1 + u); */ 1410 "addq_s.ph %[vps1_r], %[vps1_r], %[u2_r] \n\t" 1411 "subq_s.ph %[vqs1_r], %[vqs1_r], %[u2_r] \n\t" 1412 1413 : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), 1414 [vqs1_r] "+r"(vqs1_r) 1415 : [u2_l] "r"(u2_l), [u2_r] "r"(u2_r)); 1416 1417 /* roughly 1/7th difference across boundary */ 1418 __asm__ __volatile__( 1419 /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */ 1420 "shra.ph %[u3_l], %[u3_l], 7 \n\t" 1421 "shra.ph %[u3_r], %[u3_r], 7 \n\t" 1422 "shll.ph %[u3_l], %[u3_l], 8 \n\t" 1423 "shll.ph %[u3_r], %[u3_r], 8 \n\t" 1424 1425 /* vqs2 = vp8_signed_char_clamp(qs2 - u); */ 1426 "subq_s.ph %[vqs2_l], %[vqs2_l], %[u3_l] \n\t" 1427 "subq_s.ph %[vqs2_r], %[vqs2_r], %[u3_r] \n\t" 1428 1429 /* vps2 = vp8_signed_char_clamp(ps2 + u); */ 1430 "addq_s.ph %[vps2_l], %[vps2_l], %[u3_l] \n\t" 1431 "addq_s.ph %[vps2_r], %[vps2_r], %[u3_r] \n\t" 1432 1433 : [u3_l] "+r"(u3_l), [u3_r] "+r"(u3_r), [vps2_l] "+r"(vps2_l), 1434 [vps2_r] "+r"(vps2_r), [vqs2_l] "+r"(vqs2_l), [vqs2_r] "+r"(vqs2_r) 1435 :); 1436 1437 /* Create quad-bytes from 
halfword pairs */ 1438 __asm__ __volatile__( 1439 "and %[vqs0_l], %[vqs0_l], %[HWM] \n\t" 1440 "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" 1441 1442 "and %[vps0_l], %[vps0_l], %[HWM] \n\t" 1443 "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" 1444 1445 "and %[vqs1_l], %[vqs1_l], %[HWM] \n\t" 1446 "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" 1447 1448 "and %[vps1_l], %[vps1_l], %[HWM] \n\t" 1449 "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" 1450 1451 "and %[vqs2_l], %[vqs2_l], %[HWM] \n\t" 1452 "shrl.ph %[vqs2_r], %[vqs2_r], 8 \n\t" 1453 1454 "and %[vps2_l], %[vps2_l], %[HWM] \n\t" 1455 "shrl.ph %[vps2_r], %[vps2_r], 8 \n\t" 1456 1457 "or %[vqs0_r], %[vqs0_l], %[vqs0_r] \n\t" 1458 "or %[vps0_r], %[vps0_l], %[vps0_r] \n\t" 1459 "or %[vqs1_r], %[vqs1_l], %[vqs1_r] \n\t" 1460 "or %[vps1_r], %[vps1_l], %[vps1_r] \n\t" 1461 "or %[vqs2_r], %[vqs2_l], %[vqs2_r] \n\t" 1462 "or %[vps2_r], %[vps2_l], %[vps2_r] \n\t" 1463 1464 : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), 1465 [vqs1_r] "+r"(vqs1_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), 1466 [vqs0_l] "+r"(vqs0_l), [vqs0_r] "+r"(vqs0_r), [vqs2_l] "+r"(vqs2_l), 1467 [vqs2_r] "+r"(vqs2_r), [vps2_r] "+r"(vps2_r), [vps2_l] "+r"(vps2_l) 1468 : [HWM] "r"(HWM)); 1469 1470 *ps0 = vps0_r ^ N128; 1471 *ps1 = vps1_r ^ N128; 1472 *ps2 = vps2_r ^ N128; 1473 *qs0 = vqs0_r ^ N128; 1474 *qs1 = vqs1_r ^ N128; 1475 *qs2 = vqs2_r ^ N128; 1476 } 1477 1478 void vp8_mbloop_filter_horizontal_edge_mips(unsigned char *s, int p, 1479 unsigned int flimit, 1480 unsigned int limit, 1481 unsigned int thresh, int count) { 1482 int i; 1483 uint32_t mask, hev; 1484 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 1485 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; 1486 1487 mask = 0; 1488 hev = 0; 1489 i = 0; 1490 p1 = 0; 1491 p2 = 0; 1492 p3 = 0; 1493 p4 = 0; 1494 1495 /* loop filter designed to work using chars so that we can make maximum use 1496 * of 8 bit simd instructions. 
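   *
   * Sketch of the wide (macroblock) filter applied by vp8_mbfilter_mips(),
   * which this function calls below; it follows the generic C vp8_mbfilter(),
   * with signed_char_clamp() standing in for the saturating halfword
   * arithmetic.  F is the masked filter value with the high-edge-variance
   * pixels removed (vp8_filter & ~hev), taken after the inner +4/+3 step has
   * already adjusted ps0/qs0 where hev is set:
   *
   *   u = signed_char_clamp((63 + F * 27) >> 7);  qs0 -= u;  ps0 += u;
   *   u = signed_char_clamp((63 + F * 18) >> 7);  qs1 -= u;  ps1 += u;
   *   u = signed_char_clamp((63 + F * 9) >> 7);   qs2 -= u;  ps2 += u;
   *
   * i.e. roughly 3/7, 2/7 and 1/7 of the boundary difference are spread over
   * the three pixels on each side.  The DSPR2 code builds F*9 with a shift
   * and an add, doubles it for F*18, sums the two for F*27, and adds
   * R63 = 0x003F003F before the arithmetic shift by 7 in each halfword lane.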
1497 */ 1498 1499 sm1 = s - (p << 2); 1500 s0 = s - p - p - p; 1501 s1 = s - p - p; 1502 s2 = s - p; 1503 s3 = s; 1504 s4 = s + p; 1505 s5 = s + p + p; 1506 s6 = s + p + p + p; 1507 1508 /* prefetch data for load */ 1509 prefetch_load_lf(s + p); 1510 1511 /* apply filter on 4 pixesl at the same time */ 1512 do { 1513 /* load quad-byte vectors 1514 * memory is 4 byte aligned 1515 */ 1516 p1 = *((uint32_t *)(s1)); 1517 p2 = *((uint32_t *)(s2)); 1518 p3 = *((uint32_t *)(s3)); 1519 p4 = *((uint32_t *)(s4)); 1520 1521 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 1522 * mask will be zero and filtering is not needed 1523 */ 1524 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 1525 pm1 = *((uint32_t *)(sm1)); 1526 p0 = *((uint32_t *)(s0)); 1527 p5 = *((uint32_t *)(s5)); 1528 p6 = *((uint32_t *)(s6)); 1529 1530 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 1531 thresh, &hev, &mask); 1532 1533 /* if mask == 0 do filtering is not needed */ 1534 if (mask) { 1535 /* filtering */ 1536 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); 1537 1538 /* unpack processed 4x4 neighborhood 1539 * memory is 4 byte aligned 1540 */ 1541 *((uint32_t *)s0) = p0; 1542 *((uint32_t *)s1) = p1; 1543 *((uint32_t *)s2) = p2; 1544 *((uint32_t *)s3) = p3; 1545 *((uint32_t *)s4) = p4; 1546 *((uint32_t *)s5) = p5; 1547 } 1548 } 1549 1550 sm1 += 4; 1551 s0 += 4; 1552 s1 += 4; 1553 s2 += 4; 1554 s3 += 4; 1555 s4 += 4; 1556 s5 += 4; 1557 s6 += 4; 1558 1559 /* load quad-byte vectors 1560 * memory is 4 byte aligned 1561 */ 1562 p1 = *((uint32_t *)(s1)); 1563 p2 = *((uint32_t *)(s2)); 1564 p3 = *((uint32_t *)(s3)); 1565 p4 = *((uint32_t *)(s4)); 1566 1567 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 1568 * mask will be zero and filtering is not needed 1569 */ 1570 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 1571 pm1 = *((uint32_t *)(sm1)); 1572 p0 = *((uint32_t *)(s0)); 1573 p5 = *((uint32_t *)(s5)); 1574 p6 = *((uint32_t *)(s6)); 1575 1576 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 1577 thresh, &hev, &mask); 1578 1579 /* if mask == 0 do filtering is not needed */ 1580 if (mask) { 1581 /* filtering */ 1582 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); 1583 1584 /* unpack processed 4x4 neighborhood 1585 * memory is 4 byte aligned 1586 */ 1587 *((uint32_t *)s0) = p0; 1588 *((uint32_t *)s1) = p1; 1589 *((uint32_t *)s2) = p2; 1590 *((uint32_t *)s3) = p3; 1591 *((uint32_t *)s4) = p4; 1592 *((uint32_t *)s5) = p5; 1593 } 1594 } 1595 1596 sm1 += 4; 1597 s0 += 4; 1598 s1 += 4; 1599 s2 += 4; 1600 s3 += 4; 1601 s4 += 4; 1602 s5 += 4; 1603 s6 += 4; 1604 1605 i += 8; 1606 } 1607 1608 while (i < count); 1609 } 1610 1611 void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, 1612 unsigned int flimit, 1613 unsigned int limit, 1614 unsigned int thresh, int count) { 1615 uint32_t mask, hev; 1616 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 1617 unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; 1618 (void)count; 1619 1620 mask = 0; 1621 hev = 0; 1622 p1 = 0; 1623 p2 = 0; 1624 p3 = 0; 1625 p4 = 0; 1626 1627 /* loop filter designed to work using chars so that we can make maximum use 1628 * of 8 bit simd instructions. 
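   *
   * The (p1 - p4 == 0) && (p2 - p3 == 0) test used before each call below is
   * an early-out: when the packed words nearest the edge equal their
   * counterparts on the other side (ps1 == qs1 and ps0 == qs0 for all four
   * pixels), the filter value works out to zero and no pixel would change,
   * so both the mask/hev computation and the filtering can be skipped.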
1629 */ 1630 1631 sm1 = s - (p << 2); 1632 s0 = s - p - p - p; 1633 s1 = s - p - p; 1634 s2 = s - p; 1635 s3 = s; 1636 s4 = s + p; 1637 s5 = s + p + p; 1638 s6 = s + p + p + p; 1639 1640 /* load quad-byte vectors 1641 * memory is 4 byte aligned 1642 */ 1643 p1 = *((uint32_t *)(s1)); 1644 p2 = *((uint32_t *)(s2)); 1645 p3 = *((uint32_t *)(s3)); 1646 p4 = *((uint32_t *)(s4)); 1647 1648 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 1649 * mask will be zero and filtering is not needed 1650 */ 1651 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 1652 pm1 = *((uint32_t *)(sm1)); 1653 p0 = *((uint32_t *)(s0)); 1654 p5 = *((uint32_t *)(s5)); 1655 p6 = *((uint32_t *)(s6)); 1656 1657 /* if mask == 0 do filtering is not needed */ 1658 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 1659 thresh, &hev, &mask); 1660 1661 if (mask) { 1662 /* filtering */ 1663 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); 1664 1665 /* unpack processed 4x4 neighborhood 1666 * memory is 4 byte aligned 1667 */ 1668 *((uint32_t *)s0) = p0; 1669 *((uint32_t *)s1) = p1; 1670 *((uint32_t *)s2) = p2; 1671 *((uint32_t *)s3) = p3; 1672 *((uint32_t *)s4) = p4; 1673 *((uint32_t *)s5) = p5; 1674 } 1675 } 1676 1677 sm1 += 4; 1678 s0 += 4; 1679 s1 += 4; 1680 s2 += 4; 1681 s3 += 4; 1682 s4 += 4; 1683 s5 += 4; 1684 s6 += 4; 1685 1686 /* load quad-byte vectors 1687 * memory is 4 byte aligned 1688 */ 1689 p1 = *((uint32_t *)(s1)); 1690 p2 = *((uint32_t *)(s2)); 1691 p3 = *((uint32_t *)(s3)); 1692 p4 = *((uint32_t *)(s4)); 1693 1694 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 1695 * mask will be zero and filtering is not needed 1696 */ 1697 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 1698 pm1 = *((uint32_t *)(sm1)); 1699 p0 = *((uint32_t *)(s0)); 1700 p5 = *((uint32_t *)(s5)); 1701 p6 = *((uint32_t *)(s6)); 1702 1703 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 1704 thresh, &hev, &mask); 1705 1706 /* if mask == 0 do filtering is not needed */ 1707 if (mask) { 1708 /* filtering */ 1709 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); 1710 1711 /* unpack processed 4x4 neighborhood 1712 * memory is 4 byte aligned 1713 */ 1714 *((uint32_t *)s0) = p0; 1715 *((uint32_t *)s1) = p1; 1716 *((uint32_t *)s2) = p2; 1717 *((uint32_t *)s3) = p3; 1718 *((uint32_t *)s4) = p4; 1719 *((uint32_t *)s5) = p5; 1720 } 1721 } 1722 } 1723 1724 void vp8_mbloop_filter_vertical_edge_mips(unsigned char *s, int p, 1725 unsigned int flimit, 1726 unsigned int limit, 1727 unsigned int thresh, int count) { 1728 int i; 1729 uint32_t mask, hev; 1730 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 1731 unsigned char *s1, *s2, *s3, *s4; 1732 uint32_t prim1, prim2, sec3, sec4, prim3, prim4; 1733 1734 mask = 0; 1735 hev = 0; 1736 i = 0; 1737 pm1 = 0; 1738 p0 = 0; 1739 p1 = 0; 1740 p2 = 0; 1741 p3 = 0; 1742 p4 = 0; 1743 p5 = 0; 1744 p6 = 0; 1745 1746 /* loop filter designed to work using chars so that we can make maximum use 1747 * of 8 bit simd instructions. 
1748 */ 1749 1750 /* apply filter on 4 pixesl at the same time */ 1751 do { 1752 s1 = s; 1753 s2 = s + p; 1754 s3 = s2 + p; 1755 s4 = s3 + p; 1756 s = s4 + p; 1757 1758 /* load quad-byte vectors 1759 * memory is 4 byte aligned 1760 */ 1761 p2 = *((uint32_t *)(s1 - 4)); 1762 p6 = *((uint32_t *)(s1)); 1763 p1 = *((uint32_t *)(s2 - 4)); 1764 p5 = *((uint32_t *)(s2)); 1765 p0 = *((uint32_t *)(s3 - 4)); 1766 p4 = *((uint32_t *)(s3)); 1767 pm1 = *((uint32_t *)(s4 - 4)); 1768 p3 = *((uint32_t *)(s4)); 1769 1770 /* transpose pm1, p0, p1, p2 */ 1771 __asm__ __volatile__( 1772 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" 1773 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" 1774 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" 1775 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" 1776 1777 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" 1778 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" 1779 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 1780 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 1781 1782 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" 1783 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" 1784 "append %[p1], %[sec3], 16 \n\t" 1785 "append %[pm1], %[sec4], 16 \n\t" 1786 1787 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 1788 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), 1789 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 1790 :); 1791 1792 /* transpose p3, p4, p5, p6 */ 1793 __asm__ __volatile__( 1794 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" 1795 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" 1796 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" 1797 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" 1798 1799 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" 1800 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" 1801 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 1802 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 1803 1804 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" 1805 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" 1806 "append %[p5], %[sec3], 16 \n\t" 1807 "append %[p3], %[sec4], 16 \n\t" 1808 1809 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 1810 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), 1811 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 1812 :); 1813 1814 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 1815 * mask will be zero and filtering is not needed 1816 */ 1817 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 1818 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 1819 thresh, &hev, &mask); 1820 1821 /* if mask == 0 do filtering is not needed */ 1822 if (mask) { 1823 /* filtering */ 1824 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); 1825 1826 /* don't use transpose on output data 1827 * because memory isn't aligned 1828 */ 1829 __asm__ __volatile__( 1830 "sb %[p5], 2(%[s4]) \n\t" 1831 "sb %[p4], 1(%[s4]) \n\t" 1832 "sb %[p3], 0(%[s4]) \n\t" 1833 "sb %[p2], -1(%[s4]) \n\t" 1834 "sb %[p1], -2(%[s4]) \n\t" 1835 "sb %[p0], -3(%[s4]) \n\t" 1836 : 1837 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), 1838 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 1839 1840 __asm__ __volatile__( 1841 "srl %[p5], %[p5], 8 \n\t" 1842 "srl %[p4], %[p4], 8 \n\t" 1843 "srl %[p3], %[p3], 8 \n\t" 1844 "srl %[p2], %[p2], 8 \n\t" 1845 "srl %[p1], %[p1], 8 \n\t" 1846 "srl %[p0], %[p0], 8 \n\t" 1847 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), 1848 [p1] "+r"(p1), [p0] "+r"(p0) 1849 :); 1850 1851 __asm__ __volatile__( 1852 "sb %[p5], 2(%[s3]) \n\t" 1853 "sb %[p4], 1(%[s3]) \n\t" 1854 "sb %[p3], 0(%[s3]) \n\t" 1855 "sb 
%[p2], -1(%[s3]) \n\t" 1856 "sb %[p1], -2(%[s3]) \n\t" 1857 "sb %[p0], -3(%[s3]) \n\t" 1858 : 1859 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), 1860 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 1861 1862 __asm__ __volatile__( 1863 "srl %[p5], %[p5], 8 \n\t" 1864 "srl %[p4], %[p4], 8 \n\t" 1865 "srl %[p3], %[p3], 8 \n\t" 1866 "srl %[p2], %[p2], 8 \n\t" 1867 "srl %[p1], %[p1], 8 \n\t" 1868 "srl %[p0], %[p0], 8 \n\t" 1869 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), 1870 [p1] "+r"(p1), [p0] "+r"(p0) 1871 :); 1872 1873 __asm__ __volatile__( 1874 "sb %[p5], 2(%[s2]) \n\t" 1875 "sb %[p4], 1(%[s2]) \n\t" 1876 "sb %[p3], 0(%[s2]) \n\t" 1877 "sb %[p2], -1(%[s2]) \n\t" 1878 "sb %[p1], -2(%[s2]) \n\t" 1879 "sb %[p0], -3(%[s2]) \n\t" 1880 : 1881 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), 1882 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 1883 1884 __asm__ __volatile__( 1885 "srl %[p5], %[p5], 8 \n\t" 1886 "srl %[p4], %[p4], 8 \n\t" 1887 "srl %[p3], %[p3], 8 \n\t" 1888 "srl %[p2], %[p2], 8 \n\t" 1889 "srl %[p1], %[p1], 8 \n\t" 1890 "srl %[p0], %[p0], 8 \n\t" 1891 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), 1892 [p1] "+r"(p1), [p0] "+r"(p0) 1893 :); 1894 1895 __asm__ __volatile__( 1896 "sb %[p5], 2(%[s1]) \n\t" 1897 "sb %[p4], 1(%[s1]) \n\t" 1898 "sb %[p3], 0(%[s1]) \n\t" 1899 "sb %[p2], -1(%[s1]) \n\t" 1900 "sb %[p1], -2(%[s1]) \n\t" 1901 "sb %[p0], -3(%[s1]) \n\t" 1902 : 1903 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), 1904 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 1905 } 1906 } 1907 1908 i += 4; 1909 } 1910 1911 while (i < count); 1912 } 1913 1914 void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, 1915 unsigned int flimit, 1916 unsigned int limit, 1917 unsigned int thresh, int count) { 1918 uint32_t mask, hev; 1919 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 1920 unsigned char *s1, *s2, *s3, *s4; 1921 uint32_t prim1, prim2, sec3, sec4, prim3, prim4; 1922 (void)count; 1923 1924 mask = 0; 1925 hev = 0; 1926 pm1 = 0; 1927 p0 = 0; 1928 p1 = 0; 1929 p2 = 0; 1930 p3 = 0; 1931 p4 = 0; 1932 p5 = 0; 1933 p6 = 0; 1934 1935 /* loop filter designed to work using chars so that we can make maximum use 1936 * of 8 bit simd instructions. 

  /* apply filter on 4 pixels at the same time */

  s1 = s;
  s2 = s + p;
  s3 = s2 + p;
  s4 = s3 + p;

  /* prefetch data for load */
  prefetch_load_lf(s + 2 * p);

  /* load quad-byte vectors
   * memory is 4 byte aligned
   */
  p2 = *((uint32_t *)(s1 - 4));
  p6 = *((uint32_t *)(s1));
  p1 = *((uint32_t *)(s2 - 4));
  p5 = *((uint32_t *)(s2));
  p0 = *((uint32_t *)(s3 - 4));
  p4 = *((uint32_t *)(s3));
  pm1 = *((uint32_t *)(s4 - 4));
  p3 = *((uint32_t *)(s4));

  /* transpose pm1, p0, p1, p2 */
  __asm__ __volatile__(
      "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
      "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
      "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
      "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"

      "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
      "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
      "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
      "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"

      "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
      "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
      "append %[p1], %[sec3], 16 \n\t"
      "append %[pm1], %[sec4], 16 \n\t"

      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
        [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
        [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
      :);

  /* transpose p3, p4, p5, p6 */
  __asm__ __volatile__(
      "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
      "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
      "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
      "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"

      "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
      "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
      "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
      "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"

      "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
      "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
      "append %[p5], %[sec3], 16 \n\t"
      "append %[p3], %[sec4], 16 \n\t"

      : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
        [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
        [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
      :);

  /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   * mask will be zero and filtering is not needed
   */
  if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                             thresh, &hev, &mask);

    /* if mask == 0, filtering is not needed */
    if (mask) {
      /* filtering */
      vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

      /* don't use transpose on output data
       * because memory isn't aligned
       */
      __asm__ __volatile__(
          "sb %[p5], 2(%[s4]) \n\t"
          "sb %[p4], 1(%[s4]) \n\t"
          "sb %[p3], 0(%[s4]) \n\t"
          "sb %[p2], -1(%[s4]) \n\t"
          "sb %[p1], -2(%[s4]) \n\t"
          "sb %[p0], -3(%[s4]) \n\t"
          :
          : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));

      __asm__ __volatile__(
          "srl %[p5], %[p5], 8 \n\t"
          "srl %[p4], %[p4], 8 \n\t"
          "srl %[p3], %[p3], 8 \n\t"
          "srl %[p2], %[p2], 8 \n\t"
          "srl %[p1], %[p1], 8 \n\t"
          "srl %[p0], %[p0], 8 \n\t"
          : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
            [p1] "+r"(p1), [p0] "+r"(p0)
          :);
\n\t" 2045 "sb %[p3], 0(%[s3]) \n\t" 2046 "sb %[p2], -1(%[s3]) \n\t" 2047 "sb %[p1], -2(%[s3]) \n\t" 2048 "sb %[p0], -3(%[s3]) \n\t" 2049 : 2050 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), 2051 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 2052 2053 __asm__ __volatile__( 2054 "srl %[p5], %[p5], 8 \n\t" 2055 "srl %[p4], %[p4], 8 \n\t" 2056 "srl %[p3], %[p3], 8 \n\t" 2057 "srl %[p2], %[p2], 8 \n\t" 2058 "srl %[p1], %[p1], 8 \n\t" 2059 "srl %[p0], %[p0], 8 \n\t" 2060 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), 2061 [p1] "+r"(p1), [p0] "+r"(p0) 2062 :); 2063 2064 __asm__ __volatile__( 2065 "sb %[p5], 2(%[s2]) \n\t" 2066 "sb %[p4], 1(%[s2]) \n\t" 2067 "sb %[p3], 0(%[s2]) \n\t" 2068 "sb %[p2], -1(%[s2]) \n\t" 2069 "sb %[p1], -2(%[s2]) \n\t" 2070 "sb %[p0], -3(%[s2]) \n\t" 2071 : 2072 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), 2073 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 2074 2075 __asm__ __volatile__( 2076 "srl %[p5], %[p5], 8 \n\t" 2077 "srl %[p4], %[p4], 8 \n\t" 2078 "srl %[p3], %[p3], 8 \n\t" 2079 "srl %[p2], %[p2], 8 \n\t" 2080 "srl %[p1], %[p1], 8 \n\t" 2081 "srl %[p0], %[p0], 8 \n\t" 2082 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), 2083 [p1] "+r"(p1), [p0] "+r"(p0) 2084 :); 2085 2086 __asm__ __volatile__( 2087 "sb %[p5], 2(%[s1]) \n\t" 2088 "sb %[p4], 1(%[s1]) \n\t" 2089 "sb %[p3], 0(%[s1]) \n\t" 2090 "sb %[p2], -1(%[s1]) \n\t" 2091 "sb %[p1], -2(%[s1]) \n\t" 2092 "sb %[p0], -3(%[s1]) \n\t" 2093 : 2094 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), 2095 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 2096 } 2097 } 2098 2099 s1 = s4 + p; 2100 s2 = s1 + p; 2101 s3 = s2 + p; 2102 s4 = s3 + p; 2103 2104 /* load quad-byte vectors 2105 * memory is 4 byte aligned 2106 */ 2107 p2 = *((uint32_t *)(s1 - 4)); 2108 p6 = *((uint32_t *)(s1)); 2109 p1 = *((uint32_t *)(s2 - 4)); 2110 p5 = *((uint32_t *)(s2)); 2111 p0 = *((uint32_t *)(s3 - 4)); 2112 p4 = *((uint32_t *)(s3)); 2113 pm1 = *((uint32_t *)(s4 - 4)); 2114 p3 = *((uint32_t *)(s4)); 2115 2116 /* transpose pm1, p0, p1, p2 */ 2117 __asm__ __volatile__( 2118 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" 2119 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" 2120 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" 2121 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" 2122 2123 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" 2124 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" 2125 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 2126 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 2127 2128 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" 2129 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" 2130 "append %[p1], %[sec3], 16 \n\t" 2131 "append %[pm1], %[sec4], 16 \n\t" 2132 2133 : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 2134 [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), 2135 [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 2136 :); 2137 2138 /* transpose p3, p4, p5, p6 */ 2139 __asm__ __volatile__( 2140 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" 2141 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" 2142 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" 2143 "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" 2144 2145 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" 2146 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" 2147 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 2148 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 2149 2150 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" 2151 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" 2152 "append %[p5], %[sec3], 16 \n\t" 2153 "append %[p3], %[sec4], 16 \n\t" 2154 2155 : 
[prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), 2156 [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), 2157 [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) 2158 :); 2159 2160 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 2161 * mask will be zero and filtering is not needed 2162 */ 2163 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 2164 vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, 2165 thresh, &hev, &mask); 2166 2167 /* if mask == 0 do filtering is not needed */ 2168 if (mask) { 2169 /* filtering */ 2170 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5); 2171 2172 /* don't use transpose on output data 2173 * because memory isn't aligned 2174 */ 2175 __asm__ __volatile__( 2176 "sb %[p5], 2(%[s4]) \n\t" 2177 "sb %[p4], 1(%[s4]) \n\t" 2178 "sb %[p3], 0(%[s4]) \n\t" 2179 "sb %[p2], -1(%[s4]) \n\t" 2180 "sb %[p1], -2(%[s4]) \n\t" 2181 "sb %[p0], -3(%[s4]) \n\t" 2182 : 2183 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), 2184 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 2185 2186 __asm__ __volatile__( 2187 "srl %[p5], %[p5], 8 \n\t" 2188 "srl %[p4], %[p4], 8 \n\t" 2189 "srl %[p3], %[p3], 8 \n\t" 2190 "srl %[p2], %[p2], 8 \n\t" 2191 "srl %[p1], %[p1], 8 \n\t" 2192 "srl %[p0], %[p0], 8 \n\t" 2193 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), 2194 [p1] "+r"(p1), [p0] "+r"(p0) 2195 :); 2196 2197 __asm__ __volatile__( 2198 "sb %[p5], 2(%[s3]) \n\t" 2199 "sb %[p4], 1(%[s3]) \n\t" 2200 "sb %[p3], 0(%[s3]) \n\t" 2201 "sb %[p2], -1(%[s3]) \n\t" 2202 "sb %[p1], -2(%[s3]) \n\t" 2203 "sb %[p0], -3(%[s3]) \n\t" 2204 : 2205 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), 2206 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 2207 2208 __asm__ __volatile__( 2209 "srl %[p5], %[p5], 8 \n\t" 2210 "srl %[p4], %[p4], 8 \n\t" 2211 "srl %[p3], %[p3], 8 \n\t" 2212 "srl %[p2], %[p2], 8 \n\t" 2213 "srl %[p1], %[p1], 8 \n\t" 2214 "srl %[p0], %[p0], 8 \n\t" 2215 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), 2216 [p1] "+r"(p1), [p0] "+r"(p0) 2217 :); 2218 2219 __asm__ __volatile__( 2220 "sb %[p5], 2(%[s2]) \n\t" 2221 "sb %[p4], 1(%[s2]) \n\t" 2222 "sb %[p3], 0(%[s2]) \n\t" 2223 "sb %[p2], -1(%[s2]) \n\t" 2224 "sb %[p1], -2(%[s2]) \n\t" 2225 "sb %[p0], -3(%[s2]) \n\t" 2226 : 2227 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), 2228 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 2229 2230 __asm__ __volatile__( 2231 "srl %[p5], %[p5], 8 \n\t" 2232 "srl %[p4], %[p4], 8 \n\t" 2233 "srl %[p3], %[p3], 8 \n\t" 2234 "srl %[p2], %[p2], 8 \n\t" 2235 "srl %[p1], %[p1], 8 \n\t" 2236 "srl %[p0], %[p0], 8 \n\t" 2237 : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), 2238 [p1] "+r"(p1), [p0] "+r"(p0) 2239 :); 2240 2241 __asm__ __volatile__( 2242 "sb %[p5], 2(%[s1]) \n\t" 2243 "sb %[p4], 1(%[s1]) \n\t" 2244 "sb %[p3], 0(%[s1]) \n\t" 2245 "sb %[p2], -1(%[s1]) \n\t" 2246 "sb %[p1], -2(%[s1]) \n\t" 2247 "sb %[p0], -3(%[s1]) \n\t" 2248 : 2249 : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), 2250 [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0)); 2251 } 2252 } 2253 } 2254 2255 /* Horizontal MB filtering */ 2256 void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, 2257 unsigned char *v_ptr, int y_stride, 2258 int uv_stride, loop_filter_info *lfi) { 2259 unsigned int thresh_vec, flimit_vec, limit_vec; 2260 unsigned char thresh, flimit, limit, flimit_temp; 2261 2262 /* use direct value instead pointers */ 2263 limit = *(lfi->lim); 2264 flimit_temp = *(lfi->mblim); 2265 thresh = *(lfi->hev_thr); 

/* Horizontal MB filtering */
void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
                               unsigned char *v_ptr, int y_stride,
                               int uv_stride, loop_filter_info *lfi) {
  unsigned int thresh_vec, flimit_vec, limit_vec;
  unsigned char thresh, flimit, limit, flimit_temp;

  /* use direct values instead of pointers */
  limit = *(lfi->lim);
  flimit_temp = *(lfi->mblim);
  thresh = *(lfi->hev_thr);
  flimit = flimit_temp;

  /* create quad-byte vectors */
  __asm__ __volatile__(
      "replv.qb %[thresh_vec], %[thresh] \n\t"
      "replv.qb %[flimit_vec], %[flimit] \n\t"
      "replv.qb %[limit_vec], %[limit] \n\t"
      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));

  vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
                                         thresh_vec, 16);

  if (u_ptr) {
    vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec,
                                             limit_vec, thresh_vec, 0);
  }

  if (v_ptr) {
    vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec,
                                             limit_vec, thresh_vec, 0);
  }
}

/* Vertical MB Filtering */
void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
                               unsigned char *v_ptr, int y_stride,
                               int uv_stride, loop_filter_info *lfi) {
  unsigned int thresh_vec, flimit_vec, limit_vec;
  unsigned char thresh, flimit, limit, flimit_temp;

  /* use direct values instead of pointers */
  limit = *(lfi->lim);
  flimit_temp = *(lfi->mblim);
  thresh = *(lfi->hev_thr);
  flimit = flimit_temp;

  /* create quad-byte vectors */
  __asm__ __volatile__(
      "replv.qb %[thresh_vec], %[thresh] \n\t"
      "replv.qb %[flimit_vec], %[flimit] \n\t"
      "replv.qb %[limit_vec], %[limit] \n\t"
      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));

  vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
                                       thresh_vec, 16);

  if (u_ptr)
    vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec,
                                           limit_vec, thresh_vec, 0);

  if (v_ptr)
    vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec,
                                           limit_vec, thresh_vec, 0);
}

/* Horizontal B Filtering */
void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              loop_filter_info *lfi) {
  unsigned int thresh_vec, flimit_vec, limit_vec;
  unsigned char thresh, flimit, limit, flimit_temp;

  /* use direct values instead of pointers */
  limit = *(lfi->lim);
  flimit_temp = *(lfi->blim);
  thresh = *(lfi->hev_thr);
  flimit = flimit_temp;

  /* create quad-byte vectors */
  __asm__ __volatile__(
      "replv.qb %[thresh_vec], %[thresh] \n\t"
      "replv.qb %[flimit_vec], %[flimit] \n\t"
      "replv.qb %[limit_vec], %[limit] \n\t"
      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));

  vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride,
                                       flimit_vec, limit_vec, thresh_vec, 16);
  vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride,
                                       flimit_vec, limit_vec, thresh_vec, 16);
  vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride,
                                       flimit_vec, limit_vec, thresh_vec, 16);

  if (u_ptr)
    vp8_loop_filter_uvhorizontal_edge_mips(
        u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);

  if (v_ptr)
    vp8_loop_filter_uvhorizontal_edge_mips(
        v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
}
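
/* For orientation: vp8_loop_filter_bh_dspr2 above filters the inner horizontal
 * block edges of a macroblock (luma rows 4, 8 and 12, chroma row 4), while
 * vp8_loop_filter_bv_dspr2 below filters the corresponding inner vertical
 * edges (luma columns 4, 8 and 12, chroma column 4).
 */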

/* Vertical B Filtering */
void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              loop_filter_info *lfi) {
  unsigned int thresh_vec, flimit_vec, limit_vec;
  unsigned char thresh, flimit, limit, flimit_temp;

  /* use direct values instead of pointers */
  limit = *(lfi->lim);
  flimit_temp = *(lfi->blim);
  thresh = *(lfi->hev_thr);
  flimit = flimit_temp;

  /* create quad-byte vectors */
  __asm__ __volatile__(
      "replv.qb %[thresh_vec], %[thresh] \n\t"
      "replv.qb %[flimit_vec], %[flimit] \n\t"
      "replv.qb %[limit_vec], %[limit] \n\t"
      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));

  vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec,
                                     thresh_vec, 16);
  vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec,
                                     thresh_vec, 16);
  vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec,
                                     limit_vec, thresh_vec, 16);

  if (u_ptr)
    vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec,
                                         limit_vec, thresh_vec, 0);

  if (v_ptr)
    vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec,
                                         limit_vec, thresh_vec, 0);
}

#endif