/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include <stdlib.h>
#include "vp8_rtcd.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
#define CROP_WIDTH 256

/* Byte-clamp lookup table: CROP_WIDTH zeros, then the 256 identity values,
 * then CROP_WIDTH bytes of 255.  Filter code indexes it through
 * cm = ff_cropTbl + CROP_WIDTH, so cm[x] == clamp(x, 0, 255) for
 * -CROP_WIDTH <= x < 256 + CROP_WIDTH.  Filled by dsputil_static_init(),
 * which must run before any of the filters below is used. */
unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];

/* Second-pass (yoffset-indexed) coefficients, two 8-bit taps packed per
 * 16-bit entry; read by vp8_filter_block2d_second_pass4 and friends. */
static const unsigned short sub_pel_filterss[8][3] =
{
    { 0, 0, 0},
    { 0, 0x0601, 0x7b0c},
    { 0x0201, 0x0b08, 0x6c24},
    { 0, 0x0906, 0x5d32},
    { 0x0303, 0x1010, 0x4d4d},
    { 0, 0x0609, 0x325d},
    { 0x0102, 0x080b, 0x246c},
    { 0, 0x0106, 0x0c7b},
};


/* Six-tap coefficients, two signed 16-bit taps packed per 32-bit word,
 * natural tap order.  Not referenced in this file's visible code —
 * presumably used by filter code elsewhere; verify before removing. */
static const int sub_pel_filters_int[8][3] =
{
    { 0, 0, 0},
    { 0x0000fffa, 0x007b000c, 0xffff0000},
    { 0x0002fff5, 0x006c0024, 0xfff80001},
    { 0x0000fff7, 0x005d0032, 0xfffa0000},
    { 0x0003fff0, 0x004d004d, 0xfff00003},
    { 0x0000fffa, 0x0032005d, 0xfff70000},
    { 0x0001fff8, 0x0024006c, 0xfff50002},
    { 0x0000ffff, 0x000c007b, 0xfffa0000},
};


/* Same coefficients as sub_pel_filters_int with the halves of each packed
 * pair swapped ("inverse" layout for dpa.w.ph operand order).  The
 * first-pass functions index this by xoffset; entry [2] > 65536 (i.e. a
 * nonzero outer tap in the high half) selects the 6-tap path. */
static const int sub_pel_filters_inv[8][3] =
{
    { 0, 0, 0},
    { 0xfffa0000, 0x000c007b, 0x0000ffff},
    { 0xfff50002, 0x0024006c, 0x0001fff8},
    { 0xfff70000, 0x0032005d, 0x0000fffa},
    { 0xfff00003, 0x004d004d, 0x0003fff0},
    { 0xfffa0000, 0x005d0032, 0x0000fff7},
    { 0xfff80001, 0x006c0024, 0x0002fff5},
    { 0xffff0000, 0x007b000c, 0x0000fffa},
};


/* 4-tap variants: only the odd xoffsets (1, 3, 5, 7) have entries; the
 * even offsets are handled by the 6-tap tables above. */
static const int sub_pel_filters_int_tap_4[8][2] =
{
    { 0, 0},
    { 0xfffa007b, 0x000cffff},
    { 0, 0},
    { 0xfff7005d, 0x0032fffa},
    { 0, 0},
    { 0xfffa0032, 0x005dfff7},
    { 0, 0},
    { 0xffff000c, 0x007bfffa},
};


/* Inverse-layout 4-tap coefficients used by the first-pass 4-tap paths. */
static const int sub_pel_filters_inv_tap_4[8][2] =
{
    { 0, 0},
    { 0x007bfffa, 0xffff000c},
    { 0, 0},
    { 0x005dfff7,
0xfffa0032},
    { 0, 0},
    { 0x0032fffa, 0xfff7005d},
    { 0, 0},
    { 0x000cffff, 0xfffa007b},
};

/* Issue a "pref 0" (load hint) for the cache line containing src.
 * NOTE(review): plain `inline` (not `static inline`) relies on the build
 * providing an external definition under C99 rules — confirm intended. */
inline void prefetch_load(unsigned char *src)
{
    __asm__ __volatile__ (
        "pref 0, 0(%[src]) \n\t"
        :
        : [src] "r" (src)
    );
}


/* Issue a "pref 1" (store hint) for the cache line containing dst. */
inline void prefetch_store(unsigned char *dst)
{
    __asm__ __volatile__ (
        "pref 1, 0(%[dst]) \n\t"
        :
        : [dst] "r" (dst)
    );
}

/* Fill ff_cropTbl: zeros below, identity in the middle, 255 above.
 * Must be called once before any filter function reads the table. */
void dsputil_static_init(void)
{
    int i;

    for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;

    for (i = 0; i < CROP_WIDTH; i++)
    {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + CROP_WIDTH + 256] = 255;
    }
}

/*
 * Horizontal (xoffset-indexed) first-pass subpixel filter producing a
 * 4-pixel-wide block.  Reads output_height rows from src_ptr (stride
 * src_pixels_per_line) and writes 4 clamped bytes per row to dst_ptr
 * (stride `pitch`, or 4 in the copy-only case).  Each accumulator is
 * seeded with 64 via mtlo, i.e. the rounding bias before the extp
 * extract; results are clamped through the cm table.
 */
void vp8_filter_block2d_first_pass_4
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT dst_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_height,
    int xoffset,
    int pitch
)
{
    unsigned int i;
    int Temp1, Temp2, Temp3, Temp4;

    unsigned int vector4a = 64;  /* rounding bias seeded into lo */
    int vector1b, vector2b, vector3b;
    unsigned int tp1, tp2, tn1, tn2;
    unsigned int p1, p2, p3;
    unsigned int n1, n2, n3;
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;  /* clamp table base */

    vector3b = sub_pel_filters_inv[xoffset][2];

    /* if (xoffset == 0) we don't need any filtering */
    if (vector3b == 0)
    {
        for (i = 0; i < output_height; i++)
        {
            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + src_pixels_per_line);
            dst_ptr[0] = src_ptr[0];
            dst_ptr[1] = src_ptr[1];
            dst_ptr[2] = src_ptr[2];
            dst_ptr[3] = src_ptr[3];

            /* next row... */
            src_ptr += src_pixels_per_line;
            dst_ptr += 4;
        }
    }
    else
    {
        /* entry [2] carries a nonzero outer tap only for 6-tap offsets */
        if (vector3b > 65536)
        {
            /* 6 tap filter */

            vector1b = sub_pel_filters_inv[xoffset][0];
            vector2b = sub_pel_filters_inv[xoffset][1];

            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + src_pixels_per_line);

            for (i = output_height; i--;)
            {
                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw %[tp1], -2(%[src_ptr]) \n\t"
                    "ulw %[tp2], 2(%[src_ptr]) \n\t"

                    /* even 1. pixel */
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[p1], %[tp1] \n\t"
                    "preceu.ph.qbl %[p2], %[tp1] \n\t"
                    "preceu.ph.qbr %[p3], %[tp2] \n\t"
                    "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
                    "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"

                    /* even 2. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbl %[p1], %[tp2] \n\t"
                    "balign %[tp2], %[tp1], 3 \n\t"
                    "extp %[Temp1], $ac3, 9 \n\t"
                    "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
                    "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"

                    /* odd 1. pixel */
                    "ulw %[tn2], 3(%[src_ptr]) \n\t"
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[n1], %[tp2] \n\t"
                    "preceu.ph.qbl %[n2], %[tp2] \n\t"
                    "preceu.ph.qbr %[n3], %[tn2] \n\t"
                    "extp %[Temp3], $ac2, 9 \n\t"
                    "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
                    "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"

                    /* odd 2. pixel (comment fixed: was mislabeled "even 2.") */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbl %[n1], %[tn2] \n\t"
                    "extp %[Temp2], $ac3, 9 \n\t"
                    "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
                    "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
                    "extp %[Temp4], $ac2, 9 \n\t"

                    /* clamp */
                    "lbux %[tp1], %[Temp1](%[cm]) \n\t"
                    "lbux %[tn1], %[Temp2](%[cm]) \n\t"
                    "lbux %[tp2], %[Temp3](%[cm]) \n\t"
                    "lbux %[n2], %[Temp4](%[cm]) \n\t"

                    /* store bytes */
                    "sb %[tp1], 0(%[dst_ptr]) \n\t"
                    "sb %[tn1], 1(%[dst_ptr]) \n\t"
                    "sb %[tp2], 2(%[dst_ptr]) \n\t"
                    "sb %[n2], 3(%[dst_ptr]) \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
                      [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
                      [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
                      [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
                );

                /* Next row... */
                src_ptr += src_pixels_per_line;
                dst_ptr += pitch;
            }
        }
        else
        {
            /* 4 tap filter */

            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

            for (i = output_height; i--;)
            {
                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw %[tp1], -1(%[src_ptr]) \n\t"
                    "ulw %[tp2], 3(%[src_ptr]) \n\t"

                    /* even 1. pixel */
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[p1], %[tp1] \n\t"
                    "preceu.ph.qbl %[p2], %[tp1] \n\t"
                    "preceu.ph.qbr %[p3], %[tp2] \n\t"
                    "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"

                    /* even 2. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
                    "extp %[Temp1], $ac3, 9 \n\t"

                    /* odd 1. pixel */
                    "srl %[tn1], %[tp2], 8 \n\t"
                    "balign %[tp2], %[tp1], 3 \n\t"
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[n1], %[tp2] \n\t"
                    "preceu.ph.qbl %[n2], %[tp2] \n\t"
                    "preceu.ph.qbr %[n3], %[tn1] \n\t"
                    "extp %[Temp3], $ac2, 9 \n\t"
                    "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"

                    /* odd 2. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "extp %[Temp2], $ac3, 9 \n\t"
                    "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
                    "extp %[Temp4], $ac2, 9 \n\t"

                    /* clamp and store results */
                    "lbux %[tp1], %[Temp1](%[cm]) \n\t"
                    "lbux %[tn1], %[Temp2](%[cm]) \n\t"
                    "lbux %[tp2], %[Temp3](%[cm]) \n\t"
                    "sb %[tp1], 0(%[dst_ptr]) \n\t"
                    "sb %[tn1], 1(%[dst_ptr]) \n\t"
                    "lbux %[n2], %[Temp4](%[cm]) \n\t"
                    "sb %[tp2], 2(%[dst_ptr]) \n\t"
                    "sb %[n2], 3(%[dst_ptr]) \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
                      [src_ptr] "r" (src_ptr)
                );
                /* Next row... */
                src_ptr += src_pixels_per_line;
                dst_ptr += pitch;
            }
        }
    }
}

/*
 * Horizontal first-pass subpixel filter producing an 8-pixel-wide block,
 * dispatching between plain copy (xoffset == 0), 6-tap, and 4-tap paths.
 * Same rounding (mtlo 64) and cm-clamping scheme as the 4-wide version.
 */
void vp8_filter_block2d_first_pass_8_all
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT dst_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_height,
    int xoffset,
    int pitch
)
{
    unsigned int i;
    int Temp1, Temp2, Temp3, Temp4;

    unsigned int vector4a = 64;  /* rounding bias seeded into lo */
    unsigned int vector1b, vector2b, vector3b;
    unsigned int tp1, tp2, tn1, tn2;
    unsigned int p1, p2, p3, p4;
    unsigned int n1, n2, n3, n4;

    unsigned char *cm = ff_cropTbl + CROP_WIDTH;  /* clamp table base */

    /* if (xoffset == 0) we don't need any filtering */
    if (xoffset == 0)
    {
        for (i = 0; i < output_height; i++)
        {
            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + src_pixels_per_line);

            dst_ptr[0] = src_ptr[0];
            dst_ptr[1] = src_ptr[1];
            dst_ptr[2] = src_ptr[2];
            dst_ptr[3] = src_ptr[3];
            dst_ptr[4] = src_ptr[4];
            dst_ptr[5] = src_ptr[5];
            dst_ptr[6] = src_ptr[6];
            dst_ptr[7] = src_ptr[7];

            /* next row... */
            src_ptr += src_pixels_per_line;
            dst_ptr += 8;
        }
    }
    else
    {
        vector3b = sub_pel_filters_inv[xoffset][2];

        if (vector3b > 65536)
        {
            /* 6 tap filter */

            vector1b = sub_pel_filters_inv[xoffset][0];
            vector2b = sub_pel_filters_inv[xoffset][1];

            for (i = output_height; i--;)
            {
                /* prefetch src_ptr data to cache memory */
                prefetch_load(src_ptr + src_pixels_per_line);

                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw %[tp1], -2(%[src_ptr]) \n\t"
                    "ulw %[tp2], 2(%[src_ptr]) \n\t"

                    /* even 1.
pixel */
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[p1], %[tp1] \n\t"
                    "preceu.ph.qbl %[p2], %[tp1] \n\t"
                    "preceu.ph.qbr %[p3], %[tp2] \n\t"
                    "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
                    "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"

                    /* even 2. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbl %[p1], %[tp2] \n\t"
                    "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
                    "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"

                    "balign %[tp2], %[tp1], 3 \n\t"
                    "extp %[Temp1], $ac3, 9 \n\t"
                    "ulw %[tn2], 3(%[src_ptr]) \n\t"

                    /* odd 1. pixel */
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[n1], %[tp2] \n\t"
                    "preceu.ph.qbl %[n2], %[tp2] \n\t"
                    "preceu.ph.qbr %[n3], %[tn2] \n\t"
                    "extp %[Temp3], $ac2, 9 \n\t"
                    "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
                    "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"

                    /* odd 2. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbl %[n1], %[tn2] \n\t"
                    "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
                    "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
                    "ulw %[tp1], 6(%[src_ptr]) \n\t"
                    "extp %[Temp2], $ac3, 9 \n\t"
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[p2], %[tp1] \n\t"
                    "extp %[Temp4], $ac2, 9 \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
                      [src_ptr] "r" (src_ptr)
                );

                /* clamp and store results */
                dst_ptr[0] = cm[Temp1];
                dst_ptr[1] = cm[Temp2];
                dst_ptr[2] = cm[Temp3];
                dst_ptr[3] = cm[Temp4];

                /* next 4 pixels; $ac3 still carries the even-3 partial sum
                 * from the previous asm block, so the two statements must
                 * stay adjacent */
                __asm__ __volatile__ (
                    /* even 3. pixel */
                    "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
                    "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"

                    /* even 4. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbl %[p4], %[tp1] \n\t"
                    "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
                    "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"

                    "ulw %[tn1], 7(%[src_ptr]) \n\t"
                    "extp %[Temp1], $ac3, 9 \n\t"

                    /* odd 3. pixel */
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[n2], %[tn1] \n\t"
                    "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
                    "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
                    "extp %[Temp3], $ac2, 9 \n\t"

                    /* odd 4. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbl %[n4], %[tn1] \n\t"
                    "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
                    "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
                    "extp %[Temp2], $ac3, 9 \n\t"
                    "extp %[Temp4], $ac2, 9 \n\t"

                    : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
                      [p4] "=&r" (p4), [n4] "=&r" (n4),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
                    : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
                      [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
                      [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
                );

                /* clamp and store results */
                dst_ptr[4] = cm[Temp1];
                dst_ptr[5] = cm[Temp2];
                dst_ptr[6] = cm[Temp3];
                dst_ptr[7] = cm[Temp4];

                src_ptr += src_pixels_per_line;
                dst_ptr += pitch;
            }
        }
        else
        {
            /* 4 tap filter */

            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

            for (i = output_height; i--;)
            {
                /* prefetch src_ptr data to cache memory */
                prefetch_load(src_ptr + src_pixels_per_line);

                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw %[tp1], -1(%[src_ptr]) \n\t"

                    /* even 1. pixel */
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[p1], %[tp1] \n\t"
                    "preceu.ph.qbl %[p2], %[tp1] \n\t"
                    "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"

                    "ulw %[tp2], 3(%[src_ptr]) \n\t"

                    /* even 2. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbr %[p3], %[tp2] \n\t"
                    "preceu.ph.qbl %[p4], %[tp2] \n\t"
                    "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
                    "extp %[Temp1], $ac3, 9 \n\t"

                    "balign %[tp2], %[tp1], 3 \n\t"

                    /* odd 1. pixel */
                    "mtlo %[vector4a], $ac3 \n\t"
                    "preceu.ph.qbr %[n1], %[tp2] \n\t"
                    "preceu.ph.qbl %[n2], %[tp2] \n\t"
                    "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
                    "extp %[Temp3], $ac2, 9 \n\t"

                    "ulw %[tn2], 4(%[src_ptr]) \n\t"

                    /* odd 2. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbr %[n3], %[tn2] \n\t"
                    "preceu.ph.qbl %[n4], %[tn2] \n\t"
                    "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
                    "ulw %[tp1], 7(%[src_ptr]) \n\t"
                    "extp %[Temp2], $ac3, 9 \n\t"
                    "mtlo %[vector4a], $ac3 \n\t"
                    "extp %[Temp4], $ac2, 9 \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
                      [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
                      [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
                );

                /* clamp and store results */
                dst_ptr[0] = cm[Temp1];
                dst_ptr[1] = cm[Temp2];
                dst_ptr[2] = cm[Temp3];
                dst_ptr[3] = cm[Temp4];

                /* next 4 pixels ($ac3 carries partial sum across blocks) */
                __asm__ __volatile__ (
                    /* even 3. pixel */
                    "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"

                    /* even 4. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbr %[p2], %[tp1] \n\t"
                    "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
                    "extp %[Temp1], $ac3, 9 \n\t"

                    /* odd 3. pixel */
                    "mtlo %[vector4a], $ac3 \n\t"
                    "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
                    "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
                    "ulw %[tn1], 8(%[src_ptr]) \n\t"
                    "extp %[Temp3], $ac2, 9 \n\t"

                    /* odd 4. pixel */
                    "mtlo %[vector4a], $ac2 \n\t"
                    "preceu.ph.qbr %[n2], %[tn1] \n\t"
                    "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
                    "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
                    "extp %[Temp2], $ac3, 9 \n\t"
                    "extp %[Temp4], $ac2, 9 \n\t"

                    : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
                    : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
                      [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
                      [n3] "r" (n3), [n4] "r" (n4)
                );

                /* clamp and store results */
                dst_ptr[4] = cm[Temp1];
                dst_ptr[5] = cm[Temp2];
                dst_ptr[6] = cm[Temp3];
                dst_ptr[7] = cm[Temp4];

                /* next row... */
                src_ptr += src_pixels_per_line;
                dst_ptr += pitch;
            }
        }
    }
}


/*
 * Horizontal first-pass 6-tap filter producing a 16-pixel-wide block.
 * Always filters (callers handle xoffset == 0 via ..._first_pass16_0).
 * Pixels are produced four at a time across four chained asm blocks;
 * $ac3 intentionally carries a partial sum from one block to the next.
 */
void vp8_filter_block2d_first_pass16_6tap
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT dst_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_height,
    int xoffset,
    int pitch
)
{
    unsigned int i;
    int Temp1, Temp2, Temp3, Temp4;

    unsigned int vector4a;
    unsigned int vector1b, vector2b, vector3b;
    unsigned int tp1, tp2, tn1, tn2;
    unsigned int p1, p2, p3, p4;
    unsigned int n1, n2, n3, n4;
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;  /* clamp table base */

    vector1b = sub_pel_filters_inv[xoffset][0];
    vector2b = sub_pel_filters_inv[xoffset][1];
    vector3b = sub_pel_filters_inv[xoffset][2];
    vector4a = 64;  /* rounding bias seeded into lo */

    for (i = output_height; i--;)
    {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs */
        __asm__ __volatile__ (
            "ulw %[tp1], -2(%[src_ptr]) \n\t"
            "ulw %[tp2], 2(%[src_ptr]) \n\t"

            /* even 1.
pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p1], %[tp1] \n\t"
            "preceu.ph.qbl %[p2], %[tp1] \n\t"
            "preceu.ph.qbr %[p3], %[tp2] \n\t"
            "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"

            /* even 2. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[p1], %[tp2] \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"

            "balign %[tp2], %[tp1], 3 \n\t"
            "ulw %[tn2], 3(%[src_ptr]) \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"

            /* odd 1. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n1], %[tp2] \n\t"
            "preceu.ph.qbl %[n2], %[tp2] \n\t"
            "preceu.ph.qbr %[n3], %[tn2] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"

            /* odd 2. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[n1], %[tn2] \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
            "ulw %[tp1], 6(%[src_ptr]) \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p2], %[tp1] \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
              [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
              [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
              [src_ptr] "r" (src_ptr)
        );

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels ($ac3 carries a partial sum from the block above) */
        __asm__ __volatile__ (
            /* even 3. pixel */
            "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"

            /* even 4. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[p4], %[tp1] \n\t"
            "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
            "ulw %[tn1], 7(%[src_ptr]) \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"

            /* odd 3. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n2], %[tn1] \n\t"
            "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"

            /* odd 4. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[n4], %[tn1] \n\t"
            "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
            "ulw %[tp2], 10(%[src_ptr]) \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p1], %[tp2] \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
              [p4] "=&r" (p4), [n4] "=&r" (n4),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [tp1] "r" (tp1), [n1] "r" (n1), [p1] "r" (p1),
              [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
              [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
        );

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        /* next 4 pixels */
        __asm__ __volatile__ (
            /* even 5. pixel */
            "dpa.w.ph $ac3, %[p2], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"

            /* even 6. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[p3], %[tp2] \n\t"
            "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[p3], %[vector3b] \n\t"

            "ulw %[tn1], 11(%[src_ptr]) \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"

            /* odd 5. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n1], %[tn1] \n\t"
            "dpa.w.ph $ac3, %[n2], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"

            /* odd 6. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[n3], %[tn1] \n\t"
            "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n1], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[n3], %[vector3b] \n\t"
            "ulw %[tp1], 14(%[src_ptr]) \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p4], %[tp1] \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
              [p4] "r" (p4), [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
        );

        /* clamp and store results */
        dst_ptr[8] = cm[Temp1];
        dst_ptr[9] = cm[Temp2];
        dst_ptr[10] = cm[Temp3];
        dst_ptr[11] = cm[Temp4];

        /* next 4 pixels */
        __asm__ __volatile__ (
            /* even 7. pixel */
            "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p3], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[p4], %[vector3b] \n\t"

            /* even 8. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[p2], %[tp1] \n\t"
            "dpa.w.ph $ac2, %[p3], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p4], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector3b] \n\t"
            "ulw %[tn1], 15(%[src_ptr]) \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"

            /* odd 7. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n4], %[tn1] \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n3], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[n4], %[vector3b] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"

            /* odd 8. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[n2], %[tn1] \n\t"
            "dpa.w.ph $ac2, %[n3], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n4], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector3b] \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            /* clamp and store results */
            "lbux %[tp1], %[Temp1](%[cm]) \n\t"
            "lbux %[tn1], %[Temp2](%[cm]) \n\t"
            "lbux %[p2], %[Temp3](%[cm]) \n\t"
            "sb %[tp1], 12(%[dst_ptr]) \n\t"
            "sb %[tn1], 13(%[dst_ptr]) \n\t"
            "lbux %[n2], %[Temp4](%[cm]) \n\t"
            "sb %[p2], 14(%[dst_ptr]) \n\t"
            "sb %[n2], 15(%[dst_ptr]) \n\t"

            : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [tp1] "r" (tp1), [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
              [n3] "r" (n3), [src_ptr] "r" (src_ptr),
              [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
        );

        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
    }
}


/*
 * Copy-only variant for xoffset == 0: moves 21 rows (7 iterations x 3 rows)
 * of 16 bytes each from src_ptr (stride src_pixels_per_line) into a
 * 16-byte-stride output buffer, 48 output bytes per iteration.
 */
void vp8_filter_block2d_first_pass16_0
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    unsigned int src_pixels_per_line
)
{
    int Temp1, Temp2, Temp3, Temp4;
    int i;

    /* prefetch the destination for storing (pref hint 1), not the source
     * (comment fixed: previous text claimed src_ptr was prefetched) */
    prefetch_store(output_ptr + 32);

    /* copy memory from src buffer to dst buffer */
    for (i = 0; i < 7; i++)
    {
        __asm__ __volatile__ (
            "ulw %[Temp1], 0(%[src_ptr]) \n\t"
            "ulw %[Temp2], 4(%[src_ptr]) \n\t"
            "ulw %[Temp3], 8(%[src_ptr]) \n\t"
            "ulw %[Temp4], 12(%[src_ptr]) \n\t"
            "sw %[Temp1], 0(%[output_ptr]) \n\t"
            "sw %[Temp2], 4(%[output_ptr]) \n\t"
            "sw %[Temp3], 8(%[output_ptr]) \n\t"
            "sw %[Temp4], 12(%[output_ptr]) \n\t"
            "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
896 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 897 [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr) 898 : [src_pixels_per_line] "r" (src_pixels_per_line), 899 [output_ptr] "r" (output_ptr) 900 ); 901 902 __asm__ __volatile__ ( 903 "ulw %[Temp1], 0(%[src_ptr]) \n\t" 904 "ulw %[Temp2], 4(%[src_ptr]) \n\t" 905 "ulw %[Temp3], 8(%[src_ptr]) \n\t" 906 "ulw %[Temp4], 12(%[src_ptr]) \n\t" 907 "sw %[Temp1], 16(%[output_ptr]) \n\t" 908 "sw %[Temp2], 20(%[output_ptr]) \n\t" 909 "sw %[Temp3], 24(%[output_ptr]) \n\t" 910 "sw %[Temp4], 28(%[output_ptr]) \n\t" 911 "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" 912 913 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 914 [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr) 915 : [src_pixels_per_line] "r" (src_pixels_per_line), 916 [output_ptr] "r" (output_ptr) 917 ); 918 919 __asm__ __volatile__ ( 920 "ulw %[Temp1], 0(%[src_ptr]) \n\t" 921 "ulw %[Temp2], 4(%[src_ptr]) \n\t" 922 "ulw %[Temp3], 8(%[src_ptr]) \n\t" 923 "ulw %[Temp4], 12(%[src_ptr]) \n\t" 924 "sw %[Temp1], 32(%[output_ptr]) \n\t" 925 "sw %[Temp2], 36(%[output_ptr]) \n\t" 926 "sw %[Temp3], 40(%[output_ptr]) \n\t" 927 "sw %[Temp4], 44(%[output_ptr]) \n\t" 928 "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t" 929 930 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 931 [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr) 932 : [src_pixels_per_line] "r" (src_pixels_per_line), 933 [output_ptr] "r" (output_ptr) 934 ); 935 936 output_ptr += 48; 937 } 938 } 939 940 941 void vp8_filter_block2d_first_pass16_4tap 942 ( 943 unsigned char *RESTRICT src_ptr, 944 unsigned char *RESTRICT output_ptr, 945 unsigned int src_pixels_per_line, 946 unsigned int output_width, 947 unsigned int output_height, 948 int xoffset, 949 int yoffset, 950 unsigned char *RESTRICT dst_ptr, 951 int pitch 952 ) 953 { 954 unsigned int i, j; 955 int Temp1, Temp2, Temp3, Temp4; 956 957 unsigned int vector4a; 958 int vector1b, 
vector2b; 959 unsigned int tp1, tp2, tp3, tn1; 960 unsigned int p1, p2, p3; 961 unsigned int n1, n2, n3; 962 unsigned char *cm = ff_cropTbl + CROP_WIDTH; 963 964 vector4a = 64; 965 966 vector1b = sub_pel_filters_inv_tap_4[xoffset][0]; 967 vector2b = sub_pel_filters_inv_tap_4[xoffset][1]; 968 969 /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */ 970 if (yoffset == 0) 971 { 972 output_height -= 5; 973 src_ptr += (src_pixels_per_line + src_pixels_per_line); 974 975 for (i = output_height; i--;) 976 { 977 __asm__ __volatile__ ( 978 "ulw %[tp3], -1(%[src_ptr]) \n\t" 979 : [tp3] "=&r" (tp3) 980 : [src_ptr] "r" (src_ptr) 981 ); 982 983 /* processing 4 adjacent pixels */ 984 for (j = 0; j < 16; j += 4) 985 { 986 /* apply filter with vectors pairs */ 987 __asm__ __volatile__ ( 988 "ulw %[tp2], 3(%[src_ptr]) \n\t" 989 "move %[tp1], %[tp3] \n\t" 990 991 /* even 1. pixel */ 992 "mtlo %[vector4a], $ac3 \n\t" 993 "mthi $0, $ac3 \n\t" 994 "move %[tp3], %[tp2] \n\t" 995 "preceu.ph.qbr %[p1], %[tp1] \n\t" 996 "preceu.ph.qbl %[p2], %[tp1] \n\t" 997 "preceu.ph.qbr %[p3], %[tp2] \n\t" 998 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 999 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 1000 1001 /* even 2. pixel */ 1002 "mtlo %[vector4a], $ac2 \n\t" 1003 "mthi $0, $ac2 \n\t" 1004 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 1005 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 1006 "extr.w %[Temp1], $ac3, 7 \n\t" 1007 1008 /* odd 1. pixel */ 1009 "ulw %[tn1], 4(%[src_ptr]) \n\t" 1010 "balign %[tp2], %[tp1], 3 \n\t" 1011 "mtlo %[vector4a], $ac3 \n\t" 1012 "mthi $0, $ac3 \n\t" 1013 "preceu.ph.qbr %[n1], %[tp2] \n\t" 1014 "preceu.ph.qbl %[n2], %[tp2] \n\t" 1015 "preceu.ph.qbr %[n3], %[tn1] \n\t" 1016 "extr.w %[Temp3], $ac2, 7 \n\t" 1017 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" 1018 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" 1019 1020 /* odd 2. 
pixel */ 1021 "mtlo %[vector4a], $ac2 \n\t" 1022 "mthi $0, $ac2 \n\t" 1023 "extr.w %[Temp2], $ac3, 7 \n\t" 1024 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" 1025 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" 1026 "extr.w %[Temp4], $ac2, 7 \n\t" 1027 1028 /* clamp and store results */ 1029 "lbux %[tp1], %[Temp1](%[cm]) \n\t" 1030 "lbux %[tn1], %[Temp2](%[cm]) \n\t" 1031 "lbux %[tp2], %[Temp3](%[cm]) \n\t" 1032 "sb %[tp1], 0(%[dst_ptr]) \n\t" 1033 "sb %[tn1], 1(%[dst_ptr]) \n\t" 1034 "lbux %[n2], %[Temp4](%[cm]) \n\t" 1035 "sb %[tp2], 2(%[dst_ptr]) \n\t" 1036 "sb %[n2], 3(%[dst_ptr]) \n\t" 1037 1038 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), 1039 [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2), 1040 [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), 1041 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3), 1042 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) 1043 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 1044 [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr), 1045 [src_ptr] "r" (src_ptr) 1046 ); 1047 1048 src_ptr += 4; 1049 } 1050 1051 /* Next row... */ 1052 src_ptr += src_pixels_per_line - 16; 1053 dst_ptr += pitch; 1054 } 1055 } 1056 else 1057 { 1058 for (i = output_height; i--;) 1059 { 1060 /* processing 4 adjacent pixels */ 1061 for (j = 0; j < 16; j += 4) 1062 { 1063 /* apply filter with vectors pairs */ 1064 __asm__ __volatile__ ( 1065 "ulw %[tp1], -1(%[src_ptr]) \n\t" 1066 "ulw %[tp2], 3(%[src_ptr]) \n\t" 1067 1068 /* even 1. pixel */ 1069 "mtlo %[vector4a], $ac3 \n\t" 1070 "mthi $0, $ac3 \n\t" 1071 "preceu.ph.qbr %[p1], %[tp1] \n\t" 1072 "preceu.ph.qbl %[p2], %[tp1] \n\t" 1073 "preceu.ph.qbr %[p3], %[tp2] \n\t" 1074 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 1075 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 1076 1077 /* even 2. 
pixel */ 1078 "mtlo %[vector4a], $ac2 \n\t" 1079 "mthi $0, $ac2 \n\t" 1080 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 1081 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 1082 "extr.w %[Temp1], $ac3, 7 \n\t" 1083 1084 /* odd 1. pixel */ 1085 "ulw %[tn1], 4(%[src_ptr]) \n\t" 1086 "balign %[tp2], %[tp1], 3 \n\t" 1087 "mtlo %[vector4a], $ac3 \n\t" 1088 "mthi $0, $ac3 \n\t" 1089 "preceu.ph.qbr %[n1], %[tp2] \n\t" 1090 "preceu.ph.qbl %[n2], %[tp2] \n\t" 1091 "preceu.ph.qbr %[n3], %[tn1] \n\t" 1092 "extr.w %[Temp3], $ac2, 7 \n\t" 1093 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" 1094 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" 1095 1096 /* odd 2. pixel */ 1097 "mtlo %[vector4a], $ac2 \n\t" 1098 "mthi $0, $ac2 \n\t" 1099 "extr.w %[Temp2], $ac3, 7 \n\t" 1100 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" 1101 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" 1102 "extr.w %[Temp4], $ac2, 7 \n\t" 1103 1104 /* clamp and store results */ 1105 "lbux %[tp1], %[Temp1](%[cm]) \n\t" 1106 "lbux %[tn1], %[Temp2](%[cm]) \n\t" 1107 "lbux %[tp2], %[Temp3](%[cm]) \n\t" 1108 "sb %[tp1], 0(%[output_ptr]) \n\t" 1109 "sb %[tn1], 1(%[output_ptr]) \n\t" 1110 "lbux %[n2], %[Temp4](%[cm]) \n\t" 1111 "sb %[tp2], 2(%[output_ptr]) \n\t" 1112 "sb %[n2], 3(%[output_ptr]) \n\t" 1113 1114 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), 1115 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), 1116 [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), 1117 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 1118 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) 1119 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 1120 [vector4a] "r" (vector4a), [cm] "r" (cm), 1121 [output_ptr] "r" (output_ptr), [src_ptr] "r" (src_ptr) 1122 ); 1123 1124 src_ptr += 4; 1125 } 1126 1127 /* next row... 
*/
            src_ptr += src_pixels_per_line;
            output_ptr += output_width;
        }
    }
}


/* Vertical (second) pass of the 4-wide sub-pixel predictor, DSPR2 version.
 * src_ptr points into the intermediate buffer written by the first
 * (horizontal) pass; the vertical taps are read at offsets that are
 * multiples of 4 (-8, -4, 0, 4, 8, 12), so consecutive rows of that
 * buffer are 4 bytes apart.  yoffset selects the vertical filter from
 * sub_pel_filterss; a zero first coefficient pair means only 4 taps are
 * needed.  Each loop iteration produces two output rows, so the i = 2
 * loop emits 4 rows of 4 clamped pixels into output_ptr (row stride
 * output_pitch). */
void vp8_filter_block2d_second_pass4
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    int output_pitch,
    int yoffset
)
{
    unsigned int i;

    int Temp1, Temp2, Temp3, Temp4;
    unsigned int vector1b, vector2b, vector3b, vector4a;

    /* the six vertical taps feeding the pixel currently being filtered */
    unsigned char src_ptr_l2;
    unsigned char src_ptr_l1;
    unsigned char src_ptr_0;
    unsigned char src_ptr_r1;
    unsigned char src_ptr_r2;
    unsigned char src_ptr_r3;

    /* clamp table: cm[x] clips x to [0, 255] (built in dsputil_static_init) */
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    /* rounding value, pre-loaded into each DSP accumulator via mtlo */
    vector4a = 64;

    /* load filter coefficients (each entry packs a pair of taps) */
    vector1b = sub_pel_filterss[yoffset][0];
    vector2b = sub_pel_filterss[yoffset][2];
    vector3b = sub_pel_filterss[yoffset][1];

    if (vector1b)
    {
        /* 6 tap filter */

        for (i = 2; i--;)
        {
            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr);

            /* do not allow compiler to reorder instructions */
            __asm__ __volatile__ (
                ".set noreorder \n\t"
                :
                :
            );

            /* apply filter with vectors pairs; accumulators $ac0..$ac3 are
               software-pipelined so the loads for one pixel overlap the
               extraction (extp) of the previous one */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 14(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            output_ptr += output_pitch;

            /* apply filter with vectors pairs; second output row, taps
               shifted one intermediate row (4 bytes) down */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            /* advance two intermediate rows (2 * 4 bytes) per iteration */
            src_ptr += 8;
            output_ptr += output_pitch;
        }
    }
    else
    {
        /* 4 tap filter */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr);

        for (i = 2; i--;)
        {
            /* do not allow compiler to reorder instructions */
            __asm__ __volatile__ (
                ".set noreorder \n\t"
                :
                :
            );

            /* apply filter with vectors pairs; only the middle two tap
               pairs (vector2b/vector3b) contribute in the 4-tap case */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            output_ptr += output_pitch;

            /* apply filter with vectors pairs; second output row */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
                "lbu
%[src_ptr_r1], 10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            src_ptr += 8;
            output_ptr += output_pitch;
        }
    }
}


/* Vertical (second) pass over an 8-pixel-wide block, DSPR2 version.
 * src_ptr points into the first-pass intermediate buffer; the vertical
 * taps are read at offsets in multiples of 8 (-16, -8, 0, 8, 16, 24), so
 * rows of that buffer are 8 bytes apart.  yoffset selects the vertical
 * filter from sub_pel_filterss (zero first pair => 4-tap path).  Writes
 * output_height rows of 8 clamped pixels, row stride output_pitch.
 * NOTE(review): output_width is not referenced in this function --
 * presumably kept for signature parity with the other passes; confirm
 * against callers. */
void vp8_filter_block2d_second_pass_8
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    int output_pitch,
    unsigned int output_height,
    unsigned int output_width,
    unsigned int yoffset
)
{
    unsigned int i;

    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
    unsigned int vector1b, vector2b, vector3b, vector4a;

    /* the six vertical taps feeding the pixel currently being filtered */
    unsigned char src_ptr_l2;
    unsigned char src_ptr_l1;
    unsigned char src_ptr_0;
    unsigned char src_ptr_r1;
    unsigned char src_ptr_r2;
    unsigned char src_ptr_r3;

    /* clamp table: cm[x] clips x to [0, 255] (built in dsputil_static_init) */
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    /* rounding value, pre-loaded into each DSP accumulator via mtlo */
    vector4a = 64;

    /* load filter coefficients (each entry packs a pair of taps) */
    vector1b = sub_pel_filterss[yoffset][0];
    vector2b = sub_pel_filterss[yoffset][2];
    vector3b = sub_pel_filterss[yoffset][1];

    if (vector1b)
    {
        /* 6 tap filter */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr);

        for (i = output_height; i--;)
        {
            /* apply filter with vectors pairs; pixels 0..3 of the row,
               results pipelined across accumulators $ac0..$ac3 */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* apply filter with vectors pairs; pixels 4..7 of the row
               ($ac1 still holds pixel 3, extracted as Temp4 below) */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp5], $ac2, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp6], $ac3, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp7], $ac0, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp8], $ac1, 9 \n\t"

                : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
                  [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];
            output_ptr[4] = cm[Temp5];
            output_ptr[5] = cm[Temp6];
            output_ptr[6] = cm[Temp7];
            output_ptr[7] = cm[Temp8];

            /* advance one intermediate row (8 bytes) */
            src_ptr += 8;
            output_ptr += output_pitch;
        }
    }
    else
    {
        /* 4 tap filter */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr);

        for (i = output_height; i--;)
        {
            /* pixel 0: start accumulation on $ac2 */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
            );

            /* pixel 1 on $ac3, extracting pixel 0 from $ac2 */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                : [Temp1] "=r" (Temp1),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
            );

            /* remaining pixels: taps loaded in C, fed to the asm as inputs */
            src_ptr_l1 = src_ptr[-6];
            src_ptr_0 = src_ptr[2];
            src_ptr_r1 = src_ptr[10];
            src_ptr_r2 = src_ptr[18];

            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac0 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                : [Temp2] "=r" (Temp2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-5];
            src_ptr_0 = src_ptr[3];
            src_ptr_r1 = src_ptr[11];
            src_ptr_r2 = src_ptr[19];

            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac1 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"

                : [Temp3] "=r" (Temp3)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-4];
            src_ptr_0 = src_ptr[4];
            src_ptr_r1 = src_ptr[12];
            src_ptr_r2 = src_ptr[20];

            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp4] "=r" (Temp4)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-3];
            src_ptr_0 = src_ptr[5];
            src_ptr_r1 = src_ptr[13];
            src_ptr_r2 = src_ptr[21];

            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac3 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp5], $ac2, 9 \n\t"

                : [Temp5] "=&r" (Temp5)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-2];
            src_ptr_0 = src_ptr[6];
            src_ptr_r1 = src_ptr[14];
            src_ptr_r2 = src_ptr[22];

            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac0 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp6], $ac3, 9 \n\t"

                : [Temp6] "=r" (Temp6)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-1];
            src_ptr_0 = src_ptr[7];
            src_ptr_r1 = src_ptr[15];
            src_ptr_r2 = src_ptr[23];

            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac1 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp7], $ac0, 9 \n\t"
                "extp %[Temp8], $ac1, 9 \n\t"

                : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];
            output_ptr[4] = cm[Temp5];
            output_ptr[5] = cm[Temp6];
            output_ptr[6] = cm[Temp7];
            output_ptr[7] = cm[Temp8];

            /* advance one intermediate row (8 bytes) */
            src_ptr += 8;
            output_ptr += output_pitch;
        }
    }
}


void vp8_filter_block2d_second_pass161
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    int output_pitch,
    const unsigned short *vp8_filter
)
{
    unsigned int i, j;

    int Temp1, Temp2,
Temp3, Temp4, Temp5, Temp6, Temp7, Temp8; 1895 unsigned int vector4a; 1896 unsigned int vector1b, vector2b, vector3b; 1897 1898 unsigned char src_ptr_l2; 1899 unsigned char src_ptr_l1; 1900 unsigned char src_ptr_0; 1901 unsigned char src_ptr_r1; 1902 unsigned char src_ptr_r2; 1903 unsigned char src_ptr_r3; 1904 unsigned char *cm = ff_cropTbl + CROP_WIDTH; 1905 1906 vector4a = 64; 1907 1908 vector1b = vp8_filter[0]; 1909 vector2b = vp8_filter[2]; 1910 vector3b = vp8_filter[1]; 1911 1912 if (vector1b == 0) 1913 { 1914 /* 4 tap filter */ 1915 1916 /* prefetch src_ptr data to cache memory */ 1917 prefetch_load(src_ptr + 16); 1918 1919 for (i = 16; i--;) 1920 { 1921 /* unrolling for loop */ 1922 for (j = 0; j < 16; j += 8) 1923 { 1924 /* apply filter with vectors pairs */ 1925 __asm__ __volatile__ ( 1926 "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t" 1927 "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" 1928 "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t" 1929 "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t" 1930 "mtlo %[vector4a], $ac2 \n\t" 1931 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 1932 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 1933 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 1934 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 1935 1936 "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t" 1937 "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" 1938 "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t" 1939 "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t" 1940 "mtlo %[vector4a], $ac3 \n\t" 1941 "extp %[Temp1], $ac2, 9 \n\t" 1942 1943 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 1944 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 1945 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 1946 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 1947 1948 "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t" 1949 "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" 1950 "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t" 1951 "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t" 1952 "mtlo %[vector4a], $ac1 \n\t" 1953 "extp %[Temp2], $ac3, 9 \n\t" 1954 1955 "append 
%[src_ptr_0], %[src_ptr_r1], 8 \n\t" 1956 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 1957 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 1958 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 1959 1960 "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t" 1961 "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" 1962 "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t" 1963 "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t" 1964 "mtlo %[vector4a], $ac3 \n\t" 1965 "extp %[Temp3], $ac1, 9 \n\t" 1966 1967 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 1968 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 1969 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 1970 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 1971 1972 "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t" 1973 "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" 1974 "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t" 1975 "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t" 1976 "mtlo %[vector4a], $ac2 \n\t" 1977 "extp %[Temp4], $ac3, 9 \n\t" 1978 1979 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 1980 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 1981 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 1982 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 1983 1984 "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t" 1985 "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" 1986 "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t" 1987 "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t" 1988 "mtlo %[vector4a], $ac3 \n\t" 1989 "extp %[Temp5], $ac2, 9 \n\t" 1990 1991 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 1992 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 1993 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 1994 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 1995 1996 "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t" 1997 "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" 1998 "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t" 1999 "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t" 2000 "mtlo %[vector4a], $ac1 \n\t" 2001 "extp %[Temp6], $ac3, 9 \n\t" 2002 2003 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2004 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2005 
"dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2006 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2007 2008 "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t" 2009 "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" 2010 "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t" 2011 "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t" 2012 "mtlo %[vector4a], $ac3 \n\t" 2013 "extp %[Temp7], $ac1, 9 \n\t" 2014 2015 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2016 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2017 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2018 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2019 "extp %[Temp8], $ac3, 9 \n\t" 2020 2021 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 2022 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), 2023 [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6), 2024 [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8), 2025 [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), 2026 [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2) 2027 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b), 2028 [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr) 2029 ); 2030 2031 /* clamp and store results */ 2032 output_ptr[j] = cm[Temp1]; 2033 output_ptr[j + 1] = cm[Temp2]; 2034 output_ptr[j + 2] = cm[Temp3]; 2035 output_ptr[j + 3] = cm[Temp4]; 2036 output_ptr[j + 4] = cm[Temp5]; 2037 output_ptr[j + 5] = cm[Temp6]; 2038 output_ptr[j + 6] = cm[Temp7]; 2039 output_ptr[j + 7] = cm[Temp8]; 2040 2041 src_ptr += 8; 2042 } 2043 2044 output_ptr += output_pitch; 2045 } 2046 } 2047 else 2048 { 2049 /* 4 tap filter */ 2050 2051 /* prefetch src_ptr data to cache memory */ 2052 prefetch_load(src_ptr + 16); 2053 2054 /* unroll for loop */ 2055 for (i = 16; i--;) 2056 { 2057 /* apply filter with vectors pairs */ 2058 __asm__ __volatile__ ( 2059 "lbu %[src_ptr_l2], -32(%[src_ptr]) \n\t" 2060 "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t" 2061 "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" 2062 "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t" 2063 "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t" 2064 "lbu 
%[src_ptr_r3], 48(%[src_ptr]) \n\t" 2065 "mtlo %[vector4a], $ac2 \n\t" 2066 2067 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2068 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2069 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2070 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" 2071 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 2072 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 2073 2074 "lbu %[src_ptr_l2], -31(%[src_ptr]) \n\t" 2075 "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t" 2076 "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" 2077 "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t" 2078 "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t" 2079 "lbu %[src_ptr_r3], 49(%[src_ptr]) \n\t" 2080 "mtlo %[vector4a], $ac0 \n\t" 2081 "extp %[Temp1], $ac2, 9 \n\t" 2082 2083 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2084 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2085 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2086 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" 2087 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" 2088 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" 2089 2090 "lbu %[src_ptr_l2], -30(%[src_ptr]) \n\t" 2091 "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t" 2092 "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" 2093 "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t" 2094 "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t" 2095 "lbu %[src_ptr_r3], 50(%[src_ptr]) \n\t" 2096 "mtlo %[vector4a], $ac1 \n\t" 2097 "extp %[Temp2], $ac0, 9 \n\t" 2098 2099 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2100 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2101 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2102 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" 2103 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2104 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2105 2106 "lbu %[src_ptr_l2], -29(%[src_ptr]) \n\t" 2107 "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t" 2108 "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" 2109 "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t" 2110 "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t" 2111 "lbu 
%[src_ptr_r3], 51(%[src_ptr]) \n\t" 2112 "mtlo %[vector4a], $ac3 \n\t" 2113 "extp %[Temp3], $ac1, 9 \n\t" 2114 2115 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2116 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2117 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2118 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" 2119 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2120 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2121 2122 "lbu %[src_ptr_l2], -28(%[src_ptr]) \n\t" 2123 "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t" 2124 "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" 2125 "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t" 2126 "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t" 2127 "lbu %[src_ptr_r3], 52(%[src_ptr]) \n\t" 2128 "mtlo %[vector4a], $ac2 \n\t" 2129 "extp %[Temp4], $ac3, 9 \n\t" 2130 2131 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2132 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2133 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2134 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" 2135 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 2136 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 2137 2138 "lbu %[src_ptr_l2], -27(%[src_ptr]) \n\t" 2139 "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t" 2140 "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" 2141 "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t" 2142 "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t" 2143 "lbu %[src_ptr_r3], 53(%[src_ptr]) \n\t" 2144 "mtlo %[vector4a], $ac0 \n\t" 2145 "extp %[Temp5], $ac2, 9 \n\t" 2146 2147 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2148 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2149 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2150 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" 2151 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" 2152 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" 2153 2154 "lbu %[src_ptr_l2], -26(%[src_ptr]) \n\t" 2155 "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t" 2156 "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" 2157 "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t" 2158 "lbu %[src_ptr_r2], 
38(%[src_ptr]) \n\t" 2159 "lbu %[src_ptr_r3], 54(%[src_ptr]) \n\t" 2160 "mtlo %[vector4a], $ac1 \n\t" 2161 "extp %[Temp6], $ac0, 9 \n\t" 2162 2163 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2164 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2165 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2166 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" 2167 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2168 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2169 2170 "lbu %[src_ptr_l2], -25(%[src_ptr]) \n\t" 2171 "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t" 2172 "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" 2173 "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t" 2174 "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t" 2175 "lbu %[src_ptr_r3], 55(%[src_ptr]) \n\t" 2176 "mtlo %[vector4a], $ac3 \n\t" 2177 "extp %[Temp7], $ac1, 9 \n\t" 2178 2179 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2180 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2181 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2182 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" 2183 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2184 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2185 "extp %[Temp8], $ac3, 9 \n\t" 2186 2187 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 2188 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), 2189 [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6), 2190 [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8), 2191 [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), 2192 [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2), 2193 [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3) 2194 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 2195 [vector3b] "r" (vector3b), [vector4a] "r" (vector4a), 2196 [src_ptr] "r" (src_ptr) 2197 ); 2198 2199 /* clamp and store results */ 2200 output_ptr[0] = cm[Temp1]; 2201 output_ptr[1] = cm[Temp2]; 2202 output_ptr[2] = cm[Temp3]; 2203 output_ptr[3] = cm[Temp4]; 2204 output_ptr[4] = cm[Temp5]; 2205 output_ptr[5] = cm[Temp6]; 2206 output_ptr[6] = 
cm[Temp7]; 2207 output_ptr[7] = cm[Temp8]; 2208 2209 /* apply filter with vectors pairs */ 2210 __asm__ __volatile__ ( 2211 "lbu %[src_ptr_l2], -24(%[src_ptr]) \n\t" 2212 "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t" 2213 "lbu %[src_ptr_0], 8(%[src_ptr]) \n\t" 2214 "lbu %[src_ptr_r1], 24(%[src_ptr]) \n\t" 2215 "lbu %[src_ptr_r2], 40(%[src_ptr]) \n\t" 2216 "lbu %[src_ptr_r3], 56(%[src_ptr]) \n\t" 2217 "mtlo %[vector4a], $ac2 \n\t" 2218 2219 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2220 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2221 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2222 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" 2223 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 2224 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 2225 2226 "lbu %[src_ptr_l2], -23(%[src_ptr]) \n\t" 2227 "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t" 2228 "lbu %[src_ptr_0], 9(%[src_ptr]) \n\t" 2229 "lbu %[src_ptr_r1], 25(%[src_ptr]) \n\t" 2230 "lbu %[src_ptr_r2], 41(%[src_ptr]) \n\t" 2231 "lbu %[src_ptr_r3], 57(%[src_ptr]) \n\t" 2232 "mtlo %[vector4a], $ac0 \n\t" 2233 "extp %[Temp1], $ac2, 9 \n\t" 2234 2235 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2236 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2237 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2238 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" 2239 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" 2240 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" 2241 2242 "lbu %[src_ptr_l2], -22(%[src_ptr]) \n\t" 2243 "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t" 2244 "lbu %[src_ptr_0], 10(%[src_ptr]) \n\t" 2245 "lbu %[src_ptr_r1], 26(%[src_ptr]) \n\t" 2246 "lbu %[src_ptr_r2], 42(%[src_ptr]) \n\t" 2247 "lbu %[src_ptr_r3], 58(%[src_ptr]) \n\t" 2248 "mtlo %[vector4a], $ac1 \n\t" 2249 "extp %[Temp2], $ac0, 9 \n\t" 2250 2251 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2252 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2253 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2254 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" 2255 
"dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2256 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2257 2258 "lbu %[src_ptr_l2], -21(%[src_ptr]) \n\t" 2259 "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t" 2260 "lbu %[src_ptr_0], 11(%[src_ptr]) \n\t" 2261 "lbu %[src_ptr_r1], 27(%[src_ptr]) \n\t" 2262 "lbu %[src_ptr_r2], 43(%[src_ptr]) \n\t" 2263 "lbu %[src_ptr_r3], 59(%[src_ptr]) \n\t" 2264 "mtlo %[vector4a], $ac3 \n\t" 2265 "extp %[Temp3], $ac1, 9 \n\t" 2266 2267 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2268 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2269 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2270 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" 2271 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2272 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2273 2274 "lbu %[src_ptr_l2], -20(%[src_ptr]) \n\t" 2275 "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" 2276 "lbu %[src_ptr_0], 12(%[src_ptr]) \n\t" 2277 "lbu %[src_ptr_r1], 28(%[src_ptr]) \n\t" 2278 "lbu %[src_ptr_r2], 44(%[src_ptr]) \n\t" 2279 "lbu %[src_ptr_r3], 60(%[src_ptr]) \n\t" 2280 "mtlo %[vector4a], $ac2 \n\t" 2281 "extp %[Temp4], $ac3, 9 \n\t" 2282 2283 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2284 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2285 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2286 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" 2287 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 2288 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 2289 2290 "lbu %[src_ptr_l2], -19(%[src_ptr]) \n\t" 2291 "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" 2292 "lbu %[src_ptr_0], 13(%[src_ptr]) \n\t" 2293 "lbu %[src_ptr_r1], 29(%[src_ptr]) \n\t" 2294 "lbu %[src_ptr_r2], 45(%[src_ptr]) \n\t" 2295 "lbu %[src_ptr_r3], 61(%[src_ptr]) \n\t" 2296 "mtlo %[vector4a], $ac0 \n\t" 2297 "extp %[Temp5], $ac2, 9 \n\t" 2298 2299 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2300 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2301 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2302 "dpau.h.qbr $ac0, 
%[src_ptr_l2], %[vector1b] \n\t" 2303 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" 2304 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" 2305 2306 "lbu %[src_ptr_l2], -18(%[src_ptr]) \n\t" 2307 "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" 2308 "lbu %[src_ptr_0], 14(%[src_ptr]) \n\t" 2309 "lbu %[src_ptr_r1], 30(%[src_ptr]) \n\t" 2310 "lbu %[src_ptr_r2], 46(%[src_ptr]) \n\t" 2311 "lbu %[src_ptr_r3], 62(%[src_ptr]) \n\t" 2312 "mtlo %[vector4a], $ac1 \n\t" 2313 "extp %[Temp6], $ac0, 9 \n\t" 2314 2315 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2316 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2317 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2318 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" 2319 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2320 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2321 2322 "lbu %[src_ptr_l2], -17(%[src_ptr]) \n\t" 2323 "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" 2324 "lbu %[src_ptr_0], 15(%[src_ptr]) \n\t" 2325 "lbu %[src_ptr_r1], 31(%[src_ptr]) \n\t" 2326 "lbu %[src_ptr_r2], 47(%[src_ptr]) \n\t" 2327 "lbu %[src_ptr_r3], 63(%[src_ptr]) \n\t" 2328 "mtlo %[vector4a], $ac3 \n\t" 2329 "extp %[Temp7], $ac1, 9 \n\t" 2330 2331 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2332 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2333 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2334 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" 2335 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2336 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2337 "extp %[Temp8], $ac3, 9 \n\t" 2338 2339 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 2340 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), 2341 [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6), 2342 [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8), 2343 [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0), 2344 [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2), 2345 [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3) 2346 : [vector1b] "r" (vector1b), [vector2b] "r" 
(vector2b),
              [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
              [src_ptr] "r" (src_ptr)
            );

            /* clamp results to [0, 255] via the crop table (cm) and store
             * the second half of the 16-pixel row */
            src_ptr += 16;
            output_ptr[8] = cm[Temp1];
            output_ptr[9] = cm[Temp2];
            output_ptr[10] = cm[Temp3];
            output_ptr[11] = cm[Temp4];
            output_ptr[12] = cm[Temp5];
            output_ptr[13] = cm[Temp6];
            output_ptr[14] = cm[Temp7];
            output_ptr[15] = cm[Temp8];

            output_ptr += output_pitch;
        }
    }
}


/* Six-tap sub-pel prediction of a 4x4 block (MIPS DSPR2).
 *
 * src_ptr             : source pixels (unaligned reads allowed)
 * src_pixels_per_line : source stride in bytes
 * xoffset, yoffset    : 1/8-pel sub-pixel offsets (0..7) selecting the filter
 * dst_ptr / dst_pitch : destination block and its stride
 *
 * When yoffset != 0 the block is filtered horizontally into the FData
 * scratch buffer (9 rows = 4 output rows plus the extra context rows the
 * vertical filter taps into; the source pointer is backed up by two rows),
 * then filtered vertically into dst.  When yoffset == 0 the horizontal
 * pass writes straight to dst.
 */
void vp8_sixtap_predict4x4_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{
    unsigned char FData[9 * 4]; /* temp data buffer used in filtering */
    unsigned int pos = 16;

    /* bit position for extract from acc (extp in the filter kernels) */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
    );

    if (yoffset)
    {
        /* First filter 1-D horizontally... */
        vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
                                        src_pixels_per_line, 9, xoffset, 4);
        /* then filter vertically...  FData + 8 skips the first two 4-byte rows */
        vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
    }
    else
        /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
        vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
                                        4, xoffset, dst_pitch);
}


/* Six-tap sub-pel prediction of an 8x8 block (MIPS DSPR2).
 * Same parameter contract as vp8_sixtap_predict4x4_dspr2; the scratch
 * buffer holds 13 rows of 8 (8 output rows plus vertical-filter context).
 */
void vp8_sixtap_predict8x8_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{

    unsigned char FData[13 * 8]; /* temp data buffer used in filtering */
    unsigned int pos, Temp1, Temp2;

    pos = 16;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
    );

    if (yoffset)
    {

        src_ptr = src_ptr - (2 * src_pixels_per_line);

        if (xoffset)
            /* filter 1-D horizontally... */
            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                                13, xoffset, 8);

        else
        {
            /* xoffset == 0: no horizontal filtering needed, just copy
             * 13 rows of 8 pixels into FData (unaligned loads, aligned
             * stores into the local buffer) */

            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + 2 * src_pixels_per_line);

            /* NOTE(review): the asm advances %[src_ptr] although it is
             * declared as a read-only input operand; tolerated because
             * src_ptr is dead after the asm, but an in/out ("+r") operand
             * would be strictly correct.  Same pattern in the copy loops
             * below. */
            __asm__ __volatile__ (
                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   0(%[FData])                         \n\t"
                "sw     %[Temp2],   4(%[FData])                         \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   8(%[FData])                         \n\t"
                "sw     %[Temp2],   12(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   16(%[FData])                        \n\t"
                "sw     %[Temp2],   20(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   24(%[FData])                        \n\t"
                "sw     %[Temp2],   28(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   32(%[FData])                        \n\t"
                "sw     %[Temp2],   36(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   40(%[FData])                        \n\t"
                "sw     %[Temp2],   44(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   48(%[FData])                        \n\t"
                "sw     %[Temp2],   52(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   56(%[FData])                        \n\t"
                "sw     %[Temp2],   60(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   64(%[FData])                        \n\t"
                "sw     %[Temp2],   68(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   72(%[FData])                        \n\t"
                "sw     %[Temp2],   76(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   80(%[FData])                        \n\t"
                "sw     %[Temp2],   84(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   88(%[FData])                        \n\t"
                "sw     %[Temp2],   92(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   96(%[FData])                        \n\t"
                "sw     %[Temp2],   100(%[FData])                       \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
                  [src_pixels_per_line] "r" (src_pixels_per_line)
            );
        }

        /* filter vertically...  FData + 16 skips the first two 8-byte rows */
        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
    }

    /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
    else
    {
        if (xoffset)
            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                                8, xoffset, dst_pitch);

        else
        {
            /* xoffset == 0 && yoffset == 0: plain 8x8 copy from src buffer
             * to dst buffer (unaligned loads, word stores) */
            __asm__ __volatile__ (
                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   0(%[dst_ptr])                       \n\t"
                "sw     %[Temp2],   4(%[dst_ptr])                       \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   8(%[dst_ptr])                       \n\t"
                "sw     %[Temp2],   12(%[dst_ptr])                      \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   16(%[dst_ptr])                      \n\t"
                "sw     %[Temp2],   20(%[dst_ptr])                      \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   24(%[dst_ptr])                      \n\t"
                "sw     %[Temp2],   28(%[dst_ptr])                      \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   32(%[dst_ptr])                      \n\t"
                "sw     %[Temp2],   36(%[dst_ptr])                      \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   40(%[dst_ptr])                      \n\t"
                "sw     %[Temp2],   44(%[dst_ptr])                      \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   48(%[dst_ptr])                      \n\t"
                "sw     %[Temp2],   52(%[dst_ptr])                      \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   56(%[dst_ptr])                      \n\t"
                "sw     %[Temp2],   60(%[dst_ptr])                      \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
                  [src_pixels_per_line] "r" (src_pixels_per_line)
            );
        }
    }
}


/* Six-tap sub-pel prediction of an 8x4 block (MIPS DSPR2).
 * Same parameter contract as vp8_sixtap_predict4x4_dspr2; the scratch
 * buffer holds 9 rows of 8 (4 output rows plus vertical-filter context).
 */
void vp8_sixtap_predict8x4_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{
    unsigned char FData[9 * 8]; /* temp data buffer used in filtering */
    unsigned int pos, Temp1, Temp2;

    pos = 16;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
    );

    if (yoffset)
    {

        src_ptr = src_ptr - (2 * src_pixels_per_line);

        if (xoffset)
            /* filter 1-D horizontally... */
            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                                9, xoffset, 8);

        else
        {
            /* xoffset == 0: copy 9 rows of 8 pixels into FData
             * (unaligned loads, aligned stores into the local buffer) */

            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + 2 * src_pixels_per_line);

            __asm__ __volatile__ (
                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   0(%[FData])                         \n\t"
                "sw     %[Temp2],   4(%[FData])                         \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   8(%[FData])                         \n\t"
                "sw     %[Temp2],   12(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   16(%[FData])                        \n\t"
                "sw     %[Temp2],   20(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   24(%[FData])                        \n\t"
                "sw     %[Temp2],   28(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   32(%[FData])                        \n\t"
                "sw     %[Temp2],   36(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   40(%[FData])                        \n\t"
                "sw     %[Temp2],   44(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   48(%[FData])                        \n\t"
                "sw     %[Temp2],   52(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   56(%[FData])                        \n\t"
                "sw     %[Temp2],   60(%[FData])                        \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   64(%[FData])                        \n\t"
                "sw     %[Temp2],   68(%[FData])                        \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
                  [src_pixels_per_line] "r" (src_pixels_per_line)
            );
        }

        /* filter vertically...  FData + 16 skips the first two 8-byte rows */
        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
    }

    /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
    else
    {
        if (xoffset)
            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                                4, xoffset, dst_pitch);

        else
        {
            /* xoffset == 0 && yoffset == 0: plain 8x4 copy from src buffer
             * to dst buffer */
            __asm__ __volatile__ (
                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   0(%[dst_ptr])                       \n\t"
                "sw     %[Temp2],   4(%[dst_ptr])                       \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   8(%[dst_ptr])                       \n\t"
                "sw     %[Temp2],   12(%[dst_ptr])                      \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   16(%[dst_ptr])                      \n\t"
                "sw     %[Temp2],   20(%[dst_ptr])                      \n\t"
                "addu   %[src_ptr], %[src_ptr],  %[src_pixels_per_line] \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                       \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                       \n\t"
                "sw     %[Temp1],   24(%[dst_ptr])                      \n\t"
                "sw     %[Temp2],   28(%[dst_ptr])                      \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
                  [src_pixels_per_line] "r" (src_pixels_per_line)
            );
        }
    }
}


/* Six-tap sub-pel prediction of a 16x16 block (MIPS DSPR2).
 * Same parameter contract as vp8_sixtap_predict4x4_dspr2.  The horizontal
 * pass is dispatched on xoffset: even non-zero offsets use the 6-tap kernel,
 * odd offsets use the reduced 4-tap kernel, and 0 is a straight copy.  The
 * vertical pass uses the packed 16-bit coefficient pairs in
 * sub_pel_filterss[yoffset].
 */
void vp8_sixtap_predict16x16_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{
    const unsigned short *VFilter;
    unsigned char FData[21 * 16]; /* temp data buffer used in filtering */
    unsigned int pos;

    VFilter = sub_pel_filterss[yoffset];

    pos = 16;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
    );

    if (yoffset)
    {

        src_ptr = src_ptr - (2 * src_pixels_per_line);

        switch (xoffset)
        {
        /* filter 1-D horizontally... */
        case 2:
        case 4:
        case 6:
            /* 6 tap filter */
            vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
                                                 21, xoffset, 16);
            break;

        case 0:
            /* only copy buffer */
            vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
            break;

        case 1:
        case 3:
        case 5:
        case 7:
            /* 4 tap filter; note dst_ptr/dst_pitch/yoffset are also passed —
             * presumably the 4-tap first pass handles direct output itself
             * in some configurations; verify against its definition */
            vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
            break;
        }

        /* filter vertically...  FData + 32 skips the first two 16-byte rows */
        vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
    }
    else
    {
        /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
        /* NOTE(review): no case 0 here, so xoffset == 0 && yoffset == 0
         * leaves dst untouched — presumably callers never request a sixtap
         * predict with both offsets zero; confirm against callers. */
        switch (xoffset)
        {
        case 2:
        case 4:
        case 6:
            /* 6 tap filter */
            vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
                                                 16, xoffset, dst_pitch);
            break;

        case 1:
        case 3:
        case 5:
        case 7:
            /* 4 tap filter */
            vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
            break;
        }
    }
}

#endif