/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>
#include "vp8_rtcd.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
#define CROP_WIDTH 256

/* Clamp lookup table, filled by dsputil_static_init(): 256 zeros, the
 * identity ramp 0..255, then 256 copies of 255.  Indexing through
 * cm = ff_cropTbl + CROP_WIDTH clamps a filter result to [0, 255]
 * without branches (used via the lbux instruction below). */
unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];

/* VP8 six-tap sub-pixel coefficients, one row per eighth-pel offset,
 * packed as unsigned byte pairs for the dpau/dpsu accumulate
 * instructions (consumed by the second-pass filters later in this
 * file). */
static const unsigned short sub_pel_filterss[8][3] = {
  { 0, 0, 0 },
  { 0, 0x0601, 0x7b0c },
  { 0x0201, 0x0b08, 0x6c24 },
  { 0, 0x0906, 0x5d32 },
  { 0x0303, 0x1010, 0x4d4d },
  { 0, 0x0609, 0x325d },
  { 0x0102, 0x080b, 0x246c },
  { 0, 0x0106, 0x0c7b },
};

/* Same coefficients packed as signed halfword pairs for dpa.w.ph. */
static const int sub_pel_filters_int[8][3] = {
  { 0, 0, 0 },
  { 0x0000fffa, 0x007b000c, 0xffff0000 },
  { 0x0002fff5, 0x006c0024, 0xfff80001 },
  { 0x0000fff7, 0x005d0032, 0xfffa0000 },
  { 0x0003fff0, 0x004d004d, 0xfff00003 },
  { 0x0000fffa, 0x0032005d, 0xfff70000 },
  { 0x0001fff8, 0x0024006c, 0xfff50002 },
  { 0x0000ffff, 0x000c007b, 0xfffa0000 },
};

/* sub_pel_filters_int with the two halfwords of each word swapped
 * (compare row by row), matching the operand order produced by the
 * preceu.ph.qbr/qbl unpacking used in the first-pass filters below.
 * Note: for rows 2, 4 and 6 (true 6-tap filters) entry [2] has a
 * nonzero upper halfword, i.e. value > 65536 -- the first-pass
 * functions use that to distinguish 6-tap from 4-tap offsets. */
static const int sub_pel_filters_inv[8][3] = {
  { 0, 0, 0 },
  { 0xfffa0000, 0x000c007b, 0x0000ffff },
  { 0xfff50002, 0x0024006c, 0x0001fff8 },
  { 0xfff70000, 0x0032005d, 0x0000fffa },
  { 0xfff00003, 0x004d004d, 0x0003fff0 },
  { 0xfffa0000, 0x005d0032, 0x0000fff7 },
  { 0xfff80001, 0x006c0024, 0x0002fff5 },
  { 0xffff0000, 0x007b000c, 0x0000fffa },
};

/* 4-tap variants: only the odd (4-tap) offsets have coefficients;
 * even offsets are handled by the 6-tap tables above. */
/* clang-format off */
static const int sub_pel_filters_int_tap_4[8][2] = {
  { 0, 0},
  { 0xfffa007b, 0x000cffff},
  { 0, 0},
  { 0xfff7005d, 0x0032fffa},
  { 0, 0},
  { 0xfffa0032, 0x005dfff7},
  { 0, 0},
  { 0xffff000c, 0x007bfffa},
};


static const int sub_pel_filters_inv_tap_4[8][2] = {
  { 0, 0},
  { 0x007bfffa, 0xffff000c},
  { 0, 0},
  { 0x005dfff7, 0xfffa0032},
  { 0, 0},
  { 0x0032fffa, 0xfff7005d},
  { 0, 0},
  { 0x000cffff, 0xfffa007b},
};
/* clang-format on */

/* Issue a load prefetch (pref hint 0) for the cache line at src. */
inline void prefetch_load(unsigned char *src) {
  __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
}

/* Issue a store prefetch (pref hint 1) for the cache line at dst. */
inline void prefetch_store(unsigned char *dst) {
  __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
}

/* Build the ff_cropTbl clamp table (see comment at its definition).
 * Must run once before any of the filter functions are used. */
void dsputil_static_init(void) {
  int i;

  for (i = 0; i < 256; ++i) ff_cropTbl[i + CROP_WIDTH] = i;

  for (i = 0; i < CROP_WIDTH; ++i) {
    ff_cropTbl[i] = 0;
    ff_cropTbl[i + CROP_WIDTH + 256] = 255;
  }
}

/* Horizontal first-pass sub-pixel filter, 4 pixels per row.
 * Applies the xoffset filter (6-tap reads src[-2..+5] around each
 * output pixel, 4-tap reads src[-1..+4]) to output_height rows and
 * writes 4 clamped bytes per row to dst_ptr, advancing dst_ptr by
 * `pitch` per row (by 4 in the plain-copy xoffset==0 case).
 * Results are rounded via the +64 accumulator seed and extracted
 * with extp ..., 9 (>> 7), then clamped through the cm table. */
void vp8_filter_block2d_first_pass_4(unsigned char *RESTRICT src_ptr,
                                     unsigned char *RESTRICT dst_ptr,
                                     unsigned int src_pixels_per_line,
                                     unsigned int output_height, int xoffset,
                                     int pitch) {
  unsigned int i;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a = 64; /* rounding constant seeded into the accumulator */
  int vector1b, vector2b, vector3b;
  unsigned int tp1, tp2, tn1, tn2;
  unsigned int p1, p2, p3;
  unsigned int n1, n2, n3;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp table, biased */

  vector3b = sub_pel_filters_inv[xoffset][2];

  /* if (xoffset == 0) we don't need any filtering */
  if (vector3b == 0) {
    for (i = 0; i < output_height; ++i) {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + src_pixels_per_line);
      dst_ptr[0] = src_ptr[0];
      dst_ptr[1] = src_ptr[1];
      dst_ptr[2] = src_ptr[2];
      dst_ptr[3] = src_ptr[3];

      /* next row... */
      src_ptr += src_pixels_per_line;
      dst_ptr += 4;
    }
  } else {
    /* entry [2] exceeds 16 bits only for the 6-tap offsets (see table) */
    if (vector3b > 65536) {
      /* 6 tap filter */

      vector1b = sub_pel_filters_inv[xoffset][0];
      vector2b = sub_pel_filters_inv[xoffset][1];

      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + src_pixels_per_line);

      for (i = output_height; i--;) {
        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw %[tp1], -2(%[src_ptr]) \n\t"
            "ulw %[tp2], 2(%[src_ptr]) \n\t"

            /* even 1. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p1], %[tp1] \n\t"
            "preceu.ph.qbl %[p2], %[tp1] \n\t"
            "preceu.ph.qbr %[p3], %[tp2] \n\t"
            "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"

            /* even 2. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[p1], %[tp2] \n\t"
            "balign %[tp2], %[tp1], 3 \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"

            /* odd 1. pixel */
            "ulw %[tn2], 3(%[src_ptr]) \n\t"
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n1], %[tp2] \n\t"
            "preceu.ph.qbl %[n2], %[tp2] \n\t"
            "preceu.ph.qbr %[n3], %[tn2] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"

            /* odd 2. pixel (original comment said "even 2." -- this pass
             * accumulates the n registers into Temp4) */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[n1], %[tn2] \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            /* clamp */
            "lbux %[tp1], %[Temp1](%[cm]) \n\t"
            "lbux %[tn1], %[Temp2](%[cm]) \n\t"
            "lbux %[tp2], %[Temp3](%[cm]) \n\t"
            "lbux %[n2], %[Temp4](%[cm]) \n\t"

            /* store bytes */
            "sb %[tp1], 0(%[dst_ptr]) \n\t"
            "sb %[tn1], 1(%[dst_ptr]) \n\t"
            "sb %[tp2], 2(%[dst_ptr]) \n\t"
            "sb %[n2], 3(%[dst_ptr]) \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
              [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
              [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=&r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
              [vector3b] "r"(vector3b), [src_ptr] "r"(src_ptr));

        /* Next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    } else {
      /* 4 tap filter */

      vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
      vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

      for (i = output_height; i--;) {
        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw %[tp1], -1(%[src_ptr]) \n\t"
            "ulw %[tp2], 3(%[src_ptr]) \n\t"

            /* even 1. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p1], %[tp1] \n\t"
            "preceu.ph.qbl %[p2], %[tp1] \n\t"
            "preceu.ph.qbr %[p3], %[tp2] \n\t"
            "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"

            /* even 2. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"

            /* odd 1. pixel */
            "srl %[tn1], %[tp2], 8 \n\t"
            "balign %[tp2], %[tp1], 3 \n\t"
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n1], %[tp2] \n\t"
            "preceu.ph.qbl %[n2], %[tp2] \n\t"
            "preceu.ph.qbr %[n3], %[tn1] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"

            /* odd 2. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            /* clamp and store results */
            "lbux %[tp1], %[Temp1](%[cm]) \n\t"
            "lbux %[tn1], %[Temp2](%[cm]) \n\t"
            "lbux %[tp2], %[Temp3](%[cm]) \n\t"
            "sb %[tp1], 0(%[dst_ptr]) \n\t"
            "sb %[tn1], 1(%[dst_ptr]) \n\t"
            "lbux %[n2], %[Temp4](%[cm]) \n\t"
            "sb %[tp2], 2(%[dst_ptr]) \n\t"
            "sb %[n2], 3(%[dst_ptr]) \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
              [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
              [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
              [src_ptr] "r"(src_ptr));
        /* Next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    }
  }
}

/* Horizontal first-pass sub-pixel filter, 8 pixels per row.
 * Same scheme as vp8_filter_block2d_first_pass_4 but produces 8
 * clamped bytes per row (two 4-pixel asm passes per row), advancing
 * dst_ptr by `pitch` per row (by 8 in the xoffset==0 copy case). */
void vp8_filter_block2d_first_pass_8_all(unsigned char *RESTRICT src_ptr,
                                         unsigned char *RESTRICT dst_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         int xoffset, int pitch) {
  unsigned int i;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a = 64; /* rounding constant */
  unsigned int vector1b, vector2b, vector3b;
  unsigned int tp1, tp2, tn1, tn2;
  unsigned int p1, p2, p3, p4;
  unsigned int n1, n2, n3, n4;

  unsigned char *cm = ff_cropTbl + CROP_WIDTH;

  /* if (xoffset == 0) we don't need any filtering */
  if (xoffset == 0) {
    for (i = 0; i < output_height; ++i) {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + src_pixels_per_line);

      dst_ptr[0] = src_ptr[0];
      dst_ptr[1] = src_ptr[1];
      dst_ptr[2] = src_ptr[2];
      dst_ptr[3] = src_ptr[3];
      dst_ptr[4] = src_ptr[4];
      dst_ptr[5] = src_ptr[5];
      dst_ptr[6] = src_ptr[6];
      dst_ptr[7] = src_ptr[7];

      /* next row... */
      src_ptr += src_pixels_per_line;
      dst_ptr += 8;
    }
  } else {
    vector3b = sub_pel_filters_inv[xoffset][2];

    /* value > 65536 only for the 6-tap offsets (see the table comment) */
    if (vector3b > 65536) {
      /* 6 tap filter */

      vector1b = sub_pel_filters_inv[xoffset][0];
      vector2b = sub_pel_filters_inv[xoffset][1];

      for (i = output_height; i--;) {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw %[tp1], -2(%[src_ptr]) \n\t"
            "ulw %[tp2], 2(%[src_ptr]) \n\t"

            /* even 1. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p1], %[tp1] \n\t"
            "preceu.ph.qbl %[p2], %[tp1] \n\t"
            "preceu.ph.qbr %[p3], %[tp2] \n\t"
            "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"

            /* even 2.
 pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[p1], %[tp2] \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"

            "balign %[tp2], %[tp1], 3 \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"
            "ulw %[tn2], 3(%[src_ptr]) \n\t"

            /* odd 1. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n1], %[tp2] \n\t"
            "preceu.ph.qbl %[n2], %[tp2] \n\t"
            "preceu.ph.qbr %[n3], %[tn2] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"

            /* odd 2. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[n1], %[tn2] \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
            /* start loading/unpacking data for the second 4-pixel pass
             * below; p2/n1/n3/... stay live across the asm boundary */
            "ulw %[tp1], 6(%[src_ptr]) \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p2], %[tp1] \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
              [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
              [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
              [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels */
        __asm__ __volatile__(
            /* even 3. pixel */
            "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"

            /* even 4. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[p4], %[tp1] \n\t"
            "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"

            "ulw %[tn1], 7(%[src_ptr]) \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"

            /* odd 3. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n2], %[tn1] \n\t"
            "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
            "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"

            /* odd 4. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbl %[n4], %[tn1] \n\t"
            "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
            "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            : [tn1] "=&r"(tn1), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [tp1] "r"(tp1), [vector1b] "r"(vector1b), [p2] "r"(p2),
              [vector2b] "r"(vector2b), [n1] "r"(n1), [p1] "r"(p1),
              [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3),
              [n3] "r"(n3), [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    } else {
      /* 4 tap filter */

      vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
      vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

      for (i = output_height; i--;) {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw %[tp1], -1(%[src_ptr]) \n\t"

            /* even 1. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[p1], %[tp1] \n\t"
            "preceu.ph.qbl %[p2], %[tp1] \n\t"
            "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"

            "ulw %[tp2], 3(%[src_ptr]) \n\t"

            /* even 2. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbr %[p3], %[tp2] \n\t"
            "preceu.ph.qbl %[p4], %[tp2] \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"

            "balign %[tp2], %[tp1], 3 \n\t"

            /* odd 1. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "preceu.ph.qbr %[n1], %[tp2] \n\t"
            "preceu.ph.qbl %[n2], %[tp2] \n\t"
            "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"

            "ulw %[tn2], 4(%[src_ptr]) \n\t"

            /* odd 2. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbr %[n3], %[tn2] \n\t"
            "preceu.ph.qbl %[n4], %[tn2] \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
            "ulw %[tp1], 7(%[src_ptr]) \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "mtlo %[vector4a], $ac3 \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
              [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), [n4] "=&r"(n4),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels */
        __asm__ __volatile__(
            /* even 3. pixel */
            "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"

            /* even 4. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbr %[p2], %[tp1] \n\t"
            "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
            "extp %[Temp1], $ac3, 9 \n\t"

            /* odd 3. pixel */
            "mtlo %[vector4a], $ac3 \n\t"
            "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
            "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
            "ulw %[tn1], 8(%[src_ptr]) \n\t"
            "extp %[Temp3], $ac2, 9 \n\t"

            /* odd 4. pixel */
            "mtlo %[vector4a], $ac2 \n\t"
            "preceu.ph.qbr %[n2], %[tn1] \n\t"
            "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
            "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
            "extp %[Temp2], $ac3, 9 \n\t"
            "extp %[Temp4], $ac2, 9 \n\t"

            : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [tp1] "r"(tp1), [p3] "r"(p3), [p4] "r"(p4),
              [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr), [n3] "r"(n3),
              [n4] "r"(n4));

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        /* next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    }
  }
}

/* Horizontal first-pass 6-tap filter, 16 pixels per row.
 * Unlike the 4/8 wide variants there is no xoffset==0 or 4-tap path
 * here; the caller selects this function only for 6-tap offsets.
 * Each row is produced by four 4-pixel asm passes chained through
 * registers kept live between the asm statements. */
void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
                                          unsigned char *RESTRICT dst_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned int output_height,
                                          int xoffset, int pitch) {
  unsigned int i;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a;
  unsigned int vector1b, vector2b, vector3b;
  unsigned int tp1, tp2, tn1, tn2;
  unsigned int p1, p2, p3, p4;
  unsigned int n1, n2, n3, n4;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH;

  vector1b = sub_pel_filters_inv[xoffset][0];
  vector2b = sub_pel_filters_inv[xoffset][1];
  vector3b = sub_pel_filters_inv[xoffset][2];
  vector4a = 64; /* rounding constant */

  for (i = output_height; i--;) {
    /* prefetch src_ptr data to cache memory */
    prefetch_load(src_ptr + src_pixels_per_line);

    /* apply filter with vectors pairs */
    __asm__ __volatile__(
        "ulw %[tp1], -2(%[src_ptr]) \n\t"
        "ulw %[tp2], 2(%[src_ptr]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"

        /* even 2. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "preceu.ph.qbl %[p1], %[tp2] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"

        "balign %[tp2], %[tp1], 3 \n\t"
        "ulw %[tn2], 3(%[src_ptr]) \n\t"
        "extp %[Temp1], $ac3, 9 \n\t"

        /* odd 1.
 pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "preceu.ph.qbr %[n1], %[tp2] \n\t"
        "preceu.ph.qbl %[n2], %[tp2] \n\t"
        "preceu.ph.qbr %[n3], %[tn2] \n\t"
        "extp %[Temp3], $ac2, 9 \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"

        /* odd 2. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "preceu.ph.qbl %[n1], %[tn2] \n\t"
        "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        /* pre-load data for the next 4-pixel pass; p/n registers stay
         * live across the asm statement boundary */
        "ulw %[tp1], 6(%[src_ptr]) \n\t"
        "extp %[Temp2], $ac3, 9 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "preceu.ph.qbr %[p2], %[tp1] \n\t"
        "extp %[Temp4], $ac2, 9 \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2), [p1] "=&r"(p1),
          [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), [n2] "=&r"(n2),
          [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
          [src_ptr] "r"(src_ptr));

    /* clamp and store results */
    dst_ptr[0] = cm[Temp1];
    dst_ptr[1] = cm[Temp2];
    dst_ptr[2] = cm[Temp3];
    dst_ptr[3] = cm[Temp4];

    /* next 4 pixels */
    __asm__ __volatile__(
        /* even 3. pixel */
        "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"

        /* even 4. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "preceu.ph.qbl %[p4], %[tp1] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "ulw %[tn1], 7(%[src_ptr]) \n\t"
        "extp %[Temp1], $ac3, 9 \n\t"

        /* odd 3. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "preceu.ph.qbr %[n2], %[tn1] \n\t"
        "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
        "extp %[Temp3], $ac2, 9 \n\t"

        /* odd 4. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "preceu.ph.qbl %[n4], %[tn1] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
        "ulw %[tp2], 10(%[src_ptr]) \n\t"
        "extp %[Temp2], $ac3, 9 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "extp %[Temp4], $ac2, 9 \n\t"

        : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
          [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [p2] "r"(p2),
          [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
          [src_ptr] "r"(src_ptr));

    /* clamp and store results */
    dst_ptr[4] = cm[Temp1];
    dst_ptr[5] = cm[Temp2];
    dst_ptr[6] = cm[Temp3];
    dst_ptr[7] = cm[Temp4];

    /* next 4 pixels */
    __asm__ __volatile__(
        /* even 5. pixel */
        "dpa.w.ph $ac3, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"

        /* even 6. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "preceu.ph.qbl %[p3], %[tp2] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector3b] \n\t"

        "ulw %[tn1], 11(%[src_ptr]) \n\t"
        "extp %[Temp1], $ac3, 9 \n\t"

        /* odd 5. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "preceu.ph.qbr %[n1], %[tn1] \n\t"
        "dpa.w.ph $ac3, %[n2], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
        "extp %[Temp3], $ac2, 9 \n\t"

        /* odd 6. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "preceu.ph.qbl %[n3], %[tn1] \n\t"
        "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n3], %[vector3b] \n\t"
        "ulw %[tp1], 14(%[src_ptr]) \n\t"
        "extp %[Temp2], $ac3, 9 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "preceu.ph.qbr %[p4], %[tp1] \n\t"
        "extp %[Temp4], $ac2, 9 \n\t"

        : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3),
          [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2),
          [p2] "r"(p2), [n2] "r"(n2), [p4] "r"(p4), [n4] "r"(n4), [p1] "r"(p1),
          [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a),
          [vector3b] "r"(vector3b));

    /* clamp and store results */
    dst_ptr[8] = cm[Temp1];
    dst_ptr[9] = cm[Temp2];
    dst_ptr[10] = cm[Temp3];
    dst_ptr[11] = cm[Temp4];

    /* next 4 pixels */
    __asm__ __volatile__(
        /* even 7. pixel */
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector3b] \n\t"

        /* even 8. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector3b] \n\t"
        "ulw %[tn1], 15(%[src_ptr]) \n\t"
        "extp %[Temp1], $ac3, 9 \n\t"

        /* odd 7. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "preceu.ph.qbr %[n4], %[tn1] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[n3], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[n4], %[vector3b] \n\t"
        "extp %[Temp3], $ac2, 9 \n\t"

        /* odd 8. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "preceu.ph.qbl %[n2], %[tn1] \n\t"
        "dpa.w.ph $ac2, %[n3], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[n4], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n2], %[vector3b] \n\t"
        "extp %[Temp2], $ac3, 9 \n\t"
        "extp %[Temp4], $ac2, 9 \n\t"

        /* clamp and store results */
        "lbux %[tp1], %[Temp1](%[cm]) \n\t"
        "lbux %[tn1], %[Temp2](%[cm]) \n\t"
        "lbux %[p2], %[Temp3](%[cm]) \n\t"
        "sb %[tp1], 12(%[dst_ptr]) \n\t"
        "sb %[tn1], 13(%[dst_ptr]) \n\t"
        "lbux %[n2], %[Temp4](%[cm]) \n\t"
        "sb %[p2], 14(%[dst_ptr]) \n\t"
        "sb %[n2], 15(%[dst_ptr]) \n\t"

        : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
          [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
          [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
          [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));

    src_ptr += src_pixels_per_line;
    dst_ptr += pitch;
  }
}

/* First pass with no horizontal filtering (xoffset == 0) for the
 * 16-wide path: straight copy of 21 rows (7 iterations x 3 rows) of
 * 16 bytes each into a tightly packed 16-byte-pitch buffer, using
 * unaligned word loads/stores. */
void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
                                       unsigned char *RESTRICT output_ptr,
                                       unsigned int src_pixels_per_line) {
  int Temp1, Temp2, Temp3, Temp4;
  int i;

  /* prefetch the destination to cache memory (store hint); original
   * comment said "src_ptr" but the operand is output_ptr */
  prefetch_store(output_ptr + 32);

  /* copy memory from src buffer to dst buffer */
  for (i = 0; i < 7; ++i) {
    __asm__ __volatile__(
        "ulw %[Temp1], 0(%[src_ptr]) \n\t"
        "ulw %[Temp2], 4(%[src_ptr]) \n\t"
        "ulw %[Temp3], 8(%[src_ptr]) \n\t"
        "ulw %[Temp4], 12(%[src_ptr]) \n\t"
        "sw %[Temp1], 0(%[output_ptr]) \n\t"
        "sw %[Temp2], 4(%[output_ptr]) \n\t"
        "sw %[Temp3], 8(%[output_ptr]) \n\t"
        "sw %[Temp4], 12(%[output_ptr]) \n\t"
        "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr));

    __asm__ __volatile__(
        "ulw %[Temp1], 0(%[src_ptr]) \n\t"
        "ulw %[Temp2], 4(%[src_ptr]) \n\t"
        "ulw %[Temp3], 8(%[src_ptr]) \n\t"
        "ulw %[Temp4], 12(%[src_ptr]) \n\t"
        "sw %[Temp1], 16(%[output_ptr]) \n\t"
        "sw %[Temp2], 20(%[output_ptr]) \n\t"
        "sw %[Temp3], 24(%[output_ptr]) \n\t"
        "sw %[Temp4], 28(%[output_ptr]) \n\t"
        "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr));

    __asm__ __volatile__(
        "ulw %[Temp1], 0(%[src_ptr]) \n\t"
        "ulw %[Temp2], 4(%[src_ptr]) \n\t"
        "ulw %[Temp3], 8(%[src_ptr]) \n\t"
        "ulw %[Temp4], 12(%[src_ptr]) \n\t"
        "sw %[Temp1], 32(%[output_ptr]) \n\t"
        "sw %[Temp2], 36(%[output_ptr]) \n\t"
        "sw %[Temp3], 40(%[output_ptr]) \n\t"
        "sw %[Temp4], 44(%[output_ptr]) \n\t"
        "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr));

    /* 3 rows of 16 bytes copied per outer iteration */
    output_ptr += 48;
  }
}

/* Horizontal first-pass 4-tap filter, 16 pixels per row.
 * If yoffset == 0 the vertical pass is skipped, so results go
 * straight to dst_ptr (pitch `pitch`); otherwise they go to the
 * intermediate buffer output_ptr (pitch `output_width`).
 * Uses extr.w ..., 7 (round to Q0) instead of extp. */
void vp8_filter_block2d_first_pass16_4tap(
    unsigned char *RESTRICT src_ptr, unsigned char *RESTRICT output_ptr,
    unsigned int src_pixels_per_line, unsigned int output_width,
    unsigned int output_height, int xoffset, int yoffset,
    unsigned char
*RESTRICT dst_ptr, int pitch) { 863 unsigned int i, j; 864 int Temp1, Temp2, Temp3, Temp4; 865 866 unsigned int vector4a; 867 int vector1b, vector2b; 868 unsigned int tp1, tp2, tp3, tn1; 869 unsigned int p1, p2, p3; 870 unsigned int n1, n2, n3; 871 unsigned char *cm = ff_cropTbl + CROP_WIDTH; 872 873 vector4a = 64; 874 875 vector1b = sub_pel_filters_inv_tap_4[xoffset][0]; 876 vector2b = sub_pel_filters_inv_tap_4[xoffset][1]; 877 878 /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */ 879 if (yoffset == 0) { 880 output_height -= 5; 881 src_ptr += (src_pixels_per_line + src_pixels_per_line); 882 883 for (i = output_height; i--;) { 884 __asm__ __volatile__("ulw %[tp3], -1(%[src_ptr]) \n\t" 885 : [tp3] "=&r"(tp3) 886 : [src_ptr] "r"(src_ptr)); 887 888 /* processing 4 adjacent pixels */ 889 for (j = 0; j < 16; j += 4) { 890 /* apply filter with vectors pairs */ 891 __asm__ __volatile__( 892 "ulw %[tp2], 3(%[src_ptr]) " 893 "\n\t" 894 "move %[tp1], %[tp3] " 895 "\n\t" 896 897 /* even 1. pixel */ 898 "mtlo %[vector4a], $ac3 " 899 "\n\t" 900 "mthi $0, $ac3 " 901 "\n\t" 902 "move %[tp3], %[tp2] " 903 "\n\t" 904 "preceu.ph.qbr %[p1], %[tp1] " 905 "\n\t" 906 "preceu.ph.qbl %[p2], %[tp1] " 907 "\n\t" 908 "preceu.ph.qbr %[p3], %[tp2] " 909 "\n\t" 910 "dpa.w.ph $ac3, %[p1], %[vector1b] " 911 "\n\t" 912 "dpa.w.ph $ac3, %[p2], %[vector2b] " 913 "\n\t" 914 915 /* even 2. pixel */ 916 "mtlo %[vector4a], $ac2 " 917 "\n\t" 918 "mthi $0, $ac2 " 919 "\n\t" 920 "dpa.w.ph $ac2, %[p2], %[vector1b] " 921 "\n\t" 922 "dpa.w.ph $ac2, %[p3], %[vector2b] " 923 "\n\t" 924 "extr.w %[Temp1], $ac3, 7 " 925 "\n\t" 926 927 /* odd 1. 
pixel */ 928 "ulw %[tn1], 4(%[src_ptr]) " 929 "\n\t" 930 "balign %[tp2], %[tp1], 3 " 931 "\n\t" 932 "mtlo %[vector4a], $ac3 " 933 "\n\t" 934 "mthi $0, $ac3 " 935 "\n\t" 936 "preceu.ph.qbr %[n1], %[tp2] " 937 "\n\t" 938 "preceu.ph.qbl %[n2], %[tp2] " 939 "\n\t" 940 "preceu.ph.qbr %[n3], %[tn1] " 941 "\n\t" 942 "extr.w %[Temp3], $ac2, 7 " 943 "\n\t" 944 "dpa.w.ph $ac3, %[n1], %[vector1b] " 945 "\n\t" 946 "dpa.w.ph $ac3, %[n2], %[vector2b] " 947 "\n\t" 948 949 /* odd 2. pixel */ 950 "mtlo %[vector4a], $ac2 " 951 "\n\t" 952 "mthi $0, $ac2 " 953 "\n\t" 954 "extr.w %[Temp2], $ac3, 7 " 955 "\n\t" 956 "dpa.w.ph $ac2, %[n2], %[vector1b] " 957 "\n\t" 958 "dpa.w.ph $ac2, %[n3], %[vector2b] " 959 "\n\t" 960 "extr.w %[Temp4], $ac2, 7 " 961 "\n\t" 962 963 /* clamp and store results */ 964 "lbux %[tp1], %[Temp1](%[cm]) " 965 "\n\t" 966 "lbux %[tn1], %[Temp2](%[cm]) " 967 "\n\t" 968 "lbux %[tp2], %[Temp3](%[cm]) " 969 "\n\t" 970 "sb %[tp1], 0(%[dst_ptr]) " 971 "\n\t" 972 "sb %[tn1], 1(%[dst_ptr]) " 973 "\n\t" 974 "lbux %[n2], %[Temp4](%[cm]) " 975 "\n\t" 976 "sb %[tp2], 2(%[dst_ptr]) " 977 "\n\t" 978 "sb %[n2], 3(%[dst_ptr]) " 979 "\n\t" 980 981 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), 982 [tn1] "=&r"(tn1), [p1] "=&r"(p1), [p2] "=&r"(p2), [n1] "=&r"(n1), 983 [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), 984 [Temp2] "=&r"(Temp2), [p3] "=&r"(p3), [Temp3] "=&r"(Temp3), 985 [Temp4] "=&r"(Temp4) 986 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), 987 [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr), 988 [src_ptr] "r"(src_ptr)); 989 990 src_ptr += 4; 991 } 992 993 /* Next row... 
*/ 994 src_ptr += src_pixels_per_line - 16; 995 dst_ptr += pitch; 996 } 997 } else { 998 for (i = output_height; i--;) { 999 /* processing 4 adjacent pixels */ 1000 for (j = 0; j < 16; j += 4) { 1001 /* apply filter with vectors pairs */ 1002 __asm__ __volatile__( 1003 "ulw %[tp1], -1(%[src_ptr]) " 1004 "\n\t" 1005 "ulw %[tp2], 3(%[src_ptr]) " 1006 "\n\t" 1007 1008 /* even 1. pixel */ 1009 "mtlo %[vector4a], $ac3 " 1010 "\n\t" 1011 "mthi $0, $ac3 " 1012 "\n\t" 1013 "preceu.ph.qbr %[p1], %[tp1] " 1014 "\n\t" 1015 "preceu.ph.qbl %[p2], %[tp1] " 1016 "\n\t" 1017 "preceu.ph.qbr %[p3], %[tp2] " 1018 "\n\t" 1019 "dpa.w.ph $ac3, %[p1], %[vector1b] " 1020 "\n\t" 1021 "dpa.w.ph $ac3, %[p2], %[vector2b] " 1022 "\n\t" 1023 1024 /* even 2. pixel */ 1025 "mtlo %[vector4a], $ac2 " 1026 "\n\t" 1027 "mthi $0, $ac2 " 1028 "\n\t" 1029 "dpa.w.ph $ac2, %[p2], %[vector1b] " 1030 "\n\t" 1031 "dpa.w.ph $ac2, %[p3], %[vector2b] " 1032 "\n\t" 1033 "extr.w %[Temp1], $ac3, 7 " 1034 "\n\t" 1035 1036 /* odd 1. pixel */ 1037 "ulw %[tn1], 4(%[src_ptr]) " 1038 "\n\t" 1039 "balign %[tp2], %[tp1], 3 " 1040 "\n\t" 1041 "mtlo %[vector4a], $ac3 " 1042 "\n\t" 1043 "mthi $0, $ac3 " 1044 "\n\t" 1045 "preceu.ph.qbr %[n1], %[tp2] " 1046 "\n\t" 1047 "preceu.ph.qbl %[n2], %[tp2] " 1048 "\n\t" 1049 "preceu.ph.qbr %[n3], %[tn1] " 1050 "\n\t" 1051 "extr.w %[Temp3], $ac2, 7 " 1052 "\n\t" 1053 "dpa.w.ph $ac3, %[n1], %[vector1b] " 1054 "\n\t" 1055 "dpa.w.ph $ac3, %[n2], %[vector2b] " 1056 "\n\t" 1057 1058 /* odd 2. 
pixel */ 1059 "mtlo %[vector4a], $ac2 " 1060 "\n\t" 1061 "mthi $0, $ac2 " 1062 "\n\t" 1063 "extr.w %[Temp2], $ac3, 7 " 1064 "\n\t" 1065 "dpa.w.ph $ac2, %[n2], %[vector1b] " 1066 "\n\t" 1067 "dpa.w.ph $ac2, %[n3], %[vector2b] " 1068 "\n\t" 1069 "extr.w %[Temp4], $ac2, 7 " 1070 "\n\t" 1071 1072 /* clamp and store results */ 1073 "lbux %[tp1], %[Temp1](%[cm]) " 1074 "\n\t" 1075 "lbux %[tn1], %[Temp2](%[cm]) " 1076 "\n\t" 1077 "lbux %[tp2], %[Temp3](%[cm]) " 1078 "\n\t" 1079 "sb %[tp1], 0(%[output_ptr]) " 1080 "\n\t" 1081 "sb %[tn1], 1(%[output_ptr]) " 1082 "\n\t" 1083 "lbux %[n2], %[Temp4](%[cm]) " 1084 "\n\t" 1085 "sb %[tp2], 2(%[output_ptr]) " 1086 "\n\t" 1087 "sb %[n2], 3(%[output_ptr]) " 1088 "\n\t" 1089 1090 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), 1091 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), 1092 [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), 1093 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) 1094 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), 1095 [vector4a] "r"(vector4a), [cm] "r"(cm), 1096 [output_ptr] "r"(output_ptr), [src_ptr] "r"(src_ptr)); 1097 1098 src_ptr += 4; 1099 } 1100 1101 /* next row... 
*/
      src_ptr += src_pixels_per_line;
      output_ptr += output_width;
    }
  }
}

/* Second (vertical) pass of the VP8 sub-pixel interpolation for a
 * 4-pixel-wide block, MIPS DSPr2 version.
 *
 * src_ptr points into the first-pass intermediate buffer; the byte offsets
 * used below all step in multiples of 4 vertically, so the intermediate
 * row stride is presumably 4 -- TODO(review): confirm against the caller.
 * yoffset selects the packed two-taps-per-halfword coefficients from
 * sub_pel_filterss[].  Each output pixel is a 6-tap (or 4-tap, when the
 * outer coefficient pair is zero) dot product accumulated in a DSP
 * accumulator ($ac0..$ac3), biased by 64 for rounding, extracted with
 * 'extp ..., 9' (i.e. >> 7 from the accumulator position used here),
 * clamped through the cm[] crop table and stored.  Two output rows are
 * produced per loop iteration, 2 iterations total (a 4x4 block),
 * output_pitch bytes between rows.
 */
void vp8_filter_block2d_second_pass4(unsigned char *RESTRICT src_ptr,
                                     unsigned char *RESTRICT output_ptr,
                                     int output_pitch, int yoffset) {
  unsigned int i;

  int Temp1, Temp2, Temp3, Temp4;
  unsigned int vector1b, vector2b, vector3b, vector4a;

  /* scratch bytes: one source sample per tap position, packed pairwise
   * with 'append' before the dual-halfword multiply-accumulates */
  unsigned char src_ptr_l2;
  unsigned char src_ptr_l1;
  unsigned char src_ptr_0;
  unsigned char src_ptr_r1;
  unsigned char src_ptr_r2;
  unsigned char src_ptr_r3;

  unsigned char *cm = ff_cropTbl + CROP_WIDTH;

  vector4a = 64; /* rounding bias */

  /* load filter coefficients */
  vector1b = sub_pel_filterss[yoffset][0];
  vector2b = sub_pel_filterss[yoffset][2];
  vector3b = sub_pel_filterss[yoffset][1];

  if (vector1b) {
    /* 6 tap filter */

    for (i = 2; i--;) {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr);

      /* do not allow compiler to reorder instructions */
      __asm__ __volatile__(
          ".set noreorder \n\t"
          :
          :);

      /* apply filter with vectors pairs */
      __asm__ __volatile__(
          /* column 0: six taps, 4 bytes apart vertically */
          "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          /* column 1 on $ac3; extract finished column 0 from $ac2 */
          "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp1], $ac2, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

          /* column 2 on $ac0 */
          "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 14(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp2], $ac3, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          /* column 3 on $ac1 */
          "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp3], $ac0, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp4], $ac1, 9 \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
            [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
            [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
            [src_ptr_r3] "=&r"(src_ptr_r3)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
            [src_ptr] "r"(src_ptr));

      /* clamp and store results */
      output_ptr[0] = cm[Temp1];
      output_ptr[1] = cm[Temp2];
      output_ptr[2] = cm[Temp3];
      output_ptr[3] = cm[Temp4];

      output_ptr += output_pitch;

      /* apply filter with vectors pairs */
      /* second output row of this iteration: tap window shifted one
       * intermediate row (+4 bytes) down */
      __asm__ __volatile__(
          "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp1], $ac2, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp2], $ac3, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp3], $ac0, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp4], $ac1, 9 \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
            [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
            [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
            [src_ptr_r3] "=&r"(src_ptr_r3)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
            [src_ptr] "r"(src_ptr));

      /* clamp and store results */
      output_ptr[0] = cm[Temp1];
      output_ptr[1] = cm[Temp2];
      output_ptr[2] = cm[Temp3];
      output_ptr[3] = cm[Temp4];

      src_ptr += 8;
      output_ptr += output_pitch;
    }
  } else {
    /* 4 tap filter */

    /* prefetch src_ptr data to cache memory */
    prefetch_load(src_ptr);

    for (i = 2; i--;) {
      /* do not allow compiler to reorder instructions */
      __asm__ __volatile__(
          ".set noreorder \n\t"
          :
          :);

      /* apply filter with vectors pairs */
      /* 4-tap path: the outer coefficient pair (vector1b) is zero, so only
       * the middle (vector2b, added) and inner (vector3b, subtracted)
       * tap pairs contribute */
      __asm__ __volatile__(
          "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp1], $ac2, 9 \n\t"

          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp2], $ac3, 9 \n\t"

          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp3], $ac0, 9 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp4], $ac1, 9 \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
            [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
            [src_ptr_r2] "=&r"(src_ptr_r2)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));

      /* clamp and store results */
      output_ptr[0] = cm[Temp1];
      output_ptr[1] = cm[Temp2];
      output_ptr[2] = cm[Temp3];
      output_ptr[3] = cm[Temp4];

      output_ptr += output_pitch;

      /* apply filter with vectors pairs */
      /* second output row: window shifted one intermediate row down */
      __asm__ __volatile__(
          "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp1], $ac2, 9 \n\t"

          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp2], $ac3, 9 \n\t"

          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp3], $ac0, 9 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp4], $ac1, 9 \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
            [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
            [src_ptr_r2] "=&r"(src_ptr_r2)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));

      /* clamp and store results */
      output_ptr[0] = cm[Temp1];
      output_ptr[1] = cm[Temp2];
      output_ptr[2] = cm[Temp3];
      output_ptr[3] = cm[Temp4];

      src_ptr += 8;
      output_ptr += output_pitch;
    }
  }
}

/* Second (vertical) pass of the VP8 sub-pixel interpolation for an
 * 8-pixel-wide block, MIPS DSPr2 version.
 *
 * Same scheme as vp8_filter_block2d_second_pass4, but the byte offsets
 * step in multiples of 8 vertically, so the intermediate row stride is
 * presumably 8 -- TODO(review): confirm against the caller.  One output
 * row of 8 pixels is produced per iteration, output_height rows total.
 * output_width is unused in this implementation.
 */
void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr,
                                      unsigned char *RESTRICT output_ptr,
                                      int output_pitch,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      unsigned int yoffset) {
  unsigned int i;

  int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
  unsigned int vector1b, vector2b, vector3b, vector4a;

  /* scratch bytes: one source sample per tap position, packed pairwise
   * with 'append' before the dual-halfword multiply-accumulates */
  unsigned char src_ptr_l2;
  unsigned char src_ptr_l1;
  unsigned char src_ptr_0;
  unsigned char src_ptr_r1;
  unsigned char src_ptr_r2;
  unsigned char src_ptr_r3;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH;
  (void)output_width;

  vector4a = 64; /* rounding bias */

  /* load filter coefficients (packed two taps per halfword) */
  vector1b = sub_pel_filterss[yoffset][0];
  vector2b = sub_pel_filterss[yoffset][2];
  vector3b = sub_pel_filterss[yoffset][1];

  if (vector1b) {
    /* 6 tap filter */

    /* prefetch src_ptr data to cache memory */
    prefetch_load(src_ptr);

    for (i = output_height; i--;) {
      /* apply filter with vectors pairs */
      /* columns 0..3 of the row, one DSP accumulator per column */
      __asm__ __volatile__(
          "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"

          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp1], $ac2, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp2], $ac3, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp3], $ac0, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
            [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
            [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
            [src_ptr] "r"(src_ptr));

      /* apply filter with vectors pairs */
      /* columns 4..7; $ac1 still holds column 3 from the previous asm,
       * extracted here as Temp4 before $ac1 is reused */
      __asm__ __volatile__(
          "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp4], $ac1, 9 \n\t"

          "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp5], $ac2, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp6], $ac3, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp7], $ac0, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp8], $ac1, 9 \n\t"

          : [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
            [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
            [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
            [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
            [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
            [src_ptr] "r"(src_ptr));

      /* clamp and store results */
      output_ptr[0] = cm[Temp1];
      output_ptr[1] = cm[Temp2];
      output_ptr[2] = cm[Temp3];
      output_ptr[3] = cm[Temp4];
      output_ptr[4] = cm[Temp5];
      output_ptr[5] = cm[Temp6];
      output_ptr[6] = cm[Temp7];
      output_ptr[7] = cm[Temp8];

      src_ptr += 8;
      output_ptr += output_pitch;
    }
  } else {
    /* 4 tap filter */

    /* prefetch src_ptr data to cache memory */
    prefetch_load(src_ptr);

    for (i = output_height; i--;) {
      /* column 0: start the accumulation on $ac2 */
      __asm__ __volatile__(
          "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          : [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
            [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));

      /* column 1 on $ac3; extract column 0 from $ac2 */
      __asm__ __volatile__(
          "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp1], $ac2, 9 \n\t"

          : [Temp1] "=r"(Temp1), [src_ptr_l1] "=&r"(src_ptr_l1),
            [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
            [src_ptr_r2] "=&r"(src_ptr_r2)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));

      /* remaining columns alternate C byte loads with short asm blocks;
       * each block starts the next accumulator and extracts the previous */
      src_ptr_l1 = src_ptr[-6];
      src_ptr_0 = src_ptr[2];
      src_ptr_r1 = src_ptr[10];
      src_ptr_r2 = src_ptr[18];

      __asm__ __volatile__(
          "mtlo %[vector4a], $ac0 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp2], $ac3, 9 \n\t"

          : [Temp2] "=r"(Temp2)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
            [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
            [vector4a] "r"(vector4a));

      src_ptr_l1 = src_ptr[-5];
      src_ptr_0 = src_ptr[3];
      src_ptr_r1 = src_ptr[11];
      src_ptr_r2 = src_ptr[19];

      __asm__ __volatile__(
          "mtlo %[vector4a], $ac1 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp3], $ac0, 9 \n\t"

          : [Temp3] "=r"(Temp3)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
            [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
            [vector4a] "r"(vector4a));

      src_ptr_l1 = src_ptr[-4];
      src_ptr_0 = src_ptr[4];
      src_ptr_r1 = src_ptr[12];
      src_ptr_r2 = src_ptr[20];

      __asm__ __volatile__(
          "mtlo %[vector4a], $ac2 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp4], $ac1, 9 \n\t"

          : [Temp4] "=r"(Temp4)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
            [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
            [vector4a] "r"(vector4a));

      src_ptr_l1 = src_ptr[-3];
      src_ptr_0 = src_ptr[5];
      src_ptr_r1 = src_ptr[13];
      src_ptr_r2 = src_ptr[21];

      __asm__ __volatile__(
          "mtlo %[vector4a], $ac3 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp5], $ac2, 9 \n\t"

          : [Temp5] "=&r"(Temp5)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
            [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
            [vector4a] "r"(vector4a));

      src_ptr_l1 = src_ptr[-2];
      src_ptr_0 = src_ptr[6];
      src_ptr_r1 = src_ptr[14];
      src_ptr_r2 = src_ptr[22];

      __asm__ __volatile__(
          "mtlo %[vector4a], $ac0 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp6], $ac3, 9 \n\t"

          : [Temp6] "=r"(Temp6)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
            [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
            [vector4a] "r"(vector4a));

      src_ptr_l1 = src_ptr[-1];
      src_ptr_0 = src_ptr[7];
      src_ptr_r1 = src_ptr[15];
      src_ptr_r2 = src_ptr[23];

      __asm__ __volatile__(
          "mtlo %[vector4a], $ac1 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp7], $ac0, 9 \n\t"
          "extp %[Temp8], $ac1, 9 \n\t"

          : [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8)
          : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
            [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
            [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
            [vector4a] "r"(vector4a));

      /* clamp and store results */
      output_ptr[0] = cm[Temp1];
      output_ptr[1] = cm[Temp2];
      output_ptr[2] = cm[Temp3];
      output_ptr[3] = cm[Temp4];
      output_ptr[4] = cm[Temp5];
      output_ptr[5] = cm[Temp6];
      output_ptr[6] = cm[Temp7];
      output_ptr[7] = cm[Temp8];

      src_ptr += 8;
      output_ptr += output_pitch;
    }
  }
}

void vp8_filter_block2d_second_pass161(unsigned char *RESTRICT src_ptr,
                                       unsigned char *RESTRICT output_ptr,
                                       int output_pitch,
                                       const unsigned short *vp8_filter) {
  unsigned int i, j;

  int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
  unsigned int vector4a;
  unsigned int vector1b, vector2b, vector3b;

  unsigned char src_ptr_l2;
  unsigned char src_ptr_l1;
  unsigned char src_ptr_0;
  unsigned char src_ptr_r1;
  unsigned char src_ptr_r2;
  unsigned char src_ptr_r3;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH;

vector4a = 64; 1840 1841 vector1b = vp8_filter[0]; 1842 vector2b = vp8_filter[2]; 1843 vector3b = vp8_filter[1]; 1844 1845 if (vector1b == 0) { 1846 /* 4 tap filter */ 1847 1848 /* prefetch src_ptr data to cache memory */ 1849 prefetch_load(src_ptr + 16); 1850 1851 for (i = 16; i--;) { 1852 /* unrolling for loop */ 1853 for (j = 0; j < 16; j += 8) { 1854 /* apply filter with vectors pairs */ 1855 __asm__ __volatile__( 1856 "lbu %[src_ptr_l1], -16(%[src_ptr]) " 1857 "\n\t" 1858 "lbu %[src_ptr_0], 0(%[src_ptr]) " 1859 "\n\t" 1860 "lbu %[src_ptr_r1], 16(%[src_ptr]) " 1861 "\n\t" 1862 "lbu %[src_ptr_r2], 32(%[src_ptr]) " 1863 "\n\t" 1864 "mtlo %[vector4a], $ac2 " 1865 "\n\t" 1866 "append %[src_ptr_0], %[src_ptr_r1], 8 " 1867 "\n\t" 1868 "append %[src_ptr_l1], %[src_ptr_r2], 8 " 1869 "\n\t" 1870 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] " 1871 "\n\t" 1872 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] " 1873 "\n\t" 1874 1875 "lbu %[src_ptr_l1], -15(%[src_ptr]) " 1876 "\n\t" 1877 "lbu %[src_ptr_0], 1(%[src_ptr]) " 1878 "\n\t" 1879 "lbu %[src_ptr_r1], 17(%[src_ptr]) " 1880 "\n\t" 1881 "lbu %[src_ptr_r2], 33(%[src_ptr]) " 1882 "\n\t" 1883 "mtlo %[vector4a], $ac3 " 1884 "\n\t" 1885 "extp %[Temp1], $ac2, 9 " 1886 "\n\t" 1887 1888 "append %[src_ptr_0], %[src_ptr_r1], 8 " 1889 "\n\t" 1890 "append %[src_ptr_l1], %[src_ptr_r2], 8 " 1891 "\n\t" 1892 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] " 1893 "\n\t" 1894 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] " 1895 "\n\t" 1896 1897 "lbu %[src_ptr_l1], -14(%[src_ptr]) " 1898 "\n\t" 1899 "lbu %[src_ptr_0], 2(%[src_ptr]) " 1900 "\n\t" 1901 "lbu %[src_ptr_r1], 18(%[src_ptr]) " 1902 "\n\t" 1903 "lbu %[src_ptr_r2], 34(%[src_ptr]) " 1904 "\n\t" 1905 "mtlo %[vector4a], $ac1 " 1906 "\n\t" 1907 "extp %[Temp2], $ac3, 9 " 1908 "\n\t" 1909 1910 "append %[src_ptr_0], %[src_ptr_r1], 8 " 1911 "\n\t" 1912 "append %[src_ptr_l1], %[src_ptr_r2], 8 " 1913 "\n\t" 1914 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] " 1915 "\n\t" 1916 "dpsu.h.qbr $ac1, 
%[src_ptr_l1], %[vector3b] " 1917 "\n\t" 1918 1919 "lbu %[src_ptr_l1], -13(%[src_ptr]) " 1920 "\n\t" 1921 "lbu %[src_ptr_0], 3(%[src_ptr]) " 1922 "\n\t" 1923 "lbu %[src_ptr_r1], 19(%[src_ptr]) " 1924 "\n\t" 1925 "lbu %[src_ptr_r2], 35(%[src_ptr]) " 1926 "\n\t" 1927 "mtlo %[vector4a], $ac3 " 1928 "\n\t" 1929 "extp %[Temp3], $ac1, 9 " 1930 "\n\t" 1931 1932 "append %[src_ptr_0], %[src_ptr_r1], 8 " 1933 "\n\t" 1934 "append %[src_ptr_l1], %[src_ptr_r2], 8 " 1935 "\n\t" 1936 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] " 1937 "\n\t" 1938 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] " 1939 "\n\t" 1940 1941 "lbu %[src_ptr_l1], -12(%[src_ptr]) " 1942 "\n\t" 1943 "lbu %[src_ptr_0], 4(%[src_ptr]) " 1944 "\n\t" 1945 "lbu %[src_ptr_r1], 20(%[src_ptr]) " 1946 "\n\t" 1947 "lbu %[src_ptr_r2], 36(%[src_ptr]) " 1948 "\n\t" 1949 "mtlo %[vector4a], $ac2 " 1950 "\n\t" 1951 "extp %[Temp4], $ac3, 9 " 1952 "\n\t" 1953 1954 "append %[src_ptr_0], %[src_ptr_r1], 8 " 1955 "\n\t" 1956 "append %[src_ptr_l1], %[src_ptr_r2], 8 " 1957 "\n\t" 1958 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] " 1959 "\n\t" 1960 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] " 1961 "\n\t" 1962 1963 "lbu %[src_ptr_l1], -11(%[src_ptr]) " 1964 "\n\t" 1965 "lbu %[src_ptr_0], 5(%[src_ptr]) " 1966 "\n\t" 1967 "lbu %[src_ptr_r1], 21(%[src_ptr]) " 1968 "\n\t" 1969 "lbu %[src_ptr_r2], 37(%[src_ptr]) " 1970 "\n\t" 1971 "mtlo %[vector4a], $ac3 " 1972 "\n\t" 1973 "extp %[Temp5], $ac2, 9 " 1974 "\n\t" 1975 1976 "append %[src_ptr_0], %[src_ptr_r1], 8 " 1977 "\n\t" 1978 "append %[src_ptr_l1], %[src_ptr_r2], 8 " 1979 "\n\t" 1980 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] " 1981 "\n\t" 1982 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] " 1983 "\n\t" 1984 1985 "lbu %[src_ptr_l1], -10(%[src_ptr]) " 1986 "\n\t" 1987 "lbu %[src_ptr_0], 6(%[src_ptr]) " 1988 "\n\t" 1989 "lbu %[src_ptr_r1], 22(%[src_ptr]) " 1990 "\n\t" 1991 "lbu %[src_ptr_r2], 38(%[src_ptr]) " 1992 "\n\t" 1993 "mtlo %[vector4a], $ac1 " 1994 "\n\t" 1995 "extp %[Temp6], $ac3, 9 " 
1996 "\n\t" 1997 1998 "append %[src_ptr_0], %[src_ptr_r1], 8 " 1999 "\n\t" 2000 "append %[src_ptr_l1], %[src_ptr_r2], 8 " 2001 "\n\t" 2002 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] " 2003 "\n\t" 2004 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] " 2005 "\n\t" 2006 2007 "lbu %[src_ptr_l1], -9(%[src_ptr]) " 2008 "\n\t" 2009 "lbu %[src_ptr_0], 7(%[src_ptr]) " 2010 "\n\t" 2011 "lbu %[src_ptr_r1], 23(%[src_ptr]) " 2012 "\n\t" 2013 "lbu %[src_ptr_r2], 39(%[src_ptr]) " 2014 "\n\t" 2015 "mtlo %[vector4a], $ac3 " 2016 "\n\t" 2017 "extp %[Temp7], $ac1, 9 " 2018 "\n\t" 2019 2020 "append %[src_ptr_0], %[src_ptr_r1], 8 " 2021 "\n\t" 2022 "append %[src_ptr_l1], %[src_ptr_r2], 8 " 2023 "\n\t" 2024 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] " 2025 "\n\t" 2026 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] " 2027 "\n\t" 2028 "extp %[Temp8], $ac3, 9 " 2029 "\n\t" 2030 2031 : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 2032 [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6), 2033 [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8), 2034 [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), 2035 [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2) 2036 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b), 2037 [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr)); 2038 2039 /* clamp and store results */ 2040 output_ptr[j] = cm[Temp1]; 2041 output_ptr[j + 1] = cm[Temp2]; 2042 output_ptr[j + 2] = cm[Temp3]; 2043 output_ptr[j + 3] = cm[Temp4]; 2044 output_ptr[j + 4] = cm[Temp5]; 2045 output_ptr[j + 5] = cm[Temp6]; 2046 output_ptr[j + 6] = cm[Temp7]; 2047 output_ptr[j + 7] = cm[Temp8]; 2048 2049 src_ptr += 8; 2050 } 2051 2052 output_ptr += output_pitch; 2053 } 2054 } else { 2055 /* 4 tap filter */ 2056 2057 /* prefetch src_ptr data to cache memory */ 2058 prefetch_load(src_ptr + 16); 2059 2060 /* unroll for loop */ 2061 for (i = 16; i--;) { 2062 /* apply filter with vectors pairs */ 2063 __asm__ __volatile__( 2064 "lbu %[src_ptr_l2], -32(%[src_ptr]) 
\n\t" 2065 "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t" 2066 "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t" 2067 "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t" 2068 "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t" 2069 "lbu %[src_ptr_r3], 48(%[src_ptr]) \n\t" 2070 "mtlo %[vector4a], $ac2 \n\t" 2071 2072 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2073 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2074 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2075 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" 2076 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 2077 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 2078 2079 "lbu %[src_ptr_l2], -31(%[src_ptr]) \n\t" 2080 "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t" 2081 "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t" 2082 "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t" 2083 "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t" 2084 "lbu %[src_ptr_r3], 49(%[src_ptr]) \n\t" 2085 "mtlo %[vector4a], $ac0 \n\t" 2086 "extp %[Temp1], $ac2, 9 \n\t" 2087 2088 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2089 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2090 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2091 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" 2092 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" 2093 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" 2094 2095 "lbu %[src_ptr_l2], -30(%[src_ptr]) \n\t" 2096 "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t" 2097 "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t" 2098 "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t" 2099 "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t" 2100 "lbu %[src_ptr_r3], 50(%[src_ptr]) \n\t" 2101 "mtlo %[vector4a], $ac1 \n\t" 2102 "extp %[Temp2], $ac0, 9 \n\t" 2103 2104 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2105 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2106 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2107 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" 2108 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2109 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2110 2111 "lbu %[src_ptr_l2], -29(%[src_ptr]) \n\t" 
2112 "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t" 2113 "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t" 2114 "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t" 2115 "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t" 2116 "lbu %[src_ptr_r3], 51(%[src_ptr]) \n\t" 2117 "mtlo %[vector4a], $ac3 \n\t" 2118 "extp %[Temp3], $ac1, 9 \n\t" 2119 2120 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2121 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2122 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2123 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" 2124 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2125 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2126 2127 "lbu %[src_ptr_l2], -28(%[src_ptr]) \n\t" 2128 "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t" 2129 "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t" 2130 "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t" 2131 "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t" 2132 "lbu %[src_ptr_r3], 52(%[src_ptr]) \n\t" 2133 "mtlo %[vector4a], $ac2 \n\t" 2134 "extp %[Temp4], $ac3, 9 \n\t" 2135 2136 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2137 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2138 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2139 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" 2140 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 2141 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 2142 2143 "lbu %[src_ptr_l2], -27(%[src_ptr]) \n\t" 2144 "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t" 2145 "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t" 2146 "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t" 2147 "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t" 2148 "lbu %[src_ptr_r3], 53(%[src_ptr]) \n\t" 2149 "mtlo %[vector4a], $ac0 \n\t" 2150 "extp %[Temp5], $ac2, 9 \n\t" 2151 2152 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2153 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2154 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2155 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" 2156 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" 2157 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" 2158 2159 "lbu 
%[src_ptr_l2], -26(%[src_ptr]) \n\t" 2160 "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t" 2161 "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t" 2162 "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t" 2163 "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t" 2164 "lbu %[src_ptr_r3], 54(%[src_ptr]) \n\t" 2165 "mtlo %[vector4a], $ac1 \n\t" 2166 "extp %[Temp6], $ac0, 9 \n\t" 2167 2168 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2169 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2170 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2171 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" 2172 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2173 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2174 2175 "lbu %[src_ptr_l2], -25(%[src_ptr]) \n\t" 2176 "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t" 2177 "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t" 2178 "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t" 2179 "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t" 2180 "lbu %[src_ptr_r3], 55(%[src_ptr]) \n\t" 2181 "mtlo %[vector4a], $ac3 \n\t" 2182 "extp %[Temp7], $ac1, 9 \n\t" 2183 2184 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2185 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2186 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2187 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" 2188 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2189 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2190 "extp %[Temp8], $ac3, 9 \n\t" 2191 2192 : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 2193 [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6), 2194 [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8), 2195 [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), 2196 [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2), 2197 [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3) 2198 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), 2199 [vector3b] "r"(vector3b), [vector4a] "r"(vector4a), 2200 [src_ptr] "r"(src_ptr)); 2201 2202 /* clamp and store results */ 2203 output_ptr[0] = cm[Temp1]; 2204 
output_ptr[1] = cm[Temp2]; 2205 output_ptr[2] = cm[Temp3]; 2206 output_ptr[3] = cm[Temp4]; 2207 output_ptr[4] = cm[Temp5]; 2208 output_ptr[5] = cm[Temp6]; 2209 output_ptr[6] = cm[Temp7]; 2210 output_ptr[7] = cm[Temp8]; 2211 2212 /* apply filter with vectors pairs */ 2213 __asm__ __volatile__( 2214 "lbu %[src_ptr_l2], -24(%[src_ptr]) \n\t" 2215 "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t" 2216 "lbu %[src_ptr_0], 8(%[src_ptr]) \n\t" 2217 "lbu %[src_ptr_r1], 24(%[src_ptr]) \n\t" 2218 "lbu %[src_ptr_r2], 40(%[src_ptr]) \n\t" 2219 "lbu %[src_ptr_r3], 56(%[src_ptr]) \n\t" 2220 "mtlo %[vector4a], $ac2 \n\t" 2221 2222 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2223 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2224 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2225 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" 2226 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 2227 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 2228 2229 "lbu %[src_ptr_l2], -23(%[src_ptr]) \n\t" 2230 "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t" 2231 "lbu %[src_ptr_0], 9(%[src_ptr]) \n\t" 2232 "lbu %[src_ptr_r1], 25(%[src_ptr]) \n\t" 2233 "lbu %[src_ptr_r2], 41(%[src_ptr]) \n\t" 2234 "lbu %[src_ptr_r3], 57(%[src_ptr]) \n\t" 2235 "mtlo %[vector4a], $ac0 \n\t" 2236 "extp %[Temp1], $ac2, 9 \n\t" 2237 2238 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2239 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2240 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2241 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" 2242 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" 2243 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" 2244 2245 "lbu %[src_ptr_l2], -22(%[src_ptr]) \n\t" 2246 "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t" 2247 "lbu %[src_ptr_0], 10(%[src_ptr]) \n\t" 2248 "lbu %[src_ptr_r1], 26(%[src_ptr]) \n\t" 2249 "lbu %[src_ptr_r2], 42(%[src_ptr]) \n\t" 2250 "lbu %[src_ptr_r3], 58(%[src_ptr]) \n\t" 2251 "mtlo %[vector4a], $ac1 \n\t" 2252 "extp %[Temp2], $ac0, 9 \n\t" 2253 2254 "append %[src_ptr_l2], 
%[src_ptr_r3], 8 \n\t" 2255 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2256 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2257 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" 2258 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2259 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2260 2261 "lbu %[src_ptr_l2], -21(%[src_ptr]) \n\t" 2262 "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t" 2263 "lbu %[src_ptr_0], 11(%[src_ptr]) \n\t" 2264 "lbu %[src_ptr_r1], 27(%[src_ptr]) \n\t" 2265 "lbu %[src_ptr_r2], 43(%[src_ptr]) \n\t" 2266 "lbu %[src_ptr_r3], 59(%[src_ptr]) \n\t" 2267 "mtlo %[vector4a], $ac3 \n\t" 2268 "extp %[Temp3], $ac1, 9 \n\t" 2269 2270 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2271 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2272 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2273 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" 2274 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2275 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2276 2277 "lbu %[src_ptr_l2], -20(%[src_ptr]) \n\t" 2278 "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t" 2279 "lbu %[src_ptr_0], 12(%[src_ptr]) \n\t" 2280 "lbu %[src_ptr_r1], 28(%[src_ptr]) \n\t" 2281 "lbu %[src_ptr_r2], 44(%[src_ptr]) \n\t" 2282 "lbu %[src_ptr_r3], 60(%[src_ptr]) \n\t" 2283 "mtlo %[vector4a], $ac2 \n\t" 2284 "extp %[Temp4], $ac3, 9 \n\t" 2285 2286 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2287 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2288 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2289 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t" 2290 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t" 2291 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t" 2292 2293 "lbu %[src_ptr_l2], -19(%[src_ptr]) \n\t" 2294 "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t" 2295 "lbu %[src_ptr_0], 13(%[src_ptr]) \n\t" 2296 "lbu %[src_ptr_r1], 29(%[src_ptr]) \n\t" 2297 "lbu %[src_ptr_r2], 45(%[src_ptr]) \n\t" 2298 "lbu %[src_ptr_r3], 61(%[src_ptr]) \n\t" 2299 "mtlo %[vector4a], $ac0 \n\t" 2300 "extp %[Temp5], $ac2, 9 \n\t" 2301 
2302 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2303 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2304 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2305 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t" 2306 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t" 2307 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t" 2308 2309 "lbu %[src_ptr_l2], -18(%[src_ptr]) \n\t" 2310 "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t" 2311 "lbu %[src_ptr_0], 14(%[src_ptr]) \n\t" 2312 "lbu %[src_ptr_r1], 30(%[src_ptr]) \n\t" 2313 "lbu %[src_ptr_r2], 46(%[src_ptr]) \n\t" 2314 "lbu %[src_ptr_r3], 62(%[src_ptr]) \n\t" 2315 "mtlo %[vector4a], $ac1 \n\t" 2316 "extp %[Temp6], $ac0, 9 \n\t" 2317 2318 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2319 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2320 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2321 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t" 2322 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t" 2323 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t" 2324 2325 "lbu %[src_ptr_l2], -17(%[src_ptr]) \n\t" 2326 "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t" 2327 "lbu %[src_ptr_0], 15(%[src_ptr]) \n\t" 2328 "lbu %[src_ptr_r1], 31(%[src_ptr]) \n\t" 2329 "lbu %[src_ptr_r2], 47(%[src_ptr]) \n\t" 2330 "lbu %[src_ptr_r3], 63(%[src_ptr]) \n\t" 2331 "mtlo %[vector4a], $ac3 \n\t" 2332 "extp %[Temp7], $ac1, 9 \n\t" 2333 2334 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t" 2335 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t" 2336 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t" 2337 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t" 2338 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t" 2339 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t" 2340 "extp %[Temp8], $ac3, 9 \n\t" 2341 2342 : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 2343 [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6), 2344 [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8), 2345 [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0), 2346 [src_ptr_r1] 
"=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
            [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
            [src_ptr] "r"(src_ptr));

      /* advance to the next 16 input pixels of this row */
      src_ptr += 16;

      /* clamp the accumulator results through the clamping table and store
         the second group of 8 output pixels */
      output_ptr[8] = cm[Temp1];
      output_ptr[9] = cm[Temp2];
      output_ptr[10] = cm[Temp3];
      output_ptr[11] = cm[Temp4];
      output_ptr[12] = cm[Temp5];
      output_ptr[13] = cm[Temp6];
      output_ptr[14] = cm[Temp7];
      output_ptr[15] = cm[Temp8];

      output_ptr += output_pitch;
    }
  }
}

/* 4x4 six-tap subpel prediction.
 * If yoffset != 0: horizontal first pass into the FData scratch buffer
 * (9 rows: 4 output rows plus 5 rows of vertical-filter context, starting
 * 2 rows above src_ptr), then vertical second pass into dst_ptr.
 * If yoffset == 0: single horizontal pass straight into dst_ptr. */
void vp8_sixtap_predict4x4_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char FData[9 * 4]; /* temp data buffer used in filtering */
  unsigned int pos = 16;

  /* bit position for extract from acc: wrdsp with mask 1 writes the
     DSPControl pos field consumed by the extp instructions in the filters */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    /* First filter 1-D horizontally... */
    vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
                                    src_pixels_per_line, 9, xoffset, 4);
    /* ...then filter vertically; FData + 8 skips the 2 context rows of 4 */
    vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
  } else
    /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
    vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line, 4,
                                    xoffset, dst_pitch);
}

/* 8x8 six-tap subpel prediction.
 * yoffset != 0: horizontal pass into FData (13 rows = 8 + 5 context rows),
 * then vertical pass; when xoffset == 0 the horizontal pass degenerates to
 * a raw 13-row copy done with unrolled unaligned loads (ulw) and stores.
 * yoffset == 0: horizontal pass (or plain 8-row copy) direct to dst_ptr. */
void vp8_sixtap_predict8x8_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char FData[13 * 8]; /* temp data buffer used in filtering */
  unsigned int pos, Temp1, Temp2;

  pos = 16;

  /* bit position for extract from acc (see vp8_sixtap_predict4x4_dspr2) */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    /* back up 2 rows to provide the six-tap vertical filter's top context */
    src_ptr = src_ptr - (2 * src_pixels_per_line);

    if (xoffset) /* filter 1-D horizontally... */
      vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                          13, xoffset, 8);

    else {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + 2 * src_pixels_per_line);

      /* xoffset == 0: copy 13 rows of 8 bytes into FData, fully unrolled;
         ulw handles unaligned sources, FData rows are packed 8 apart */
      __asm__ __volatile__(
          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 0(%[FData])                            \n\t"
          "sw   %[Temp2], 4(%[FData])                            \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 8(%[FData])                            \n\t"
          "sw   %[Temp2], 12(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 16(%[FData])                           \n\t"
          "sw   %[Temp2], 20(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 24(%[FData])                           \n\t"
          "sw   %[Temp2], 28(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 32(%[FData])                           \n\t"
          "sw   %[Temp2], 36(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 40(%[FData])                           \n\t"
          "sw   %[Temp2], 44(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 48(%[FData])                           \n\t"
          "sw   %[Temp2], 52(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 56(%[FData])                           \n\t"
          "sw   %[Temp2], 60(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 64(%[FData])                           \n\t"
          "sw   %[Temp2], 68(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 72(%[FData])                           \n\t"
          "sw   %[Temp2], 76(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 80(%[FData])                           \n\t"
          "sw   %[Temp2], 84(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 88(%[FData])                           \n\t"
          "sw   %[Temp2], 92(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 96(%[FData])                           \n\t"
          "sw   %[Temp2], 100(%[FData])                          \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
          : [FData] "r"(FData), [src_ptr] "r"(src_ptr),
            [src_pixels_per_line] "r"(src_pixels_per_line));
    }

    /* filter vertically; FData + 16 skips the 2 context rows of 8 */
    vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8,
                                     yoffset);
  }

  /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
  else {
    if (xoffset)
      vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                          8, xoffset, dst_pitch);

    else {
      /* copy from src buffer to dst buffer.
         NOTE(review): the stores use fixed offsets 0..60, i.e. consecutive
         8-byte rows in dst — this presumes dst_pitch == 8 on this path;
         confirm against callers. */
      __asm__ __volatile__(
          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 0(%[dst_ptr])                          \n\t"
          "sw   %[Temp2], 4(%[dst_ptr])                          \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 8(%[dst_ptr])                          \n\t"
          "sw   %[Temp2], 12(%[dst_ptr])                         \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 16(%[dst_ptr])                         \n\t"
          "sw   %[Temp2], 20(%[dst_ptr])                         \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 24(%[dst_ptr])                         \n\t"
          "sw   %[Temp2], 28(%[dst_ptr])                         \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 32(%[dst_ptr])                         \n\t"
          "sw   %[Temp2], 36(%[dst_ptr])                         \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 40(%[dst_ptr])                         \n\t"
          "sw   %[Temp2], 44(%[dst_ptr])                         \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 48(%[dst_ptr])                         \n\t"
          "sw   %[Temp2], 52(%[dst_ptr])                         \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 56(%[dst_ptr])                         \n\t"
          "sw   %[Temp2], 60(%[dst_ptr])                         \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
          : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr),
            [src_pixels_per_line] "r"(src_pixels_per_line));
    }
  }
}

/* 8x4 six-tap subpel prediction; same structure as the 8x8 variant but with
 * 4 output rows (FData holds 9 rows = 4 + 5 context rows). */
void vp8_sixtap_predict8x4_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char FData[9 * 8]; /* temp data buffer used in filtering */
  unsigned int pos, Temp1, Temp2;

  pos = 16;

  /* bit position for extract from acc (see vp8_sixtap_predict4x4_dspr2) */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    /* back up 2 rows to provide the six-tap vertical filter's top context */
    src_ptr = src_ptr - (2 * src_pixels_per_line);

    if (xoffset) /* filter 1-D horizontally... */
      vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                          9, xoffset, 8);

    else {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + 2 * src_pixels_per_line);

      /* xoffset == 0: copy 9 rows of 8 bytes into FData, fully unrolled */
      __asm__ __volatile__(
          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 0(%[FData])                            \n\t"
          "sw   %[Temp2], 4(%[FData])                            \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 8(%[FData])                            \n\t"
          "sw   %[Temp2], 12(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 16(%[FData])                           \n\t"
          "sw   %[Temp2], 20(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 24(%[FData])                           \n\t"
          "sw   %[Temp2], 28(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 32(%[FData])                           \n\t"
          "sw   %[Temp2], 36(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 40(%[FData])                           \n\t"
          "sw   %[Temp2], 44(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 48(%[FData])                           \n\t"
          "sw   %[Temp2], 52(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 56(%[FData])                           \n\t"
          "sw   %[Temp2], 60(%[FData])                           \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 64(%[FData])                           \n\t"
          "sw   %[Temp2], 68(%[FData])                           \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
          : [FData] "r"(FData), [src_ptr] "r"(src_ptr),
            [src_pixels_per_line] "r"(src_pixels_per_line));
    }

    /* filter vertically; FData + 16 skips the 2 context rows of 8 */
    vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8,
                                     yoffset);
  }

  /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
  else {
    if (xoffset)
      vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                          4, xoffset, dst_pitch);

    else {
      /* copy from src buffer to dst buffer.
         NOTE(review): as in the 8x8 copy path, the fixed dst offsets
         0..28 presume contiguous rows (dst_pitch == 8) — confirm callers. */
      __asm__ __volatile__(
          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 0(%[dst_ptr])                          \n\t"
          "sw   %[Temp2], 4(%[dst_ptr])                          \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 8(%[dst_ptr])                          \n\t"
          "sw   %[Temp2], 12(%[dst_ptr])                         \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 16(%[dst_ptr])                         \n\t"
          "sw   %[Temp2], 20(%[dst_ptr])                         \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line]   \n\t"

          "ulw  %[Temp1], 0(%[src_ptr])                          \n\t"
          "ulw  %[Temp2], 4(%[src_ptr])                          \n\t"
          "sw   %[Temp1], 24(%[dst_ptr])                         \n\t"
          "sw   %[Temp2], 28(%[dst_ptr])                         \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
          : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr),
            [src_pixels_per_line] "r"(src_pixels_per_line));
    }
  }
}

/* 16x16 six-tap subpel prediction.
 * Dispatches the horizontal first pass on xoffset: even offsets use the
 * 6-tap filter, odd offsets the 4-tap filter (their outer filter taps are
 * zero in sub_pel_filters_*_tap_4), and 0 is a plain copy.  When
 * yoffset != 0 the vertical second pass then runs with the packed
 * sub_pel_filterss[yoffset] coefficients. */
void vp8_sixtap_predict16x16_dspr2(unsigned char *RESTRICT src_ptr,
                                   int src_pixels_per_line, int xoffset,
                                   int yoffset,
                                   unsigned char *RESTRICT dst_ptr,
                                   int dst_pitch) {
  const unsigned short *VFilter;
  unsigned char FData[21 * 16]; /* temp data buffer used in filtering */
  unsigned int pos;

  VFilter = sub_pel_filterss[yoffset];

  pos = 16;

  /* bit position for extract from acc (see vp8_sixtap_predict4x4_dspr2) */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    /* back up 2 rows to provide the six-tap vertical filter's top context */
    src_ptr = src_ptr - (2 * src_pixels_per_line);

    switch (xoffset) {
      /* filter 1-D horizontally... */
      case 2:
      case 4:
      case 6:
        /* 6 tap filter; 21 rows = 16 + 5 rows of vertical context */
        vp8_filter_block2d_first_pass16_6tap(
            src_ptr, FData, src_pixels_per_line, 21, xoffset, 16);
        break;

      case 0:
        /* only copy buffer */
        vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        /* 4 tap filter; this variant also receives yoffset and the final
           destination, unlike the 6-tap first pass */
        vp8_filter_block2d_first_pass16_4tap(
            src_ptr, FData, src_pixels_per_line, 16, 21, xoffset, yoffset,
            dst_ptr, dst_pitch);
        break;
    }

    /* filter vertically; FData + 32 skips the 2 context rows of 16 */
    vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
  } else {
    /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr.
       NOTE(review): there is no case 0 here, so xoffset == 0 && yoffset == 0
       performs no work — presumably callers use a plain copy routine for
       full-pel motion; confirm. */
    switch (xoffset) {
      case 2:
      case 4:
      case 6:
        /* 6 tap filter */
        vp8_filter_block2d_first_pass16_6tap(
            src_ptr, dst_ptr, src_pixels_per_line, 16, xoffset, dst_pitch);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        /* 4 tap filter */
        vp8_filter_block2d_first_pass16_4tap(
            src_ptr, dst_ptr, src_pixels_per_line, 16, 21, xoffset, yoffset,
            dst_ptr, dst_pitch);
        break;
    }
  }
}

#endif