1 /* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include <stdio.h> 13 14 #include "./vpx_dsp_rtcd.h" 15 #include "vpx_dsp/mips/convolve_common_dspr2.h" 16 #include "vpx_dsp/vpx_dsp_common.h" 17 #include "vpx_dsp/vpx_filter.h" 18 #include "vpx_ports/mem.h" 19 20 #if HAVE_DSPR2 21 static void convolve_bi_horiz_4_transposed_dspr2( 22 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 23 const int16_t *filter_x0, int32_t h) { 24 int32_t y; 25 uint8_t *cm = vpx_ff_cropTbl; 26 uint8_t *dst_ptr; 27 int32_t Temp1, Temp2; 28 uint32_t vector4a = 64; 29 uint32_t tp1, tp2; 30 uint32_t p1, p2; 31 const int16_t *filter = &filter_x0[3]; 32 uint32_t filter45; 33 34 filter45 = ((const int32_t *)filter)[0]; 35 36 for (y = h; y--;) { 37 dst_ptr = dst; 38 /* prefetch data to cache memory */ 39 prefetch_load(src + src_stride); 40 prefetch_load(src + src_stride + 32); 41 42 __asm__ __volatile__( 43 "ulw %[tp1], 0(%[src]) \n\t" 44 "ulw %[tp2], 4(%[src]) \n\t" 45 46 /* even 1. pixel */ 47 "mtlo %[vector4a], $ac3 \n\t" 48 "mthi $zero, $ac3 \n\t" 49 "preceu.ph.qbr %[p1], %[tp1] \n\t" 50 "preceu.ph.qbl %[p2], %[tp1] \n\t" 51 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 52 "extp %[Temp1], $ac3, 31 \n\t" 53 54 /* even 2. pixel */ 55 "mtlo %[vector4a], $ac2 \n\t" 56 "mthi $zero, $ac2 \n\t" 57 "balign %[tp2], %[tp1], 3 \n\t" 58 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" 59 "extp %[Temp2], $ac2, 31 \n\t" 60 61 /* odd 1. pixel */ 62 "lbux %[tp1], %[Temp1](%[cm]) \n\t" 63 "mtlo %[vector4a], $ac3 \n\t" 64 "mthi $zero, $ac3 \n\t" 65 "preceu.ph.qbr %[p1], %[tp2] \n\t" 66 "preceu.ph.qbl %[p2], %[tp2] \n\t" 67 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 68 "extp %[Temp1], $ac3, 31 \n\t" 69 70 /* odd 2. pixel */ 71 "lbux %[tp2], %[Temp2](%[cm]) \n\t" 72 "mtlo %[vector4a], $ac2 \n\t" 73 "mthi $zero, $ac2 \n\t" 74 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" 75 "extp %[Temp2], $ac2, 31 \n\t" 76 77 /* clamp */ 78 "lbux %[p1], %[Temp1](%[cm]) \n\t" 79 "lbux %[p2], %[Temp2](%[cm]) \n\t" 80 81 /* store bytes */ 82 "sb %[tp1], 0(%[dst_ptr]) \n\t" 83 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 84 85 "sb %[p1], 0(%[dst_ptr]) \n\t" 86 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 87 88 "sb %[tp2], 0(%[dst_ptr]) \n\t" 89 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 90 91 "sb %[p2], 0(%[dst_ptr]) \n\t" 92 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 93 94 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), 95 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) 96 : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), 97 [src] "r"(src), [dst_stride] "r"(dst_stride)); 98 99 /* Next row... */ 100 src += src_stride; 101 dst += 1; 102 } 103 } 104 105 static void convolve_bi_horiz_8_transposed_dspr2( 106 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 107 const int16_t *filter_x0, int32_t h) { 108 int32_t y; 109 uint8_t *cm = vpx_ff_cropTbl; 110 uint8_t *dst_ptr; 111 uint32_t vector4a = 64; 112 int32_t Temp1, Temp2, Temp3; 113 uint32_t tp1, tp2, tp3; 114 uint32_t p1, p2, p3, p4; 115 uint8_t *odd_dst; 116 uint32_t dst_pitch_2 = (dst_stride << 1); 117 const int16_t *filter = &filter_x0[3]; 118 uint32_t filter45; 119 120 filter45 = ((const int32_t *)filter)[0]; 121 122 for (y = h; y--;) { 123 /* prefetch data to cache memory */ 124 prefetch_load(src + src_stride); 125 prefetch_load(src + src_stride + 32); 126 127 dst_ptr = dst; 128 odd_dst = (dst_ptr + dst_stride); 129 130 __asm__ __volatile__( 131 "ulw %[tp1], 0(%[src]) \n\t" 132 "ulw %[tp2], 4(%[src]) \n\t" 133 134 /* even 1. pixel */ 135 "mtlo %[vector4a], $ac3 \n\t" 136 "mthi $zero, $ac3 \n\t" 137 "mtlo %[vector4a], $ac2 \n\t" 138 "mthi $zero, $ac2 \n\t" 139 "preceu.ph.qbr %[p1], %[tp1] \n\t" 140 "preceu.ph.qbl %[p2], %[tp1] \n\t" 141 "preceu.ph.qbr %[p3], %[tp2] \n\t" 142 "preceu.ph.qbl %[p4], %[tp2] \n\t" 143 "ulw %[tp3], 8(%[src]) \n\t" 144 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 145 "extp %[Temp1], $ac3, 31 \n\t" 146 147 /* even 2. pixel */ 148 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" 149 "extp %[Temp3], $ac2, 31 \n\t" 150 151 /* even 3. pixel */ 152 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 153 "mtlo %[vector4a], $ac1 \n\t" 154 "mthi $zero, $ac1 \n\t" 155 "balign %[tp3], %[tp2], 3 \n\t" 156 "balign %[tp2], %[tp1], 3 \n\t" 157 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" 158 "lbux %[tp1], %[Temp3](%[cm]) \n\t" 159 "extp %[p3], $ac1, 31 \n\t" 160 161 /* even 4. pixel */ 162 "mtlo %[vector4a], $ac2 \n\t" 163 "mthi $zero, $ac2 \n\t" 164 "mtlo %[vector4a], $ac3 \n\t" 165 "mthi $zero, $ac3 \n\t" 166 "sb %[Temp2], 0(%[dst_ptr]) \n\t" 167 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 168 "sb %[tp1], 0(%[dst_ptr]) \n\t" 169 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 170 171 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" 172 "extp %[Temp3], $ac2, 31 \n\t" 173 174 "lbux %[Temp1], %[p3](%[cm]) " 175 "\n\t" 176 177 /* odd 1. pixel */ 178 "mtlo %[vector4a], $ac1 \n\t" 179 "mthi $zero, $ac1 \n\t" 180 "preceu.ph.qbr %[p1], %[tp2] \n\t" 181 "preceu.ph.qbl %[p2], %[tp2] \n\t" 182 "preceu.ph.qbr %[p3], %[tp3] \n\t" 183 "preceu.ph.qbl %[p4], %[tp3] \n\t" 184 "sb %[Temp1], 0(%[dst_ptr]) \n\t" 185 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 186 187 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 188 "extp %[Temp2], $ac3, 31 \n\t" 189 190 /* odd 2. pixel */ 191 "lbux %[tp1], %[Temp3](%[cm]) \n\t" 192 "mtlo %[vector4a], $ac3 \n\t" 193 "mthi $zero, $ac3 \n\t" 194 "mtlo %[vector4a], $ac2 \n\t" 195 "mthi $zero, $ac2 \n\t" 196 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" 197 "sb %[tp1], 0(%[dst_ptr]) \n\t" 198 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 199 "extp %[Temp3], $ac1, 31 \n\t" 200 201 /* odd 3. pixel */ 202 "lbux %[tp3], %[Temp2](%[cm]) \n\t" 203 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" 204 "extp %[Temp2], $ac3, 31 \n\t" 205 206 /* odd 4. pixel */ 207 "sb %[tp3], 0(%[odd_dst]) \n\t" 208 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 209 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" 210 "extp %[Temp1], $ac2, 31 \n\t" 211 212 /* clamp */ 213 "lbux %[p4], %[Temp3](%[cm]) \n\t" 214 "lbux %[p2], %[Temp2](%[cm]) \n\t" 215 "lbux %[p1], %[Temp1](%[cm]) \n\t" 216 217 /* store bytes */ 218 "sb %[p4], 0(%[odd_dst]) \n\t" 219 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 220 221 "sb %[p2], 0(%[odd_dst]) \n\t" 222 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 223 224 "sb %[p1], 0(%[odd_dst]) \n\t" 225 226 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), 227 [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), 228 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), 229 [odd_dst] "+r"(odd_dst) 230 : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), 231 [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); 232 233 /* Next row... */ 234 src += src_stride; 235 dst += 1; 236 } 237 } 238 239 static void convolve_bi_horiz_16_transposed_dspr2( 240 const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, 241 int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { 242 int32_t c, y; 243 const uint8_t *src; 244 uint8_t *dst; 245 uint8_t *cm = vpx_ff_cropTbl; 246 uint32_t vector_64 = 64; 247 int32_t Temp1, Temp2, Temp3; 248 uint32_t qload1, qload2; 249 uint32_t p1, p2, p3, p4, p5; 250 uint32_t st1, st2, st3; 251 uint32_t dst_pitch_2 = (dst_stride << 1); 252 uint8_t *odd_dst; 253 const int16_t *filter = &filter_x0[3]; 254 uint32_t filter45; 255 256 filter45 = ((const int32_t *)filter)[0]; 257 258 for (y = h; y--;) { 259 /* prefetch data to cache memory */ 260 prefetch_load(src_ptr + src_stride); 261 prefetch_load(src_ptr + src_stride + 32); 262 263 src = src_ptr; 264 dst = dst_ptr; 265 266 odd_dst = (dst + dst_stride); 267 268 for (c = 0; c < count; c++) { 269 __asm__ __volatile__( 270 "ulw %[qload1], 0(%[src]) " 271 "\n\t" 272 "ulw %[qload2], 4(%[src]) " 273 "\n\t" 274 275 /* even 1. pixel */ 276 "mtlo %[vector_64], $ac1 " 277 "\n\t" /* even 1 */ 278 "mthi $zero, $ac1 " 279 "\n\t" 280 "mtlo %[vector_64], $ac2 " 281 "\n\t" /* even 2 */ 282 "mthi $zero, $ac2 " 283 "\n\t" 284 "preceu.ph.qbr %[p1], %[qload1] " 285 "\n\t" 286 "preceu.ph.qbl %[p2], %[qload1] " 287 "\n\t" 288 "preceu.ph.qbr %[p3], %[qload2] " 289 "\n\t" 290 "preceu.ph.qbl %[p4], %[qload2] " 291 "\n\t" 292 "ulw %[qload1], 8(%[src]) " 293 "\n\t" 294 "dpa.w.ph $ac1, %[p1], %[filter45] " 295 "\n\t" /* even 1 */ 296 "extp %[Temp1], $ac1, 31 " 297 "\n\t" /* even 1 */ 298 299 /* even 2. pixel */ 300 "mtlo %[vector_64], $ac3 " 301 "\n\t" /* even 3 */ 302 "mthi $zero, $ac3 " 303 "\n\t" 304 "preceu.ph.qbr %[p1], %[qload1] " 305 "\n\t" 306 "preceu.ph.qbl %[p5], %[qload1] " 307 "\n\t" 308 "ulw %[qload2], 12(%[src]) " 309 "\n\t" 310 "dpa.w.ph $ac2, %[p2], %[filter45] " 311 "\n\t" /* even 1 */ 312 "lbux %[st1], %[Temp1](%[cm]) " 313 "\n\t" /* even 1 */ 314 "extp %[Temp2], $ac2, 31 " 315 "\n\t" /* even 1 */ 316 317 /* even 3. pixel */ 318 "mtlo %[vector_64], $ac1 " 319 "\n\t" /* even 4 */ 320 "mthi $zero, $ac1 " 321 "\n\t" 322 "preceu.ph.qbr %[p2], %[qload2] " 323 "\n\t" 324 "sb %[st1], 0(%[dst]) " 325 "\n\t" /* even 1 */ 326 "addu %[dst], %[dst], %[dst_pitch_2] " 327 " \n\t" 328 "dpa.w.ph $ac3, %[p3], %[filter45] " 329 "\n\t" /* even 3 */ 330 "extp %[Temp3], $ac3, 31 " 331 "\n\t" /* even 3 */ 332 "lbux %[st2], %[Temp2](%[cm]) " 333 "\n\t" /* even 1 */ 334 335 /* even 4. pixel */ 336 "mtlo %[vector_64], $ac2 " 337 "\n\t" /* even 5 */ 338 "mthi $zero, $ac2 " 339 "\n\t" 340 "preceu.ph.qbl %[p3], %[qload2] " 341 "\n\t" 342 "sb %[st2], 0(%[dst]) " 343 "\n\t" /* even 2 */ 344 "addu %[dst], %[dst], %[dst_pitch_2] " 345 "\n\t" 346 "dpa.w.ph $ac1, %[p4], %[filter45] " 347 "\n\t" /* even 4 */ 348 "extp %[Temp1], $ac1, 31 " 349 "\n\t" /* even 4 */ 350 "lbux %[st3], %[Temp3](%[cm]) " 351 "\n\t" /* even 3 */ 352 353 /* even 5. pixel */ 354 "mtlo %[vector_64], $ac3 " 355 "\n\t" /* even 6 */ 356 "mthi $zero, $ac3 " 357 "\n\t" 358 "sb %[st3], 0(%[dst]) " 359 "\n\t" /* even 3 */ 360 "addu %[dst], %[dst], %[dst_pitch_2] " 361 "\n\t" 362 "dpa.w.ph $ac2, %[p1], %[filter45] " 363 "\n\t" /* even 5 */ 364 "extp %[Temp2], $ac2, 31 " 365 "\n\t" /* even 5 */ 366 "lbux %[st1], %[Temp1](%[cm]) " 367 "\n\t" /* even 4 */ 368 369 /* even 6. pixel */ 370 "mtlo %[vector_64], $ac1 " 371 "\n\t" /* even 7 */ 372 "mthi $zero, $ac1 " 373 "\n\t" 374 "sb %[st1], 0(%[dst]) " 375 "\n\t" /* even 4 */ 376 "addu %[dst], %[dst], %[dst_pitch_2] " 377 "\n\t" 378 "ulw %[qload1], 20(%[src]) " 379 "\n\t" 380 "dpa.w.ph $ac3, %[p5], %[filter45] " 381 "\n\t" /* even 6 */ 382 "extp %[Temp3], $ac3, 31 " 383 "\n\t" /* even 6 */ 384 "lbux %[st2], %[Temp2](%[cm]) " 385 "\n\t" /* even 5 */ 386 387 /* even 7. pixel */ 388 "mtlo %[vector_64], $ac2 " 389 "\n\t" /* even 8 */ 390 "mthi $zero, $ac2 " 391 "\n\t" 392 "preceu.ph.qbr %[p5], %[qload1] " 393 "\n\t" 394 "sb %[st2], 0(%[dst]) " 395 "\n\t" /* even 5 */ 396 "addu %[dst], %[dst], %[dst_pitch_2] " 397 "\n\t" 398 "dpa.w.ph $ac1, %[p2], %[filter45] " 399 "\n\t" /* even 7 */ 400 "extp %[Temp1], $ac1, 31 " 401 "\n\t" /* even 7 */ 402 "lbux %[st3], %[Temp3](%[cm]) " 403 "\n\t" /* even 6 */ 404 405 /* even 8. pixel */ 406 "mtlo %[vector_64], $ac3 " 407 "\n\t" /* odd 1 */ 408 "mthi $zero, $ac3 " 409 "\n\t" 410 "dpa.w.ph $ac2, %[p3], %[filter45] " 411 "\n\t" /* even 8 */ 412 "sb %[st3], 0(%[dst]) " 413 "\n\t" /* even 6 */ 414 "addu %[dst], %[dst], %[dst_pitch_2] " 415 "\n\t" 416 "extp %[Temp2], $ac2, 31 " 417 "\n\t" /* even 8 */ 418 "lbux %[st1], %[Temp1](%[cm]) " 419 "\n\t" /* even 7 */ 420 421 /* ODD pixels */ 422 "ulw %[qload1], 1(%[src]) " 423 "\n\t" 424 "ulw %[qload2], 5(%[src]) " 425 "\n\t" 426 427 /* odd 1. pixel */ 428 "mtlo %[vector_64], $ac1 " 429 "\n\t" /* odd 2 */ 430 "mthi $zero, $ac1 " 431 "\n\t" 432 "preceu.ph.qbr %[p1], %[qload1] " 433 "\n\t" 434 "preceu.ph.qbl %[p2], %[qload1] " 435 "\n\t" 436 "preceu.ph.qbr %[p3], %[qload2] " 437 "\n\t" 438 "preceu.ph.qbl %[p4], %[qload2] " 439 "\n\t" 440 "sb %[st1], 0(%[dst]) " 441 "\n\t" /* even 7 */ 442 "addu %[dst], %[dst], %[dst_pitch_2] " 443 "\n\t" 444 "ulw %[qload2], 9(%[src]) " 445 "\n\t" 446 "dpa.w.ph $ac3, %[p1], %[filter45] " 447 "\n\t" /* odd 1 */ 448 "extp %[Temp3], $ac3, 31 " 449 "\n\t" /* odd 1 */ 450 "lbux %[st2], %[Temp2](%[cm]) " 451 "\n\t" /* even 8 */ 452 453 /* odd 2. pixel */ 454 "mtlo %[vector_64], $ac2 " 455 "\n\t" /* odd 3 */ 456 "mthi $zero, $ac2 " 457 "\n\t" 458 "preceu.ph.qbr %[p1], %[qload2] " 459 "\n\t" 460 "preceu.ph.qbl %[p5], %[qload2] " 461 "\n\t" 462 "sb %[st2], 0(%[dst]) " 463 "\n\t" /* even 8 */ 464 "ulw %[qload1], 13(%[src]) " 465 "\n\t" 466 "dpa.w.ph $ac1, %[p2], %[filter45] " 467 "\n\t" /* odd 2 */ 468 "extp %[Temp1], $ac1, 31 " 469 "\n\t" /* odd 2 */ 470 "lbux %[st3], %[Temp3](%[cm]) " 471 "\n\t" /* odd 1 */ 472 473 /* odd 3. pixel */ 474 "mtlo %[vector_64], $ac3 " 475 "\n\t" /* odd 4 */ 476 "mthi $zero, $ac3 " 477 "\n\t" 478 "preceu.ph.qbr %[p2], %[qload1] " 479 "\n\t" 480 "sb %[st3], 0(%[odd_dst]) " 481 "\n\t" /* odd 1 */ 482 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 483 "\n\t" 484 "dpa.w.ph $ac2, %[p3], %[filter45] " 485 "\n\t" /* odd 3 */ 486 "extp %[Temp2], $ac2, 31 " 487 "\n\t" /* odd 3 */ 488 "lbux %[st1], %[Temp1](%[cm]) " 489 "\n\t" /* odd 2 */ 490 491 /* odd 4. pixel */ 492 "mtlo %[vector_64], $ac1 " 493 "\n\t" /* odd 5 */ 494 "mthi $zero, $ac1 " 495 "\n\t" 496 "preceu.ph.qbl %[p3], %[qload1] " 497 "\n\t" 498 "sb %[st1], 0(%[odd_dst]) " 499 "\n\t" /* odd 2 */ 500 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 501 "\n\t" 502 "dpa.w.ph $ac3, %[p4], %[filter45] " 503 "\n\t" /* odd 4 */ 504 "extp %[Temp3], $ac3, 31 " 505 "\n\t" /* odd 4 */ 506 "lbux %[st2], %[Temp2](%[cm]) " 507 "\n\t" /* odd 3 */ 508 509 /* odd 5. pixel */ 510 "mtlo %[vector_64], $ac2 " 511 "\n\t" /* odd 6 */ 512 "mthi $zero, $ac2 " 513 "\n\t" 514 "sb %[st2], 0(%[odd_dst]) " 515 "\n\t" /* odd 3 */ 516 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 517 "\n\t" 518 "dpa.w.ph $ac1, %[p1], %[filter45] " 519 "\n\t" /* odd 5 */ 520 "extp %[Temp1], $ac1, 31 " 521 "\n\t" /* odd 5 */ 522 "lbux %[st3], %[Temp3](%[cm]) " 523 "\n\t" /* odd 4 */ 524 525 /* odd 6. pixel */ 526 "mtlo %[vector_64], $ac3 " 527 "\n\t" /* odd 7 */ 528 "mthi $zero, $ac3 " 529 "\n\t" 530 "sb %[st3], 0(%[odd_dst]) " 531 "\n\t" /* odd 4 */ 532 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 533 "\n\t" 534 "ulw %[qload1], 21(%[src]) " 535 "\n\t" 536 "dpa.w.ph $ac2, %[p5], %[filter45] " 537 "\n\t" /* odd 6 */ 538 "extp %[Temp2], $ac2, 31 " 539 "\n\t" /* odd 6 */ 540 "lbux %[st1], %[Temp1](%[cm]) " 541 "\n\t" /* odd 5 */ 542 543 /* odd 7. pixel */ 544 "mtlo %[vector_64], $ac1 " 545 "\n\t" /* odd 8 */ 546 "mthi $zero, $ac1 " 547 "\n\t" 548 "preceu.ph.qbr %[p5], %[qload1] " 549 "\n\t" 550 "sb %[st1], 0(%[odd_dst]) " 551 "\n\t" /* odd 5 */ 552 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 553 "\n\t" 554 "dpa.w.ph $ac3, %[p2], %[filter45] " 555 "\n\t" /* odd 7 */ 556 "extp %[Temp3], $ac3, 31 " 557 "\n\t" /* odd 7 */ 558 559 /* odd 8. pixel */ 560 "dpa.w.ph $ac1, %[p3], %[filter45] " 561 "\n\t" /* odd 8 */ 562 "extp %[Temp1], $ac1, 31 " 563 "\n\t" /* odd 8 */ 564 565 "lbux %[st2], %[Temp2](%[cm]) " 566 "\n\t" /* odd 6 */ 567 "lbux %[st3], %[Temp3](%[cm]) " 568 "\n\t" /* odd 7 */ 569 "lbux %[st1], %[Temp1](%[cm]) " 570 "\n\t" /* odd 8 */ 571 572 "sb %[st2], 0(%[odd_dst]) " 573 "\n\t" /* odd 6 */ 574 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 575 "\n\t" 576 577 "sb %[st3], 0(%[odd_dst]) " 578 "\n\t" /* odd 7 */ 579 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 580 "\n\t" 581 582 "sb %[st1], 0(%[odd_dst]) " 583 "\n\t" /* odd 8 */ 584 585 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), 586 [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), 587 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), 588 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 589 [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) 590 : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), 591 [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); 592 593 src += 16; 594 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); 595 odd_dst = (dst + dst_stride); 596 } 597 598 /* Next row... */ 599 src_ptr += src_stride; 600 dst_ptr += 1; 601 } 602 } 603 604 static void convolve_bi_horiz_64_transposed_dspr2( 605 const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, 606 int32_t dst_stride, const int16_t *filter_x0, int32_t h) { 607 int32_t c, y; 608 const uint8_t *src; 609 uint8_t *dst; 610 uint8_t *cm = vpx_ff_cropTbl; 611 uint32_t vector_64 = 64; 612 int32_t Temp1, Temp2, Temp3; 613 uint32_t qload1, qload2; 614 uint32_t p1, p2, p3, p4, p5; 615 uint32_t st1, st2, st3; 616 uint32_t dst_pitch_2 = (dst_stride << 1); 617 uint8_t *odd_dst; 618 const int16_t *filter = &filter_x0[3]; 619 uint32_t filter45; 620 621 filter45 = ((const int32_t *)filter)[0]; 622 623 for (y = h; y--;) { 624 /* prefetch data to cache memory */ 625 prefetch_load(src_ptr + src_stride); 626 prefetch_load(src_ptr + src_stride + 32); 627 prefetch_load(src_ptr + src_stride + 64); 628 629 src = src_ptr; 630 dst = dst_ptr; 631 632 odd_dst = (dst + dst_stride); 633 634 for (c = 0; c < 4; c++) { 635 __asm__ __volatile__( 636 "ulw %[qload1], 0(%[src]) " 637 "\n\t" 638 "ulw %[qload2], 4(%[src]) " 639 "\n\t" 640 641 /* even 1. pixel */ 642 "mtlo %[vector_64], $ac1 " 643 "\n\t" /* even 1 */ 644 "mthi $zero, $ac1 " 645 "\n\t" 646 "mtlo %[vector_64], $ac2 " 647 "\n\t" /* even 2 */ 648 "mthi $zero, $ac2 " 649 "\n\t" 650 "preceu.ph.qbr %[p1], %[qload1] " 651 "\n\t" 652 "preceu.ph.qbl %[p2], %[qload1] " 653 "\n\t" 654 "preceu.ph.qbr %[p3], %[qload2] " 655 "\n\t" 656 "preceu.ph.qbl %[p4], %[qload2] " 657 "\n\t" 658 "ulw %[qload1], 8(%[src]) " 659 "\n\t" 660 "dpa.w.ph $ac1, %[p1], %[filter45] " 661 "\n\t" /* even 1 */ 662 "extp %[Temp1], $ac1, 31 " 663 "\n\t" /* even 1 */ 664 665 /* even 2. pixel */ 666 "mtlo %[vector_64], $ac3 " 667 "\n\t" /* even 3 */ 668 "mthi $zero, $ac3 " 669 "\n\t" 670 "preceu.ph.qbr %[p1], %[qload1] " 671 "\n\t" 672 "preceu.ph.qbl %[p5], %[qload1] " 673 "\n\t" 674 "ulw %[qload2], 12(%[src]) " 675 "\n\t" 676 "dpa.w.ph $ac2, %[p2], %[filter45] " 677 "\n\t" /* even 1 */ 678 "lbux %[st1], %[Temp1](%[cm]) " 679 "\n\t" /* even 1 */ 680 "extp %[Temp2], $ac2, 31 " 681 "\n\t" /* even 1 */ 682 683 /* even 3. pixel */ 684 "mtlo %[vector_64], $ac1 " 685 "\n\t" /* even 4 */ 686 "mthi $zero, $ac1 " 687 "\n\t" 688 "preceu.ph.qbr %[p2], %[qload2] " 689 "\n\t" 690 "sb %[st1], 0(%[dst]) " 691 "\n\t" /* even 1 */ 692 "addu %[dst], %[dst], %[dst_pitch_2] " 693 " \n\t" 694 "dpa.w.ph $ac3, %[p3], %[filter45] " 695 "\n\t" /* even 3 */ 696 "extp %[Temp3], $ac3, 31 " 697 "\n\t" /* even 3 */ 698 "lbux %[st2], %[Temp2](%[cm]) " 699 "\n\t" /* even 1 */ 700 701 /* even 4. pixel */ 702 "mtlo %[vector_64], $ac2 " 703 "\n\t" /* even 5 */ 704 "mthi $zero, $ac2 " 705 "\n\t" 706 "preceu.ph.qbl %[p3], %[qload2] " 707 "\n\t" 708 "sb %[st2], 0(%[dst]) " 709 "\n\t" /* even 2 */ 710 "addu %[dst], %[dst], %[dst_pitch_2] " 711 "\n\t" 712 "dpa.w.ph $ac1, %[p4], %[filter45] " 713 "\n\t" /* even 4 */ 714 "extp %[Temp1], $ac1, 31 " 715 "\n\t" /* even 4 */ 716 "lbux %[st3], %[Temp3](%[cm]) " 717 "\n\t" /* even 3 */ 718 719 /* even 5. pixel */ 720 "mtlo %[vector_64], $ac3 " 721 "\n\t" /* even 6 */ 722 "mthi $zero, $ac3 " 723 "\n\t" 724 "sb %[st3], 0(%[dst]) " 725 "\n\t" /* even 3 */ 726 "addu %[dst], %[dst], %[dst_pitch_2] " 727 "\n\t" 728 "dpa.w.ph $ac2, %[p1], %[filter45] " 729 "\n\t" /* even 5 */ 730 "extp %[Temp2], $ac2, 31 " 731 "\n\t" /* even 5 */ 732 "lbux %[st1], %[Temp1](%[cm]) " 733 "\n\t" /* even 4 */ 734 735 /* even 6. pixel */ 736 "mtlo %[vector_64], $ac1 " 737 "\n\t" /* even 7 */ 738 "mthi $zero, $ac1 " 739 "\n\t" 740 "sb %[st1], 0(%[dst]) " 741 "\n\t" /* even 4 */ 742 "addu %[dst], %[dst], %[dst_pitch_2] " 743 "\n\t" 744 "ulw %[qload1], 20(%[src]) " 745 "\n\t" 746 "dpa.w.ph $ac3, %[p5], %[filter45] " 747 "\n\t" /* even 6 */ 748 "extp %[Temp3], $ac3, 31 " 749 "\n\t" /* even 6 */ 750 "lbux %[st2], %[Temp2](%[cm]) " 751 "\n\t" /* even 5 */ 752 753 /* even 7. pixel */ 754 "mtlo %[vector_64], $ac2 " 755 "\n\t" /* even 8 */ 756 "mthi $zero, $ac2 " 757 "\n\t" 758 "preceu.ph.qbr %[p5], %[qload1] " 759 "\n\t" 760 "sb %[st2], 0(%[dst]) " 761 "\n\t" /* even 5 */ 762 "addu %[dst], %[dst], %[dst_pitch_2] " 763 "\n\t" 764 "dpa.w.ph $ac1, %[p2], %[filter45] " 765 "\n\t" /* even 7 */ 766 "extp %[Temp1], $ac1, 31 " 767 "\n\t" /* even 7 */ 768 "lbux %[st3], %[Temp3](%[cm]) " 769 "\n\t" /* even 6 */ 770 771 /* even 8. pixel */ 772 "mtlo %[vector_64], $ac3 " 773 "\n\t" /* odd 1 */ 774 "mthi $zero, $ac3 " 775 "\n\t" 776 "dpa.w.ph $ac2, %[p3], %[filter45] " 777 "\n\t" /* even 8 */ 778 "sb %[st3], 0(%[dst]) " 779 "\n\t" /* even 6 */ 780 "addu %[dst], %[dst], %[dst_pitch_2] " 781 "\n\t" 782 "extp %[Temp2], $ac2, 31 " 783 "\n\t" /* even 8 */ 784 "lbux %[st1], %[Temp1](%[cm]) " 785 "\n\t" /* even 7 */ 786 787 /* ODD pixels */ 788 "ulw %[qload1], 1(%[src]) " 789 "\n\t" 790 "ulw %[qload2], 5(%[src]) " 791 "\n\t" 792 793 /* odd 1. pixel */ 794 "mtlo %[vector_64], $ac1 " 795 "\n\t" /* odd 2 */ 796 "mthi $zero, $ac1 " 797 "\n\t" 798 "preceu.ph.qbr %[p1], %[qload1] " 799 "\n\t" 800 "preceu.ph.qbl %[p2], %[qload1] " 801 "\n\t" 802 "preceu.ph.qbr %[p3], %[qload2] " 803 "\n\t" 804 "preceu.ph.qbl %[p4], %[qload2] " 805 "\n\t" 806 "sb %[st1], 0(%[dst]) " 807 "\n\t" /* even 7 */ 808 "addu %[dst], %[dst], %[dst_pitch_2] " 809 "\n\t" 810 "ulw %[qload2], 9(%[src]) " 811 "\n\t" 812 "dpa.w.ph $ac3, %[p1], %[filter45] " 813 "\n\t" /* odd 1 */ 814 "extp %[Temp3], $ac3, 31 " 815 "\n\t" /* odd 1 */ 816 "lbux %[st2], %[Temp2](%[cm]) " 817 "\n\t" /* even 8 */ 818 819 /* odd 2. pixel */ 820 "mtlo %[vector_64], $ac2 " 821 "\n\t" /* odd 3 */ 822 "mthi $zero, $ac2 " 823 "\n\t" 824 "preceu.ph.qbr %[p1], %[qload2] " 825 "\n\t" 826 "preceu.ph.qbl %[p5], %[qload2] " 827 "\n\t" 828 "sb %[st2], 0(%[dst]) " 829 "\n\t" /* even 8 */ 830 "ulw %[qload1], 13(%[src]) " 831 "\n\t" 832 "dpa.w.ph $ac1, %[p2], %[filter45] " 833 "\n\t" /* odd 2 */ 834 "extp %[Temp1], $ac1, 31 " 835 "\n\t" /* odd 2 */ 836 "lbux %[st3], %[Temp3](%[cm]) " 837 "\n\t" /* odd 1 */ 838 839 /* odd 3. pixel */ 840 "mtlo %[vector_64], $ac3 " 841 "\n\t" /* odd 4 */ 842 "mthi $zero, $ac3 " 843 "\n\t" 844 "preceu.ph.qbr %[p2], %[qload1] " 845 "\n\t" 846 "sb %[st3], 0(%[odd_dst]) " 847 "\n\t" /* odd 1 */ 848 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 849 "\n\t" 850 "dpa.w.ph $ac2, %[p3], %[filter45] " 851 "\n\t" /* odd 3 */ 852 "extp %[Temp2], $ac2, 31 " 853 "\n\t" /* odd 3 */ 854 "lbux %[st1], %[Temp1](%[cm]) " 855 "\n\t" /* odd 2 */ 856 857 /* odd 4. pixel */ 858 "mtlo %[vector_64], $ac1 " 859 "\n\t" /* odd 5 */ 860 "mthi $zero, $ac1 " 861 "\n\t" 862 "preceu.ph.qbl %[p3], %[qload1] " 863 "\n\t" 864 "sb %[st1], 0(%[odd_dst]) " 865 "\n\t" /* odd 2 */ 866 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 867 "\n\t" 868 "dpa.w.ph $ac3, %[p4], %[filter45] " 869 "\n\t" /* odd 4 */ 870 "extp %[Temp3], $ac3, 31 " 871 "\n\t" /* odd 4 */ 872 "lbux %[st2], %[Temp2](%[cm]) " 873 "\n\t" /* odd 3 */ 874 875 /* odd 5. pixel */ 876 "mtlo %[vector_64], $ac2 " 877 "\n\t" /* odd 6 */ 878 "mthi $zero, $ac2 " 879 "\n\t" 880 "sb %[st2], 0(%[odd_dst]) " 881 "\n\t" /* odd 3 */ 882 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 883 "\n\t" 884 "dpa.w.ph $ac1, %[p1], %[filter45] " 885 "\n\t" /* odd 5 */ 886 "extp %[Temp1], $ac1, 31 " 887 "\n\t" /* odd 5 */ 888 "lbux %[st3], %[Temp3](%[cm]) " 889 "\n\t" /* odd 4 */ 890 891 /* odd 6. pixel */ 892 "mtlo %[vector_64], $ac3 " 893 "\n\t" /* odd 7 */ 894 "mthi $zero, $ac3 " 895 "\n\t" 896 "sb %[st3], 0(%[odd_dst]) " 897 "\n\t" /* odd 4 */ 898 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 899 "\n\t" 900 "ulw %[qload1], 21(%[src]) " 901 "\n\t" 902 "dpa.w.ph $ac2, %[p5], %[filter45] " 903 "\n\t" /* odd 6 */ 904 "extp %[Temp2], $ac2, 31 " 905 "\n\t" /* odd 6 */ 906 "lbux %[st1], %[Temp1](%[cm]) " 907 "\n\t" /* odd 5 */ 908 909 /* odd 7. pixel */ 910 "mtlo %[vector_64], $ac1 " 911 "\n\t" /* odd 8 */ 912 "mthi $zero, $ac1 " 913 "\n\t" 914 "preceu.ph.qbr %[p5], %[qload1] " 915 "\n\t" 916 "sb %[st1], 0(%[odd_dst]) " 917 "\n\t" /* odd 5 */ 918 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 919 "\n\t" 920 "dpa.w.ph $ac3, %[p2], %[filter45] " 921 "\n\t" /* odd 7 */ 922 "extp %[Temp3], $ac3, 31 " 923 "\n\t" /* odd 7 */ 924 925 /* odd 8. pixel */ 926 "dpa.w.ph $ac1, %[p3], %[filter45] " 927 "\n\t" /* odd 8 */ 928 "extp %[Temp1], $ac1, 31 " 929 "\n\t" /* odd 8 */ 930 931 "lbux %[st2], %[Temp2](%[cm]) " 932 "\n\t" /* odd 6 */ 933 "lbux %[st3], %[Temp3](%[cm]) " 934 "\n\t" /* odd 7 */ 935 "lbux %[st1], %[Temp1](%[cm]) " 936 "\n\t" /* odd 8 */ 937 938 "sb %[st2], 0(%[odd_dst]) " 939 "\n\t" /* odd 6 */ 940 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 941 "\n\t" 942 943 "sb %[st3], 0(%[odd_dst]) " 944 "\n\t" /* odd 7 */ 945 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 946 "\n\t" 947 948 "sb %[st1], 0(%[odd_dst]) " 949 "\n\t" /* odd 8 */ 950 951 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), 952 [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), 953 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), 954 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 955 [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) 956 : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), 957 [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); 958 959 src += 16; 960 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); 961 odd_dst = (dst + dst_stride); 962 } 963 964 /* Next row... */ 965 src_ptr += src_stride; 966 dst_ptr += 1; 967 } 968 } 969 970 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, 971 uint8_t *dst, ptrdiff_t dst_stride, 972 const int16_t *filter, int w, int h) { 973 int x, y; 974 975 for (y = 0; y < h; ++y) { 976 for (x = 0; x < w; ++x) { 977 int sum = 0; 978 979 sum += src[x] * filter[3]; 980 sum += src[x + 1] * filter[4]; 981 982 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); 983 } 984 985 src += src_stride; 986 dst += 1; 987 } 988 } 989 990 void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 991 ptrdiff_t dst_stride, const int16_t *filter, int w, 992 int h) { 993 uint32_t pos = 38; 994 995 /* bit positon for extract from acc */ 996 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 997 : 998 : [pos] "r"(pos)); 999 1000 /* prefetch data to cache memory */ 1001 prefetch_load(src); 1002 prefetch_load(src + 32); 1003 1004 switch (w) { 1005 case 4: 1006 convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, 1007 filter, h); 1008 break; 1009 case 8: 1010 convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, 1011 filter, h); 1012 break; 1013 case 16: 1014 case 32: 1015 convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, 1016 filter, h, (w / 16)); 1017 break; 1018 case 64: 1019 prefetch_load(src + 32); 1020 convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, 1021 filter, h); 1022 break; 1023 default: 1024 convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, 1025 h); 1026 break; 1027 } 1028 } 1029 #endif 1030