1 /* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include <stdio.h> 13 14 #include "./vpx_config.h" 15 #include "./vp9_rtcd.h" 16 #include "vp9/common/vp9_common.h" 17 #include "vpx/vpx_integer.h" 18 #include "vpx_ports/mem.h" 19 #include "vp9/common/vp9_filter.h" 20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" 21 22 #if HAVE_DSPR2 23 uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; 24 uint8_t *vp9_ff_cropTbl; 25 26 void vp9_dsputil_static_init(void) { 27 int i; 28 29 for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i; 30 31 for (i = 0; i < CROP_WIDTH; i++) { 32 vp9_ff_cropTbl_a[i] = 0; 33 vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; 34 } 35 36 vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH]; 37 } 38 39 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, 40 int32_t src_stride, 41 uint8_t *dst, 42 int32_t dst_stride, 43 const int16_t *filter_x0, 44 int32_t h) { 45 int32_t y; 46 uint8_t *cm = vp9_ff_cropTbl; 47 uint8_t *dst_ptr; 48 int32_t vector1b, vector2b, vector3b, vector4b; 49 int32_t Temp1, Temp2, Temp3, Temp4; 50 uint32_t vector4a = 64; 51 uint32_t tp1, tp2; 52 uint32_t p1, p2, p3, p4; 53 uint32_t tn1, tn2; 54 55 vector1b = ((const int32_t *)filter_x0)[0]; 56 vector2b = ((const int32_t *)filter_x0)[1]; 57 vector3b = ((const int32_t *)filter_x0)[2]; 58 vector4b = ((const int32_t *)filter_x0)[3]; 59 60 for (y = h; y--;) { 61 dst_ptr = dst; 62 /* prefetch data to cache memory */ 63 vp9_prefetch_load(src + src_stride); 64 vp9_prefetch_load(src + src_stride + 32); 65 66 __asm__ __volatile__ ( 67 "ulw %[tp1], 0(%[src]) \n\t" 68 "ulw %[tp2], 
4(%[src]) \n\t" 69 70 /* even 1. pixel */ 71 "mtlo %[vector4a], $ac3 \n\t" 72 "mthi $zero, $ac3 \n\t" 73 "preceu.ph.qbr %[p1], %[tp1] \n\t" 74 "preceu.ph.qbl %[p2], %[tp1] \n\t" 75 "preceu.ph.qbr %[p3], %[tp2] \n\t" 76 "preceu.ph.qbl %[p4], %[tp2] \n\t" 77 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 78 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 79 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 80 "ulw %[tn2], 8(%[src]) \n\t" 81 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 82 "extp %[Temp1], $ac3, 31 \n\t" 83 84 /* even 2. pixel */ 85 "mtlo %[vector4a], $ac2 \n\t" 86 "mthi $zero, $ac2 \n\t" 87 "preceu.ph.qbr %[p1], %[tn2] \n\t" 88 "balign %[tn1], %[tn2], 3 \n\t" 89 "balign %[tn2], %[tp2], 3 \n\t" 90 "balign %[tp2], %[tp1], 3 \n\t" 91 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 92 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 93 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 94 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 95 "extp %[Temp3], $ac2, 31 \n\t" 96 97 /* odd 1. pixel */ 98 "lbux %[tp1], %[Temp1](%[cm]) \n\t" 99 "mtlo %[vector4a], $ac3 \n\t" 100 "mthi $zero, $ac3 \n\t" 101 "preceu.ph.qbr %[p1], %[tp2] \n\t" 102 "preceu.ph.qbl %[p2], %[tp2] \n\t" 103 "preceu.ph.qbr %[p3], %[tn2] \n\t" 104 "preceu.ph.qbl %[p4], %[tn2] \n\t" 105 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 106 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 107 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 108 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 109 "extp %[Temp2], $ac3, 31 \n\t" 110 111 /* odd 2. 
pixel */ 112 "lbux %[tp2], %[Temp3](%[cm]) \n\t" 113 "mtlo %[vector4a], $ac2 \n\t" 114 "mthi $zero, $ac2 \n\t" 115 "preceu.ph.qbr %[p1], %[tn1] \n\t" 116 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 117 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 118 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 119 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 120 "extp %[Temp4], $ac2, 31 \n\t" 121 122 /* clamp */ 123 "lbux %[tn1], %[Temp2](%[cm]) \n\t" 124 "lbux %[p2], %[Temp4](%[cm]) \n\t" 125 126 /* store bytes */ 127 "sb %[tp1], 0(%[dst_ptr]) \n\t" 128 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 129 130 "sb %[tn1], 0(%[dst_ptr]) \n\t" 131 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 132 133 "sb %[tp2], 0(%[dst_ptr]) \n\t" 134 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 135 136 "sb %[p2], 0(%[dst_ptr]) \n\t" 137 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 138 139 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), 140 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 141 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), 142 [dst_ptr] "+r" (dst_ptr) 143 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 144 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), 145 [vector4a] "r" (vector4a), 146 [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) 147 ); 148 149 /* Next row... 
*/ 150 src += src_stride; 151 dst += 1; 152 } 153 } 154 155 static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, 156 int32_t src_stride, 157 uint8_t *dst, 158 int32_t dst_stride, 159 const int16_t *filter_x0, 160 int32_t h) { 161 int32_t y; 162 uint8_t *cm = vp9_ff_cropTbl; 163 uint8_t *dst_ptr; 164 uint32_t vector4a = 64; 165 int32_t vector1b, vector2b, vector3b, vector4b; 166 int32_t Temp1, Temp2, Temp3; 167 uint32_t tp1, tp2, tp3; 168 uint32_t p1, p2, p3, p4, n1; 169 uint8_t *odd_dst; 170 uint32_t dst_pitch_2 = (dst_stride << 1); 171 172 vector1b = ((const int32_t *)filter_x0)[0]; 173 vector2b = ((const int32_t *)filter_x0)[1]; 174 vector3b = ((const int32_t *)filter_x0)[2]; 175 vector4b = ((const int32_t *)filter_x0)[3]; 176 177 for (y = h; y--;) { 178 /* prefetch data to cache memory */ 179 vp9_prefetch_load(src + src_stride); 180 vp9_prefetch_load(src + src_stride + 32); 181 182 dst_ptr = dst; 183 odd_dst = (dst_ptr + dst_stride); 184 185 __asm__ __volatile__ ( 186 "ulw %[tp2], 0(%[src]) \n\t" 187 "ulw %[tp1], 4(%[src]) \n\t" 188 189 /* even 1. pixel */ 190 "mtlo %[vector4a], $ac3 \n\t" 191 "mthi $zero, $ac3 \n\t" 192 "mtlo %[vector4a], $ac2 \n\t" 193 "mthi $zero, $ac2 \n\t" 194 "preceu.ph.qbr %[p1], %[tp2] \n\t" 195 "preceu.ph.qbl %[p2], %[tp2] \n\t" 196 "preceu.ph.qbr %[p3], %[tp1] \n\t" 197 "preceu.ph.qbl %[p4], %[tp1] \n\t" 198 "ulw %[tp3], 8(%[src]) \n\t" 199 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 200 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 201 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 202 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 203 "extp %[Temp1], $ac3, 31 \n\t" 204 205 /* even 2. pixel */ 206 "preceu.ph.qbr %[p1], %[tp3] \n\t" 207 "preceu.ph.qbl %[n1], %[tp3] \n\t" 208 "ulw %[tp2], 12(%[src]) \n\t" 209 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 210 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 211 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 212 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 213 "extp %[Temp3], $ac2, 31 \n\t" 214 215 /* even 3. 
pixel */ 216 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 217 "mtlo %[vector4a], $ac1 \n\t" 218 "mthi $zero, $ac1 \n\t" 219 "preceu.ph.qbr %[p2], %[tp2] \n\t" 220 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" 221 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" 222 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" 223 "lbux %[tp3], %[Temp3](%[cm]) \n\t" 224 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" 225 "extp %[p3], $ac1, 31 \n\t" 226 227 /* even 4. pixel */ 228 "mtlo %[vector4a], $ac2 \n\t" 229 "mthi $zero, $ac2 \n\t" 230 "mtlo %[vector4a], $ac3 \n\t" 231 "mthi $zero, $ac3 \n\t" 232 "sb %[Temp2], 0(%[dst_ptr]) \n\t" 233 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 234 "sb %[tp3], 0(%[dst_ptr]) \n\t" 235 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 236 237 "ulw %[tp1], 1(%[src]) \n\t" 238 "ulw %[tp3], 5(%[src]) \n\t" 239 240 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" 241 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" 242 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" 243 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 244 "extp %[Temp3], $ac2, 31 \n\t" 245 246 "lbux %[tp2], %[p3](%[cm]) \n\t" 247 248 /* odd 1. pixel */ 249 "mtlo %[vector4a], $ac1 \n\t" 250 "mthi $zero, $ac1 \n\t" 251 "preceu.ph.qbr %[p1], %[tp1] \n\t" 252 "preceu.ph.qbl %[p2], %[tp1] \n\t" 253 "preceu.ph.qbr %[p3], %[tp3] \n\t" 254 "preceu.ph.qbl %[p4], %[tp3] \n\t" 255 "sb %[tp2], 0(%[dst_ptr]) \n\t" 256 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 257 "ulw %[tp2], 9(%[src]) \n\t" 258 259 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 260 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 261 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 262 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 263 "extp %[Temp2], $ac3, 31 \n\t" 264 265 /* odd 2. 
pixel */ 266 "lbux %[tp1], %[Temp3](%[cm]) \n\t" 267 "mtlo %[vector4a], $ac3 \n\t" 268 "mthi $zero, $ac3 \n\t" 269 "mtlo %[vector4a], $ac2 \n\t" 270 "mthi $zero, $ac2 \n\t" 271 "preceu.ph.qbr %[p1], %[tp2] \n\t" 272 "preceu.ph.qbl %[n1], %[tp2] \n\t" 273 "ulw %[Temp1], 13(%[src]) \n\t" 274 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" 275 "sb %[tp1], 0(%[dst_ptr]) \n\t" 276 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 277 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" 278 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" 279 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" 280 "extp %[Temp3], $ac1, 31 \n\t" 281 282 /* odd 3. pixel */ 283 "lbux %[tp3], %[Temp2](%[cm]) \n\t" 284 "preceu.ph.qbr %[p2], %[Temp1] \n\t" 285 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" 286 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" 287 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" 288 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" 289 "extp %[Temp2], $ac3, 31 \n\t" 290 291 /* odd 4. pixel */ 292 "sb %[tp3], 0(%[odd_dst]) \n\t" 293 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 294 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" 295 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" 296 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" 297 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 298 "extp %[Temp1], $ac2, 31 \n\t" 299 300 /* clamp */ 301 "lbux %[p4], %[Temp3](%[cm]) \n\t" 302 "lbux %[p2], %[Temp2](%[cm]) \n\t" 303 "lbux %[n1], %[Temp1](%[cm]) \n\t" 304 305 /* store bytes */ 306 "sb %[p4], 0(%[odd_dst]) \n\t" 307 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 308 309 "sb %[p2], 0(%[odd_dst]) \n\t" 310 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 311 312 "sb %[n1], 0(%[odd_dst]) \n\t" 313 314 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), 315 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 316 [n1] "=&r" (n1), 317 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 318 [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) 319 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 320 [vector3b] 
"r" (vector3b), [vector4b] "r" (vector4b), 321 [vector4a] "r" (vector4a), [cm] "r" (cm), 322 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) 323 ); 324 325 /* Next row... */ 326 src += src_stride; 327 dst += 1; 328 } 329 } 330 331 static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, 332 int32_t src_stride, 333 uint8_t *dst_ptr, 334 int32_t dst_stride, 335 const int16_t *filter_x0, 336 int32_t h, 337 int32_t count) { 338 int32_t c, y; 339 const uint8_t *src; 340 uint8_t *dst; 341 uint8_t *cm = vp9_ff_cropTbl; 342 uint32_t vector_64 = 64; 343 int32_t filter12, filter34, filter56, filter78; 344 int32_t Temp1, Temp2, Temp3; 345 uint32_t qload1, qload2; 346 uint32_t p1, p2, p3, p4, p5; 347 uint32_t st1, st2, st3; 348 uint32_t dst_pitch_2 = (dst_stride << 1); 349 uint8_t *odd_dst; 350 351 filter12 = ((const int32_t *)filter_x0)[0]; 352 filter34 = ((const int32_t *)filter_x0)[1]; 353 filter56 = ((const int32_t *)filter_x0)[2]; 354 filter78 = ((const int32_t *)filter_x0)[3]; 355 356 for (y = h; y--;) { 357 /* prefetch data to cache memory */ 358 vp9_prefetch_load(src_ptr + src_stride); 359 vp9_prefetch_load(src_ptr + src_stride + 32); 360 361 src = src_ptr; 362 dst = dst_ptr; 363 364 odd_dst = (dst + dst_stride); 365 366 for (c = 0; c < count; c++) { 367 __asm__ __volatile__ ( 368 "ulw %[qload1], 0(%[src]) \n\t" 369 "ulw %[qload2], 4(%[src]) \n\t" 370 371 /* even 1. 
pixel */ 372 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 373 "mthi $zero, $ac1 \n\t" 374 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 375 "mthi $zero, $ac2 \n\t" 376 "preceu.ph.qbr %[p3], %[qload2] \n\t" 377 "preceu.ph.qbl %[p4], %[qload2] \n\t" 378 "preceu.ph.qbr %[p1], %[qload1] \n\t" 379 "preceu.ph.qbl %[p2], %[qload1] \n\t" 380 "ulw %[qload2], 8(%[src]) \n\t" 381 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ 382 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ 383 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ 384 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ 385 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 386 387 /* even 2. pixel */ 388 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 389 "mthi $zero, $ac3 \n\t" 390 "preceu.ph.qbr %[p1], %[qload2] \n\t" 391 "preceu.ph.qbl %[p5], %[qload2] \n\t" 392 "ulw %[qload1], 12(%[src]) \n\t" 393 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ 394 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ 395 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ 396 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ 397 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 398 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 399 400 /* even 3. pixel */ 401 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 402 "mthi $zero, $ac1 \n\t" 403 "preceu.ph.qbr %[p2], %[qload1] \n\t" 404 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ 405 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 406 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ 407 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ 408 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ 409 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ 410 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 411 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 412 413 /* even 4. 
pixel */ 414 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 415 "mthi $zero, $ac2 \n\t" 416 "preceu.ph.qbl %[p3], %[qload1] \n\t" 417 "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ 418 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 419 "ulw %[qload2], 16(%[src]) \n\t" 420 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ 421 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ 422 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ 423 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ 424 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 425 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 426 427 /* even 5. pixel */ 428 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 429 "mthi $zero, $ac3 \n\t" 430 "preceu.ph.qbr %[p4], %[qload2] \n\t" 431 "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ 432 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 433 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ 434 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ 435 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ 436 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ 437 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 438 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 439 440 /* even 6. pixel */ 441 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 442 "mthi $zero, $ac1 \n\t" 443 "preceu.ph.qbl %[p1], %[qload2] \n\t" 444 "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ 445 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 446 "ulw %[qload1], 20(%[src]) \n\t" 447 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ 448 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ 449 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ 450 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ 451 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 452 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 453 454 /* even 7. 
pixel */ 455 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 456 "mthi $zero, $ac2 \n\t" 457 "preceu.ph.qbr %[p5], %[qload1] \n\t" 458 "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ 459 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 460 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ 461 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ 462 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ 463 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ 464 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 465 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 466 467 /* even 8. pixel */ 468 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 469 "mthi $zero, $ac3 \n\t" 470 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ 471 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ 472 "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ 473 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 474 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ 475 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ 476 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 477 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 478 479 /* ODD pixels */ 480 "ulw %[qload1], 1(%[src]) \n\t" 481 "ulw %[qload2], 5(%[src]) \n\t" 482 483 /* odd 1. pixel */ 484 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 485 "mthi $zero, $ac1 \n\t" 486 "preceu.ph.qbr %[p1], %[qload1] \n\t" 487 "preceu.ph.qbl %[p2], %[qload1] \n\t" 488 "preceu.ph.qbr %[p3], %[qload2] \n\t" 489 "preceu.ph.qbl %[p4], %[qload2] \n\t" 490 "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ 491 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 492 "ulw %[qload2], 9(%[src]) \n\t" 493 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ 494 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ 495 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ 496 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ 497 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 498 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 499 500 /* odd 2. 
pixel */ 501 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 502 "mthi $zero, $ac2 \n\t" 503 "preceu.ph.qbr %[p1], %[qload2] \n\t" 504 "preceu.ph.qbl %[p5], %[qload2] \n\t" 505 "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ 506 "ulw %[qload1], 13(%[src]) \n\t" 507 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ 508 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ 509 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ 510 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ 511 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 512 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 513 514 /* odd 3. pixel */ 515 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 516 "mthi $zero, $ac3 \n\t" 517 "preceu.ph.qbr %[p2], %[qload1] \n\t" 518 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ 519 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 520 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ 521 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ 522 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ 523 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ 524 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 525 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 526 527 /* odd 4. pixel */ 528 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 529 "mthi $zero, $ac1 \n\t" 530 "preceu.ph.qbl %[p3], %[qload1] \n\t" 531 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ 532 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 533 "ulw %[qload2], 17(%[src]) \n\t" 534 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ 535 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ 536 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ 537 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ 538 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 539 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 540 541 /* odd 5. 
pixel */ 542 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 543 "mthi $zero, $ac2 \n\t" 544 "preceu.ph.qbr %[p4], %[qload2] \n\t" 545 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ 546 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 547 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ 548 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ 549 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ 550 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ 551 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 552 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 553 554 /* odd 6. pixel */ 555 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 556 "mthi $zero, $ac3 \n\t" 557 "preceu.ph.qbl %[p1], %[qload2] \n\t" 558 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ 559 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 560 "ulw %[qload1], 21(%[src]) \n\t" 561 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ 562 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ 563 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ 564 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ 565 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 566 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 567 568 /* odd 7. pixel */ 569 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 570 "mthi $zero, $ac1 \n\t" 571 "preceu.ph.qbr %[p5], %[qload1] \n\t" 572 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ 573 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 574 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ 575 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ 576 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ 577 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ 578 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 579 580 /* odd 8. 
pixel */ 581 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ 582 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ 583 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ 584 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ 585 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 586 587 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 588 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 589 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 590 591 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ 592 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 593 594 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ 595 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 596 597 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ 598 599 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), 600 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), 601 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 602 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 603 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) 604 : [filter12] "r" (filter12), [filter34] "r" (filter34), 605 [filter56] "r" (filter56), [filter78] "r" (filter78), 606 [vector_64] "r" (vector_64), [cm] "r" (cm), 607 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) 608 ); 609 610 src += 16; 611 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); 612 odd_dst = (dst + dst_stride); 613 } 614 615 /* Next row... 
*/ 616 src_ptr += src_stride; 617 618 dst_ptr += 1; 619 } 620 } 621 622 static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, 623 int32_t src_stride, 624 uint8_t *dst_ptr, 625 int32_t dst_stride, 626 const int16_t *filter_x0, 627 int32_t h) { 628 int32_t c, y; 629 const uint8_t *src; 630 uint8_t *dst; 631 uint8_t *cm = vp9_ff_cropTbl; 632 uint32_t vector_64 = 64; 633 int32_t filter12, filter34, filter56, filter78; 634 int32_t Temp1, Temp2, Temp3; 635 uint32_t qload1, qload2; 636 uint32_t p1, p2, p3, p4, p5; 637 uint32_t st1, st2, st3; 638 uint32_t dst_pitch_2 = (dst_stride << 1); 639 uint8_t *odd_dst; 640 641 filter12 = ((const int32_t *)filter_x0)[0]; 642 filter34 = ((const int32_t *)filter_x0)[1]; 643 filter56 = ((const int32_t *)filter_x0)[2]; 644 filter78 = ((const int32_t *)filter_x0)[3]; 645 646 for (y = h; y--;) { 647 /* prefetch data to cache memory */ 648 vp9_prefetch_load(src_ptr + src_stride); 649 vp9_prefetch_load(src_ptr + src_stride + 32); 650 vp9_prefetch_load(src_ptr + src_stride + 64); 651 652 src = src_ptr; 653 dst = dst_ptr; 654 655 odd_dst = (dst + dst_stride); 656 657 for (c = 0; c < 4; c++) { 658 __asm__ __volatile__ ( 659 "ulw %[qload1], 0(%[src]) \n\t" 660 "ulw %[qload2], 4(%[src]) \n\t" 661 662 /* even 1. pixel */ 663 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 664 "mthi $zero, $ac1 \n\t" 665 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 666 "mthi $zero, $ac2 \n\t" 667 "preceu.ph.qbr %[p3], %[qload2] \n\t" 668 "preceu.ph.qbl %[p4], %[qload2] \n\t" 669 "preceu.ph.qbr %[p1], %[qload1] \n\t" 670 "preceu.ph.qbl %[p2], %[qload1] \n\t" 671 "ulw %[qload2], 8(%[src]) \n\t" 672 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ 673 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ 674 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ 675 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ 676 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 677 678 /* even 2. 
pixel */ 679 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 680 "mthi $zero, $ac3 \n\t" 681 "preceu.ph.qbr %[p1], %[qload2] \n\t" 682 "preceu.ph.qbl %[p5], %[qload2] \n\t" 683 "ulw %[qload1], 12(%[src]) \n\t" 684 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ 685 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ 686 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ 687 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ 688 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 689 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 690 691 /* even 3. pixel */ 692 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 693 "mthi $zero, $ac1 \n\t" 694 "preceu.ph.qbr %[p2], %[qload1] \n\t" 695 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ 696 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 697 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ 698 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ 699 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ 700 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ 701 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 702 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 703 704 /* even 4. pixel */ 705 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 706 "mthi $zero, $ac2 \n\t" 707 "preceu.ph.qbl %[p3], %[qload1] \n\t" 708 "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ 709 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 710 "ulw %[qload2], 16(%[src]) \n\t" 711 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ 712 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ 713 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ 714 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ 715 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 716 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 717 718 /* even 5. 
pixel */ 719 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 720 "mthi $zero, $ac3 \n\t" 721 "preceu.ph.qbr %[p4], %[qload2] \n\t" 722 "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ 723 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 724 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ 725 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ 726 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ 727 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ 728 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 729 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 730 731 /* even 6. pixel */ 732 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 733 "mthi $zero, $ac1 \n\t" 734 "preceu.ph.qbl %[p1], %[qload2] \n\t" 735 "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ 736 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 737 "ulw %[qload1], 20(%[src]) \n\t" 738 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ 739 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ 740 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ 741 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ 742 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 743 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 744 745 /* even 7. pixel */ 746 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 747 "mthi $zero, $ac2 \n\t" 748 "preceu.ph.qbr %[p5], %[qload1] \n\t" 749 "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ 750 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 751 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ 752 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ 753 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ 754 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ 755 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 756 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 757 758 /* even 8. 
pixel */ 759 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 760 "mthi $zero, $ac3 \n\t" 761 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ 762 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ 763 "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ 764 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 765 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ 766 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ 767 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 768 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 769 770 /* ODD pixels */ 771 "ulw %[qload1], 1(%[src]) \n\t" 772 "ulw %[qload2], 5(%[src]) \n\t" 773 774 /* odd 1. pixel */ 775 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 776 "mthi $zero, $ac1 \n\t" 777 "preceu.ph.qbr %[p1], %[qload1] \n\t" 778 "preceu.ph.qbl %[p2], %[qload1] \n\t" 779 "preceu.ph.qbr %[p3], %[qload2] \n\t" 780 "preceu.ph.qbl %[p4], %[qload2] \n\t" 781 "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ 782 "addu %[dst], %[dst], %[dst_pitch_2] \n\t" 783 "ulw %[qload2], 9(%[src]) \n\t" 784 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ 785 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ 786 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ 787 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ 788 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 789 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 790 791 /* odd 2. pixel */ 792 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 793 "mthi $zero, $ac2 \n\t" 794 "preceu.ph.qbr %[p1], %[qload2] \n\t" 795 "preceu.ph.qbl %[p5], %[qload2] \n\t" 796 "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ 797 "ulw %[qload1], 13(%[src]) \n\t" 798 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ 799 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ 800 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ 801 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ 802 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 803 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 804 805 /* odd 3. 
pixel */ 806 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 807 "mthi $zero, $ac3 \n\t" 808 "preceu.ph.qbr %[p2], %[qload1] \n\t" 809 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ 810 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 811 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ 812 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ 813 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ 814 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ 815 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 816 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 817 818 /* odd 4. pixel */ 819 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 820 "mthi $zero, $ac1 \n\t" 821 "preceu.ph.qbl %[p3], %[qload1] \n\t" 822 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ 823 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 824 "ulw %[qload2], 17(%[src]) \n\t" 825 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ 826 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ 827 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ 828 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ 829 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 830 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 831 832 /* odd 5. pixel */ 833 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 834 "mthi $zero, $ac2 \n\t" 835 "preceu.ph.qbr %[p4], %[qload2] \n\t" 836 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ 837 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 838 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ 839 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ 840 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ 841 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ 842 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 843 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 844 845 /* odd 6. 
pixel */ 846 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 847 "mthi $zero, $ac3 \n\t" 848 "preceu.ph.qbl %[p1], %[qload2] \n\t" 849 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ 850 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 851 "ulw %[qload1], 21(%[src]) \n\t" 852 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ 853 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ 854 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ 855 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ 856 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 857 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 858 859 /* odd 7. pixel */ 860 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 861 "mthi $zero, $ac1 \n\t" 862 "preceu.ph.qbr %[p5], %[qload1] \n\t" 863 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ 864 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 865 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ 866 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ 867 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ 868 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ 869 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 870 871 /* odd 8. 
pixel */ 872 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ 873 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ 874 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ 875 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ 876 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 877 878 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 879 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 880 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 881 882 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ 883 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 884 885 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ 886 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 887 888 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ 889 890 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), 891 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), 892 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 893 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 894 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) 895 : [filter12] "r" (filter12), [filter34] "r" (filter34), 896 [filter56] "r" (filter56), [filter78] "r" (filter78), 897 [vector_64] "r" (vector_64), [cm] "r" (cm), 898 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) 899 ); 900 901 src += 16; 902 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); 903 odd_dst = (dst + dst_stride); 904 } 905 906 /* Next row... 
*/
    src_ptr += src_stride;

    dst_ptr += 1;
  }
}

/* Plain-C fallback: horizontal 8-tap convolution whose output is written
 * transposed — output row y lands in "column" y of dst (dst advances one
 * byte per input row; pixels of one output row are dst_stride apart).
 * Used for widths the DSPR2 kernels do not handle. */
void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter, int w, int h) {
  int x, y, k;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = 0;

      /* 8-tap FIR accumulation over src[x..x+7] */
      for (k = 0; k < 8; ++k)
        sum += src[x + k] * filter[k];

      /* round, downshift by FILTER_BITS and clamp to [0, 255] */
      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
    }

    src += src_stride;
    dst += 1;
  }
}

/* Copies a w x h block while transposing it, using the same dst layout as
 * convolve_horiz_transposed above (one byte per row, dst_stride per pixel). */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           int w, int h) {
  int x, y;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      dst[x * dst_stride] = src[x];
    }

    src += src_stride;
    dst += 1;
  }
}

/* Two-pass 8-tap 2-D convolution for MIPS DSPR2.  Pass 1 filters
 * horizontally into the transposed scratch buffer `temp`; pass 2 runs the
 * same horizontal kernels over `temp` (which, being transposed, applies the
 * vertical filter) and transposes back into dst.  Only the unscaled case
 * (x_step_q4 == y_step_q4 == 16) is handled here; anything else falls back
 * to the C version. */
void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* scratch holds up to 64 columns x 135 intermediate rows (64 + 7 taps,
   * with headroom) */
  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  if (intermediate_height < h)
    intermediate_height = h;

  /* scaled convolution is not implemented in DSPR2 — use the C path */
  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vp9_convolve8_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);

  /* both filters are the identity tap (only filter[3] == 128 set when the
   * taps are read pairwise as int32) — a plain copy suffices */
  if ((((const int32_t *)filter_x)[1] == 0x800000)
      && (((const int32_t *)filter_y)[1] == 0x800000))
    return vp9_convolve_copy(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);

  /* first pass: horizontal filter into the transposed scratch buffer */
  if (filter_x[3] == 0x80) {
    /* identity horizontal filter: just copy (transposed) */
    copy_horiz_transposed(src - src_stride * 3, src_stride,
                          temp, intermediate_height,
                          w, intermediate_height);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* first two tap pairs are zero: effectively a short (bilinear) filter */
    vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
                        temp, intermediate_height,
                        filter_x,
                        w, intermediate_height);
  } else {
    /* back up 3 rows and 3 columns for the 8-tap support region */
    src -= (src_stride * 3 + 3);

    /* prefetch data to cache memory */
    vp9_prefetch_load(src);
    vp9_prefetch_load(src + 32);

    switch (w) {
      case 4:
        convolve_horiz_4_transposed_dspr2(src, src_stride,
                                          temp, intermediate_height,
                                          filter_x, intermediate_height);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(src, src_stride,
                                          temp, intermediate_height,
                                          filter_x, intermediate_height);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(src, src_stride,
                                           temp, intermediate_height,
                                           filter_x, intermediate_height,
                                           (w/16));
        break;
      case 64:
        vp9_prefetch_load(src + 32);
        convolve_horiz_64_transposed_dspr2(src, src_stride,
                                           temp, intermediate_height,
                                           filter_x, intermediate_height);
        break;
      default:
        convolve_horiz_transposed(src, src_stride,
                                  temp, intermediate_height,
                                  filter_x, w, intermediate_height);
        break;
    }
  }

  /* second pass: filter the transposed scratch with filter_y (this applies
   * the vertical filter) and transpose back into dst; +3 skips the extra
   * support rows produced by the first pass */
  if (filter_y[3] == 0x80) {
    copy_horiz_transposed(temp + 3, intermediate_height,
                          dst, dst_stride,
                          h, w);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_dspr2(temp + 3, intermediate_height,
                        dst, dst_stride,
                        filter_y,
                        h, w);
  } else {
    switch (h) {
      case 4:
        convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
                                          dst, dst_stride,
                                          filter_y, w);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
                                          dst, dst_stride,
                                          filter_y, w);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
                                           dst, dst_stride,
                                           filter_y, w, (h/16));
        break;
      case 64:
        convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
                                           dst, dst_stride,
                                           filter_y, w);
        break;
      default:
        convolve_horiz_transposed(temp, intermediate_height,
                                  dst, dst_stride,
                                  filter_y, h, w);
        break;
    }
  }
}

/* Block copy (the filter arguments are part of the common convolve
 * prototype and are unused here).  Each supported width uses unrolled
 * unaligned-word loads/stores (ulw/sw) with cache prefetching; widths that
 * are not a multiple of 4 fall through to a scalar byte copy. */
void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      {
        uint32_t tp1;

        /* 1 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         (%[src])      \n\t"
              "sw               %[tp1],         (%[dst])      \n\t"  /* store */

              : [tp1] "=&r" (tp1)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 8:
      {
        uint32_t tp1, tp2;

        /* 2 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 16:
      {
        uint32_t tp1, tp2, tp3, tp4;

        /* 4 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 32:
      {
        uint32_t tp1, tp2, tp3, tp4;
        uint32_t tp5, tp6, tp7, tp8;

        /* 8 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"
              "ulw              %[tp5],         16(%[src])     \n\t"
              "ulw              %[tp6],         20(%[src])     \n\t"
              "ulw              %[tp7],         24(%[src])     \n\t"
              "ulw              %[tp8],         28(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
                [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
                [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 64:
      {
        uint32_t tp1, tp2, tp3, tp4;
        uint32_t tp5, tp6, tp7, tp8;

        vp9_prefetch_load(src + 64);
        vp9_prefetch_store(dst + 32);

        /* 16 word storage */
        for (y = h; y--; ) {
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_load(src + src_stride + 64);
          vp9_prefetch_store(dst + dst_stride);
          vp9_prefetch_store(dst + dst_stride + 32);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"
              "ulw              %[tp5],         16(%[src])     \n\t"
              "ulw              %[tp6],         20(%[src])     \n\t"
              "ulw              %[tp7],         24(%[src])     \n\t"
              "ulw              %[tp8],         28(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

              "ulw              %[tp1],         32(%[src])     \n\t"
              "ulw              %[tp2],         36(%[src])     \n\t"
              "ulw              %[tp3],         40(%[src])     \n\t"
              "ulw              %[tp4],         44(%[src])     \n\t"
              "ulw              %[tp5],         48(%[src])     \n\t"
              "ulw              %[tp6],         52(%[src])     \n\t"
              "ulw              %[tp7],         56(%[src])     \n\t"
              "ulw              %[tp8],         60(%[src])     \n\t"

              "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
              "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
              "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
              "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         60(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
                [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
                [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
              : [src] "r" (src), [dst] "r" (dst)
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    default:
      /* arbitrary width: plain byte-wise copy */
      for (y = h; y--; ) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif