/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

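      /* For this group of four output pixels: read eight source rows four
         bytes at a time, widen them to 16-bit pairs (preceu/precrq/append)
         and accumulate the 8-tap vertical filter in $ac0..$ac3 using the
         packed coefficient pairs vector1b..vector4b. Each extp result is
         clamped through the vp9_ff_cropTbl lookup (lbux) and rounding-averaged
         with the byte already in dst (addqh_r.w) before being stored. */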
      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

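/* Same filter-and-average kernel as convolve_avg_vert_4_dspr2, specialized
 * for w == 64: the width loop has a constant bound and a second 32-byte
 * block of the destination row is prefetched. */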
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
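  /* The packed coefficient words pick the fast paths: taps 2..3 equal to
     (0, 128) is the copy filter used at full-pel positions, so a plain
     average is enough; a zero leading coefficient pair is treated as a
     short (bilinear-class) filter and routed to
     vp9_convolve2_avg_vert_dspr2. */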
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
    if (16 == y_step_q4) {
      uint32_t pos = 38;

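      /* With pos = 38, the "extp ..., 31" instructions in the kernels above
         return bits 38..7 of the accumulator, i.e. the filtered sum plus the
         rounding value 64 shifted right by FILTER_BITS (7). */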
      /* bit position for extract from acc */
      __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
      );

      vp9_prefetch_store(dst);

      switch (w) {
        case 4:
        case 8:
        case 16:
        case 32:
          convolve_avg_vert_4_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_y, w, h);
          break;
        case 64:
          vp9_prefetch_store(dst + 32);
          convolve_avg_vert_64_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_y, h);
          break;
        default:
          vp9_convolve8_avg_vert_c(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, x_step_q4,
                                   filter_y, y_step_q4,
                                   w, h);
          break;
      }
    } else {
      vp9_convolve8_avg_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
    }
  }
}

void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);

  if (intermediate_height < h)
    intermediate_height = h;

  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vp9_convolve8_avg_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);

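  /* Two passes through a 64-byte-stride intermediate: the horizontal pass
     starts three rows above the block so the 8-tap vertical window is fully
     covered, and the vertical averaging pass is handed temp + 64 * 3, the
     row that corresponds to the first output row. */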
  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

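  /* adduh_r.qb averages four packed bytes at a time with rounding
     ((src + dst + 1) >> 1 per byte), so each case below works a word
     (four pixels) at a time, unrolled to the block width. */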
  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_load(src + src_stride + 64);
        vp9_prefetch_store(dst + dst_stride);
        vp9_prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         32(%[dst])     \n\t"
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         36(%[src])     \n\t"
            "ulw              %[tp4],         36(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         40(%[src])     \n\t"
            "ulw              %[tp2],         40(%[dst])     \n\t"
            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         44(%[src])     \n\t"
            "ulw              %[tp4],         44(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         48(%[src])     \n\t"
            "ulw              %[tp2],         48(%[dst])     \n\t"
            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         52(%[src])     \n\t"
            "ulw              %[tp4],         52(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         56(%[src])     \n\t"
            "ulw              %[tp2],         56(%[dst])     \n\t"
            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         60(%[src])     \n\t"
            "ulw              %[tp4],         60(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
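      /* other widths: per-pixel rounded average in C */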
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif