/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
/* 8-tap vertical convolution for widths that are a multiple of 4.
 * Each inner-loop iteration computes four output pixels, accumulating
 * in the DSP accumulators $ac0-$ac3 and clamping through the crop table. */
static void convolve_vert_4_dspr2(const uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int16_t *filter_y,
                                  int32_t w,
                                  int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  /* filter taps, packed two 16-bit coefficients per 32-bit word */
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac3,           31              \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* Same operation as convolve_vert_4_dspr2, specialized for 64-wide blocks
 * so the width is a constant and two cache lines can be prefetched per row. */
static void convolve_vert_64_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_y,
                                   int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  /* filter taps, packed two 16-bit coefficients per 32-bit word */
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac3,           31              \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2),
            [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
            [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_copy(src, src_stride,
                      dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_vert_dspr2(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
  } else {
    if (16 == y_step_q4) {
      uint32_t pos = 38;

      /* bit position for extract from acc */
      __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
      );

      vp9_prefetch_store(dst);

      switch (w) {
        case 4 :
        case 8 :
        case 16 :
        case 32 :
          convolve_vert_4_dspr2(src, src_stride,
                                dst, dst_stride,
                                filter_y, w, h);
          break;
        case 64 :
          vp9_prefetch_store(dst + 32);
          convolve_vert_64_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_y, h);
          break;
        default:
          vp9_convolve8_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
          break;
      }
    } else {
      vp9_convolve8_vert_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
    }
  }
}

#endif