/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
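/* Each helper below computes a 2-tap ("bilinear") horizontal convolution
 * and averages the result into dst.  Only taps 3 and 4 of the 8-tap filter
 * array are nonzero in this case; they are read as one 32-bit word
 * (filter45) so that dpa.w.ph can apply both taps in a single dual
 * multiply-accumulate.  Each accumulator is seeded with 64, the
 * FILTER_BITS == 7 rounding constant; extp then extracts bits 38..7 of the
 * accumulator (the pos field is programmed by wrdsp in
 * vp9_convolve2_avg_horiz_dspr2), i.e. the >> 7 shift; lbux clamps through
 * the vp9_ff_cropTbl lookup; and addqh_r.w computes the rounded average
 * (a + b + 1) >> 1 with the pixel already in dst.
 *
 * A scalar sketch of the same operation (reference only, not part of the
 * original file):
 *
 *   for (y = 0; y < h; ++y) {
 *     for (x = 0; x < w; ++x) {
 *       int sum = (src[x] * filter_x0[3] + src[x + 1] * filter_x0[4] + 64)
 *                 >> 7;
 *       dst[x] = ROUND_POWER_OF_TWO(dst[x] + clip_pixel(sum), 1);
 *     }
 *     src += src_stride;
 *     dst += dst_stride;
 *   }
 */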
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3;
  uint32_t tn1, tn2;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],      0(%[src])                  \n\t"
        "ulw              %[tp2],      4(%[src])                  \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[filter45]    \n\t"
        "extp             %[Temp1],    $ac3,       31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "balign           %[tp2],      %[tp1],     3              \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[filter45]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        "lbu              %[p2],       3(%[dst])                  \n\t"  /* load odd 2 */

        /* odd 1. pixel */
        "lbux             %[tp1],      %[Temp1](%[cm])            \n\t"  /* even 1 */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "lbu              %[Temp1],    1(%[dst])                  \n\t"  /* load odd 1 */
        "preceu.ph.qbr    %[p1],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p3],       %[tp2]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[filter45]    \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tn2],      0(%[dst])                  \n\t"  /* load even 1 */

        /* odd 2. pixel */
        "lbux             %[tp2],      %[Temp3](%[cm])            \n\t"  /* even 2 */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "lbux             %[tn1],      %[Temp2](%[cm])            \n\t"  /* odd 1 */
        "addqh_r.w        %[tn2],      %[tn2],     %[tp1]         \n\t"  /* average even 1 */
        "dpa.w.ph         $ac2,        %[p3],      %[filter45]    \n\t"
        "extp             %[Temp4],    $ac2,       31             \n\t"

        "lbu              %[tp1],      2(%[dst])                  \n\t"  /* load even 2 */
        "sb               %[tn2],      0(%[dst])                  \n\t"  /* store even 1 */

        /* clamp */
        "addqh_r.w        %[Temp1],    %[Temp1],   %[tn1]         \n\t"  /* average odd 1 */
        "lbux             %[p3],       %[Temp4](%[cm])            \n\t"  /* odd 2 */
        "sb               %[Temp1],    1(%[dst])                  \n\t"  /* store odd 1 */

        "addqh_r.w        %[tp1],      %[tp1],     %[tp2]         \n\t"  /* average even 2 */
        "sb               %[tp1],      2(%[dst])                  \n\t"  /* store even 2 */

        "addqh_r.w        %[p2],       %[p2],      %[p3]          \n\t"  /* average odd 2 */
        "sb               %[p2],       3(%[dst])                  \n\t"  /* store odd 2 */

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
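/* 8-wide rows: eight outputs per iteration.  The work is spread across all
 * three DSP accumulators ($ac1..$ac3) so that the dual multiply-accumulates,
 * clamp-table lookups and dst load/average/store traffic of neighbouring
 * pixels overlap.
 */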
static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3, tp4;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],      0(%[src])                  \n\t"
        "ulw              %[tp2],      4(%[src])                  \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                     \n\t"
        "ulw              %[tp3],      8(%[src])                  \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[filter45]    \n\t"
        "extp             %[Temp1],    $ac3,       31             \n\t"
        "lbu              %[Temp2],    0(%[dst])                  \n\t"
        "lbu              %[tp4],      2(%[dst])                  \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,        %[p2],      %[filter45]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],      %[Temp1](%[cm])            \n\t"
        "mtlo             %[vector4a], $ac1                       \n\t"
        "mthi             $zero,       $ac1                       \n\t"
        "lbux             %[st1],      %[Temp3](%[cm])            \n\t"
        "dpa.w.ph         $ac1,        %[p3],      %[filter45]    \n\t"
        "extp             %[Temp1],    $ac1,       31             \n\t"

        "addqh_r.w        %[Temp2],    %[Temp2],   %[st0]         \n\t"
        "addqh_r.w        %[tp4],      %[tp4],     %[st1]         \n\t"
        "sb               %[Temp2],    0(%[dst])                  \n\t"
        "sb               %[tp4],      2(%[dst])                  \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"

        "balign           %[tp3],      %[tp2],     3              \n\t"
        "balign           %[tp2],      %[tp1],     3              \n\t"

        "lbux             %[st0],      %[Temp1](%[cm])            \n\t"
        "lbu              %[Temp2],    4(%[dst])                  \n\t"
        "addqh_r.w        %[Temp2],    %[Temp2],   %[st0]         \n\t"

        "dpa.w.ph         $ac2,        %[p4],      %[filter45]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac1                       \n\t"
        "mthi             $zero,       $ac1                       \n\t"
        "sb               %[Temp2],    4(%[dst])                  \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tp3]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tp3]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[filter45]    \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tp1],      6(%[dst])                  \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "lbux             %[st0],      %[Temp3](%[cm])            \n\t"
        "dpa.w.ph         $ac1,        %[p2],      %[filter45]    \n\t"
        "extp             %[Temp3],    $ac1,       31             \n\t"

        "lbu              %[tp2],      1(%[dst])                  \n\t"
        "lbu              %[tp3],      3(%[dst])                  \n\t"
        "addqh_r.w        %[tp1],      %[tp1],     %[st0]         \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],      %[Temp2](%[cm])            \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[filter45]    \n\t"
        "addqh_r.w        %[tp2],      %[tp2],     %[st1]         \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tp4],      5(%[dst])                  \n\t"

        /* odd 4. pixel */
        "sb               %[tp2],      1(%[dst])                  \n\t"
        "sb               %[tp1],      6(%[dst])                  \n\t"
        "dpa.w.ph         $ac2,        %[p4],      %[filter45]    \n\t"
        "extp             %[Temp1],    $ac2,       31             \n\t"

        "lbu              %[tp1],      7(%[dst])                  \n\t"

        /* clamp */
        "lbux             %[p4],       %[Temp3](%[cm])            \n\t"
        "addqh_r.w        %[tp3],      %[tp3],     %[p4]          \n\t"

        "lbux             %[p2],       %[Temp2](%[cm])            \n\t"
        "addqh_r.w        %[tp4],      %[tp4],     %[p2]          \n\t"

        "lbux             %[p1],       %[Temp1](%[cm])            \n\t"
        "addqh_r.w        %[tp1],      %[tp1],     %[p1]          \n\t"

        /* store bytes */
        "sb               %[tp3],      3(%[dst])                  \n\t"
        "sb               %[tp4],      5(%[dst])                  \n\t"
        "sb               %[tp1],      7(%[dst])                  \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
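/* 16-wide rows.  `count` is the number of 16-pixel blocks per row (1 for
 * w == 16, 2 for w == 32).  Even outputs are filtered from word loads at
 * src + 0 and odd outputs from word loads at src + 1, eight of each per
 * block.
 */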
static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride,
                                           uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h,
                                           int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                  \n\t"
          "ulw              %[qload2],    4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 1 */
          "mthi             $zero,        $ac1                       \n\t"
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 2 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "ulw              %[qload3],    8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,         %[p1],     %[filter45]     \n\t"  /* even 1 */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* even 1 */
          "lbu              %[st2],       0(%[dst])                  \n\t"  /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* even 3 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "ulw              %[qload1],    12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,         %[p2],     %[filter45]     \n\t"  /* even 1 */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 1 */

          "lbu              %[qload3],    2(%[dst])                  \n\t"  /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 4 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[st2],       %[st2],    %[st1]          \n\t"  /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st2],       0(%[dst])                  \n\t"  /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],     %[filter45]     \n\t"  /* even 3 */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 5 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st2]          \n\t"  /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    2(%[dst])                  \n\t"  /* store even 2 to dst */
          "lbu              %[qload3],    4(%[dst])                  \n\t"  /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])                  \n\t"  /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],     %[filter45]     \n\t"  /* even 4 */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* even 6 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]          \n\t"  /* average even 3 */
          "sb               %[qload3],    4(%[dst])                  \n\t"  /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],     %[filter45]     \n\t"  /* even 5 */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 7 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]          \n\t"  /* average even 4 */
          "sb               %[qload1],    6(%[dst])                  \n\t"  /* store even 4 to dst */
          "dpa.w.ph         $ac3,         %[p5],     %[filter45]     \n\t"  /* even 6 */
          "lbu              %[qload2],    8(%[dst])                  \n\t"  /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 8 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]          \n\t"  /* average even 5 */
          "sb               %[qload2],    8(%[dst])                  \n\t"  /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],     %[filter45]     \n\t"  /* even 7 */
          "lbu              %[qload3],    10(%[dst])                 \n\t"  /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* even 6 */

          "lbu              %[st2],       12(%[dst])                 \n\t"  /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 1 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]          \n\t"  /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],     %[filter45]     \n\t"  /* even 8 */
          "sb               %[qload3],    10(%[dst])                 \n\t"  /* store even 6 to dst */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 7 */
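          /* The eight odd outputs below repeat the pattern of the even
           * pass with the source words re-loaded one byte later
           * (src + 1, 5, 9, 13), so the same packed taps apply. */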
          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])                  \n\t"
          "ulw              %[qload2],    5(%[src])                  \n\t"

          "addqh_r.w        %[st2],       %[st2],    %[st1]          \n\t"  /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 2 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "sb               %[st2],       12(%[dst])                 \n\t"  /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p1],     %[filter45]     \n\t"  /* odd 1 */
          "lbu              %[qload2],    14(%[dst])                 \n\t"  /* load even 8 from dst */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 8 */

          "lbu              %[st1],       1(%[dst])                  \n\t"  /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* odd 3 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]          \n\t"  /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "sb               %[qload2],    14(%[dst])                 \n\t"  /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,         %[p2],     %[filter45]     \n\t"  /* odd 2 */
          "lbu              %[qload3],    3(%[dst])                  \n\t"  /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 4 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st3],       %[st3],    %[st1]          \n\t"  /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "dpa.w.ph         $ac2,         %[p3],     %[filter45]     \n\t"  /* odd 3 */
          "sb               %[st3],       1(%[dst])                  \n\t"  /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 5 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st1]          \n\t"  /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    3(%[dst])                  \n\t"  /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])                  \n\t"  /* load odd 3 from dst */
          "dpa.w.ph         $ac3,         %[p4],     %[filter45]     \n\t"  /* odd 4 */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* odd 3 */

          "lbu              %[st1],       7(%[dst])                  \n\t"  /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* odd 6 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st2]          \n\t"  /* average odd 3 */
          "sb               %[qload1],    5(%[dst])                  \n\t"  /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],     %[filter45]     \n\t"  /* odd 5 */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 4 */

          "lbu              %[qload1],    9(%[dst])                  \n\t"  /* load odd 5 from dst */
          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 7 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st1],       %[st1],    %[st3]          \n\t"  /* average odd 4 */
          "sb               %[st1],       7(%[dst])                  \n\t"  /* store odd 4 to dst */
          "dpa.w.ph         $ac2,         %[p5],     %[filter45]     \n\t"  /* odd 6 */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 8 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]          \n\t"  /* average odd 5 */
          "sb               %[qload1],    9(%[dst])                  \n\t"  /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])                 \n\t"  /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],     %[filter45]     \n\t"  /* odd 7 */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* odd 7 */

          "lbu              %[qload3],    13(%[dst])                 \n\t"  /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],     %[filter45]     \n\t"  /* odd 8 */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* odd 8 */

          "lbu              %[qload1],    15(%[dst])                 \n\t"  /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2], %[st2]          \n\t"  /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3], %[st3]          \n\t"  /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1], %[st1]          \n\t"  /* average odd 8 */

          "sb               %[qload2],    11(%[dst])                 \n\t"  /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])                 \n\t"  /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                 \n\t"  /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
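/* 64-wide rows: the same inner block as convolve_bi_avg_horiz_16_dspr2,
 * run over a fixed four 16-pixel blocks per row, with deeper prefetch for
 * the wider row.
 */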
static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride,
                                           uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_load(src_ptr + src_stride + 64);
    vp9_prefetch_store(dst_ptr + dst_stride);
    vp9_prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                  \n\t"
          "ulw              %[qload2],    4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 1 */
          "mthi             $zero,        $ac1                       \n\t"
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 2 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "ulw              %[qload3],    8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,         %[p1],     %[filter45]     \n\t"  /* even 1 */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* even 1 */
          "lbu              %[st2],       0(%[dst])                  \n\t"  /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* even 3 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "ulw              %[qload1],    12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,         %[p2],     %[filter45]     \n\t"  /* even 1 */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 1 */

          "lbu              %[qload3],    2(%[dst])                  \n\t"  /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 4 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[st2],       %[st2],    %[st1]          \n\t"  /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st2],       0(%[dst])                  \n\t"  /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],     %[filter45]     \n\t"  /* even 3 */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 5 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st2]          \n\t"  /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    2(%[dst])                  \n\t"  /* store even 2 to dst */
          "lbu              %[qload3],    4(%[dst])                  \n\t"  /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])                  \n\t"  /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],     %[filter45]     \n\t"  /* even 4 */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* even 6 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]          \n\t"  /* average even 3 */
          "sb               %[qload3],    4(%[dst])                  \n\t"  /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],     %[filter45]     \n\t"  /* even 5 */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 7 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]          \n\t"  /* average even 4 */
          "sb               %[qload1],    6(%[dst])                  \n\t"  /* store even 4 to dst */
          "dpa.w.ph         $ac3,         %[p5],     %[filter45]     \n\t"  /* even 6 */
          "lbu              %[qload2],    8(%[dst])                  \n\t"  /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 8 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]          \n\t"  /* average even 5 */
          "sb               %[qload2],    8(%[dst])                  \n\t"  /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],     %[filter45]     \n\t"  /* even 7 */
          "lbu              %[qload3],    10(%[dst])                 \n\t"  /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* even 6 */

          "lbu              %[st2],       12(%[dst])                 \n\t"  /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 1 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]          \n\t"  /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],     %[filter45]     \n\t"  /* even 8 */
          "sb               %[qload3],    10(%[dst])                 \n\t"  /* store even 6 to dst */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 7 */
          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])                  \n\t"
          "ulw              %[qload2],    5(%[src])                  \n\t"

          "addqh_r.w        %[st2],       %[st2],    %[st1]          \n\t"  /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 2 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "sb               %[st2],       12(%[dst])                 \n\t"  /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p1],     %[filter45]     \n\t"  /* odd 1 */
          "lbu              %[qload2],    14(%[dst])                 \n\t"  /* load even 8 from dst */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 8 */

          "lbu              %[st1],       1(%[dst])                  \n\t"  /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* odd 3 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]          \n\t"  /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "sb               %[qload2],    14(%[dst])                 \n\t"  /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,         %[p2],     %[filter45]     \n\t"  /* odd 2 */
          "lbu              %[qload3],    3(%[dst])                  \n\t"  /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 4 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st3],       %[st3],    %[st1]          \n\t"  /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "dpa.w.ph         $ac2,         %[p3],     %[filter45]     \n\t"  /* odd 3 */
          "sb               %[st3],       1(%[dst])                  \n\t"  /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 5 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st1]          \n\t"  /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    3(%[dst])                  \n\t"  /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])                  \n\t"  /* load odd 3 from dst */
          "dpa.w.ph         $ac3,         %[p4],     %[filter45]     \n\t"  /* odd 4 */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* odd 3 */

          "lbu              %[st1],       7(%[dst])                  \n\t"  /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* odd 6 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st2]          \n\t"  /* average odd 3 */
          "sb               %[qload1],    5(%[dst])                  \n\t"  /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],     %[filter45]     \n\t"  /* odd 5 */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 4 */

          "lbu              %[qload1],    9(%[dst])                  \n\t"  /* load odd 5 from dst */
          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 7 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st1],       %[st1],    %[st3]          \n\t"  /* average odd 4 */
          "sb               %[st1],       7(%[dst])                  \n\t"  /* store odd 4 to dst */
          "dpa.w.ph         $ac2,         %[p5],     %[filter45]     \n\t"  /* odd 6 */
          "extp             %[Temp2],     $ac2,      31              \n\t"  /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 8 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]          \n\t"  /* average odd 5 */
          "sb               %[qload1],    9(%[dst])                  \n\t"  /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])                 \n\t"  /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],     %[filter45]     \n\t"  /* odd 7 */
          "extp             %[Temp3],     $ac3,      31              \n\t"  /* odd 7 */

          "lbu              %[qload3],    13(%[dst])                 \n\t"  /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],     %[filter45]     \n\t"  /* odd 8 */
          "extp             %[Temp1],     $ac1,      31              \n\t"  /* odd 8 */

          "lbu              %[qload1],    15(%[dst])                 \n\t"  /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2], %[st2]          \n\t"  /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3], %[st3]          \n\t"  /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1], %[st1]          \n\t"  /* average odd 8 */

          "sb               %[qload2],    11(%[dst])                 \n\t"  /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])                 \n\t"  /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                 \n\t"  /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
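/* Public entry point.  Only the unscaled case (x_step_q4 == 16) is handled
 * here; wrdsp programs the DSPControl pos field to 38 so that each extp
 * above extracts bits 38..7 of the accumulator, i.e. the 32-bit result
 * already shifted down by FILTER_BITS (7).  Unsupported widths and scaled
 * cases fall back to vp9_convolve8_avg_horiz_c.
 */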
void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  if (16 == x_step_q4) {
    uint32_t pos = 38;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
    );

    /* prefetch data to cache memory */
    vp9_prefetch_load(src);
    vp9_prefetch_load(src + 32);
    vp9_prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_bi_avg_horiz_4_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h);
        break;
      case 8:
        convolve_bi_avg_horiz_8_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h);
        break;
      case 16:
        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_x, h, 1);
        break;
      case 32:
        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_x, h, 2);
        break;
      case 64:
        vp9_prefetch_load(src + 64);
        vp9_prefetch_store(dst + 32);

        convolve_bi_avg_horiz_64_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_x, h);
        break;
      default:
        vp9_convolve8_avg_horiz_c(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
        break;
    }
  } else {
    vp9_convolve8_avg_horiz_c(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
  }
}
#endif