1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <stdio.h> 14 15 #include "config/aom_dsp_rtcd.h" 16 17 #include "aom_dsp/mips/convolve_common_dspr2.h" 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_ports/mem.h" 20 21 #if HAVE_DSPR2 22 static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, 23 uint8_t *dst, int32_t dst_stride, 24 const int16_t *filter_x0, int32_t h) { 25 int32_t y; 26 uint8_t *cm = aom_ff_cropTbl; 27 int32_t Temp1, Temp2, Temp3, Temp4; 28 uint32_t vector4a = 64; 29 uint32_t tp1, tp2; 30 uint32_t p1, p2; 31 const int16_t *filter = &filter_x0[3]; 32 uint32_t filter45; 33 34 filter45 = ((const int32_t *)filter)[0]; 35 36 for (y = h; y--;) { 37 /* prefetch data to cache memory */ 38 prefetch_load(src + src_stride); 39 prefetch_load(src + src_stride + 32); 40 prefetch_store(dst + dst_stride); 41 42 __asm__ __volatile__( 43 "ulw %[tp1], 0(%[src]) \n\t" 44 "ulw %[tp2], 4(%[src]) \n\t" 45 46 /* even 1. pixel */ 47 "mtlo %[vector4a], $ac3 \n\t" 48 "mthi $zero, $ac3 \n\t" 49 "preceu.ph.qbr %[p1], %[tp1] \n\t" 50 "preceu.ph.qbl %[p2], %[tp1] \n\t" 51 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 52 "extp %[Temp1], $ac3, 31 \n\t" 53 54 /* even 2. pixel */ 55 "mtlo %[vector4a], $ac2 \n\t" 56 "mthi $zero, $ac2 \n\t" 57 "balign %[tp2], %[tp1], 3 \n\t" 58 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" 59 "extp %[Temp3], $ac2, 31 \n\t" 60 61 /* odd 1. pixel */ 62 "lbux %[tp1], %[Temp1](%[cm]) \n\t" 63 "mtlo %[vector4a], $ac3 \n\t" 64 "mthi $zero, $ac3 \n\t" 65 "preceu.ph.qbr %[p1], %[tp2] \n\t" 66 "preceu.ph.qbl %[p2], %[tp2] \n\t" 67 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 68 "extp %[Temp2], $ac3, 31 \n\t" 69 70 /* odd 2. pixel */ 71 "lbux %[tp2], %[Temp3](%[cm]) \n\t" 72 "mtlo %[vector4a], $ac2 \n\t" 73 "mthi $zero, $ac2 \n\t" 74 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" 75 "extp %[Temp4], $ac2, 31 \n\t" 76 77 /* clamp */ 78 "lbux %[p1], %[Temp2](%[cm]) \n\t" 79 "lbux %[p2], %[Temp4](%[cm]) \n\t" 80 81 /* store bytes */ 82 "sb %[tp1], 0(%[dst]) \n\t" 83 "sb %[p1], 1(%[dst]) \n\t" 84 "sb %[tp2], 2(%[dst]) \n\t" 85 "sb %[p2], 3(%[dst]) \n\t" 86 87 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), 88 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 89 [Temp4] "=&r"(Temp4) 90 : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), 91 [dst] "r"(dst), [src] "r"(src)); 92 93 /* Next row... */ 94 src += src_stride; 95 dst += dst_stride; 96 } 97 } 98 99 static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, 100 uint8_t *dst, int32_t dst_stride, 101 const int16_t *filter_x0, int32_t h) { 102 int32_t y; 103 uint8_t *cm = aom_ff_cropTbl; 104 uint32_t vector4a = 64; 105 int32_t Temp1, Temp2, Temp3; 106 uint32_t tp1, tp2, tp3; 107 uint32_t p1, p2, p3, p4; 108 uint32_t st0, st1; 109 const int16_t *filter = &filter_x0[3]; 110 uint32_t filter45; 111 112 filter45 = ((const int32_t *)filter)[0]; 113 114 for (y = h; y--;) { 115 /* prefetch data to cache memory */ 116 prefetch_load(src + src_stride); 117 prefetch_load(src + src_stride + 32); 118 prefetch_store(dst + dst_stride); 119 120 __asm__ __volatile__( 121 "ulw %[tp1], 0(%[src]) \n\t" 122 "ulw %[tp2], 4(%[src]) \n\t" 123 124 /* even 1. pixel */ 125 "mtlo %[vector4a], $ac3 \n\t" 126 "mthi $zero, $ac3 \n\t" 127 "mtlo %[vector4a], $ac2 \n\t" 128 "mthi $zero, $ac2 \n\t" 129 "preceu.ph.qbr %[p1], %[tp1] \n\t" 130 "preceu.ph.qbl %[p2], %[tp1] \n\t" 131 "preceu.ph.qbr %[p3], %[tp2] \n\t" 132 "preceu.ph.qbl %[p4], %[tp2] \n\t" 133 "ulw %[tp3], 8(%[src]) \n\t" 134 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 135 "extp %[Temp1], $ac3, 31 \n\t" 136 137 /* even 2. pixel */ 138 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" 139 "extp %[Temp3], $ac2, 31 \n\t" 140 141 /* even 3. pixel */ 142 "lbux %[st0], %[Temp1](%[cm]) \n\t" 143 "mtlo %[vector4a], $ac1 \n\t" 144 "mthi $zero, $ac1 \n\t" 145 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" 146 "extp %[Temp1], $ac1, 31 \n\t" 147 148 /* even 4. pixel */ 149 "mtlo %[vector4a], $ac2 \n\t" 150 "mthi $zero, $ac2 \n\t" 151 "mtlo %[vector4a], $ac3 \n\t" 152 "mthi $zero, $ac3 \n\t" 153 "sb %[st0], 0(%[dst]) \n\t" 154 "lbux %[st1], %[Temp3](%[cm]) \n\t" 155 156 "balign %[tp3], %[tp2], 3 \n\t" 157 "balign %[tp2], %[tp1], 3 \n\t" 158 159 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" 160 "extp %[Temp3], $ac2, 31 \n\t" 161 162 "lbux %[st0], %[Temp1](%[cm]) \n\t" 163 164 /* odd 1. pixel */ 165 "mtlo %[vector4a], $ac1 \n\t" 166 "mthi $zero, $ac1 \n\t" 167 "sb %[st1], 2(%[dst]) \n\t" 168 "preceu.ph.qbr %[p1], %[tp2] \n\t" 169 "preceu.ph.qbl %[p2], %[tp2] \n\t" 170 "preceu.ph.qbr %[p3], %[tp3] \n\t" 171 "preceu.ph.qbl %[p4], %[tp3] \n\t" 172 "sb %[st0], 4(%[dst]) \n\t" 173 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 174 "extp %[Temp2], $ac3, 31 \n\t" 175 176 /* odd 2. pixel */ 177 "mtlo %[vector4a], $ac3 \n\t" 178 "mthi $zero, $ac3 \n\t" 179 "mtlo %[vector4a], $ac2 \n\t" 180 "mthi $zero, $ac2 \n\t" 181 "lbux %[st0], %[Temp3](%[cm]) \n\t" 182 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" 183 "extp %[Temp3], $ac1, 31 \n\t" 184 185 /* odd 3. pixel */ 186 "lbux %[st1], %[Temp2](%[cm]) \n\t" 187 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" 188 "extp %[Temp2], $ac3, 31 \n\t" 189 190 /* odd 4. pixel */ 191 "sb %[st1], 1(%[dst]) \n\t" 192 "sb %[st0], 6(%[dst]) \n\t" 193 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" 194 "extp %[Temp1], $ac2, 31 \n\t" 195 196 /* clamp */ 197 "lbux %[p4], %[Temp3](%[cm]) \n\t" 198 "lbux %[p2], %[Temp2](%[cm]) \n\t" 199 "lbux %[p1], %[Temp1](%[cm]) \n\t" 200 201 /* store bytes */ 202 "sb %[p4], 3(%[dst]) \n\t" 203 "sb %[p2], 5(%[dst]) \n\t" 204 "sb %[p1], 7(%[dst]) \n\t" 205 206 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), 207 [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), 208 [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), 209 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) 210 : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), 211 [dst] "r"(dst), [src] "r"(src)); 212 213 /* Next row... */ 214 src += src_stride; 215 dst += dst_stride; 216 } 217 } 218 219 static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, 220 int32_t src_stride, uint8_t *dst_ptr, 221 int32_t dst_stride, 222 const int16_t *filter_x0, int32_t h, 223 int32_t count) { 224 int32_t y, c; 225 const uint8_t *src; 226 uint8_t *dst; 227 uint8_t *cm = aom_ff_cropTbl; 228 uint32_t vector_64 = 64; 229 int32_t Temp1, Temp2, Temp3; 230 uint32_t qload1, qload2, qload3; 231 uint32_t p1, p2, p3, p4, p5; 232 uint32_t st1, st2, st3; 233 const int16_t *filter = &filter_x0[3]; 234 uint32_t filter45; 235 236 filter45 = ((const int32_t *)filter)[0]; 237 238 for (y = h; y--;) { 239 src = src_ptr; 240 dst = dst_ptr; 241 242 /* prefetch data to cache memory */ 243 prefetch_load(src_ptr + src_stride); 244 prefetch_load(src_ptr + src_stride + 32); 245 prefetch_store(dst_ptr + dst_stride); 246 247 for (c = 0; c < count; c++) { 248 __asm__ __volatile__( 249 "ulw %[qload1], 0(%[src]) \n\t" 250 "ulw %[qload2], 4(%[src]) \n\t" 251 252 /* even 1. pixel */ 253 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 254 "mthi $zero, $ac1 \n\t" 255 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 256 "mthi $zero, $ac2 \n\t" 257 "preceu.ph.qbr %[p1], %[qload1] \n\t" 258 "preceu.ph.qbl %[p2], %[qload1] \n\t" 259 "preceu.ph.qbr %[p3], %[qload2] \n\t" 260 "preceu.ph.qbl %[p4], %[qload2] \n\t" 261 "ulw %[qload3], 8(%[src]) \n\t" 262 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ 263 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 264 265 /* even 2. pixel */ 266 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 267 "mthi $zero, $ac3 \n\t" 268 "preceu.ph.qbr %[p1], %[qload3] \n\t" 269 "preceu.ph.qbl %[p5], %[qload3] \n\t" 270 "ulw %[qload1], 12(%[src]) \n\t" 271 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ 272 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 273 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 274 275 /* even 3. pixel */ 276 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 277 "mthi $zero, $ac1 \n\t" 278 "preceu.ph.qbr %[p2], %[qload1] \n\t" 279 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ 280 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ 281 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 282 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 283 284 /* even 4. pixel */ 285 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 286 "mthi $zero, $ac2 \n\t" 287 "preceu.ph.qbl %[p3], %[qload1] \n\t" 288 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ 289 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ 290 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 291 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 292 293 /* even 5. pixel */ 294 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 295 "mthi $zero, $ac3 \n\t" 296 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ 297 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ 298 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 299 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 300 301 /* even 6. pixel */ 302 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 303 "mthi $zero, $ac1 \n\t" 304 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ 305 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ 306 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 307 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 308 309 /* even 7. pixel */ 310 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 311 "mthi $zero, $ac2 \n\t" 312 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ 313 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ 314 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 315 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 316 317 /* even 8. pixel */ 318 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 319 "mthi $zero, $ac3 \n\t" 320 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ 321 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ 322 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 323 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 324 325 /* ODD pixels */ 326 "ulw %[qload1], 1(%[src]) \n\t" 327 "ulw %[qload2], 5(%[src]) \n\t" 328 329 /* odd 1. pixel */ 330 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 331 "mthi $zero, $ac1 \n\t" 332 "preceu.ph.qbr %[p1], %[qload1] \n\t" 333 "preceu.ph.qbl %[p2], %[qload1] \n\t" 334 "preceu.ph.qbr %[p3], %[qload2] \n\t" 335 "preceu.ph.qbl %[p4], %[qload2] \n\t" 336 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ 337 "ulw %[qload3], 9(%[src]) \n\t" 338 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ 339 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 340 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 341 342 /* odd 2. pixel */ 343 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 344 "mthi $zero, $ac2 \n\t" 345 "preceu.ph.qbr %[p1], %[qload3] \n\t" 346 "preceu.ph.qbl %[p5], %[qload3] \n\t" 347 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ 348 "ulw %[qload1], 13(%[src]) \n\t" 349 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ 350 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 351 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 352 353 /* odd 3. pixel */ 354 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 355 "mthi $zero, $ac3 \n\t" 356 "preceu.ph.qbr %[p2], %[qload1] \n\t" 357 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ 358 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ 359 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 360 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 361 362 /* odd 4. pixel */ 363 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 364 "mthi $zero, $ac1 \n\t" 365 "preceu.ph.qbl %[p3], %[qload1] \n\t" 366 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ 367 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ 368 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 369 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 370 371 /* odd 5. pixel */ 372 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 373 "mthi $zero, $ac2 \n\t" 374 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ 375 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ 376 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 377 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 378 379 /* odd 6. pixel */ 380 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 381 "mthi $zero, $ac3 \n\t" 382 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ 383 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ 384 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 385 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 386 387 /* odd 7. pixel */ 388 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 389 "mthi $zero, $ac1 \n\t" 390 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ 391 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ 392 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 393 394 /* odd 8. pixel */ 395 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ 396 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 397 398 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 399 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 400 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 401 402 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ 403 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ 404 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ 405 406 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), 407 [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), 408 [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), 409 [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), 410 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) 411 : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), 412 [dst] "r"(dst), [src] "r"(src)); 413 414 src += 16; 415 dst += 16; 416 } 417 418 /* Next row... */ 419 src_ptr += src_stride; 420 dst_ptr += dst_stride; 421 } 422 } 423 424 static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, 425 int32_t src_stride, uint8_t *dst_ptr, 426 int32_t dst_stride, 427 const int16_t *filter_x0, int32_t h) { 428 int32_t y, c; 429 const uint8_t *src; 430 uint8_t *dst; 431 uint8_t *cm = aom_ff_cropTbl; 432 uint32_t vector_64 = 64; 433 int32_t Temp1, Temp2, Temp3; 434 uint32_t qload1, qload2, qload3; 435 uint32_t p1, p2, p3, p4, p5; 436 uint32_t st1, st2, st3; 437 const int16_t *filter = &filter_x0[3]; 438 uint32_t filter45; 439 440 filter45 = ((const int32_t *)filter)[0]; 441 442 for (y = h; y--;) { 443 src = src_ptr; 444 dst = dst_ptr; 445 446 /* prefetch data to cache memory */ 447 prefetch_load(src_ptr + src_stride); 448 prefetch_load(src_ptr + src_stride + 32); 449 prefetch_load(src_ptr + src_stride + 64); 450 prefetch_store(dst_ptr + dst_stride); 451 prefetch_store(dst_ptr + dst_stride + 32); 452 453 for (c = 0; c < 4; c++) { 454 __asm__ __volatile__( 455 "ulw %[qload1], 0(%[src]) \n\t" 456 "ulw %[qload2], 4(%[src]) \n\t" 457 458 /* even 1. pixel */ 459 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 460 "mthi $zero, $ac1 \n\t" 461 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 462 "mthi $zero, $ac2 \n\t" 463 "preceu.ph.qbr %[p1], %[qload1] \n\t" 464 "preceu.ph.qbl %[p2], %[qload1] \n\t" 465 "preceu.ph.qbr %[p3], %[qload2] \n\t" 466 "preceu.ph.qbl %[p4], %[qload2] \n\t" 467 "ulw %[qload3], 8(%[src]) \n\t" 468 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ 469 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 470 471 /* even 2. pixel */ 472 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 473 "mthi $zero, $ac3 \n\t" 474 "preceu.ph.qbr %[p1], %[qload3] \n\t" 475 "preceu.ph.qbl %[p5], %[qload3] \n\t" 476 "ulw %[qload1], 12(%[src]) \n\t" 477 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ 478 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 479 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 480 481 /* even 3. pixel */ 482 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 483 "mthi $zero, $ac1 \n\t" 484 "preceu.ph.qbr %[p2], %[qload1] \n\t" 485 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ 486 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ 487 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 488 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 489 490 /* even 4. pixel */ 491 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 492 "mthi $zero, $ac2 \n\t" 493 "preceu.ph.qbl %[p3], %[qload1] \n\t" 494 "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ 495 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ 496 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 497 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 498 499 /* even 5. pixel */ 500 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 501 "mthi $zero, $ac3 \n\t" 502 "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ 503 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ 504 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 505 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 506 507 /* even 6. pixel */ 508 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 509 "mthi $zero, $ac1 \n\t" 510 "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ 511 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ 512 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 513 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 514 515 /* even 7. pixel */ 516 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 517 "mthi $zero, $ac2 \n\t" 518 "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ 519 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ 520 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 521 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 522 523 /* even 8. pixel */ 524 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 525 "mthi $zero, $ac3 \n\t" 526 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ 527 "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ 528 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 529 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 530 531 /* ODD pixels */ 532 "ulw %[qload1], 1(%[src]) \n\t" 533 "ulw %[qload2], 5(%[src]) \n\t" 534 535 /* odd 1. pixel */ 536 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 537 "mthi $zero, $ac1 \n\t" 538 "preceu.ph.qbr %[p1], %[qload1] \n\t" 539 "preceu.ph.qbl %[p2], %[qload1] \n\t" 540 "preceu.ph.qbr %[p3], %[qload2] \n\t" 541 "preceu.ph.qbl %[p4], %[qload2] \n\t" 542 "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ 543 "ulw %[qload3], 9(%[src]) \n\t" 544 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ 545 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 546 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 547 548 /* odd 2. pixel */ 549 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 550 "mthi $zero, $ac2 \n\t" 551 "preceu.ph.qbr %[p1], %[qload3] \n\t" 552 "preceu.ph.qbl %[p5], %[qload3] \n\t" 553 "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ 554 "ulw %[qload1], 13(%[src]) \n\t" 555 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ 556 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 557 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 558 559 /* odd 3. pixel */ 560 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 561 "mthi $zero, $ac3 \n\t" 562 "preceu.ph.qbr %[p2], %[qload1] \n\t" 563 "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ 564 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ 565 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 566 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 567 568 /* odd 4. pixel */ 569 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 570 "mthi $zero, $ac1 \n\t" 571 "preceu.ph.qbl %[p3], %[qload1] \n\t" 572 "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ 573 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ 574 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 575 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 576 577 /* odd 5. pixel */ 578 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 579 "mthi $zero, $ac2 \n\t" 580 "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ 581 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ 582 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 583 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 584 585 /* odd 6. pixel */ 586 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 587 "mthi $zero, $ac3 \n\t" 588 "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ 589 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ 590 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 591 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 592 593 /* odd 7. pixel */ 594 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 595 "mthi $zero, $ac1 \n\t" 596 "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ 597 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ 598 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 599 600 /* odd 8. pixel */ 601 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ 602 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 603 604 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 605 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 606 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 607 608 "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ 609 "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ 610 "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ 611 612 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), 613 [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), 614 [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), 615 [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), 616 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) 617 : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), 618 [dst] "r"(dst), [src] "r"(src)); 619 620 src += 16; 621 dst += 16; 622 } 623 624 /* Next row... */ 625 src_ptr += src_stride; 626 dst_ptr += dst_stride; 627 } 628 } 629 630 void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, 631 uint8_t *dst, ptrdiff_t dst_stride, 632 const int16_t *filter_x, int x_step_q4, 633 const int16_t *filter_y, int y_step_q4, int w, 634 int h) { 635 uint32_t pos = 38; 636 637 assert(x_step_q4 == 16); 638 639 prefetch_load((const uint8_t *)filter_x); 640 641 /* bit positon for extract from acc */ 642 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 643 : 644 : [pos] "r"(pos)); 645 646 /* prefetch data to cache memory */ 647 prefetch_load(src); 648 prefetch_load(src + 32); 649 prefetch_store(dst); 650 651 switch (w) { 652 case 4: 653 convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, 654 (int32_t)dst_stride, filter_x, (int32_t)h); 655 break; 656 case 8: 657 convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, 658 (int32_t)dst_stride, filter_x, (int32_t)h); 659 break; 660 case 16: 661 convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, 662 (int32_t)dst_stride, filter_x, (int32_t)h, 1); 663 break; 664 case 32: 665 convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, 666 (int32_t)dst_stride, filter_x, (int32_t)h, 2); 667 break; 668 case 64: 669 prefetch_load(src + 64); 670 prefetch_store(dst + 32); 671 672 convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, 673 (int32_t)dst_stride, filter_x, (int32_t)h); 674 break; 675 default: 676 aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, 677 x_step_q4, filter_y, y_step_q4, w, h); 678 break; 679 } 680 } 681 #endif 682