/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <stdio.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"

#if HAVE_DSPR2
/* 8-tap horizontal convolution, 4 output pixels per row for h rows.
 *
 * The 8 16-bit taps in filter_x0 are read as four packed 32-bit tap
 * pairs so each dpa.w.ph multiply-accumulates two taps at once.  A
 * rounding bias of 64 is preloaded into the accumulator lo word, and
 * extp extracts the result at the bit position the caller programmed
 * via wrdsp (see aom_convolve8_horiz_dspr2 below).  Results are
 * clamped to [0,255] through the aom_ff_cropTbl lookup.
 *
 * NOTE(review): casting filter_x0 (int16_t *) to int32_t * assumes the
 * filter table is 4-byte aligned — confirm against the tap tables.
 */
static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = aom_ff_cropTbl; /* clamp-to-byte lookup table */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64; /* rounding bias loaded into acc lo */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  /* Load the 8 taps as four packed 16-bit pairs. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "ulw              %[tn2],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
        /* byte-align the loads by 3 to form the odd-pixel windows */
        "balign           %[tn1],         %[tn2],         3              \n\t"
        "balign           %[tn2],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
        "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
        "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
        "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
        "lbux             %[n2],          %[Temp4](%[cm])                \n\t"

        /* store bytes */
        "sb               %[tp1],         0(%[dst])                      \n\t"
        "sb               %[tn1],         1(%[dst])                      \n\t"
        "sb               %[tp2],         2(%[dst])                      \n\t"
        "sb               %[n2],          3(%[dst])                      \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
/* 8-tap horizontal convolution, 8 output pixels per row for h rows.
 *
 * Same scheme as convolve_horiz_4_dspr2: taps are consumed as four
 * packed pairs via dpa.w.ph, a bias of 64 is preloaded into each
 * accumulator, extp extracts at the caller-programmed (wrdsp) bit
 * position, and aom_ff_cropTbl clamps to bytes.  Even and odd output
 * pixels are interleaved across accumulators $ac1-$ac3 so clamps and
 * stores overlap with the next pixel's accumulation.
 *
 * NOTE(review): the int16_t* -> int32_t* cast assumes a 4-byte-aligned
 * filter table.
 */
static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = aom_ff_cropTbl; /* clamp-to-byte lookup table */
  uint32_t vector4a = 64;       /* rounding bias loaded into acc lo */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  /* Load the 8 taps as four packed 16-bit pairs. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "ulw              %[tn2],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
        "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
        "ulw              %[tn1],         12(%[src])                     \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac1,           31             \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "sb               %[st0],         0(%[dst])                      \n\t"
        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"

        /* byte-align the loads by 3 to form the odd-pixel windows */
        "balign           %[tn3],         %[tn1],         3              \n\t"
        "balign           %[tn1],         %[tn2],         3              \n\t"
        "balign           %[tn2],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"

        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "sb               %[st1],         2(%[dst])                      \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
        "sb               %[st0],         4(%[dst])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
        "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac1,           31             \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
        "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 4. pixel */
        "sb               %[st1],         1(%[dst])                      \n\t"
        "sb               %[st0],         6(%[dst])                      \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
        "lbux             %[n1],          %[Temp1](%[cm])                \n\t"

        /* store bytes */
        "sb               %[p4],          3(%[dst])                      \n\t"
        "sb               %[p2],          5(%[dst])                      \n\t"
        "sb               %[n1],          7(%[dst])                      \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
/* 8-tap horizontal convolution in 16-pixel chunks, `count` chunks per
 * row (so row width = 16 * count), for h rows.
 *
 * Even output pixels are computed from word loads at offsets 0,4,...
 * and odd pixels from byte-shifted loads at offsets 1,5,...; the eight
 * even and eight odd results of a chunk are software-pipelined across
 * accumulators $ac1-$ac3 so each clamp/store overlaps with the next
 * accumulation.  Rounding bias 64 is preloaded into each accumulator;
 * extp extracts at the bit position the caller programmed via wrdsp;
 * aom_ff_cropTbl clamps results to bytes.
 *
 * NOTE(review): the int16_t* -> int32_t* cast assumes a 4-byte-aligned
 * filter table.
 */
static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    const int16_t *filter_x0, int32_t h,
                                    int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = aom_ff_cropTbl; /* clamp-to-byte lookup table */
  uint32_t vector_64 = 64;      /* rounding bias loaded into acc lo */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Load the 8 taps as four packed 16-bit pairs. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],      0(%[src])                    \n\t"
          "ulw              %[qload2],      4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 1 */
          "mthi             $zero,          $ac1                         \n\t"
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 2 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "ulw              %[qload3],      8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,           %[p1],          %[filter12]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter34]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter56]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter78]  \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 3 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "ulw              %[qload1],      12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,           %[p2],          %[filter12]  \n\t" /* even 2 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter34]  \n\t" /* even 2 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter56]  \n\t" /* even 2 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter78]  \n\t" /* even 2 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 2 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 4 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st1],         0(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter12]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter34]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter56]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p5],          %[filter78]  \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 2 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 5 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[st2],         2(%[dst])                    \n\t" /* even 2 */
          "ulw              %[qload2],      16(%[src])                   \n\t"
          "dpa.w.ph         $ac1,           %[p4],          %[filter12]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter34]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter56]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter78]  \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 6 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p4],          %[qload2]                    \n\t"
          "sb               %[st3],         4(%[dst])                    \n\t" /* even 3 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter12]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter34]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p2],          %[filter56]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter78]  \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 7 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbl    %[p1],          %[qload2]                    \n\t"
          "sb               %[st1],         6(%[dst])                    \n\t" /* even 4 */
          "ulw              %[qload3],      20(%[src])                   \n\t"
          "dpa.w.ph         $ac3,           %[p5],          %[filter12]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter34]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter56]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter78]  \n\t" /* even 6 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 8 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p5],          %[qload3]                    \n\t"
          "sb               %[st2],         8(%[dst])                    \n\t" /* even 5 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter12]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter34]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter56]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter78]  \n\t" /* even 7 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                         \n\t"
          "dpa.w.ph         $ac2,           %[p3],          %[filter12]  \n\t" /* even 8 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter34]  \n\t" /* even 8 */
          "sb               %[st3],         10(%[dst])                   \n\t" /* even 6 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter56]  \n\t" /* even 8 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter78]  \n\t" /* even 8 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                    \n\t"
          "ulw              %[qload2],      5(%[src])                    \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "sb               %[st1],         12(%[dst])                   \n\t" /* even 7 */
          "ulw              %[qload3],      9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,           %[p1],          %[filter12]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter34]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter56]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter78]  \n\t" /* odd 1 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "sb               %[st2],         14(%[dst])                   \n\t" /* even 8 */
          "ulw              %[qload1],      13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,           %[p2],          %[filter12]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter34]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter56]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter78]  \n\t" /* odd 2 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st3],         1(%[dst])                    \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter12]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter34]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter56]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter78]  \n\t" /* odd 3 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[st1],         3(%[dst])                    \n\t" /* odd 2 */
          "ulw              %[qload2],      17(%[src])                   \n\t"
          "dpa.w.ph         $ac3,           %[p4],          %[filter12]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter34]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p5],          %[filter56]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter78]  \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p4],          %[qload2]                    \n\t"
          "sb               %[st2],         5(%[dst])                    \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter12]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter34]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter56]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter78]  \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbl    %[p1],          %[qload2]                    \n\t"
          "sb               %[st3],         7(%[dst])                    \n\t" /* odd 4 */
          "ulw              %[qload3],      21(%[src])                   \n\t"
          "dpa.w.ph         $ac2,           %[p5],          %[filter12]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p2],          %[filter34]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter56]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter78]  \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p5],          %[qload3]                    \n\t"
          "sb               %[st1],         9(%[dst])                    \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter12]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter34]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter56]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter78]  \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],          %[filter12]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter34]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter56]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter78]  \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 8 */

          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 6 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 7 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 8 */

          "sb               %[st2],         11(%[dst])                   \n\t" /* odd 6 */
          "sb               %[st3],         13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],         15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
/* 8-tap horizontal convolution for 64-pixel-wide rows, h rows.
 *
 * Identical pipelined chunk kernel to convolve_horiz_16_dspr2, with
 * the chunk count fixed at 4 (4 x 16 = 64 pixels per row) and extra
 * prefetches covering the wider row.  See that function for the
 * even/odd accumulator scheduling details.
 *
 * NOTE(review): the int16_t* -> int32_t* cast assumes a 4-byte-aligned
 * filter table.
 */
static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = aom_ff_cropTbl; /* clamp-to-byte lookup table */
  uint32_t vector_64 = 64;      /* rounding bias loaded into acc lo */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Load the 8 taps as four packed 16-bit pairs. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],      0(%[src])                    \n\t"
          "ulw              %[qload2],      4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 1 */
          "mthi             $zero,          $ac1                         \n\t"
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 2 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "ulw              %[qload3],      8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,           %[p1],          %[filter12]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter34]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter56]  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter78]  \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 3 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "ulw              %[qload1],      12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,           %[p2],          %[filter12]  \n\t" /* even 2 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter34]  \n\t" /* even 2 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter56]  \n\t" /* even 2 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter78]  \n\t" /* even 2 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 2 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 4 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st1],         0(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter12]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter34]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter56]  \n\t" /* even 3 */
          "dpa.w.ph         $ac3,           %[p5],          %[filter78]  \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 2 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 5 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[st2],         2(%[dst])                    \n\t" /* even 2 */
          "ulw              %[qload2],      16(%[src])                   \n\t"
          "dpa.w.ph         $ac1,           %[p4],          %[filter12]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter34]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter56]  \n\t" /* even 4 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter78]  \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 6 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p4],          %[qload2]                    \n\t"
          "sb               %[st3],         4(%[dst])                    \n\t" /* even 3 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter12]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter34]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p2],          %[filter56]  \n\t" /* even 5 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter78]  \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 7 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbl    %[p1],          %[qload2]                    \n\t"
          "sb               %[st1],         6(%[dst])                    \n\t" /* even 4 */
          "ulw              %[qload3],      20(%[src])                   \n\t"
          "dpa.w.ph         $ac3,           %[p5],          %[filter12]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter34]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter56]  \n\t" /* even 6 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter78]  \n\t" /* even 6 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 8 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p5],          %[qload3]                    \n\t"
          "sb               %[st2],         8(%[dst])                    \n\t" /* even 5 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter12]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter34]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter56]  \n\t" /* even 7 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter78]  \n\t" /* even 7 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                         \n\t"
          "dpa.w.ph         $ac2,           %[p3],          %[filter12]  \n\t" /* even 8 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter34]  \n\t" /* even 8 */
          "sb               %[st3],         10(%[dst])                   \n\t" /* even 6 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter56]  \n\t" /* even 8 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter78]  \n\t" /* even 8 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                    \n\t"
          "ulw              %[qload2],      5(%[src])                    \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "sb               %[st1],         12(%[dst])                   \n\t" /* even 7 */
          "ulw              %[qload3],      9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,           %[p1],          %[filter12]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter34]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter56]  \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter78]  \n\t" /* odd 1 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "sb               %[st2],         14(%[dst])                   \n\t" /* even 8 */
          "ulw              %[qload1],      13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,           %[p2],          %[filter12]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter34]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter56]  \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter78]  \n\t" /* odd 2 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st3],         1(%[dst])                    \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter12]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter34]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p1],          %[filter56]  \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,           %[p5],          %[filter78]  \n\t" /* odd 3 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[st1],         3(%[dst])                    \n\t" /* odd 2 */
          "ulw              %[qload2],      17(%[src])                   \n\t"
          "dpa.w.ph         $ac3,           %[p4],          %[filter12]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter34]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p5],          %[filter56]  \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter78]  \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p4],          %[qload2]                    \n\t"
          "sb               %[st2],         5(%[dst])                    \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter12]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter34]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p2],          %[filter56]  \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,           %[p3],          %[filter78]  \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbl    %[p1],          %[qload2]                    \n\t"
          "sb               %[st3],         7(%[dst])                    \n\t" /* odd 4 */
          "ulw              %[qload3],      21(%[src])                   \n\t"
          "dpa.w.ph         $ac2,           %[p5],          %[filter12]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p2],          %[filter34]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p3],          %[filter56]  \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,           %[p4],          %[filter78]  \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,           31           \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p5],          %[qload3]                    \n\t"
          "sb               %[st1],         9(%[dst])                    \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,           %[p2],          %[filter12]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p3],          %[filter34]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p4],          %[filter56]  \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,           %[p1],          %[filter78]  \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,           31           \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],          %[filter12]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p4],          %[filter34]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p1],          %[filter56]  \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,           %[p5],          %[filter78]  \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,           31           \n\t" /* odd 8 */

          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 6 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 7 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 8 */

          "sb               %[st2],         11(%[dst])                   \n\t" /* odd 6 */
          "sb               %[st3],         13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],         15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
*/ 816 src_ptr += src_stride; 817 dst_ptr += dst_stride; 818 } 819 } 820 821 void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, 822 uint8_t *dst, ptrdiff_t dst_stride, 823 const int16_t *filter_x, int x_step_q4, 824 const int16_t *filter_y, int y_step_q4, int w, 825 int h) { 826 assert(x_step_q4 == 16); 827 assert(((const int32_t *)filter_x)[1] != 0x800000); 828 829 if (((const int32_t *)filter_x)[0] == 0) { 830 aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, 831 x_step_q4, filter_y, y_step_q4, w, h); 832 } else { 833 uint32_t pos = 38; 834 835 prefetch_load((const uint8_t *)filter_x); 836 src -= 3; 837 838 /* bit positon for extract from acc */ 839 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 840 : 841 : [pos] "r"(pos)); 842 843 /* prefetch data to cache memory */ 844 prefetch_load(src); 845 prefetch_load(src + 32); 846 prefetch_store(dst); 847 848 switch (w) { 849 case 4: 850 convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, 851 (int32_t)dst_stride, filter_x, (int32_t)h); 852 break; 853 case 8: 854 convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, 855 (int32_t)dst_stride, filter_x, (int32_t)h); 856 break; 857 case 16: 858 convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, 859 (int32_t)dst_stride, filter_x, (int32_t)h, 1); 860 break; 861 case 32: 862 convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, 863 (int32_t)dst_stride, filter_x, (int32_t)h, 2); 864 break; 865 case 64: 866 prefetch_load(src + 64); 867 prefetch_store(dst + 32); 868 869 convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, 870 (int32_t)dst_stride, filter_x, (int32_t)h); 871 break; 872 default: 873 aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, 874 x_step_q4, filter_y, y_step_q4, w, h); 875 break; 876 } 877 } 878 } 879 #endif 880