/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
/* Horizontal 8-tap convolution averaged with dst, 4 pixels per row (DSPr2).
 *
 * src, dst           : top-left of the source / destination block
 * src_stride,
 * dst_stride         : byte pitch between successive rows
 * filter_x0          : 8 16-bit taps, read below as four 32-bit words, each
 *                      packing a pair of adjacent taps for dpa.w.ph
 *                      (NOTE(review): relies on the tap array being 4-byte
 *                      aligned -- confirm at the caller)
 * h                  : number of rows to process
 *
 * Per pixel: the DSPr2 dual multiply-accumulate (dpa.w.ph) sums the four
 * packed tap pairs into an accumulator pre-seeded with 64 (rounding term,
 * via mtlo); extp extracts the filtered value, lbux clamps it through the
 * vpx_ff_cropTbl byte table, and addqh_r.w (rounded halving add) averages
 * it with the byte already in dst before sb stores it back.  The balign
 * instructions rebuild the byte-shifted source words used for the odd
 * (src+1-phase) output pixels.
 */
static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-byte lookup table (lbux) */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64; /* rounding seed loaded into each accumulator */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  /* Each 32-bit load grabs two adjacent 16-bit taps for dpa.w.ph. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "ulw              %[tn2],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
        "balign           %[tn1],         %[tn2],         3              \n\t"
        "balign           %[tn2],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
        "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
        "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
        "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
        "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
        "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
        "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
        "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */

        /* clamp */
        "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
        "lbux             %[n2],          %[Temp4](%[cm])                \n\t" /* odd 2 */
        "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */

        "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
        "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */

        "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t" /* average odd 2 */
        "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* Same contract as convolve_avg_horiz_4_dspr2 but produces 8 output pixels
 * per row: four "even" outputs from the aligned source words and four "odd"
 * outputs from the balign-shifted (src+1-phase) words, each clamped through
 * vpx_ff_cropTbl and rounded-averaged with the existing dst byte. */
static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-byte lookup table (lbux) */
  uint32_t vector4a = 64;       /* rounding seed for the accumulators */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  /* Each 32-bit load grabs two adjacent 16-bit taps for dpa.w.ph. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "ulw              %[tn2],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"
        "lbu              %[Temp2],       0(%[dst])                      \n\t"
        "lbu              %[tn3],         2(%[dst])                      \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
        "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
        "ulw              %[tn1],         12(%[src])                     \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* even 3.
pixel */ 195 "lbux %[st0], %[Temp1](%[cm]) \n\t" 196 "mtlo %[vector4a], $ac1 \n\t" 197 "mthi $zero, $ac1 \n\t" 198 "preceu.ph.qbr %[p2], %[tn1] \n\t" 199 "lbux %[st1], %[Temp3](%[cm]) \n\t" 200 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" 201 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" 202 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" 203 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" 204 "extp %[Temp1], $ac1, 31 \n\t" 205 206 "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" 207 "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" 208 "sb %[Temp2], 0(%[dst]) \n\t" 209 "sb %[tn3], 2(%[dst]) \n\t" 210 211 /* even 4. pixel */ 212 "mtlo %[vector4a], $ac2 \n\t" 213 "mthi $zero, $ac2 \n\t" 214 "mtlo %[vector4a], $ac3 \n\t" 215 "mthi $zero, $ac3 \n\t" 216 217 "balign %[tn3], %[tn1], 3 \n\t" 218 "balign %[tn1], %[tn2], 3 \n\t" 219 "balign %[tn2], %[tp2], 3 \n\t" 220 "balign %[tp2], %[tp1], 3 \n\t" 221 222 "lbux %[st0], %[Temp1](%[cm]) \n\t" 223 "lbu %[Temp2], 4(%[dst]) \n\t" 224 "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" 225 226 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" 227 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" 228 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" 229 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 230 "extp %[Temp3], $ac2, 31 \n\t" 231 232 /* odd 1. pixel */ 233 "mtlo %[vector4a], $ac1 \n\t" 234 "mthi $zero, $ac1 \n\t" 235 "sb %[Temp2], 4(%[dst]) \n\t" 236 "preceu.ph.qbr %[p1], %[tp2] \n\t" 237 "preceu.ph.qbl %[p2], %[tp2] \n\t" 238 "preceu.ph.qbr %[p3], %[tn2] \n\t" 239 "preceu.ph.qbl %[p4], %[tn2] \n\t" 240 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 241 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 242 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 243 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 244 "extp %[Temp2], $ac3, 31 \n\t" 245 246 "lbu %[tp1], 6(%[dst]) \n\t" 247 248 /* odd 2. 
pixel */ 249 "mtlo %[vector4a], $ac3 \n\t" 250 "mthi $zero, $ac3 \n\t" 251 "mtlo %[vector4a], $ac2 \n\t" 252 "mthi $zero, $ac2 \n\t" 253 "preceu.ph.qbr %[p1], %[tn1] \n\t" 254 "preceu.ph.qbl %[n1], %[tn1] \n\t" 255 "lbux %[st0], %[Temp3](%[cm]) \n\t" 256 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" 257 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" 258 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" 259 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" 260 "extp %[Temp3], $ac1, 31 \n\t" 261 262 "lbu %[tp2], 1(%[dst]) \n\t" 263 "lbu %[tn2], 3(%[dst]) \n\t" 264 "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" 265 266 /* odd 3. pixel */ 267 "lbux %[st1], %[Temp2](%[cm]) \n\t" 268 "preceu.ph.qbr %[p2], %[tn3] \n\t" 269 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" 270 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" 271 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" 272 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" 273 "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" 274 "extp %[Temp2], $ac3, 31 \n\t" 275 276 "lbu %[tn3], 5(%[dst]) \n\t" 277 278 /* odd 4. pixel */ 279 "sb %[tp2], 1(%[dst]) \n\t" 280 "sb %[tp1], 6(%[dst]) \n\t" 281 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" 282 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" 283 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" 284 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 285 "extp %[Temp1], $ac2, 31 \n\t" 286 287 "lbu %[tn1], 7(%[dst]) \n\t" 288 289 /* clamp */ 290 "lbux %[p4], %[Temp3](%[cm]) \n\t" 291 "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" 292 293 "lbux %[p2], %[Temp2](%[cm]) \n\t" 294 "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" 295 296 "lbux %[n1], %[Temp1](%[cm]) \n\t" 297 "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" 298 299 /* store bytes */ 300 "sb %[tn2], 3(%[dst]) \n\t" 301 "sb %[tn3], 5(%[dst]) \n\t" 302 "sb %[tn1], 7(%[dst]) \n\t" 303 304 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), 305 [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), 306 [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), 307 [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), 308 [Temp2] "=&r"(Temp2), 
[Temp3] "=&r"(Temp3) 309 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), 310 [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), 311 [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), 312 [src] "r"(src)); 313 314 /* Next row... */ 315 src += src_stride; 316 dst += dst_stride; 317 } 318 } 319 320 static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, 321 int32_t src_stride, uint8_t *dst_ptr, 322 int32_t dst_stride, 323 const int16_t *filter_x0, int32_t h, 324 int32_t count) { 325 int32_t y, c; 326 const uint8_t *src; 327 uint8_t *dst; 328 uint8_t *cm = vpx_ff_cropTbl; 329 uint32_t vector_64 = 64; 330 int32_t filter12, filter34, filter56, filter78; 331 int32_t Temp1, Temp2, Temp3; 332 uint32_t qload1, qload2, qload3; 333 uint32_t p1, p2, p3, p4, p5; 334 uint32_t st1, st2, st3; 335 336 filter12 = ((const int32_t *)filter_x0)[0]; 337 filter34 = ((const int32_t *)filter_x0)[1]; 338 filter56 = ((const int32_t *)filter_x0)[2]; 339 filter78 = ((const int32_t *)filter_x0)[3]; 340 341 for (y = h; y--;) { 342 src = src_ptr; 343 dst = dst_ptr; 344 345 /* prefetch data to cache memory */ 346 prefetch_load(src_ptr + src_stride); 347 prefetch_load(src_ptr + src_stride + 32); 348 prefetch_store(dst_ptr + dst_stride); 349 350 for (c = 0; c < count; c++) { 351 __asm__ __volatile__( 352 "ulw %[qload1], 0(%[src]) \n\t" 353 "ulw %[qload2], 4(%[src]) \n\t" 354 355 /* even 1. 
pixel */ 356 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 357 "mthi $zero, $ac1 \n\t" 358 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 359 "mthi $zero, $ac2 \n\t" 360 "preceu.ph.qbr %[p1], %[qload1] \n\t" 361 "preceu.ph.qbl %[p2], %[qload1] \n\t" 362 "preceu.ph.qbr %[p3], %[qload2] \n\t" 363 "preceu.ph.qbl %[p4], %[qload2] \n\t" 364 "ulw %[qload3], 8(%[src]) \n\t" 365 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ 366 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ 367 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ 368 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ 369 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 370 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ 371 372 /* even 2. pixel */ 373 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 374 "mthi $zero, $ac3 \n\t" 375 "preceu.ph.qbr %[p1], %[qload3] \n\t" 376 "preceu.ph.qbl %[p5], %[qload3] \n\t" 377 "ulw %[qload1], 12(%[src]) \n\t" 378 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ 379 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ 380 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ 381 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ 382 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 383 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 384 385 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ 386 387 /* even 3. pixel */ 388 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 389 "mthi $zero, $ac1 \n\t" 390 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ 391 "preceu.ph.qbr %[p2], %[qload1] \n\t" 392 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ 393 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ 394 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ 395 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ 396 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ 397 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 398 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 399 400 /* even 4. 
pixel */ 401 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 402 "mthi $zero, $ac2 \n\t" 403 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ 404 "preceu.ph.qbl %[p3], %[qload1] \n\t" 405 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ 406 "ulw %[qload2], 16(%[src]) \n\t" 407 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ 408 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ 409 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ 410 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ 411 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ 412 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ 413 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 414 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 415 416 /* even 5. pixel */ 417 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 418 "mthi $zero, $ac3 \n\t" 419 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ 420 "preceu.ph.qbr %[p4], %[qload2] \n\t" 421 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ 422 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ 423 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ 424 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ 425 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ 426 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 427 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 428 429 /* even 6. 
pixel */ 430 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 431 "mthi $zero, $ac1 \n\t" 432 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ 433 "preceu.ph.qbl %[p1], %[qload2] \n\t" 434 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ 435 "ulw %[qload3], 20(%[src]) \n\t" 436 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ 437 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ 438 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ 439 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ 440 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ 441 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 442 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 443 444 /* even 7. pixel */ 445 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 446 "mthi $zero, $ac2 \n\t" 447 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ 448 "preceu.ph.qbr %[p5], %[qload3] \n\t" 449 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ 450 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ 451 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ 452 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ 453 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ 454 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ 455 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 456 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 457 458 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ 459 460 /* even 8. 
pixel */ 461 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 462 "mthi $zero, $ac3 \n\t" 463 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ 464 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ 465 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ 466 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ 467 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ 468 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ 469 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 470 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 471 472 /* ODD pixels */ 473 "ulw %[qload1], 1(%[src]) \n\t" 474 "ulw %[qload2], 5(%[src]) \n\t" 475 476 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ 477 478 /* odd 1. pixel */ 479 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 480 "mthi $zero, $ac1 \n\t" 481 "preceu.ph.qbr %[p1], %[qload1] \n\t" 482 "preceu.ph.qbl %[p2], %[qload1] \n\t" 483 "preceu.ph.qbr %[p3], %[qload2] \n\t" 484 "preceu.ph.qbl %[p4], %[qload2] \n\t" 485 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ 486 "ulw %[qload3], 9(%[src]) \n\t" 487 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ 488 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ 489 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ 490 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ 491 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ 492 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 493 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 494 495 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ 496 497 /* odd 2. 
pixel */ 498 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 499 "mthi $zero, $ac2 \n\t" 500 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ 501 "preceu.ph.qbr %[p1], %[qload3] \n\t" 502 "preceu.ph.qbl %[p5], %[qload3] \n\t" 503 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ 504 "ulw %[qload1], 13(%[src]) \n\t" 505 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ 506 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ 507 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ 508 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ 509 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ 510 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 511 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 512 513 /* odd 3. pixel */ 514 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 515 "mthi $zero, $ac3 \n\t" 516 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ 517 "preceu.ph.qbr %[p2], %[qload1] \n\t" 518 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ 519 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ 520 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ 521 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ 522 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ 523 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 524 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 525 526 /* odd 4. 
pixel */ 527 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 528 "mthi $zero, $ac1 \n\t" 529 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ 530 "preceu.ph.qbl %[p3], %[qload1] \n\t" 531 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ 532 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ 533 "ulw %[qload2], 17(%[src]) \n\t" 534 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ 535 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ 536 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ 537 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ 538 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 539 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 540 541 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ 542 543 /* odd 5. pixel */ 544 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 545 "mthi $zero, $ac2 \n\t" 546 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ 547 "preceu.ph.qbr %[p4], %[qload2] \n\t" 548 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ 549 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ 550 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ 551 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ 552 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ 553 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 554 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 555 556 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ 557 558 /* odd 6. 
pixel */ 559 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 560 "mthi $zero, $ac3 \n\t" 561 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ 562 "preceu.ph.qbl %[p1], %[qload2] \n\t" 563 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ 564 "ulw %[qload3], 21(%[src]) \n\t" 565 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ 566 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ 567 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ 568 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ 569 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 570 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 571 572 /* odd 7. pixel */ 573 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 574 "mthi $zero, $ac1 \n\t" 575 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ 576 "preceu.ph.qbr %[p5], %[qload3] \n\t" 577 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ 578 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ 579 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ 580 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ 581 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ 582 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ 583 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 584 585 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ 586 587 /* odd 8. 
pixel */ 588 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ 589 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ 590 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ 591 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ 592 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 593 594 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ 595 596 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 597 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ 598 599 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 600 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ 601 602 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 603 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ 604 605 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ 606 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ 607 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ 608 609 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), 610 [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), 611 [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), 612 [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), 613 [Temp3] "=&r"(Temp3) 614 : [filter12] "r"(filter12), [filter34] "r"(filter34), 615 [filter56] "r"(filter56), [filter78] "r"(filter78), 616 [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), 617 [src] "r"(src)); 618 619 src += 16; 620 dst += 16; 621 } 622 623 /* Next row... 
*/ 624 src_ptr += src_stride; 625 dst_ptr += dst_stride; 626 } 627 } 628 629 static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, 630 int32_t src_stride, uint8_t *dst_ptr, 631 int32_t dst_stride, 632 const int16_t *filter_x0, int32_t h) { 633 int32_t y, c; 634 const uint8_t *src; 635 uint8_t *dst; 636 uint8_t *cm = vpx_ff_cropTbl; 637 uint32_t vector_64 = 64; 638 int32_t filter12, filter34, filter56, filter78; 639 int32_t Temp1, Temp2, Temp3; 640 uint32_t qload1, qload2, qload3; 641 uint32_t p1, p2, p3, p4, p5; 642 uint32_t st1, st2, st3; 643 644 filter12 = ((const int32_t *)filter_x0)[0]; 645 filter34 = ((const int32_t *)filter_x0)[1]; 646 filter56 = ((const int32_t *)filter_x0)[2]; 647 filter78 = ((const int32_t *)filter_x0)[3]; 648 649 for (y = h; y--;) { 650 src = src_ptr; 651 dst = dst_ptr; 652 653 /* prefetch data to cache memory */ 654 prefetch_load(src_ptr + src_stride); 655 prefetch_load(src_ptr + src_stride + 32); 656 prefetch_load(src_ptr + src_stride + 64); 657 prefetch_store(dst_ptr + dst_stride); 658 prefetch_store(dst_ptr + dst_stride + 32); 659 660 for (c = 0; c < 4; c++) { 661 __asm__ __volatile__( 662 "ulw %[qload1], 0(%[src]) \n\t" 663 "ulw %[qload2], 4(%[src]) \n\t" 664 665 /* even 1. pixel */ 666 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 667 "mthi $zero, $ac1 \n\t" 668 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 669 "mthi $zero, $ac2 \n\t" 670 "preceu.ph.qbr %[p1], %[qload1] \n\t" 671 "preceu.ph.qbl %[p2], %[qload1] \n\t" 672 "preceu.ph.qbr %[p3], %[qload2] \n\t" 673 "preceu.ph.qbl %[p4], %[qload2] \n\t" 674 "ulw %[qload3], 8(%[src]) \n\t" 675 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ 676 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ 677 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ 678 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ 679 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 680 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ 681 682 /* even 2. 
pixel */ 683 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 684 "mthi $zero, $ac3 \n\t" 685 "preceu.ph.qbr %[p1], %[qload3] \n\t" 686 "preceu.ph.qbl %[p5], %[qload3] \n\t" 687 "ulw %[qload1], 12(%[src]) \n\t" 688 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ 689 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ 690 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ 691 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ 692 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 693 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 694 695 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ 696 697 /* even 3. pixel */ 698 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 699 "mthi $zero, $ac1 \n\t" 700 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ 701 "preceu.ph.qbr %[p2], %[qload1] \n\t" 702 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ 703 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ 704 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ 705 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ 706 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ 707 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 708 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 709 710 /* even 4. 
pixel */ 711 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 712 "mthi $zero, $ac2 \n\t" 713 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ 714 "preceu.ph.qbl %[p3], %[qload1] \n\t" 715 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ 716 "ulw %[qload2], 16(%[src]) \n\t" 717 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ 718 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ 719 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ 720 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ 721 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ 722 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ 723 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 724 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 725 726 /* even 5. pixel */ 727 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 728 "mthi $zero, $ac3 \n\t" 729 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ 730 "preceu.ph.qbr %[p4], %[qload2] \n\t" 731 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ 732 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ 733 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ 734 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ 735 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ 736 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 737 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 738 739 /* even 6. 
pixel */ 740 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 741 "mthi $zero, $ac1 \n\t" 742 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ 743 "preceu.ph.qbl %[p1], %[qload2] \n\t" 744 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ 745 "ulw %[qload3], 20(%[src]) \n\t" 746 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ 747 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ 748 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ 749 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ 750 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ 751 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 752 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 753 754 /* even 7. pixel */ 755 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 756 "mthi $zero, $ac2 \n\t" 757 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ 758 "preceu.ph.qbr %[p5], %[qload3] \n\t" 759 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ 760 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ 761 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ 762 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ 763 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ 764 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ 765 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 766 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 767 768 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ 769 770 /* even 8. 
pixel */ 771 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 772 "mthi $zero, $ac3 \n\t" 773 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ 774 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ 775 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ 776 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ 777 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ 778 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ 779 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 780 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 781 782 /* ODD pixels */ 783 "ulw %[qload1], 1(%[src]) \n\t" 784 "ulw %[qload2], 5(%[src]) \n\t" 785 786 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ 787 788 /* odd 1. pixel */ 789 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 790 "mthi $zero, $ac1 \n\t" 791 "preceu.ph.qbr %[p1], %[qload1] \n\t" 792 "preceu.ph.qbl %[p2], %[qload1] \n\t" 793 "preceu.ph.qbr %[p3], %[qload2] \n\t" 794 "preceu.ph.qbl %[p4], %[qload2] \n\t" 795 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ 796 "ulw %[qload3], 9(%[src]) \n\t" 797 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ 798 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ 799 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ 800 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ 801 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ 802 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 803 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 804 805 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ 806 807 /* odd 2. 
pixel */ 808 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 809 "mthi $zero, $ac2 \n\t" 810 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ 811 "preceu.ph.qbr %[p1], %[qload3] \n\t" 812 "preceu.ph.qbl %[p5], %[qload3] \n\t" 813 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ 814 "ulw %[qload1], 13(%[src]) \n\t" 815 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ 816 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ 817 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ 818 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ 819 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ 820 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 821 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 822 823 /* odd 3. pixel */ 824 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 825 "mthi $zero, $ac3 \n\t" 826 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ 827 "preceu.ph.qbr %[p2], %[qload1] \n\t" 828 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ 829 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ 830 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ 831 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ 832 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ 833 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 834 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 835 836 /* odd 4. 
pixel */ 837 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 838 "mthi $zero, $ac1 \n\t" 839 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ 840 "preceu.ph.qbl %[p3], %[qload1] \n\t" 841 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ 842 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ 843 "ulw %[qload2], 17(%[src]) \n\t" 844 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ 845 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ 846 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ 847 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ 848 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 849 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 850 851 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ 852 853 /* odd 5. pixel */ 854 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 855 "mthi $zero, $ac2 \n\t" 856 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ 857 "preceu.ph.qbr %[p4], %[qload2] \n\t" 858 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ 859 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ 860 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ 861 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ 862 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ 863 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 864 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 865 866 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ 867 868 /* odd 6. 
pixel */ 869 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 870 "mthi $zero, $ac3 \n\t" 871 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ 872 "preceu.ph.qbl %[p1], %[qload2] \n\t" 873 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ 874 "ulw %[qload3], 21(%[src]) \n\t" 875 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ 876 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ 877 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ 878 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ 879 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 880 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 881 882 /* odd 7. pixel */ 883 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 884 "mthi $zero, $ac1 \n\t" 885 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ 886 "preceu.ph.qbr %[p5], %[qload3] \n\t" 887 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ 888 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ 889 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ 890 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ 891 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ 892 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ 893 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 894 895 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ 896 897 /* odd 8. 
pixel */ 898 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ 899 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ 900 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ 901 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ 902 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 903 904 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ 905 906 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 907 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ 908 909 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 910 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ 911 912 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 913 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ 914 915 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ 916 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ 917 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ 918 919 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), 920 [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), 921 [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), 922 [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), 923 [Temp3] "=&r"(Temp3) 924 : [filter12] "r"(filter12), [filter34] "r"(filter34), 925 [filter56] "r"(filter56), [filter78] "r"(filter78), 926 [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), 927 [src] "r"(src)); 928 929 src += 16; 930 dst += 16; 931 } 932 933 /* Next row... 
*/ 934 src_ptr += src_stride; 935 dst_ptr += dst_stride; 936 } 937 } 938 939 void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, 940 uint8_t *dst, ptrdiff_t dst_stride, 941 const int16_t *filter_x, int x_step_q4, 942 const int16_t *filter_y, int y_step_q4, 943 int w, int h) { 944 assert(x_step_q4 == 16); 945 assert(((const int32_t *)filter_x)[1] != 0x800000); 946 947 if (((const int32_t *)filter_x)[0] == 0) { 948 vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, 949 x_step_q4, filter_y, y_step_q4, w, h); 950 } else { 951 uint32_t pos = 38; 952 953 src -= 3; 954 955 /* bit positon for extract from acc */ 956 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 957 : 958 : [pos] "r"(pos)); 959 960 /* prefetch data to cache memory */ 961 prefetch_load(src); 962 prefetch_load(src + 32); 963 prefetch_store(dst); 964 965 switch (w) { 966 case 4: 967 convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, 968 h); 969 break; 970 case 8: 971 convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, 972 h); 973 break; 974 case 16: 975 convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, 976 h, 1); 977 break; 978 case 32: 979 convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, 980 h, 2); 981 break; 982 case 64: 983 prefetch_load(src + 64); 984 prefetch_store(dst + 32); 985 986 convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, 987 h); 988 break; 989 default: 990 vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, 991 filter_x, x_step_q4, filter_y, y_step_q4, w, 992 h); 993 break; 994 } 995 } 996 } 997 #endif 998