/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
/* 8-tap horizontal convolution, 4 output pixels per row, with the result
 * rounding-averaged into the existing dst pixels (MIPS DSPr2 inline asm).
 *
 * filter_x0 points at eight int16 taps; they are read as four int32 words,
 * each holding a pair of adjacent taps for the paired-halfword multiplies
 * (dpa.w.ph).  NOTE(review): this assumes the tap array is 4-byte aligned —
 * confirm against the caller.
 *
 * Per row, the asm computes the four outputs in an even/odd interleave:
 * even pixels go to dst[0]/dst[2], odd pixels to dst[1]/dst[3] (see the sb
 * offsets).  `balign ..., 3` builds byte-shifted copies of the loaded words
 * so the odd-phase taps can reuse the same source loads.  Results are
 * rounded via the accumulator seed of 64 (vector4a) plus `extp ..., 31`,
 * then clipped to 0..255 by indexing the crop table cm with lbux, and
 * finally averaged with the previous dst byte using addqh_r.w
 * (add-halve-with-round).  Instruction order is hand-scheduled; do not
 * reorder.
 */
static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp table: lbux(cm + x) clips to uint8 */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64; /* rounding constant, accumulator seed */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  /* Four int32 words, each packing two adjacent 16-bit filter taps. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  /* One output row per iteration. */
  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],      0(%[src])                      \n\t"
        "ulw              %[tp2],      4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac3,        %[p2],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[vector3b]        \n\t"
        "ulw              %[tn2],      8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,        %[p4],      %[vector4b]        \n\t"
        "extp             %[Temp1],    $ac3,       31                 \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
        "balign           %[tn1],      %[tn2],     3                  \n\t"
        "balign           %[tn2],      %[tp2],     3                  \n\t"
        "balign           %[tp2],      %[tp1],     3                  \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac2,        %[p3],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac2,        %[p4],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac2,        %[p1],      %[vector4b]        \n\t"
        "extp             %[Temp3],    $ac2,       31                 \n\t"

        "lbu              %[p2],       3(%[dst])                      \n\t" /* load odd 2 */

        /* odd 1. pixel */
        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t" /* even 1 */
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "lbu              %[Temp1],    1(%[dst])                      \n\t" /* load odd 1 */
        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
        "dpa.w.ph         $ac3,        %[n1],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac3,        %[n2],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac3,        %[n3],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac3,        %[n4],      %[vector4b]        \n\t"
        "extp             %[Temp2],    $ac3,       31                 \n\t"

        "lbu              %[tn2],      0(%[dst])                      \n\t" /* load even 1 */

        /* odd 2. pixel */
        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t" /* even 2 */
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t" /* odd 1 */
        "addqh_r.w        %[tn2],      %[tn2],     %[tp1]             \n\t" /* average even 1 */
        "dpa.w.ph         $ac2,        %[n2],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac2,        %[n3],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac2,        %[n4],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac2,        %[n1],      %[vector4b]        \n\t"
        "extp             %[Temp4],    $ac2,       31                 \n\t"

        "lbu              %[tp1],      2(%[dst])                      \n\t" /* load even 2 */
        "sb               %[tn2],      0(%[dst])                      \n\t" /* store even 1 */

        /* clamp */
        "addqh_r.w        %[Temp1],    %[Temp1],   %[tn1]             \n\t" /* average odd 1 */
        "lbux             %[n2],       %[Temp4](%[cm])                \n\t" /* odd 2 */
        "sb               %[Temp1],    1(%[dst])                      \n\t" /* store odd 1 */

        "addqh_r.w        %[tp1],      %[tp1],     %[tp2]             \n\t" /* average even 2 */
        "sb               %[tp1],      2(%[dst])                      \n\t" /* store even 2 */

        "addqh_r.w        %[p2],       %[p2],      %[n2]              \n\t" /* average odd 2 */
        "sb               %[p2],       3(%[dst])                      \n\t" /* store odd 2 */

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* Same 8-tap horizontal convolve-and-average as the 4-wide variant, but
 * producing 8 output pixels per row.  Even outputs land at dst[0,2,4,6],
 * odd outputs at dst[1,3,5,7].  Uses three accumulators (ac1/ac2/ac3) so
 * the MAC chains of adjacent pixels overlap; the hand-scheduled order is
 * load-bearing — do not reorder.
 */
static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp table (see 4-wide variant) */
  uint32_t vector4a = 64;       /* rounding constant */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  /* Four int32 words, each packing two adjacent 16-bit filter taps. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  /* One output row per iteration. */
  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],      0(%[src])                      \n\t"
        "ulw              %[tp2],      4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
        "ulw              %[tn2],      8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac3,        %[p2],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac3,        %[p4],      %[vector4b]        \n\t"
        "extp             %[Temp1],    $ac3,       31                 \n\t"
        "lbu              %[Temp2],    0(%[dst])                      \n\t"
        "lbu              %[tn3],      2(%[dst])                      \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
        "ulw              %[tn1],      12(%[src])                     \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac2,        %[p3],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac2,        %[p4],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac2,        %[p1],      %[vector4b]        \n\t"
        "extp             %[Temp3],    $ac2,       31                 \n\t"

        /* even 3.
pixel */
        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a], $ac1                           \n\t"
        "mthi             $zero,       $ac1                           \n\t"
        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,        %[p3],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac1,        %[p4],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac1,        %[p1],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac1,        %[n1],      %[vector4b]        \n\t"
        "extp             %[Temp1],    $ac1,       31                 \n\t"

        /* average evens 1 and 2 with dst and store them */
        "addqh_r.w        %[Temp2],    %[Temp2],   %[st0]             \n\t"
        "addqh_r.w        %[tn3],      %[tn3],     %[st1]             \n\t"
        "sb               %[Temp2],    0(%[dst])                      \n\t"
        "sb               %[tn3],      2(%[dst])                      \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"

        /* byte-shifted windows for the odd-phase (src+1) taps */
        "balign           %[tn3],      %[tn1],     3                  \n\t"
        "balign           %[tn1],      %[tn2],     3                  \n\t"
        "balign           %[tn2],      %[tp2],     3                  \n\t"
        "balign           %[tp2],      %[tp1],     3                  \n\t"

        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
        "lbu              %[Temp2],    4(%[dst])                      \n\t"
        "addqh_r.w        %[Temp2],    %[Temp2],   %[st0]             \n\t"

        "dpa.w.ph         $ac2,        %[p4],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac2,        %[p1],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac2,        %[n1],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[vector4b]        \n\t"
        "extp             %[Temp3],    $ac2,       31                 \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac1                           \n\t"
        "mthi             $zero,       $ac1                           \n\t"
        "sb               %[Temp2],    4(%[dst])                      \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac3,        %[p2],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac3,        %[p4],      %[vector4b]        \n\t"
        "extp             %[Temp2],    $ac3,       31                 \n\t"

        "lbu              %[tp1],      6(%[dst])                      \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,        %[p2],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac1,        %[p3],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac1,        %[p4],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac1,        %[p1],      %[vector4b]        \n\t"
        "extp             %[Temp3],    $ac1,       31                 \n\t"

        "lbu              %[tp2],      1(%[dst])                      \n\t"
        "lbu              %[tn2],      3(%[dst])                      \n\t"
        "addqh_r.w        %[tp1],      %[tp1],     %[st0]             \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac3,        %[p4],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac3,        %[n1],      %[vector4b]        \n\t"
        "addqh_r.w        %[tp2],      %[tp2],     %[st1]             \n\t"
        "extp             %[Temp2],    $ac3,       31                 \n\t"

        "lbu              %[tn3],      5(%[dst])                      \n\t"

        /* odd 4. pixel */
        "sb               %[tp2],      1(%[dst])                      \n\t"
        "sb               %[tp1],      6(%[dst])                      \n\t"
        "dpa.w.ph         $ac2,        %[p4],      %[vector1b]        \n\t"
        "dpa.w.ph         $ac2,        %[p1],      %[vector2b]        \n\t"
        "dpa.w.ph         $ac2,        %[n1],      %[vector3b]        \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[vector4b]        \n\t"
        "extp             %[Temp1],    $ac2,       31                 \n\t"

        "lbu              %[tn1],      7(%[dst])                      \n\t"

        /* clamp */
        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
        "addqh_r.w        %[tn2],      %[tn2],     %[p4]              \n\t"

        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
        "addqh_r.w        %[tn3],      %[tn3],     %[p2]              \n\t"

        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
        "addqh_r.w        %[tn1],      %[tn1],     %[n1]              \n\t"

        /* store bytes */
        "sb               %[tn2],      3(%[dst])                      \n\t"
        "sb               %[tn3],      5(%[dst])                      \n\t"
        "sb               %[tn1],      7(%[dst])                      \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* 8-tap horizontal convolve-and-average over 16-pixel-wide chunks: each
 * inner-loop iteration produces 16 output pixels (8 even at dst[0..14],
 * 8 odd at dst[1..15]), repeated `count` times per row, advancing 16 bytes
 * each time.  The odd phase re-reads the source at offset +1 (ulw 1(src)
 * etc.) instead of using balign.  Hand-scheduled across three accumulators;
 * do not reorder.
 */
static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride, uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0, int32_t h,
                                        int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp table (see 4-wide variant) */
  uint32_t vector_64 = 64;      /* rounding constant */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Four int32 words, each packing two adjacent 16-bit filter taps. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  /* One output row per iteration; `count` 16-pixel chunks per row. */
  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1], 0(%[src])                    \n\t"
          "ulw              %[qload2], 4(%[src])                    \n\t"

          /* even 1.
pixel */
          "mtlo             %[vector_64], $ac1                        \n\t" /* even 1 */
          "mthi             $zero,        $ac1                        \n\t"
          "mtlo             %[vector_64], $ac2                        \n\t" /* even 2 */
          "mthi             $zero,        $ac2                        \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                   \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                   \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                   \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                   \n\t"
          "ulw              %[qload3],    8(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p1],     %[filter12]      \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p2],     %[filter34]      \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p3],     %[filter56]      \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],     %[filter78]      \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,      31               \n\t" /* even 1 */
          "lbu              %[st2],       0(%[dst])                   \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                        \n\t" /* even 3 */
          "mthi             $zero,        $ac3                        \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                   \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                   \n\t"
          "ulw              %[qload1],    12(%[src])                  \n\t"
          "dpa.w.ph         $ac2,         %[p2],     %[filter12]      \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p3],     %[filter34]      \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p4],     %[filter56]      \n\t" /* even 1 */
          "dpa.w.ph         $ac2,         %[p1],     %[filter78]      \n\t" /* even 1 */
          "extp             %[Temp2],     $ac2,      31               \n\t" /* even 1 */
          "lbux             %[st1],       %[Temp1](%[cm])             \n\t" /* even 1 */

          "lbu              %[qload3],    2(%[dst])                   \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                        \n\t" /* even 4 */
          "mthi             $zero,        $ac1                        \n\t"
          "addqh_r.w        %[st2],       %[st2],    %[st1]           \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                   \n\t"
          "sb               %[st2],       0(%[dst])                   \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],     %[filter12]      \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p4],     %[filter34]      \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p1],     %[filter56]      \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p5],     %[filter78]      \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,      31               \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])             \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                        \n\t" /* even 5 */
          "mthi             $zero,        $ac2                        \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st2]           \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                   \n\t"
          "sb               %[qload3],    2(%[dst])                   \n\t" /* store even 2 to dst */
          "ulw              %[qload2],    16(%[src])                  \n\t"
          "lbu              %[qload3],    4(%[dst])                   \n\t" /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])                   \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],     %[filter12]      \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p1],     %[filter34]      \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p5],     %[filter56]      \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p2],     %[filter78]      \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,      31               \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])             \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                        \n\t" /* even 6 */
          "mthi             $zero,        $ac3                        \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]           \n\t" /* average even 3 */
          "preceu.ph.qbr    %[p4],        %[qload2]                   \n\t"
          "sb               %[qload3],    4(%[dst])                   \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],     %[filter12]      \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p5],     %[filter34]      \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p2],     %[filter56]      \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p3],     %[filter78]      \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,      31               \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])             \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                        \n\t" /* even 7 */
          "mthi             $zero,        $ac1                        \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]           \n\t" /* average even 4 */
          "preceu.ph.qbl    %[p1],        %[qload2]                   \n\t"
          "sb               %[qload1],    6(%[dst])                   \n\t" /* store even 4 to dst */
          "ulw              %[qload3],    20(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p5],     %[filter12]      \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p2],     %[filter34]      \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p3],     %[filter56]      \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p4],     %[filter78]      \n\t" /* even 6 */
          "lbu              %[qload2],    8(%[dst])                   \n\t" /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,      31               \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])             \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                        \n\t" /* even 8 */
          "mthi             $zero,        $ac2                        \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]           \n\t" /* average even 5 */
          "preceu.ph.qbr    %[p5],        %[qload3]                   \n\t"
          "sb               %[qload2],    8(%[dst])                   \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],     %[filter12]      \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p3],     %[filter34]      \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p4],     %[filter56]      \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p1],     %[filter78]      \n\t" /* even 7 */
          "lbu              %[qload3],    10(%[dst])                  \n\t" /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,      31               \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])             \n\t" /* even 6 */

          "lbu              %[st2],       12(%[dst])                  \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                        \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                        \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st3]           \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],     %[filter12]      \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p4],     %[filter34]      \n\t" /* even 8 */
          "sb               %[qload3],    10(%[dst])                  \n\t" /* store even 6 to dst */
          "dpa.w.ph         $ac2,         %[p1],     %[filter56]      \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p5],     %[filter78]      \n\t" /* even 8 */
          "extp             %[Temp2],     $ac2,      31               \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])             \n\t" /* even 7 */

          /* ODD pixels: re-read the source one byte ahead */
          "ulw              %[qload1],    1(%[src])                   \n\t"
          "ulw              %[qload2],    5(%[src])                   \n\t"

          "addqh_r.w        %[st2],       %[st2],    %[st1]           \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                        \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                        \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                   \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                   \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                   \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                   \n\t"
          "sb               %[st2],       12(%[dst])                  \n\t" /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])                   \n\t"
          "dpa.w.ph         $ac3,         %[p1],     %[filter12]      \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p2],     %[filter34]      \n\t" /* odd 1 */
          "lbu              %[qload2],    14(%[dst])                  \n\t" /* load even 8 from dst */
          "dpa.w.ph         $ac3,         %[p3],     %[filter56]      \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p4],     %[filter78]      \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,      31               \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])             \n\t" /* even 8 */

          "lbu              %[st1],       1(%[dst])                   \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                        \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                        \n\t"
          "addqh_r.w        %[qload2],    %[qload2], %[st2]           \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]                   \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                   \n\t"
          "sb               %[qload2],    14(%[dst])                  \n\t" /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])                  \n\t"
          "dpa.w.ph         $ac1,         %[p2],     %[filter12]      \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p3],     %[filter34]      \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p4],     %[filter56]      \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p1],     %[filter78]      \n\t" /* odd 2 */
          "lbu              %[qload3],    3(%[dst])                   \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,      31               \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])             \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                        \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                        \n\t"
          "addqh_r.w        %[st3],       %[st3],    %[st1]           \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                   \n\t"
          "dpa.w.ph         $ac2,         %[p3],     %[filter12]      \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p4],     %[filter34]      \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p1],     %[filter56]      \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p5],     %[filter78]      \n\t" /* odd 3 */
          "sb               %[st3],       1(%[dst])                   \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,      31               \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])             \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                        \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                        \n\t"
          "addqh_r.w        %[qload3],    %[qload3], %[st1]           \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                   \n\t"
          "sb               %[qload3],    3(%[dst])                   \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])                   \n\t" /* load odd 3 from dst */
          "ulw              %[qload2],    17(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p4],     %[filter12]      \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p1],     %[filter34]      \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p5],     %[filter56]      \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p2],     %[filter78]      \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,      31               \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])             \n\t" /* odd 3 */

          "lbu              %[st1],       7(%[dst])                   \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                        \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                        \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st2]           \n\t" /* average odd 3 */
          "preceu.ph.qbr    %[p4],        %[qload2]                   \n\t"
          "sb               %[qload1],    5(%[dst])                   \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],     %[filter12]      \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p5],     %[filter34]      \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p2],     %[filter56]      \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p3],     %[filter78]      \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,      31               \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])             \n\t" /* odd 4 */

          "lbu              %[qload1],    9(%[dst])                   \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                        \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                        \n\t"
          "addqh_r.w        %[st1],       %[st1],    %[st3]           \n\t" /* average odd 4 */
          "preceu.ph.qbl    %[p1],        %[qload2]                   \n\t"
          "sb               %[st1],       7(%[dst])                   \n\t" /* store odd 4 to dst */
          "ulw              %[qload3],    21(%[src])                  \n\t"
          "dpa.w.ph         $ac2,         %[p5],     %[filter12]      \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p2],     %[filter34]      \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p3],     %[filter56]      \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p4],     %[filter78]      \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,      31               \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])             \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                        \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                        \n\t"
          "addqh_r.w        %[qload1],    %[qload1], %[st1]           \n\t" /* average odd 5 */
          "preceu.ph.qbr    %[p5],        %[qload3]                   \n\t"
          "sb               %[qload1],    9(%[dst])                   \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])                  \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],     %[filter12]      \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p3],     %[filter34]      \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p4],     %[filter56]      \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p1],     %[filter78]      \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,      31               \n\t" /* odd 7 */

          "lbu              %[qload3],    13(%[dst])                  \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],     %[filter12]      \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p4],     %[filter34]      \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p1],     %[filter56]      \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p5],     %[filter78]      \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,      31               \n\t" /* odd 8 */

          "lbu              %[qload1],    15(%[dst])                  \n\t" /* load odd 8 from dst */

          /* clamp remaining odd results and average with dst */
          "lbux             %[st2],       %[Temp2](%[cm])             \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2], %[st2]           \n\t" /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])             \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3], %[st3]           \n\t" /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])             \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1], %[st1]           \n\t" /* average odd 8 */

          /* store bytes */
          "sb               %[qload2],    11(%[dst])                  \n\t" /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])                  \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                  \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      /* advance to the next 16-pixel chunk of this row */
      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* 64-pixel-wide variant: identical per-chunk asm to the 16-wide routine,
 * but hard-wired to four 16-pixel chunks per row and with deeper prefetch
 * (two store prefetch lines, three load prefetch lines).
 */
static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride, uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp table (see 4-wide variant) */
  uint32_t vector_64 = 64;      /* rounding constant */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Four int32 words, each packing two adjacent 16-bit filter taps. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  /* One output row per iteration; four 16-pixel chunks per row. */
  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1], 0(%[src])                    \n\t"
          "ulw              %[qload2], 4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                        \n\t" /* even 1 */
          "mthi             $zero,        $ac1                        \n\t"
          "mtlo             %[vector_64], $ac2                        \n\t" /* even 2 */
          "mthi             $zero,        $ac2                        \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                   \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                   \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                   \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                   \n\t"
          "ulw              %[qload3],    8(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p1],     %[filter12]      \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p2],     %[filter34]      \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p3],     %[filter56]      \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],     %[filter78]      \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,      31               \n\t" /* even 1 */
          "lbu              %[st2],       0(%[dst])                   \n\t" /* load even 1 from dst */

          /* even 2.
pixel */ 684 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 685 "mthi $zero, $ac3 \n\t" 686 "preceu.ph.qbr %[p1], %[qload3] \n\t" 687 "preceu.ph.qbl %[p5], %[qload3] \n\t" 688 "ulw %[qload1], 12(%[src]) \n\t" 689 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ 690 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ 691 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ 692 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ 693 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 694 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 695 696 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ 697 698 /* even 3. pixel */ 699 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 700 "mthi $zero, $ac1 \n\t" 701 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ 702 "preceu.ph.qbr %[p2], %[qload1] \n\t" 703 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ 704 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ 705 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ 706 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ 707 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ 708 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 709 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 710 711 /* even 4. 
pixel */ 712 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 713 "mthi $zero, $ac2 \n\t" 714 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ 715 "preceu.ph.qbl %[p3], %[qload1] \n\t" 716 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ 717 "ulw %[qload2], 16(%[src]) \n\t" 718 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ 719 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ 720 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ 721 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ 722 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ 723 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ 724 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 725 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 726 727 /* even 5. pixel */ 728 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 729 "mthi $zero, $ac3 \n\t" 730 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ 731 "preceu.ph.qbr %[p4], %[qload2] \n\t" 732 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ 733 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ 734 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ 735 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ 736 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ 737 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 738 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 739 740 /* even 6. 
pixel */ 741 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 742 "mthi $zero, $ac1 \n\t" 743 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ 744 "preceu.ph.qbl %[p1], %[qload2] \n\t" 745 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ 746 "ulw %[qload3], 20(%[src]) \n\t" 747 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ 748 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ 749 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ 750 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ 751 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ 752 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 753 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 754 755 /* even 7. pixel */ 756 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 757 "mthi $zero, $ac2 \n\t" 758 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ 759 "preceu.ph.qbr %[p5], %[qload3] \n\t" 760 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ 761 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ 762 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ 763 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ 764 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ 765 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ 766 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 767 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 768 769 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ 770 771 /* even 8. 
pixel */ 772 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 773 "mthi $zero, $ac3 \n\t" 774 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ 775 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ 776 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ 777 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ 778 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ 779 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ 780 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 781 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 782 783 /* ODD pixels */ 784 "ulw %[qload1], 1(%[src]) \n\t" 785 "ulw %[qload2], 5(%[src]) \n\t" 786 787 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ 788 789 /* odd 1. pixel */ 790 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 791 "mthi $zero, $ac1 \n\t" 792 "preceu.ph.qbr %[p1], %[qload1] \n\t" 793 "preceu.ph.qbl %[p2], %[qload1] \n\t" 794 "preceu.ph.qbr %[p3], %[qload2] \n\t" 795 "preceu.ph.qbl %[p4], %[qload2] \n\t" 796 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ 797 "ulw %[qload3], 9(%[src]) \n\t" 798 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ 799 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ 800 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ 801 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ 802 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ 803 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 804 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 805 806 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ 807 808 /* odd 2. 
pixel */ 809 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 810 "mthi $zero, $ac2 \n\t" 811 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ 812 "preceu.ph.qbr %[p1], %[qload3] \n\t" 813 "preceu.ph.qbl %[p5], %[qload3] \n\t" 814 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ 815 "ulw %[qload1], 13(%[src]) \n\t" 816 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ 817 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ 818 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ 819 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ 820 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ 821 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 822 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 823 824 /* odd 3. pixel */ 825 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 826 "mthi $zero, $ac3 \n\t" 827 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ 828 "preceu.ph.qbr %[p2], %[qload1] \n\t" 829 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ 830 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ 831 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ 832 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ 833 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ 834 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 835 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 836 837 /* odd 4. 
pixel */ 838 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 839 "mthi $zero, $ac1 \n\t" 840 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ 841 "preceu.ph.qbl %[p3], %[qload1] \n\t" 842 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ 843 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ 844 "ulw %[qload2], 17(%[src]) \n\t" 845 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ 846 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ 847 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ 848 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ 849 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 850 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 851 852 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ 853 854 /* odd 5. pixel */ 855 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 856 "mthi $zero, $ac2 \n\t" 857 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ 858 "preceu.ph.qbr %[p4], %[qload2] \n\t" 859 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ 860 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ 861 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ 862 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ 863 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ 864 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 865 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 866 867 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ 868 869 /* odd 6. 
pixel */ 870 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 871 "mthi $zero, $ac3 \n\t" 872 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ 873 "preceu.ph.qbl %[p1], %[qload2] \n\t" 874 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ 875 "ulw %[qload3], 21(%[src]) \n\t" 876 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ 877 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ 878 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ 879 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ 880 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 881 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 882 883 /* odd 7. pixel */ 884 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 885 "mthi $zero, $ac1 \n\t" 886 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ 887 "preceu.ph.qbr %[p5], %[qload3] \n\t" 888 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ 889 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ 890 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ 891 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ 892 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ 893 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ 894 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 895 896 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ 897 898 /* odd 8. 
pixel */ 899 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ 900 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ 901 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ 902 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ 903 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 904 905 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ 906 907 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 908 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ 909 910 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 911 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ 912 913 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 914 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ 915 916 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ 917 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ 918 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ 919 920 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), 921 [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), 922 [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), 923 [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), 924 [Temp3] "=&r"(Temp3) 925 : [filter12] "r"(filter12), [filter34] "r"(filter34), 926 [filter56] "r"(filter56), [filter78] "r"(filter78), 927 [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), 928 [src] "r"(src)); 929 930 src += 16; 931 dst += 16; 932 } 933 934 /* Next row... 
*/ 935 src_ptr += src_stride; 936 dst_ptr += dst_stride; 937 } 938 } 939 940 void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, 941 uint8_t *dst, ptrdiff_t dst_stride, 942 const InterpKernel *filter, int x0_q4, 943 int32_t x_step_q4, int y0_q4, int y_step_q4, 944 int w, int h) { 945 const int16_t *const filter_x = filter[x0_q4]; 946 assert(x_step_q4 == 16); 947 assert(((const int32_t *)filter_x)[1] != 0x800000); 948 949 if (vpx_get_filter_taps(filter_x) == 2) { 950 vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, 951 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); 952 } else { 953 uint32_t pos = 38; 954 955 src -= 3; 956 957 /* bit positon for extract from acc */ 958 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 959 : 960 : [pos] "r"(pos)); 961 962 /* prefetch data to cache memory */ 963 prefetch_load(src); 964 prefetch_load(src + 32); 965 prefetch_store(dst); 966 967 switch (w) { 968 case 4: 969 convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, 970 h); 971 break; 972 case 8: 973 convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, 974 h); 975 break; 976 case 16: 977 convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, 978 h, 1); 979 break; 980 case 32: 981 convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, 982 h, 2); 983 break; 984 case 64: 985 prefetch_load(src + 64); 986 prefetch_store(dst + 32); 987 988 convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, 989 h); 990 break; 991 default: 992 vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter, 993 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); 994 break; 995 } 996 } 997 } 998 #endif 999