1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "./vpx_config.h" 12 #include "./vpx_dsp_rtcd.h" 13 #include "vpx_dsp/mips/inv_txfm_dspr2.h" 14 #include "vpx_dsp/txfm_common.h" 15 16 #if HAVE_DSPR2 17 void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { 18 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; 19 const int const_2_power_13 = 8192; 20 int Temp0, Temp1, Temp2, Temp3, Temp4; 21 int i; 22 23 for (i = no_rows; i--; ) { 24 __asm__ __volatile__ ( 25 /* 26 temp_1 = (input[0] + input[4]) * cospi_16_64; 27 step2_0 = dct_const_round_shift(temp_1); 28 29 temp_2 = (input[0] - input[4]) * cospi_16_64; 30 step2_1 = dct_const_round_shift(temp_2); 31 */ 32 "lh %[Temp0], 0(%[input]) \n\t" 33 "lh %[Temp1], 8(%[input]) \n\t" 34 "mtlo %[const_2_power_13], $ac0 \n\t" 35 "mthi $zero, $ac0 \n\t" 36 "mtlo %[const_2_power_13], $ac1 \n\t" 37 "mthi $zero, $ac1 \n\t" 38 "add %[Temp2], %[Temp0], %[Temp1] \n\t" 39 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" 40 "extp %[Temp4], $ac0, 31 \n\t" 41 42 "sub %[Temp3], %[Temp0], %[Temp1] \n\t" 43 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" 44 "mtlo %[const_2_power_13], $ac0 \n\t" 45 "mthi $zero, $ac0 \n\t" 46 "extp %[Temp2], $ac1, 31 \n\t" 47 48 /* 49 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; 50 step2_2 = dct_const_round_shift(temp_1); 51 */ 52 "lh %[Temp0], 4(%[input]) \n\t" 53 "lh %[Temp1], 12(%[input]) \n\t" 54 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" 55 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" 56 "mtlo %[const_2_power_13], $ac1 \n\t" 57 "mthi $zero, $ac1 \n\t" 58 "extp %[Temp3], $ac0, 31 \n\t" 59 60 /* 61 step1_1 = step2_1 + step2_2; 62 step1_2 = step2_1 - step2_2; 63 */ 64 "add %[step1_1], %[Temp2], %[Temp3] \n\t" 65 "sub %[step1_2], %[Temp2], %[Temp3] \n\t" 66 67 /* 68 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; 69 step2_3 = dct_const_round_shift(temp_2); 70 */ 71 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" 72 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" 73 "extp %[Temp1], $ac1, 31 \n\t" 74 75 "mtlo %[const_2_power_13], $ac0 \n\t" 76 "mthi $zero, $ac0 \n\t" 77 78 /* 79 step1_0 = step2_0 + step2_3; 80 step1_3 = step2_0 - step2_3; 81 */ 82 "add %[step1_0], %[Temp4], %[Temp1] \n\t" 83 "sub %[step1_3], %[Temp4], %[Temp1] \n\t" 84 85 /* 86 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 87 step1_4 = dct_const_round_shift(temp_1); 88 */ 89 "lh %[Temp0], 2(%[input]) \n\t" 90 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" 91 "mtlo %[const_2_power_13], $ac1 \n\t" 92 "mthi $zero, $ac1 \n\t" 93 "lh %[Temp1], 14(%[input]) \n\t" 94 "lh %[Temp0], 2(%[input]) \n\t" 95 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" 96 "extp %[step1_4], $ac0, 31 \n\t" 97 98 /* 99 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 100 step1_7 = dct_const_round_shift(temp_2); 101 */ 102 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" 103 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" 104 "extp %[step1_7], $ac1, 31 \n\t" 105 106 /* 107 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 108 step1_5 = dct_const_round_shift(temp_1); 109 */ 110 "mtlo %[const_2_power_13], $ac0 \n\t" 111 "mthi $zero, $ac0 \n\t" 112 "lh %[Temp0], 10(%[input]) \n\t" 113 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" 114 "lh %[Temp1], 6(%[input]) \n\t" 115 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" 116 "extp %[step1_5], $ac0, 31 \n\t" 117 118 /* 119 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 120 step1_6 = dct_const_round_shift(temp_2); 121 */ 122 "mtlo %[const_2_power_13], $ac1 \n\t" 123 "mthi $zero, $ac1 \n\t" 124 "lh %[Temp0], 10(%[input]) \n\t" 125 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" 126 "lh %[Temp1], 6(%[input]) \n\t" 127 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" 128 "extp %[step1_6], $ac1, 31 \n\t" 129 130 /* 131 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; 132 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; 133 */ 134 "sub %[Temp0], %[step1_7], %[step1_6] \n\t" 135 "sub %[Temp0], %[Temp0], %[step1_4] \n\t" 136 "add %[Temp0], %[Temp0], %[step1_5] \n\t" 137 "sub %[Temp1], %[step1_4], %[step1_5] \n\t" 138 "sub %[Temp1], %[Temp1], %[step1_6] \n\t" 139 "add %[Temp1], %[Temp1], %[step1_7] \n\t" 140 141 "mtlo %[const_2_power_13], $ac0 \n\t" 142 "mthi $zero, $ac0 \n\t" 143 "mtlo %[const_2_power_13], $ac1 \n\t" 144 "mthi $zero, $ac1 \n\t" 145 146 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" 147 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" 148 149 /* 150 step1_4 = step1_4 + step1_5; 151 step1_7 = step1_6 + step1_7; 152 */ 153 "add %[step1_4], %[step1_4], %[step1_5] \n\t" 154 "add %[step1_7], %[step1_7], %[step1_6] \n\t" 155 156 "extp %[step1_5], $ac0, 31 \n\t" 157 "extp %[step1_6], $ac1, 31 \n\t" 158 159 "add %[Temp0], %[step1_0], %[step1_7] \n\t" 160 "sh %[Temp0], 0(%[output]) \n\t" 161 "add %[Temp1], %[step1_1], %[step1_6] \n\t" 162 "sh %[Temp1], 16(%[output]) \n\t" 163 "add %[Temp0], %[step1_2], %[step1_5] \n\t" 164 "sh %[Temp0], 32(%[output]) \n\t" 165 "add %[Temp1], %[step1_3], %[step1_4] \n\t" 166 "sh %[Temp1], 48(%[output]) \n\t" 167 168 "sub %[Temp0], %[step1_3], %[step1_4] \n\t" 169 "sh %[Temp0], 64(%[output]) \n\t" 170 "sub %[Temp1], %[step1_2], %[step1_5] \n\t" 171 "sh %[Temp1], 80(%[output]) \n\t" 172 "sub %[Temp0], %[step1_1], %[step1_6] \n\t" 173 "sh %[Temp0], 96(%[output]) \n\t" 174 "sub %[Temp1], %[step1_0], %[step1_7] \n\t" 175 "sh %[Temp1], 112(%[output]) \n\t" 176 177 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), 178 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), 179 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), 180 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), 181 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), 182 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 183 [Temp4] "=&r" (Temp4) 184 : [const_2_power_13] "r" (const_2_power_13), 185 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), 186 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), 187 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), 188 [cospi_24_64] "r" (cospi_24_64), 189 [output] "r" (output), [input] "r" (input) 190 ); 191 192 input += 8; 193 output += 1; 194 } 195 } 196 197 void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, 198 int dest_stride) { 199 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; 200 int Temp0, Temp1, Temp2, Temp3; 201 int i; 202 const int const_2_power_13 = 8192; 203 uint8_t *dest_pix; 204 uint8_t *cm = vpx_ff_cropTbl; 205 206 /* prefetch vpx_ff_cropTbl */ 207 prefetch_load(vpx_ff_cropTbl); 208 prefetch_load(vpx_ff_cropTbl + 32); 209 prefetch_load(vpx_ff_cropTbl + 64); 210 prefetch_load(vpx_ff_cropTbl + 96); 211 prefetch_load(vpx_ff_cropTbl + 128); 212 prefetch_load(vpx_ff_cropTbl + 160); 213 prefetch_load(vpx_ff_cropTbl + 192); 214 prefetch_load(vpx_ff_cropTbl + 224); 215 216 for (i = 0; i < 8; ++i) { 217 dest_pix = (dest + i); 218 219 __asm__ __volatile__ ( 220 /* 221 temp_1 = (input[0] + input[4]) * cospi_16_64; 222 step2_0 = dct_const_round_shift(temp_1); 223 224 temp_2 = (input[0] - input[4]) * cospi_16_64; 225 step2_1 = dct_const_round_shift(temp_2); 226 */ 227 "lh %[Temp0], 0(%[input]) \n\t" 228 "lh %[Temp1], 8(%[input]) \n\t" 229 "mtlo %[const_2_power_13], $ac0 \n\t" 230 "mthi $zero, $ac0 \n\t" 231 "mtlo %[const_2_power_13], $ac1 \n\t" 232 "mthi $zero, $ac1 \n\t" 233 "add %[Temp2], %[Temp0], %[Temp1] \n\t" 234 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" 235 "extp %[step1_6], $ac0, 31 \n\t" 236 237 "sub %[Temp3], %[Temp0], %[Temp1] \n\t" 238 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" 239 "mtlo %[const_2_power_13], $ac0 \n\t" 240 "mthi $zero, $ac0 \n\t" 241 "extp %[Temp2], $ac1, 31 \n\t" 242 243 /* 244 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; 245 step2_2 = dct_const_round_shift(temp_1); 246 */ 247 "lh %[Temp0], 4(%[input]) \n\t" 248 "lh %[Temp1], 12(%[input]) \n\t" 249 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" 250 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" 251 "mtlo %[const_2_power_13], $ac1 \n\t" 252 "mthi $zero, $ac1 \n\t" 253 "extp %[Temp3], $ac0, 31 \n\t" 254 255 /* 256 step1_1 = step2_1 + step2_2; 257 step1_2 = step2_1 - step2_2; 258 */ 259 "add %[step1_1], %[Temp2], %[Temp3] \n\t" 260 "sub %[step1_2], %[Temp2], %[Temp3] \n\t" 261 262 /* 263 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; 264 step2_3 = dct_const_round_shift(temp_2); 265 */ 266 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" 267 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" 268 "extp %[Temp1], $ac1, 31 \n\t" 269 270 "mtlo %[const_2_power_13], $ac0 \n\t" 271 "mthi $zero, $ac0 \n\t" 272 273 /* 274 step1_0 = step2_0 + step2_3; 275 step1_3 = step2_0 - step2_3; 276 */ 277 "add %[step1_0], %[step1_6], %[Temp1] \n\t" 278 "sub %[step1_3], %[step1_6], %[Temp1] \n\t" 279 280 /* 281 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 282 step1_4 = dct_const_round_shift(temp_1); 283 */ 284 "lh %[Temp0], 2(%[input]) \n\t" 285 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" 286 "mtlo %[const_2_power_13], $ac1 \n\t" 287 "mthi $zero, $ac1 \n\t" 288 "lh %[Temp1], 14(%[input]) \n\t" 289 "lh %[Temp0], 2(%[input]) \n\t" 290 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" 291 "extp %[step1_4], $ac0, 31 \n\t" 292 293 /* 294 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 295 step1_7 = dct_const_round_shift(temp_2); 296 */ 297 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" 298 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" 299 "extp %[step1_7], $ac1, 31 \n\t" 300 301 /* 302 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 303 step1_5 = dct_const_round_shift(temp_1); 304 */ 305 "mtlo %[const_2_power_13], $ac0 \n\t" 306 "mthi $zero, $ac0 \n\t" 307 "lh %[Temp0], 10(%[input]) \n\t" 308 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" 309 "lh %[Temp1], 6(%[input]) \n\t" 310 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" 311 "extp %[step1_5], $ac0, 31 \n\t" 312 313 /* 314 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 315 step1_6 = dct_const_round_shift(temp_2); 316 */ 317 "mtlo %[const_2_power_13], $ac1 \n\t" 318 "mthi $zero, $ac1 \n\t" 319 "lh %[Temp0], 10(%[input]) \n\t" 320 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" 321 "lh %[Temp1], 6(%[input]) \n\t" 322 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" 323 "extp %[step1_6], $ac1, 31 \n\t" 324 325 /* 326 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; 327 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; 328 */ 329 "sub %[Temp0], %[step1_7], %[step1_6] \n\t" 330 "sub %[Temp0], %[Temp0], %[step1_4] \n\t" 331 "add %[Temp0], %[Temp0], %[step1_5] \n\t" 332 "sub %[Temp1], %[step1_4], %[step1_5] \n\t" 333 "sub %[Temp1], %[Temp1], %[step1_6] \n\t" 334 "add %[Temp1], %[Temp1], %[step1_7] \n\t" 335 336 "mtlo %[const_2_power_13], $ac0 \n\t" 337 "mthi $zero, $ac0 \n\t" 338 "mtlo %[const_2_power_13], $ac1 \n\t" 339 "mthi $zero, $ac1 \n\t" 340 341 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" 342 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" 343 344 /* 345 step1_4 = step1_4 + step1_5; 346 step1_7 = step1_6 + step1_7; 347 */ 348 "add %[step1_4], %[step1_4], %[step1_5] \n\t" 349 "add %[step1_7], %[step1_7], %[step1_6] \n\t" 350 351 "extp %[step1_5], $ac0, 31 \n\t" 352 "extp %[step1_6], $ac1, 31 \n\t" 353 354 /* add block */ 355 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 356 "add %[Temp0], %[step1_0], %[step1_7] \n\t" 357 "addi %[Temp0], %[Temp0], 16 \n\t" 358 "sra %[Temp0], %[Temp0], 5 \n\t" 359 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 360 "add %[Temp0], %[step1_1], %[step1_6] \n\t" 361 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 362 "sb %[Temp2], 0(%[dest_pix]) \n\t" 363 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 364 365 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 366 "addi %[Temp0], %[Temp0], 16 \n\t" 367 "sra %[Temp0], %[Temp0], 5 \n\t" 368 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 369 "add %[Temp0], %[step1_2], %[step1_5] \n\t" 370 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 371 "sb %[Temp2], 0(%[dest_pix]) \n\t" 372 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 373 374 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 375 "addi %[Temp0], %[Temp0], 16 \n\t" 376 "sra %[Temp0], %[Temp0], 5 \n\t" 377 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 378 "add %[Temp0], %[step1_3], %[step1_4] \n\t" 379 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 380 "sb %[Temp2], 0(%[dest_pix]) \n\t" 381 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 382 383 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 384 "addi %[Temp0], %[Temp0], 16 \n\t" 385 "sra %[Temp0], %[Temp0], 5 \n\t" 386 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 387 "sub %[Temp0], %[step1_3], %[step1_4] \n\t" 388 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 389 "sb %[Temp2], 0(%[dest_pix]) \n\t" 390 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 391 392 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 393 "addi %[Temp0], %[Temp0], 16 \n\t" 394 "sra %[Temp0], %[Temp0], 5 \n\t" 395 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 396 "sub %[Temp0], %[step1_2], %[step1_5] \n\t" 397 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 398 "sb %[Temp2], 0(%[dest_pix]) \n\t" 399 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 400 401 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 402 "addi %[Temp0], %[Temp0], 16 \n\t" 403 "sra %[Temp0], %[Temp0], 5 \n\t" 404 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 405 "sub %[Temp0], %[step1_1], %[step1_6] \n\t" 406 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 407 "sb %[Temp2], 0(%[dest_pix]) \n\t" 408 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 409 410 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 411 "addi %[Temp0], %[Temp0], 16 \n\t" 412 "sra %[Temp0], %[Temp0], 5 \n\t" 413 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 414 "sub %[Temp0], %[step1_0], %[step1_7] \n\t" 415 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 416 "sb %[Temp2], 0(%[dest_pix]) \n\t" 417 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" 418 419 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 420 "addi %[Temp0], %[Temp0], 16 \n\t" 421 "sra %[Temp0], %[Temp0], 5 \n\t" 422 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 423 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 424 "sb %[Temp2], 0(%[dest_pix]) \n\t" 425 426 : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), 427 [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), 428 [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), 429 [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), 430 [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), 431 [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), 432 [dest_pix] "+r" (dest_pix) 433 : [const_2_power_13] "r" (const_2_power_13), 434 [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), 435 [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), 436 [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), 437 [cospi_24_64] "r" (cospi_24_64), 438 [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) 439 ); 440 441 input += 8; 442 } 443 } 444 445 void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, 446 int dest_stride) { 447 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); 448 int16_t *outptr = out; 449 uint32_t pos = 45; 450 451 /* bit positon for extract from acc */ 452 __asm__ __volatile__ ( 453 "wrdsp %[pos], 1 \n\t" 454 : 455 : [pos] "r" (pos) 456 ); 457 458 // First transform rows 459 idct8_rows_dspr2(input, outptr, 8); 460 461 // Then transform columns and add to dest 462 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); 463 } 464 465 void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, 466 int dest_stride) { 467 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); 468 int16_t *outptr = out; 469 uint32_t pos = 45; 470 471 /* bit positon for extract from acc */ 472 __asm__ __volatile__ ( 473 "wrdsp %[pos], 1 \n\t" 474 : 475 : [pos] "r" (pos) 476 ); 477 478 // First transform rows 479 idct8_rows_dspr2(input, outptr, 4); 480 481 outptr += 4; 482 483 __asm__ __volatile__ ( 484 "sw $zero, 0(%[outptr]) \n\t" 485 "sw $zero, 4(%[outptr]) \n\t" 486 "sw $zero, 16(%[outptr]) \n\t" 487 "sw $zero, 20(%[outptr]) \n\t" 488 "sw $zero, 32(%[outptr]) \n\t" 489 "sw $zero, 36(%[outptr]) \n\t" 490 "sw $zero, 48(%[outptr]) \n\t" 491 "sw $zero, 52(%[outptr]) \n\t" 492 "sw $zero, 64(%[outptr]) \n\t" 493 "sw $zero, 68(%[outptr]) \n\t" 494 "sw $zero, 80(%[outptr]) \n\t" 495 "sw $zero, 84(%[outptr]) \n\t" 496 "sw $zero, 96(%[outptr]) \n\t" 497 "sw $zero, 100(%[outptr]) \n\t" 498 "sw $zero, 112(%[outptr]) \n\t" 499 "sw $zero, 116(%[outptr]) \n\t" 500 501 : 502 : [outptr] "r" (outptr) 503 ); 504 505 506 // Then transform columns and add to dest 507 idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); 508 } 509 510 void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, 511 int dest_stride) { 512 uint32_t pos = 45; 513 int32_t out; 514 int32_t r; 515 int32_t a1, absa1; 516 int32_t t1, t2, vector_a1, vector_1, vector_2; 517 518 /* bit positon for extract from acc */ 519 __asm__ __volatile__ ( 520 "wrdsp %[pos], 1 \n\t" 521 522 : 523 : [pos] "r" (pos) 524 ); 525 526 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); 527 __asm__ __volatile__ ( 528 "addi %[out], %[out], 16 \n\t" 529 "sra %[a1], %[out], 5 \n\t" 530 531 : [out] "+r" (out), [a1] "=r" (a1) 532 : 533 ); 534 535 if (a1 < 0) { 536 /* use quad-byte 537 * input and output memory are four byte aligned */ 538 __asm__ __volatile__ ( 539 "abs %[absa1], %[a1] \n\t" 540 "replv.qb %[vector_a1], %[absa1] \n\t" 541 542 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) 543 : [a1] "r" (a1) 544 ); 545 546 for (r = 8; r--;) { 547 __asm__ __volatile__ ( 548 "lw %[t1], 0(%[dest]) \n\t" 549 "lw %[t2], 4(%[dest]) \n\t" 550 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 551 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 552 "sw %[vector_1], 0(%[dest]) \n\t" 553 "sw %[vector_2], 4(%[dest]) \n\t" 554 "add %[dest], %[dest], %[dest_stride] \n\t" 555 556 : [t1] "=&r" (t1), [t2] "=&r" (t2), 557 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), 558 [dest] "+&r" (dest) 559 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) 560 ); 561 } 562 } else { 563 /* use quad-byte 564 * input and output memory are four byte aligned */ 565 __asm__ __volatile__ ( 566 "replv.qb %[vector_a1], %[a1] \n\t" 567 568 : [vector_a1] "=r" (vector_a1) 569 : [a1] "r" (a1) 570 ); 571 572 for (r = 8; r--;) { 573 __asm__ __volatile__ ( 574 "lw %[t1], 0(%[dest]) \n\t" 575 "lw %[t2], 4(%[dest]) \n\t" 576 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 577 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 578 "sw %[vector_1], 0(%[dest]) \n\t" 579 "sw %[vector_2], 4(%[dest]) \n\t" 580 "add %[dest], %[dest], %[dest_stride] \n\t" 581 582 : [t1] "=&r" (t1), [t2] "=&r" (t2), 583 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), 584 [dest] "+r" (dest) 585 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) 586 ); 587 } 588 } 589 } 590 591 void iadst8_dspr2(const int16_t *input, int16_t *output) { 592 int s0, s1, s2, s3, s4, s5, s6, s7; 593 int x0, x1, x2, x3, x4, x5, x6, x7; 594 595 x0 = input[7]; 596 x1 = input[0]; 597 x2 = input[5]; 598 x3 = input[2]; 599 x4 = input[3]; 600 x5 = input[4]; 601 x6 = input[1]; 602 x7 = input[6]; 603 604 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 605 output[0] = output[1] = output[2] = output[3] = output[4] 606 = output[5] = output[6] = output[7] = 0; 607 return; 608 } 609 610 // stage 1 611 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 612 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 613 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 614 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 615 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 616 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 617 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 618 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 619 620 x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); 621 x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); 622 x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); 623 x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); 624 x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); 625 x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); 626 x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); 627 x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); 628 629 // stage 2 630 s0 = x0; 631 s1 = x1; 632 s2 = x2; 633 s3 = x3; 634 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 635 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 636 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 637 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 638 639 x0 = s0 + s2; 640 x1 = s1 + s3; 641 x2 = s0 - s2; 642 x3 = s1 - s3; 643 x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); 644 x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); 645 x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); 646 x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); 647 648 // stage 3 649 s2 = cospi_16_64 * (x2 + x3); 650 s3 = cospi_16_64 * (x2 - x3); 651 s6 = cospi_16_64 * (x6 + x7); 652 s7 = cospi_16_64 * (x6 - x7); 653 654 x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); 655 x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); 656 x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); 657 x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); 658 659 output[0] = x0; 660 output[1] = -x4; 661 output[2] = x6; 662 output[3] = -x2; 663 output[4] = x3; 664 output[5] = -x7; 665 output[6] = x5; 666 output[7] = -x1; 667 } 668 #endif // HAVE_DSPR2 669