1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "./vpx_config.h" 12 #include "./vpx_dsp_rtcd.h" 13 #include "vpx_dsp/mips/inv_txfm_dspr2.h" 14 #include "vpx_dsp/txfm_common.h" 15 16 #if HAVE_DSPR2 17 void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { 18 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; 19 const int const_2_power_13 = 8192; 20 int Temp0, Temp1, Temp2, Temp3, Temp4; 21 int i; 22 23 for (i = no_rows; i--;) { 24 __asm__ __volatile__( 25 /* 26 temp_1 = (input[0] + input[4]) * cospi_16_64; 27 step2_0 = dct_const_round_shift(temp_1); 28 29 temp_2 = (input[0] - input[4]) * cospi_16_64; 30 step2_1 = dct_const_round_shift(temp_2); 31 */ 32 "lh %[Temp0], 0(%[input]) \n\t" 33 "lh %[Temp1], 8(%[input]) \n\t" 34 "mtlo %[const_2_power_13], $ac0 \n\t" 35 "mthi $zero, $ac0 \n\t" 36 "mtlo %[const_2_power_13], $ac1 \n\t" 37 "mthi $zero, $ac1 \n\t" 38 "add %[Temp2], %[Temp0], %[Temp1] \n\t" 39 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" 40 "extp %[Temp4], $ac0, 31 \n\t" 41 42 "sub %[Temp3], %[Temp0], %[Temp1] \n\t" 43 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" 44 "mtlo %[const_2_power_13], $ac0 \n\t" 45 "mthi $zero, $ac0 \n\t" 46 "extp %[Temp2], $ac1, 31 \n\t" 47 48 /* 49 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; 50 step2_2 = dct_const_round_shift(temp_1); 51 */ 52 "lh %[Temp0], 4(%[input]) \n\t" 53 "lh %[Temp1], 12(%[input]) \n\t" 54 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" 55 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" 56 "mtlo %[const_2_power_13], $ac1 \n\t" 57 "mthi $zero, $ac1 \n\t" 58 "extp %[Temp3], $ac0, 31 \n\t" 59 60 /* 61 step1_1 = step2_1 + step2_2; 62 step1_2 = step2_1 - step2_2; 63 */ 64 "add %[step1_1], %[Temp2], %[Temp3] \n\t" 65 "sub %[step1_2], %[Temp2], %[Temp3] \n\t" 66 67 /* 68 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; 69 step2_3 = dct_const_round_shift(temp_2); 70 */ 71 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" 72 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" 73 "extp %[Temp1], $ac1, 31 \n\t" 74 75 "mtlo %[const_2_power_13], $ac0 \n\t" 76 "mthi $zero, $ac0 \n\t" 77 78 /* 79 step1_0 = step2_0 + step2_3; 80 step1_3 = step2_0 - step2_3; 81 */ 82 "add %[step1_0], %[Temp4], %[Temp1] \n\t" 83 "sub %[step1_3], %[Temp4], %[Temp1] \n\t" 84 85 /* 86 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 87 step1_4 = dct_const_round_shift(temp_1); 88 */ 89 "lh %[Temp0], 2(%[input]) \n\t" 90 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" 91 "mtlo %[const_2_power_13], $ac1 \n\t" 92 "mthi $zero, $ac1 \n\t" 93 "lh %[Temp1], 14(%[input]) \n\t" 94 "lh %[Temp0], 2(%[input]) \n\t" 95 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" 96 "extp %[step1_4], $ac0, 31 \n\t" 97 98 /* 99 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 100 step1_7 = dct_const_round_shift(temp_2); 101 */ 102 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" 103 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" 104 "extp %[step1_7], $ac1, 31 \n\t" 105 106 /* 107 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 108 step1_5 = dct_const_round_shift(temp_1); 109 */ 110 "mtlo %[const_2_power_13], $ac0 \n\t" 111 "mthi $zero, $ac0 \n\t" 112 "lh %[Temp0], 10(%[input]) \n\t" 113 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" 114 "lh %[Temp1], 6(%[input]) \n\t" 115 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" 116 "extp %[step1_5], $ac0, 31 \n\t" 117 118 /* 119 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 120 step1_6 = dct_const_round_shift(temp_2); 121 */ 122 "mtlo %[const_2_power_13], $ac1 \n\t" 123 "mthi $zero, $ac1 \n\t" 124 "lh %[Temp0], 10(%[input]) \n\t" 125 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" 126 "lh %[Temp1], 6(%[input]) \n\t" 127 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" 128 "extp %[step1_6], $ac1, 31 \n\t" 129 130 /* 131 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; 132 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; 133 */ 134 "sub %[Temp0], %[step1_7], %[step1_6] \n\t" 135 "sub %[Temp0], %[Temp0], %[step1_4] \n\t" 136 "add %[Temp0], %[Temp0], %[step1_5] \n\t" 137 "sub %[Temp1], %[step1_4], %[step1_5] \n\t" 138 "sub %[Temp1], %[Temp1], %[step1_6] \n\t" 139 "add %[Temp1], %[Temp1], %[step1_7] \n\t" 140 141 "mtlo %[const_2_power_13], $ac0 \n\t" 142 "mthi $zero, $ac0 \n\t" 143 "mtlo %[const_2_power_13], $ac1 \n\t" 144 "mthi $zero, $ac1 \n\t" 145 146 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" 147 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" 148 149 /* 150 step1_4 = step1_4 + step1_5; 151 step1_7 = step1_6 + step1_7; 152 */ 153 "add %[step1_4], %[step1_4], %[step1_5] \n\t" 154 "add %[step1_7], %[step1_7], %[step1_6] \n\t" 155 156 "extp %[step1_5], $ac0, 31 \n\t" 157 "extp %[step1_6], $ac1, 31 \n\t" 158 159 "add %[Temp0], %[step1_0], %[step1_7] \n\t" 160 "sh %[Temp0], 0(%[output]) \n\t" 161 "add %[Temp1], %[step1_1], %[step1_6] \n\t" 162 "sh %[Temp1], 16(%[output]) \n\t" 163 "add %[Temp0], %[step1_2], %[step1_5] \n\t" 164 "sh %[Temp0], 32(%[output]) \n\t" 165 "add %[Temp1], %[step1_3], %[step1_4] \n\t" 166 "sh %[Temp1], 48(%[output]) \n\t" 167 168 "sub %[Temp0], %[step1_3], %[step1_4] \n\t" 169 "sh %[Temp0], 64(%[output]) \n\t" 170 "sub %[Temp1], %[step1_2], %[step1_5] \n\t" 171 "sh %[Temp1], 80(%[output]) \n\t" 172 "sub %[Temp0], %[step1_1], %[step1_6] \n\t" 173 "sh %[Temp0], 96(%[output]) \n\t" 174 "sub %[Temp1], %[step1_0], %[step1_7] \n\t" 175 "sh %[Temp1], 112(%[output]) \n\t" 176 177 : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), 178 [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), 179 [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), 180 [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), 181 [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), 182 [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) 183 : [const_2_power_13] "r"(const_2_power_13), 184 [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), 185 [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), 186 [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), 187 [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), 188 [input] "r"(input)); 189 190 input += 8; 191 output += 1; 192 } 193 } 194 195 void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { 196 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; 197 int Temp0, Temp1, Temp2, Temp3; 198 int i; 199 const int const_2_power_13 = 8192; 200 const int const_255 = 255; 201 uint8_t *dest_pix; 202 203 for (i = 0; i < 8; ++i) { 204 dest_pix = (dest + i); 205 206 __asm__ __volatile__( 207 /* 208 temp_1 = (input[0] + input[4]) * cospi_16_64; 209 step2_0 = dct_const_round_shift(temp_1); 210 211 temp_2 = (input[0] - input[4]) * cospi_16_64; 212 step2_1 = dct_const_round_shift(temp_2); 213 */ 214 "lh %[Temp0], 0(%[input]) \n\t" 215 "lh %[Temp1], 8(%[input]) \n\t" 216 "mtlo %[const_2_power_13], $ac0 \n\t" 217 "mthi $zero, $ac0 \n\t" 218 "mtlo %[const_2_power_13], $ac1 \n\t" 219 "mthi $zero, $ac1 \n\t" 220 "add %[Temp2], %[Temp0], %[Temp1] \n\t" 221 "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" 222 "extp %[step1_6], $ac0, 31 \n\t" 223 224 "sub %[Temp3], %[Temp0], %[Temp1] \n\t" 225 "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" 226 "mtlo %[const_2_power_13], $ac0 \n\t" 227 "mthi $zero, $ac0 \n\t" 228 "extp %[Temp2], $ac1, 31 \n\t" 229 230 /* 231 temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; 232 step2_2 = dct_const_round_shift(temp_1); 233 */ 234 "lh %[Temp0], 4(%[input]) \n\t" 235 "lh %[Temp1], 12(%[input]) \n\t" 236 "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" 237 "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" 238 "mtlo %[const_2_power_13], $ac1 \n\t" 239 "mthi $zero, $ac1 \n\t" 240 "extp %[Temp3], $ac0, 31 \n\t" 241 242 /* 243 step1_1 = step2_1 + step2_2; 244 step1_2 = step2_1 - step2_2; 245 */ 246 "add %[step1_1], %[Temp2], %[Temp3] \n\t" 247 "sub %[step1_2], %[Temp2], %[Temp3] \n\t" 248 249 /* 250 temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; 251 step2_3 = dct_const_round_shift(temp_2); 252 */ 253 "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" 254 "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" 255 "extp %[Temp1], $ac1, 31 \n\t" 256 257 "mtlo %[const_2_power_13], $ac0 \n\t" 258 "mthi $zero, $ac0 \n\t" 259 260 /* 261 step1_0 = step2_0 + step2_3; 262 step1_3 = step2_0 - step2_3; 263 */ 264 "add %[step1_0], %[step1_6], %[Temp1] \n\t" 265 "sub %[step1_3], %[step1_6], %[Temp1] \n\t" 266 267 /* 268 temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 269 step1_4 = dct_const_round_shift(temp_1); 270 */ 271 "lh %[Temp0], 2(%[input]) \n\t" 272 "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" 273 "mtlo %[const_2_power_13], $ac1 \n\t" 274 "mthi $zero, $ac1 \n\t" 275 "lh %[Temp1], 14(%[input]) \n\t" 276 "lh %[Temp0], 2(%[input]) \n\t" 277 "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" 278 "extp %[step1_4], $ac0, 31 \n\t" 279 280 /* 281 temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 282 step1_7 = dct_const_round_shift(temp_2); 283 */ 284 "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" 285 "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" 286 "extp %[step1_7], $ac1, 31 \n\t" 287 288 /* 289 temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 290 step1_5 = dct_const_round_shift(temp_1); 291 */ 292 "mtlo %[const_2_power_13], $ac0 \n\t" 293 "mthi $zero, $ac0 \n\t" 294 "lh %[Temp0], 10(%[input]) \n\t" 295 "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" 296 "lh %[Temp1], 6(%[input]) \n\t" 297 "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" 298 "extp %[step1_5], $ac0, 31 \n\t" 299 300 /* 301 temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 302 step1_6 = dct_const_round_shift(temp_2); 303 */ 304 "mtlo %[const_2_power_13], $ac1 \n\t" 305 "mthi $zero, $ac1 \n\t" 306 "lh %[Temp0], 10(%[input]) \n\t" 307 "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" 308 "lh %[Temp1], 6(%[input]) \n\t" 309 "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" 310 "extp %[step1_6], $ac1, 31 \n\t" 311 312 /* 313 temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; 314 temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; 315 */ 316 "sub %[Temp0], %[step1_7], %[step1_6] \n\t" 317 "sub %[Temp0], %[Temp0], %[step1_4] \n\t" 318 "add %[Temp0], %[Temp0], %[step1_5] \n\t" 319 "sub %[Temp1], %[step1_4], %[step1_5] \n\t" 320 "sub %[Temp1], %[Temp1], %[step1_6] \n\t" 321 "add %[Temp1], %[Temp1], %[step1_7] \n\t" 322 323 "mtlo %[const_2_power_13], $ac0 \n\t" 324 "mthi $zero, $ac0 \n\t" 325 "mtlo %[const_2_power_13], $ac1 \n\t" 326 "mthi $zero, $ac1 \n\t" 327 328 "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" 329 "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" 330 331 /* 332 step1_4 = step1_4 + step1_5; 333 step1_7 = step1_6 + step1_7; 334 */ 335 "add %[step1_4], %[step1_4], %[step1_5] \n\t" 336 "add %[step1_7], %[step1_7], %[step1_6] \n\t" 337 338 "extp %[step1_5], $ac0, 31 \n\t" 339 "extp %[step1_6], $ac1, 31 \n\t" 340 341 /* add block */ 342 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 343 "add %[Temp0], %[step1_0], %[step1_7] \n\t" 344 "addi %[Temp0], %[Temp0], 16 \n\t" 345 "sra %[Temp0], %[Temp0], 5 \n\t" 346 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 347 "add %[Temp0], %[step1_1], %[step1_6] \n\t" 348 "slt %[Temp2], %[Temp1], %[const_255] \n\t" 349 "slt %[Temp3], $zero, %[Temp1] \n\t" 350 "movz %[Temp1], %[const_255], %[Temp2] \n\t" 351 "movz %[Temp1], $zero, %[Temp3] \n\t" 352 "sb %[Temp1], 0(%[dest_pix]) \n\t" 353 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 354 355 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 356 "addi %[Temp0], %[Temp0], 16 \n\t" 357 "sra %[Temp0], %[Temp0], 5 \n\t" 358 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 359 "add %[Temp0], %[step1_2], %[step1_5] \n\t" 360 "slt %[Temp2], %[Temp1], %[const_255] \n\t" 361 "slt %[Temp3], $zero, %[Temp1] \n\t" 362 "movz %[Temp1], %[const_255], %[Temp2] \n\t" 363 "movz %[Temp1], $zero, %[Temp3] \n\t" 364 "sb %[Temp1], 0(%[dest_pix]) \n\t" 365 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 366 367 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 368 "addi %[Temp0], %[Temp0], 16 \n\t" 369 "sra %[Temp0], %[Temp0], 5 \n\t" 370 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 371 "add %[Temp0], %[step1_3], %[step1_4] \n\t" 372 "slt %[Temp2], %[Temp1], %[const_255] \n\t" 373 "slt %[Temp3], $zero, %[Temp1] \n\t" 374 "movz %[Temp1], %[const_255], %[Temp2] \n\t" 375 "movz %[Temp1], $zero, %[Temp3] \n\t" 376 "sb %[Temp1], 0(%[dest_pix]) \n\t" 377 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 378 379 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 380 "addi %[Temp0], %[Temp0], 16 \n\t" 381 "sra %[Temp0], %[Temp0], 5 \n\t" 382 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 383 "sub %[Temp0], %[step1_3], %[step1_4] \n\t" 384 "slt %[Temp2], %[Temp1], %[const_255] \n\t" 385 "slt %[Temp3], $zero, %[Temp1] \n\t" 386 "movz %[Temp1], %[const_255], %[Temp2] \n\t" 387 "movz %[Temp1], $zero, %[Temp3] \n\t" 388 "sb %[Temp1], 0(%[dest_pix]) \n\t" 389 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 390 391 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 392 "addi %[Temp0], %[Temp0], 16 \n\t" 393 "sra %[Temp0], %[Temp0], 5 \n\t" 394 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 395 "sub %[Temp0], %[step1_2], %[step1_5] \n\t" 396 "slt %[Temp2], %[Temp1], %[const_255] \n\t" 397 "slt %[Temp3], $zero, %[Temp1] \n\t" 398 "movz %[Temp1], %[const_255], %[Temp2] \n\t" 399 "movz %[Temp1], $zero, %[Temp3] \n\t" 400 "sb %[Temp1], 0(%[dest_pix]) \n\t" 401 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 402 403 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 404 "addi %[Temp0], %[Temp0], 16 \n\t" 405 "sra %[Temp0], %[Temp0], 5 \n\t" 406 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 407 "sub %[Temp0], %[step1_1], %[step1_6] \n\t" 408 "slt %[Temp2], %[Temp1], %[const_255] \n\t" 409 "slt %[Temp3], $zero, %[Temp1] \n\t" 410 "movz %[Temp1], %[const_255], %[Temp2] \n\t" 411 "movz %[Temp1], $zero, %[Temp3] \n\t" 412 "sb %[Temp1], 0(%[dest_pix]) \n\t" 413 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 414 415 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 416 "addi %[Temp0], %[Temp0], 16 \n\t" 417 "sra %[Temp0], %[Temp0], 5 \n\t" 418 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 419 "sub %[Temp0], %[step1_0], %[step1_7] \n\t" 420 "slt %[Temp2], %[Temp1], %[const_255] \n\t" 421 "slt %[Temp3], $zero, %[Temp1] \n\t" 422 "movz %[Temp1], %[const_255], %[Temp2] \n\t" 423 "movz %[Temp1], $zero, %[Temp3] \n\t" 424 "sb %[Temp1], 0(%[dest_pix]) \n\t" 425 "addu %[dest_pix], %[dest_pix], %[stride] \n\t" 426 427 "lbu %[Temp1], 0(%[dest_pix]) \n\t" 428 "addi %[Temp0], %[Temp0], 16 \n\t" 429 "sra %[Temp0], %[Temp0], 5 \n\t" 430 "add %[Temp1], %[Temp1], %[Temp0] \n\t" 431 "slt %[Temp2], %[Temp1], %[const_255] \n\t" 432 "slt %[Temp3], $zero, %[Temp1] \n\t" 433 "movz %[Temp1], %[const_255], %[Temp2] \n\t" 434 "movz %[Temp1], $zero, %[Temp3] \n\t" 435 "sb %[Temp1], 0(%[dest_pix]) \n\t" 436 437 : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), 438 [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), 439 [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), 440 [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), 441 [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), 442 [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) 443 : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), 444 [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), 445 [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), 446 [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), 447 [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), 448 [stride] "r"(stride)); 449 450 input += 8; 451 } 452 } 453 454 void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { 455 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); 456 int16_t *outptr = out; 457 uint32_t pos = 45; 458 459 /* bit positon for extract from acc */ 460 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); 461 462 // First transform rows 463 idct8_rows_dspr2(input, outptr, 8); 464 465 // Then transform columns and add to dest 466 idct8_columns_add_blk_dspr2(&out[0], dest, stride); 467 } 468 469 void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { 470 DECLARE_ALIGNED(32, int16_t, out[8 * 8]); 471 int16_t *outptr = out; 472 uint32_t pos = 45; 473 474 /* bit positon for extract from acc */ 475 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); 476 477 // First transform rows 478 idct8_rows_dspr2(input, outptr, 4); 479 480 outptr += 4; 481 482 __asm__ __volatile__( 483 "sw $zero, 0(%[outptr]) \n\t" 484 "sw $zero, 4(%[outptr]) \n\t" 485 "sw $zero, 16(%[outptr]) \n\t" 486 "sw $zero, 20(%[outptr]) \n\t" 487 "sw $zero, 32(%[outptr]) \n\t" 488 "sw $zero, 36(%[outptr]) \n\t" 489 "sw $zero, 48(%[outptr]) \n\t" 490 "sw $zero, 52(%[outptr]) \n\t" 491 "sw $zero, 64(%[outptr]) \n\t" 492 "sw $zero, 68(%[outptr]) \n\t" 493 "sw $zero, 80(%[outptr]) \n\t" 494 "sw $zero, 84(%[outptr]) \n\t" 495 "sw $zero, 96(%[outptr]) \n\t" 496 "sw $zero, 100(%[outptr]) \n\t" 497 "sw $zero, 112(%[outptr]) \n\t" 498 "sw $zero, 116(%[outptr]) \n\t" 499 500 : 501 : [outptr] "r"(outptr)); 502 503 // Then transform columns and add to dest 504 idct8_columns_add_blk_dspr2(&out[0], dest, stride); 505 } 506 507 void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { 508 uint32_t pos = 45; 509 int32_t out; 510 int32_t r; 511 int32_t a1, absa1; 512 int32_t t1, t2, vector_a1, vector_1, vector_2; 513 514 /* bit positon for extract from acc */ 515 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 516 517 : 518 : [pos] "r"(pos)); 519 520 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); 521 __asm__ __volatile__( 522 "addi %[out], %[out], 16 \n\t" 523 "sra %[a1], %[out], 5 \n\t" 524 525 : [out] "+r"(out), [a1] "=r"(a1) 526 :); 527 528 if (a1 < 0) { 529 /* use quad-byte 530 * input and output memory are four byte aligned */ 531 __asm__ __volatile__( 532 "abs %[absa1], %[a1] \n\t" 533 "replv.qb %[vector_a1], %[absa1] \n\t" 534 535 : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) 536 : [a1] "r"(a1)); 537 538 for (r = 8; r--;) { 539 __asm__ __volatile__( 540 "lw %[t1], 0(%[dest]) \n\t" 541 "lw %[t2], 4(%[dest]) \n\t" 542 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 543 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 544 "sw %[vector_1], 0(%[dest]) \n\t" 545 "sw %[vector_2], 4(%[dest]) \n\t" 546 "add %[dest], %[dest], %[stride] \n\t" 547 548 : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), 549 [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) 550 : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); 551 } 552 } else if (a1 > 255) { 553 int32_t a11, a12, vector_a11, vector_a12; 554 555 /* use quad-byte 556 * input and output memory are four byte aligned */ 557 a11 = a1 >> 2; 558 a12 = a1 - (a11 * 3); 559 560 __asm__ __volatile__( 561 "replv.qb %[vector_a11], %[a11] \n\t" 562 "replv.qb %[vector_a12], %[a12] \n\t" 563 564 : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) 565 : [a11] "r"(a11), [a12] "r"(a12)); 566 567 for (r = 8; r--;) { 568 __asm__ __volatile__( 569 "lw %[t1], 0(%[dest]) \n\t" 570 "lw %[t2], 4(%[dest]) \n\t" 571 "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" 572 "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" 573 "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" 574 "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" 575 "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" 576 "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" 577 "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" 578 "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" 579 "sw %[vector_1], 0(%[dest]) \n\t" 580 "sw %[vector_2], 4(%[dest]) \n\t" 581 "add %[dest], %[dest], %[stride] \n\t" 582 583 : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), 584 [vector_2] "=&r"(vector_2), [dest] "+r"(dest) 585 : [stride] "r"(stride), [vector_a11] "r"(vector_a11), 586 [vector_a12] "r"(vector_a12)); 587 } 588 } else { 589 /* use quad-byte 590 * input and output memory are four byte aligned */ 591 __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" 592 593 : [vector_a1] "=r"(vector_a1) 594 : [a1] "r"(a1)); 595 596 for (r = 8; r--;) { 597 __asm__ __volatile__( 598 "lw %[t1], 0(%[dest]) \n\t" 599 "lw %[t2], 4(%[dest]) \n\t" 600 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 601 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 602 "sw %[vector_1], 0(%[dest]) \n\t" 603 "sw %[vector_2], 4(%[dest]) \n\t" 604 "add %[dest], %[dest], %[stride] \n\t" 605 606 : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), 607 [vector_2] "=&r"(vector_2), [dest] "+r"(dest) 608 : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); 609 } 610 } 611 } 612 613 void iadst8_dspr2(const int16_t *input, int16_t *output) { 614 int s0, s1, s2, s3, s4, s5, s6, s7; 615 int x0, x1, x2, x3, x4, x5, x6, x7; 616 617 x0 = input[7]; 618 x1 = input[0]; 619 x2 = input[5]; 620 x3 = input[2]; 621 x4 = input[3]; 622 x5 = input[4]; 623 x6 = input[1]; 624 x7 = input[6]; 625 626 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 627 output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = 628 output[6] = output[7] = 0; 629 return; 630 } 631 632 // stage 1 633 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 634 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 635 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 636 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 637 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 638 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 639 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 640 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 641 642 x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); 643 x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); 644 x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); 645 x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); 646 x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); 647 x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); 648 x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); 649 x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); 650 651 // stage 2 652 s0 = x0; 653 s1 = x1; 654 s2 = x2; 655 s3 = x3; 656 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 657 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 658 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 659 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 660 661 x0 = s0 + s2; 662 x1 = s1 + s3; 663 x2 = s0 - s2; 664 x3 = s1 - s3; 665 x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); 666 x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); 667 x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); 668 x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); 669 670 // stage 3 671 s2 = cospi_16_64 * (x2 + x3); 672 s3 = cospi_16_64 * (x2 - x3); 673 s6 = cospi_16_64 * (x6 + x7); 674 s7 = cospi_16_64 * (x6 - x7); 675 676 x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); 677 x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); 678 x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); 679 x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); 680 681 output[0] = x0; 682 output[1] = -x4; 683 output[2] = x6; 684 output[3] = -x2; 685 output[4] = x3; 686 output[5] = -x7; 687 output[6] = x5; 688 output[7] = -x1; 689 } 690 #endif // HAVE_DSPR2 691