1 /* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include <stdio.h> 13 14 #include "./vpx_config.h" 15 #include "vpx_dsp/mips/inv_txfm_dspr2.h" 16 #include "vpx_dsp/txfm_common.h" 17 18 #if HAVE_DSPR2 19 static void idct32_rows_dspr2(const int16_t *input, int16_t *output, 20 uint32_t no_rows) { 21 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; 22 int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; 23 int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; 24 int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; 25 int step1_28, step1_29, step1_30, step1_31; 26 int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; 27 int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; 28 int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; 29 int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; 30 int step2_28, step2_29, step2_30, step2_31; 31 int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; 32 int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; 33 int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; 34 int step3_29, step3_30, step3_31; 35 int temp0, temp1, temp2, temp3; 36 int load1, load2, load3, load4; 37 int result1, result2; 38 int i; 39 const int const_2_power_13 = 8192; 40 const int32_t *input_int; 41 42 for (i = no_rows; i--;) { 43 input_int = (const int32_t *)input; 44 45 if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | 46 input_int[4] | input_int[5] | input_int[6] | input_int[7] | 47 input_int[8] | input_int[9] | input_int[10] | input_int[11] | 48 input_int[12] | input_int[13] | input_int[14] | input_int[15])) { 49 input += 32; 50 51 __asm__ __volatile__( 52 "sh $zero, 0(%[output]) \n\t" 53 "sh $zero, 64(%[output]) \n\t" 54 "sh $zero, 128(%[output]) \n\t" 55 "sh $zero, 192(%[output]) \n\t" 56 "sh $zero, 256(%[output]) \n\t" 57 "sh $zero, 320(%[output]) \n\t" 58 "sh $zero, 384(%[output]) \n\t" 59 "sh $zero, 448(%[output]) \n\t" 60 "sh $zero, 512(%[output]) \n\t" 61 "sh $zero, 576(%[output]) \n\t" 62 "sh $zero, 640(%[output]) \n\t" 63 "sh $zero, 704(%[output]) \n\t" 64 "sh $zero, 768(%[output]) \n\t" 65 "sh $zero, 832(%[output]) \n\t" 66 "sh $zero, 896(%[output]) \n\t" 67 "sh $zero, 960(%[output]) \n\t" 68 "sh $zero, 1024(%[output]) \n\t" 69 "sh $zero, 1088(%[output]) \n\t" 70 "sh $zero, 1152(%[output]) \n\t" 71 "sh $zero, 1216(%[output]) \n\t" 72 "sh $zero, 1280(%[output]) \n\t" 73 "sh $zero, 1344(%[output]) \n\t" 74 "sh $zero, 1408(%[output]) \n\t" 75 "sh $zero, 1472(%[output]) \n\t" 76 "sh $zero, 1536(%[output]) \n\t" 77 "sh $zero, 1600(%[output]) \n\t" 78 "sh $zero, 1664(%[output]) \n\t" 79 "sh $zero, 1728(%[output]) \n\t" 80 "sh $zero, 1792(%[output]) \n\t" 81 "sh $zero, 1856(%[output]) \n\t" 82 "sh $zero, 1920(%[output]) \n\t" 83 "sh $zero, 1984(%[output]) \n\t" 84 85 : 86 : [output] "r"(output)); 87 88 output += 1; 89 90 continue; 91 } 92 93 /* prefetch row */ 94 prefetch_load((const uint8_t *)(input + 32)); 95 prefetch_load((const uint8_t *)(input + 48)); 96 97 __asm__ __volatile__( 98 "lh %[load1], 2(%[input]) \n\t" 99 "lh %[load2], 62(%[input]) \n\t" 100 "lh %[load3], 34(%[input]) \n\t" 101 "lh %[load4], 30(%[input]) \n\t" 102 103 "mtlo %[const_2_power_13], $ac1 \n\t" 104 "mthi $zero, $ac1 \n\t" 105 "mtlo %[const_2_power_13], $ac3 \n\t" 106 "mthi $zero, $ac3 \n\t" 107 108 "madd $ac1, %[load1], %[cospi_31_64] \n\t" 109 "msub $ac1, %[load2], %[cospi_1_64] \n\t" 110 "extp %[temp0], $ac1, 31 \n\t" 111 112 "madd $ac3, %[load1], %[cospi_1_64] \n\t" 113 "madd $ac3, %[load2], %[cospi_31_64] \n\t" 114 "extp %[temp3], $ac3, 31 \n\t" 115 116 "mtlo %[const_2_power_13], $ac1 \n\t" 117 "mthi $zero, $ac1 \n\t" 118 "mtlo %[const_2_power_13], $ac2 \n\t" 119 "mthi $zero, $ac2 \n\t" 120 121 "madd $ac2, %[load3], %[cospi_15_64] \n\t" 122 "msub $ac2, %[load4], %[cospi_17_64] \n\t" 123 "extp %[temp1], $ac2, 31 \n\t" 124 125 "madd $ac1, %[load3], %[cospi_17_64] \n\t" 126 "madd $ac1, %[load4], %[cospi_15_64] \n\t" 127 "extp %[temp2], $ac1, 31 \n\t" 128 129 "mtlo %[const_2_power_13], $ac1 \n\t" 130 "mthi $zero, $ac1 \n\t" 131 "mtlo %[const_2_power_13], $ac3 \n\t" 132 "mthi $zero, $ac3 \n\t" 133 134 "sub %[load1], %[temp3], %[temp2] \n\t" 135 "sub %[load2], %[temp0], %[temp1] \n\t" 136 137 "madd $ac1, %[load1], %[cospi_28_64] \n\t" 138 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 139 "madd $ac3, %[load1], %[cospi_4_64] \n\t" 140 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 141 142 "extp %[step1_17], $ac1, 31 \n\t" 143 "extp %[step1_30], $ac3, 31 \n\t" 144 "add %[step1_16], %[temp0], %[temp1] \n\t" 145 "add %[step1_31], %[temp2], %[temp3] \n\t" 146 147 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 148 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 149 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 150 [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), 151 [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) 152 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 153 [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), 154 [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), 155 [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); 156 157 __asm__ __volatile__( 158 "lh %[load1], 18(%[input]) \n\t" 159 "lh %[load2], 46(%[input]) \n\t" 160 "lh %[load3], 50(%[input]) \n\t" 161 "lh %[load4], 14(%[input]) \n\t" 162 163 "mtlo %[const_2_power_13], $ac1 \n\t" 164 "mthi $zero, $ac1 \n\t" 165 "mtlo %[const_2_power_13], $ac3 \n\t" 166 "mthi $zero, $ac3 \n\t" 167 168 "madd $ac1, %[load1], %[cospi_23_64] \n\t" 169 "msub $ac1, %[load2], %[cospi_9_64] \n\t" 170 "extp %[temp0], $ac1, 31 \n\t" 171 172 "madd $ac3, %[load1], %[cospi_9_64] \n\t" 173 "madd $ac3, %[load2], %[cospi_23_64] \n\t" 174 "extp %[temp3], $ac3, 31 \n\t" 175 176 "mtlo %[const_2_power_13], $ac1 \n\t" 177 "mthi $zero, $ac1 \n\t" 178 "mtlo %[const_2_power_13], $ac2 \n\t" 179 "mthi $zero, $ac2 \n\t" 180 181 "madd $ac2, %[load3], %[cospi_7_64] \n\t" 182 "msub $ac2, %[load4], %[cospi_25_64] \n\t" 183 "extp %[temp1], $ac2, 31 \n\t" 184 185 "madd $ac1, %[load3], %[cospi_25_64] \n\t" 186 "madd $ac1, %[load4], %[cospi_7_64] \n\t" 187 "extp %[temp2], $ac1, 31 \n\t" 188 189 "mtlo %[const_2_power_13], $ac1 \n\t" 190 "mthi $zero, $ac1 \n\t" 191 "mtlo %[const_2_power_13], $ac3 \n\t" 192 "mthi $zero, $ac3 \n\t" 193 194 "sub %[load1], %[temp1], %[temp0] \n\t" 195 "sub %[load2], %[temp2], %[temp3] \n\t" 196 197 "msub $ac1, %[load1], %[cospi_28_64] \n\t" 198 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 199 "msub $ac3, %[load1], %[cospi_4_64] \n\t" 200 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 201 202 "extp %[step1_18], $ac1, 31 \n\t" 203 "extp %[step1_29], $ac3, 31 \n\t" 204 "add %[step1_19], %[temp0], %[temp1] \n\t" 205 "add %[step1_28], %[temp2], %[temp3] \n\t" 206 207 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 208 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 209 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 210 [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), 211 [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) 212 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 213 [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), 214 [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), 215 [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); 216 217 __asm__ __volatile__( 218 "lh %[load1], 10(%[input]) \n\t" 219 "lh %[load2], 54(%[input]) \n\t" 220 "lh %[load3], 42(%[input]) \n\t" 221 "lh %[load4], 22(%[input]) \n\t" 222 223 "mtlo %[const_2_power_13], $ac1 \n\t" 224 "mthi $zero, $ac1 \n\t" 225 "mtlo %[const_2_power_13], $ac3 \n\t" 226 "mthi $zero, $ac3 \n\t" 227 228 "madd $ac1, %[load1], %[cospi_27_64] \n\t" 229 "msub $ac1, %[load2], %[cospi_5_64] \n\t" 230 "extp %[temp0], $ac1, 31 \n\t" 231 232 "madd $ac3, %[load1], %[cospi_5_64] \n\t" 233 "madd $ac3, %[load2], %[cospi_27_64] \n\t" 234 "extp %[temp3], $ac3, 31 \n\t" 235 236 "mtlo %[const_2_power_13], $ac1 \n\t" 237 "mthi $zero, $ac1 \n\t" 238 "mtlo %[const_2_power_13], $ac2 \n\t" 239 "mthi $zero, $ac2 \n\t" 240 241 "madd $ac2, %[load3], %[cospi_11_64] \n\t" 242 "msub $ac2, %[load4], %[cospi_21_64] \n\t" 243 "extp %[temp1], $ac2, 31 \n\t" 244 245 "madd $ac1, %[load3], %[cospi_21_64] \n\t" 246 "madd $ac1, %[load4], %[cospi_11_64] \n\t" 247 "extp %[temp2], $ac1, 31 \n\t" 248 249 "mtlo %[const_2_power_13], $ac1 \n\t" 250 "mthi $zero, $ac1 \n\t" 251 "mtlo %[const_2_power_13], $ac3 \n\t" 252 "mthi $zero, $ac3 \n\t" 253 254 "sub %[load1], %[temp0], %[temp1] \n\t" 255 "sub %[load2], %[temp3], %[temp2] \n\t" 256 257 "madd $ac1, %[load2], %[cospi_12_64] \n\t" 258 "msub $ac1, %[load1], %[cospi_20_64] \n\t" 259 "madd $ac3, %[load1], %[cospi_12_64] \n\t" 260 "madd $ac3, %[load2], %[cospi_20_64] \n\t" 261 262 "extp %[step1_21], $ac1, 31 \n\t" 263 "extp %[step1_26], $ac3, 31 \n\t" 264 "add %[step1_20], %[temp0], %[temp1] \n\t" 265 "add %[step1_27], %[temp2], %[temp3] \n\t" 266 267 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 268 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 269 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 270 [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), 271 [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) 272 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 273 [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), 274 [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), 275 [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); 276 277 __asm__ __volatile__( 278 "lh %[load1], 26(%[input]) \n\t" 279 "lh %[load2], 38(%[input]) \n\t" 280 "lh %[load3], 58(%[input]) \n\t" 281 "lh %[load4], 6(%[input]) \n\t" 282 283 "mtlo %[const_2_power_13], $ac1 \n\t" 284 "mthi $zero, $ac1 \n\t" 285 "mtlo %[const_2_power_13], $ac3 \n\t" 286 "mthi $zero, $ac3 \n\t" 287 288 "madd $ac1, %[load1], %[cospi_19_64] \n\t" 289 "msub $ac1, %[load2], %[cospi_13_64] \n\t" 290 "extp %[temp0], $ac1, 31 \n\t" 291 "madd $ac3, %[load1], %[cospi_13_64] \n\t" 292 "madd $ac3, %[load2], %[cospi_19_64] \n\t" 293 "extp %[temp3], $ac3, 31 \n\t" 294 295 "mtlo %[const_2_power_13], $ac1 \n\t" 296 "mthi $zero, $ac1 \n\t" 297 "mtlo %[const_2_power_13], $ac2 \n\t" 298 "mthi $zero, $ac2 \n\t" 299 300 "madd $ac2, %[load3], %[cospi_3_64] \n\t" 301 "msub $ac2, %[load4], %[cospi_29_64] \n\t" 302 "extp %[temp1], $ac2, 31 \n\t" 303 "madd $ac1, %[load3], %[cospi_29_64] \n\t" 304 "madd $ac1, %[load4], %[cospi_3_64] \n\t" 305 "extp %[temp2], $ac1, 31 \n\t" 306 307 "mtlo %[const_2_power_13], $ac1 \n\t" 308 "mthi $zero, $ac1 \n\t" 309 "mtlo %[const_2_power_13], $ac3 \n\t" 310 "mthi $zero, $ac3 \n\t" 311 312 "sub %[load1], %[temp1], %[temp0] \n\t" 313 "sub %[load2], %[temp2], %[temp3] \n\t" 314 "msub $ac1, %[load1], %[cospi_12_64] \n\t" 315 "msub $ac1, %[load2], %[cospi_20_64] \n\t" 316 "msub $ac3, %[load1], %[cospi_20_64] \n\t" 317 "madd $ac3, %[load2], %[cospi_12_64] \n\t" 318 "extp %[step1_22], $ac1, 31 \n\t" 319 "extp %[step1_25], $ac3, 31 \n\t" 320 "add %[step1_23], %[temp0], %[temp1] \n\t" 321 "add %[step1_24], %[temp2], %[temp3] \n\t" 322 323 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 324 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 325 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 326 [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), 327 [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) 328 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 329 [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), 330 [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), 331 [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); 332 333 __asm__ __volatile__( 334 "lh %[load1], 4(%[input]) \n\t" 335 "lh %[load2], 60(%[input]) \n\t" 336 "lh %[load3], 36(%[input]) \n\t" 337 "lh %[load4], 28(%[input]) \n\t" 338 339 "mtlo %[const_2_power_13], $ac1 \n\t" 340 "mthi $zero, $ac1 \n\t" 341 "mtlo %[const_2_power_13], $ac3 \n\t" 342 "mthi $zero, $ac3 \n\t" 343 344 "madd $ac1, %[load1], %[cospi_30_64] \n\t" 345 "msub $ac1, %[load2], %[cospi_2_64] \n\t" 346 "extp %[temp0], $ac1, 31 \n\t" 347 "madd $ac3, %[load1], %[cospi_2_64] \n\t" 348 "madd $ac3, %[load2], %[cospi_30_64] \n\t" 349 "extp %[temp3], $ac3, 31 \n\t" 350 351 "mtlo %[const_2_power_13], $ac1 \n\t" 352 "mthi $zero, $ac1 \n\t" 353 "mtlo %[const_2_power_13], $ac2 \n\t" 354 "mthi $zero, $ac2 \n\t" 355 356 "madd $ac2, %[load3], %[cospi_14_64] \n\t" 357 "msub $ac2, %[load4], %[cospi_18_64] \n\t" 358 "extp %[temp1], $ac2, 31 \n\t" 359 "madd $ac1, %[load3], %[cospi_18_64] \n\t" 360 "madd $ac1, %[load4], %[cospi_14_64] \n\t" 361 "extp %[temp2], $ac1, 31 \n\t" 362 363 "mtlo %[const_2_power_13], $ac1 \n\t" 364 "mthi $zero, $ac1 \n\t" 365 "mtlo %[const_2_power_13], $ac3 \n\t" 366 "mthi $zero, $ac3 \n\t" 367 368 "sub %[load1], %[temp0], %[temp1] \n\t" 369 "sub %[load2], %[temp3], %[temp2] \n\t" 370 "msub $ac1, %[load1], %[cospi_8_64] \n\t" 371 "madd $ac1, %[load2], %[cospi_24_64] \n\t" 372 "madd $ac3, %[load1], %[cospi_24_64] \n\t" 373 "madd $ac3, %[load2], %[cospi_8_64] \n\t" 374 "extp %[step2_9], $ac1, 31 \n\t" 375 "extp %[step2_14], $ac3, 31 \n\t" 376 "add %[step2_8], %[temp0], %[temp1] \n\t" 377 "add %[step2_15], %[temp2], %[temp3] \n\t" 378 379 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 380 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 381 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), 382 [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), 383 [step2_15] "=&r"(step2_15) 384 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 385 [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), 386 [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), 387 [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); 388 389 __asm__ __volatile__( 390 "lh %[load1], 20(%[input]) \n\t" 391 "lh %[load2], 44(%[input]) \n\t" 392 "lh %[load3], 52(%[input]) \n\t" 393 "lh %[load4], 12(%[input]) \n\t" 394 395 "mtlo %[const_2_power_13], $ac1 \n\t" 396 "mthi $zero, $ac1 \n\t" 397 "mtlo %[const_2_power_13], $ac3 \n\t" 398 "mthi $zero, $ac3 \n\t" 399 400 "madd $ac1, %[load1], %[cospi_22_64] \n\t" 401 "msub $ac1, %[load2], %[cospi_10_64] \n\t" 402 "extp %[temp0], $ac1, 31 \n\t" 403 "madd $ac3, %[load1], %[cospi_10_64] \n\t" 404 "madd $ac3, %[load2], %[cospi_22_64] \n\t" 405 "extp %[temp3], $ac3, 31 \n\t" 406 407 "mtlo %[const_2_power_13], $ac1 \n\t" 408 "mthi $zero, $ac1 \n\t" 409 "mtlo %[const_2_power_13], $ac2 \n\t" 410 "mthi $zero, $ac2 \n\t" 411 412 "madd $ac2, %[load3], %[cospi_6_64] \n\t" 413 "msub $ac2, %[load4], %[cospi_26_64] \n\t" 414 "extp %[temp1], $ac2, 31 \n\t" 415 "madd $ac1, %[load3], %[cospi_26_64] \n\t" 416 "madd $ac1, %[load4], %[cospi_6_64] \n\t" 417 "extp %[temp2], $ac1, 31 \n\t" 418 419 "mtlo %[const_2_power_13], $ac1 \n\t" 420 "mthi $zero, $ac1 \n\t" 421 "mtlo %[const_2_power_13], $ac3 \n\t" 422 "mthi $zero, $ac3 \n\t" 423 424 "sub %[load1], %[temp1], %[temp0] \n\t" 425 "sub %[load2], %[temp2], %[temp3] \n\t" 426 "msub $ac1, %[load1], %[cospi_24_64] \n\t" 427 "msub $ac1, %[load2], %[cospi_8_64] \n\t" 428 "madd $ac3, %[load2], %[cospi_24_64] \n\t" 429 "msub $ac3, %[load1], %[cospi_8_64] \n\t" 430 "extp %[step2_10], $ac1, 31 \n\t" 431 "extp %[step2_13], $ac3, 31 \n\t" 432 "add %[step2_11], %[temp0], %[temp1] \n\t" 433 "add %[step2_12], %[temp2], %[temp3] \n\t" 434 435 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 436 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 437 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), 438 [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), 439 [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) 440 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 441 [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), 442 [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), 443 [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); 444 445 __asm__ __volatile__( 446 "mtlo %[const_2_power_13], $ac0 \n\t" 447 "mthi $zero, $ac0 \n\t" 448 "sub %[temp0], %[step2_14], %[step2_13] \n\t" 449 "sub %[temp0], %[temp0], %[step2_9] \n\t" 450 "add %[temp0], %[temp0], %[step2_10] \n\t" 451 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 452 "mtlo %[const_2_power_13], $ac1 \n\t" 453 "mthi $zero, $ac1 \n\t" 454 "sub %[temp1], %[step2_14], %[step2_13] \n\t" 455 "add %[temp1], %[temp1], %[step2_9] \n\t" 456 "sub %[temp1], %[temp1], %[step2_10] \n\t" 457 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" 458 "mtlo %[const_2_power_13], $ac2 \n\t" 459 "mthi $zero, $ac2 \n\t" 460 "sub %[temp0], %[step2_15], %[step2_12] \n\t" 461 "sub %[temp0], %[temp0], %[step2_8] \n\t" 462 "add %[temp0], %[temp0], %[step2_11] \n\t" 463 "madd $ac2, %[temp0], %[cospi_16_64] \n\t" 464 "mtlo %[const_2_power_13], $ac3 \n\t" 465 "mthi $zero, $ac3 \n\t" 466 "sub %[temp1], %[step2_15], %[step2_12] \n\t" 467 "add %[temp1], %[temp1], %[step2_8] \n\t" 468 "sub %[temp1], %[temp1], %[step2_11] \n\t" 469 "madd $ac3, %[temp1], %[cospi_16_64] \n\t" 470 471 "add %[step3_8], %[step2_8], %[step2_11] \n\t" 472 "add %[step3_9], %[step2_9], %[step2_10] \n\t" 473 "add %[step3_14], %[step2_13], %[step2_14] \n\t" 474 "add %[step3_15], %[step2_12], %[step2_15] \n\t" 475 "extp %[step3_10], $ac0, 31 \n\t" 476 "extp %[step3_13], $ac1, 31 \n\t" 477 "extp %[step3_11], $ac2, 31 \n\t" 478 "extp %[step3_12], $ac3, 31 \n\t" 479 480 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), 481 [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), 482 [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), 483 [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), 484 [step3_15] "=&r"(step3_15) 485 : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), 486 [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), 487 [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), 488 [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), 489 [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); 490 491 __asm__ __volatile__( 492 "mtlo %[const_2_power_13], $ac0 \n\t" 493 "mthi $zero, $ac0 \n\t" 494 "mtlo %[const_2_power_13], $ac1 \n\t" 495 "mthi $zero, $ac1 \n\t" 496 "sub %[temp0], %[step1_17], %[step1_18] \n\t" 497 "sub %[temp1], %[step1_30], %[step1_29] \n\t" 498 "add %[step3_17], %[step1_17], %[step1_18] \n\t" 499 "add %[step3_30], %[step1_30], %[step1_29] \n\t" 500 501 "msub $ac0, %[temp0], %[cospi_8_64] \n\t" 502 "madd $ac0, %[temp1], %[cospi_24_64] \n\t" 503 "extp %[step3_18], $ac0, 31 \n\t" 504 "madd $ac1, %[temp0], %[cospi_24_64] \n\t" 505 "madd $ac1, %[temp1], %[cospi_8_64] \n\t" 506 "extp %[step3_29], $ac1, 31 \n\t" 507 508 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 509 [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), 510 [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) 511 : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), 512 [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), 513 [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), 514 [cospi_8_64] "r"(cospi_8_64)); 515 516 __asm__ __volatile__( 517 "mtlo %[const_2_power_13], $ac0 \n\t" 518 "mthi $zero, $ac0 \n\t" 519 "mtlo %[const_2_power_13], $ac1 \n\t" 520 "mthi $zero, $ac1 \n\t" 521 "sub %[temp0], %[step1_16], %[step1_19] \n\t" 522 "sub %[temp1], %[step1_31], %[step1_28] \n\t" 523 "add %[step3_16], %[step1_16], %[step1_19] \n\t" 524 "add %[step3_31], %[step1_31], %[step1_28] \n\t" 525 526 "msub $ac0, %[temp0], %[cospi_8_64] \n\t" 527 "madd $ac0, %[temp1], %[cospi_24_64] \n\t" 528 "extp %[step3_19], $ac0, 31 \n\t" 529 "madd $ac1, %[temp0], %[cospi_24_64] \n\t" 530 "madd $ac1, %[temp1], %[cospi_8_64] \n\t" 531 "extp %[step3_28], $ac1, 31 \n\t" 532 533 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 534 [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), 535 [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) 536 : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), 537 [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), 538 [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), 539 [cospi_8_64] "r"(cospi_8_64)); 540 541 __asm__ __volatile__( 542 "mtlo %[const_2_power_13], $ac0 \n\t" 543 "mthi $zero, $ac0 \n\t" 544 "mtlo %[const_2_power_13], $ac1 \n\t" 545 "mthi $zero, $ac1 \n\t" 546 "sub %[temp0], %[step1_23], %[step1_20] \n\t" 547 "sub %[temp1], %[step1_24], %[step1_27] \n\t" 548 "add %[step3_23], %[step1_23], %[step1_20] \n\t" 549 "add %[step3_24], %[step1_24], %[step1_27] \n\t" 550 551 "msub $ac0, %[temp0], %[cospi_8_64] \n\t" 552 "madd $ac0, %[temp1], %[cospi_24_64] \n\t" 553 "extp %[step3_27], $ac0, 31 \n\t" 554 "msub $ac1, %[temp0], %[cospi_24_64] \n\t" 555 "msub $ac1, %[temp1], %[cospi_8_64] \n\t" 556 "extp %[step3_20], $ac1, 31 \n\t" 557 558 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 559 [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), 560 [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) 561 : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), 562 [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), 563 [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), 564 [cospi_8_64] "r"(cospi_8_64)); 565 566 __asm__ __volatile__( 567 "mtlo %[const_2_power_13], $ac0 \n\t" 568 "mthi $zero, $ac0 \n\t" 569 "mtlo %[const_2_power_13], $ac1 \n\t" 570 "mthi $zero, $ac1 \n\t" 571 "sub %[temp0], %[step1_22], %[step1_21] \n\t" 572 "sub %[temp1], %[step1_25], %[step1_26] \n\t" 573 "add %[step3_22], %[step1_22], %[step1_21] \n\t" 574 "add %[step3_25], %[step1_25], %[step1_26] \n\t" 575 576 "msub $ac0, %[temp0], %[cospi_24_64] \n\t" 577 "msub $ac0, %[temp1], %[cospi_8_64] \n\t" 578 "extp %[step3_21], $ac0, 31 \n\t" 579 "msub $ac1, %[temp0], %[cospi_8_64] \n\t" 580 "madd $ac1, %[temp1], %[cospi_24_64] \n\t" 581 "extp %[step3_26], $ac1, 31 \n\t" 582 583 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 584 [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), 585 [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) 586 : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), 587 [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), 588 [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), 589 [cospi_8_64] "r"(cospi_8_64)); 590 591 __asm__ __volatile__( 592 "add %[step2_16], %[step3_16], %[step3_23] \n\t" 593 "add %[step2_17], %[step3_17], %[step3_22] \n\t" 594 "add %[step2_18], %[step3_18], %[step3_21] \n\t" 595 "add %[step2_19], %[step3_19], %[step3_20] \n\t" 596 "sub %[step2_20], %[step3_19], %[step3_20] \n\t" 597 "sub %[step2_21], %[step3_18], %[step3_21] \n\t" 598 "sub %[step2_22], %[step3_17], %[step3_22] \n\t" 599 "sub %[step2_23], %[step3_16], %[step3_23] \n\t" 600 601 : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), 602 [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), 603 [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), 604 [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) 605 : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), 606 [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), 607 [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), 608 [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); 609 610 __asm__ __volatile__( 611 "sub %[step2_24], %[step3_31], %[step3_24] \n\t" 612 "sub %[step2_25], %[step3_30], %[step3_25] \n\t" 613 "sub %[step2_26], %[step3_29], %[step3_26] \n\t" 614 "sub %[step2_27], %[step3_28], %[step3_27] \n\t" 615 "add %[step2_28], %[step3_28], %[step3_27] \n\t" 616 "add %[step2_29], %[step3_29], %[step3_26] \n\t" 617 "add %[step2_30], %[step3_30], %[step3_25] \n\t" 618 "add %[step2_31], %[step3_31], %[step3_24] \n\t" 619 620 : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), 621 [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), 622 [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), 623 [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) 624 : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), 625 [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), 626 [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), 627 [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); 628 629 __asm__ __volatile__( 630 "lh %[load1], 0(%[input]) \n\t" 631 "lh %[load2], 32(%[input]) \n\t" 632 "lh %[load3], 16(%[input]) \n\t" 633 "lh %[load4], 48(%[input]) \n\t" 634 635 "mtlo %[const_2_power_13], $ac1 \n\t" 636 "mthi $zero, $ac1 \n\t" 637 "mtlo %[const_2_power_13], $ac2 \n\t" 638 "mthi $zero, $ac2 \n\t" 639 "add %[result1], %[load1], %[load2] \n\t" 640 "sub %[result2], %[load1], %[load2] \n\t" 641 "madd $ac1, %[result1], %[cospi_16_64] \n\t" 642 "madd $ac2, %[result2], %[cospi_16_64] \n\t" 643 "extp %[temp0], $ac1, 31 \n\t" 644 "extp %[temp1], $ac2, 31 \n\t" 645 646 "mtlo %[const_2_power_13], $ac3 \n\t" 647 "mthi $zero, $ac3 \n\t" 648 "madd $ac3, %[load3], %[cospi_24_64] \n\t" 649 "msub $ac3, %[load4], %[cospi_8_64] \n\t" 650 "extp %[temp2], $ac3, 31 \n\t" 651 "mtlo %[const_2_power_13], $ac1 \n\t" 652 "mthi $zero, $ac1 \n\t" 653 "madd $ac1, %[load3], %[cospi_8_64] \n\t" 654 "madd $ac1, %[load4], %[cospi_24_64] \n\t" 655 "extp %[temp3], $ac1, 31 \n\t" 656 "add %[step1_0], %[temp0], %[temp3] \n\t" 657 "add %[step1_1], %[temp1], %[temp2] \n\t" 658 "sub %[step1_2], %[temp1], %[temp2] \n\t" 659 "sub %[step1_3], %[temp0], %[temp3] \n\t" 660 661 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 662 [load4] "=&r"(load4), [result1] "=&r"(result1), 663 [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 664 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), 665 [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), 666 [step1_3] "=&r"(step1_3) 667 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 668 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), 669 [cospi_16_64] "r"(cospi_16_64)); 670 671 __asm__ __volatile__( 672 "lh %[load1], 8(%[input]) \n\t" 673 "lh %[load2], 56(%[input]) \n\t" 674 "lh %[load3], 40(%[input]) \n\t" 675 "lh %[load4], 24(%[input]) \n\t" 676 677 "mtlo %[const_2_power_13], $ac1 \n\t" 678 "mthi $zero, $ac1 \n\t" 679 "mtlo %[const_2_power_13], $ac3 \n\t" 680 "mthi $zero, $ac3 \n\t" 681 682 "madd $ac1, %[load1], %[cospi_28_64] \n\t" 683 "msub $ac1, %[load2], %[cospi_4_64] \n\t" 684 "extp %[temp0], $ac1, 31 \n\t" 685 "madd $ac3, %[load1], %[cospi_4_64] \n\t" 686 "madd $ac3, %[load2], %[cospi_28_64] \n\t" 687 "extp %[temp3], $ac3, 31 \n\t" 688 689 "mtlo %[const_2_power_13], $ac1 \n\t" 690 "mthi $zero, $ac1 \n\t" 691 "mtlo %[const_2_power_13], $ac2 \n\t" 692 "mthi $zero, $ac2 \n\t" 693 694 "madd $ac2, %[load3], %[cospi_12_64] \n\t" 695 "msub $ac2, %[load4], %[cospi_20_64] \n\t" 696 "extp %[temp1], $ac2, 31 \n\t" 697 "madd $ac1, %[load3], %[cospi_20_64] \n\t" 698 "madd $ac1, %[load4], %[cospi_12_64] \n\t" 699 "extp %[temp2], $ac1, 31 \n\t" 700 701 "mtlo %[const_2_power_13], $ac1 \n\t" 702 "mthi $zero, $ac1 \n\t" 703 "mtlo %[const_2_power_13], $ac3 \n\t" 704 "mthi $zero, $ac3 \n\t" 705 706 "sub %[load1], %[temp3], %[temp2] \n\t" 707 "sub %[load1], %[load1], %[temp0] \n\t" 708 "add %[load1], %[load1], %[temp1] \n\t" 709 "sub %[load2], %[temp0], %[temp1] \n\t" 710 "sub %[load2], %[load2], %[temp2] \n\t" 711 "add %[load2], %[load2], %[temp3] \n\t" 712 "madd $ac1, %[load1], %[cospi_16_64] \n\t" 713 "madd $ac3, %[load2], %[cospi_16_64] \n\t" 714 715 "extp %[step1_5], $ac1, 31 \n\t" 716 "extp %[step1_6], $ac3, 31 \n\t" 717 "add %[step1_4], %[temp0], %[temp1] \n\t" 718 "add %[step1_7], %[temp3], %[temp2] \n\t" 719 720 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), 721 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), 722 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), 723 [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), 724 [step1_7] "=&r"(step1_7) 725 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), 726 [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), 727 [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), 728 [cospi_16_64] "r"(cospi_16_64)); 729 730 __asm__ __volatile__( 731 "add %[step2_0], %[step1_0], %[step1_7] \n\t" 732 "add %[step2_1], %[step1_1], %[step1_6] \n\t" 733 "add %[step2_2], %[step1_2], %[step1_5] \n\t" 734 "add %[step2_3], %[step1_3], %[step1_4] \n\t" 735 "sub %[step2_4], %[step1_3], %[step1_4] \n\t" 736 "sub %[step2_5], %[step1_2], %[step1_5] \n\t" 737 "sub %[step2_6], %[step1_1], %[step1_6] \n\t" 738 "sub %[step2_7], %[step1_0], %[step1_7] \n\t" 739 740 : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), 741 [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), 742 [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), 743 [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) 744 : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), 745 [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), 746 [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), 747 [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); 748 749 // stage 7 750 __asm__ __volatile__( 751 "add %[step1_0], %[step2_0], %[step3_15] \n\t" 752 "add %[step1_1], %[step2_1], %[step3_14] \n\t" 753 "add %[step1_2], %[step2_2], %[step3_13] \n\t" 754 "add %[step1_3], %[step2_3], %[step3_12] \n\t" 755 "sub %[step1_12], %[step2_3], %[step3_12] \n\t" 756 "sub %[step1_13], %[step2_2], %[step3_13] \n\t" 757 "sub %[step1_14], %[step2_1], %[step3_14] \n\t" 758 "sub %[step1_15], %[step2_0], %[step3_15] \n\t" 759 760 : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), 761 [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), 762 [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), 763 [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) 764 : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), 765 [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), 766 [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), 767 [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); 768 769 __asm__ __volatile__( 770 "add %[step1_4], %[step2_4], %[step3_11] \n\t" 771 "add %[step1_5], %[step2_5], %[step3_10] \n\t" 772 "add %[step1_6], %[step2_6], %[step3_9] \n\t" 773 "add %[step1_7], %[step2_7], %[step3_8] \n\t" 774 "sub %[step1_8], %[step2_7], %[step3_8] \n\t" 775 "sub %[step1_9], %[step2_6], %[step3_9] \n\t" 776 "sub %[step1_10], %[step2_5], %[step3_10] \n\t" 777 "sub %[step1_11], %[step2_4], %[step3_11] \n\t" 778 779 : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), 780 [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), 781 [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), 782 [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) 783 : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), 784 [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), 785 [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), 786 [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); 787 788 __asm__ __volatile__( 789 "sub %[temp0], %[step2_27], %[step2_20] \n\t" 790 "add %[temp1], %[step2_27], %[step2_20] \n\t" 791 "sub %[temp2], %[step2_26], %[step2_21] \n\t" 792 "add %[temp3], %[step2_26], %[step2_21] \n\t" 793 794 "mtlo %[const_2_power_13], $ac0 \n\t" 795 "mthi $zero, $ac0 \n\t" 796 "mtlo %[const_2_power_13], $ac1 \n\t" 797 "mthi $zero, $ac1 \n\t" 798 "mtlo %[const_2_power_13], $ac2 \n\t" 799 "mthi $zero, $ac2 \n\t" 800 "mtlo %[const_2_power_13], $ac3 \n\t" 801 "mthi $zero, $ac3 \n\t" 802 803 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 804 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" 805 "madd $ac2, %[temp2], %[cospi_16_64] \n\t" 806 "madd $ac3, %[temp3], %[cospi_16_64] \n\t" 807 808 "extp %[step1_20], $ac0, 31 \n\t" 809 "extp %[step1_27], $ac1, 31 \n\t" 810 "extp %[step1_21], $ac2, 31 \n\t" 811 "extp %[step1_26], $ac3, 31 \n\t" 812 813 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 814 [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), 815 [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), 816 [step1_26] "=&r"(step1_26) 817 : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), 818 [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), 819 [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); 820 821 __asm__ __volatile__( 822 "sub %[temp0], %[step2_25], %[step2_22] \n\t" 823 "add %[temp1], %[step2_25], %[step2_22] \n\t" 824 "sub %[temp2], %[step2_24], %[step2_23] \n\t" 825 "add %[temp3], %[step2_24], %[step2_23] \n\t" 826 827 "mtlo %[const_2_power_13], $ac0 \n\t" 828 "mthi $zero, $ac0 \n\t" 829 "mtlo %[const_2_power_13], $ac1 \n\t" 830 "mthi $zero, $ac1 \n\t" 831 "mtlo %[const_2_power_13], $ac2 \n\t" 832 "mthi $zero, $ac2 \n\t" 833 "mtlo %[const_2_power_13], $ac3 \n\t" 834 "mthi $zero, $ac3 \n\t" 835 836 "madd $ac0, %[temp0], %[cospi_16_64] \n\t" 837 "madd $ac1, %[temp1], %[cospi_16_64] \n\t" 838 "madd $ac2, %[temp2], %[cospi_16_64] \n\t" 839 "madd $ac3, %[temp3], %[cospi_16_64] \n\t" 840 841 "extp %[step1_22], $ac0, 31 \n\t" 842 "extp %[step1_25], $ac1, 31 \n\t" 843 "extp %[step1_23], $ac2, 31 \n\t" 844 "extp %[step1_24], $ac3, 31 \n\t" 845 846 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), 847 [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), 848 [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), 849 [step1_24] "=&r"(step1_24) 850 : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), 851 [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), 852 [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); 853 854 // final stage 855 __asm__ __volatile__( 856 "add %[temp0], %[step1_0], %[step2_31] \n\t" 857 "add %[temp1], %[step1_1], %[step2_30] \n\t" 858 "add %[temp2], %[step1_2], %[step2_29] \n\t" 859 "add %[temp3], %[step1_3], %[step2_28] \n\t" 860 "sub %[load1], %[step1_3], %[step2_28] \n\t" 861 "sub %[load2], %[step1_2], %[step2_29] \n\t" 862 "sub %[load3], %[step1_1], %[step2_30] \n\t" 863 "sub %[load4], %[step1_0], %[step2_31] \n\t" 864 "sh %[temp0], 0(%[output]) \n\t" 865 "sh %[temp1], 64(%[output]) \n\t" 866 "sh %[temp2], 128(%[output]) \n\t" 867 "sh %[temp3], 192(%[output]) \n\t" 868 "sh %[load1], 1792(%[output]) \n\t" 869 "sh %[load2], 1856(%[output]) \n\t" 870 "sh %[load3], 1920(%[output]) \n\t" 871 "sh %[load4], 1984(%[output]) \n\t" 872 873 : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), 874 [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), 875 [temp3] "=&r"(temp3), [load4] "=&r"(load4) 876 : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31), 877 [step1_1] "r"(step1_1), [step2_30] "r"(step2_30), 878 [step1_2] "r"(step1_2), [step2_29] "r"(step2_29), 879 [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), 880 [output] "r"(output)); 881 882 __asm__ __volatile__( 883 "add %[temp0], %[step1_4], %[step1_27] \n\t" 884 "add %[temp1], %[step1_5], %[step1_26] \n\t" 885 "add %[temp2], %[step1_6], %[step1_25] \n\t" 886 "add %[temp3], %[step1_7], %[step1_24] \n\t" 887 "sub %[load1], %[step1_7], %[step1_24] \n\t" 888 "sub %[load2], %[step1_6], %[step1_25] \n\t" 889 "sub %[load3], %[step1_5], %[step1_26] \n\t" 890 "sub %[load4], %[step1_4], %[step1_27] \n\t" 891 "sh %[temp0], 256(%[output]) \n\t" 892 "sh %[temp1], 320(%[output]) \n\t" 893 "sh %[temp2], 384(%[output]) \n\t" 894 "sh %[temp3], 448(%[output]) \n\t" 895 "sh %[load1], 1536(%[output]) \n\t" 896 "sh %[load2], 1600(%[output]) \n\t" 897 "sh %[load3], 1664(%[output]) \n\t" 898 "sh %[load4], 1728(%[output]) \n\t" 899 900 : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), 901 [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), 902 [temp3] "=&r"(temp3), [load4] "=&r"(load4) 903 : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27), 904 [step1_5] "r"(step1_5), [step1_26] "r"(step1_26), 905 [step1_6] "r"(step1_6), [step1_25] "r"(step1_25), 906 [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), 907 [output] "r"(output)); 908 909 __asm__ __volatile__( 910 "add %[temp0], %[step1_8], %[step1_23] \n\t" 911 "add %[temp1], %[step1_9], %[step1_22] \n\t" 912 "add %[temp2], %[step1_10], %[step1_21] \n\t" 913 "add %[temp3], %[step1_11], %[step1_20] \n\t" 914 "sub %[load1], %[step1_11], %[step1_20] \n\t" 915 "sub %[load2], %[step1_10], %[step1_21] \n\t" 916 "sub %[load3], %[step1_9], %[step1_22] \n\t" 917 "sub %[load4], %[step1_8], %[step1_23] \n\t" 918 "sh %[temp0], 512(%[output]) \n\t" 919 "sh %[temp1], 576(%[output]) \n\t" 920 "sh %[temp2], 640(%[output]) \n\t" 921 "sh %[temp3], 704(%[output]) \n\t" 922 "sh %[load1], 1280(%[output]) \n\t" 923 "sh %[load2], 1344(%[output]) \n\t" 924 "sh %[load3], 1408(%[output]) \n\t" 925 "sh %[load4], 1472(%[output]) \n\t" 926 927 : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), 928 [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), 929 [temp3] "=&r"(temp3), [load4] "=&r"(load4) 930 : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23), 931 [step1_9] "r"(step1_9), [step1_22] "r"(step1_22), 932 [step1_10] "r"(step1_10), [step1_21] "r"(step1_21), 933 [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), 934 [output] "r"(output)); 935 936 __asm__ __volatile__( 937 "add %[temp0], %[step1_12], %[step2_19] \n\t" 938 "add %[temp1], %[step1_13], %[step2_18] \n\t" 939 "add %[temp2], %[step1_14], %[step2_17] \n\t" 940 "add %[temp3], %[step1_15], %[step2_16] \n\t" 941 "sub %[load1], %[step1_15], %[step2_16] \n\t" 942 "sub %[load2], %[step1_14], %[step2_17] \n\t" 943 "sub %[load3], %[step1_13], %[step2_18] \n\t" 944 "sub %[load4], %[step1_12], %[step2_19] \n\t" 945 "sh %[temp0], 768(%[output]) \n\t" 946 "sh %[temp1], 832(%[output]) \n\t" 947 "sh %[temp2], 896(%[output]) \n\t" 948 "sh %[temp3], 960(%[output]) \n\t" 949 "sh %[load1], 1024(%[output]) \n\t" 950 "sh %[load2], 1088(%[output]) \n\t" 951 "sh %[load3], 1152(%[output]) \n\t" 952 "sh %[load4], 1216(%[output]) \n\t" 953 954 : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), 955 [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), 956 [temp3] "=&r"(temp3), [load4] "=&r"(load4) 957 : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19), 958 [step1_13] "r"(step1_13), [step2_18] "r"(step2_18), 959 [step1_14] "r"(step1_14), [step2_17] "r"(step2_17), 960 [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), 961 [output] "r"(output)); 962 963 input += 32; 964 output += 1; 965 } 966 } 967 968 void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, 969 int stride) { 970 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); 971 int16_t *outptr = out; 972 uint32_t pos = 45; 973 974 /* bit positon for extract from acc */ 975 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 976 : 977 : [pos] "r"(pos)); 978 979 // Rows 980 idct32_rows_dspr2(input, outptr, 32); 981 982 // Columns 983 vpx_idct32_cols_add_blk_dspr2(out, dest, stride); 984 } 985 986 void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, 987 int stride) { 988 DECLARE_ALIGNED(32, int16_t, out[32 * 32]); 989 int16_t *outptr = out; 990 uint32_t i; 991 uint32_t pos = 45; 992 993 /* bit positon for extract from acc */ 994 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 995 : 996 : [pos] "r"(pos)); 997 998 // Rows 999 idct32_rows_dspr2(input, outptr, 8); 1000 1001 outptr += 8; 1002 __asm__ __volatile__( 1003 "sw $zero, 0(%[outptr]) \n\t" 1004 "sw $zero, 4(%[outptr]) \n\t" 1005 "sw $zero, 8(%[outptr]) \n\t" 1006 "sw $zero, 12(%[outptr]) \n\t" 1007 "sw $zero, 16(%[outptr]) \n\t" 1008 "sw $zero, 20(%[outptr]) \n\t" 1009 "sw $zero, 24(%[outptr]) \n\t" 1010 "sw $zero, 28(%[outptr]) \n\t" 1011 "sw $zero, 32(%[outptr]) \n\t" 1012 "sw $zero, 36(%[outptr]) \n\t" 1013 "sw $zero, 40(%[outptr]) \n\t" 1014 "sw $zero, 44(%[outptr]) \n\t" 1015 1016 : 1017 : [outptr] "r"(outptr)); 1018 1019 for (i = 0; i < 31; ++i) { 1020 outptr += 32; 1021 1022 __asm__ __volatile__( 1023 "sw $zero, 0(%[outptr]) \n\t" 1024 "sw $zero, 4(%[outptr]) \n\t" 1025 "sw $zero, 8(%[outptr]) \n\t" 1026 "sw $zero, 12(%[outptr]) \n\t" 1027 "sw $zero, 16(%[outptr]) \n\t" 1028 "sw $zero, 20(%[outptr]) \n\t" 1029 "sw $zero, 24(%[outptr]) \n\t" 1030 "sw $zero, 28(%[outptr]) \n\t" 1031 "sw $zero, 32(%[outptr]) \n\t" 1032 "sw $zero, 36(%[outptr]) \n\t" 1033 "sw $zero, 40(%[outptr]) \n\t" 1034 "sw $zero, 44(%[outptr]) \n\t" 1035 1036 : 1037 : [outptr] "r"(outptr)); 1038 } 1039 1040 // Columns 1041 vpx_idct32_cols_add_blk_dspr2(out, dest, stride); 1042 } 1043 1044 void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, 1045 int stride) { 1046 int r, out; 1047 int32_t a1, absa1; 1048 int32_t vector_a1; 1049 int32_t t1, t2, t3, t4; 1050 int32_t vector_1, vector_2, vector_3, vector_4; 1051 uint32_t pos = 45; 1052 1053 /* bit positon for extract from acc */ 1054 __asm__ __volatile__("wrdsp %[pos], 1 \n\t" 1055 1056 : 1057 : [pos] "r"(pos)); 1058 1059 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); 1060 __asm__ __volatile__( 1061 "addi %[out], %[out], 32 \n\t" 1062 "sra %[a1], %[out], 6 \n\t" 1063 1064 : [out] "+r"(out), [a1] "=r"(a1) 1065 :); 1066 1067 if (a1 < 0) { 1068 /* use quad-byte 1069 * input and output memory are four byte aligned */ 1070 __asm__ __volatile__( 1071 "abs %[absa1], %[a1] \n\t" 1072 "replv.qb %[vector_a1], %[absa1] \n\t" 1073 1074 : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1) 1075 : [a1] "r"(a1)); 1076 1077 for (r = 32; r--;) { 1078 __asm__ __volatile__( 1079 "lw %[t1], 0(%[dest]) \n\t" 1080 "lw %[t2], 4(%[dest]) \n\t" 1081 "lw %[t3], 8(%[dest]) \n\t" 1082 "lw %[t4], 12(%[dest]) \n\t" 1083 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1084 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1085 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1086 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1087 "sw %[vector_1], 0(%[dest]) \n\t" 1088 "sw %[vector_2], 4(%[dest]) \n\t" 1089 "sw %[vector_3], 8(%[dest]) \n\t" 1090 "sw %[vector_4], 12(%[dest]) \n\t" 1091 1092 "lw %[t1], 16(%[dest]) \n\t" 1093 "lw %[t2], 20(%[dest]) \n\t" 1094 "lw %[t3], 24(%[dest]) \n\t" 1095 "lw %[t4], 28(%[dest]) \n\t" 1096 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1097 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1098 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1099 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1100 "sw %[vector_1], 16(%[dest]) \n\t" 1101 "sw %[vector_2], 20(%[dest]) \n\t" 1102 "sw %[vector_3], 24(%[dest]) \n\t" 1103 "sw %[vector_4], 28(%[dest]) \n\t" 1104 1105 "add %[dest], %[dest], %[stride] \n\t" 1106 1107 : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), 1108 [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), 1109 [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), 1110 [dest] "+&r"(dest) 1111 : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); 1112 } 1113 } else if (a1 > 255) { 1114 int32_t a11, a12, vector_a11, vector_a12; 1115 1116 /* use quad-byte 1117 * input and output memory are four byte aligned */ 1118 a11 = a1 >> 1; 1119 a12 = a1 - a11; 1120 __asm__ __volatile__( 1121 "replv.qb %[vector_a11], %[a11] \n\t" 1122 "replv.qb %[vector_a12], %[a12] \n\t" 1123 1124 : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) 1125 : [a11] "r"(a11), [a12] "r"(a12)); 1126 1127 for (r = 32; r--;) { 1128 __asm__ __volatile__( 1129 "lw %[t1], 0(%[dest]) \n\t" 1130 "lw %[t2], 4(%[dest]) \n\t" 1131 "lw %[t3], 8(%[dest]) \n\t" 1132 "lw %[t4], 12(%[dest]) \n\t" 1133 "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" 1134 "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" 1135 "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" 1136 "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" 1137 "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" 1138 "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" 1139 "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" 1140 "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" 1141 "sw %[vector_1], 0(%[dest]) \n\t" 1142 "sw %[vector_2], 4(%[dest]) \n\t" 1143 "sw %[vector_3], 8(%[dest]) \n\t" 1144 "sw %[vector_4], 12(%[dest]) \n\t" 1145 1146 "lw %[t1], 16(%[dest]) \n\t" 1147 "lw %[t2], 20(%[dest]) \n\t" 1148 "lw %[t3], 24(%[dest]) \n\t" 1149 "lw %[t4], 28(%[dest]) \n\t" 1150 "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" 1151 "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" 1152 "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" 1153 "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" 1154 "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" 1155 "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" 1156 "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" 1157 "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" 1158 "sw %[vector_1], 16(%[dest]) \n\t" 1159 "sw %[vector_2], 20(%[dest]) \n\t" 1160 "sw %[vector_3], 24(%[dest]) \n\t" 1161 "sw %[vector_4], 28(%[dest]) \n\t" 1162 1163 "add %[dest], %[dest], %[stride] \n\t" 1164 1165 : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), 1166 [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), 1167 [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), 1168 [dest] "+&r"(dest) 1169 : [stride] "r"(stride), [vector_a11] "r"(vector_a11), 1170 [vector_a12] "r"(vector_a12)); 1171 } 1172 } else { 1173 /* use quad-byte 1174 * input and output memory are four byte aligned */ 1175 __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" 1176 1177 : [vector_a1] "=&r"(vector_a1) 1178 : [a1] "r"(a1)); 1179 1180 for (r = 32; r--;) { 1181 __asm__ __volatile__( 1182 "lw %[t1], 0(%[dest]) \n\t" 1183 "lw %[t2], 4(%[dest]) \n\t" 1184 "lw %[t3], 8(%[dest]) \n\t" 1185 "lw %[t4], 12(%[dest]) \n\t" 1186 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1187 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1188 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1189 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1190 "sw %[vector_1], 0(%[dest]) \n\t" 1191 "sw %[vector_2], 4(%[dest]) \n\t" 1192 "sw %[vector_3], 8(%[dest]) \n\t" 1193 "sw %[vector_4], 12(%[dest]) \n\t" 1194 1195 "lw %[t1], 16(%[dest]) \n\t" 1196 "lw %[t2], 20(%[dest]) \n\t" 1197 "lw %[t3], 24(%[dest]) \n\t" 1198 "lw %[t4], 28(%[dest]) \n\t" 1199 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" 1200 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" 1201 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" 1202 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" 1203 "sw %[vector_1], 16(%[dest]) \n\t" 1204 "sw %[vector_2], 20(%[dest]) \n\t" 1205 "sw %[vector_3], 24(%[dest]) \n\t" 1206 "sw %[vector_4], 28(%[dest]) \n\t" 1207 1208 "add %[dest], %[dest], %[stride] \n\t" 1209 1210 : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), 1211 [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), 1212 [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), 1213 [dest] "+&r"(dest) 1214 : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); 1215 } 1216 } 1217 } 1218 #endif // #if HAVE_DSPR2 1219