1 /* 2 * MIPS DSPr2 optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. 5 * All Rights Reserved. 6 * Authors: Teodora Novkovic (teodora.novkovic (at) imgtec.com) 7 * Darko Laus (darko.laus (at) imgtec.com) 8 * Copyright (C) 2015, D. R. Commander. All Rights Reserved. 9 * This software is provided 'as-is', without any express or implied 10 * warranty. In no event will the authors be held liable for any damages 11 * arising from the use of this software. 12 * 13 * Permission is granted to anyone to use this software for any purpose, 14 * including commercial applications, and to alter it and redistribute it 15 * freely, subject to the following restrictions: 16 * 17 * 1. The origin of this software must not be misrepresented; you must not 18 * claim that you wrote the original software. If you use this software 19 * in a product, an acknowledgment in the product documentation would be 20 * appreciated but is not required. 21 * 2. Altered source versions must be plainly marked as such, and must not be 22 * misrepresented as being the original software. 23 * 3. This notice may not be removed or altered from any source distribution. 24 */ 25 26 #include "jsimd_mips_dspr2_asm.h" 27 28 /*****************************************************************************/ 29 LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2) 30 /* 31 * a0 - cinfo->image_width 32 * a1 - input_buf 33 * a2 - output_buf 34 * a3 - output_row 35 * 16(sp) - num_rows 36 * 20(sp) - cinfo->num_components 37 * 38 * Null conversion for compression 39 */ 40 41 SAVE_REGS_ON_STACK 8, s0, s1 42 43 lw t9, 24(sp) // t9 = num_rows 44 lw s0, 28(sp) // s0 = cinfo->num_components 45 andi t0, a0, 3 // t0 = cinfo->image_width & 3 46 beqz t0, 4f // no residual 47 nop 48 0: 49 addiu t9, t9, -1 50 bltz t9, 7f 51 li t1, 0 52 1: 53 sll t3, t1, 2 54 lwx t5, t3(a2) // t5 = outptr = output_buf[ci] 55 lw t2, 0(a1) // t2 = inptr = *input_buf 56 sll t4, a3, 2 57 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] 58 addu t2, t2, t1 59 addu s1, t5, a0 60 addu t6, t5, t0 61 2: 62 lbu t3, 0(t2) 63 addiu t5, t5, 1 64 sb t3, -1(t5) 65 bne t6, t5, 2b 66 addu t2, t2, s0 67 3: 68 lbu t3, 0(t2) 69 addu t4, t2, s0 70 addu t7, t4, s0 71 addu t8, t7, s0 72 addu t2, t8, s0 73 lbu t4, 0(t4) 74 lbu t7, 0(t7) 75 lbu t8, 0(t8) 76 addiu t5, t5, 4 77 sb t3, -4(t5) 78 sb t4, -3(t5) 79 sb t7, -2(t5) 80 bne s1, t5, 3b 81 sb t8, -1(t5) 82 addiu t1, t1, 1 83 bne t1, s0, 1b 84 nop 85 addiu a1, a1, 4 86 bgez t9, 0b 87 addiu a3, a3, 1 88 b 7f 89 nop 90 4: 91 addiu t9, t9, -1 92 bltz t9, 7f 93 li t1, 0 94 5: 95 sll t3, t1, 2 96 lwx t5, t3(a2) // t5 = outptr = output_buf[ci] 97 lw t2, 0(a1) // t2 = inptr = *input_buf 98 sll t4, a3, 2 99 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] 100 addu t2, t2, t1 101 addu s1, t5, a0 102 addu t6, t5, t0 103 6: 104 lbu t3, 0(t2) 105 addu t4, t2, s0 106 addu t7, t4, s0 107 addu t8, t7, s0 108 addu t2, t8, s0 109 lbu t4, 0(t4) 110 lbu t7, 0(t7) 111 lbu t8, 0(t8) 112 addiu t5, t5, 4 113 sb t3, -4(t5) 114 sb t4, -3(t5) 115 sb t7, -2(t5) 116 bne s1, t5, 6b 117 sb t8, -1(t5) 118 addiu t1, t1, 1 119 bne t1, s0, 5b 120 nop 121 addiu a1, a1, 4 122 bgez t9, 4b 123 addiu a3, a3, 1 124 7: 125 RESTORE_REGS_FROM_STACK 8, s0, s1 126 127 j ra 128 nop 129 130 END(jsimd_c_null_convert_mips_dspr2) 131 132 /*****************************************************************************/ 133 /* 134 * jsimd_extrgb_ycc_convert_mips_dspr2 135 * jsimd_extbgr_ycc_convert_mips_dspr2 136 * 
jsimd_extrgbx_ycc_convert_mips_dspr2 137 * jsimd_extbgrx_ycc_convert_mips_dspr2 138 * jsimd_extxbgr_ycc_convert_mips_dspr2 139 * jsimd_extxrgb_ycc_convert_mips_dspr2 140 * 141 * Colorspace conversion RGB -> YCbCr 142 */ 143 144 .macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs 145 146 .macro DO_RGB_TO_YCC r, \ 147 g, \ 148 b, \ 149 inptr 150 lbu \r, \r_offs(\inptr) 151 lbu \g, \g_offs(\inptr) 152 lbu \b, \b_offs(\inptr) 153 addiu \inptr, \pixel_size 154 .endm 155 156 LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2) 157 /* 158 * a0 - cinfo->image_width 159 * a1 - input_buf 160 * a2 - output_buf 161 * a3 - output_row 162 * 16(sp) - num_rows 163 */ 164 165 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 166 167 lw t7, 48(sp) // t7 = num_rows 168 li s0, 0x4c8b // FIX(0.29900) 169 li s1, 0x9646 // FIX(0.58700) 170 li s2, 0x1d2f // FIX(0.11400) 171 li s3, 0xffffd4cd // -FIX(0.16874) 172 li s4, 0xffffab33 // -FIX(0.33126) 173 li s5, 0x8000 // FIX(0.50000) 174 li s6, 0xffff94d1 // -FIX(0.41869) 175 li s7, 0xffffeb2f // -FIX(0.08131) 176 li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1 177 178 0: 179 addiu t7, -1 // --num_rows 180 lw t6, 0(a1) // t6 = input_buf[0] 181 lw t0, 0(a2) 182 lw t1, 4(a2) 183 lw t2, 8(a2) 184 sll t3, a3, 2 185 lwx t0, t3(t0) // t0 = output_buf[0][output_row] 186 lwx t1, t3(t1) // t1 = output_buf[1][output_row] 187 lwx t2, t3(t2) // t2 = output_buf[2][output_row] 188 189 addu t9, t2, a0 // t9 = end address 190 addiu a3, 1 191 192 1: 193 DO_RGB_TO_YCC t3, t4, t5, t6 194 195 mtlo s5, $ac0 196 mtlo t8, $ac1 197 mtlo t8, $ac2 198 maddu $ac0, s2, t5 199 maddu $ac1, s5, t5 200 maddu $ac2, s5, t3 201 maddu $ac0, s0, t3 202 maddu $ac1, s3, t3 203 maddu $ac2, s6, t4 204 maddu $ac0, s1, t4 205 maddu $ac1, s4, t4 206 maddu $ac2, s7, t5 207 extr.w t3, $ac0, 16 208 extr.w t4, $ac1, 16 209 extr.w t5, $ac2, 16 210 sb t3, 0(t0) 211 sb t4, 0(t1) 212 sb t5, 0(t2) 213 addiu t0, 1 214 addiu t2, 1 215 bne t2, t9, 1b 216 addiu t1, 1 217 bgtz t7, 0b 218 addiu a1, 4 219 220 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 221 222 j ra 223 nop 224 END(jsimd_\colorid\()_ycc_convert_mips_dspr2) 225 226 .purgem DO_RGB_TO_YCC 227 228 .endm 229 230 /*------------------------------------------id -- pix R G B */ 231 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 232 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 233 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 234 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 235 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 236 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 237 238 /*****************************************************************************/ 239 /* 240 * jsimd_ycc_extrgb_convert_mips_dspr2 241 * jsimd_ycc_extbgr_convert_mips_dspr2 242 * jsimd_ycc_extrgbx_convert_mips_dspr2 243 * jsimd_ycc_extbgrx_convert_mips_dspr2 244 * jsimd_ycc_extxbgr_convert_mips_dspr2 245 * jsimd_ycc_extxrgb_convert_mips_dspr2 246 * 247 * Colorspace conversion YCbCr -> RGB 248 */ 249 250 .macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs 251 252 .macro STORE_YCC_TO_RGB scratch0 \ 253 scratch1 \ 254 scratch2 \ 255 outptr 256 sb \scratch0, \r_offs(\outptr) 257 sb \scratch1, \g_offs(\outptr) 258 sb \scratch2, \b_offs(\outptr) 259 .if (\pixel_size == 4) 260 li t0, 0xFF 261 sb t0, \a_offs(\outptr) 262 .endif 263 addiu \outptr, \pixel_size 264 .endm 265 266 
LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2) 267 /* 268 * a0 - cinfo->image_width 269 * a1 - input_buf 270 * a2 - input_row 271 * a3 - output_buf 272 * 16(sp) - num_rows 273 */ 274 275 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 276 277 lw s1, 48(sp) 278 li t3, 0x8000 279 li t4, 0x166e9 // FIX(1.40200) 280 li t5, 0x1c5a2 // FIX(1.77200) 281 li t6, 0xffff492e // -FIX(0.71414) 282 li t7, 0xffffa7e6 // -FIX(0.34414) 283 repl.ph t8, 128 284 285 0: 286 lw s0, 0(a3) 287 lw t0, 0(a1) 288 lw t1, 4(a1) 289 lw t2, 8(a1) 290 sll s5, a2, 2 291 addiu s1, -1 292 lwx s2, s5(t0) 293 lwx s3, s5(t1) 294 lwx s4, s5(t2) 295 addu t9, s2, a0 296 addiu a2, 1 297 298 1: 299 lbu s7, 0(s4) // cr 300 lbu s6, 0(s3) // cb 301 lbu s5, 0(s2) // y 302 addiu s2, 1 303 addiu s4, 1 304 addiu s7, -128 305 addiu s6, -128 306 mul t2, t7, s6 307 mul t0, t6, s7 // Crgtab[cr] 308 sll s7, 15 309 mulq_rs.w t1, t4, s7 // Crrtab[cr] 310 sll s6, 15 311 addu t2, t3 // Cbgtab[cb] 312 addu t2, t0 313 314 mulq_rs.w t0, t5, s6 // Cbbtab[cb] 315 sra t2, 16 316 addu t1, s5 317 addu t2, s5 // add y 318 ins t2, t1, 16, 16 319 subu.ph t2, t2, t8 320 addu t0, s5 321 shll_s.ph t2, t2, 8 322 subu t0, 128 323 shra.ph t2, t2, 8 324 shll_s.w t0, t0, 24 325 addu.ph t2, t2, t8 // clip & store 326 sra t0, t0, 24 327 sra t1, t2, 16 328 addiu t0, 128 329 330 STORE_YCC_TO_RGB t1, t2, t0, s0 331 332 bne s2, t9, 1b 333 addiu s3, 1 334 bgtz s1, 0b 335 addiu a3, 4 336 337 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 338 339 j ra 340 nop 341 END(jsimd_ycc_\colorid\()_convert_mips_dspr2) 342 343 .purgem STORE_YCC_TO_RGB 344 345 .endm 346 347 /*------------------------------------------id -- pix R G B A */ 348 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3 349 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3 350 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3 351 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3 352 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0 353 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0 354 355 /*****************************************************************************/ 356 /* 357 * jsimd_extrgb_gray_convert_mips_dspr2 358 * jsimd_extbgr_gray_convert_mips_dspr2 359 * jsimd_extrgbx_gray_convert_mips_dspr2 360 * jsimd_extbgrx_gray_convert_mips_dspr2 361 * jsimd_extxbgr_gray_convert_mips_dspr2 362 * jsimd_extxrgb_gray_convert_mips_dspr2 363 * 364 * Colorspace conversion RGB -> GRAY 365 */ 366 367 .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs 368 369 .macro DO_RGB_TO_GRAY r, \ 370 g, \ 371 b, \ 372 inptr 373 lbu \r, \r_offs(\inptr) 374 lbu \g, \g_offs(\inptr) 375 lbu \b, \b_offs(\inptr) 376 addiu \inptr, \pixel_size 377 .endm 378 379 LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2) 380 /* 381 * a0 - cinfo->image_width 382 * a1 - input_buf 383 * a2 - output_buf 384 * a3 - output_row 385 * 16(sp) - num_rows 386 */ 387 388 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 389 390 li s0, 0x4c8b // s0 = FIX(0.29900) 391 li s1, 0x9646 // s1 = FIX(0.58700) 392 li s2, 0x1d2f // s2 = FIX(0.11400) 393 li s7, 0x8000 // s7 = FIX(0.50000) 394 lw s6, 48(sp) 395 andi t7, a0, 3 396 397 0: 398 addiu s6, -1 // s6 = num_rows 399 lw t0, 0(a1) 400 lw t1, 0(a2) 401 sll t3, a3, 2 402 lwx t1, t3(t1) 403 addiu a3, 1 404 addu t9, t1, a0 405 subu t8, t9, t7 406 beq t1, t8, 2f 407 nop 408 409 1: 410 DO_RGB_TO_GRAY t3, t4, t5, t0 411 DO_RGB_TO_GRAY s3, s4, s5, t0 412 413 
mtlo s7, $ac0 414 maddu $ac0, s2, t5 415 maddu $ac0, s1, t4 416 maddu $ac0, s0, t3 417 mtlo s7, $ac1 418 maddu $ac1, s2, s5 419 maddu $ac1, s1, s4 420 maddu $ac1, s0, s3 421 extr.w t6, $ac0, 16 422 423 DO_RGB_TO_GRAY t3, t4, t5, t0 424 DO_RGB_TO_GRAY s3, s4, s5, t0 425 426 mtlo s7, $ac0 427 maddu $ac0, s2, t5 428 maddu $ac0, s1, t4 429 extr.w t2, $ac1, 16 430 maddu $ac0, s0, t3 431 mtlo s7, $ac1 432 maddu $ac1, s2, s5 433 maddu $ac1, s1, s4 434 maddu $ac1, s0, s3 435 extr.w t5, $ac0, 16 436 sb t6, 0(t1) 437 sb t2, 1(t1) 438 extr.w t3, $ac1, 16 439 addiu t1, 4 440 sb t5, -2(t1) 441 sb t3, -1(t1) 442 bne t1, t8, 1b 443 nop 444 445 2: 446 beqz t7, 4f 447 nop 448 449 3: 450 DO_RGB_TO_GRAY t3, t4, t5, t0 451 452 mtlo s7, $ac0 453 maddu $ac0, s2, t5 454 maddu $ac0, s1, t4 455 maddu $ac0, s0, t3 456 extr.w t6, $ac0, 16 457 sb t6, 0(t1) 458 addiu t1, 1 459 bne t1, t9, 3b 460 nop 461 462 4: 463 bgtz s6, 0b 464 addiu a1, 4 465 466 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 467 468 j ra 469 nop 470 END(jsimd_\colorid\()_gray_convert_mips_dspr2) 471 472 .purgem DO_RGB_TO_GRAY 473 474 .endm 475 476 /*------------------------------------------id -- pix R G B */ 477 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 478 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 479 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 480 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 481 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 482 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 483 /*****************************************************************************/ 484 /* 485 * jsimd_h2v2_merged_upsample_mips_dspr2 486 * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2 487 * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2 488 * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2 489 * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2 490 * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2 491 * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2 492 * 493 * Merged h2v2 upsample routines 494 */ 495 .macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ 496 pixel_size, \ 497 r1_offs, \ 498 g1_offs, \ 499 b1_offs, \ 500 a1_offs, \ 501 r2_offs, \ 502 g2_offs, \ 503 b2_offs, \ 504 a2_offs 505 506 .macro STORE_H2V2_2_PIXELS scratch0 \ 507 scratch1 \ 508 scratch2 \ 509 scratch3 \ 510 scratch4 \ 511 scratch5 \ 512 outptr 513 sb \scratch0, \r1_offs(\outptr) 514 sb \scratch1, \g1_offs(\outptr) 515 sb \scratch2, \b1_offs(\outptr) 516 sb \scratch3, \r2_offs(\outptr) 517 sb \scratch4, \g2_offs(\outptr) 518 sb \scratch5, \b2_offs(\outptr) 519 .if (\pixel_size == 8) 520 li \scratch0, 0xFF 521 sb \scratch0, \a1_offs(\outptr) 522 sb \scratch0, \a2_offs(\outptr) 523 .endif 524 addiu \outptr, \pixel_size 525 .endm 526 527 .macro STORE_H2V2_1_PIXEL scratch0 \ 528 scratch1 \ 529 scratch2 \ 530 outptr 531 sb \scratch0, \r1_offs(\outptr) 532 sb \scratch1, \g1_offs(\outptr) 533 sb \scratch2, \b1_offs(\outptr) 534 535 .if (\pixel_size == 8) 536 li t0, 0xFF 537 sb t0, \a1_offs(\outptr) 538 .endif 539 .endm 540 541 LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) 542 /* 543 * a0 - cinfo->output_width 544 * a1 - input_buf 545 * a2 - in_row_group_ctr 546 * a3 - output_buf 547 * 16(sp) - cinfo->sample_range_limit 548 */ 549 550 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 551 552 lw t9, 56(sp) // cinfo->sample_range_limit 553 lw v0, 0(a1) 554 lw v1, 4(a1) 555 lw t0, 8(a1) 556 sll t1, a2, 3 557 addiu t2, t1, 4 558 sll t3, a2, 2 559 lw t4, 
0(a3) // t4 = output_buf[0] 560 lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2] 561 lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1] 562 lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr] 563 lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr] 564 lw t7, 4(a3) // t7 = output_buf[1] 565 li s1, 0xe6ea 566 addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)] 567 addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)] 568 addiu s1, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] 569 xori s2, s1, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] 570 srl t3, a0, 1 571 blez t3, 2f 572 addu t0, t5, t3 // t0 = end address 573 1: 574 lbu t3, 0(t5) 575 lbu s3, 0(t6) 576 addiu t5, t5, 1 577 addiu t3, t3, -128 // (cb - 128) 578 addiu s3, s3, -128 // (cr - 128) 579 mult $ac1, s1, t3 580 madd $ac1, s2, s3 581 sll s3, s3, 15 582 sll t3, t3, 15 583 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS 584 extr_r.w s5, $ac1, 16 585 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS 586 lbu v0, 0(t1) 587 addiu t6, t6, 1 588 addiu t1, t1, 2 589 addu t3, v0, s4 // y+cred 590 addu s3, v0, s5 // y+cgreen 591 addu v1, v0, s6 // y+cblue 592 addu t3, t9, t3 // y+cred 593 addu s3, t9, s3 // y+cgreen 594 addu v1, t9, v1 // y+cblue 595 lbu AT, 0(t3) 596 lbu s7, 0(s3) 597 lbu ra, 0(v1) 598 lbu v0, -1(t1) 599 addu t3, v0, s4 // y+cred 600 addu s3, v0, s5 // y+cgreen 601 addu v1, v0, s6 // y+cblue 602 addu t3, t9, t3 // y+cred 603 addu s3, t9, s3 // y+cgreen 604 addu v1, t9, v1 // y+cblue 605 lbu t3, 0(t3) 606 lbu s3, 0(s3) 607 lbu v1, 0(v1) 608 lbu v0, 0(t2) 609 610 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 611 612 addu t3, v0, s4 // y+cred 613 addu s3, v0, s5 // y+cgreen 614 addu v1, v0, s6 // y+cblue 615 addu t3, t9, t3 // y+cred 616 addu s3, t9, s3 // y+cgreen 617 addu v1, t9, v1 // y+cblue 618 lbu AT, 0(t3) 619 lbu s7, 0(s3) 620 lbu ra, 0(v1) 621 lbu v0, 1(t2) 622 addiu t2, t2, 2 623 addu t3, v0, s4 // y+cred 624 addu s3, v0, s5 // y+cgreen 625 addu v1, v0, s6 // y+cblue 626 addu t3, t9, t3 // y+cred 627 addu s3, t9, s3 // y+cgreen 628 addu v1, t9, v1 // y+cblue 629 lbu t3, 0(t3) 630 lbu s3, 0(s3) 631 lbu v1, 0(v1) 632 633 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 634 635 bne t0, t5, 1b 636 nop 637 2: 638 andi t0, a0, 1 639 beqz t0, 4f 640 lbu t3, 0(t5) 641 lbu s3, 0(t6) 642 addiu t3, t3, -128 // (cb - 128) 643 addiu s3, s3, -128 // (cr - 128) 644 mult $ac1, s1, t3 645 madd $ac1, s2, s3 646 sll s3, s3, 15 647 sll t3, t3, 15 648 lbu v0, 0(t1) 649 extr_r.w s5, $ac1, 16 650 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS 651 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS 652 addu t3, v0, s4 // y+cred 653 addu s3, v0, s5 // y+cgreen 654 addu v1, v0, s6 // y+cblue 655 addu t3, t9, t3 // y+cred 656 addu s3, t9, s3 // y+cgreen 657 addu v1, t9, v1 // y+cblue 658 lbu t3, 0(t3) 659 lbu s3, 0(s3) 660 lbu v1, 0(v1) 661 lbu v0, 0(t2) 662 663 STORE_H2V2_1_PIXEL t3, s3, v1, t4 664 665 addu t3, v0, s4 // y+cred 666 addu s3, v0, s5 // y+cgreen 667 addu v1, v0, s6 // y+cblue 668 addu t3, t9, t3 // y+cred 669 addu s3, t9, s3 // y+cgreen 670 addu v1, t9, v1 // y+cblue 671 lbu t3, 0(t3) 672 lbu s3, 0(s3) 673 lbu v1, 0(v1) 674 675 STORE_H2V2_1_PIXEL t3, s3, v1, t7 676 4: 677 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 678 679 j ra 680 nop 681 682 END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) 683 684 .purgem STORE_H2V2_1_PIXEL 685 .purgem STORE_H2V2_2_PIXELS 686 .endm 687 688 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 
G2 B2 A2 */ 689 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 690 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 691 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 692 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 693 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 694 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 695 /*****************************************************************************/ 696 /* 697 * jsimd_h2v1_merged_upsample_mips_dspr2 698 * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2 699 * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2 700 * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2 701 * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2 702 * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2 703 * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2 704 * 705 * Merged h2v1 upsample routines 706 */ 707 708 .macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ 709 pixel_size, \ 710 r1_offs, \ 711 g1_offs, \ 712 b1_offs, \ 713 a1_offs, \ 714 r2_offs, \ 715 g2_offs, \ 716 b2_offs, \ 717 a2_offs 718 719 .macro STORE_H2V1_2_PIXELS scratch0 \ 720 scratch1 \ 721 scratch2 \ 722 scratch3 \ 723 scratch4 \ 724 scratch5 \ 725 outptr 726 sb \scratch0, \r1_offs(\outptr) 727 sb \scratch1, \g1_offs(\outptr) 728 sb \scratch2, \b1_offs(\outptr) 729 sb \scratch3, \r2_offs(\outptr) 730 sb \scratch4, \g2_offs(\outptr) 731 sb \scratch5, \b2_offs(\outptr) 732 .if (\pixel_size == 8) 733 li t0, 0xFF 734 sb t0, \a1_offs(\outptr) 735 sb t0, \a2_offs(\outptr) 736 .endif 737 addiu \outptr, \pixel_size 738 .endm 739 740 .macro STORE_H2V1_1_PIXEL scratch0 \ 741 scratch1 \ 742 scratch2 \ 743 outptr 744 sb \scratch0, \r1_offs(\outptr) 745 sb \scratch1, \g1_offs(\outptr) 746 sb \scratch2, \b1_offs(\outptr) 747 .if (\pixel_size == 8) 748 li t0, 0xFF 749 sb t0, \a1_offs(\outptr) 750 .endif 751 .endm 752 753 LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) 754 /* 755 * a0 - cinfo->output_width 756 * a1 - input_buf 757 * a2 - in_row_group_ctr 758 * a3 - output_buf 759 * 16(sp) - range_limit 760 */ 761 762 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 763 764 li t0, 0xe6ea 765 lw t1, 0(a1) // t1 = input_buf[0] 766 lw t2, 4(a1) // t2 = input_buf[1] 767 lw t3, 8(a1) // t3 = input_buf[2] 768 lw t8, 56(sp) // t8 = range_limit 769 addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)] 770 addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)] 771 addiu s0, t0, 0x9916 // s0 = 0x8000 772 addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] 773 xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] 774 srl t0, a0, 1 775 sll t4, a2, 2 776 lwx s5, t4(t1) // s5 = inptr0 777 lwx s6, t4(t2) // s6 = inptr1 778 lwx s7, t4(t3) // s7 = inptr2 779 lw t7, 0(a3) // t7 = outptr 780 blez t0, 2f 781 addu t9, s6, t0 // t9 = end address 782 1: 783 lbu t2, 0(s6) // t2 = cb 784 lbu t0, 0(s7) // t0 = cr 785 lbu t1, 0(s5) // t1 = y 786 addiu t2, t2, -128 // t2 = cb - 128 787 addiu t0, t0, -128 // t0 = cr - 128 788 mult $ac1, s4, t2 789 madd $ac1, s3, t0 790 sll t0, t0, 15 791 sll t2, t2, 15 792 mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS 793 extr_r.w t5, $ac1, 16 794 mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS 795 addiu s7, s7, 1 796 addiu s6, s6, 1 797 addu t2, t1, t0 // t2 = y + cred 798 addu t3, t1, t5 // t3 = y + cgreen 799 addu t4, t1, t6 // t4 = y + cblue 800 addu t2, t8, t2 801 addu t3, t8, t3 802 addu t4, t8, t4 
803 lbu t1, 1(s5) 804 lbu v0, 0(t2) 805 lbu v1, 0(t3) 806 lbu ra, 0(t4) 807 addu t2, t1, t0 808 addu t3, t1, t5 809 addu t4, t1, t6 810 addu t2, t8, t2 811 addu t3, t8, t3 812 addu t4, t8, t4 813 lbu t2, 0(t2) 814 lbu t3, 0(t3) 815 lbu t4, 0(t4) 816 817 STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 818 819 bne t9, s6, 1b 820 addiu s5, s5, 2 821 2: 822 andi t0, a0, 1 823 beqz t0, 4f 824 nop 825 3: 826 lbu t2, 0(s6) 827 lbu t0, 0(s7) 828 lbu t1, 0(s5) 829 addiu t2, t2, -128 //(cb - 128) 830 addiu t0, t0, -128 //(cr - 128) 831 mul t3, s4, t2 832 mul t4, s3, t0 833 sll t0, t0, 15 834 sll t2, t2, 15 835 mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS 836 mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS 837 addu t3, t3, s0 838 addu t3, t4, t3 839 sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS 840 addu t2, t1, t0 // y + cred 841 addu t3, t1, t5 // y + cgreen 842 addu t4, t1, t6 // y + cblue 843 addu t2, t8, t2 844 addu t3, t8, t3 845 addu t4, t8, t4 846 lbu t2, 0(t2) 847 lbu t3, 0(t3) 848 lbu t4, 0(t4) 849 850 STORE_H2V1_1_PIXEL t2, t3, t4, t7 851 4: 852 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 853 854 j ra 855 nop 856 857 END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) 858 859 .purgem STORE_H2V1_1_PIXEL 860 .purgem STORE_H2V1_2_PIXELS 861 .endm 862 863 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ 864 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 865 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 866 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 867 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 868 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 869 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 870 /*****************************************************************************/ 871 /* 872 * jsimd_h2v2_fancy_upsample_mips_dspr2 873 * 874 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. 
875 */ 876 LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2) 877 /* 878 * a0 - cinfo->max_v_samp_factor 879 * a1 - downsampled_width 880 * a2 - input_data 881 * a3 - output_data_ptr 882 */ 883 884 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 885 886 li s4, 0 887 lw s2, 0(a3) // s2 = *output_data_ptr 888 0: 889 li t9, 2 890 lw s1, -4(a2) // s1 = inptr1 891 892 1: 893 lw s0, 0(a2) // s0 = inptr0 894 lwx s3, s4(s2) 895 addiu s5, a1, -2 // s5 = downsampled_width - 2 896 srl t4, s5, 1 897 sll t4, t4, 1 898 lbu t0, 0(s0) 899 lbu t1, 1(s0) 900 lbu t2, 0(s1) 901 lbu t3, 1(s1) 902 addiu s0, 2 903 addiu s1, 2 904 addu t8, s0, t4 // t8 = end address 905 andi s5, s5, 1 // s5 = residual 906 sll t4, t0, 1 907 sll t6, t1, 1 908 addu t0, t0, t4 // t0 = (*inptr0++) * 3 909 addu t1, t1, t6 // t1 = (*inptr0++) * 3 910 addu t7, t0, t2 // t7 = thiscolsum 911 addu t6, t1, t3 // t5 = nextcolsum 912 sll t0, t7, 2 // t0 = thiscolsum * 4 913 subu t1, t0, t7 // t1 = thiscolsum * 3 914 shra_r.w t0, t0, 4 915 addiu t1, 7 916 addu t1, t1, t6 917 srl t1, t1, 4 918 sb t0, 0(s3) 919 sb t1, 1(s3) 920 beq t8, s0, 22f // skip to final iteration if width == 3 921 addiu s3, 2 922 2: 923 lh t0, 0(s0) // t0 = A3|A2 924 lh t2, 0(s1) // t2 = B3|B2 925 addiu s0, 2 926 addiu s1, 2 927 preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 928 preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 929 shll.ph t1, t0, 1 930 sll t3, t6, 1 931 addu.ph t0, t1, t0 // t0 = A3*3|A2*3 932 addu t3, t3, t6 // t3 = this * 3 933 addu.ph t0, t0, t2 // t0 = next2|next1 934 addu t1, t3, t7 935 andi t7, t0, 0xFFFF // t7 = next1 936 sll t2, t7, 1 937 addu t2, t7, t2 // t2 = next1*3 938 addu t4, t2, t6 939 srl t6, t0, 16 // t6 = next2 940 shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 941 addu t0, t3, t7 942 addiu t0, 7 943 srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4 944 shra_r.w t4, t4, 4 // t3 = (next1*3 + this + 8) >> 4 945 addu t2, t2, t6 946 addiu t2, 7 947 srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 948 sb t1, 0(s3) 949 sb t0, 1(s3) 950 sb t4, 2(s3) 951 sb t2, 3(s3) 952 bne t8, s0, 2b 953 addiu s3, 4 954 22: 955 beqz s5, 4f 956 addu t8, s0, s5 957 3: 958 lbu t0, 0(s0) 959 lbu t2, 0(s1) 960 addiu s0, 1 961 addiu s1, 1 962 sll t3, t6, 1 963 sll t1, t0, 1 964 addu t1, t0, t1 // t1 = inptr0 * 3 965 addu t3, t3, t6 // t3 = thiscolsum * 3 966 addu t5, t1, t2 967 addu t1, t3, t7 968 shra_r.w t1, t1, 4 969 addu t0, t3, t5 970 addiu t0, 7 971 srl t0, t0, 4 972 sb t1, 0(s3) 973 sb t0, 1(s3) 974 addiu s3, 2 975 move t7, t6 976 bne t8, s0, 3b 977 move t6, t5 978 4: 979 sll t0, t6, 2 // t0 = thiscolsum * 4 980 subu t1, t0, t6 // t1 = thiscolsum * 3 981 addu t1, t1, t7 982 addiu s4, 4 983 shra_r.w t1, t1, 4 984 addiu t0, 7 985 srl t0, t0, 4 986 sb t1, 0(s3) 987 sb t0, 1(s3) 988 addiu t9, -1 989 addiu s3, 2 990 bnez t9, 1b 991 lw s1, 4(a2) 992 srl t0, s4, 2 993 subu t0, a0, t0 994 bgtz t0, 0b 995 addiu a2, 4 996 997 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 998 999 j ra 1000 nop 1001 END(jsimd_h2v2_fancy_upsample_mips_dspr2) 1002 1003 /*****************************************************************************/ 1004 LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2) 1005 /* 1006 * a0 - cinfo->max_v_samp_factor 1007 * a1 - downsampled_width 1008 * a2 - input_data 1009 * a3 - output_data_ptr 1010 */ 1011 1012 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 1013 1014 .set at 1015 1016 beqz a0, 3f 1017 sll t0, a0, 2 1018 lw s1, 0(a3) 1019 li s3, 0x10001 1020 addu s0, s1, t0 1021 0: 1022 addiu t8, a1, -2 1023 srl t9, t8, 2 1024 lw t7, 0(a2) 1025 lw s2, 0(s1) 1026 lbu t0, 
0(t7) 1027 lbu t1, 1(t7) // t1 = inptr[1] 1028 sll t2, t0, 1 1029 addu t2, t2, t0 // t2 = invalue*3 1030 addu t2, t2, t1 1031 shra_r.w t2, t2, 2 1032 sb t0, 0(s2) 1033 sb t2, 1(s2) 1034 beqz t9, 11f 1035 addiu s2, 2 1036 1: 1037 ulw t0, 0(t7) // t0 = |P3|P2|P1|P0| 1038 ulw t1, 1(t7) 1039 ulh t2, 4(t7) // t2 = |0|0|P5|P4| 1040 preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| 1041 preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| 1042 preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| 1043 preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| 1044 preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| 1045 shll.ph t5, t4, 1 1046 shll.ph t6, t1, 1 1047 addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| 1048 addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| 1049 addu.ph t4, t3, s3 1050 addu.ph t0, t0, s3 1051 addu.ph t4, t4, t5 1052 addu.ph t0, t0, t6 1053 shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| 1054 shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0| 1055 addu.ph t2, t2, t5 1056 addu.ph t3, t3, t6 1057 shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| 1058 shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| 1059 shll.ph t2, t2, 8 1060 shll.ph t3, t3, 8 1061 or t2, t4, t2 1062 or t3, t3, t0 1063 addiu t9, -1 1064 usw t3, 0(s2) 1065 usw t2, 4(s2) 1066 addiu s2, 8 1067 bgtz t9, 1b 1068 addiu t7, 4 1069 11: 1070 andi t8, 3 1071 beqz t8, 22f 1072 addiu t7, 1 1073 1074 2: 1075 lbu t0, 0(t7) 1076 addiu t7, 1 1077 sll t1, t0, 1 1078 addu t2, t0, t1 // t2 = invalue 1079 lbu t3, -2(t7) 1080 lbu t4, 0(t7) 1081 addiu t3, 1 1082 addiu t4, 2 1083 addu t3, t3, t2 1084 addu t4, t4, t2 1085 srl t3, 2 1086 srl t4, 2 1087 sb t3, 0(s2) 1088 sb t4, 1(s2) 1089 addiu t8, -1 1090 bgtz t8, 2b 1091 addiu s2, 2 1092 1093 22: 1094 lbu t0, 0(t7) 1095 lbu t2, -1(t7) 1096 sll t1, t0, 1 1097 addu t1, t1, t0 // t1 = invalue * 3 1098 addu t1, t1, t2 1099 addiu t1, 1 1100 srl t1, t1, 2 1101 sb t1, 0(s2) 1102 sb t0, 1(s2) 1103 addiu s1, 4 1104 bne s1, s0, 0b 1105 addiu a2, 4 1106 3: 1107 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 1108 1109 j ra 1110 nop 1111 END(jsimd_h2v1_fancy_upsample_mips_dspr2) 1112 1113 /*****************************************************************************/ 1114 LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2) 1115 /* 1116 * a0 - cinfo->image_width 1117 * a1 - cinfo->max_v_samp_factor 1118 * a2 - compptr->v_samp_factor 1119 * a3 - compptr->width_in_blocks 1120 * 16(sp) - input_data 1121 * 20(sp) - output_data 1122 */ 1123 .set at 1124 1125 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 1126 1127 beqz a2, 7f 1128 lw s1, 44(sp) // s1 = output_data 1129 lw s0, 40(sp) // s0 = input_data 1130 srl s2, a0, 2 1131 andi t9, a0, 2 1132 srl t7, t9, 1 1133 addu s2, t7, s2 1134 sll t0, a3, 3 // t0 = width_in_blocks*DCT 1135 srl t7, t0, 1 1136 subu s2, t7, s2 1137 0: 1138 andi t6, a0, 1 // t6 = temp_index 1139 addiu t6, -1 1140 lw t4, 0(s1) // t4 = outptr 1141 lw t5, 0(s0) // t5 = inptr0 1142 li s3, 0 // s3 = bias 1143 srl t7, a0, 1 // t7 = image_width1 1144 srl s4, t7, 2 1145 andi t8, t7, 3 1146 1: 1147 ulhu t0, 0(t5) 1148 ulhu t1, 2(t5) 1149 ulhu t2, 4(t5) 1150 ulhu t3, 6(t5) 1151 raddu.w.qb t0, t0 1152 raddu.w.qb t1, t1 1153 raddu.w.qb t2, t2 1154 raddu.w.qb t3, t3 1155 shra.ph t0, t0, 1 1156 shra_r.ph t1, t1, 1 1157 shra.ph t2, t2, 1 1158 shra_r.ph t3, t3, 1 1159 sb t0, 0(t4) 1160 sb t1, 1(t4) 1161 sb t2, 2(t4) 1162 sb t3, 3(t4) 1163 addiu s4, -1 1164 addiu t4, 4 1165 bgtz s4, 1b 1166 addiu t5, 8 1167 beqz t8, 3f 1168 addu s4, t4, t8 1169 2: 1170 ulhu t0, 0(t5) 1171 raddu.w.qb t0, t0 1172 addqh.w t0, t0, s3 1173 xori s3, s3, 1 1174 sb t0, 0(t4) 1175 addiu t4, 1 1176 bne t4, s4, 2b 1177 addiu t5, 2 1178 3: 1179 lbux t1, 
t6(t5) 1180 sll t1, 1 1181 addqh.w t2, t1, s3 // t2 = pixval1 1182 xori s3, s3, 1 1183 addqh.w t3, t1, s3 // t3 = pixval2 1184 blez s2, 5f 1185 append t3, t2, 8 1186 addu t5, t4, s2 // t5 = loop_end2 1187 4: 1188 ush t3, 0(t4) 1189 addiu s2, -1 1190 bgtz s2, 4b 1191 addiu t4, 2 1192 5: 1193 beqz t9, 6f 1194 nop 1195 sb t2, 0(t4) 1196 6: 1197 addiu s1, 4 1198 addiu a2, -1 1199 bnez a2, 0b 1200 addiu s0, 4 1201 7: 1202 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 1203 1204 j ra 1205 nop 1206 END(jsimd_h2v1_downsample_mips_dspr2) 1207 1208 /*****************************************************************************/ 1209 LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2) 1210 1211 /* 1212 * a0 - cinfo->image_width 1213 * a1 - cinfo->max_v_samp_factor 1214 * a2 - compptr->v_samp_factor 1215 * a3 - compptr->width_in_blocks 1216 * 16(sp) - input_data 1217 * 20(sp) - output_data 1218 */ 1219 .set at 1220 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1221 1222 beqz a2, 8f 1223 lw s1, 52(sp) // s1 = output_data 1224 lw s0, 48(sp) // s0 = input_data 1225 1226 andi t6, a0, 1 // t6 = temp_index 1227 addiu t6, -1 1228 srl t7, a0, 1 // t7 = image_width1 1229 srl s4, t7, 2 1230 andi t8, t7, 3 1231 andi t9, a0, 2 1232 srl s2, a0, 2 1233 srl t7, t9, 1 1234 addu s2, t7, s2 1235 sll t0, a3, 3 // s2 = width_in_blocks*DCT 1236 srl t7, t0, 1 1237 subu s2, t7, s2 1238 0: 1239 lw t4, 0(s1) // t4 = outptr 1240 lw t5, 0(s0) // t5 = inptr0 1241 lw s7, 4(s0) // s7 = inptr1 1242 li s6, 1 // s6 = bias 1243 2: 1244 ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| 1245 ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| 1246 ulw t2, 4(t5) 1247 ulw t3, 4(s7) 1248 precrq.ph.w t7, t0, t1 // t2 = |P3|P2|Q3|Q2| 1249 ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| 1250 raddu.w.qb t1, t7 1251 raddu.w.qb t0, t0 1252 shra_r.w t1, t1, 2 1253 addiu t0, 1 1254 srl t0, 2 1255 precrq.ph.w t7, t2, t3 1256 ins t2, t3, 16, 16 1257 raddu.w.qb t7, t7 1258 raddu.w.qb t2, t2 1259 shra_r.w t7, t7, 2 1260 addiu t2, 1 1261 srl t2, 2 1262 sb t0, 0(t4) 1263 sb t1, 1(t4) 1264 sb t2, 2(t4) 1265 sb t7, 3(t4) 1266 addiu t4, 4 1267 addiu t5, 8 1268 addiu s4, s4, -1 1269 bgtz s4, 2b 1270 addiu s7, 8 1271 beqz t8, 4f 1272 addu t8, t4, t8 1273 3: 1274 ulhu t0, 0(t5) 1275 ulhu t1, 0(s7) 1276 ins t0, t1, 16, 16 1277 raddu.w.qb t0, t0 1278 addu t0, t0, s6 1279 srl t0, 2 1280 xori s6, s6, 3 1281 sb t0, 0(t4) 1282 addiu t5, 2 1283 addiu t4, 1 1284 bne t8, t4, 3b 1285 addiu s7, 2 1286 4: 1287 lbux t1, t6(t5) 1288 sll t1, 1 1289 lbux t0, t6(s7) 1290 sll t0, 1 1291 addu t1, t1, t0 1292 addu t3, t1, s6 1293 srl t0, t3, 2 // t2 = pixval1 1294 xori s6, s6, 3 1295 addu t2, t1, s6 1296 srl t1, t2, 2 // t3 = pixval2 1297 blez s2, 6f 1298 append t1, t0, 8 1299 5: 1300 ush t1, 0(t4) 1301 addiu s2, -1 1302 bgtz s2, 5b 1303 addiu t4, 2 1304 6: 1305 beqz t9, 7f 1306 nop 1307 sb t0, 0(t4) 1308 7: 1309 addiu s1, 4 1310 addiu a2, -1 1311 bnez a2, 0b 1312 addiu s0, 8 1313 8: 1314 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1315 1316 j ra 1317 nop 1318 END(jsimd_h2v2_downsample_mips_dspr2) 1319 /*****************************************************************************/ 1320 LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2) 1321 /* 1322 * a0 - input_data 1323 * a1 - output_data 1324 * a2 - compptr->v_samp_factor 1325 * a3 - cinfo->max_v_samp_factor 1326 * 16(sp) - cinfo->smoothing_factor 1327 * 20(sp) - compptr->width_in_blocks 1328 * 24(sp) - cinfo->image_width 1329 */ 1330 1331 .set at 1332 1333 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1334 1335 lw s7, 52(sp) // 
compptr->width_in_blocks 1336 lw s0, 56(sp) // cinfo->image_width 1337 lw s6, 48(sp) // cinfo->smoothing_factor 1338 sll s7, 3 // output_cols = width_in_blocks * DCTSIZE 1339 sll v0, s7, 1 1340 subu v0, v0, s0 1341 blez v0, 2f 1342 move v1, zero 1343 addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 1344 0: 1345 addiu t1, a0, -4 1346 sll t2, v1, 2 1347 lwx t1, t2(t1) 1348 move t3, v0 1349 addu t1, t1, s0 1350 lbu t2, -1(t1) 1351 1: 1352 addiu t3, t3, -1 1353 sb t2, 0(t1) 1354 bgtz t3, 1b 1355 addiu t1, t1, 1 1356 addiu v1, v1, 1 1357 bne v1, t0, 0b 1358 nop 1359 2: 1360 li v0, 80 1361 mul v0, s6, v0 1362 li v1, 16384 1363 move t4, zero 1364 move t5, zero 1365 subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 1366 sll t7, s6, 4 // t7 = tmp_smoot_f * 16 1367 3: 1368 /* Special case for first column: pretend column -1 is same as column 0 */ 1369 sll v0, t4, 2 1370 lwx t8, v0(a1) // outptr = output_data[outrow] 1371 sll v1, t5, 2 1372 addiu t9, v1, 4 1373 addiu s0, v1, -4 1374 addiu s1, v1, 8 1375 lwx s2, v1(a0) // inptr0 = input_data[inrow] 1376 lwx t9, t9(a0) // inptr1 = input_data[inrow+1] 1377 lwx s0, s0(a0) // above_ptr = input_data[inrow-1] 1378 lwx s1, s1(a0) // below_ptr = input_data[inrow+2] 1379 lh v0, 0(s2) 1380 lh v1, 0(t9) 1381 lh t0, 0(s0) 1382 lh t1, 0(s1) 1383 ins v0, v1, 16, 16 1384 ins t0, t1, 16, 16 1385 raddu.w.qb t2, v0 1386 raddu.w.qb s3, t0 1387 lbu v0, 0(s2) 1388 lbu v1, 2(s2) 1389 lbu t0, 0(t9) 1390 lbu t1, 2(t9) 1391 addu v0, v0, v1 1392 mult $ac1,t2, t6 1393 addu t0, t0, t1 1394 lbu t2, 2(s0) 1395 addu t0, t0, v0 1396 lbu t3, 2(s1) 1397 addu s3, t0, s3 1398 lbu v0, 0(s0) 1399 lbu t0, 0(s1) 1400 sll s3, s3, 1 1401 addu v0, v0, t2 1402 addu t0, t0, t3 1403 addu t0, t0, v0 1404 addu s3, t0, s3 1405 madd $ac1,s3, t7 1406 extr_r.w v0, $ac1, 16 1407 addiu t8, t8, 1 1408 addiu s2, s2, 2 1409 addiu t9, t9, 2 1410 addiu s0, s0, 2 1411 addiu s1, s1, 2 1412 sb v0, -1(t8) 1413 addiu s4, s7, -2 1414 and s4, s4, 3 1415 addu s5, s4, t8 //end adress 1416 4: 1417 lh v0, 0(s2) 1418 lh v1, 0(t9) 1419 lh t0, 0(s0) 1420 lh t1, 0(s1) 1421 ins v0, v1, 16, 16 1422 ins t0, t1, 16, 16 1423 raddu.w.qb t2, v0 1424 raddu.w.qb s3, t0 1425 lbu v0, -1(s2) 1426 lbu v1, 2(s2) 1427 lbu t0, -1(t9) 1428 lbu t1, 2(t9) 1429 addu v0, v0, v1 1430 mult $ac1, t2, t6 1431 addu t0, t0, t1 1432 lbu t2, 2(s0) 1433 addu t0, t0, v0 1434 lbu t3, 2(s1) 1435 addu s3, t0, s3 1436 lbu v0, -1(s0) 1437 lbu t0, -1(s1) 1438 sll s3, s3, 1 1439 addu v0, v0, t2 1440 addu t0, t0, t3 1441 addu t0, t0, v0 1442 addu s3, t0, s3 1443 madd $ac1, s3, t7 1444 extr_r.w t2, $ac1, 16 1445 addiu t8, t8, 1 1446 addiu s2, s2, 2 1447 addiu t9, t9, 2 1448 addiu s0, s0, 2 1449 sb t2, -1(t8) 1450 bne s5, t8, 4b 1451 addiu s1, s1, 2 1452 addiu s5, s7, -2 1453 subu s5, s5, s4 1454 addu s5, s5, t8 //end adress 1455 5: 1456 lh v0, 0(s2) 1457 lh v1, 0(t9) 1458 lh t0, 0(s0) 1459 lh t1, 0(s1) 1460 ins v0, v1, 16, 16 1461 ins t0, t1, 16, 16 1462 raddu.w.qb t2, v0 1463 raddu.w.qb s3, t0 1464 lbu v0, -1(s2) 1465 lbu v1, 2(s2) 1466 lbu t0, -1(t9) 1467 lbu t1, 2(t9) 1468 addu v0, v0, v1 1469 mult $ac1, t2, t6 1470 addu t0, t0, t1 1471 lbu t2, 2(s0) 1472 addu t0, t0, v0 1473 lbu t3, 2(s1) 1474 addu s3, t0, s3 1475 lbu v0, -1(s0) 1476 lbu t0, -1(s1) 1477 sll s3, s3, 1 1478 addu v0, v0, t2 1479 addu t0, t0, t3 1480 lh v1, 2(t9) 1481 addu t0, t0, v0 1482 lh v0, 2(s2) 1483 addu s3, t0, s3 1484 lh t0, 2(s0) 1485 lh t1, 2(s1) 1486 madd $ac1, s3, t7 1487 extr_r.w t2, $ac1, 16 1488 ins t0, t1, 16, 16 1489 ins v0, v1, 16, 16 1490 raddu.w.qb s3, t0 1491 lbu v1, 
4(s2) 1492 lbu t0, 1(t9) 1493 lbu t1, 4(t9) 1494 sb t2, 0(t8) 1495 raddu.w.qb t3, v0 1496 lbu v0, 1(s2) 1497 addu t0, t0, t1 1498 mult $ac1, t3, t6 1499 addu v0, v0, v1 1500 lbu t2, 4(s0) 1501 addu t0, t0, v0 1502 lbu v0, 1(s0) 1503 addu s3, t0, s3 1504 lbu t0, 1(s1) 1505 lbu t3, 4(s1) 1506 addu v0, v0, t2 1507 sll s3, s3, 1 1508 addu t0, t0, t3 1509 lh v1, 4(t9) 1510 addu t0, t0, v0 1511 lh v0, 4(s2) 1512 addu s3, t0, s3 1513 lh t0, 4(s0) 1514 lh t1, 4(s1) 1515 madd $ac1, s3, t7 1516 extr_r.w t2, $ac1, 16 1517 ins t0, t1, 16, 16 1518 ins v0, v1, 16, 16 1519 raddu.w.qb s3, t0 1520 lbu v1, 6(s2) 1521 lbu t0, 3(t9) 1522 lbu t1, 6(t9) 1523 sb t2, 1(t8) 1524 raddu.w.qb t3, v0 1525 lbu v0, 3(s2) 1526 addu t0, t0,t1 1527 mult $ac1, t3, t6 1528 addu v0, v0, v1 1529 lbu t2, 6(s0) 1530 addu t0, t0, v0 1531 lbu v0, 3(s0) 1532 addu s3, t0, s3 1533 lbu t0, 3(s1) 1534 lbu t3, 6(s1) 1535 addu v0, v0, t2 1536 sll s3, s3, 1 1537 addu t0, t0, t3 1538 lh v1, 6(t9) 1539 addu t0, t0, v0 1540 lh v0, 6(s2) 1541 addu s3, t0, s3 1542 lh t0, 6(s0) 1543 lh t1, 6(s1) 1544 madd $ac1, s3, t7 1545 extr_r.w t3, $ac1, 16 1546 ins t0, t1, 16, 16 1547 ins v0, v1, 16, 16 1548 raddu.w.qb s3, t0 1549 lbu v1, 8(s2) 1550 lbu t0, 5(t9) 1551 lbu t1, 8(t9) 1552 sb t3, 2(t8) 1553 raddu.w.qb t2, v0 1554 lbu v0, 5(s2) 1555 addu t0, t0, t1 1556 mult $ac1, t2, t6 1557 addu v0, v0, v1 1558 lbu t2, 8(s0) 1559 addu t0, t0, v0 1560 lbu v0, 5(s0) 1561 addu s3, t0, s3 1562 lbu t0, 5(s1) 1563 lbu t3, 8(s1) 1564 addu v0, v0, t2 1565 sll s3, s3, 1 1566 addu t0, t0, t3 1567 addiu t8, t8, 4 1568 addu t0, t0, v0 1569 addiu s2, s2, 8 1570 addu s3, t0, s3 1571 addiu t9, t9, 8 1572 madd $ac1, s3, t7 1573 extr_r.w t1, $ac1, 16 1574 addiu s0, s0, 8 1575 addiu s1, s1, 8 1576 bne s5, t8, 5b 1577 sb t1, -1(t8) 1578 /* Special case for last column */ 1579 lh v0, 0(s2) 1580 lh v1, 0(t9) 1581 lh t0, 0(s0) 1582 lh t1, 0(s1) 1583 ins v0, v1, 16, 16 1584 ins t0, t1, 16, 16 1585 raddu.w.qb t2, v0 1586 raddu.w.qb s3, t0 1587 lbu v0, -1(s2) 1588 lbu v1, 1(s2) 1589 lbu t0, -1(t9) 1590 lbu t1, 1(t9) 1591 addu v0, v0, v1 1592 mult $ac1, t2, t6 1593 addu t0, t0, t1 1594 lbu t2, 1(s0) 1595 addu t0, t0, v0 1596 lbu t3, 1(s1) 1597 addu s3, t0, s3 1598 lbu v0, -1(s0) 1599 lbu t0, -1(s1) 1600 sll s3, s3, 1 1601 addu v0, v0, t2 1602 addu t0, t0, t3 1603 addu t0, t0, v0 1604 addu s3, t0, s3 1605 madd $ac1, s3, t7 1606 extr_r.w t0, $ac1, 16 1607 addiu t5, t5, 2 1608 sb t0, 0(t8) 1609 addiu t4, t4, 1 1610 bne t4, a2, 3b 1611 addiu t5, t5, 2 1612 1613 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1614 1615 j ra 1616 nop 1617 1618 END(jsimd_h2v2_smooth_downsample_mips_dspr2) 1619 1620 /*****************************************************************************/ 1621 LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2) 1622 /* 1623 * a0 - upsample->h_expand[compptr->component_index] 1624 * a1 - upsample->v_expand[compptr->component_index] 1625 * a2 - input_data 1626 * a3 - output_data_ptr 1627 * 16(sp) - cinfo->output_width 1628 * 20(sp) - cinfo->max_v_samp_factor 1629 */ 1630 .set at 1631 1632 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 1633 1634 lw s0, 0(a3) // s0 = output_data 1635 lw s1, 32(sp) // s1 = cinfo->output_width 1636 lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor 1637 li t6, 0 // t6 = inrow 1638 beqz s2, 10f 1639 li s3, 0 // s3 = outrow 1640 0: 1641 addu t0, a2, t6 1642 addu t7, s0, s3 1643 lw t3, 0(t0) // t3 = inptr 1644 lw t8, 0(t7) // t8 = outptr 1645 beqz s1, 4f 1646 addu t5, t8, s1 // t5 = outend 1647 1: 1648 lb t2, 0(t3) // t2 = invalue = *inptr++ 1649 
addiu t3, 1 1650 beqz a0, 3f 1651 move t0, a0 // t0 = h_expand 1652 2: 1653 sb t2, 0(t8) 1654 addiu t0, -1 1655 bgtz t0, 2b 1656 addiu t8, 1 1657 3: 1658 bgt t5, t8, 1b 1659 nop 1660 4: 1661 addiu t9, a1, -1 // t9 = v_expand - 1 1662 blez t9, 9f 1663 nop 1664 5: 1665 lw t3, 0(s0) 1666 lw t4, 4(s0) 1667 subu t0, s1, 0xF 1668 blez t0, 7f 1669 addu t5, t3, s1 // t5 = end address 1670 andi t7, s1, 0xF // t7 = residual 1671 subu t8, t5, t7 1672 6: 1673 ulw t0, 0(t3) 1674 ulw t1, 4(t3) 1675 ulw t2, 8(t3) 1676 usw t0, 0(t4) 1677 ulw t0, 12(t3) 1678 usw t1, 4(t4) 1679 usw t2, 8(t4) 1680 usw t0, 12(t4) 1681 addiu t3, 16 1682 bne t3, t8, 6b 1683 addiu t4, 16 1684 beqz t7, 8f 1685 nop 1686 7: 1687 lbu t0, 0(t3) 1688 sb t0, 0(t4) 1689 addiu t3, 1 1690 bne t3, t5, 7b 1691 addiu t4, 1 1692 8: 1693 addiu t9, -1 1694 bgtz t9, 5b 1695 addiu s0, 8 1696 9: 1697 addu s3, s3, a1 1698 bne s3, s2, 0b 1699 addiu t6, 1 1700 10: 1701 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 1702 1703 j ra 1704 nop 1705 END(jsimd_int_upsample_mips_dspr2) 1706 1707 /*****************************************************************************/ 1708 LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2) 1709 /* 1710 * a0 - cinfo->max_v_samp_factor 1711 * a1 - cinfo->output_width 1712 * a2 - input_data 1713 * a3 - output_data_ptr 1714 */ 1715 lw t7, 0(a3) // t7 = output_data 1716 andi t8, a1, 0xf // t8 = residual 1717 sll t0, a0, 2 1718 blez a0, 4f 1719 addu t9, t7, t0 // t9 = output_data end address 1720 0: 1721 lw t5, 0(t7) // t5 = outptr 1722 lw t6, 0(a2) // t6 = inptr 1723 addu t3, t5, a1 // t3 = outptr + output_width (end address) 1724 subu t3, t8 // t3 = end address - residual 1725 beq t5, t3, 2f 1726 move t4, t8 1727 1: 1728 ulw t0, 0(t6) // t0 = |P3|P2|P1|P0| 1729 ulw t2, 4(t6) // t2 = |P7|P6|P5|P4| 1730 srl t1, t0, 16 // t1 = |X|X|P3|P2| 1731 ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0| 1732 ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2| 1733 ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0| 1734 ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2| 1735 usw t0, 0(t5) 1736 usw t1, 4(t5) 1737 srl t0, t2, 16 // t0 = |X|X|P7|P6| 1738 ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4| 1739 ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6| 1740 ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4| 1741 ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6| 1742 usw t2, 8(t5) 1743 usw t0, 12(t5) 1744 addiu t5, 16 1745 bne t5, t3, 1b 1746 addiu t6, 8 1747 beqz t8, 3f 1748 move t4, t8 1749 2: 1750 lbu t1, 0(t6) 1751 sb t1, 0(t5) 1752 sb t1, 1(t5) 1753 addiu t4, -2 1754 addiu t6, 1 1755 bgtz t4, 2b 1756 addiu t5, 2 1757 3: 1758 addiu t7, 4 1759 bne t9, t7, 0b 1760 addiu a2, 4 1761 4: 1762 j ra 1763 nop 1764 END(jsimd_h2v1_upsample_mips_dspr2) 1765 1766 /*****************************************************************************/ 1767 LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2) 1768 /* 1769 * a0 - cinfo->max_v_samp_factor 1770 * a1 - cinfo->output_width 1771 * a2 - input_data 1772 * a3 - output_data_ptr 1773 */ 1774 lw t7, 0(a3) 1775 blez a0, 7f 1776 andi t9, a1, 0xf // t9 = residual 1777 0: 1778 lw t6, 0(a2) // t6 = inptr 1779 lw t5, 0(t7) // t5 = outptr 1780 addu t8, t5, a1 // t8 = outptr end address 1781 subu t8, t9 // t8 = end address - residual 1782 beq t5, t8, 2f 1783 move t4, t9 1784 1: 1785 ulw t0, 0(t6) 1786 srl t1, t0, 16 1787 ins t0, t0, 16, 16 1788 ins t0, t0, 8, 16 1789 ins t1, t1, 16, 16 1790 ins t1, t1, 8, 16 1791 ulw t2, 4(t6) 1792 usw t0, 0(t5) 1793 usw t1, 4(t5) 1794 srl t3, t2, 16 1795 ins t2, t2, 16, 16 1796 ins t2, t2, 8, 16 1797 ins t3, t3, 16, 16 1798 ins t3, t3, 8, 16 1799 usw t2, 8(t5) 1800 
usw t3, 12(t5) 1801 addiu t5, 16 1802 bne t5, t8, 1b 1803 addiu t6, 8 1804 beqz t9, 3f 1805 move t4, t9 1806 2: 1807 lbu t0, 0(t6) 1808 sb t0, 0(t5) 1809 sb t0, 1(t5) 1810 addiu t4, -2 1811 addiu t6, 1 1812 bgtz t4, 2b 1813 addiu t5, 2 1814 3: 1815 lw t6, 0(t7) // t6 = outptr[0] 1816 lw t5, 4(t7) // t5 = outptr[1] 1817 addu t4, t6, a1 // t4 = new end address 1818 beq a1, t9, 5f 1819 subu t8, t4, t9 1820 4: 1821 ulw t0, 0(t6) 1822 ulw t1, 4(t6) 1823 ulw t2, 8(t6) 1824 usw t0, 0(t5) 1825 ulw t0, 12(t6) 1826 usw t1, 4(t5) 1827 usw t2, 8(t5) 1828 usw t0, 12(t5) 1829 addiu t6, 16 1830 bne t6, t8, 4b 1831 addiu t5, 16 1832 beqz t9, 6f 1833 nop 1834 5: 1835 lbu t0, 0(t6) 1836 sb t0, 0(t5) 1837 addiu t6, 1 1838 bne t6, t4, 5b 1839 addiu t5, 1 1840 6: 1841 addiu t7, 8 1842 addiu a0, -2 1843 bgtz a0, 0b 1844 addiu a2, 4 1845 7: 1846 j ra 1847 nop 1848 END(jsimd_h2v2_upsample_mips_dspr2) 1849 1850 /*****************************************************************************/ 1851 LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2) 1852 /* 1853 * a0 - coef_block 1854 * a1 - compptr->dcttable 1855 * a2 - output 1856 * a3 - range_limit 1857 */ 1858 1859 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1860 1861 addiu sp, sp, -256 1862 move v0, sp 1863 addiu v1, zero, 8 // v1 = DCTSIZE = 8 1864 1: 1865 lh s4, 32(a0) // s4 = inptr[16] 1866 lh s5, 64(a0) // s5 = inptr[32] 1867 lh s6, 96(a0) // s6 = inptr[48] 1868 lh t1, 112(a0) // t1 = inptr[56] 1869 lh t7, 16(a0) // t7 = inptr[8] 1870 lh t5, 80(a0) // t5 = inptr[40] 1871 lh t3, 48(a0) // t3 = inptr[24] 1872 or s4, s4, t1 1873 or s4, s4, t3 1874 or s4, s4, t5 1875 or s4, s4, t7 1876 or s4, s4, s5 1877 or s4, s4, s6 1878 bnez s4, 2f 1879 addiu v1, v1, -1 1880 lh s5, 0(a1) // quantptr[DCTSIZE*0] 1881 lh s6, 0(a0) // inptr[DCTSIZE*0] 1882 mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0]) 1883 sll s5, s5, 2 1884 sw s5, 0(v0) 1885 sw s5, 32(v0) 1886 sw s5, 64(v0) 1887 sw s5, 96(v0) 1888 sw s5, 128(v0) 1889 sw s5, 160(v0) 1890 sw s5, 192(v0) 1891 b 3f 1892 sw s5, 224(v0) 1893 2: 1894 lh t0, 112(a1) 1895 lh t2, 48(a1) 1896 lh t4, 80(a1) 1897 lh t6, 16(a1) 1898 mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7]) 1899 mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3]) 1900 mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5]) 1901 mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1]) 1902 lh t4, 32(a1) 1903 lh t5, 32(a0) 1904 lh t6, 96(a1) 1905 lh t7, 96(a0) 1906 addu s0, t0, t1 // z3 = tmp0 + tmp2 1907 addu s1, t1, t2 // z2 = tmp1 + tmp2 1908 addu s2, t2, t3 // z4 = tmp1 + tmp3 1909 addu s3, s0, s2 // z3 + z4 1910 addiu t9, zero, 9633 // FIX_1_175875602 1911 mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) 1912 addu t8, t0, t3 // z1 = tmp0 + tmp3 1913 addiu t9, zero, 2446 // FIX_0_298631336 1914 mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) 1915 addiu t9, zero, 16819 // FIX_2_053119869 1916 mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) 1917 addiu t9, zero, 25172 // FIX_3_072711026 1918 mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) 1919 addiu t9, zero, 12299 // FIX_1_501321110 1920 mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) 1921 addiu t9, zero, 16069 // FIX_1_961570560 1922 mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560) 1923 addiu t9, zero, 3196 // FIX_0_390180644 1924 mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644) 1925 addiu t9, zero, 7373 // FIX_0_899976223 1926 mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223) 1927 addiu t9, zero, 20995 // 
FIX_2_562915447 1928 mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447) 1929 subu s0, s3, s0 // z3 += z5 1930 addu t0, t0, s0 // tmp0 += z3 1931 addu t1, t1, s0 // tmp2 += z3 1932 subu s2, s3, s2 // z4 += z5 1933 addu t2, t2, s2 // tmp1 += z4 1934 addu t3, t3, s2 // tmp3 += z4 1935 subu t0, t0, t8 // tmp0 += z1 1936 subu t1, t1, s1 // tmp2 += z2 1937 subu t2, t2, s1 // tmp1 += z2 1938 subu t3, t3, t8 // tmp3 += z1 1939 mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2]) 1940 addiu t9, zero, 6270 // FIX_0_765366865 1941 mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6]) 1942 lh t4, 0(a1) 1943 lh t5, 0(a0) 1944 lh t6, 64(a1) 1945 lh t7, 64(a0) 1946 mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865) 1947 mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0]) 1948 mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4]) 1949 addiu t9, zero, 4433 // FIX_0_541196100 1950 addu s3, s0, s1 // z2 + z3 1951 mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) 1952 addiu t9, zero, 15137 // FIX_1_847759065 1953 mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065) 1954 addu t4, t5, t6 1955 subu t5, t5, t6 1956 sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS 1957 sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS 1958 addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) 1959 subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) 1960 addu s0, t4, t7 1961 subu s1, t4, t7 1962 addu s2, t5, t6 1963 subu s3, t5, t6 1964 addu t4, s0, t3 1965 subu s0, s0, t3 1966 addu t3, s2, t1 1967 subu s2, s2, t1 1968 addu t1, s3, t2 1969 subu s3, s3, t2 1970 addu t2, s1, t0 1971 subu s1, s1, t0 1972 shra_r.w t4, t4, 11 1973 shra_r.w t3, t3, 11 1974 shra_r.w t1, t1, 11 1975 shra_r.w t2, t2, 11 1976 shra_r.w s1, s1, 11 1977 shra_r.w s3, s3, 11 1978 shra_r.w s2, s2, 11 1979 shra_r.w s0, s0, 11 1980 sw t4, 0(v0) 1981 sw t3, 32(v0) 1982 sw t1, 64(v0) 1983 sw t2, 96(v0) 1984 sw s1, 128(v0) 1985 sw s3, 160(v0) 1986 sw s2, 192(v0) 1987 sw s0, 224(v0) 1988 3: 1989 addiu a1, a1, 2 1990 addiu a0, a0, 2 1991 bgtz v1, 1b 1992 addiu v0, v0, 4 1993 move v0, sp 1994 addiu v1, zero, 8 1995 4: 1996 lw t0, 8(v0) // z2 = (JLONG) wsptr[2] 1997 lw t1, 24(v0) // z3 = (JLONG) wsptr[6] 1998 lw t2, 0(v0) // (JLONG) wsptr[0] 1999 lw t3, 16(v0) // (JLONG) wsptr[4] 2000 lw s4, 4(v0) // (JLONG) wsptr[1] 2001 lw s5, 12(v0) // (JLONG) wsptr[3] 2002 lw s6, 20(v0) // (JLONG) wsptr[5] 2003 lw s7, 28(v0) // (JLONG) wsptr[7] 2004 or s4, s4, t0 2005 or s4, s4, t1 2006 or s4, s4, t3 2007 or s4, s4, s7 2008 or s4, s4, s5 2009 or s4, s4, s6 2010 bnez s4, 5f 2011 addiu v1, v1, -1 2012 shra_r.w s5, t2, 5 2013 andi s5, s5, 0x3ff 2014 lbux s5, s5(a3) 2015 lw s1, 0(a2) 2016 replv.qb s5, s5 2017 usw s5, 0(s1) 2018 usw s5, 4(s1) 2019 b 6f 2020 nop 2021 5: 2022 addu t4, t0, t1 // z2 + z3 2023 addiu t8, zero, 4433 // FIX_0_541196100 2024 mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) 2025 addiu t8, zero, 15137 // FIX_1_847759065 2026 mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065) 2027 addiu t8, zero, 6270 // FIX_0_765366865 2028 mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865) 2029 addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4] 2030 subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4] 2031 sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS 2032 sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS 2033 subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) 2034 subu t3, t2, t1 // tmp12 = tmp1 - tmp2 2035 addu t2, t2, t1 // tmp11 = tmp1 + tmp2 2036 addu t5, t5, t0 // 
tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) 2037 subu t1, t4, t5 // tmp13 = tmp0 - tmp3 2038 addu t0, t4, t5 // tmp10 = tmp0 + tmp3 2039 lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7] 2040 lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3] 2041 lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5] 2042 lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1] 2043 addu s0, t4, t6 // z3 = tmp0 + tmp2 2044 addiu t8, zero, 9633 // FIX_1_175875602 2045 addu s1, t5, t7 // z4 = tmp1 + tmp3 2046 addu s2, s0, s1 // z3 + z4 2047 mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) 2048 addu s3, t4, t7 // z1 = tmp0 + tmp3 2049 addu t9, t5, t6 // z2 = tmp1 + tmp2 2050 addiu t8, zero, 16069 // FIX_1_961570560 2051 mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560) 2052 addiu t8, zero, 3196 // FIX_0_390180644 2053 mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644) 2054 addiu t8, zero, 2446 // FIX_0_298631336 2055 mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) 2056 addiu t8, zero, 7373 // FIX_0_899976223 2057 mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223) 2058 addiu t8, zero, 16819 // FIX_2_053119869 2059 mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) 2060 addiu t8, zero, 20995 // FIX_2_562915447 2061 mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447) 2062 addiu t8, zero, 25172 // FIX_3_072711026 2063 mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) 2064 addiu t8, zero, 12299 // FIX_1_501321110 2065 mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) 2066 subu s0, s2, s0 // z3 += z5 2067 subu s1, s2, s1 // z4 += z5 2068 addu t4, t4, s0 2069 subu t4, t4, s3 // tmp0 2070 addu t5, t5, s1 2071 subu t5, t5, t9 // tmp1 2072 addu t6, t6, s0 2073 subu t6, t6, t9 // tmp2 2074 addu t7, t7, s1 2075 subu t7, t7, s3 // tmp3 2076 addu s0, t0, t7 2077 subu t0, t0, t7 2078 addu t7, t2, t6 2079 subu t2, t2, t6 2080 addu t6, t3, t5 2081 subu t3, t3, t5 2082 addu t5, t1, t4 2083 subu t1, t1, t4 2084 shra_r.w s0, s0, 18 2085 shra_r.w t7, t7, 18 2086 shra_r.w t6, t6, 18 2087 shra_r.w t5, t5, 18 2088 shra_r.w t1, t1, 18 2089 shra_r.w t3, t3, 18 2090 shra_r.w t2, t2, 18 2091 shra_r.w t0, t0, 18 2092 andi s0, s0, 0x3ff 2093 andi t7, t7, 0x3ff 2094 andi t6, t6, 0x3ff 2095 andi t5, t5, 0x3ff 2096 andi t1, t1, 0x3ff 2097 andi t3, t3, 0x3ff 2098 andi t2, t2, 0x3ff 2099 andi t0, t0, 0x3ff 2100 lw s1, 0(a2) 2101 lbux s0, s0(a3) 2102 lbux t7, t7(a3) 2103 lbux t6, t6(a3) 2104 lbux t5, t5(a3) 2105 lbux t1, t1(a3) 2106 lbux t3, t3(a3) 2107 lbux t2, t2(a3) 2108 lbux t0, t0(a3) 2109 sb s0, 0(s1) 2110 sb t7, 1(s1) 2111 sb t6, 2(s1) 2112 sb t5, 3(s1) 2113 sb t1, 4(s1) 2114 sb t3, 5(s1) 2115 sb t2, 6(s1) 2116 sb t0, 7(s1) 2117 6: 2118 addiu v0, v0, 32 2119 bgtz v1, 4b 2120 addiu a2, a2, 4 2121 addiu sp, sp, 256 2122 2123 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2124 2125 j ra 2126 nop 2127 2128 END(jsimd_idct_islow_mips_dspr2) 2129 2130 /*****************************************************************************/ 2131 LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2) 2132 /* 2133 * a0 - inptr 2134 * a1 - quantptr 2135 * a2 - wsptr 2136 * a3 - mips_idct_ifast_coefs 2137 */ 2138 2139 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2140 2141 addiu t9, a0, 16 // end address 2142 or AT, a3, zero 2143 2144 0: 2145 lw s0, 0(a1) // quantptr[DCTSIZE*0] 2146 lw t0, 0(a0) // inptr[DCTSIZE*0] 2147 lw t1, 16(a0) // inptr[DCTSIZE*1] 2148 muleq_s.w.phl v0, t0, s0 // tmp0 ... 
2149 lw t2, 32(a0) // inptr[DCTSIZE*2] 2150 lw t3, 48(a0) // inptr[DCTSIZE*3] 2151 lw t4, 64(a0) // inptr[DCTSIZE*4] 2152 lw t5, 80(a0) // inptr[DCTSIZE*5] 2153 muleq_s.w.phr t0, t0, s0 // ... tmp0 ... 2154 lw t6, 96(a0) // inptr[DCTSIZE*6] 2155 lw t7, 112(a0) // inptr[DCTSIZE*7] 2156 or s4, t1, t2 2157 or s5, t3, t4 2158 bnez s4, 1f 2159 ins t0, v0, 16, 16 // ... tmp0 2160 bnez s5, 1f 2161 or s6, t5, t6 2162 or s6, s6, t7 2163 bnez s6, 1f 2164 sw t0, 0(a2) // wsptr[DCTSIZE*0] 2165 sw t0, 16(a2) // wsptr[DCTSIZE*1] 2166 sw t0, 32(a2) // wsptr[DCTSIZE*2] 2167 sw t0, 48(a2) // wsptr[DCTSIZE*3] 2168 sw t0, 64(a2) // wsptr[DCTSIZE*4] 2169 sw t0, 80(a2) // wsptr[DCTSIZE*5] 2170 sw t0, 96(a2) // wsptr[DCTSIZE*6] 2171 sw t0, 112(a2) // wsptr[DCTSIZE*7] 2172 addiu a0, a0, 4 2173 b 2f 2174 addiu a1, a1, 4 2175 2176 1: 2177 lw s1, 32(a1) // quantptr[DCTSIZE*2] 2178 lw s2, 64(a1) // quantptr[DCTSIZE*4] 2179 muleq_s.w.phl v0, t2, s1 // tmp1 ... 2180 muleq_s.w.phr t2, t2, s1 // ... tmp1 ... 2181 lw s0, 16(a1) // quantptr[DCTSIZE*1] 2182 lw s1, 48(a1) // quantptr[DCTSIZE*3] 2183 lw s3, 96(a1) // quantptr[DCTSIZE*6] 2184 muleq_s.w.phl v1, t4, s2 // tmp2 ... 2185 muleq_s.w.phr t4, t4, s2 // ... tmp2 ... 2186 lw s2, 80(a1) // quantptr[DCTSIZE*5] 2187 lw t8, 4(AT) // FIX(1.414213562) 2188 ins t2, v0, 16, 16 // ... tmp1 2189 muleq_s.w.phl v0, t6, s3 // tmp3 ... 2190 muleq_s.w.phr t6, t6, s3 // ... tmp3 ... 2191 ins t4, v1, 16, 16 // ... tmp2 2192 addq.ph s4, t0, t4 // tmp10 2193 subq.ph s5, t0, t4 // tmp11 2194 ins t6, v0, 16, 16 // ... tmp3 2195 subq.ph s6, t2, t6 // tmp12 ... 2196 addq.ph s7, t2, t6 // tmp13 2197 mulq_s.ph s6, s6, t8 // ... tmp12 ... 2198 addq.ph t0, s4, s7 // tmp0 2199 subq.ph t6, s4, s7 // tmp3 2200 muleq_s.w.phl v0, t1, s0 // tmp4 ... 2201 muleq_s.w.phr t1, t1, s0 // ... tmp4 ... 2202 shll_s.ph s6, s6, 1 // x2 2203 lw s3, 112(a1) // quantptr[DCTSIZE*7] 2204 subq.ph s6, s6, s7 // ... tmp12 2205 muleq_s.w.phl v1, t7, s3 // tmp7 ... 2206 muleq_s.w.phr t7, t7, s3 // ... tmp7 ... 2207 ins t1, v0, 16, 16 // ... tmp4 2208 addq.ph t2, s5, s6 // tmp1 2209 subq.ph t4, s5, s6 // tmp2 2210 muleq_s.w.phl v0, t5, s2 // tmp6 ... 2211 muleq_s.w.phr t5, t5, s2 // ... tmp6 ... 2212 ins t7, v1, 16, 16 // ... tmp7 2213 addq.ph s5, t1, t7 // z11 2214 subq.ph s6, t1, t7 // z12 2215 muleq_s.w.phl v1, t3, s1 // tmp5 ... 2216 muleq_s.w.phr t3, t3, s1 // ... tmp5 ... 2217 ins t5, v0, 16, 16 // ... tmp6 2218 ins t3, v1, 16, 16 // ... tmp5 2219 addq.ph s7, t5, t3 // z13 2220 subq.ph v0, t5, t3 // z10 2221 addq.ph t7, s5, s7 // tmp7 2222 subq.ph s5, s5, s7 // tmp11 ... 2223 addq.ph v1, v0, s6 // z5 ... 2224 mulq_s.ph s5, s5, t8 // ... tmp11 2225 lw t8, 8(AT) // FIX(1.847759065) 2226 lw s4, 0(AT) // FIX(1.082392200) 2227 addq.ph s0, t0, t7 2228 subq.ph s1, t0, t7 2229 mulq_s.ph v1, v1, t8 // ... z5 2230 shll_s.ph s5, s5, 1 // x2 2231 lw t8, 12(AT) // FIX(-2.613125930) 2232 sw s0, 0(a2) // wsptr[DCTSIZE*0] 2233 shll_s.ph v0, v0, 1 // x4 2234 mulq_s.ph v0, v0, t8 // tmp12 ... 2235 mulq_s.ph s4, s6, s4 // tmp10 ... 2236 shll_s.ph v1, v1, 1 // x2 2237 addiu a0, a0, 4 2238 addiu a1, a1, 4 2239 sw s1, 112(a2) // wsptr[DCTSIZE*7] 2240 shll_s.ph s6, v0, 1 // x4 2241 shll_s.ph s4, s4, 1 // x2 2242 addq.ph s6, s6, v1 // ... tmp12 2243 subq.ph t5, s6, t7 // tmp6 2244 subq.ph s4, s4, v1 // ... 
tmp10 2245 subq.ph t3, s5, t5 // tmp5 2246 addq.ph s2, t2, t5 2247 addq.ph t1, s4, t3 // tmp4 2248 subq.ph s3, t2, t5 2249 sw s2, 16(a2) // wsptr[DCTSIZE*1] 2250 sw s3, 96(a2) // wsptr[DCTSIZE*6] 2251 addq.ph v0, t4, t3 2252 subq.ph v1, t4, t3 2253 sw v0, 32(a2) // wsptr[DCTSIZE*2] 2254 sw v1, 80(a2) // wsptr[DCTSIZE*5] 2255 addq.ph v0, t6, t1 2256 subq.ph v1, t6, t1 2257 sw v0, 64(a2) // wsptr[DCTSIZE*4] 2258 sw v1, 48(a2) // wsptr[DCTSIZE*3] 2259 2260 2: 2261 bne a0, t9, 0b 2262 addiu a2, a2, 4 2263 2264 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2265 2266 j ra 2267 nop 2268 2269 END(jsimd_idct_ifast_cols_mips_dspr2) 2270 2271 /*****************************************************************************/ 2272 LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2) 2273 /* 2274 * a0 - wsptr 2275 * a1 - output_buf 2276 * a2 - output_col 2277 * a3 - mips_idct_ifast_coefs 2278 */ 2279 2280 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 2281 2282 addiu t9, a0, 128 // end address 2283 lui s8, 0x8080 2284 ori s8, s8, 0x8080 2285 2286 0: 2287 lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs) 2288 lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a 2289 lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A 2290 lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c 2291 lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C 2292 lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e 2293 lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E 2294 lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g 2295 lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G 2296 precrq.ph.w t1, s0, t0 // B b 2297 ins t0, s0, 16, 16 // A a 2298 bnez t1, 1f 2299 or s0, t2, s2 2300 bnez s0, 1f 2301 or s0, t4, s4 2302 bnez s0, 1f 2303 or s0, t6, s6 2304 bnez s0, 1f 2305 shll_s.ph s0, t0, 2 // A a 2306 lw a3, 0(a1) 2307 lw AT, 4(a1) 2308 precrq.ph.w t0, s0, s0 // A A 2309 ins s0, s0, 16, 16 // a a 2310 addu a3, a3, a2 2311 addu AT, AT, a2 2312 precrq.qb.ph t0, t0, t0 // A A A A 2313 precrq.qb.ph s0, s0, s0 // a a a a 2314 addu.qb s0, s0, s8 2315 addu.qb t0, t0, s8 2316 sw s0, 0(a3) 2317 sw s0, 4(a3) 2318 sw t0, 0(AT) 2319 sw t0, 4(AT) 2320 addiu a0, a0, 32 2321 bne a0, t9, 0b 2322 addiu a1, a1, 8 2323 b 2f 2324 nop 2325 2326 1: 2327 precrq.ph.w t3, s2, t2 2328 ins t2, s2, 16, 16 2329 precrq.ph.w t5, s4, t4 2330 ins t4, s4, 16, 16 2331 precrq.ph.w t7, s6, t6 2332 ins t6, s6, 16, 16 2333 lw t8, 4(AT) // FIX(1.414213562) 2334 addq.ph s4, t0, t4 // tmp10 2335 subq.ph s5, t0, t4 // tmp11 2336 subq.ph s6, t2, t6 // tmp12 ... 2337 addq.ph s7, t2, t6 // tmp13 2338 mulq_s.ph s6, s6, t8 // ... tmp12 ... 2339 addq.ph t0, s4, s7 // tmp0 2340 subq.ph t6, s4, s7 // tmp3 2341 shll_s.ph s6, s6, 1 // x2 2342 subq.ph s6, s6, s7 // ... tmp12 2343 addq.ph t2, s5, s6 // tmp1 2344 subq.ph t4, s5, s6 // tmp2 2345 addq.ph s5, t1, t7 // z11 2346 subq.ph s6, t1, t7 // z12 2347 addq.ph s7, t5, t3 // z13 2348 subq.ph v0, t5, t3 // z10 2349 addq.ph t7, s5, s7 // tmp7 2350 subq.ph s5, s5, s7 // tmp11 ... 2351 addq.ph v1, v0, s6 // z5 ... 2352 mulq_s.ph s5, s5, t8 // ... tmp11 2353 lw t8, 8(AT) // FIX(1.847759065) 2354 lw s4, 0(AT) // FIX(1.082392200) 2355 addq.ph s0, t0, t7 // tmp0 + tmp7 2356 subq.ph s7, t0, t7 // tmp0 - tmp7 2357 mulq_s.ph v1, v1, t8 // ... z5 2358 lw a3, 0(a1) 2359 lw t8, 12(AT) // FIX(-2.613125930) 2360 shll_s.ph s5, s5, 1 // x2 2361 addu a3, a3, a2 2362 shll_s.ph v0, v0, 1 // x4 2363 mulq_s.ph v0, v0, t8 // tmp12 ... 2364 mulq_s.ph s4, s6, s4 // tmp10 ... 
2365 shll_s.ph v1, v1, 1 // x2 2366 addiu a0, a0, 32 2367 addiu a1, a1, 8 2368 shll_s.ph s6, v0, 1 // x4 2369 shll_s.ph s4, s4, 1 // x2 2370 addq.ph s6, s6, v1 // ... tmp12 2371 shll_s.ph s0, s0, 2 2372 subq.ph t5, s6, t7 // tmp6 2373 subq.ph s4, s4, v1 // ... tmp10 2374 subq.ph t3, s5, t5 // tmp5 2375 shll_s.ph s7, s7, 2 2376 addq.ph t1, s4, t3 // tmp4 2377 addq.ph s1, t2, t5 // tmp1 + tmp6 2378 subq.ph s6, t2, t5 // tmp1 - tmp6 2379 addq.ph s2, t4, t3 // tmp2 + tmp5 2380 subq.ph s5, t4, t3 // tmp2 - tmp5 2381 addq.ph s4, t6, t1 // tmp3 + tmp4 2382 subq.ph s3, t6, t1 // tmp3 - tmp4 2383 shll_s.ph s1, s1, 2 2384 shll_s.ph s2, s2, 2 2385 shll_s.ph s3, s3, 2 2386 shll_s.ph s4, s4, 2 2387 shll_s.ph s5, s5, 2 2388 shll_s.ph s6, s6, 2 2389 precrq.ph.w t0, s1, s0 // B A 2390 ins s0, s1, 16, 16 // b a 2391 precrq.ph.w t2, s3, s2 // D C 2392 ins s2, s3, 16, 16 // d c 2393 precrq.ph.w t4, s5, s4 // F E 2394 ins s4, s5, 16, 16 // f e 2395 precrq.ph.w t6, s7, s6 // H G 2396 ins s6, s7, 16, 16 // h g 2397 precrq.qb.ph t0, t2, t0 // D C B A 2398 precrq.qb.ph s0, s2, s0 // d c b a 2399 precrq.qb.ph t4, t6, t4 // H G F E 2400 precrq.qb.ph s4, s6, s4 // h g f e 2401 addu.qb s0, s0, s8 2402 addu.qb s4, s4, s8 2403 sw s0, 0(a3) // outptr[0/1/2/3] d c b a 2404 sw s4, 4(a3) // outptr[4/5/6/7] h g f e 2405 lw a3, -4(a1) 2406 addu.qb t0, t0, s8 2407 addu a3, a3, a2 2408 addu.qb t4, t4, s8 2409 sw t0, 0(a3) // outptr[0/1/2/3] D C B A 2410 bne a0, t9, 0b 2411 sw t4, 4(a3) // outptr[4/5/6/7] H G F E 2412 2413 2: 2414 2415 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 2416 2417 j ra 2418 nop 2419 2420 END(jsimd_idct_ifast_rows_mips_dspr2) 2421 2422 /*****************************************************************************/ 2423 LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2) 2424 /* 2425 * a0 - data 2426 */ 2427 2428 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 2429 2430 lui t0, 6437 2431 ori t0, 2260 2432 lui t1, 9633 2433 ori t1, 11363 2434 lui t2, 0xd39e 2435 ori t2, 0xe6dc 2436 lui t3, 0xf72d 2437 ori t3, 9633 2438 lui t4, 2261 2439 ori t4, 9633 2440 lui t5, 0xd39e 2441 ori t5, 6437 2442 lui t6, 9633 2443 ori t6, 0xd39d 2444 lui t7, 0xe6dc 2445 ori t7, 2260 2446 lui t8, 4433 2447 ori t8, 10703 2448 lui t9, 0xd630 2449 ori t9, 4433 2450 li s8, 8 2451 move a1, a0 2452 1: 2453 lw s0, 0(a1) // tmp0 = 1|0 2454 lw s1, 4(a1) // tmp1 = 3|2 2455 lw s2, 8(a1) // tmp2 = 5|4 2456 lw s3, 12(a1) // tmp3 = 7|6 2457 packrl.ph s1, s1, s1 // tmp1 = 2|3 2458 packrl.ph s3, s3, s3 // tmp3 = 6|7 2459 subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4 2460 subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7 2461 mult $0, $0 // ac0 = 0 2462 dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260 2463 dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363 2464 mult $ac1, $0, $0 // ac1 = 0 2465 dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436 2466 dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633 2467 mult $ac2, $0, $0 // ac2 = 0 2468 dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633 2469 dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437 2470 mult $ac3, $0, $0 // ac3 = 0 2471 dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363 2472 dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260 2473 addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3 2474 addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0 2475 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 2476 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 2477 extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11 2478 extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11 2479 
addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10 2480 subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13 2481 sh s0, 2(a1) 2482 sh s1, 6(a1) 2483 sh s2, 10(a1) 2484 sh s3, 14(a1) 2485 mult $0, $0 // ac0 = 0 2486 dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703 2487 mult $ac1, $0, $0 // ac1 = 0 2488 dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433 2489 sra s4, s5, 16 // tmp4 = t11 2490 addiu a1, a1, 16 2491 addiu s8, s8, -1 2492 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 2493 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 2494 addu s2, s5, s4 // tmp2 = t10 + t11 2495 subu s3, s5, s4 // tmp3 = t10 - t11 2496 sll s2, s2, 2 // tmp2 = (t10 + t11) << 2 2497 sll s3, s3, 2 // tmp3 = (t10 - t11) << 2 2498 sh s2, -16(a1) 2499 sh s3, -8(a1) 2500 sh s0, -12(a1) 2501 bgtz s8, 1b 2502 sh s1, -4(a1) 2503 li t0, 2260 2504 li t1, 11363 2505 li t2, 9633 2506 li t3, 6436 2507 li t4, 6437 2508 li t5, 2261 2509 li t6, 11362 2510 li t7, 2259 2511 li t8, 4433 2512 li t9, 10703 2513 li a1, 10704 2514 li s8, 8 2515 2516 2: 2517 lh a2, 0(a0) // 0 2518 lh a3, 16(a0) // 8 2519 lh v0, 32(a0) // 16 2520 lh v1, 48(a0) // 24 2521 lh s4, 64(a0) // 32 2522 lh s5, 80(a0) // 40 2523 lh s6, 96(a0) // 48 2524 lh s7, 112(a0) // 56 2525 addu s2, v0, s5 // tmp2 = 16 + 40 2526 subu s5, v0, s5 // tmp5 = 16 - 40 2527 addu s3, v1, s4 // tmp3 = 24 + 32 2528 subu s4, v1, s4 // tmp4 = 24 - 32 2529 addu s0, a2, s7 // tmp0 = 0 + 56 2530 subu s7, a2, s7 // tmp7 = 0 - 56 2531 addu s1, a3, s6 // tmp1 = 8 + 48 2532 subu s6, a3, s6 // tmp6 = 8 - 48 2533 addu a2, s0, s3 // tmp10 = tmp0 + tmp3 2534 subu v1, s0, s3 // tmp13 = tmp0 - tmp3 2535 addu a3, s1, s2 // tmp11 = tmp1 + tmp2 2536 subu v0, s1, s2 // tmp12 = tmp1 - tmp2 2537 mult s7, t1 // ac0 = tmp7 * c1 2538 madd s4, t0 // ac0 += tmp4 * c0 2539 madd s5, t4 // ac0 += tmp5 * c4 2540 madd s6, t2 // ac0 += tmp6 * c2 2541 mult $ac1, s7, t2 // ac1 = tmp7 * c2 2542 msub $ac1, s4, t3 // ac1 -= tmp4 * c3 2543 msub $ac1, s5, t6 // ac1 -= tmp5 * c6 2544 msub $ac1, s6, t7 // ac1 -= tmp6 * c7 2545 mult $ac2, s7, t4 // ac2 = tmp7 * c4 2546 madd $ac2, s4, t2 // ac2 += tmp4 * c2 2547 madd $ac2, s5, t5 // ac2 += tmp5 * c5 2548 msub $ac2, s6, t6 // ac2 -= tmp6 * c6 2549 mult $ac3, s7, t0 // ac3 = tmp7 * c0 2550 msub $ac3, s4, t1 // ac3 -= tmp4 * c1 2551 madd $ac3, s5, t2 // ac3 += tmp5 * c2 2552 msub $ac3, s6, t3 // ac3 -= tmp6 * c3 2553 extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15 2554 extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15 2555 extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15 2556 extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15 2557 addiu s8, s8, -1 2558 addu s4, a2, a3 // tmp4 = tmp10 + tmp11 2559 subu s5, a2, a3 // tmp5 = tmp10 - tmp11 2560 sh s0, 16(a0) 2561 sh s1, 48(a0) 2562 sh s2, 80(a0) 2563 sh s3, 112(a0) 2564 mult v0, t8 // ac0 = tmp12 * c8 2565 madd v1, t9 // ac0 += tmp13 * c9 2566 mult $ac1, v1, t8 // ac1 = tmp13 * c8 2567 msub $ac1, v0, a1 // ac1 -= tmp12 * c10 2568 addiu a0, a0, 2 2569 extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15 2570 extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15 2571 shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2 2572 shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2 2573 sh s4, -2(a0) 2574 sh s5, 62(a0) 2575 sh s6, 30(a0) 2576 bgtz s8, 2b 2577 sh s7, 94(a0) 2578 2579 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 2580 2581 jr ra 2582 nop 2583 2584 END(jsimd_fdct_islow_mips_dspr2) 2585 2586 /*****************************************************************************/ 2587 
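/*
 * Note on the fast (ifast) FDCT below: its constants are 8-bit fixed-point
 * approximations (181 ~= 0.707106781 * 256, 98 ~= 0.382683433 * 256,
 * 139 ~= 0.541196100 * 256, 334 ~= 1.306562965 * 256), and each product is
 * descaled with "extr.w rd, $acX, 8".  A minimal C sketch of that idiom,
 * shown only for orientation (the helper name is hypothetical and not part
 * of this file):
 *
 *   static inline int multiply_q8(int x, int c)
 *   {
 *     // c is round(k * 256); the ">> 8" matches the "extr.w ..., 8" descale
 *     return (x * c) >> 8;
 *   }
 *
 * For example, z3 = multiply_q8(tmp11, 181) corresponds to the
 * MULTIPLY(tmp11, FIX_0_707106781) step in the scalar reference code.
 */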
LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2) 2588 /* 2589 * a0 - data 2590 */ 2591 .set at 2592 SAVE_REGS_ON_STACK 8, s0, s1 2593 li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff) 2594 li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff) 2595 li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff) 2596 li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff) 2597 2598 move v0, a0 2599 addiu v1, v0, 128 // end address 2600 2601 0: 2602 lw t0, 0(v0) // tmp0 = 1|0 2603 lw t1, 4(v0) // tmp1 = 3|2 2604 lw t2, 8(v0) // tmp2 = 5|4 2605 lw t3, 12(v0) // tmp3 = 7|6 2606 packrl.ph t1, t1, t1 // tmp1 = 2|3 2607 packrl.ph t3, t3, t3 // tmp3 = 6|7 2608 subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4 2609 subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7 2610 addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3 2611 addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0 2612 addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10 2613 subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13 2614 sra t4, t8, 16 // tmp4 = t11 2615 mult $0, $0 // ac0 = 0 2616 dpa.w.ph $ac0, t9, s1 2617 mult $ac1, $0, $0 // ac1 = 0 2618 dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98 2619 dpsx.w.ph $ac1, t5, a3 // ac1 += t6*98 + t7*98 2620 mult $ac2, $0, $0 // ac2 = 0 2621 dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139 2622 mult $ac3, $0, $0 // ac3 = 0 2623 dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334 2624 precrq.ph.w t0, t5, t7 // t0 = t5|t6 2625 addq.ph t2, t8, t4 // tmp2 = t10 + t11 2626 subq.ph t3, t8, t4 // tmp3 = t10 - t11 2627 extr.w t4, $ac0, 8 2628 mult $0, $0 // ac0 = 0 2629 dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181 2630 extr.w t0, $ac1, 8 // t0 = z5 2631 extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139) 2632 extr.w t7, $ac3, 8 // t2 = MULTIPLY(tmp12, 334) 2633 extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181) 2634 add t6, t1, t0 // t6 = z2 2635 add t7, t7, t0 // t7 = z4 2636 subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3 2637 addq.ph t8, t5, t8 // t9 = z11 = tmp7 + z3 2638 addq.ph t1, t0, t6 // t1 = z13 + z2 2639 subq.ph t6, t0, t6 // t6 = z13 - z2 2640 addq.ph t0, t8, t7 // t0 = z11 + z4 2641 subq.ph t7, t8, t7 // t7 = z11 - z4 2642 addq.ph t5, t4, t9 2643 subq.ph t4, t9, t4 2644 sh t2, 0(v0) 2645 sh t5, 4(v0) 2646 sh t3, 8(v0) 2647 sh t4, 12(v0) 2648 sh t1, 10(v0) 2649 sh t6, 6(v0) 2650 sh t0, 2(v0) 2651 sh t7, 14(v0) 2652 addiu v0, 16 2653 bne v1, v0, 0b 2654 nop 2655 move v0, a0 2656 addiu v1, v0, 16 2657 2658 1: 2659 lh t0, 0(v0) // 0 2660 lh t1, 16(v0) // 8 2661 lh t2, 32(v0) // 16 2662 lh t3, 48(v0) // 24 2663 lh t4, 64(v0) // 32 2664 lh t5, 80(v0) // 40 2665 lh t6, 96(v0) // 48 2666 lh t7, 112(v0) // 56 2667 add t8, t0, t7 // t8 = tmp0 2668 sub t7, t0, t7 // t7 = tmp7 2669 add t0, t1, t6 // t0 = tmp1 2670 sub t1, t1, t6 // t1 = tmp6 2671 add t6, t2, t5 // t6 = tmp2 2672 sub t5, t2, t5 // t5 = tmp5 2673 add t2, t3, t4 // t2 = tmp3 2674 sub t3, t3, t4 // t3 = tmp4 2675 add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3 2676 sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3 2677 sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2 2678 ins t8, s0, 16, 16 // t8 = tmp12|tmp13 2679 add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2 2680 mult $0, $0 // ac0 = 0 2681 dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181 2682 add s0, t4, t2 // t8 = tmp10+tmp11 2683 sub t4, t4, t2 // t4 = tmp10-tmp11 2684 sh s0, 0(v0) 2685 sh t4, 64(v0) 2686 extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781) 2687 addq.ph t4, t8, t2 // t9 = tmp13 + z1 2688 subq.ph t8, t8, t2 // t2 = tmp13 - z1 2689 sh t4, 32(v0) 2690 sh t8, 96(v0) 2691 add t3, t3, 
t5          // t3 = tmp10 = tmp4 + tmp5
    add     t0, t5, t1          // t0 = tmp11 = tmp5 + tmp6
    add     t1, t1, t7          // t1 = tmp12 = tmp6 + tmp7
    andi    t4, a1, 0xffff
    mul     s0, t1, t4
    sra     s0, s0, 8           // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
    ins     t1, t3, 16, 16      // t1 = tmp10|tmp12
    mult    $0, $0              // ac0 = 0
    mulsa.w.ph $ac0, t1, a3     // ac0 += t10*98 - t12*98
    extr.w  t8, $ac0, 8         // z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433)
    add     t2, t7, t8          // t2 = tmp7 + z5
    sub     t7, t7, t8          // t7 = tmp7 - z5
    andi    t4, a2, 0xffff
    mul     t8, t3, t4
    sra     t8, t8, 8           // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
    andi    t4, s1, 0xffff
    mul     t6, t0, t4
    sra     t6, t6, 8           // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
    add     t0, t6, t8          // t0 = z3 + z2
    sub     t1, t6, t8          // t1 = z3 - z2
    add     t3, t6, s0          // t3 = z3 + z4
    sub     t4, t6, s0          // t4 = z3 - z4
    sub     t5, t2, t1          // t5 = dataptr[5]
    sub     t6, t7, t0          // t6 = dataptr[3]
    add     t3, t2, t3          // t3 = dataptr[1]
    add     t4, t7, t4          // t4 = dataptr[7]
    sh      t5, 80(v0)
    sh      t6, 48(v0)
    sh      t3, 16(v0)
    sh      t4, 112(v0)
    addiu   v0, 2
    bne     v0, v1, 1b
    nop

    RESTORE_REGS_FROM_STACK 8, s0, s1

    j       ra
    nop
END(jsimd_fdct_ifast_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
/*
 * a0 - coef_block
 * a1 - divisors
 * a2 - workspace
 */

    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2

    addiu   v0, a2, 124         // v0 = workspace_end
    lh      t0, 0(a2)
    lh      t1, 0(a1)
    lh      t2, 128(a1)
    sra     t3, t0, 15
    sll     t3, t3, 1
    addiu   t3, t3, 1
    mul     t0, t0, t3
    lh      t4, 384(a1)
    lh      t5, 130(a1)
    lh      t6, 2(a2)
    lh      t7, 2(a1)
    lh      t8, 386(a1)

1:
    andi    t1, 0xffff
    add     t9, t0, t2
    andi    t9, 0xffff
    mul     v1, t9, t1
    sra     s0, t6, 15
    sll     s0, s0, 1
    addiu   s0, s0, 1
    addiu   t9, t4, 16
    srav    v1, v1, t9
    mul     v1, v1, t3
    mul     t6, t6, s0
    andi    t7, 0xffff
    addiu   a2, a2, 4
    addiu   a1, a1, 4
    add     s1, t6, t5
    andi    s1, 0xffff
    sh      v1, 0(a0)

    mul     s2, s1, t7
    addiu   s1, t8, 16
    srav    s2, s2, s1
    mul     s2, s2, s0
    lh      t0, 0(a2)
    lh      t1, 0(a1)
    sra     t3, t0, 15
    sll     t3, t3, 1
    addiu   t3, t3, 1
    mul     t0, t0, t3
    lh      t2, 128(a1)
    lh      t4, 384(a1)
    lh      t5, 130(a1)
    lh      t8, 386(a1)
    lh      t6, 2(a2)
    lh      t7, 2(a1)
    sh      s2, 2(a0)
    lh      t0, 0(a2)
    sra     t3, t0, 15
    sll     t3, t3, 1
    addiu   t3, t3, 1
    mul     t0, t0, t3
    bne     a2, v0, 1b
    addiu   a0, a0, 4

    andi    t1, 0xffff
    add     t9, t0, t2
    andi    t9, 0xffff
    mul     v1, t9, t1
    sra     s0, t6, 15
    sll     s0, s0, 1
    addiu   s0, s0, 1
    addiu   t9, t4, 16
    srav    v1, v1, t9
    mul     v1, v1, t3
    mul     t6, t6, s0
    andi    t7, 0xffff
    sh      v1, 0(a0)
    add     s1, t6, t5
    andi    s1, 0xffff
    mul     s2, s1, t7
    addiu   s1, t8, 16
    addiu   a2, a2, 4
    addiu   a1, a1, 4
    srav    s2, s2, s1
    mul     s2, s2, s0
    sh      s2, 2(a0)

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2

    j       ra
    nop

END(jsimd_quantize_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
/*
 * a0 - coef_block
 * a1 - divisors
 * a2 - workspace
 */

    .set at

    li      t1, 0x46800100      // integer representation of 16384.5
    mtc1    t1, f0
    li
t0, 63 2844 0: 2845 lwc1 f2, 0(a2) 2846 lwc1 f10, 0(a1) 2847 lwc1 f4, 4(a2) 2848 lwc1 f12, 4(a1) 2849 lwc1 f6, 8(a2) 2850 lwc1 f14, 8(a1) 2851 lwc1 f8, 12(a2) 2852 lwc1 f16, 12(a1) 2853 madd.s f2, f0, f2, f10 2854 madd.s f4, f0, f4, f12 2855 madd.s f6, f0, f6, f14 2856 madd.s f8, f0, f8, f16 2857 lwc1 f10, 16(a1) 2858 lwc1 f12, 20(a1) 2859 trunc.w.s f2, f2 2860 trunc.w.s f4, f4 2861 trunc.w.s f6, f6 2862 trunc.w.s f8, f8 2863 lwc1 f14, 24(a1) 2864 lwc1 f16, 28(a1) 2865 mfc1 t1, f2 2866 mfc1 t2, f4 2867 mfc1 t3, f6 2868 mfc1 t4, f8 2869 lwc1 f2, 16(a2) 2870 lwc1 f4, 20(a2) 2871 lwc1 f6, 24(a2) 2872 lwc1 f8, 28(a2) 2873 madd.s f2, f0, f2, f10 2874 madd.s f4, f0, f4, f12 2875 madd.s f6, f0, f6, f14 2876 madd.s f8, f0, f8, f16 2877 addiu t1, t1, -16384 2878 addiu t2, t2, -16384 2879 addiu t3, t3, -16384 2880 addiu t4, t4, -16384 2881 trunc.w.s f2, f2 2882 trunc.w.s f4, f4 2883 trunc.w.s f6, f6 2884 trunc.w.s f8, f8 2885 sh t1, 0(a0) 2886 sh t2, 2(a0) 2887 sh t3, 4(a0) 2888 sh t4, 6(a0) 2889 mfc1 t1, f2 2890 mfc1 t2, f4 2891 mfc1 t3, f6 2892 mfc1 t4, f8 2893 addiu t0, t0, -8 2894 addiu a2, a2, 32 2895 addiu a1, a1, 32 2896 addiu t1, t1, -16384 2897 addiu t2, t2, -16384 2898 addiu t3, t3, -16384 2899 addiu t4, t4, -16384 2900 sh t1, 8(a0) 2901 sh t2, 10(a0) 2902 sh t3, 12(a0) 2903 sh t4, 14(a0) 2904 bgez t0, 0b 2905 addiu a0, a0, 16 2906 2907 j ra 2908 nop 2909 2910 END(jsimd_quantize_float_mips_dspr2) 2911 /*****************************************************************************/ 2912 LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2) 2913 /* 2914 * a0 - compptr->dct_table 2915 * a1 - coef_block 2916 * a2 - output_buf 2917 * a3 - output_col 2918 */ 2919 .set at 2920 2921 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 2922 2923 addiu sp, sp, -40 2924 move v0, sp 2925 addiu s2, zero, 29692 2926 addiu s3, zero, -10426 2927 addiu s4, zero, 6967 2928 addiu s5, zero, -5906 2929 lh t0, 0(a1) // t0 = inptr[DCTSIZE*0] 2930 lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0] 2931 lh t1, 48(a1) // t1 = inptr[DCTSIZE*3] 2932 lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3] 2933 mul t4, t5, t0 2934 lh t0, 16(a1) // t0 = inptr[DCTSIZE*1] 2935 lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1] 2936 mul t6, t6, t1 2937 mul t5, t5, t0 2938 lh t2, 80(a1) // t2 = inptr[DCTSIZE*5] 2939 lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5] 2940 lh t3, 112(a1) // t3 = inptr[DCTSIZE*7] 2941 lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7] 2942 mul t7, t7, t2 2943 mult zero, zero 2944 mul t8, t8, t3 2945 li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff) 2946 li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff) 2947 ins t6, t5, 16, 16 // t6 = t5|t6 2948 sll t4, t4, 15 2949 dpa.w.ph $ac0, t6, s0 2950 lh t1, 2(a1) 2951 lh t6, 2(a0) 2952 ins t8, t7, 16, 16 // t8 = t7|t8 2953 dpa.w.ph $ac0, t8, s1 2954 mflo t0, $ac0 2955 mul t5, t6, t1 2956 lh t1, 18(a1) 2957 lh t6, 18(a0) 2958 lh t2, 50(a1) 2959 lh t7, 50(a0) 2960 mul t6, t6, t1 2961 subu t8, t4, t0 2962 mul t7, t7, t2 2963 addu t0, t4, t0 2964 shra_r.w t0, t0, 13 2965 lh t1, 82(a1) 2966 lh t2, 82(a0) 2967 lh t3, 114(a1) 2968 lh t4, 114(a0) 2969 shra_r.w t8, t8, 13 2970 mul t1, t1, t2 2971 mul t3, t3, t4 2972 sw t0, 0(v0) 2973 sw t8, 20(v0) 2974 sll t4, t5, 15 2975 ins t7, t6, 16, 16 2976 mult zero, zero 2977 dpa.w.ph $ac0, t7, s0 2978 ins t3, t1, 16, 16 2979 lh t1, 6(a1) 2980 lh t6, 6(a0) 2981 dpa.w.ph $ac0, t3, s1 2982 mflo t0, $ac0 2983 mul t5, t6, t1 2984 lh t1, 22(a1) 2985 lh t6, 22(a0) 2986 lh t2, 54(a1) 2987 lh t7, 54(a0) 2988 mul t6, t6, t1 2989 subu t8, t4, t0 2990 mul t7, t7, t2 2991 addu t0, 
t4, t0 2992 shra_r.w t0, t0, 13 2993 lh t1, 86(a1) 2994 lh t2, 86(a0) 2995 lh t3, 118(a1) 2996 lh t4, 118(a0) 2997 shra_r.w t8, t8, 13 2998 mul t1, t1, t2 2999 mul t3, t3, t4 3000 sw t0, 4(v0) 3001 sw t8, 24(v0) 3002 sll t4, t5, 15 3003 ins t7, t6, 16, 16 3004 mult zero, zero 3005 dpa.w.ph $ac0, t7, s0 3006 ins t3, t1, 16, 16 3007 lh t1, 10(a1) 3008 lh t6, 10(a0) 3009 dpa.w.ph $ac0, t3, s1 3010 mflo t0, $ac0 3011 mul t5, t6, t1 3012 lh t1, 26(a1) 3013 lh t6, 26(a0) 3014 lh t2, 58(a1) 3015 lh t7, 58(a0) 3016 mul t6, t6, t1 3017 subu t8, t4, t0 3018 mul t7, t7, t2 3019 addu t0, t4, t0 3020 shra_r.w t0, t0, 13 3021 lh t1, 90(a1) 3022 lh t2, 90(a0) 3023 lh t3, 122(a1) 3024 lh t4, 122(a0) 3025 shra_r.w t8, t8, 13 3026 mul t1, t1, t2 3027 mul t3, t3, t4 3028 sw t0, 8(v0) 3029 sw t8, 28(v0) 3030 sll t4, t5, 15 3031 ins t7, t6, 16, 16 3032 mult zero, zero 3033 dpa.w.ph $ac0, t7, s0 3034 ins t3, t1, 16, 16 3035 lh t1, 14(a1) 3036 lh t6, 14(a0) 3037 dpa.w.ph $ac0, t3, s1 3038 mflo t0, $ac0 3039 mul t5, t6, t1 3040 lh t1, 30(a1) 3041 lh t6, 30(a0) 3042 lh t2, 62(a1) 3043 lh t7, 62(a0) 3044 mul t6, t6, t1 3045 subu t8, t4, t0 3046 mul t7, t7, t2 3047 addu t0, t4, t0 3048 shra_r.w t0, t0, 13 3049 lh t1, 94(a1) 3050 lh t2, 94(a0) 3051 lh t3, 126(a1) 3052 lh t4, 126(a0) 3053 shra_r.w t8, t8, 13 3054 mul t1, t1, t2 3055 mul t3, t3, t4 3056 sw t0, 12(v0) 3057 sw t8, 32(v0) 3058 sll t4, t5, 15 3059 ins t7, t6, 16, 16 3060 mult zero, zero 3061 dpa.w.ph $ac0, t7, s0 3062 ins t3, t1, 16, 16 3063 dpa.w.ph $ac0, t3, s1 3064 mflo t0, $ac0 3065 lw t9, 0(a2) 3066 lw t3, 0(v0) 3067 lw t7, 4(v0) 3068 lw t1, 8(v0) 3069 addu t9, t9, a3 3070 sll t3, t3, 15 3071 subu t8, t4, t0 3072 addu t0, t4, t0 3073 shra_r.w t0, t0, 13 3074 shra_r.w t8, t8, 13 3075 sw t0, 16(v0) 3076 sw t8, 36(v0) 3077 lw t5, 12(v0) 3078 lw t6, 16(v0) 3079 mult t7, s2 3080 madd t1, s3 3081 madd t5, s4 3082 madd t6, s5 3083 lw t5, 24(v0) 3084 lw t7, 28(v0) 3085 mflo t0, $ac0 3086 lw t8, 32(v0) 3087 lw t2, 36(v0) 3088 mult $ac1, t5, s2 3089 madd $ac1, t7, s3 3090 madd $ac1, t8, s4 3091 madd $ac1, t2, s5 3092 addu t1, t3, t0 3093 subu t6, t3, t0 3094 shra_r.w t1, t1, 20 3095 shra_r.w t6, t6, 20 3096 mflo t4, $ac1 3097 shll_s.w t1, t1, 24 3098 shll_s.w t6, t6, 24 3099 sra t1, t1, 24 3100 sra t6, t6, 24 3101 addiu t1, t1, 128 3102 addiu t6, t6, 128 3103 lw t0, 20(v0) 3104 sb t1, 0(t9) 3105 sb t6, 1(t9) 3106 sll t0, t0, 15 3107 lw t9, 4(a2) 3108 addu t1, t0, t4 3109 subu t6, t0, t4 3110 addu t9, t9, a3 3111 shra_r.w t1, t1, 20 3112 shra_r.w t6, t6, 20 3113 shll_s.w t1, t1, 24 3114 shll_s.w t6, t6, 24 3115 sra t1, t1, 24 3116 sra t6, t6, 24 3117 addiu t1, t1, 128 3118 addiu t6, t6, 128 3119 sb t1, 0(t9) 3120 sb t6, 1(t9) 3121 addiu sp, sp, 40 3122 3123 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 3124 3125 j ra 3126 nop 3127 3128 END(jsimd_idct_2x2_mips_dspr2) 3129 3130 /*****************************************************************************/ 3131 LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2) 3132 /* 3133 * a0 - compptr->dct_table 3134 * a1 - coef_block 3135 * a2 - output_buf 3136 * a3 - output_col 3137 * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes 3138 */ 3139 3140 .set at 3141 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 3142 3143 lw v1, 48(sp) 3144 move t0, a1 3145 move t1, v1 3146 li t9, 4 3147 li s0, 0x2e75f93e 3148 li s1, 0x21f9ba79 3149 li s2, 0xecc2efb0 3150 li s3, 0x52031ccd 3151 3152 0: 3153 lh s6, 32(t0) // inptr[DCTSIZE*2] 3154 lh t6, 32(a0) // quantptr[DCTSIZE*2] 3155 lh s7, 96(t0) // inptr[DCTSIZE*6] 3156 lh 
t7, 96(a0) // quantptr[DCTSIZE*6] 3157 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) 3158 lh s4, 0(t0) // inptr[DCTSIZE*0] 3159 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) 3160 lh s5, 0(a0) // quantptr[0] 3161 li s6, 15137 3162 li s7, 6270 3163 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) 3164 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) 3165 lh t5, 112(t0) // inptr[DCTSIZE*7] 3166 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) 3167 lh s4, 112(a0) // quantptr[DCTSIZE*7] 3168 lh v0, 80(t0) // inptr[DCTSIZE*5] 3169 lh s5, 80(a0) // quantptr[DCTSIZE*5] 3170 lh s6, 48(a0) // quantptr[DCTSIZE*3] 3171 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) 3172 lh s7, 16(a0) // quantptr[DCTSIZE*1] 3173 lh t8, 16(t0) // inptr[DCTSIZE*1] 3174 subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) 3175 lh t7, 48(t0) // inptr[DCTSIZE*3] 3176 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) 3177 mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) 3178 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) 3179 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) 3180 addu t3, t2, t6 // tmp10 = tmp0 + z2 3181 subu t4, t2, t6 // tmp10 = tmp0 - z2 3182 mult $ac0, zero, zero 3183 mult $ac1, zero, zero 3184 ins t5, v0, 16, 16 3185 ins t7, t8, 16, 16 3186 addiu t9, t9, -1 3187 dpa.w.ph $ac0, t5, s0 3188 dpa.w.ph $ac0, t7, s1 3189 dpa.w.ph $ac1, t5, s2 3190 dpa.w.ph $ac1, t7, s3 3191 mflo s4, $ac0 3192 mflo s5, $ac1 3193 addiu a0, a0, 2 3194 addiu t1, t1, 4 3195 addiu t0, t0, 2 3196 addu t6, t4, s4 3197 subu t5, t4, s4 3198 addu s6, t3, s5 3199 subu s7, t3, s5 3200 shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12) 3201 shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12) 3202 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) 3203 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) 3204 sw t6, 28(t1) 3205 sw t5, 60(t1) 3206 sw s6, -4(t1) 3207 bgtz t9, 0b 3208 sw s7, 92(t1) 3209 // second loop three pass 3210 li t9, 3 3211 1: 3212 lh s6, 34(t0) // inptr[DCTSIZE*2] 3213 lh t6, 34(a0) // quantptr[DCTSIZE*2] 3214 lh s7, 98(t0) // inptr[DCTSIZE*6] 3215 lh t7, 98(a0) // quantptr[DCTSIZE*6] 3216 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) 3217 lh s4, 2(t0) // inptr[DCTSIZE*0] 3218 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) 3219 lh s5, 2(a0) // quantptr[DCTSIZE*0] 3220 li s6, 15137 3221 li s7, 6270 3222 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) 3223 mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) 3224 lh t5, 114(t0) // inptr[DCTSIZE*7] 3225 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) 3226 lh s4, 114(a0) // quantptr[DCTSIZE*7] 3227 lh s5, 82(a0) // quantptr[DCTSIZE*5] 3228 lh t6, 82(t0) // inptr[DCTSIZE*5] 3229 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) 3230 lh s6, 50(a0) // quantptr[DCTSIZE*3] 3231 lh t8, 18(t0) // inptr[DCTSIZE*1] 3232 subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) 3233 lh t7, 50(t0) // inptr[DCTSIZE*3] 3234 lh s7, 18(a0) // quantptr[DCTSIZE*1] 3235 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) 3236 mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) 3237 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) 3238 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) 3239 addu t3, t2, v0 // tmp10 = tmp0 + z2 3240 subu t4, t2, v0 // tmp10 = tmp0 - z2 3241 mult $ac0, zero, zero 3242 mult $ac1, zero, zero 3243 ins t5, t6, 16, 16 3244 ins t7, 
t8, 16, 16 3245 dpa.w.ph $ac0, t5, s0 3246 dpa.w.ph $ac0, t7, s1 3247 dpa.w.ph $ac1, t5, s2 3248 dpa.w.ph $ac1, t7, s3 3249 mflo t5, $ac0 3250 mflo t6, $ac1 3251 addiu t9, t9, -1 3252 addiu t0, t0, 2 3253 addiu a0, a0, 2 3254 addiu t1, t1, 4 3255 addu s5, t4, t5 3256 subu s4, t4, t5 3257 addu s6, t3, t6 3258 subu s7, t3, t6 3259 shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12) 3260 shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12) 3261 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) 3262 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) 3263 sw s5, 32(t1) 3264 sw s4, 64(t1) 3265 sw s6, 0(t1) 3266 bgtz t9, 1b 3267 sw s7, 96(t1) 3268 move t1, v1 3269 li s4, 15137 3270 lw s6, 8(t1) // wsptr[2] 3271 li s5, 6270 3272 lw s7, 24(t1) // wsptr[6] 3273 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) 3274 lw t2, 0(t1) // wsptr[0] 3275 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) 3276 lh t5, 28(t1) // wsptr[7] 3277 lh t6, 20(t1) // wsptr[5] 3278 lh t7, 12(t1) // wsptr[3] 3279 lh t8, 4(t1) // wsptr[1] 3280 ins t5, t6, 16, 16 3281 ins t7, t8, 16, 16 3282 mult $ac0, zero, zero 3283 dpa.w.ph $ac0, t5, s0 3284 dpa.w.ph $ac0, t7, s1 3285 mult $ac1, zero, zero 3286 dpa.w.ph $ac1, t5, s2 3287 dpa.w.ph $ac1, t7, s3 3288 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) 3289 mflo s6, $ac0 3290 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) 3291 subu s4, s4, s5 3292 addu t3, t2, s4 // tmp10 = tmp0 + z2 3293 mflo s7, $ac1 3294 subu t4, t2, s4 // tmp10 = tmp0 - z2 3295 addu t7, t4, s6 3296 subu t8, t4, s6 3297 addu t5, t3, s7 3298 subu t6, t3, s7 3299 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) 3300 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) 3301 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) 3302 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) 3303 sll s4, t9, 2 3304 lw v0, 0(a2) // output_buf[ctr] 3305 shll_s.w t5, t5, 24 3306 shll_s.w t6, t6, 24 3307 shll_s.w t7, t7, 24 3308 shll_s.w t8, t8, 24 3309 sra t5, t5, 24 3310 sra t6, t6, 24 3311 sra t7, t7, 24 3312 sra t8, t8, 24 3313 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col 3314 addiu t5, t5, 128 3315 addiu t6, t6, 128 3316 addiu t7, t7, 128 3317 addiu t8, t8, 128 3318 sb t5, 0(v0) 3319 sb t7, 1(v0) 3320 sb t8, 2(v0) 3321 sb t6, 3(v0) 3322 // 2 3323 li s4, 15137 3324 lw s6, 40(t1) // wsptr[2] 3325 li s5, 6270 3326 lw s7, 56(t1) // wsptr[6] 3327 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) 3328 lw t2, 32(t1) // wsptr[0] 3329 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) 3330 lh t5, 60(t1) // wsptr[7] 3331 lh t6, 52(t1) // wsptr[5] 3332 lh t7, 44(t1) // wsptr[3] 3333 lh t8, 36(t1) // wsptr[1] 3334 ins t5, t6, 16, 16 3335 ins t7, t8, 16, 16 3336 mult $ac0, zero, zero 3337 dpa.w.ph $ac0, t5, s0 3338 dpa.w.ph $ac0, t7, s1 3339 mult $ac1, zero, zero 3340 dpa.w.ph $ac1, t5, s2 3341 dpa.w.ph $ac1, t7, s3 3342 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) 3343 mflo s6, $ac0 3344 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) 3345 subu s4, s4, s5 3346 addu t3, t2, s4 // tmp10 = tmp0 + z2 3347 mflo s7, $ac1 3348 subu t4, t2, s4 // tmp10 = tmp0 - z2 3349 addu t7, t4, s6 3350 subu t8, t4, s6 3351 addu t5, t3, s7 3352 subu t6, t3, s7 3353 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1) 3354 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1) 3355 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1) 3356 shra_r.w t8, t8, 19 // 
DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1) 3357 sll s4, t9, 2 3358 lw v0, 4(a2) // output_buf[ctr] 3359 shll_s.w t5, t5, 24 3360 shll_s.w t6, t6, 24 3361 shll_s.w t7, t7, 24 3362 shll_s.w t8, t8, 24 3363 sra t5, t5, 24 3364 sra t6, t6, 24 3365 sra t7, t7, 24 3366 sra t8, t8, 24 3367 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col 3368 addiu t5, t5, 128 3369 addiu t6, t6, 128 3370 addiu t7, t7, 128 3371 addiu t8, t8, 128 3372 sb t5, 0(v0) 3373 sb t7, 1(v0) 3374 sb t8, 2(v0) 3375 sb t6, 3(v0) 3376 // 3 3377 li s4, 15137 3378 lw s6, 72(t1) // wsptr[2] 3379 li s5, 6270 3380 lw s7, 88(t1) // wsptr[6] 3381 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) 3382 lw t2, 64(t1) // wsptr[0] 3383 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) 3384 lh t5, 92(t1) // wsptr[7] 3385 lh t6, 84(t1) // wsptr[5] 3386 lh t7, 76(t1) // wsptr[3] 3387 lh t8, 68(t1) // wsptr[1] 3388 ins t5, t6, 16, 16 3389 ins t7, t8, 16, 16 3390 mult $ac0, zero, zero 3391 dpa.w.ph $ac0, t5, s0 3392 dpa.w.ph $ac0, t7, s1 3393 mult $ac1, zero, zero 3394 dpa.w.ph $ac1, t5, s2 3395 dpa.w.ph $ac1, t7, s3 3396 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) 3397 mflo s6, $ac0 3398 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) 3399 subu s4, s4, s5 3400 addu t3, t2, s4 // tmp10 = tmp0 + z2 3401 mflo s7, $ac1 3402 subu t4, t2, s4 // tmp10 = tmp0 - z2 3403 addu t7, t4, s6 3404 subu t8, t4, s6 3405 addu t5, t3, s7 3406 subu t6, t3, s7 3407 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) 3408 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) 3409 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) 3410 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) 3411 sll s4, t9, 2 3412 lw v0, 8(a2) // output_buf[ctr] 3413 shll_s.w t5, t5, 24 3414 shll_s.w t6, t6, 24 3415 shll_s.w t7, t7, 24 3416 shll_s.w t8, t8, 24 3417 sra t5, t5, 24 3418 sra t6, t6, 24 3419 sra t7, t7, 24 3420 sra t8, t8, 24 3421 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col 3422 addiu t5, t5, 128 3423 addiu t6, t6, 128 3424 addiu t7, t7, 128 3425 addiu t8, t8, 128 3426 sb t5, 0(v0) 3427 sb t7, 1(v0) 3428 sb t8, 2(v0) 3429 sb t6, 3(v0) 3430 li s4, 15137 3431 lw s6, 104(t1) // wsptr[2] 3432 li s5, 6270 3433 lw s7, 120(t1) // wsptr[6] 3434 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) 3435 lw t2, 96(t1) // wsptr[0] 3436 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865) 3437 lh t5, 124(t1) // wsptr[7] 3438 lh t6, 116(t1) // wsptr[5] 3439 lh t7, 108(t1) // wsptr[3] 3440 lh t8, 100(t1) // wsptr[1] 3441 ins t5, t6, 16, 16 3442 ins t7, t8, 16, 16 3443 mult $ac0, zero, zero 3444 dpa.w.ph $ac0, t5, s0 3445 dpa.w.ph $ac0, t7, s1 3446 mult $ac1, zero, zero 3447 dpa.w.ph $ac1, t5, s2 3448 dpa.w.ph $ac1, t7, s3 3449 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) 3450 mflo s6, $ac0 3451 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865) 3452 subu s4, s4, s5 3453 addu t3, t2, s4 // tmp10 = tmp0 + z2; 3454 mflo s7, $ac1 3455 subu t4, t2, s4 // tmp10 = tmp0 - z2; 3456 addu t7, t4, s6 3457 subu t8, t4, s6 3458 addu t5, t3, s7 3459 subu t6, t3, s7 3460 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) 3461 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) 3462 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) 3463 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) 3464 sll s4, t9, 2 3465 lw v0, 12(a2) // output_buf[ctr] 3466 shll_s.w t5, t5, 24 3467 shll_s.w t6, t6, 24 3468 shll_s.w t7, t7, 24 3469 shll_s.w t8, t8, 24 3470 sra t5, t5, 24 
    sra     t6, t6, 24
    sra     t7, t7, 24
    sra     t8, t8, 24
    addu    v0, v0, a3          // outptr = output_buf[ctr] + output_col
    addiu   t5, t5, 128
    addiu   t6, t6, 128
    addiu   t7, t7, 128
    addiu   t8, t8, 128
    sb      t5, 0(v0)
    sb      t7, 1(v0)
    sb      t8, 2(v0)
    sb      t6, 3(v0)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j       ra
    nop
END(jsimd_idct_4x4_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
/*
 * a0 - compptr->dct_table
 * a1 - coef_block
 * a2 - output_buf
 * a3 - output_col
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu   sp, sp, -144
    move    v0, sp
    addiu   v1, v0, 24
    addiu   t9, zero, 5793
    addiu   s0, zero, 10033
    addiu   s1, zero, 2998

1:
    lh      s2, 0(a0)           // q0 = quantptr[ 0]
    lh      s3, 32(a0)          // q1 = quantptr[16]
    lh      s4, 64(a0)          // q2 = quantptr[32]
    lh      t2, 64(a1)          // tmp2 = inptr[32]
    lh      t1, 32(a1)          // tmp1 = inptr[16]
    lh      t0, 0(a1)           // tmp0 = inptr[ 0]
    mul     t2, t2, s4          // tmp2 = tmp2 * q2
    mul     t1, t1, s3          // tmp1 = tmp1 * q1
    mul     t0, t0, s2          // tmp0 = tmp0 * q0
    lh      t6, 16(a1)          // z1 = inptr[ 8]
    lh      t8, 80(a1)          // z3 = inptr[40]
    lh      t7, 48(a1)          // z2 = inptr[24]
    lh      s2, 16(a0)          // q0 = quantptr[ 8]
    lh      s4, 80(a0)          // q2 = quantptr[40]
    lh      s3, 48(a0)          // q1 = quantptr[24]
    mul     t2, t2, t9          // tmp2 = tmp2 * 5793
    mul     t1, t1, s0          // tmp1 = tmp1 * 10033
    sll     t0, t0, 13          // tmp0 = tmp0 << 13
    mul     t6, t6, s2          // z1 = z1 * q0
    mul     t8, t8, s4          // z3 = z3 * q2
    mul     t7, t7, s3          // z2 = z2 * q1
    addu    t3, t0, t2          // tmp10 = tmp0 + tmp2
    sll     t2, t2, 1           // tmp2 = tmp2 << 1
    subu    t4, t0, t2          // tmp11 = tmp0 - tmp2
    subu    t5, t3, t1          // tmp12 = tmp10 - tmp1
    addu    t3, t3, t1          // tmp10 = tmp10 + tmp1
    addu    t1, t6, t8          // tmp1 = z1 + z3
    mul     t1, t1, s1          // tmp1 = tmp1 * 2998
    shra_r.w t4, t4, 11         // tmp11 = (tmp11 + 1024) >> 11
    subu    t2, t6, t8          // tmp2 = z1 - z3
    subu    t2, t2, t7          // tmp2 = tmp2 - z2
    sll     t2, t2, 2           // tmp2 = tmp2 << 2
    addu    t0, t6, t7          // tmp0 = z1 + z2
    sll     t0, t0, 13          // tmp0 = tmp0 << 13
    subu    s2, t8, t7          // q0 = z3 - z2
    sll     s2, s2, 13          // q0 = q0 << 13
    addu    t0, t0, t1          // tmp0 = tmp0 + tmp1
    addu    t1, s2, t1          // tmp1 = q0 + tmp1
    addu    s2, t4, t2          // q0 = tmp11 + tmp2
    subu    s3, t4, t2          // q1 = tmp11 - tmp2
    addu    t6, t3, t0          // z1 = tmp10 + tmp0
    subu    t7, t3, t0          // z2 = tmp10 - tmp0
    addu    t4, t5, t1          // tmp11 = tmp12 + tmp1
    subu    t5, t5, t1          // tmp12 = tmp12 - tmp1
    shra_r.w t6, t6, 11         // z1 = (z1 + 1024) >> 11
    shra_r.w t7, t7, 11         // z2 = (z2 + 1024) >> 11
    shra_r.w t4, t4, 11         // tmp11 = (tmp11 + 1024) >> 11
    shra_r.w t5, t5, 11         // tmp12 = (tmp12 + 1024) >> 11
    sw      s2, 24(v0)
    sw      s3, 96(v0)
    sw      t6, 0(v0)
    sw      t7, 120(v0)
    sw      t4, 48(v0)
    sw      t5, 72(v0)
    addiu   v0, v0, 4
    addiu   a1, a1, 2
    bne     v0, v1, 1b
    addiu   a0, a0, 2

/* Pass 2: process 6 rows from work array, store into output array. */
    move    v0, sp
    addiu   v1, v0, 144

2:
    lw      t0, 0(v0)
    lw      t2, 16(v0)
    lw      s5, 0(a2)
    addiu   t0, t0, 16
    sll     t0, t0, 13
    mul     t3, t2, t9
    lw      t6, 4(v0)
    lw      t8, 20(v0)
    lw      t7, 12(v0)
    addu    s5, s5, a3
    addu    s6, t6, t8
    mul     s6, s6, s1
    addu    t1, t0, t3
    subu    t4, t0, t3
    subu    t4, t4, t3
    lw      t3, 8(v0)
    mul     t0, t3, s0
    addu    s7, t6, t7
    sll     s7, s7, 13
    addu    s7, s6, s7
    subu    t2, t8, t7
    sll     t2, t2, 13
    addu    t2, s6, t2
    subu    s6, t6, t7
    subu    s6, s6, t8
    sll     s6, s6, 13
    addu    t3, t1, t0
    subu    t5, t1, t0
    addu    t6, t3, s7
    subu    t3, t3, s7
    addu    t7, t4, s6
    subu    t4, t4, s6
    addu    t8, t5, t2
    subu    t5, t5, t2
    shll_s.w t6, t6, 6
    shll_s.w t3, t3, 6
    shll_s.w t7, t7, 6
    shll_s.w t4, t4, 6
    shll_s.w t8, t8, 6
    shll_s.w t5, t5, 6
    sra     t6, t6, 24
    addiu   t6, t6, 128
    sra     t3, t3, 24
    addiu   t3, t3, 128
    sb      t6, 0(s5)
    sra     t7, t7, 24
    addiu   t7, t7, 128
    sb      t3, 5(s5)
    sra     t4, t4, 24
    addiu   t4, t4, 128
    sb      t7, 1(s5)
    sra     t8, t8, 24
    addiu   t8, t8, 128
    sb      t4, 4(s5)
    addiu   v0, v0, 24
    sra     t5, t5, 24
    addiu   t5, t5, 128
    sb      t8, 2(s5)
    addiu   a2, a2, 4
    bne     v0, v1, 2b
    sb      t5, 3(s5)

    addiu   sp, sp, 144

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j       ra
    nop

END(jsimd_idct_6x6_mips_dspr2)

/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
/*
 * a0 - compptr->dct_table
 * a1 - coef_block
 * a2 - workspace
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li      a3, 8

1:
    // odd part
    lh      t0, 48(a1)
    lh      t1, 48(a0)
    lh      t2, 16(a1)
    lh      t3, 16(a0)
    lh      t4, 80(a1)
    lh      t5, 80(a0)
    lh      t6, 112(a1)
    lh      t7, 112(a0)
    mul     t0, t0, t1          // z2
    mul     t1, t2, t3          // z1
    mul     t2, t4, t5          // z3
    mul     t3, t6, t7          // z4
    li      t4, 10703           // FIX(1.306562965)
    li      t5, 4433            // FIX_0_541196100
    li      t6, 7053            // FIX(0.860918669)
    mul     t4, t0, t4          // tmp11
    mul     t5, t0, t5          // -tmp14
    addu    t7, t1, t2          // tmp10
    addu    t8, t7, t3          // tmp10 + z4
    mul     t6, t6, t8          // tmp15
    li      t8, 2139            // FIX(0.261052384)
    mul     t8, t7, t8          // MULTIPLY(tmp10, FIX(0.261052384))
    li      t7, 2295            // FIX(0.280143716)
    mul     t7, t1, t7          // MULTIPLY(z1, FIX(0.280143716))
    addu    t9, t2, t3          // z3 + z4
    li      s0, 8565            // FIX(1.045510580)
    mul     t9, t9, s0          // -tmp13
    li      s0, 12112           // FIX(1.478575242)
    mul     s0, t2, s0          // MULTIPLY(z3, FIX(1.478575242))
    li      s1, 12998           // FIX(1.586706681)
    mul     s1, t3, s1          // MULTIPLY(z4, FIX(1.586706681))
    li      s2, 5540            // FIX(0.676326758)
    mul     s2, t1, s2          // MULTIPLY(z1, FIX(0.676326758))
    li      s3, 16244           // FIX(1.982889723)
    mul     s3, t3, s3          // MULTIPLY(z4, FIX(1.982889723))
    subu    t1, t1, t3          // z1 -= z4
    subu    t0, t0, t2          // z2 -= z3
    addu    t2, t0, t1          // z1 + z2
    li      t3, 4433            // FIX_0_541196100
    mul     t2, t2, t3          // z3
    li      t3, 6270            // FIX_0_765366865
    mul     t1, t1, t3          // MULTIPLY(z1, FIX_0_765366865)
    li      t3, 15137           // FIX_1_847759065
    mul     t0, t0, t3          // MULTIPLY(z2, FIX_1_847759065)
    addu    t8, t6, t8          // tmp12
    addu    t3, t8, t4          // tmp12 + tmp11
    addu    t3, t3, t7          // tmp10
    subu    t8, t8, t9          // tmp12 + tmp13
    addu    s0, t5, s0
    subu    t8, t8, s0          // tmp12
    subu    t9, t6, t9
    subu    s1, s1, t4
addu t9, t9, s1 // tmp13 3712 subu t6, t6, t5 3713 subu t6, t6, s2 3714 subu t6, t6, s3 // tmp15 3715 // even part start 3716 lh t4, 64(a1) 3717 lh t5, 64(a0) 3718 lh t7, 32(a1) 3719 lh s0, 32(a0) 3720 lh s1, 0(a1) 3721 lh s2, 0(a0) 3722 lh s3, 96(a1) 3723 lh v0, 96(a0) 3724 mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4]) 3725 mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2]) 3726 mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0]) 3727 mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6]) 3728 // odd part end 3729 addu t1, t2, t1 // tmp11 3730 subu t0, t2, t0 // tmp14 3731 // update counter and pointers 3732 addiu a3, a3, -1 3733 addiu a0, a0, 2 3734 addiu a1, a1, 2 3735 // even part rest 3736 li s1, 10033 3737 li s2, 11190 3738 mul t4, t4, s1 // z4 3739 mul s1, t5, s2 // z4 3740 sll t5, t5, 13 // z1 3741 sll t7, t7, 13 3742 addiu t7, t7, 1024 // z3 3743 sll s0, s0, 13 // z2 3744 addu s2, t7, t4 // tmp10 3745 subu t4, t7, t4 // tmp11 3746 subu s3, t5, s0 // tmp12 3747 addu t2, t7, s3 // tmp21 3748 subu s3, t7, s3 // tmp24 3749 addu t7, s1, s0 // tmp12 3750 addu v0, s2, t7 // tmp20 3751 subu s2, s2, t7 // tmp25 3752 subu s1, s1, t5 // z4 - z1 3753 subu s1, s1, s0 // tmp12 3754 addu s0, t4, s1 // tmp22 3755 subu t4, t4, s1 // tmp23 3756 // final output stage 3757 addu t5, v0, t3 3758 subu v0, v0, t3 3759 addu t3, t2, t1 3760 subu t2, t2, t1 3761 addu t1, s0, t8 3762 subu s0, s0, t8 3763 addu t8, t4, t9 3764 subu t4, t4, t9 3765 addu t9, s3, t0 3766 subu s3, s3, t0 3767 addu t0, s2, t6 3768 subu s2, s2, t6 3769 sra t5, t5, 11 3770 sra t3, t3, 11 3771 sra t1, t1, 11 3772 sra t8, t8, 11 3773 sra t9, t9, 11 3774 sra t0, t0, 11 3775 sra s2, s2, 11 3776 sra s3, s3, 11 3777 sra t4, t4, 11 3778 sra s0, s0, 11 3779 sra t2, t2, 11 3780 sra v0, v0, 11 3781 sw t5, 0(a2) 3782 sw t3, 32(a2) 3783 sw t1, 64(a2) 3784 sw t8, 96(a2) 3785 sw t9, 128(a2) 3786 sw t0, 160(a2) 3787 sw s2, 192(a2) 3788 sw s3, 224(a2) 3789 sw t4, 256(a2) 3790 sw s0, 288(a2) 3791 sw t2, 320(a2) 3792 sw v0, 352(a2) 3793 bgtz a3, 1b 3794 addiu a2, a2, 4 3795 3796 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 3797 3798 j ra 3799 nop 3800 3801 END(jsimd_idct_12x12_pass1_mips_dspr2) 3802 3803 /*****************************************************************************/ 3804 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2) 3805 /* 3806 * a0 - workspace 3807 * a1 - output 3808 */ 3809 3810 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 3811 3812 li a3, 12 3813 3814 1: 3815 // Odd part 3816 lw t0, 12(a0) 3817 lw t1, 4(a0) 3818 lw t2, 20(a0) 3819 lw t3, 28(a0) 3820 li t4, 10703 // FIX(1.306562965) 3821 li t5, 4433 // FIX_0_541196100 3822 mul t4, t0, t4 // tmp11 3823 mul t5, t0, t5 // -tmp14 3824 addu t6, t1, t2 // tmp10 3825 li t7, 2139 // FIX(0.261052384) 3826 mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) 3827 addu t6, t6, t3 // tmp10 + z4 3828 li t8, 7053 // FIX(0.860918669) 3829 mul t6, t6, t8 // tmp15 3830 li t8, 2295 // FIX(0.280143716) 3831 mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) 3832 addu t9, t2, t3 // z3 + z4 3833 li s0, 8565 // FIX(1.045510580) 3834 mul t9, t9, s0 // -tmp13 3835 li s0, 12112 // FIX(1.478575242) 3836 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) 3837 li s1, 12998 // FIX(1.586706681) 3838 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) 3839 li s2, 5540 // FIX(0.676326758) 3840 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) 3841 li s3, 16244 // FIX(1.982889723) 3842 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) 3843 subu t1, t1, t3 // z1 -= 
z4 3844 subu t0, t0, t2 // z2 -= z3 3845 addu t2, t1, t0 // z1 + z2 3846 li t3, 4433 // FIX_0_541196100 3847 mul t2, t2, t3 // z3 3848 li t3, 6270 // FIX_0_765366865 3849 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) 3850 li t3, 15137 // FIX_1_847759065 3851 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) 3852 addu t3, t6, t7 // tmp12 3853 addu t7, t3, t4 3854 addu t7, t7, t8 // tmp10 3855 subu t3, t3, t9 3856 subu t3, t3, t5 3857 subu t3, t3, s0 // tmp12 3858 subu t9, t6, t9 3859 subu t9, t9, t4 3860 addu t9, t9, s1 // tmp13 3861 subu t6, t6, t5 3862 subu t6, t6, s2 3863 subu t6, t6, s3 // tmp15 3864 addu t1, t2, t1 // tmp11 3865 subu t0, t2, t0 // tmp14 3866 // even part 3867 lw t2, 16(a0) // z4 3868 lw t4, 8(a0) // z1 3869 lw t5, 0(a0) // z3 3870 lw t8, 24(a0) // z2 3871 li s0, 10033 // FIX(1.224744871) 3872 li s1, 11190 // FIX(1.366025404) 3873 mul t2, t2, s0 // z4 3874 mul s0, t4, s1 // z4 3875 addiu t5, t5, 0x10 3876 sll t5, t5, 13 // z3 3877 sll t4, t4, 13 // z1 3878 sll t8, t8, 13 // z2 3879 subu s1, t4, t8 // tmp12 3880 addu s2, t5, t2 // tmp10 3881 subu t2, t5, t2 // tmp11 3882 addu s3, t5, s1 // tmp21 3883 subu s1, t5, s1 // tmp24 3884 addu t5, s0, t8 // tmp12 3885 addu v0, s2, t5 // tmp20 3886 subu t5, s2, t5 // tmp25 3887 subu t4, s0, t4 3888 subu t4, t4, t8 // tmp12 3889 addu t8, t2, t4 // tmp22 3890 subu t2, t2, t4 // tmp23 3891 // increment counter and pointers 3892 addiu a3, a3, -1 3893 addiu a0, a0, 32 3894 // Final stage 3895 addu t4, v0, t7 3896 subu v0, v0, t7 3897 addu t7, s3, t1 3898 subu s3, s3, t1 3899 addu t1, t8, t3 3900 subu t8, t8, t3 3901 addu t3, t2, t9 3902 subu t2, t2, t9 3903 addu t9, s1, t0 3904 subu s1, s1, t0 3905 addu t0, t5, t6 3906 subu t5, t5, t6 3907 sll t4, t4, 4 3908 sll t7, t7, 4 3909 sll t1, t1, 4 3910 sll t3, t3, 4 3911 sll t9, t9, 4 3912 sll t0, t0, 4 3913 sll t5, t5, 4 3914 sll s1, s1, 4 3915 sll t2, t2, 4 3916 sll t8, t8, 4 3917 sll s3, s3, 4 3918 sll v0, v0, 4 3919 shll_s.w t4, t4, 2 3920 shll_s.w t7, t7, 2 3921 shll_s.w t1, t1, 2 3922 shll_s.w t3, t3, 2 3923 shll_s.w t9, t9, 2 3924 shll_s.w t0, t0, 2 3925 shll_s.w t5, t5, 2 3926 shll_s.w s1, s1, 2 3927 shll_s.w t2, t2, 2 3928 shll_s.w t8, t8, 2 3929 shll_s.w s3, s3, 2 3930 shll_s.w v0, v0, 2 3931 srl t4, t4, 24 3932 srl t7, t7, 24 3933 srl t1, t1, 24 3934 srl t3, t3, 24 3935 srl t9, t9, 24 3936 srl t0, t0, 24 3937 srl t5, t5, 24 3938 srl s1, s1, 24 3939 srl t2, t2, 24 3940 srl t8, t8, 24 3941 srl s3, s3, 24 3942 srl v0, v0, 24 3943 lw t6, 0(a1) 3944 addiu t4, t4, 0x80 3945 addiu t7, t7, 0x80 3946 addiu t1, t1, 0x80 3947 addiu t3, t3, 0x80 3948 addiu t9, t9, 0x80 3949 addiu t0, t0, 0x80 3950 addiu t5, t5, 0x80 3951 addiu s1, s1, 0x80 3952 addiu t2, t2, 0x80 3953 addiu t8, t8, 0x80 3954 addiu s3, s3, 0x80 3955 addiu v0, v0, 0x80 3956 sb t4, 0(t6) 3957 sb t7, 1(t6) 3958 sb t1, 2(t6) 3959 sb t3, 3(t6) 3960 sb t9, 4(t6) 3961 sb t0, 5(t6) 3962 sb t5, 6(t6) 3963 sb s1, 7(t6) 3964 sb t2, 8(t6) 3965 sb t8, 9(t6) 3966 sb s3, 10(t6) 3967 sb v0, 11(t6) 3968 bgtz a3, 1b 3969 addiu a1, a1, 4 3970 3971 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 3972 3973 jr ra 3974 nop 3975 3976 END(jsimd_idct_12x12_pass2_mips_dspr2) 3977 3978 /*****************************************************************************/ 3979 LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2) 3980 /* 3981 * a0 - sample_data 3982 * a1 - start_col 3983 * a2 - workspace 3984 */ 3985 3986 lw t0, 0(a0) 3987 li t7, 0xff80ff80 3988 addu t0, t0, a1 3989 ulw t1, 0(t0) 3990 ulw t2, 4(t0) 3991 preceu.ph.qbr t3, t1 3992 preceu.ph.qbl t4, t1 3993 lw t0, 
4(a0) 3994 preceu.ph.qbr t5, t2 3995 preceu.ph.qbl t6, t2 3996 addu t0, t0, a1 3997 addu.ph t3, t3, t7 3998 addu.ph t4, t4, t7 3999 ulw t1, 0(t0) 4000 ulw t2, 4(t0) 4001 addu.ph t5, t5, t7 4002 addu.ph t6, t6, t7 4003 usw t3, 0(a2) 4004 usw t4, 4(a2) 4005 preceu.ph.qbr t3, t1 4006 preceu.ph.qbl t4, t1 4007 usw t5, 8(a2) 4008 usw t6, 12(a2) 4009 4010 lw t0, 8(a0) 4011 preceu.ph.qbr t5, t2 4012 preceu.ph.qbl t6, t2 4013 addu t0, t0, a1 4014 addu.ph t3, t3, t7 4015 addu.ph t4, t4, t7 4016 ulw t1, 0(t0) 4017 ulw t2, 4(t0) 4018 addu.ph t5, t5, t7 4019 addu.ph t6, t6, t7 4020 usw t3, 16(a2) 4021 usw t4, 20(a2) 4022 preceu.ph.qbr t3, t1 4023 preceu.ph.qbl t4, t1 4024 usw t5, 24(a2) 4025 usw t6, 28(a2) 4026 4027 lw t0, 12(a0) 4028 preceu.ph.qbr t5, t2 4029 preceu.ph.qbl t6, t2 4030 addu t0, t0, a1 4031 addu.ph t3, t3, t7 4032 addu.ph t4, t4, t7 4033 ulw t1, 0(t0) 4034 ulw t2, 4(t0) 4035 addu.ph t5, t5, t7 4036 addu.ph t6, t6, t7 4037 usw t3, 32(a2) 4038 usw t4, 36(a2) 4039 preceu.ph.qbr t3, t1 4040 preceu.ph.qbl t4, t1 4041 usw t5, 40(a2) 4042 usw t6, 44(a2) 4043 4044 lw t0, 16(a0) 4045 preceu.ph.qbr t5, t2 4046 preceu.ph.qbl t6, t2 4047 addu t0, t0, a1 4048 addu.ph t3, t3, t7 4049 addu.ph t4, t4, t7 4050 ulw t1, 0(t0) 4051 ulw t2, 4(t0) 4052 addu.ph t5, t5, t7 4053 addu.ph t6, t6, t7 4054 usw t3, 48(a2) 4055 usw t4, 52(a2) 4056 preceu.ph.qbr t3, t1 4057 preceu.ph.qbl t4, t1 4058 usw t5, 56(a2) 4059 usw t6, 60(a2) 4060 4061 lw t0, 20(a0) 4062 preceu.ph.qbr t5, t2 4063 preceu.ph.qbl t6, t2 4064 addu t0, t0, a1 4065 addu.ph t3, t3, t7 4066 addu.ph t4, t4, t7 4067 ulw t1, 0(t0) 4068 ulw t2, 4(t0) 4069 addu.ph t5, t5, t7 4070 addu.ph t6, t6, t7 4071 usw t3, 64(a2) 4072 usw t4, 68(a2) 4073 preceu.ph.qbr t3, t1 4074 preceu.ph.qbl t4, t1 4075 usw t5, 72(a2) 4076 usw t6, 76(a2) 4077 4078 lw t0, 24(a0) 4079 preceu.ph.qbr t5, t2 4080 preceu.ph.qbl t6, t2 4081 addu t0, t0, a1 4082 addu.ph t3, t3, t7 4083 addu.ph t4, t4, t7 4084 ulw t1, 0(t0) 4085 ulw t2, 4(t0) 4086 addu.ph t5, t5, t7 4087 addu.ph t6, t6, t7 4088 usw t3, 80(a2) 4089 usw t4, 84(a2) 4090 preceu.ph.qbr t3, t1 4091 preceu.ph.qbl t4, t1 4092 usw t5, 88(a2) 4093 usw t6, 92(a2) 4094 4095 lw t0, 28(a0) 4096 preceu.ph.qbr t5, t2 4097 preceu.ph.qbl t6, t2 4098 addu t0, t0, a1 4099 addu.ph t3, t3, t7 4100 addu.ph t4, t4, t7 4101 ulw t1, 0(t0) 4102 ulw t2, 4(t0) 4103 addu.ph t5, t5, t7 4104 addu.ph t6, t6, t7 4105 usw t3, 96(a2) 4106 usw t4, 100(a2) 4107 preceu.ph.qbr t3, t1 4108 preceu.ph.qbl t4, t1 4109 usw t5, 104(a2) 4110 usw t6, 108(a2) 4111 preceu.ph.qbr t5, t2 4112 preceu.ph.qbl t6, t2 4113 addu.ph t3, t3, t7 4114 addu.ph t4, t4, t7 4115 addu.ph t5, t5, t7 4116 addu.ph t6, t6, t7 4117 usw t3, 112(a2) 4118 usw t4, 116(a2) 4119 usw t5, 120(a2) 4120 usw t6, 124(a2) 4121 4122 j ra 4123 nop 4124 4125 END(jsimd_convsamp_mips_dspr2) 4126 4127 /*****************************************************************************/ 4128 LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2) 4129 /* 4130 * a0 - sample_data 4131 * a1 - start_col 4132 * a2 - workspace 4133 */ 4134 4135 .set at 4136 4137 lw t0, 0(a0) 4138 addu t0, t0, a1 4139 lbu t1, 0(t0) 4140 lbu t2, 1(t0) 4141 lbu t3, 2(t0) 4142 lbu t4, 3(t0) 4143 lbu t5, 4(t0) 4144 lbu t6, 5(t0) 4145 lbu t7, 6(t0) 4146 lbu t8, 7(t0) 4147 addiu t1, t1, -128 4148 addiu t2, t2, -128 4149 addiu t3, t3, -128 4150 addiu t4, t4, -128 4151 addiu t5, t5, -128 4152 addiu t6, t6, -128 4153 addiu t7, t7, -128 4154 addiu t8, t8, -128 4155 mtc1 t1, f2 4156 mtc1 t2, f4 4157 mtc1 t3, f6 4158 mtc1 t4, f8 4159 mtc1 t5, f10 4160 mtc1 t6, 
f12 4161 mtc1 t7, f14 4162 mtc1 t8, f16 4163 cvt.s.w f2, f2 4164 cvt.s.w f4, f4 4165 cvt.s.w f6, f6 4166 cvt.s.w f8, f8 4167 cvt.s.w f10, f10 4168 cvt.s.w f12, f12 4169 cvt.s.w f14, f14 4170 cvt.s.w f16, f16 4171 lw t0, 4(a0) 4172 swc1 f2, 0(a2) 4173 swc1 f4, 4(a2) 4174 swc1 f6, 8(a2) 4175 addu t0, t0, a1 4176 swc1 f8, 12(a2) 4177 swc1 f10, 16(a2) 4178 swc1 f12, 20(a2) 4179 swc1 f14, 24(a2) 4180 swc1 f16, 28(a2) 4181 //elemr 1 4182 lbu t1, 0(t0) 4183 lbu t2, 1(t0) 4184 lbu t3, 2(t0) 4185 lbu t4, 3(t0) 4186 lbu t5, 4(t0) 4187 lbu t6, 5(t0) 4188 lbu t7, 6(t0) 4189 lbu t8, 7(t0) 4190 addiu t1, t1, -128 4191 addiu t2, t2, -128 4192 addiu t3, t3, -128 4193 addiu t4, t4, -128 4194 addiu t5, t5, -128 4195 addiu t6, t6, -128 4196 addiu t7, t7, -128 4197 addiu t8, t8, -128 4198 mtc1 t1, f2 4199 mtc1 t2, f4 4200 mtc1 t3, f6 4201 mtc1 t4, f8 4202 mtc1 t5, f10 4203 mtc1 t6, f12 4204 mtc1 t7, f14 4205 mtc1 t8, f16 4206 cvt.s.w f2, f2 4207 cvt.s.w f4, f4 4208 cvt.s.w f6, f6 4209 cvt.s.w f8, f8 4210 cvt.s.w f10, f10 4211 cvt.s.w f12, f12 4212 cvt.s.w f14, f14 4213 cvt.s.w f16, f16 4214 lw t0, 8(a0) 4215 swc1 f2, 32(a2) 4216 swc1 f4, 36(a2) 4217 swc1 f6, 40(a2) 4218 addu t0, t0, a1 4219 swc1 f8, 44(a2) 4220 swc1 f10, 48(a2) 4221 swc1 f12, 52(a2) 4222 swc1 f14, 56(a2) 4223 swc1 f16, 60(a2) 4224 //elemr 2 4225 lbu t1, 0(t0) 4226 lbu t2, 1(t0) 4227 lbu t3, 2(t0) 4228 lbu t4, 3(t0) 4229 lbu t5, 4(t0) 4230 lbu t6, 5(t0) 4231 lbu t7, 6(t0) 4232 lbu t8, 7(t0) 4233 addiu t1, t1, -128 4234 addiu t2, t2, -128 4235 addiu t3, t3, -128 4236 addiu t4, t4, -128 4237 addiu t5, t5, -128 4238 addiu t6, t6, -128 4239 addiu t7, t7, -128 4240 addiu t8, t8, -128 4241 mtc1 t1, f2 4242 mtc1 t2, f4 4243 mtc1 t3, f6 4244 mtc1 t4, f8 4245 mtc1 t5, f10 4246 mtc1 t6, f12 4247 mtc1 t7, f14 4248 mtc1 t8, f16 4249 cvt.s.w f2, f2 4250 cvt.s.w f4, f4 4251 cvt.s.w f6, f6 4252 cvt.s.w f8, f8 4253 cvt.s.w f10, f10 4254 cvt.s.w f12, f12 4255 cvt.s.w f14, f14 4256 cvt.s.w f16, f16 4257 lw t0, 12(a0) 4258 swc1 f2, 64(a2) 4259 swc1 f4, 68(a2) 4260 swc1 f6, 72(a2) 4261 addu t0, t0, a1 4262 swc1 f8, 76(a2) 4263 swc1 f10, 80(a2) 4264 swc1 f12, 84(a2) 4265 swc1 f14, 88(a2) 4266 swc1 f16, 92(a2) 4267 //elemr 3 4268 lbu t1, 0(t0) 4269 lbu t2, 1(t0) 4270 lbu t3, 2(t0) 4271 lbu t4, 3(t0) 4272 lbu t5, 4(t0) 4273 lbu t6, 5(t0) 4274 lbu t7, 6(t0) 4275 lbu t8, 7(t0) 4276 addiu t1, t1, -128 4277 addiu t2, t2, -128 4278 addiu t3, t3, -128 4279 addiu t4, t4, -128 4280 addiu t5, t5, -128 4281 addiu t6, t6, -128 4282 addiu t7, t7, -128 4283 addiu t8, t8, -128 4284 mtc1 t1, f2 4285 mtc1 t2, f4 4286 mtc1 t3, f6 4287 mtc1 t4, f8 4288 mtc1 t5, f10 4289 mtc1 t6, f12 4290 mtc1 t7, f14 4291 mtc1 t8, f16 4292 cvt.s.w f2, f2 4293 cvt.s.w f4, f4 4294 cvt.s.w f6, f6 4295 cvt.s.w f8, f8 4296 cvt.s.w f10, f10 4297 cvt.s.w f12, f12 4298 cvt.s.w f14, f14 4299 cvt.s.w f16, f16 4300 lw t0, 16(a0) 4301 swc1 f2, 96(a2) 4302 swc1 f4, 100(a2) 4303 swc1 f6, 104(a2) 4304 addu t0, t0, a1 4305 swc1 f8, 108(a2) 4306 swc1 f10, 112(a2) 4307 swc1 f12, 116(a2) 4308 swc1 f14, 120(a2) 4309 swc1 f16, 124(a2) 4310 //elemr 4 4311 lbu t1, 0(t0) 4312 lbu t2, 1(t0) 4313 lbu t3, 2(t0) 4314 lbu t4, 3(t0) 4315 lbu t5, 4(t0) 4316 lbu t6, 5(t0) 4317 lbu t7, 6(t0) 4318 lbu t8, 7(t0) 4319 addiu t1, t1, -128 4320 addiu t2, t2, -128 4321 addiu t3, t3, -128 4322 addiu t4, t4, -128 4323 addiu t5, t5, -128 4324 addiu t6, t6, -128 4325 addiu t7, t7, -128 4326 addiu t8, t8, -128 4327 mtc1 t1, f2 4328 mtc1 t2, f4 4329 mtc1 t3, f6 4330 mtc1 t4, f8 4331 mtc1 t5, f10 4332 mtc1 t6, f12 4333 mtc1 t7, f14 4334 mtc1 t8, 
f16 4335 cvt.s.w f2, f2 4336 cvt.s.w f4, f4 4337 cvt.s.w f6, f6 4338 cvt.s.w f8, f8 4339 cvt.s.w f10, f10 4340 cvt.s.w f12, f12 4341 cvt.s.w f14, f14 4342 cvt.s.w f16, f16 4343 lw t0, 20(a0) 4344 swc1 f2, 128(a2) 4345 swc1 f4, 132(a2) 4346 swc1 f6, 136(a2) 4347 addu t0, t0, a1 4348 swc1 f8, 140(a2) 4349 swc1 f10, 144(a2) 4350 swc1 f12, 148(a2) 4351 swc1 f14, 152(a2) 4352 swc1 f16, 156(a2) 4353 //elemr 5 4354 lbu t1, 0(t0) 4355 lbu t2, 1(t0) 4356 lbu t3, 2(t0) 4357 lbu t4, 3(t0) 4358 lbu t5, 4(t0) 4359 lbu t6, 5(t0) 4360 lbu t7, 6(t0) 4361 lbu t8, 7(t0) 4362 addiu t1, t1, -128 4363 addiu t2, t2, -128 4364 addiu t3, t3, -128 4365 addiu t4, t4, -128 4366 addiu t5, t5, -128 4367 addiu t6, t6, -128 4368 addiu t7, t7, -128 4369 addiu t8, t8, -128 4370 mtc1 t1, f2 4371 mtc1 t2, f4 4372 mtc1 t3, f6 4373 mtc1 t4, f8 4374 mtc1 t5, f10 4375 mtc1 t6, f12 4376 mtc1 t7, f14 4377 mtc1 t8, f16 4378 cvt.s.w f2, f2 4379 cvt.s.w f4, f4 4380 cvt.s.w f6, f6 4381 cvt.s.w f8, f8 4382 cvt.s.w f10, f10 4383 cvt.s.w f12, f12 4384 cvt.s.w f14, f14 4385 cvt.s.w f16, f16 4386 lw t0, 24(a0) 4387 swc1 f2, 160(a2) 4388 swc1 f4, 164(a2) 4389 swc1 f6, 168(a2) 4390 addu t0, t0, a1 4391 swc1 f8, 172(a2) 4392 swc1 f10, 176(a2) 4393 swc1 f12, 180(a2) 4394 swc1 f14, 184(a2) 4395 swc1 f16, 188(a2) 4396 //elemr 6 4397 lbu t1, 0(t0) 4398 lbu t2, 1(t0) 4399 lbu t3, 2(t0) 4400 lbu t4, 3(t0) 4401 lbu t5, 4(t0) 4402 lbu t6, 5(t0) 4403 lbu t7, 6(t0) 4404 lbu t8, 7(t0) 4405 addiu t1, t1, -128 4406 addiu t2, t2, -128 4407 addiu t3, t3, -128 4408 addiu t4, t4, -128 4409 addiu t5, t5, -128 4410 addiu t6, t6, -128 4411 addiu t7, t7, -128 4412 addiu t8, t8, -128 4413 mtc1 t1, f2 4414 mtc1 t2, f4 4415 mtc1 t3, f6 4416 mtc1 t4, f8 4417 mtc1 t5, f10 4418 mtc1 t6, f12 4419 mtc1 t7, f14 4420 mtc1 t8, f16 4421 cvt.s.w f2, f2 4422 cvt.s.w f4, f4 4423 cvt.s.w f6, f6 4424 cvt.s.w f8, f8 4425 cvt.s.w f10, f10 4426 cvt.s.w f12, f12 4427 cvt.s.w f14, f14 4428 cvt.s.w f16, f16 4429 lw t0, 28(a0) 4430 swc1 f2, 192(a2) 4431 swc1 f4, 196(a2) 4432 swc1 f6, 200(a2) 4433 addu t0, t0, a1 4434 swc1 f8, 204(a2) 4435 swc1 f10, 208(a2) 4436 swc1 f12, 212(a2) 4437 swc1 f14, 216(a2) 4438 swc1 f16, 220(a2) 4439 //elemr 7 4440 lbu t1, 0(t0) 4441 lbu t2, 1(t0) 4442 lbu t3, 2(t0) 4443 lbu t4, 3(t0) 4444 lbu t5, 4(t0) 4445 lbu t6, 5(t0) 4446 lbu t7, 6(t0) 4447 lbu t8, 7(t0) 4448 addiu t1, t1, -128 4449 addiu t2, t2, -128 4450 addiu t3, t3, -128 4451 addiu t4, t4, -128 4452 addiu t5, t5, -128 4453 addiu t6, t6, -128 4454 addiu t7, t7, -128 4455 addiu t8, t8, -128 4456 mtc1 t1, f2 4457 mtc1 t2, f4 4458 mtc1 t3, f6 4459 mtc1 t4, f8 4460 mtc1 t5, f10 4461 mtc1 t6, f12 4462 mtc1 t7, f14 4463 mtc1 t8, f16 4464 cvt.s.w f2, f2 4465 cvt.s.w f4, f4 4466 cvt.s.w f6, f6 4467 cvt.s.w f8, f8 4468 cvt.s.w f10, f10 4469 cvt.s.w f12, f12 4470 cvt.s.w f14, f14 4471 cvt.s.w f16, f16 4472 swc1 f2, 224(a2) 4473 swc1 f4, 228(a2) 4474 swc1 f6, 232(a2) 4475 swc1 f8, 236(a2) 4476 swc1 f10, 240(a2) 4477 swc1 f12, 244(a2) 4478 swc1 f14, 248(a2) 4479 swc1 f16, 252(a2) 4480 4481 j ra 4482 nop 4483 4484 END(jsimd_convsamp_float_mips_dspr2) 4485 4486 /*****************************************************************************/ 4487 4488
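/*
 * For orientation, the per-sample work done by jsimd_convsamp_float above is
 * equivalent to the following scalar C sketch (an illustration only, assuming
 * 8-bit samples and an 8x8 block; the function name and plain-C types are
 * hypothetical and not the actual libjpeg-turbo prototype):
 *
 *   void convsamp_float_ref(unsigned char **sample_data, int start_col,
 *                           float *workspace)
 *   {
 *     for (int row = 0; row < 8; row++) {
 *       const unsigned char *p = sample_data[row] + start_col;
 *       for (int col = 0; col < 8; col++)
 *         *workspace++ = (float)(p[col] - 128);  // center samples on zero
 *     }
 *   }
 */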