Home | History | Annotate | Download | only in simd
      1 /*
      2  * MIPS DSPr2 optimizations for libjpeg-turbo
      3  *
      4  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
      5  * All Rights Reserved.
      6  * Authors:  Teodora Novkovic (teodora.novkovic (at) imgtec.com)
      7  *           Darko Laus       (darko.laus (at) imgtec.com)
      8  * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
      9  * This software is provided 'as-is', without any express or implied
     10  * warranty.  In no event will the authors be held liable for any damages
     11  * arising from the use of this software.
     12  *
     13  * Permission is granted to anyone to use this software for any purpose,
     14  * including commercial applications, and to alter it and redistribute it
     15  * freely, subject to the following restrictions:
     16  *
     17  * 1. The origin of this software must not be misrepresented; you must not
     18  *    claim that you wrote the original software. If you use this software
     19  *    in a product, an acknowledgment in the product documentation would be
     20  *    appreciated but is not required.
     21  * 2. Altered source versions must be plainly marked as such, and must not be
     22  *    misrepresented as being the original software.
     23  * 3. This notice may not be removed or altered from any source distribution.
     24  */
     25 
     26 #include "jsimd_mips_dspr2_asm.h"
     27 
     28 /*****************************************************************************/
     29 LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
     30 /*
     31  * a0     - cinfo->image_width
     32  * a1     - input_buf
     33  * a2     - output_buf
     34  * a3     - output_row
     35  * 16(sp) - num_rows
     36  * 20(sp) - cinfo->num_components
     37  *
     38  * Null conversion for compression
     39  */
        /*
         * For each of num_rows input rows, de-interleaves the pixel data:
         * byte k of component ci is read from inptr[ci + k * num_components]
         * and written to output_buf[ci][output_row][k].  After each row,
         * input_buf advances by one pointer and output_row by one.
         *
         * Two code paths:
         *   labels 0-3: image_width & 3 != 0.  The (image_width & 3) leading
         *               bytes are copied one at a time, the rest four at a time.
         *   labels 4-6: image_width is a multiple of 4; four bytes at a time.
         *
         * The 4-wide loops are do-while loops.  On the residual path the 4-wide
         * loop is therefore skipped (branch to label 8) when image_width < 4,
         * because the byte loop has already completed the row; without that
         * check the loop would step past the end pointer and never terminate.
         *
         * An instruction indented by one extra space sits in a branch delay
         * slot and executes whether or not the branch is taken.
         * The stack offsets used below are 8 larger than in the table above;
         * presumably SAVE_REGS_ON_STACK 8 lowers sp by 8 (macro defined in the
         * included header).
         */
     40 
     41     SAVE_REGS_ON_STACK 8, s0, s1
     42 
     43     lw        t9, 24(sp)   // t9 = num_rows
     44     lw        s0, 28(sp)   // s0 = cinfo->num_components
     45     andi      t0, a0, 3    // t0 = cinfo->image_width & 3
     46     beqz      t0, 4f       // no residual
     47      nop
     48 0:
     49     addiu     t9, t9, -1
     50     bltz      t9, 7f       // all rows processed
     51      li       t1, 0        // t1 = ci = 0
     52 1:
     53     sll       t3, t1, 2    // t3 = ci * 4
     54     lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
     55     lw        t2, 0(a1)    // t2 = inptr = *input_buf
     56     sll       t4, a3, 2    // t4 = output_row * 4
     57     lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
     58     addu      t2, t2, t1   // t2 = inptr + ci
     59     addu      s1, t5, a0   // s1 = outptr + image_width (end of row)
     60     addu      t6, t5, t0   // t6 = outptr + (image_width & 3)
     61 2:                         // copy the residual bytes one at a time
     62     lbu       t3, 0(t2)
     63     addiu     t5, t5, 1
     64     sb        t3, -1(t5)
     65     bne       t6, t5, 2b
     66      addu     t2, t2, s0   // inptr += num_components
            beq       s1, t5, 8f   // image_width < 4: row already complete
             nop
     67 3:                         // copy the remaining bytes four at a time
     68     lbu       t3, 0(t2)
     69     addu      t4, t2, s0
     70     addu      t7, t4, s0
     71     addu      t8, t7, s0
     72     addu      t2, t8, s0   // inptr += 4 * num_components
     73     lbu       t4, 0(t4)
     74     lbu       t7, 0(t7)
     75     lbu       t8, 0(t8)
     76     addiu     t5, t5, 4
     77     sb        t3, -4(t5)
     78     sb        t4, -3(t5)
     79     sb        t7, -2(t5)
     80     bne       s1, t5, 3b
     81      sb       t8, -1(t5)
        8:
     82     addiu     t1, t1, 1    // ++ci
     83     bne       t1, s0, 1b   // next component
     84      nop
     85     addiu     a1, a1, 4    // ++input_buf
     86     bgez      t9, 0b       // t9 >= 0 here, so this always loops back
     87      addiu    a3, a3, 1    // ++output_row
     88     b         7f
     89      nop
     90 4:
     91     addiu     t9, t9, -1
     92     bltz      t9, 7f       // all rows processed
     93      li       t1, 0        // t1 = ci = 0
     94 5:
     95     sll       t3, t1, 2    // t3 = ci * 4
     96     lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
     97     lw        t2, 0(a1)    // t2 = inptr = *input_buf
     98     sll       t4, a3, 2    // t4 = output_row * 4
     99     lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
    100     addu      t2, t2, t1   // t2 = inptr + ci
    101     addu      s1, t5, a0   // s1 = outptr + image_width (end of row)
    102     addu      t6, t5, t0   // t6 is not read on this path
    103 6:                         // copy four bytes per iteration
    104     lbu       t3, 0(t2)
    105     addu      t4, t2, s0
    106     addu      t7, t4, s0
    107     addu      t8, t7, s0
    108     addu      t2, t8, s0   // inptr += 4 * num_components
    109     lbu       t4, 0(t4)
    110     lbu       t7, 0(t7)
    111     lbu       t8, 0(t8)
    112     addiu     t5, t5, 4
    113     sb        t3, -4(t5)
    114     sb        t4, -3(t5)
    115     sb        t7, -2(t5)
    116     bne       s1, t5, 6b
    117      sb       t8, -1(t5)
    118     addiu     t1, t1, 1    // ++ci
    119     bne       t1, s0, 5b   // next component
    120      nop
    121     addiu     a1, a1, 4    // ++input_buf
    122     bgez      t9, 4b       // t9 >= 0 here, so this always loops back
    123      addiu    a3, a3, 1    // ++output_row
    124 7:
    125     RESTORE_REGS_FROM_STACK 8, s0, s1
    126 
    127     j         ra
    128      nop
    129 
    130 END(jsimd_c_null_convert_mips_dspr2)
    131 
    132 /*****************************************************************************/
    133 /*
    134  * jsimd_extrgb_ycc_convert_mips_dspr2
    135  * jsimd_extbgr_ycc_convert_mips_dspr2
    136  * jsimd_extrgbx_ycc_convert_mips_dspr2
    137  * jsimd_extbgrx_ycc_convert_mips_dspr2
    138  * jsimd_extxbgr_ycc_convert_mips_dspr2
    139  * jsimd_extxrgb_ycc_convert_mips_dspr2
    140  *
    141  * Colorspace conversion RGB -> YCbCr
    142  */
    143 
    144 .macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
    145 
        /*
         * Emits one function named jsimd_<colorid>_ycc_convert_mips_dspr2.
         *   pixel_size             - bytes per input pixel
         *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside a pixel
         */
        /* Loads the R, G and B bytes of one pixel, then advances inptr by one pixel. */
    146 .macro DO_RGB_TO_YCC r,    \
    147                      g,    \
    148                      b,    \
    149                      inptr
    150     lbu     \r, \r_offs(\inptr)
    151     lbu     \g, \g_offs(\inptr)
    152     lbu     \b, \b_offs(\inptr)
    153     addiu   \inptr, \pixel_size
    154 .endm
    155 
    156 LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
    157 /*
    158  * a0     - cinfo->image_width
    159  * a1     - input_buf
    160  * a2     - output_buf
    161  * a3     - output_row
    162  * 16(sp) - num_rows
    163  */
        /*
         * Per pixel, three accumulators are seeded with a rounding/offset
         * constant and then receive three coefficient * sample products each;
         * the result is accumulator >> 16 and only its low byte is stored.
         *   ac0 -> output_buf[0] (Y), ac1 -> output_buf[1] (Cb),
         *   ac2 -> output_buf[2] (Cr)
         * Both the row loop (label 0) and the pixel loop (label 1) test their
         * condition at the bottom, so each body runs at least once.
         * NOTE(review): callers presumably guarantee num_rows >= 1 and
         * image_width >= 1 - confirm.
         */
    164 
    165     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
    166 
    167     lw      t7, 48(sp)        // t7 = num_rows
    168     li      s0, 0x4c8b        // FIX(0.29900)
    169     li      s1, 0x9646        // FIX(0.58700)
    170     li      s2, 0x1d2f        // FIX(0.11400)
    171     li      s3, 0xffffd4cd    // -FIX(0.16874)
    172     li      s4, 0xffffab33    // -FIX(0.33126)
    173     li      s5, 0x8000        // FIX(0.50000)
    174     li      s6, 0xffff94d1    // -FIX(0.41869)
    175     li      s7, 0xffffeb2f    // -FIX(0.08131)
    176     li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1
    177 
    178 0:
    179     addiu   t7, -1            // --num_rows
    180     lw      t6, 0(a1)         // t6 = input_buf[0]
    181     lw      t0, 0(a2)         // t0 = output_buf[0]
    182     lw      t1, 4(a2)         // t1 = output_buf[1]
    183     lw      t2, 8(a2)         // t2 = output_buf[2]
    184     sll     t3, a3, 2         // t3 = output_row * 4
    185     lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
    186     lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
    187     lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]
    188 
    189     addu    t9, t2, a0        // t9 = end address
    190     addiu   a3, 1             // ++output_row
    191 
    192 1:
    193     DO_RGB_TO_YCC t3, t4, t5, t6   // t3 = r, t4 = g, t5 = b; t6 += pixel_size
    194 
    195     mtlo    s5, $ac0          // ac0.lo = 0x8000 (rounding term for Y)
    196     mtlo    t8, $ac1          // ac1.lo = CBCR_OFFSET + ONE_HALF-1
    197     mtlo    t8, $ac2          // ac2.lo = CBCR_OFFSET + ONE_HALF-1
    198     maddu   $ac0, s2, t5      // ac0 += FIX(0.11400) * b
    199     maddu   $ac1, s5, t5      // ac1 += FIX(0.50000) * b
    200     maddu   $ac2, s5, t3      // ac2 += FIX(0.50000) * r
    201     maddu   $ac0, s0, t3      // ac0 += FIX(0.29900) * r
    202     maddu   $ac1, s3, t3      // ac1 += -FIX(0.16874) * r
    203     maddu   $ac2, s6, t4      // ac2 += -FIX(0.41869) * g
    204     maddu   $ac0, s1, t4      // ac0 += FIX(0.58700) * g
    205     maddu   $ac1, s4, t4      // ac1 += -FIX(0.33126) * g
    206     maddu   $ac2, s7, t5      // ac2 += -FIX(0.08131) * b
    207     extr.w  t3, $ac0, 16      // t3 = ac0 >> 16
    208     extr.w  t4, $ac1, 16      // t4 = ac1 >> 16
    209     extr.w  t5, $ac2, 16      // t5 = ac2 >> 16
    210     sb      t3, 0(t0)         // store low byte to output_buf[0] row
    211     sb      t4, 0(t1)         // store low byte to output_buf[1] row
    212     sb      t5, 0(t2)         // store low byte to output_buf[2] row
    213     addiu   t0, 1
    214     addiu   t2, 1
    215     bne     t2, t9, 1b        // until the output_buf[2] pointer reaches the end
    216      addiu  t1, 1             // (delay slot)
    217     bgtz    t7, 0b            // more rows left
    218      addiu  a1, 4             // (delay slot) ++input_buf
    219 
    220     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
    221 
    222     j ra
    223      nop
    224 END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
    225 
    226 .purgem DO_RGB_TO_YCC
    227 
    228 .endm
    229 
    230 /*------------------------------------------id -- pix R  G  B */
        /* id = colorid part of the function name, pix = pixel_size (bytes per */
        /* input pixel), R/G/B = byte offset of each channel inside a pixel.   */
    231 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
    232 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
    233 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
    234 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
    235 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
    236 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
    237 
    238 /*****************************************************************************/
    239 /*
    240  * jsimd_ycc_extrgb_convert_mips_dspr2
    241  * jsimd_ycc_extbgr_convert_mips_dspr2
    242  * jsimd_ycc_extrgbx_convert_mips_dspr2
    243  * jsimd_ycc_extbgrx_convert_mips_dspr2
    244  * jsimd_ycc_extxbgr_convert_mips_dspr2
    245  * jsimd_ycc_extxrgb_convert_mips_dspr2
    246  *
    247  * Colorspace conversion YCbCr -> RGB
    248  */
    249 
    250 .macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
    251 
        /*
         * Emits one function named jsimd_ycc_<colorid>_convert_mips_dspr2.
         *   pixel_size                     - bytes per output pixel
         *   r_offs, g_offs, b_offs, a_offs - byte offsets inside an output pixel;
         *                                    a_offs is only used when pixel_size is 4
         */
        /*
         * Stores one pixel (scratch0 = R, scratch1 = G, scratch2 = B) and advances
         * outptr by one pixel.  For 4-byte pixels the byte at a_offs is set to 0xFF;
         * that path overwrites t0, after the three colour bytes have been stored.
         */
    252 .macro STORE_YCC_TO_RGB  scratch0 \
    253                          scratch1 \
    254                          scratch2 \
    255                          outptr
    256     sb       \scratch0, \r_offs(\outptr)
    257     sb       \scratch1, \g_offs(\outptr)
    258     sb       \scratch2, \b_offs(\outptr)
    259 .if (\pixel_size == 4)
    260     li       t0, 0xFF
    261     sb       t0, \a_offs(\outptr)
    262 .endif
    263     addiu    \outptr, \pixel_size
    264 .endm
    265 
    266 LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
    267 /*
    268  * a0     - cinfo->image_width
    269  * a1     - input_buf
    270  * a2     - input_row
    271  * a3     - output_buf
    272  * 16(sp) - num_rows
    273  */
        /*
         * Per pixel: the chroma contributions are computed in 16.16 fixed point,
         * added to y, and clamped to 0..255 by subtracting 128, saturating to the
         * signed 8-bit range with a saturating shift left / arithmetic shift
         * right pair, and adding 128 back.  Red and green are clamped together as
         * two packed halfwords in t2; blue is clamped as a word in t0.
         * Both loops test their condition at the bottom, so each body runs at
         * least once.
         */
    274 
    275     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
    276 
    277     lw         s1, 48(sp)      // s1 = num_rows
    278     li         t3, 0x8000      // rounding term added before the >> 16
    279     li         t4, 0x166e9     // FIX(1.40200)
    280     li         t5, 0x1c5a2     // FIX(1.77200)
    281     li         t6, 0xffff492e  // -FIX(0.71414)
    282     li         t7, 0xffffa7e6  // -FIX(0.34414)
    283     repl.ph    t8, 128         // t8 = 128 in both halfwords
    284 
    285 0:
    286     lw         s0, 0(a3)       // s0 = outptr = output_buf[0]
    287     lw         t0, 0(a1)       // t0 = input_buf[0]
    288     lw         t1, 4(a1)       // t1 = input_buf[1]
    289     lw         t2, 8(a1)       // t2 = input_buf[2]
    290     sll        s5, a2, 2       // s5 = input_row * 4
    291     addiu      s1, -1          // --num_rows
    292     lwx        s2, s5(t0)      // s2 = input_buf[0][input_row] (y)
    293     lwx        s3, s5(t1)      // s3 = input_buf[1][input_row] (cb)
    294     lwx        s4, s5(t2)      // s4 = input_buf[2][input_row] (cr)
    295     addu       t9, s2, a0      // t9 = end address of the y row
    296     addiu      a2, 1           // ++input_row
    297 
    298 1:
    299     lbu        s7, 0(s4)       // cr
    300     lbu        s6, 0(s3)       // cb
    301     lbu        s5, 0(s2)       // y
    302     addiu      s2, 1
    303     addiu      s4, 1
    304     addiu      s7, -128        // cr - 128
    305     addiu      s6, -128        // cb - 128
    306     mul        t2, t7, s6      // -FIX(0.34414) * (cb - 128)
    307     mul        t0, t6, s7      // Crgtab[cr]
    308     sll        s7, 15          // pre-shift for the fractional multiply
    309     mulq_rs.w  t1, t4, s7      // Crrtab[cr]
    310     sll        s6, 15          // pre-shift for the fractional multiply
    311     addu       t2, t3          // Cbgtab[cb]
    312     addu       t2, t0          // t2 = green contribution << 16
    313 
    314     mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
    315     sra        t2, 16
    316     addu       t1, s5          // t1 = y + red contribution
    317     addu       t2, s5          // add y
    318     ins        t2, t1, 16, 16  // t2 = red (high halfword) : green (low halfword)
    319     subu.ph    t2, t2, t8      // both halfwords -= 128
    320     addu       t0, s5          // t0 = y + blue contribution
    321     shll_s.ph  t2, t2, 8       // saturating shift left, per halfword
    322     subu       t0, 128
    323     shra.ph    t2, t2, 8       // arithmetic shift back, per halfword
    324     shll_s.w   t0, t0, 24      // saturating shift left (blue)
    325     addu.ph    t2, t2, t8      // clip & store
    326     sra        t0, t0, 24      // arithmetic shift back (blue)
    327     sra        t1, t2, 16      // t1 = red; low byte of t2 = green
    328     addiu      t0, 128         // t0 = blue
    329 
    330     STORE_YCC_TO_RGB t1, t2, t0, s0
    331 
    332     bne        s2, t9, 1b      // until the y pointer reaches the end
    333      addiu     s3, 1           // (delay slot) ++cb pointer
    334     bgtz       s1, 0b          // more rows left
    335      addiu     a3, 4           // (delay slot) ++output_buf
    336 
    337     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
    338 
    339     j ra
    340      nop
    341 END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
    342 
    343 .purgem STORE_YCC_TO_RGB
    344 
    345 .endm
    346 
    347 /*------------------------------------------id -- pix R  G  B  A */
        /* id = colorid part of the function name, pix = pixel_size (bytes per  */
        /* output pixel), R/G/B/A = byte offsets inside a pixel.  The A column  */
        /* is only used by the generated code when pix is 4.                    */
    348 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
    349 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
    350 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
    351 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
    352 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
    353 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
    354 
    355 /*****************************************************************************/
    356 /*
    357  * jsimd_extrgb_gray_convert_mips_dspr2
    358  * jsimd_extbgr_gray_convert_mips_dspr2
    359  * jsimd_extrgbx_gray_convert_mips_dspr2
    360  * jsimd_extbgrx_gray_convert_mips_dspr2
    361  * jsimd_extxbgr_gray_convert_mips_dspr2
    362  * jsimd_extxrgb_gray_convert_mips_dspr2
    363  *
    364  * Colorspace conversion RGB -> GRAY
    365  */
    366 
    367 .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
    368 
        /*
         * Emits one function named jsimd_<colorid>_gray_convert_mips_dspr2.
         *   pixel_size             - bytes per input pixel
         *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside a pixel
         */
        /* Loads the R, G and B bytes of one pixel, then advances inptr by one pixel. */
    369 .macro DO_RGB_TO_GRAY r,    \
    370                       g,    \
    371                       b,    \
    372                       inptr
    373     lbu     \r, \r_offs(\inptr)
    374     lbu     \g, \g_offs(\inptr)
    375     lbu     \b, \b_offs(\inptr)
    376     addiu   \inptr, \pixel_size
    377 .endm
    378 
    379 LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
    380 /*
    381  * a0     - cinfo->image_width
    382  * a1     - input_buf
    383  * a2     - output_buf
    384  * a3     - output_row
    385  * 16(sp) - num_rows
    386  */
        /*
         * Each output byte is the low byte of
         *   (FIX(0.29900)*r + FIX(0.58700)*g + FIX(0.11400)*b + 0x8000) >> 16.
         * Per row, label 1 converts four pixels per iteration (using ac0 and ac1
         * alternately) for the first image_width - (image_width & 3) pixels, and
         * label 3 converts the remaining image_width & 3 pixels one at a time.
         * The row loop tests its condition at the bottom (label 4), so one row
         * is always processed.
         */
    387 
    388     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
    389 
    390     li      s0, 0x4c8b             // s0 = FIX(0.29900)
    391     li      s1, 0x9646             // s1 = FIX(0.58700)
    392     li      s2, 0x1d2f             // s2 = FIX(0.11400)
    393     li      s7, 0x8000             // s7 = FIX(0.50000)
    394     lw      s6, 48(sp)             // s6 = num_rows
    395     andi    t7, a0, 3              // t7 = image_width & 3
    396 
    397 0:
    398     addiu   s6, -1                 // s6 = num_rows
    399     lw      t0, 0(a1)              // t0 = inptr = *input_buf
    400     lw      t1, 0(a2)              // t1 = output_buf[0]
    401     sll     t3, a3, 2              // t3 = output_row * 4
    402     lwx     t1, t3(t1)             // t1 = outptr = output_buf[0][output_row]
    403     addiu   a3, 1                  // ++output_row
    404     addu    t9, t1, a0             // t9 = end of the output row
    405     subu    t8, t9, t7             // t8 = end of the 4-pixel portion
    406     beq     t1, t8, 2f             // fewer than 4 pixels: skip the 4-wide loop
    407      nop
    408 
    409 1:
    410     DO_RGB_TO_GRAY t3, t4, t5, t0  // pixel 0 -> t3, t4, t5
    411     DO_RGB_TO_GRAY s3, s4, s5, t0  // pixel 1 -> s3, s4, s5
    412 
    413     mtlo    s7, $ac0
    414     maddu   $ac0, s2, t5
    415     maddu   $ac0, s1, t4
    416     maddu   $ac0, s0, t3
    417     mtlo    s7, $ac1
    418     maddu   $ac1, s2, s5
    419     maddu   $ac1, s1, s4
    420     maddu   $ac1, s0, s3
    421     extr.w  t6, $ac0, 16           // t6 = gray value of pixel 0
    422 
    423     DO_RGB_TO_GRAY t3, t4, t5, t0  // pixel 2 -> t3, t4, t5
    424     DO_RGB_TO_GRAY s3, s4, s5, t0  // pixel 3 -> s3, s4, s5
    425 
    426     mtlo    s7, $ac0
    427     maddu   $ac0, s2, t5
    428     maddu   $ac0, s1, t4
    429     extr.w  t2, $ac1, 16           // t2 = gray value of pixel 1
    430     maddu   $ac0, s0, t3
    431     mtlo    s7, $ac1
    432     maddu   $ac1, s2, s5
    433     maddu   $ac1, s1, s4
    434     maddu   $ac1, s0, s3
    435     extr.w  t5, $ac0, 16           // t5 = gray value of pixel 2
    436     sb      t6, 0(t1)
    437     sb      t2, 1(t1)
    438     extr.w  t3, $ac1, 16           // t3 = gray value of pixel 3
    439     addiu   t1, 4
    440     sb      t5, -2(t1)
    441     sb      t3, -1(t1)
    442     bne     t1, t8, 1b
    443      nop
    444 
    445 2:
    446     beqz    t7, 4f                 // no leftover pixels in this row
    447      nop
    448 
    449 3:
    450     DO_RGB_TO_GRAY t3, t4, t5, t0
    451 
    452     mtlo    s7, $ac0
    453     maddu   $ac0, s2, t5
    454     maddu   $ac0, s1, t4
    455     maddu   $ac0, s0, t3
    456     extr.w  t6, $ac0, 16
    457     sb      t6, 0(t1)
    458     addiu   t1, 1
    459     bne     t1, t9, 3b             // until the end of the output row
    460      nop
    461 
    462 4:
    463     bgtz    s6, 0b                 // more rows left
    464      addiu  a1, 4                  // (delay slot) ++input_buf
    465 
    466     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
    467 
    468     j ra
    469      nop
    470 END(jsimd_\colorid\()_gray_convert_mips_dspr2)
    471 
    472 .purgem DO_RGB_TO_GRAY
    473 
    474 .endm
    475 
    476 /*------------------------------------------id --  pix R  G  B */
        /* id = colorid part of the function name, pix = pixel_size (bytes per */
        /* input pixel), R/G/B = byte offset of each channel inside a pixel.   */
    477 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
    478 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
    479 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
    480 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
    481 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
    482 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
    483 /*****************************************************************************/
    484 /*
    485  * jsimd_h2v2_merged_upsample_mips_dspr2
    486  * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
    487  * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
    488  * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
    489  * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
    490  * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
    491  * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
    492  *
    493  * Merged h2v2 upsample routines
    494  */
    495 .macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
    496                                                 pixel_size, \
    497                                                 r1_offs,    \
    498                                                 g1_offs,    \
    499                                                 b1_offs,    \
    500                                                 a1_offs,    \
    501                                                 r2_offs,    \
    502                                                 g2_offs,    \
    503                                                 b2_offs,    \
    504                                                 a2_offs
    505 
        /*
         * Emits one function named jsimd_h2v2_<colorid>_merged_upsample_mips_dspr2.
         *   pixel_size            - bytes written per PAIR of output pixels
         *   r1/g1/b1/a1_offs      - byte offsets of the first pixel of the pair
         *   r2/g2/b2/a2_offs      - byte offsets of the second pixel of the pair
         * The a1/a2 offsets are only used when pixel_size is 8.
         */
        /*
         * Stores two pixels (scratch0..2 = first R, G, B; scratch3..5 = second
         * R, G, B) and advances outptr by pixel_size.  When pixel_size is 8,
         * scratch0 is reused to hold 0xFF for the two alpha bytes.
         */
    506 .macro STORE_H2V2_2_PIXELS  scratch0 \
    507                             scratch1 \
    508                             scratch2 \
    509                             scratch3 \
    510                             scratch4 \
    511                             scratch5 \
    512                             outptr
    513     sb       \scratch0, \r1_offs(\outptr)
    514     sb       \scratch1, \g1_offs(\outptr)
    515     sb       \scratch2, \b1_offs(\outptr)
    516     sb       \scratch3, \r2_offs(\outptr)
    517     sb       \scratch4, \g2_offs(\outptr)
    518     sb       \scratch5, \b2_offs(\outptr)
    519 .if (\pixel_size == 8)
    520     li       \scratch0, 0xFF
    521     sb       \scratch0, \a1_offs(\outptr)
    522     sb       \scratch0, \a2_offs(\outptr)
    523 .endif
    524     addiu    \outptr, \pixel_size
    525 .endm
    526 
        /*
         * Stores a single pixel at the first-pixel offsets without advancing
         * outptr.  When pixel_size is 8 it overwrites t0 with 0xFF for the
         * alpha byte.
         */
    527 .macro STORE_H2V2_1_PIXEL  scratch0 \
    528                            scratch1 \
    529                            scratch2 \
    530                            outptr
    531     sb    \scratch0, \r1_offs(\outptr)
    532     sb    \scratch1, \g1_offs(\outptr)
    533     sb    \scratch2, \b1_offs(\outptr)
    534 
    535 .if (\pixel_size == 8)
    536     li    t0, 0xFF
    537     sb    t0, \a1_offs(\outptr)
    538 .endif
    539 .endm
    540 
    541 LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
    542 /*
    543  * a0     - cinfo->output_width
    544  * a1     - input_buf
    545  * a2     - in_row_group_ctr
    546  * a3     - output_buf
    547  * 16(sp) - cinfo->sample_range_limit
    548  */
        /*
         * Each cb/cr sample pair is shared by a 2x2 block of luma samples: two
         * from luma row in_row_group_ctr*2 (written to output_buf[0]) and two
         * from row in_row_group_ctr*2 + 1 (written to output_buf[1]).  The
         * three chroma terms (s4 = cred, s5 = cgreen, s6 = cblue) are computed
         * once per pair; every output byte is then read from the
         * sample_range_limit table at index y + term.
         * Label 1 handles output_width / 2 pixel pairs; the code after label 2
         * handles the last column when output_width is odd.
         * AT and ra are used as scratch registers; ra is saved and restored by
         * the SAVE/RESTORE macros below.
         */
    549 
    550     SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
    551 
    552     lw           t9, 56(sp)        // cinfo->sample_range_limit
    553     lw           v0, 0(a1)         // v0 = input_buf[0]
    554     lw           v1, 4(a1)         // v1 = input_buf[1]
    555     lw           t0, 8(a1)         // t0 = input_buf[2]
    556     sll          t1, a2, 3         // t1 = in_row_group_ctr * 8
    557     addiu        t2, t1, 4         // t2 = in_row_group_ctr * 8 + 4
    558     sll          t3, a2, 2         // t3 = in_row_group_ctr * 4
    559     lw           t4, 0(a3)         // t4 = output_buf[0]
    560     lwx          t1, t1(v0)        // t1 = input_buf[0][in_row_group_ctr*2]
    561     lwx          t2, t2(v0)        // t2 = input_buf[0][in_row_group_ctr*2 + 1]
    562     lwx          t5, t3(v1)        // t5 = input_buf[1][in_row_group_ctr]
    563     lwx          t6, t3(t0)        // t6 = input_buf[2][in_row_group_ctr]
    564     lw           t7, 4(a3)         // t7 = output_buf[1]
    565     li           s1, 0xe6ea
    566     addiu        t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
    567     addiu        s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
    568     addiu        s1, zero, 0xa7e6  // s1 = 0xffffa7e6 [-FIX(0.34414)]
    569     xori         s2, s1, 0xeec8    // s2 = 0xffff492e [-FIX(0.71414)]
    570     srl          t3, a0, 1         // t3 = output_width / 2 (pixel pairs)
    571     blez         t3, 2f            // no complete pair
    572      addu        t0, t5, t3        // t0 = end address
    573  1:
    574     lbu          t3, 0(t5)         // t3 = cb
    575     lbu          s3, 0(t6)         // s3 = cr
    576     addiu        t5, t5, 1
    577     addiu        t3, t3, -128      // (cb - 128)
    578     addiu        s3, s3, -128      // (cr - 128)
    579     mult         $ac1, s1, t3      // ac1  = -FIX(0.34414) * (cb - 128)
    580     madd         $ac1, s2, s3      // ac1 += -FIX(0.71414) * (cr - 128)
    581     sll          s3, s3, 15
    582     sll          t3, t3, 15
    583     mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    584     extr_r.w     s5, $ac1, 16      // s5 = ac1 >> 16 with rounding (cgreen)
    585     mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    586     lbu          v0, 0(t1)         // v0 = y, first sample of luma row 0
    587     addiu        t6, t6, 1
    588     addiu        t1, t1, 2
    589     addu         t3, v0, s4        // y+cred
    590     addu         s3, v0, s5        // y+cgreen
    591     addu         v1, v0, s6        // y+cblue
    592     addu         t3, t9, t3        // y+cred
    593     addu         s3, t9, s3        // y+cgreen
    594     addu         v1, t9, v1        // y+cblue
    595     lbu          AT, 0(t3)         // AT = range_limit[y+cred]
    596     lbu          s7, 0(s3)         // s7 = range_limit[y+cgreen]
    597     lbu          ra, 0(v1)         // ra = range_limit[y+cblue]
    598     lbu          v0, -1(t1)        // v0 = y, second sample of luma row 0
    599     addu         t3, v0, s4        // y+cred
    600     addu         s3, v0, s5        // y+cgreen
    601     addu         v1, v0, s6        // y+cblue
    602     addu         t3, t9, t3        // y+cred
    603     addu         s3, t9, s3        // y+cgreen
    604     addu         v1, t9, v1        // y+cblue
    605     lbu          t3, 0(t3)
    606     lbu          s3, 0(s3)
    607     lbu          v1, 0(v1)
    608     lbu          v0, 0(t2)         // v0 = y, first sample of luma row 1
    609 
    610     STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
    611 
    612     addu         t3, v0, s4        // y+cred
    613     addu         s3, v0, s5        // y+cgreen
    614     addu         v1, v0, s6        // y+cblue
    615     addu         t3, t9, t3        // y+cred
    616     addu         s3, t9, s3        // y+cgreen
    617     addu         v1, t9, v1        // y+cblue
    618     lbu          AT, 0(t3)
    619     lbu          s7, 0(s3)
    620     lbu          ra, 0(v1)
    621     lbu          v0, 1(t2)         // v0 = y, second sample of luma row 1
    622     addiu        t2, t2, 2
    623     addu         t3, v0, s4        // y+cred
    624     addu         s3, v0, s5        // y+cgreen
    625     addu         v1, v0, s6        // y+cblue
    626     addu         t3, t9, t3        // y+cred
    627     addu         s3, t9, s3        // y+cgreen
    628     addu         v1, t9, v1        // y+cblue
    629     lbu          t3, 0(t3)
    630     lbu          s3, 0(s3)
    631     lbu          v1, 0(v1)
    632 
    633     STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
    634 
    635     bne          t0, t5, 1b        // until the cb pointer reaches the end address
    636      nop
    637 2:
    638     andi         t0, a0, 1         // odd output_width?
    639     beqz         t0, 4f
    640      lbu          t3, 0(t5)        // (delay slot) cb is loaded even when the branch is taken
    641     lbu          s3, 0(t6)         // s3 = cr
    642     addiu        t3, t3, -128      // (cb - 128)
    643     addiu        s3, s3, -128      // (cr - 128)
    644     mult         $ac1, s1, t3
    645     madd         $ac1, s2, s3
    646     sll          s3, s3, 15
    647     sll          t3, t3, 15
    648     lbu          v0, 0(t1)         // v0 = y from luma row 0
    649     extr_r.w     s5, $ac1, 16
    650     mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    651     mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    652     addu         t3, v0, s4        // y+cred
    653     addu         s3, v0, s5        // y+cgreen
    654     addu         v1, v0, s6        // y+cblue
    655     addu         t3, t9, t3        // y+cred
    656     addu         s3, t9, s3        // y+cgreen
    657     addu         v1, t9, v1        // y+cblue
    658     lbu          t3, 0(t3)
    659     lbu          s3, 0(s3)
    660     lbu          v1, 0(v1)
    661     lbu          v0, 0(t2)         // v0 = y from luma row 1
    662 
    663     STORE_H2V2_1_PIXEL t3, s3, v1, t4
    664 
    665     addu         t3, v0, s4        // y+cred
    666     addu         s3, v0, s5        // y+cgreen
    667     addu         v1, v0, s6        // y+cblue
    668     addu         t3, t9, t3        // y+cred
    669     addu         s3, t9, s3        // y+cgreen
    670     addu         v1, t9, v1        // y+cblue
    671     lbu          t3, 0(t3)
    672     lbu          s3, 0(s3)
    673     lbu          v1, 0(v1)
    674 
    675     STORE_H2V2_1_PIXEL t3, s3, v1, t7
    676 4:
    677     RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
    678 
    679     j           ra
    680      nop
    681 
    682 END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
    683 
    684 .purgem STORE_H2V2_1_PIXEL
    685 .purgem STORE_H2V2_2_PIXELS
    686 .endm
    687 
    688 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
        /* id = colorid part of the function name, pix = pixel_size (bytes per   */
        /* pair of output pixels), R1..A1 / R2..A2 = byte offsets of the first / */
        /* second pixel of a pair.  The A1 and A2 columns are only used by the   */
        /* generated code when pix is 8.                                         */
    689 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
    690 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
    691 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
    692 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
    693 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
    694 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
    695 /*****************************************************************************/
    696 /*
    697  * jsimd_h2v1_merged_upsample_mips_dspr2
    698  * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
    699  * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
    700  * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
    701  * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
    702  * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
    703  * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
    704  *
    705  * Merged h2v1 upsample routines
    706  */
    707 
    708 .macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
    709                                                 pixel_size, \
    710                                                 r1_offs,    \
    711                                                 g1_offs,    \
    712                                                 b1_offs,    \
    713                                                 a1_offs,    \
    714                                                 r2_offs,    \
    715                                                 g2_offs,    \
    716                                                 b2_offs,    \
    717                                                 a2_offs
    718 
        /*
         * Emits one function named jsimd_h2v1_<colorid>_merged_upsample_mips_dspr2.
         *   pixel_size            - bytes written per PAIR of output pixels
         *   r1/g1/b1/a1_offs      - byte offsets of the first pixel of the pair
         *   r2/g2/b2/a2_offs      - byte offsets of the second pixel of the pair
         * The a1/a2 offsets are only used when pixel_size is 8.
         */
        /*
         * Stores two pixels (scratch0..2 = first R, G, B; scratch3..5 = second
         * R, G, B) and advances outptr by pixel_size.  When pixel_size is 8 it
         * overwrites t0 with 0xFF for the two alpha bytes.
         */
    719 .macro STORE_H2V1_2_PIXELS  scratch0 \
    720                             scratch1 \
    721                             scratch2 \
    722                             scratch3 \
    723                             scratch4 \
    724                             scratch5 \
    725                             outptr
    726     sb       \scratch0, \r1_offs(\outptr)
    727     sb       \scratch1, \g1_offs(\outptr)
    728     sb       \scratch2, \b1_offs(\outptr)
    729     sb       \scratch3, \r2_offs(\outptr)
    730     sb       \scratch4, \g2_offs(\outptr)
    731     sb       \scratch5, \b2_offs(\outptr)
    732 .if (\pixel_size == 8)
    733     li       t0, 0xFF
    734     sb       t0, \a1_offs(\outptr)
    735     sb       t0, \a2_offs(\outptr)
    736 .endif
    737     addiu    \outptr, \pixel_size
    738 .endm
    739 
        /*
         * Stores a single pixel at the first-pixel offsets without advancing
         * outptr.  When pixel_size is 8 it overwrites t0 with 0xFF for the
         * alpha byte.
         */
    740 .macro STORE_H2V1_1_PIXEL  scratch0 \
    741                            scratch1 \
    742                            scratch2 \
    743                            outptr
    744     sb    \scratch0, \r1_offs(\outptr)
    745     sb    \scratch1, \g1_offs(\outptr)
    746     sb    \scratch2, \b1_offs(\outptr)
    747 .if (\pixel_size == 8)
    748     li    t0, 0xFF
    749     sb    t0, \a1_offs(\outptr)
    750 .endif
    751 .endm
    752 
    753 LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
    754 /*
    755  * a0     - cinfo->output_width
    756  * a1     - input_buf
    757  * a2     - in_row_group_ctr
    758  * a3     - output_buf
    759  * 16(sp) - range_limit
    760  */
        /*
         * Each cb/cr sample pair is shared by two horizontally adjacent luma
         * samples.  The three chroma terms (t0 = cred, t5 = cgreen, t6 = cblue)
         * are computed once per pair; every output byte is then read from the
         * range_limit table at index y + term.
         * Label 1 handles output_width / 2 pixel pairs; label 3 handles the
         * last pixel when output_width is odd, computing cgreen with plain
         * multiplies plus the 0x8000 rounding term in s0 instead of the
         * accumulator.
         * v0, v1 and ra are used as scratch registers; ra is saved and
         * restored by the SAVE/RESTORE macros below.
         */
    761 
    762     SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
    763 
    764     li           t0, 0xe6ea
    765     lw           t1, 0(a1)         // t1 = input_buf[0]
    766     lw           t2, 4(a1)         // t2 = input_buf[1]
    767     lw           t3, 8(a1)         // t3 = input_buf[2]
    768     lw           t8, 56(sp)        // t8 = range_limit
    769     addiu        s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
    770     addiu        s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
    771     addiu        s0, t0, 0x9916    // s0 = 0x8000
    772     addiu        s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
    773     xori         s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
    774     srl          t0, a0, 1         // t0 = output_width / 2 (pixel pairs)
    775     sll          t4, a2, 2         // t4 = in_row_group_ctr * 4
    776     lwx          s5, t4(t1)        // s5 = inptr0
    777     lwx          s6, t4(t2)        // s6 = inptr1
    778     lwx          s7, t4(t3)        // s7 = inptr2
    779     lw           t7, 0(a3)         // t7 = outptr
    780     blez         t0, 2f            // no complete pair
    781      addu        t9, s6, t0        // t9 = end address
    782 1:
    783     lbu          t2, 0(s6)         // t2 = cb
    784     lbu          t0, 0(s7)         // t0 = cr
    785     lbu          t1, 0(s5)         // t1 = y
    786     addiu        t2, t2, -128      // t2 = cb - 128
    787     addiu        t0, t0, -128      // t0 = cr - 128
    788     mult         $ac1, s4, t2      // ac1  = -FIX(0.34414) * (cb - 128)
    789     madd         $ac1, s3, t0      // ac1 += -FIX(0.71414) * (cr - 128)
    790     sll          t0, t0, 15
    791     sll          t2, t2, 15
    792     mulq_rs.w    t0, s1, t0        // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
    793     extr_r.w     t5, $ac1, 16      // t5 = ac1 >> 16 with rounding (cgreen)
    794     mulq_rs.w    t6, s2, t2        // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
    795     addiu        s7, s7, 1
    796     addiu        s6, s6, 1
    797     addu         t2, t1, t0        // t2 = y + cred
    798     addu         t3, t1, t5        // t3 = y + cgreen
    799     addu         t4, t1, t6        // t4 = y + cblue
    800     addu         t2, t8, t2
    801     addu         t3, t8, t3
    802     addu         t4, t8, t4
    803     lbu          t1, 1(s5)         // t1 = y of the second pixel
    804     lbu          v0, 0(t2)         // v0 = range_limit[y + cred]
    805     lbu          v1, 0(t3)         // v1 = range_limit[y + cgreen]
    806     lbu          ra, 0(t4)         // ra = range_limit[y + cblue]
    807     addu         t2, t1, t0
    808     addu         t3, t1, t5
    809     addu         t4, t1, t6
    810     addu         t2, t8, t2
    811     addu         t3, t8, t3
    812     addu         t4, t8, t4
    813     lbu          t2, 0(t2)
    814     lbu          t3, 0(t3)
    815     lbu          t4, 0(t4)
    816 
    817     STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
    818 
    819     bne          t9, s6, 1b        // until the cb pointer reaches the end address
    820      addiu       s5, s5, 2         // (delay slot) advance the y pointer by two samples
    821 2:
    822     andi         t0, a0, 1         // odd output_width?
    823     beqz         t0, 4f
    824      nop
    825 3:
    826     lbu          t2, 0(s6)         // t2 = cb
    827     lbu          t0, 0(s7)         // t0 = cr
    828     lbu          t1, 0(s5)         // t1 = y
    829     addiu        t2, t2, -128      //(cb - 128)
    830     addiu        t0, t0, -128      //(cr - 128)
    831     mul          t3, s4, t2
    832     mul          t4, s3, t0
    833     sll          t0, t0, 15
    834     sll          t2, t2, 15
    835     mulq_rs.w    t0, s1, t0       // (C1*cr + ONE_HALF)>> SCALEBITS
    836     mulq_rs.w    t6, s2, t2       // (C2*cb + ONE_HALF)>> SCALEBITS
    837     addu         t3, t3, s0
    838     addu         t3, t4, t3
    839     sra          t5, t3, 16       // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
    840     addu         t2, t1, t0       // y + cred
    841     addu         t3, t1, t5       // y + cgreen
    842     addu         t4, t1, t6       // y + cblue
    843     addu         t2, t8, t2
    844     addu         t3, t8, t3
    845     addu         t4, t8, t4
    846     lbu          t2, 0(t2)
    847     lbu          t3, 0(t3)
    848     lbu          t4, 0(t4)
    849 
    850     STORE_H2V1_1_PIXEL t2, t3, t4, t7
    851 4:
    852     RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
    853 
    854     j            ra
    855      nop
    856 
    857 END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
    858 
    859 .purgem STORE_H2V1_1_PIXEL
    860 .purgem STORE_H2V1_2_PIXELS
    861 .endm
    862 
    863 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
        /*
         * One instantiation per output pixel layout.  The END() line of the macro
         * body pastes the id into the symbol name, so each line below presumably
         * emits jsimd_h2v1_<id>_merged_upsample_mips_dspr2.
         * NOTE(review): going by the legend above, the numeric arguments look like
         * a byte count per two-pixel group followed by the byte offsets of the
         * R/G/B/A channels of the first and second pixel; the macro header is not
         * visible from here -- confirm before editing the tables.
         */
    864 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
    865 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
    866 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
    867 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
    868 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
    869 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
    870 /*****************************************************************************/
    871 /*
    872  * jsimd_h2v2_fancy_upsample_mips_dspr2
    873  *
    874  * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    875  */
    876 LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
    877 /*
    878  * a0     - cinfo->max_v_samp_factor
    879  * a1     - downsampled_width
    880  * a2     - input_data
    881  * a3     - output_data_ptr
         *
         * Triangle-filter upsampling, 2x in both directions.  For every input row
         * two output rows are produced (t9 counts the two passes): the first pass
         * pairs the row with input_data[-1], the second with input_data[+1].
         * A column sum is 3 * inptr0[col] + inptr1[col]; each column then yields
         *   (3 * this + last + 8) >> 4   and   (3 * this + next + 7) >> 4,
         * with the first and last columns using "this" in place of the missing
         * neighbour.  The outer loop runs until max_v_samp_factor output rows
         * have been written.
         *
         * NOTE(review): the "lh" loads in loop 2 treat the lower-addressed byte as
         * the low byte and need 2-byte-aligned row pointers -- i.e. little-endian,
         * aligned rows are assumed; confirm against the caller.
    882  */
    883 
    884     SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
    885 
    886     li             s4, 0           // s4 = byte offset of the current output row pointer
    887     lw             s2, 0(a3)       // s2 = *output_data_ptr
    888 0:
    889     li             t9, 2           // t9 = output rows left for this input row
    890     lw             s1, -4(a2)      // s1 = inptr1 (row above, used by the first pass)
    891 
    892 1:
    893     lw             s0, 0(a2)       // s0 = inptr0
    894     lwx            s3, s4(s2)      // s3 = outptr = output_data[s4 / 4]
    895     addiu          s5, a1, -2      // s5 = downsampled_width - 2
    896     srl            t4, s5, 1
    897     sll            t4, t4, 1       // t4 = (downsampled_width - 2) rounded down to even
    898     lbu            t0, 0(s0)
    899     lbu            t1, 1(s0)
    900     lbu            t2, 0(s1)
    901     lbu            t3, 1(s1)
    902     addiu          s0, 2
    903     addiu          s1, 2
    904     addu           t8, s0, t4      // t8 = end address
    905     andi           s5, s5, 1       // s5 = residual
    906     sll            t4, t0, 1
    907     sll            t6, t1, 1
    908     addu           t0, t0, t4      // t0 = (*inptr0++) * 3
    909     addu           t1, t1, t6      // t1 = (*inptr0++) * 3
    910     addu           t7, t0, t2      // t7 = thiscolsum
    911     addu           t6, t1, t3      // t6 = nextcolsum
    912     sll            t0, t7, 2       // t0 = thiscolsum * 4
    913     subu           t1, t0, t7      // t1 = thiscolsum * 3
    914     shra_r.w       t0, t0, 4       // t0 = (thiscolsum * 4 + 8) >> 4
    915     addiu          t1, 7
    916     addu           t1, t1, t6
    917     srl            t1, t1, 4       // t1 = (thiscolsum * 3 + nextcolsum + 7) >> 4
    918     sb             t0, 0(s3)
    919     sb             t1, 1(s3)
    920     beq            t8, s0, 22f     // no column pairs to process (width is 2 or 3)
    921      addiu          s3, 2
    922 2:                                 // two columns per pass; t7 = last, t6 = this
    923     lh             t0, 0(s0)       // t0 = A3|A2
    924     lh             t2, 0(s1)       // t2 = B3|B2
    925     addiu          s0, 2
    926     addiu          s1, 2
    927     preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
    928     preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
    929     shll.ph        t1, t0, 1
    930     sll            t3, t6, 1
    931     addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
    932     addu           t3, t3, t6      // t3 = this * 3
    933     addu.ph        t0, t0, t2      // t0 = next2|next1
    934     addu           t1, t3, t7
    935     andi           t7, t0, 0xFFFF  // t7 = next1
    936     sll            t2, t7, 1
    937     addu           t2, t7, t2      // t2 = next1*3
    938     addu           t4, t2, t6
    939     srl            t6, t0, 16      // t6 = next2
    940     shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    941     addu           t0, t3, t7
    942     addiu          t0, 7
    943     srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
    944     shra_r.w       t4, t4, 4       // t4 = (next1*3 + this + 8) >> 4
    945     addu           t2, t2, t6
    946     addiu          t2, 7
    947     srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
    948     sb             t1, 0(s3)
    949     sb             t0, 1(s3)
    950     sb             t4, 2(s3)
    951     sb             t2, 3(s3)
    952     bne            t8, s0, 2b
    953      addiu         s3, 4
    954 22:
    955     beqz           s5, 4f          // no odd column left before the last one
    956      addu          t8, s0, s5
    957 3:                                 // one column per pass
    958     lbu            t0, 0(s0)
    959     lbu            t2, 0(s1)
    960     addiu          s0, 1
    961     addiu          s1, 1
    962     sll            t3, t6, 1
    963     sll            t1, t0, 1
    964     addu           t1, t0, t1      // t1 = inptr0 * 3
    965     addu           t3, t3, t6      // t3 = thiscolsum * 3
    966     addu           t5, t1, t2      // t5 = nextcolsum
    967     addu           t1, t3, t7
    968     shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    969     addu           t0, t3, t5
    970     addiu          t0, 7
    971     srl            t0, t0, 4       // t0 = (this*3 + next + 7) >> 4
    972     sb             t1, 0(s3)
    973     sb             t0, 1(s3)
    974     addiu          s3, 2
    975     move           t7, t6          // last = this
    976     bne            t8, s0, 3b
    977      move          t6, t5          // this = next
    978 4:                                 // last column: no right-hand neighbour
    979     sll            t0, t6, 2       // t0 = thiscolsum * 4
    980     subu           t1, t0, t6      // t1 = thiscolsum * 3
    981     addu           t1, t1, t7
    982     addiu          s4, 4           // advance to the next output row pointer
    983     shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    984     addiu          t0, 7
    985     srl            t0, t0, 4       // t0 = (this*4 + 7) >> 4
    986     sb             t1, 0(s3)
    987     sb             t0, 1(s3)
    988     addiu          t9, -1
    989     addiu          s3, 2
    990     bnez           t9, 1b
    991      lw            s1, 4(a2)       // s1 = inptr1 (row below, used by the second pass)
    992     srl            t0, s4, 2       // t0 = output rows written so far
    993     subu           t0, a0, t0
    994     bgtz           t0, 0b
    995      addiu         a2, 4           // next input row
    996 
    997     RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
    998 
    999     j ra
   1000      nop
   1001 END(jsimd_h2v2_fancy_upsample_mips_dspr2)
   1002 
   1003 /*****************************************************************************/
   1004 LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
   1005 /*
   1006  * a0     - cinfo->max_v_samp_factor
   1007  * a1     - downsampled_width
   1008  * a2     - input_data
   1009  * a3     - output_data_ptr
         *
         * Triangle-filter upsampling, 2x horizontally.  Every input sample P
         * yields two output samples:
         *   left  = (3 * P + previous + 1) >> 2
         *   right = (3 * P + next     + 2) >> 2
         * The first column stores P itself as its left sample and the last column
         * stores P itself as its right sample.  a0 rows are processed.
         *
         * NOTE(review): loop 1 packs its results into words assuming the
         * lower-addressed byte is the least significant one (little-endian) --
         * confirm for the target.
   1010  */
   1011 
   1012     SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
   1013 
   1014     .set at
   1015 
   1016     beqz           a0, 3f
   1017      sll           t0, a0, 2
   1018     lw             s1, 0(a3)   // s1 = *output_data_ptr (cursor over row pointers)
   1019     li             s3, 0x10001 // s3 = rounding constant 1 in each halfword
   1020     addu           s0, s1, t0  // s0 = end of the output row pointers
   1021 0:
   1022     addiu          t8, a1, -2  // t8 = columns between the first and the last
   1023     srl            t9, t8, 2   // t9 = number of 4-column groups
   1024     lw             t7, 0(a2)   // t7 = inptr
   1025     lw             s2, 0(s1)   // s2 = outptr
   1026     lbu            t0, 0(t7)
   1027     lbu            t1, 1(t7)   // t1 = inptr[1]
   1028     sll            t2, t0, 1
   1029     addu           t2, t2, t0  // t2 = invalue*3
   1030     addu           t2, t2, t1
   1031     shra_r.w       t2, t2, 2   // t2 = (invalue*3 + inptr[1] + 2) >> 2
   1032     sb             t0, 0(s2)
   1033     sb             t2, 1(s2)
   1034     beqz           t9, 11f
   1035      addiu         s2, 2
   1036 1:                             // handles columns P1..P4 relative to t7
   1037     ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
   1038     ulw            t1, 1(t7)   // t1 = |P4|P3|P2|P1|
   1039     ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
   1040     preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
   1041     preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
   1042     preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
   1043     preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
   1044     preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
   1045     shll.ph        t5, t4, 1
   1046     shll.ph        t6, t1, 1
   1047     addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
   1048     addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
   1049     addu.ph        t4, t3, s3
   1050     addu.ph        t0, t0, s3
   1051     addu.ph        t4, t4, t5
   1052     addu.ph        t0, t0, t6
   1053     shrl.ph        t4, t4, 2   // t4 = left samples of |P4|P3|
   1054     shrl.ph        t0, t0, 2   // t0 = left samples of |P2|P1|
   1055     addu.ph        t2, t2, t5
   1056     addu.ph        t3, t3, t6
   1057     shra_r.ph      t2, t2, 2   // t2 = right samples of |P4|P3|
   1058     shra_r.ph      t3, t3, 2   // t3 = right samples of |P2|P1|
   1059     shll.ph        t2, t2, 8
   1060     shll.ph        t3, t3, 8
   1061     or             t2, t4, t2  // interleave left/right for P3, P4
   1062     or             t3, t3, t0  // interleave left/right for P1, P2
   1063     addiu          t9, -1
   1064     usw            t3, 0(s2)
   1065     usw            t2, 4(s2)
   1066     addiu          s2, 8
   1067     bgtz           t9, 1b
   1068      addiu         t7, 4
   1069 11:
   1070     andi           t8, 3       // t8 = columns left over after the 4-column groups
   1071     beqz           t8, 22f
   1072      addiu         t7, 1       // step to the first column not yet written
   1073 
   1074 2:
   1075     lbu            t0, 0(t7)
   1076     addiu          t7, 1
   1077     sll            t1, t0, 1
   1078     addu           t2, t0, t1  // t2 = invalue * 3
   1079     lbu            t3, -2(t7)  // t3 = previous column
   1080     lbu            t4, 0(t7)   // t4 = next column
   1081     addiu          t3, 1
   1082     addiu          t4, 2
   1083     addu           t3, t3, t2
   1084     addu           t4, t4, t2
   1085     srl            t3, 2       // t3 = (invalue*3 + previous + 1) >> 2
   1086     srl            t4, 2       // t4 = (invalue*3 + next + 2) >> 2
   1087     sb             t3, 0(s2)
   1088     sb             t4, 1(s2)
   1089     addiu          t8, -1
   1090     bgtz           t8, 2b
   1091      addiu         s2, 2
   1092 
   1093 22:                            // last column
   1094     lbu            t0, 0(t7)
   1095     lbu            t2, -1(t7)
   1096     sll            t1, t0, 1
   1097     addu           t1, t1, t0 // t1 = invalue * 3
   1098     addu           t1, t1, t2
   1099     addiu          t1, 1
   1100     srl            t1, t1, 2  // t1 = (invalue*3 + previous + 1) >> 2
   1101     sb             t1, 0(s2)
   1102     sb             t0, 1(s2)
   1103     addiu          s1, 4      // next output row pointer
   1104     bne            s1, s0, 0b
   1105      addiu         a2, 4      // next input row pointer
   1106 3:
   1107     RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
   1108 
   1109     j              ra
   1110      nop
   1111 END(jsimd_h2v1_fancy_upsample_mips_dspr2)
   1112 
   1113 /*****************************************************************************/
   1114 LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
   1115 /*
   1116  * a0     - cinfo->image_width
   1117  * a1     - cinfo->max_v_samp_factor
   1118  * a2     - compptr->v_samp_factor
   1119  * a3     - compptr->width_in_blocks
   1120  * 16(sp) - input_data
   1121  * 20(sp) - output_data
         *
         * 2:1 horizontal downsampling.  Every output sample is the average of two
         * horizontally adjacent input samples, rounded with a bias that alternates
         * 0, 1, 0, 1, ... along the row.  The rest of each output row, up to
         * width_in_blocks * 8 samples, is filled from the last input sample of
         * the row.  a2 rows are processed.
         *
         * s2 (number of 2-byte pad stores per row) is row-invariant.  It is copied
         * into t5 before being counted down so that every row is padded, not only
         * the first one.  The 4x unrolled loop is skipped when fewer than four
         * averaged samples exist, because it is bottom-tested and would otherwise
         * consume and emit one full group regardless.
   1122  */
   1123     .set at
   1124 
   1125     SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
   1126 
   1127     beqz        a2, 7f
   1128      lw         s1, 44(sp)  // s1 = output_data
   1129     lw          s0, 40(sp)  // s0 = input_data
   1130     srl         s2, a0, 2
   1131     andi        t9, a0, 2   // t9 != 0: one extra pad byte is needed per row
   1132     srl         t7, t9, 1
   1133     addu        s2, t7, s2
   1134     sll         t0, a3, 3   // t0 = width_in_blocks*DCT
   1135     srl         t7, t0, 1
   1136     subu        s2, t7, s2  // s2 = 2-byte pad stores per row
   1137 0:
   1138     andi        t6, a0, 1   // t6 = temp_index
   1139     addiu       t6, -1      // t6 = offset of the last input sample from inptr
   1140     lw          t4, 0(s1)   // t4 = outptr
   1141     lw          t5, 0(s0)   // t5 = inptr0
   1142     li          s3, 0       // s3 = bias
   1143     srl         t7, a0, 1   // t7 = image_width1
   1144     srl         s4, t7, 2   // s4 = groups of 4 output samples
            beqz        s4, 11f     // fewer than 4 outputs: skip the unrolled loop
   1145      andi       t8, t7, 3   // t8 = output samples left for loop 2
   1146 1:
   1147     ulhu        t0, 0(t5)
   1148     ulhu        t1, 2(t5)
   1149     ulhu        t2, 4(t5)
   1150     ulhu        t3, 6(t5)
   1151     raddu.w.qb  t0, t0      // sum of the two samples of each pair
   1152     raddu.w.qb  t1, t1
   1153     raddu.w.qb  t2, t2
   1154     raddu.w.qb  t3, t3
   1155     shra.ph     t0, t0, 1   // bias 0
   1156     shra_r.ph   t1, t1, 1   // bias 1
   1157     shra.ph     t2, t2, 1   // bias 0
   1158     shra_r.ph   t3, t3, 1   // bias 1
   1159     sb          t0, 0(t4)
   1160     sb          t1, 1(t4)
   1161     sb          t2, 2(t4)
   1162     sb          t3, 3(t4)
   1163     addiu       s4, -1
   1164     addiu       t4, 4
   1165     bgtz        s4, 1b
   1166      addiu      t5, 8
        11:
   1167     beqz        t8, 3f
   1168      addu       s4, t4, t8  // s4 = outptr end address for loop 2
   1169 2:
   1170     ulhu        t0, 0(t5)
   1171     raddu.w.qb  t0, t0
   1172     addqh.w     t0, t0, s3  // (sum + bias) >> 1
   1173     xori        s3, s3, 1
   1174     sb          t0, 0(t4)
   1175     addiu       t4, 1
   1176     bne         t4, s4, 2b
   1177      addiu      t5, 2
   1178 3:
   1179     lbux        t1, t6(t5)  // t1 = last input sample of the row
   1180     sll         t1, 1
   1181     addqh.w     t2, t1, s3  // t2 = pixval1
   1182     xori        s3, s3, 1
   1183     addqh.w     t3, t1, s3  // t3 = pixval2
   1184     blez        s2, 5f
   1185      append     t3, t2,  8  // t3 = pixval2 << 8 | pixval1
   1186     move        t5, s2      // t5 = pad stores left in this row (s2 stays intact)
   1187 4:
   1188     ush         t3, 0(t4)
   1189     addiu       t5, -1
   1190     bgtz        t5, 4b
   1191      addiu      t4,  2
   1192 5:
   1193     beqz        t9, 6f
   1194      nop
   1195     sb          t2, 0(t4)
   1196 6:
   1197     addiu       s1, 4
   1198     addiu       a2, -1
   1199     bnez        a2, 0b
   1200      addiu      s0, 4
   1201 7:
   1202     RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
   1203 
   1204     j           ra
   1205     nop
   1206 END(jsimd_h2v1_downsample_mips_dspr2)
   1207 
   1208 /*****************************************************************************/
   1209 LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
   1210 
   1211 /*
   1212  * a0     - cinfo->image_width
   1213  * a1     - cinfo->max_v_samp_factor
   1214  * a2     - compptr->v_samp_factor
   1215  * a3     - compptr->width_in_blocks
   1216  * 16(sp) - input_data
   1217  * 20(sp) - output_data
         *
         * 2:1 horizontal and 2:1 vertical downsampling.  Every output sample is
         * the average of a 2x2 block of input samples, rounded with a bias that
         * alternates 1, 2, 1, 2, ... along the row.  The rest of each output row,
         * up to width_in_blocks * 8 samples, is filled from the last input column.
         * a2 output rows are produced, consuming two input rows each.
         *
         * The per-row counts are kept in registers the row loop never modifies
         * (s3 = groups of 4 outputs, s5 = leftover outputs, s2 = 2-byte pad
         * stores) and are copied into working registers at the start of each row,
         * so rows after the first are processed with the same counts.  The 4x
         * unrolled loop is skipped when s3 is zero because it is bottom-tested.
   1218  */
   1219     .set at
   1220     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   1221 
   1222     beqz         a2, 8f
   1223      lw          s1, 52(sp)      // s1 = output_data
   1224     lw           s0, 48(sp)      // s0 = input_data
   1225 
   1226     andi         t6, a0, 1       // t6 = temp_index
   1227     addiu        t6, -1          // t6 = offset of the last input column from inptr
   1228     srl          t7, a0, 1       // t7 = image_width1
   1229     srl          s3, t7, 2       // s3 = groups of 4 output samples per row
   1230     andi         s5, t7, 3       // s5 = leftover output samples per row
   1231     andi         t9, a0, 2       // t9 != 0: one extra pad byte is needed per row
   1232     srl          s2, a0, 2
   1233     srl          t7, t9, 1
   1234     addu         s2, t7, s2
   1235     sll          t0, a3, 3       // t0 = width_in_blocks*DCT
   1236     srl          t7, t0, 1
   1237     subu         s2, t7, s2      // s2 = 2-byte pad stores per row
   1238 0:
   1239     lw           t4, 0(s1)       // t4 = outptr
   1240     lw           t5, 0(s0)       // t5 = inptr0
   1241     lw           s7, 4(s0)       // s7 = inptr1
   1242     li           s6, 1           // s6 = bias
            move         s4, s3          // s4 = groups left in this row
            beqz         s4, 22f         // fewer than 4 outputs: skip the unrolled loop
             move        t8, s5          // t8 = leftover outputs in this row
   1243 2:
   1244     ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
   1245     ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
   1246     ulw          t2, 4(t5)
   1247     ulw          t3, 4(s7)
   1248     precrq.ph.w  t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
   1249     ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
   1250     raddu.w.qb   t1, t7
   1251     raddu.w.qb   t0, t0
   1252     shra_r.w     t1, t1, 2       // bias 2
   1253     addiu        t0, 1           // bias 1
   1254     srl          t0, 2
   1255     precrq.ph.w  t7, t2, t3
   1256     ins          t2, t3, 16, 16
   1257     raddu.w.qb   t7, t7
   1258     raddu.w.qb   t2, t2
   1259     shra_r.w     t7, t7, 2       // bias 2
   1260     addiu        t2, 1           // bias 1
   1261     srl          t2, 2
   1262     sb           t0, 0(t4)
   1263     sb           t1, 1(t4)
   1264     sb           t2, 2(t4)
   1265     sb           t7, 3(t4)
   1266     addiu        t4, 4
   1267     addiu        t5, 8
   1268     addiu        s4, s4, -1
   1269     bgtz         s4, 2b
   1270      addiu       s7, 8
        22:
   1271     beqz         t8, 4f
   1272      addu        t8, t4, t8      // t8 = outptr end address for loop 3
   1273 3:
   1274     ulhu         t0, 0(t5)
   1275     ulhu         t1, 0(s7)
   1276     ins          t0, t1, 16, 16
   1277     raddu.w.qb   t0, t0
   1278     addu         t0, t0, s6
   1279     srl          t0, 2
   1280     xori         s6, s6, 3       // toggle the bias between 1 and 2
   1281     sb           t0, 0(t4)
   1282     addiu        t5, 2
   1283     addiu        t4, 1
   1284     bne          t8, t4, 3b
   1285      addiu       s7, 2
   1286 4:
   1287     lbux         t1, t6(t5)      // last column of inptr0
   1288     sll          t1, 1
   1289     lbux         t0, t6(s7)      // last column of inptr1
   1290     sll          t0, 1
   1291     addu         t1, t1, t0
   1292     addu         t3, t1, s6
   1293     srl          t0, t3, 2       // t0 = pixval1
   1294     xori         s6, s6, 3
   1295     addu         t2, t1, s6
   1296     srl          t1, t2, 2       // t1 = pixval2
   1297     blez         s2, 6f
   1298      append      t1, t0, 8       // t1 = pixval2 << 8 | pixval1
            move         t7, s2          // t7 = pad stores left in this row (s2 stays intact)
   1299 5:
   1300     ush          t1, 0(t4)
   1301     addiu        t7, -1
   1302     bgtz         t7, 5b
   1303      addiu       t4, 2
   1304 6:
   1305     beqz         t9, 7f
   1306      nop
   1307     sb           t0, 0(t4)
   1308 7:
   1309     addiu        s1, 4
   1310     addiu        a2, -1
   1311     bnez         a2, 0b
   1312      addiu       s0, 8
   1313 8:
   1314     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   1315 
   1316     j            ra
   1317      nop
   1318 END(jsimd_h2v2_downsample_mips_dspr2)
   1319 /*****************************************************************************/
   1320 LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
   1321 /*
   1322  * a0     - input_data
   1323  * a1     - output_data
   1324  * a2     - compptr->v_samp_factor
   1325  * a3     - cinfo->max_v_samp_factor
   1326  * 16(sp) - cinfo->smoothing_factor
   1327  * 20(sp) - compptr->width_in_blocks
   1328  * 24(sp) - cinfo->image_width
         *
         * 2:1 x 2:1 downsampling with smoothing.  First, rows -1 through
         * max_v_samp_factor of the input are extended on the right to
         * 2 * output_cols samples by replicating their last sample.  Each output
         * sample is then
         *   (membersum * t6 + neighsum * t7 + 32768) >> 16
         * where membersum is the sum of the 2x2 block, neighsum is the weighted
         * sum of the surrounding samples (edge-adjacent ones counted twice, the
         * four corners once), t6 = 16384 - smoothing_factor * 80 and
         * t7 = smoothing_factor * 16.
         *
         * The input row index (t5) advances by exactly 2 per output row; the
         * second increment that used to sit in the final branch delay slot made
         * rows after the first read from input_data[4 * outrow].
   1329  */
   1330 
   1331     .set at
   1332 
   1333     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   1334 
   1335     lw          s7, 52(sp)      // compptr->width_in_blocks
   1336     lw          s0, 56(sp)      // cinfo->image_width
   1337     lw          s6, 48(sp)      // cinfo->smoothing_factor
   1338     sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
   1339     sll         v0, s7, 1
   1340     subu        v0, v0, s0      // v0 = columns to add on the right edge
   1341     blez        v0, 2f
   1342     move        v1, zero        // (delay slot) v1 = row counter
   1343     addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
   1344 0:
   1345     addiu       t1, a0, -4
   1346     sll         t2, v1, 2
   1347     lwx         t1, t2(t1)      // t1 = input_data[v1 - 1]
   1348     move        t3, v0
   1349     addu        t1, t1, s0      // t1 = first sample past image_width
   1350     lbu         t2, -1(t1)      // t2 = last real sample of the row
   1351 1:
   1352     addiu       t3, t3, -1
   1353     sb          t2, 0(t1)
   1354     bgtz        t3, 1b
   1355     addiu       t1, t1, 1       // (delay slot)
   1356     addiu       v1, v1, 1
   1357     bne         v1, t0, 0b
   1358     nop
   1359 2:
   1360     li          v0, 80
   1361     mul         v0, s6, v0
   1362     li          v1, 16384
   1363     move        t4, zero        // t4 = outrow
   1364     move        t5, zero        // t5 = inrow
   1365     subu        t6, v1, v0      // t6 = 16384 - smoothing_factor * 80
   1366     sll         t7, s6, 4       // t7 = smoothing_factor * 16
   1367 3:
   1368 /* Special case for first column: pretend column -1 is same as column 0 */
   1369     sll         v0, t4, 2
   1370     lwx         t8, v0(a1)      //  outptr = output_data[outrow]
   1371     sll         v1, t5, 2
   1372     addiu       t9, v1, 4
   1373     addiu       s0, v1, -4
   1374     addiu       s1, v1, 8
   1375     lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
   1376     lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
   1377     lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
   1378     lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
   1379     lh          v0, 0(s2)
   1380     lh          v1, 0(t9)
   1381     lh          t0, 0(s0)
   1382     lh          t1, 0(s1)
   1383     ins         v0, v1, 16, 16
   1384     ins         t0, t1, 16, 16
   1385     raddu.w.qb  t2, v0          // t2 = membersum
   1386     raddu.w.qb  s3, t0          // s3 = above[0..1] + below[0..1]
   1387     lbu         v0, 0(s2)
   1388     lbu         v1, 2(s2)
   1389     lbu         t0, 0(t9)
   1390     lbu         t1, 2(t9)
   1391     addu        v0, v0, v1
   1392     mult        $ac1,t2, t6
   1393     addu        t0, t0, t1
   1394     lbu         t2, 2(s0)
   1395     addu        t0, t0, v0
   1396     lbu         t3, 2(s1)
   1397     addu        s3, t0, s3
   1398     lbu         v0, 0(s0)
   1399     lbu         t0, 0(s1)
   1400     sll         s3, s3, 1       // edge-adjacent neighbours count twice
   1401     addu        v0, v0, t2
   1402     addu        t0, t0, t3
   1403     addu        t0, t0, v0
   1404     addu        s3, t0, s3      // s3 = neighsum
   1405     madd        $ac1,s3, t7
   1406     extr_r.w    v0, $ac1, 16
   1407     addiu       t8, t8, 1
   1408     addiu       s2, s2, 2
   1409     addiu       t9, t9, 2
   1410     addiu       s0, s0, 2
   1411     addiu       s1, s1, 2
   1412     sb          v0, -1(t8)
   1413     addiu       s4, s7, -2
   1414     and         s4, s4, 3       // s4 = columns handled one at a time by loop 4
   1415     addu        s5, s4, t8      // end address
   1416 4:
   1417     lh          v0, 0(s2)
   1418     lh          v1, 0(t9)
   1419     lh          t0, 0(s0)
   1420     lh          t1, 0(s1)
   1421     ins         v0, v1, 16, 16
   1422     ins         t0, t1, 16, 16
   1423     raddu.w.qb  t2, v0
   1424     raddu.w.qb  s3, t0
   1425     lbu         v0, -1(s2)
   1426     lbu         v1, 2(s2)
   1427     lbu         t0, -1(t9)
   1428     lbu         t1, 2(t9)
   1429     addu        v0, v0, v1
   1430     mult        $ac1, t2, t6
   1431     addu        t0, t0, t1
   1432     lbu         t2, 2(s0)
   1433     addu        t0, t0, v0
   1434     lbu         t3, 2(s1)
   1435     addu        s3, t0, s3
   1436     lbu         v0, -1(s0)
   1437     lbu         t0, -1(s1)
   1438     sll         s3, s3, 1
   1439     addu        v0, v0, t2
   1440     addu        t0, t0, t3
   1441     addu        t0, t0, v0
   1442     addu        s3, t0, s3
   1443     madd        $ac1, s3, t7
   1444     extr_r.w    t2, $ac1, 16
   1445     addiu       t8, t8, 1
   1446     addiu       s2, s2, 2
   1447     addiu       t9, t9, 2
   1448     addiu       s0, s0, 2
   1449     sb          t2, -1(t8)
   1450     bne         s5, t8, 4b
   1451     addiu       s1, s1, 2       // (delay slot)
   1452     addiu       s5, s7, -2
   1453     subu        s5, s5, s4
   1454     addu        s5, s5, t8      // end address
   1455 5:                              // four output columns per pass
   1456     lh          v0, 0(s2)
   1457     lh          v1, 0(t9)
   1458     lh          t0, 0(s0)
   1459     lh          t1, 0(s1)
   1460     ins         v0, v1, 16, 16
   1461     ins         t0, t1, 16, 16
   1462     raddu.w.qb  t2, v0
   1463     raddu.w.qb  s3, t0
   1464     lbu         v0, -1(s2)
   1465     lbu         v1, 2(s2)
   1466     lbu         t0, -1(t9)
   1467     lbu         t1, 2(t9)
   1468     addu        v0, v0, v1
   1469     mult        $ac1, t2, t6
   1470     addu        t0, t0, t1
   1471     lbu         t2, 2(s0)
   1472     addu        t0, t0, v0
   1473     lbu         t3, 2(s1)
   1474     addu        s3, t0, s3
   1475     lbu         v0, -1(s0)
   1476     lbu         t0, -1(s1)
   1477     sll         s3, s3, 1
   1478     addu        v0, v0, t2
   1479     addu        t0, t0, t3
   1480     lh          v1, 2(t9)
   1481     addu        t0, t0, v0
   1482     lh          v0, 2(s2)
   1483     addu        s3, t0, s3
   1484     lh          t0, 2(s0)
   1485     lh          t1, 2(s1)
   1486     madd        $ac1, s3, t7
   1487     extr_r.w    t2, $ac1, 16
   1488     ins         t0, t1, 16, 16
   1489     ins         v0, v1, 16, 16
   1490     raddu.w.qb  s3, t0
   1491     lbu         v1, 4(s2)
   1492     lbu         t0, 1(t9)
   1493     lbu         t1, 4(t9)
   1494     sb          t2, 0(t8)
   1495     raddu.w.qb  t3, v0
   1496     lbu         v0, 1(s2)
   1497     addu        t0, t0, t1
   1498     mult        $ac1, t3, t6
   1499     addu        v0, v0, v1
   1500     lbu         t2, 4(s0)
   1501     addu        t0, t0, v0
   1502     lbu         v0, 1(s0)
   1503     addu        s3, t0, s3
   1504     lbu         t0, 1(s1)
   1505     lbu         t3, 4(s1)
   1506     addu        v0, v0, t2
   1507     sll         s3, s3, 1
   1508     addu        t0, t0, t3
   1509     lh          v1, 4(t9)
   1510     addu        t0, t0, v0
   1511     lh          v0, 4(s2)
   1512     addu        s3, t0, s3
   1513     lh          t0, 4(s0)
   1514     lh          t1, 4(s1)
   1515     madd        $ac1, s3, t7
   1516     extr_r.w    t2, $ac1, 16
   1517     ins         t0, t1, 16, 16
   1518     ins         v0, v1, 16, 16
   1519     raddu.w.qb  s3, t0
   1520     lbu         v1, 6(s2)
   1521     lbu         t0, 3(t9)
   1522     lbu         t1, 6(t9)
   1523     sb          t2, 1(t8)
   1524     raddu.w.qb  t3, v0
   1525     lbu         v0, 3(s2)
   1526     addu        t0, t0,t1
   1527     mult        $ac1, t3, t6
   1528     addu        v0, v0, v1
   1529     lbu         t2, 6(s0)
   1530     addu        t0, t0, v0
   1531     lbu         v0, 3(s0)
   1532     addu        s3, t0, s3
   1533     lbu         t0, 3(s1)
   1534     lbu         t3, 6(s1)
   1535     addu        v0, v0, t2
   1536     sll         s3, s3, 1
   1537     addu        t0, t0, t3
   1538     lh          v1, 6(t9)
   1539     addu        t0, t0, v0
   1540     lh          v0, 6(s2)
   1541     addu        s3, t0, s3
   1542     lh          t0, 6(s0)
   1543     lh          t1, 6(s1)
   1544     madd        $ac1, s3, t7
   1545     extr_r.w    t3, $ac1, 16
   1546     ins         t0, t1, 16, 16
   1547     ins         v0, v1, 16, 16
   1548     raddu.w.qb  s3, t0
   1549     lbu         v1, 8(s2)
   1550     lbu         t0, 5(t9)
   1551     lbu         t1, 8(t9)
   1552     sb          t3, 2(t8)
   1553     raddu.w.qb  t2, v0
   1554     lbu         v0, 5(s2)
   1555     addu        t0, t0, t1
   1556     mult        $ac1, t2, t6
   1557     addu        v0, v0, v1
   1558     lbu         t2, 8(s0)
   1559     addu        t0, t0, v0
   1560     lbu         v0, 5(s0)
   1561     addu        s3, t0, s3
   1562     lbu         t0, 5(s1)
   1563     lbu         t3, 8(s1)
   1564     addu        v0, v0, t2
   1565     sll         s3, s3, 1
   1566     addu        t0, t0, t3
   1567     addiu       t8, t8, 4
   1568     addu        t0, t0, v0
   1569     addiu       s2, s2, 8
   1570     addu        s3, t0, s3
   1571     addiu       t9, t9, 8
   1572     madd        $ac1, s3, t7
   1573     extr_r.w    t1, $ac1, 16
   1574     addiu       s0, s0, 8
   1575     addiu       s1, s1, 8
   1576     bne         s5, t8, 5b
   1577     sb          t1, -1(t8)      // (delay slot) fourth sample of the group
   1578 /* Special case for last column */
   1579     lh          v0, 0(s2)
   1580     lh          v1, 0(t9)
   1581     lh          t0, 0(s0)
   1582     lh          t1, 0(s1)
   1583     ins         v0, v1, 16, 16
   1584     ins         t0, t1, 16, 16
   1585     raddu.w.qb  t2, v0
   1586     raddu.w.qb  s3, t0
   1587     lbu         v0, -1(s2)
   1588     lbu         v1, 1(s2)
   1589     lbu         t0, -1(t9)
   1590     lbu         t1, 1(t9)
   1591     addu        v0, v0, v1
   1592     mult        $ac1, t2, t6
   1593     addu        t0, t0, t1
   1594     lbu         t2, 1(s0)
   1595     addu        t0, t0, v0
   1596     lbu         t3, 1(s1)
   1597     addu        s3, t0, s3
   1598     lbu         v0, -1(s0)
   1599     lbu         t0, -1(s1)
   1600     sll         s3, s3, 1
   1601     addu        v0, v0, t2
   1602     addu        t0, t0, t3
   1603     addu        t0, t0, v0
   1604     addu        s3, t0, s3
   1605     madd        $ac1, s3, t7
   1606     extr_r.w    t0, $ac1, 16
   1607     addiu       t5, t5, 2       // inrow += 2
   1608     sb          t0, 0(t8)
   1609     addiu       t4, t4, 1       // outrow++
   1610     bne         t4, a2, 3b
   1611     nop                         // (delay slot) inrow must advance only once per row
   1612 
   1613     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   1614 
   1615     j           ra
   1616      nop
   1617 
   1618 END(jsimd_h2v2_smooth_downsample_mips_dspr2)
   1619 
   1620 /*****************************************************************************/
   1621 LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
   1622 /*
   1623  * a0     - upsample->h_expand[compptr->component_index]
   1624  * a1     - upsample->v_expand[compptr->component_index]
   1625  * a2     - input_data
   1626  * a3     - output_data_ptr
   1627  * 16(sp) - cinfo->output_width
   1628  * 20(sp) - cinfo->max_v_samp_factor
         *
         * Integer-factor box upsampling.  For every input row, each sample is
         * stored h_expand times into output row "outrow"; that row is then
         * copied into the following v_expand - 1 output rows (each copy reads the
         * row written just before it).  outrow advances by v_expand until it
         * equals max_v_samp_factor.
         *
         * The row-pointer arrays are indexed in units of 4 bytes (t6 is a byte
         * offset, outrow is scaled before use) and the output_data base in s0 is
         * never modified; v0 is the cursor used while duplicating rows.
   1629  */
   1630     .set at
   1631 
   1632     SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
   1633 
   1634     lw      s0, 0(a3)    // s0 = output_data
   1635     lw      s1, 32(sp)   // s1 = cinfo->output_width
   1636     lw      s2, 36(sp)   // s2 = cinfo->max_v_samp_factor
   1637     li      t6, 0        // t6 = inrow * 4 (byte offset into input_data)
   1638     beqz    s2, 10f
   1639      li     s3, 0        // s3 = outrow
   1640 0:
            sll     v0, s3, 2    // v0 = outrow * 4
   1641     addu    t0, a2, t6   // t0 = &input_data[inrow]
   1642     addu    v0, s0, v0   // v0 = &output_data[outrow]
   1643     lw      t3, 0(t0)    // t3 = inptr
   1644     lw      t8, 0(v0)    // t8 = outptr
   1645     beqz    s1, 4f
   1646      addu   t5, t8, s1   // t5 = outend
   1647 1:
   1648     lb      t2, 0(t3)    // t2 = invalue = *inptr++
   1649     addiu   t3, 1
   1650     beqz    a0, 3f
   1651      move   t0, a0       // t0 = h_expand
   1652 2:
   1653     sb      t2, 0(t8)
   1654     addiu   t0, -1
   1655     bgtz    t0, 2b
   1656      addiu  t8, 1
   1657 3:
   1658     bgt     t5, t8, 1b
   1659      nop
   1660 4:
   1661     addiu   t9, a1, -1   // t9 = v_expand - 1
   1662     blez    t9, 9f
   1663      nop
   1664 5:
   1665     lw      t3, 0(v0)    // t3 = source row (written last)
   1666     lw      t4, 4(v0)    // t4 = destination row (the next one)
   1667     subu    t0, s1, 0xF
   1668     blez    t0, 7f       // rows shorter than 16 bytes are copied bytewise
   1669      addu   t5, t3, s1   // t5 = end address
   1670     andi    t7, s1, 0xF  // t7 = residual
   1671     subu    t8, t5, t7   // t8 = end of the 16-byte chunks
   1672 6:
   1673     ulw     t0, 0(t3)
   1674     ulw     t1, 4(t3)
   1675     ulw     t2, 8(t3)
   1676     usw     t0, 0(t4)
   1677     ulw     t0, 12(t3)
   1678     usw     t1, 4(t4)
   1679     usw     t2, 8(t4)
   1680     usw     t0, 12(t4)
   1681     addiu   t3, 16
   1682     bne     t3, t8, 6b
   1683      addiu  t4, 16
   1684     beqz    t7, 8f
   1685      nop
   1686 7:
   1687     lbu     t0, 0(t3)
   1688     sb      t0, 0(t4)
   1689     addiu   t3, 1
   1690     bne     t3, t5, 7b
   1691      addiu  t4, 1
   1692 8:
   1693     addiu   t9, -1
   1694     bgtz    t9, 5b
   1695      addiu  v0, 4        // next pair of adjacent output rows
   1696 9:
   1697     addu    s3, s3, a1   // outrow += v_expand
   1698     bne     s3, s2, 0b
   1699      addiu  t6, 4        // inrow++ (as a byte offset)
   1700 10:
   1701     RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
   1702 
   1703     j       ra
   1704      nop
   1705 END(jsimd_int_upsample_mips_dspr2)
   1706 
   1707 /*****************************************************************************/
   1708 LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
   1709 /*
   1710  * a0     - cinfo->max_v_samp_factor
   1711  * a1     - cinfo->output_width
   1712  * a2     - input_data
   1713  * a3     - output_data_ptr
         *
         * Horizontal 2:1 upsampling: every input sample is written twice to the
         * output row.  One row is handled per outer pass (label 0).  a0 rows are
         * processed in total; the output row pointers come from the array at *a3
         * and the input row pointers from the array at a2 (4 bytes per pointer).
         *
         * Per row, the unrolled loop (label 1) expands 8 input bytes into 16
         * output bytes per pass and covers output_width rounded down to a
         * multiple of 16.  The tail loop (label 2) then writes two output bytes
         * per input byte; its counter starts at output_width & 0xf, drops by 2
         * per pass and is tested with bgtz after the stores, so the tail writes
         * at least one pair whenever it is entered and an odd residual is
         * rounded up to the next even byte count.
         *
         * Registers written: t0-t9 and a2.  Nothing is saved on the stack.
         * An instruction indented one extra column after a branch sits in that
         * branch's delay slot.
   1714  */
   1715     lw      t7, 0(a3)       // t7 = output_data
   1716     andi    t8, a1, 0xf     // t8 = residual
   1717     sll     t0, a0, 2       // t0 = row count * 4 (bytes of row pointers)
   1718     blez    a0, 4f          // row count <= 0: nothing to do
   1719      addu   t9, t7, t0      // t9 = output_data end address
   1720 0:
   1721     lw      t5, 0(t7)       // t5 = outptr
   1722     lw      t6, 0(a2)       // t6 = inptr
   1723     addu    t3, t5, a1      // t3 = outptr + output_width (end address)
   1724     subu    t3, t8          // t3 = end address - residual
   1725     beq     t5, t3, 2f      // no full 16-byte group: tail loop only
   1726      move   t4, t8          // (delay slot) t4 = tail byte counter
   1727 1:
   1728     ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
   1729     ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
   1730     srl     t1, t0, 16      // t1 = |X|X|P3|P2|
   1731     ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
   1732     ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
   1733     ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
   1734     ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
   1735     usw     t0, 0(t5)       // outptr[0..3]
   1736     usw     t1, 4(t5)       // outptr[4..7]
   1737     srl     t0, t2, 16      // t0 = |X|X|P7|P6|
   1738     ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
   1739     ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
   1740     ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
   1741     ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
   1742     usw     t2, 8(t5)       // outptr[8..11]
   1743     usw     t0, 12(t5)      // outptr[12..15]
   1744     addiu   t5, 16          // outptr += 16
   1745     bne     t5, t3, 1b      // loop until the 16-byte-group end address
   1746      addiu  t6, 8           // (delay slot) inptr += 8
   1747     beqz    t8, 3f          // no residual: this row is finished
   1748      move   t4, t8          // (delay slot) t4 = tail byte counter
   1749 2:
   1750     lbu     t1, 0(t6)       // t1 = *inptr
   1751     sb      t1, 0(t5)       // write the sample twice
   1752     sb      t1, 1(t5)
   1753     addiu   t4, -2          // two output bytes accounted for
   1754     addiu   t6, 1           // inptr++
   1755     bgtz    t4, 2b          // continue while the counter is still positive
   1756      addiu  t5, 2           // (delay slot) outptr += 2
   1757 3:
   1758     addiu   t7, 4           // next output row pointer
   1759     bne     t9, t7, 0b      // loop until all a0 rows are done
   1760      addiu  a2, 4           // (delay slot) next input row pointer
   1761 4:
   1762     j       ra
   1763      nop
   1764 END(jsimd_h2v1_upsample_mips_dspr2)
   1765 
   1766 /*****************************************************************************/
   1767 LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
   1768 /*
   1769  * a0     - cinfo->max_v_samp_factor
   1770  * a1     - cinfo->output_width
   1771  * a2     - input_data
   1772  * a3     - output_data_ptr
         *
         * 2:1 horizontal and 2:1 vertical upsampling.  Each outer pass (label 0)
         * consumes one input row and produces two output rows:
         *
         *   labels 1-2: the input row is expanded horizontally (every sample
         *               written twice) into the output row at 0(t7);
         *   labels 3-5: output_width bytes of that row are copied into the
         *               output row at 4(t7).
         *
         * Afterwards t7 advances by 8 (two row pointers), a2 by 4 (one row
         * pointer) and a0 drops by 2; the loop repeats while a0 > 0.
         *
         * Both the expansion and the copy use a 16-bytes-per-pass unrolled loop
         * for output_width rounded down to a multiple of 16, followed by a byte
         * loop for the residual (output_width & 0xf).  The expansion tail
         * writes bytes in pairs, so an odd residual is rounded up there; the
         * row copy stops exactly at outptr[0] + output_width.
         *
         * Registers written: t0-t9, a0 and a2.  Nothing is saved on the stack.
         * An instruction indented one extra column after a branch sits in that
         * branch's delay slot.
   1773  */
   1774     lw      t7, 0(a3)       // t7 = output_data
   1775     blez    a0, 7f          // row count <= 0: nothing to do
   1776      andi   t9, a1, 0xf     // t9 = residual
   1777 0:
   1778     lw      t6, 0(a2)       // t6 = inptr
   1779     lw      t5, 0(t7)       // t5 = outptr
   1780     addu    t8, t5, a1      // t8 = outptr end address
   1781     subu    t8, t9          // t8 = end address - residual
   1782     beq     t5, t8, 2f      // no full 16-byte group: tail loop only
   1783      move   t4, t9          // (delay slot) t4 = tail byte counter
   1784 1:
   1785     ulw     t0, 0(t6)       // t0 = input bytes 0..3
   1786     srl     t1, t0, 16      // t1 low half = input bytes 2..3
   1787     ins     t0, t0, 16, 16  // duplicate the low halfword into the high half
   1788     ins     t0, t0, 8, 16   // t0 = bytes 0,0,1,1 (each sample doubled)
   1789     ins     t1, t1, 16, 16
   1790     ins     t1, t1, 8, 16   // t1 = bytes 2,2,3,3
   1791     ulw     t2, 4(t6)       // t2 = input bytes 4..7
   1792     usw     t0, 0(t5)       // outptr[0..3]
   1793     usw     t1, 4(t5)       // outptr[4..7]
   1794     srl     t3, t2, 16      // t3 low half = input bytes 6..7
   1795     ins     t2, t2, 16, 16
   1796     ins     t2, t2, 8, 16   // t2 = bytes 4,4,5,5
   1797     ins     t3, t3, 16, 16
   1798     ins     t3, t3, 8, 16   // t3 = bytes 6,6,7,7
   1799     usw     t2, 8(t5)       // outptr[8..11]
   1800     usw     t3, 12(t5)      // outptr[12..15]
   1801     addiu   t5, 16          // outptr += 16
   1802     bne     t5, t8, 1b      // loop until the 16-byte-group end address
   1803      addiu  t6, 8           // (delay slot) inptr += 8
   1804     beqz    t9, 3f          // no residual: expansion finished
   1805      move   t4, t9          // (delay slot) t4 = tail byte counter
   1806 2:
   1807     lbu     t0, 0(t6)       // t0 = *inptr
   1808     sb      t0, 0(t5)       // write the sample twice
   1809     sb      t0, 1(t5)
   1810     addiu   t4, -2          // two output bytes accounted for
   1811     addiu   t6, 1           // inptr++
   1812     bgtz    t4, 2b          // continue while the counter is still positive
   1813      addiu  t5, 2           // (delay slot) outptr += 2
   1814 3:
   1815     lw      t6, 0(t7)       // t6 = outptr[0]
   1816     lw      t5, 4(t7)       // t5 = outptr[1]
   1817     addu    t4, t6, a1      // t4 = new end address
   1818     beq     a1, t9, 5f      // width == residual (no 16-byte group): byte copy
   1819      subu   t8, t4, t9      // (delay slot) t8 = end address - residual
   1820 4:
   1821     ulw     t0, 0(t6)       // copy 16 bytes from outptr[0] to outptr[1]
   1822     ulw     t1, 4(t6)
   1823     ulw     t2, 8(t6)
   1824     usw     t0, 0(t5)
   1825     ulw     t0, 12(t6)      // t0 is reused once its first word is stored
   1826     usw     t1, 4(t5)
   1827     usw     t2, 8(t5)
   1828     usw     t0, 12(t5)
   1829     addiu   t6, 16          // source += 16
   1830     bne     t6, t8, 4b      // loop until the 16-byte-group end address
   1831      addiu  t5, 16          // (delay slot) destination += 16
   1832     beqz    t9, 6f          // no residual: row copy finished
   1833      nop
   1834 5:
   1835     lbu     t0, 0(t6)       // copy the residual one byte at a time
   1836     sb      t0, 0(t5)
   1837     addiu   t6, 1           // source++
   1838     bne     t6, t4, 5b      // loop until source reaches the end address
   1839      addiu  t5, 1           // (delay slot) destination++
   1840 6:
   1841     addiu   t7, 8           // skip the two output row pointers just filled
   1842     addiu   a0, -2          // two output rows done
   1843     bgtz    a0, 0b          // loop while rows remain
   1844      addiu  a2, 4           // (delay slot) next input row pointer
   1845 7:
   1846     j       ra
   1847      nop
   1848 END(jsimd_h2v2_upsample_mips_dspr2)
   1849 
   1850 /*****************************************************************************/
   1851 LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
   1852 /*
   1853  * a0     - coef_block
   1854  * a1     - compptr->dcttable
   1855  * a2     - output
   1856  * a3     - range_limit
         *
         * Two-pass 8x8 inverse DCT.
         *
         * Pass 1 (labels 1-3) walks the eight columns of coef_block.  16-bit
         * coefficients and 16-bit dcttable entries are read at a 16-byte row
         * stride, multiplied together, run through the column butterfly and
         * stored as 32-bit words (shifted right by 11 with rounding) into a
         * 256-byte workspace reserved on the stack, at a 32-byte row stride.
         * A column whose rows 1-7 are all zero takes a shortcut:
         * (coef[0] * quant[0]) << 2 is stored to all eight workspace rows.
         *
         * Pass 2 (labels 4-6) walks the eight workspace rows.  Results are
         * shifted right by 18 with rounding, masked to 10 bits and used as byte
         * indexes into range_limit; the eight looked-up bytes are written
         * through the row pointer read from 0(a2), and a2 advances by 4 per
         * row.  A row whose elements 1-7 are all zero takes a shortcut: one
         * lookup of wsptr[0] (shifted right by 5 with rounding) is replicated
         * to all eight output bytes.
         *
         * s0-s7 are saved on entry and restored on exit; a0, a1 and a2 are
         * advanced in place.  An instruction indented one extra column after a
         * branch sits in that branch's delay slot.
   1857  */
   1858
   1859     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   1860
   1861     addiu     sp, sp, -256     // reserve the workspace (64 x 32-bit words)
   1862     move      v0, sp           // v0 = wsptr (workspace write pointer)
   1863     addiu     v1, zero, 8      // v1 = DCTSIZE = 8
   1864 1:
   1865     lh        s4, 32(a0)       // s4 = inptr[16]
   1866     lh        s5, 64(a0)       // s5 = inptr[32]
   1867     lh        s6, 96(a0)       // s6 = inptr[48]
   1868     lh        t1, 112(a0)      // t1 = inptr[56]
   1869     lh        t7, 16(a0)       // t7 = inptr[8]
   1870     lh        t5, 80(a0)       // t5 = inptr[40]
   1871     lh        t3, 48(a0)       // t3 = inptr[24]
   1872     or        s4, s4, t1       // s4 accumulates the OR of rows 1-7
   1873     or        s4, s4, t3
   1874     or        s4, s4, t5
   1875     or        s4, s4, t7
   1876     or        s4, s4, s5
   1877     or        s4, s4, s6
   1878     bnez      s4, 2f           // some AC term is non-zero: full column IDCT
   1879      addiu    v1, v1, -1       // (delay slot) one column fewer to go
   1880     lh        s5, 0(a1)        // quantptr[DCTSIZE*0]
   1881     lh        s6, 0(a0)        // inptr[DCTSIZE*0]
   1882     mul       s5, s5, s6       // DEQUANTIZE(inptr[0], quantptr[0])
   1883     sll       s5, s5, 2        // dcval = product << 2
   1884     sw        s5, 0(v0)        // dcval goes to all eight rows of the column
   1885     sw        s5, 32(v0)
   1886     sw        s5, 64(v0)
   1887     sw        s5, 96(v0)
   1888     sw        s5, 128(v0)
   1889     sw        s5, 160(v0)
   1890     sw        s5, 192(v0)
   1891     b         3f
   1892      sw       s5, 224(v0)      // (delay slot) last of the eight stores
   1893 2:
   1894     lh        t0, 112(a1)      // quantptr[DCTSIZE*7]
   1895     lh        t2, 48(a1)       // quantptr[DCTSIZE*3]
   1896     lh        t4, 80(a1)       // quantptr[DCTSIZE*5]
   1897     lh        t6, 16(a1)       // quantptr[DCTSIZE*1]
   1898     mul       t0, t0, t1       // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
   1899     mul       t1, t2, t3       // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
   1900     mul       t2, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
   1901     mul       t3, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
   1902     lh        t4, 32(a1)       // quantptr[DCTSIZE*2]
   1903     lh        t5, 32(a0)       // inptr[DCTSIZE*2]
   1904     lh        t6, 96(a1)       // quantptr[DCTSIZE*6]
   1905     lh        t7, 96(a0)       // inptr[DCTSIZE*6]
   1906     addu      s0, t0, t1       // z3 = tmp0 + tmp2
   1907     addu      s1, t1, t2       // z2 = tmp1 + tmp2
   1908     addu      s2, t2, t3       // z4 = tmp1 + tmp3
   1909     addu      s3, s0, s2       // z3 + z4
   1910     addiu     t9, zero, 9633   // FIX_1_175875602
   1911     mul       s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
   1912     addu      t8, t0, t3       // z1 = tmp0 + tmp3
   1913     addiu     t9, zero, 2446   // FIX_0_298631336
   1914     mul       t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
   1915     addiu     t9, zero, 16819  // FIX_2_053119869
   1916     mul       t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
   1917     addiu     t9, zero, 25172  // FIX_3_072711026
   1918     mul       t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
   1919     addiu     t9, zero, 12299  // FIX_1_501321110
   1920     mul       t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
   1921     addiu     t9, zero, 16069  // FIX_1_961570560
   1922     mul       s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
   1923     addiu     t9, zero, 3196   // FIX_0_390180644
   1924     mul       s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
   1925     addiu     t9, zero, 7373   // FIX_0_899976223
   1926     mul       t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
   1927     addiu     t9, zero, 20995  // FIX_2_562915447
   1928     mul       s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
   1929     subu      s0, s3, s0       // z3 += z5
   1930     addu      t0, t0, s0       // tmp0 += z3
   1931     addu      t1, t1, s0       // tmp2 += z3
   1932     subu      s2, s3, s2       // z4 += z5
   1933     addu      t2, t2, s2       // tmp1 += z4
   1934     addu      t3, t3, s2       // tmp3 += z4
   1935     subu      t0, t0, t8       // tmp0 += z1
   1936     subu      t1, t1, s1       // tmp2 += z2
   1937     subu      t2, t2, s1       // tmp1 += z2
   1938     subu      t3, t3, t8       // tmp3 += z1
   1939     mul       s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
   1940     addiu     t9, zero, 6270   // FIX_0_765366865
   1941     mul       s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
   1942     lh        t4, 0(a1)        // quantptr[DCTSIZE*0]
   1943     lh        t5, 0(a0)        // inptr[DCTSIZE*0]
   1944     lh        t6, 64(a1)       // quantptr[DCTSIZE*4]
   1945     lh        t7, 64(a0)       // inptr[DCTSIZE*4]
   1946     mul       s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
   1947     mul       t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
   1948     mul       t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
   1949     addiu     t9, zero, 4433   // FIX_0_541196100
   1950     addu      s3, s0, s1       // z2 + z3
   1951     mul       s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
   1952     addiu     t9, zero, 15137  // FIX_1_847759065
   1953     mul       t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
   1954     addu      t4, t5, t6       // row 0 + row 4
   1955     subu      t5, t5, t6       // row 0 - row 4
   1956     sll       t4, t4, 13       // tmp0 = (z2 + z3) << CONST_BITS
   1957     sll       t5, t5, 13       // tmp1 = (z2 - z3) << CONST_BITS
   1958     addu      t7, s3, s2       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
   1959     subu      t6, s3, t8       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
   1960     addu      s0, t4, t7       // tmp10 = tmp0 + tmp3
   1961     subu      s1, t4, t7       // tmp13 = tmp0 - tmp3
   1962     addu      s2, t5, t6       // tmp11 = tmp1 + tmp2
   1963     subu      s3, t5, t6       // tmp12 = tmp1 - tmp2
   1964     addu      t4, s0, t3       // tmp10 + odd tmp3  -> row 0
   1965     subu      s0, s0, t3       // tmp10 - odd tmp3  -> row 7
   1966     addu      t3, s2, t1       // tmp11 + odd tmp2  -> row 1
   1967     subu      s2, s2, t1       // tmp11 - odd tmp2  -> row 6
   1968     addu      t1, s3, t2       // tmp12 + odd tmp1  -> row 2
   1969     subu      s3, s3, t2       // tmp12 - odd tmp1  -> row 5
   1970     addu      t2, s1, t0       // tmp13 + odd tmp0  -> row 3
   1971     subu      s1, s1, t0       // tmp13 - odd tmp0  -> row 4
   1972     shra_r.w  t4, t4, 11       // rounding right shift by 11 on all 8 results
   1973     shra_r.w  t3, t3, 11
   1974     shra_r.w  t1, t1, 11
   1975     shra_r.w  t2, t2, 11
   1976     shra_r.w  s1, s1, 11
   1977     shra_r.w  s3, s3, 11
   1978     shra_r.w  s2, s2, 11
   1979     shra_r.w  s0, s0, 11
   1980     sw        t4, 0(v0)        // workspace row 0
   1981     sw        t3, 32(v0)       // workspace row 1
   1982     sw        t1, 64(v0)       // workspace row 2
   1983     sw        t2, 96(v0)       // workspace row 3
   1984     sw        s1, 128(v0)      // workspace row 4
   1985     sw        s3, 160(v0)      // workspace row 5
   1986     sw        s2, 192(v0)      // workspace row 6
   1987     sw        s0, 224(v0)      // workspace row 7
   1988 3:
   1989     addiu     a1, a1, 2        // next quantization-table column
   1990     addiu     a0, a0, 2        // next coefficient column
   1991     bgtz      v1, 1b           // loop until all 8 columns are done
   1992      addiu    v0, v0, 4        // (delay slot) next workspace column
   1993     move      v0, sp           // rewind wsptr to the start of the workspace
   1994     addiu     v1, zero, 8      // v1 = row counter for pass 2
   1995 4:
   1996     lw        t0, 8(v0)        // z2 = (JLONG) wsptr[2]
   1997     lw        t1, 24(v0)       // z3 = (JLONG) wsptr[6]
   1998     lw        t2, 0(v0)        // (JLONG) wsptr[0]
   1999     lw        t3, 16(v0)       // (JLONG) wsptr[4]
   2000     lw        s4, 4(v0)        // (JLONG) wsptr[1]
   2001     lw        s5, 12(v0)       // (JLONG) wsptr[3]
   2002     lw        s6, 20(v0)       // (JLONG) wsptr[5]
   2003     lw        s7, 28(v0)       // (JLONG) wsptr[7]
   2004     or        s4, s4, t0       // s4 accumulates the OR of wsptr[1..7]
   2005     or        s4, s4, t1
   2006     or        s4, s4, t3
   2007     or        s4, s4, s7
   2008     or        s4, s4, s5
   2009     or        s4, s4, s6
   2010     bnez      s4, 5f           // some AC term is non-zero: full row IDCT
   2011      addiu    v1, v1, -1       // (delay slot) one row fewer to go
   2012     shra_r.w  s5, t2, 5        // s5 = wsptr[0] >> 5, rounded
   2013     andi      s5, s5, 0x3ff    // keep 10 bits as the table index
   2014     lbux      s5, s5(a3)       // s5 = range_limit[index] (unsigned byte)
   2015     lw        s1, 0(a2)        // s1 = output row pointer
   2016     replv.qb  s5, s5           // replicate the byte into all four byte lanes
   2017     usw       s5, 0(s1)        // output bytes 0..3
   2018     usw       s5, 4(s1)        // output bytes 4..7
   2019     b         6f
   2020      nop
   2021 5:
   2022     addu      t4, t0, t1       // z2 + z3
   2023     addiu     t8, zero, 4433   // FIX_0_541196100
   2024     mul       t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
   2025     addiu     t8, zero, 15137  // FIX_1_847759065
   2026     mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
   2027     addiu     t8, zero, 6270   // FIX_0_765366865
   2028     mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
   2029     addu      t4, t2, t3       // (JLONG) wsptr[0] + (JLONG) wsptr[4]
   2030     subu      t2, t2, t3       // (JLONG) wsptr[0] - (JLONG) wsptr[4]
   2031     sll       t4, t4, 13       // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
   2032     sll       t2, t2, 13       // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
   2033     subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
   2034     subu      t3, t2, t1       // tmp12 = tmp1 - tmp2
   2035     addu      t2, t2, t1       // tmp11 = tmp1 + tmp2
   2036     addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
   2037     subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
   2038     addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
   2039     lw        t4, 28(v0)       // tmp0 = (JLONG) wsptr[7]
   2040     lw        t6, 12(v0)       // tmp2 = (JLONG) wsptr[3]
   2041     lw        t5, 20(v0)       // tmp1 = (JLONG) wsptr[5]
   2042     lw        t7, 4(v0)        // tmp3 = (JLONG) wsptr[1]
   2043     addu      s0, t4, t6       // z3 = tmp0 + tmp2
   2044     addiu     t8, zero, 9633   // FIX_1_175875602
   2045     addu      s1, t5, t7       // z4 = tmp1 + tmp3
   2046     addu      s2, s0, s1       // z3 + z4
   2047     mul       s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
   2048     addu      s3, t4, t7       // z1 = tmp0 + tmp3
   2049     addu      t9, t5, t6       // z2 = tmp1 + tmp2
   2050     addiu     t8, zero, 16069  // FIX_1_961570560
   2051     mul       s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
   2052     addiu     t8, zero, 3196   // FIX_0_390180644
   2053     mul       s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
   2054     addiu     t8, zero, 2446   // FIX_0_298631336
   2055     mul       t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
   2056     addiu     t8, zero, 7373   // FIX_0_899976223
   2057     mul       s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
   2058     addiu     t8, zero, 16819  // FIX_2_053119869
   2059     mul       t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
   2060     addiu     t8, zero, 20995  // FIX_2_562915447
   2061     mul       t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
   2062     addiu     t8, zero, 25172  // FIX_3_072711026
   2063     mul       t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
   2064     addiu     t8, zero, 12299  // FIX_1_501321110
   2065     mul       t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
   2066     subu      s0, s2, s0       // z3 += z5
   2067     subu      s1, s2, s1       // z4 += z5
   2068     addu      t4, t4, s0       // tmp0 += z3
   2069     subu      t4, t4, s3       // tmp0
   2070     addu      t5, t5, s1       // tmp1 += z4
   2071     subu      t5, t5, t9       // tmp1
   2072     addu      t6, t6, s0       // tmp2 += z3
   2073     subu      t6, t6, t9       // tmp2
   2074     addu      t7, t7, s1       // tmp3 += z4
   2075     subu      t7, t7, s3       // tmp3
   2076     addu      s0, t0, t7       // tmp10 + tmp3  -> output byte 0
   2077     subu      t0, t0, t7       // tmp10 - tmp3  -> output byte 7
   2078     addu      t7, t2, t6       // tmp11 + tmp2  -> output byte 1
   2079     subu      t2, t2, t6       // tmp11 - tmp2  -> output byte 6
   2080     addu      t6, t3, t5       // tmp12 + tmp1  -> output byte 2
   2081     subu      t3, t3, t5       // tmp12 - tmp1  -> output byte 5
   2082     addu      t5, t1, t4       // tmp13 + tmp0  -> output byte 3
   2083     subu      t1, t1, t4       // tmp13 - tmp0  -> output byte 4
   2084     shra_r.w  s0, s0, 18       // rounding right shift by 18 on all 8 results
   2085     shra_r.w  t7, t7, 18
   2086     shra_r.w  t6, t6, 18
   2087     shra_r.w  t5, t5, 18
   2088     shra_r.w  t1, t1, 18
   2089     shra_r.w  t3, t3, 18
   2090     shra_r.w  t2, t2, 18
   2091     shra_r.w  t0, t0, 18
   2092     andi      s0, s0, 0x3ff    // keep 10 bits of each result as a table index
   2093     andi      t7, t7, 0x3ff
   2094     andi      t6, t6, 0x3ff
   2095     andi      t5, t5, 0x3ff
   2096     andi      t1, t1, 0x3ff
   2097     andi      t3, t3, 0x3ff
   2098     andi      t2, t2, 0x3ff
   2099     andi      t0, t0, 0x3ff
   2100     lw        s1, 0(a2)        // s1 = output row pointer
   2101     lbux      s0, s0(a3)       // replace each index by range_limit[index]
   2102     lbux      t7, t7(a3)
   2103     lbux      t6, t6(a3)
   2104     lbux      t5, t5(a3)
   2105     lbux      t1, t1(a3)
   2106     lbux      t3, t3(a3)
   2107     lbux      t2, t2(a3)
   2108     lbux      t0, t0(a3)
   2109     sb        s0, 0(s1)        // write the eight output bytes of this row
   2110     sb        t7, 1(s1)
   2111     sb        t6, 2(s1)
   2112     sb        t5, 3(s1)
   2113     sb        t1, 4(s1)
   2114     sb        t3, 5(s1)
   2115     sb        t2, 6(s1)
   2116     sb        t0, 7(s1)
   2117 6:
   2118     addiu     v0, v0, 32       // next workspace row
   2119     bgtz      v1, 4b           // loop until all 8 rows are done
   2120      addiu    a2, a2, 4        // (delay slot) next output row pointer
   2121     addiu     sp, sp, 256      // release the workspace
   2122
   2123     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   2124
   2125     j         ra
   2126      nop
   2127
   2128 END(jsimd_idct_islow_mips_dspr2)
   2129 
   2130 /*****************************************************************************/
   2131 LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
   2132 /*
   2133  * a0     - inptr
   2134  * a1     - quantptr
   2135  * a2     - wsptr
   2136  * a3     - mips_idct_ifast_coefs
         *
         * Column pass of the "ifast" inverse DCT.  Each loop pass (label 0)
         * handles two adjacent columns at once: every lw fetches a pair of
         * 16-bit values and the arithmetic uses paired-halfword (.ph)
         * instructions.  a0, a1 and a2 advance by 4 bytes per pass and the loop
         * stops when a0 reaches its initial value + 16, i.e. after four passes.
         * Rows are 16 bytes apart in inptr, quantptr and wsptr.
         *
         * Dequantization: muleq_s.w.phl / muleq_s.w.phr multiply the upper /
         * lower halfwords of a coefficient pair by the matching quantptr
         * halfwords; ins then packs the low 16 bits of the upper product above
         * the low 16 bits of the lower product.
         *
         * Shortcut: when rows 1-7 of both columns are zero, the dequantized
         * row-0 pair is stored to all eight wsptr rows.
         *
         * Multiplier constants are read through AT (a copy of a3) at byte
         * offsets 0, 4, 8 and 12.  Using AT presumably requires ".set noat"
         * from the LEAF_MIPS_DSPR2 macro, which is not visible here - TODO
         * confirm.
         *
         * s0-s7 are saved on entry and restored on exit.  An instruction
         * indented one extra column after a branch sits in that branch's delay
         * slot.
   2137  */
   2138
   2139     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   2140
   2141     addiu          t9, a0, 16            // end address
   2142     or             AT, a3, zero          // AT = a3 (constant table base)
   2143
   2144 0:
   2145     lw             s0, 0(a1)             // quantptr[DCTSIZE*0]
   2146     lw             t0, 0(a0)             // inptr[DCTSIZE*0]
   2147     lw             t1, 16(a0)            // inptr[DCTSIZE*1]
   2148     muleq_s.w.phl  v0, t0, s0            // tmp0 ...
   2149     lw             t2, 32(a0)            // inptr[DCTSIZE*2]
   2150     lw             t3, 48(a0)            // inptr[DCTSIZE*3]
   2151     lw             t4, 64(a0)            // inptr[DCTSIZE*4]
   2152     lw             t5, 80(a0)            // inptr[DCTSIZE*5]
   2153     muleq_s.w.phr  t0, t0, s0            // ... tmp0 ...
   2154     lw             t6, 96(a0)            // inptr[DCTSIZE*6]
   2155     lw             t7, 112(a0)           // inptr[DCTSIZE*7]
   2156     or             s4, t1, t2            // s4 = rows 1 | 2 (both columns)
   2157     or             s5, t3, t4            // s5 = rows 3 | 4 (both columns)
   2158     bnez           s4, 1f                // non-zero AC term: full IDCT
   2159      ins           t0, v0, 16, 16        // ... tmp0
   2160     bnez           s5, 1f                // non-zero AC term: full IDCT
   2161      or            s6, t5, t6            // (delay slot) s6 = rows 5 | 6
   2162     or             s6, s6, t7            // s6 = rows 5 | 6 | 7
   2163     bnez           s6, 1f                // non-zero AC term: full IDCT
   2164      sw            t0, 0(a2)             // wsptr[DCTSIZE*0]
                                                // (label 1 rewrites 0(a2), so this
                                                // delay-slot store is harmless
                                                // when the branch is taken)
   2165     sw             t0, 16(a2)            // wsptr[DCTSIZE*1]
   2166     sw             t0, 32(a2)            // wsptr[DCTSIZE*2]
   2167     sw             t0, 48(a2)            // wsptr[DCTSIZE*3]
   2168     sw             t0, 64(a2)            // wsptr[DCTSIZE*4]
   2169     sw             t0, 80(a2)            // wsptr[DCTSIZE*5]
   2170     sw             t0, 96(a2)            // wsptr[DCTSIZE*6]
   2171     sw             t0, 112(a2)           // wsptr[DCTSIZE*7]
   2172     addiu          a0, a0, 4             // next pair of input columns
   2173     b              2f
   2174      addiu         a1, a1, 4             // (delay slot) next quantptr pair
   2175
   2176 1:
   2177     lw             s1, 32(a1)            // quantptr[DCTSIZE*2]
   2178     lw             s2, 64(a1)            // quantptr[DCTSIZE*4]
   2179     muleq_s.w.phl  v0, t2, s1            // tmp1 ...
   2180     muleq_s.w.phr  t2, t2, s1            // ... tmp1 ...
   2181     lw             s0, 16(a1)            // quantptr[DCTSIZE*1]
   2182     lw             s1, 48(a1)            // quantptr[DCTSIZE*3]
   2183     lw             s3, 96(a1)            // quantptr[DCTSIZE*6]
   2184     muleq_s.w.phl  v1, t4, s2            // tmp2 ...
   2185     muleq_s.w.phr  t4, t4, s2            // ... tmp2 ...
   2186     lw             s2, 80(a1)            // quantptr[DCTSIZE*5]
   2187     lw             t8, 4(AT)             // FIX(1.414213562)
   2188     ins            t2, v0, 16, 16        // ... tmp1
   2189     muleq_s.w.phl  v0, t6, s3            // tmp3 ...
   2190     muleq_s.w.phr  t6, t6, s3            // ... tmp3 ...
   2191     ins            t4, v1, 16, 16        // ... tmp2
   2192     addq.ph        s4, t0, t4            // tmp10
   2193     subq.ph        s5, t0, t4            // tmp11
   2194     ins            t6, v0, 16, 16        // ... tmp3
   2195     subq.ph        s6, t2, t6            // tmp12 ...
   2196     addq.ph        s7, t2, t6            // tmp13
   2197     mulq_s.ph      s6, s6, t8            // ... tmp12 ...
   2198     addq.ph        t0, s4, s7            // tmp0
   2199     subq.ph        t6, s4, s7            // tmp3
   2200     muleq_s.w.phl  v0, t1, s0            // tmp4 ...
   2201     muleq_s.w.phr  t1, t1, s0            // ... tmp4 ...
   2202     shll_s.ph      s6, s6, 1             // x2
   2203     lw             s3, 112(a1)           // quantptr[DCTSIZE*7]
   2204     subq.ph        s6, s6, s7            // ... tmp12
   2205     muleq_s.w.phl  v1, t7, s3            // tmp7 ...
   2206     muleq_s.w.phr  t7, t7, s3            // ... tmp7 ...
   2207     ins            t1, v0, 16, 16        // ... tmp4
   2208     addq.ph        t2, s5, s6            // tmp1
   2209     subq.ph        t4, s5, s6            // tmp2
   2210     muleq_s.w.phl  v0, t5, s2            // tmp6 ...
   2211     muleq_s.w.phr  t5, t5, s2            // ... tmp6 ...
   2212     ins            t7, v1, 16, 16        // ... tmp7
   2213     addq.ph        s5, t1, t7            // z11
   2214     subq.ph        s6, t1, t7            // z12
   2215     muleq_s.w.phl  v1, t3, s1            // tmp5 ...
   2216     muleq_s.w.phr  t3, t3, s1            // ... tmp5 ...
   2217     ins            t5, v0, 16, 16        // ... tmp6
   2218     ins            t3, v1, 16, 16        // ... tmp5
   2219     addq.ph        s7, t5, t3            // z13
   2220     subq.ph        v0, t5, t3            // z10
   2221     addq.ph        t7, s5, s7            // tmp7
   2222     subq.ph        s5, s5, s7            // tmp11 ...
   2223     addq.ph        v1, v0, s6            // z5 ...
   2224     mulq_s.ph      s5, s5, t8            // ... tmp11
   2225     lw             t8, 8(AT)             // FIX(1.847759065)
   2226     lw             s4, 0(AT)             // FIX(1.082392200)
   2227     addq.ph        s0, t0, t7            // tmp0 + tmp7
   2228     subq.ph        s1, t0, t7            // tmp0 - tmp7
   2229     mulq_s.ph      v1, v1, t8            // ... z5
   2230     shll_s.ph      s5, s5, 1             // x2
   2231     lw             t8, 12(AT)            // FIX(-2.613125930)
   2232     sw             s0, 0(a2)             // wsptr[DCTSIZE*0]
   2233     shll_s.ph      v0, v0, 1             // x4
   2234     mulq_s.ph      v0, v0, t8            // tmp12 ...
   2235     mulq_s.ph      s4, s6, s4            // tmp10 ...
   2236     shll_s.ph      v1, v1, 1             // x2
   2237     addiu          a0, a0, 4             // next pair of input columns
   2238     addiu          a1, a1, 4             // next quantptr pair
   2239     sw             s1, 112(a2)           // wsptr[DCTSIZE*7]
   2240     shll_s.ph      s6, v0, 1             // x4
   2241     shll_s.ph      s4, s4, 1             // x2
   2242     addq.ph        s6, s6, v1            // ... tmp12
   2243     subq.ph        t5, s6, t7            // tmp6
   2244     subq.ph        s4, s4, v1            // ... tmp10
   2245     subq.ph        t3, s5, t5            // tmp5
   2246     addq.ph        s2, t2, t5            // tmp1 + tmp6
   2247     addq.ph        t1, s4, t3            // tmp4
   2248     subq.ph        s3, t2, t5            // tmp1 - tmp6
   2249     sw             s2, 16(a2)            // wsptr[DCTSIZE*1]
   2250     sw             s3, 96(a2)            // wsptr[DCTSIZE*6]
   2251     addq.ph        v0, t4, t3            // tmp2 + tmp5
   2252     subq.ph        v1, t4, t3            // tmp2 - tmp5
   2253     sw             v0, 32(a2)            // wsptr[DCTSIZE*2]
   2254     sw             v1, 80(a2)            // wsptr[DCTSIZE*5]
   2255     addq.ph        v0, t6, t1            // tmp3 + tmp4
   2256     subq.ph        v1, t6, t1            // tmp3 - tmp4
   2257     sw             v0, 64(a2)            // wsptr[DCTSIZE*4]
   2258     sw             v1, 48(a2)            // wsptr[DCTSIZE*3]
   2259
   2260 2:
   2261     bne            a0, t9, 0b            // loop until a0 reaches the end address
   2262      addiu         a2, a2, 4             // (delay slot) next wsptr column pair
   2263
   2264     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   2265
   2266     j              ra
   2267      nop
   2268
   2269 END(jsimd_idct_ifast_cols_mips_dspr2)
   2270 
   2271 /*****************************************************************************/
   2272 LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
   2273 /*
   2274  * a0     - wsptr
   2275  * a1     - output_buf
   2276  * a2     - output_col
   2277  * a3     - mips_idct_ifast_coefs
         *
         * Row pass of the "ifast" inverse DCT.  Each loop pass (label 0) handles
         * two consecutive 16-byte workspace rows: a0 advances by 32 and a1 by 8
         * (two output row pointers) per pass, and the loop stops when a0
         * reaches its initial value + 128, i.e. after four passes.
         *
         * The two rows are interleaved with precrq.ph.w / ins so that each
         * register holds the same column element of both rows (upper-case
         * letter = second row in the upper halfword, lower-case = first row in
         * the lower halfword).  All arithmetic is paired-halfword (.ph).
         *
         * Results are shifted left by 2 with saturation, the upper byte of each
         * halfword is kept (precrq.qb.ph), 0x80 is added to every byte
         * (addu.qb with s8), and 8 bytes per row are stored at
         * output_buf[row] + output_col.
         *
         * Shortcut: when elements 1-7 of both rows are zero, the scaled
         * element-0 value of each row is replicated to all 8 bytes of that row.
         *
         * a3 and AT are reused as output pointers, so the constant-table
         * address is reloaded from 36(sp) at the top of every pass.  That slot
         * is presumably where SAVE_REGS_ON_STACK stored a3 (macro not visible
         * here - TODO confirm).
         *
         * An instruction indented one extra column after a branch sits in that
         * branch's delay slot.
   2278  */
   2279
   2280     SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
   2281
   2282     addiu         t9, a0, 128        // end address
   2283     lui           s8, 0x8080         // s8 = 0x80808080 ...
   2284     ori           s8, s8, 0x8080     // ... (0x80 in every byte lane)
   2285
   2286 0:
   2287     lw            AT, 36(sp)         // restore $a3 (mips_idct_ifast_coefs)
   2288     lw            t0, 0(a0)          // wsptr[DCTSIZE*0+0/1]  b a
   2289     lw            s0, 16(a0)         // wsptr[DCTSIZE*1+0/1]  B A
   2290     lw            t2, 4(a0)          // wsptr[DCTSIZE*0+2/3]  d c
   2291     lw            s2, 20(a0)         // wsptr[DCTSIZE*1+2/3]  D C
   2292     lw            t4, 8(a0)          // wsptr[DCTSIZE*0+4/5]  f e
   2293     lw            s4, 24(a0)         // wsptr[DCTSIZE*1+4/5]  F E
   2294     lw            t6, 12(a0)         // wsptr[DCTSIZE*0+6/7]  h g
   2295     lw            s6, 28(a0)         // wsptr[DCTSIZE*1+6/7]  H G
   2296     precrq.ph.w   t1, s0, t0         // B b
   2297     ins           t0, s0, 16, 16     // A a
                                            // Zero test on elements 1-7.  s0 is
                                            // only a scratch register in the
                                            // delay slots below; label 1 rewrites
                                            // it before reading it.
   2298     bnez          t1, 1f             // element 1 non-zero: full IDCT
   2299      or           s0, t2, s2         // (delay slot) elements 2/3, both rows
   2300     bnez          s0, 1f
   2301      or           s0, t4, s4         // (delay slot) elements 4/5, both rows
   2302     bnez          s0, 1f
   2303      or           s0, t6, s6         // (delay slot) elements 6/7, both rows
   2304     bnez          s0, 1f
   2305      shll_s.ph    s0, t0, 2          // A a
   2306     lw            a3, 0(a1)          // a3 = output_buf[row]
   2307     lw            AT, 4(a1)          // AT = output_buf[row + 1]
   2308     precrq.ph.w   t0, s0, s0         // A A
   2309     ins           s0, s0, 16, 16     // a a
   2310     addu          a3, a3, a2         // a3 = first-row outptr (+ output_col)
   2311     addu          AT, AT, a2         // AT = second-row outptr (+ output_col)
   2312     precrq.qb.ph  t0, t0, t0         // A A A A
   2313     precrq.qb.ph  s0, s0, s0         // a a a a
   2314     addu.qb       s0, s0, s8         // add 0x80 to every byte
   2315     addu.qb       t0, t0, s8         // add 0x80 to every byte
   2316     sw            s0, 0(a3)          // first row, bytes 0..3
   2317     sw            s0, 4(a3)          // first row, bytes 4..7
   2318     sw            t0, 0(AT)          // second row, bytes 0..3
   2319     sw            t0, 4(AT)          // second row, bytes 4..7
   2320     addiu         a0, a0, 32         // next pair of workspace rows
   2321     bne           a0, t9, 0b         // more rows: back to the loop top
   2322      addiu        a1, a1, 8          // (delay slot) next two output row pointers
   2323     b             2f                 // all rows done
   2324      nop
   2325
   2326 1:
   2327     precrq.ph.w   t3, s2, t2         // D d
   2328     ins           t2, s2, 16, 16     // C c
   2329     precrq.ph.w   t5, s4, t4         // F f
   2330     ins           t4, s4, 16, 16     // E e
   2331     precrq.ph.w   t7, s6, t6         // H h
   2332     ins           t6, s6, 16, 16     // G g
   2333     lw            t8, 4(AT)          // FIX(1.414213562)
   2334     addq.ph       s4, t0, t4         // tmp10
   2335     subq.ph       s5, t0, t4         // tmp11
   2336     subq.ph       s6, t2, t6         // tmp12 ...
   2337     addq.ph       s7, t2, t6         // tmp13
   2338     mulq_s.ph     s6, s6, t8         // ... tmp12 ...
   2339     addq.ph       t0, s4, s7         // tmp0
   2340     subq.ph       t6, s4, s7         // tmp3
   2341     shll_s.ph     s6, s6, 1          // x2
   2342     subq.ph       s6, s6, s7         // ... tmp12
   2343     addq.ph       t2, s5, s6         // tmp1
   2344     subq.ph       t4, s5, s6         // tmp2
   2345     addq.ph       s5, t1, t7         // z11
   2346     subq.ph       s6, t1, t7         // z12
   2347     addq.ph       s7, t5, t3         // z13
   2348     subq.ph       v0, t5, t3         // z10
   2349     addq.ph       t7, s5, s7         // tmp7
   2350     subq.ph       s5, s5, s7         // tmp11 ...
   2351     addq.ph       v1, v0, s6         // z5 ...
   2352     mulq_s.ph     s5, s5, t8         // ... tmp11
   2353     lw            t8, 8(AT)          // FIX(1.847759065)
   2354     lw            s4, 0(AT)          // FIX(1.082392200)
   2355     addq.ph       s0, t0, t7         // tmp0 + tmp7
   2356     subq.ph       s7, t0, t7         // tmp0 - tmp7
   2357     mulq_s.ph     v1, v1, t8         // ... z5
   2358     lw            a3, 0(a1)          // a3 = output_buf[row]
   2359     lw            t8, 12(AT)         // FIX(-2.613125930)
   2360     shll_s.ph     s5, s5, 1          // x2
   2361     addu          a3, a3, a2         // a3 = first-row outptr (+ output_col)
   2362     shll_s.ph     v0, v0, 1          // x4
   2363     mulq_s.ph     v0, v0, t8         // tmp12 ...
   2364     mulq_s.ph     s4, s6, s4         // tmp10 ...
   2365     shll_s.ph     v1, v1, 1          // x2
   2366     addiu         a0, a0, 32         // next pair of workspace rows
   2367     addiu         a1, a1, 8          // next two output row pointers
   2368     shll_s.ph     s6, v0, 1          // x4
   2369     shll_s.ph     s4, s4, 1          // x2
   2370     addq.ph       s6, s6, v1         // ... tmp12
   2371     shll_s.ph     s0, s0, 2          // scale (tmp0 + tmp7) by 4, saturating
   2372     subq.ph       t5, s6, t7         // tmp6
   2373     subq.ph       s4, s4, v1         // ... tmp10
   2374     subq.ph       t3, s5, t5         // tmp5
   2375     shll_s.ph     s7, s7, 2          // scale (tmp0 - tmp7) by 4, saturating
   2376     addq.ph       t1, s4, t3         // tmp4
   2377     addq.ph       s1, t2, t5         // tmp1 + tmp6
   2378     subq.ph       s6, t2, t5         // tmp1 - tmp6
   2379     addq.ph       s2, t4, t3         // tmp2 + tmp5
   2380     subq.ph       s5, t4, t3         // tmp2 - tmp5
   2381     addq.ph       s4, t6, t1         // tmp3 + tmp4
   2382     subq.ph       s3, t6, t1         // tmp3 - tmp4
   2383     shll_s.ph     s1, s1, 2          // scale the six remaining outputs by 4
   2384     shll_s.ph     s2, s2, 2
   2385     shll_s.ph     s3, s3, 2
   2386     shll_s.ph     s4, s4, 2
   2387     shll_s.ph     s5, s5, 2
   2388     shll_s.ph     s6, s6, 2
   2389     precrq.ph.w   t0, s1, s0         // B A
   2390     ins           s0, s1, 16, 16     // b a
   2391     precrq.ph.w   t2, s3, s2         // D C
   2392     ins           s2, s3, 16, 16     // d c
   2393     precrq.ph.w   t4, s5, s4         // F E
   2394     ins           s4, s5, 16, 16     // f e
   2395     precrq.ph.w   t6, s7, s6         // H G
   2396     ins           s6, s7, 16, 16     // h g
   2397     precrq.qb.ph  t0, t2, t0         // D C B A
   2398     precrq.qb.ph  s0, s2, s0         // d c b a
   2399     precrq.qb.ph  t4, t6, t4         // H G F E
   2400     precrq.qb.ph  s4, s6, s4         // h g f e
   2401     addu.qb       s0, s0, s8         // add 0x80 to every byte
   2402     addu.qb       s4, s4, s8         // add 0x80 to every byte
   2403     sw            s0, 0(a3)          // outptr[0/1/2/3]       d c b a
   2404     sw            s4, 4(a3)          // outptr[4/5/6/7]       h g f e
   2405     lw            a3, -4(a1)         // second row pointer (a1 already += 8)
   2406     addu.qb       t0, t0, s8         // add 0x80 to every byte
   2407     addu          a3, a3, a2         // a3 = second-row outptr (+ output_col)
   2408     addu.qb       t4, t4, s8         // add 0x80 to every byte
   2409     sw            t0, 0(a3)          // outptr[0/1/2/3]       D C B A
   2410     bne           a0, t9, 0b         // loop until a0 reaches the end address
   2411      sw           t4, 4(a3)          // outptr[4/5/6/7]       H G F E
   2412
   2413 2:
   2414
   2415     RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
   2416
   2417     j             ra
   2418      nop
   2419
   2420 END(jsimd_idct_ifast_rows_mips_dspr2)
   2421 
   2422 /*****************************************************************************/
   2423 LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
   2424 /*
   2425  * a0     - data
         *
         * Two-pass, in-place transform of the 64 halfwords (128 bytes) at a0.
         *   Loop 1: eight iterations, each reading and rewriting one group of
         *           eight consecutive halfwords (a1 advances 16 bytes per pass).
         *   Loop 2: eight iterations, each reading and rewriting eight
         *           halfwords spaced 16 bytes apart (a0 advances 2 bytes per
         *           pass).
         * Presumably the "islow" forward DCT (row pass, then column pass) of
         * the C reference implementation -- confirm against it.
         *
         * Registers written and not restored here: t0-t9, v0, v1, a0-a3 and
         * the accumulators ac0-ac3.  a0 is left 16 bytes past its entry value.
         * s0-s8 are passed to SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK,
         * macros from jsimd_mips_dspr2_asm.h that are not visible in this file.
   2426  */
   2427 
   2428     SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
   2429 
            // Packed multiplier pairs for loop 1.  Each register holds two
            // signed 16-bit factors (hi | lo) consumed by dpa.w.ph.
   2430     lui       t0, 6437
   2431     ori       t0, 2260      // t0 =   6437 |   2260
   2432     lui       t1, 9633
   2433     ori       t1, 11363     // t1 =   9633 |  11363
   2434     lui       t2, 0xd39e
   2435     ori       t2, 0xe6dc    // t2 = -11362 |  -6436
   2436     lui       t3, 0xf72d
   2437     ori       t3, 9633      // t3 =  -2259 |   9633
   2438     lui       t4, 2261
   2439     ori       t4, 9633      // t4 =   2261 |   9633
   2440     lui       t5, 0xd39e
   2441     ori       t5, 6437      // t5 = -11362 |   6437
   2442     lui       t6, 9633
   2443     ori       t6, 0xd39d    // t6 =   9633 | -11363
   2444     lui       t7, 0xe6dc
   2445     ori       t7, 2260      // t7 =  -6436 |   2260
   2446     lui       t8, 4433
   2447     ori       t8, 10703     // t8 =   4433 |  10703
   2448     lui       t9, 0xd630
   2449     ori       t9, 4433      // t9 = -10704 |   4433
   2450     li        s8, 8         // s8 = loop-1 iteration counter
   2451     move      a1, a0        // a1 = loop-1 cursor; a0 is kept for loop 2
   2452 1:
   2453     lw        s0, 0(a1)     // tmp0 = 1|0
   2454     lw        s1, 4(a1)     // tmp1 = 3|2
   2455     lw        s2, 8(a1)     // tmp2 = 5|4
   2456     lw        s3, 12(a1)    // tmp3 = 7|6
   2457     packrl.ph s1, s1, s1    // tmp1 = 2|3
   2458     packrl.ph s3, s3, s3    // tmp3 = 6|7
   2459     subq.ph   s7, s1, s2    // tmp7 = 2-5|3-4 = t5|t4
   2460     subq.ph   s5, s0, s3    // tmp5 = 1-6|0-7 = t6|t7
   2461     mult      $0, $0        // ac0  = 0
   2462     dpa.w.ph  $ac0, s7, t0  // ac0 += t5*  6437 + t4*  2260
   2463     dpa.w.ph  $ac0, s5, t1  // ac0 += t6*  9633 + t7* 11363
   2464     mult      $ac1, $0, $0  // ac1  = 0
   2465     dpa.w.ph  $ac1, s7, t2  // ac1 += t5*-11362 + t4* -6436
   2466     dpa.w.ph  $ac1, s5, t3  // ac1 += t6* -2259 + t7*  9633
   2467     mult      $ac2, $0, $0  // ac2  = 0
   2468     dpa.w.ph  $ac2, s7, t4  // ac2 += t5*  2261 + t4*  9633
   2469     dpa.w.ph  $ac2, s5, t5  // ac2 += t6*-11362 + t7*  6437
   2470     mult      $ac3, $0, $0  // ac3  = 0
   2471     dpa.w.ph  $ac3, s7, t6  // ac3 += t5*  9633 + t4*-11363
   2472     dpa.w.ph  $ac3, s5, t7  // ac3 += t6* -6436 + t7*  2260
   2473     addq.ph   s6, s1, s2    // tmp6 = 2+5|3+4 = t2|t3
   2474     addq.ph   s4, s0, s3    // tmp4 = 1+6|0+7 = t1|t0
   2475     extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
   2476     extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
   2477     extr_r.w  s2, $ac2, 11  // tmp2 = (ac2 + 1024) >> 11
   2478     extr_r.w  s3, $ac3, 11  // tmp3 = (ac3 + 1024) >> 11
   2479     addq.ph   s5, s4, s6    // tmp5 = t1+t2|t0+t3 = t11|t10
   2480     subq.ph   s7, s4, s6    // tmp7 = t1-t2|t0-t3 = t12|t13
   2481     sh        s0, 2(a1)     // halfword 1 of this group = tmp0
   2482     sh        s1, 6(a1)     // halfword 3 of this group = tmp1
   2483     sh        s2, 10(a1)    // halfword 5 of this group = tmp2
   2484     sh        s3, 14(a1)    // halfword 7 of this group = tmp3
   2485     mult      $0, $0        // ac0  = 0
   2486     dpa.w.ph  $ac0, s7, t8  // ac0 += t12*  4433 + t13* 10703
   2487     mult      $ac1, $0, $0  // ac1  = 0
   2488     dpa.w.ph  $ac1, s7, t9  // ac1 += t12*-10704 + t13*  4433
   2489     sra       s4, s5, 16    // tmp4 = t11
   2490     addiu     a1, a1, 16    // step to the next group; stores below use negative offsets
   2491     addiu     s8, s8, -1
   2492     extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
   2493     extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
   2494     addu      s2, s5, s4    // tmp2 = t10 + t11
   2495     subu      s3, s5, s4    // tmp3 = t10 - t11
   2496     sll       s2, s2, 2     // tmp2 = (t10 + t11) << 2
   2497     sll       s3, s3, 2     // tmp3 = (t10 - t11) << 2
   2498     sh        s2, -16(a1)   // halfword 0 of the group just processed (low 16 bits of tmp2)
   2499     sh        s3, -8(a1)    // halfword 4 of the group just processed (low 16 bits of tmp3)
   2500     sh        s0, -12(a1)   // halfword 2 of the group just processed
   2501     bgtz      s8, 1b
   2502      sh       s1, -4(a1)    // (delay slot) halfword 6 of the group just processed
            // Scalar multipliers for loop 2; the c<N> names match the comments
            // on the mult/madd/msub instructions below.
   2503     li        t0, 2260      // c0
   2504     li        t1, 11363     // c1
   2505     li        t2, 9633      // c2
   2506     li        t3, 6436      // c3
   2507     li        t4, 6437      // c4
   2508     li        t5, 2261      // c5
   2509     li        t6, 11362     // c6
   2510     li        t7, 2259      // c7
   2511     li        t8, 4433      // c8
   2512     li        t9, 10703     // c9
   2513     li        a1, 10704     // c10
   2514     li        s8, 8         // s8 = loop-2 iteration counter
   2515 
   2516 2:
   2517     lh        a2, 0(a0)     // 0
   2518     lh        a3, 16(a0)    // 8
   2519     lh        v0, 32(a0)    // 16
   2520     lh        v1, 48(a0)    // 24
   2521     lh        s4, 64(a0)    // 32
   2522     lh        s5, 80(a0)    // 40
   2523     lh        s6, 96(a0)    // 48
   2524     lh        s7, 112(a0)   // 56
   2525     addu      s2, v0, s5    // tmp2 = 16 + 40
   2526     subu      s5, v0, s5    // tmp5 = 16 - 40
   2527     addu      s3, v1, s4    // tmp3 = 24 + 32
   2528     subu      s4, v1, s4    // tmp4 = 24 - 32
   2529     addu      s0, a2, s7    // tmp0 =  0 + 56
   2530     subu      s7, a2, s7    // tmp7 =  0 - 56
   2531     addu      s1, a3, s6    // tmp1 =  8 + 48
   2532     subu      s6, a3, s6    // tmp6 =  8 - 48
   2533     addu      a2, s0, s3    // tmp10 = tmp0 + tmp3
   2534     subu      v1, s0, s3    // tmp13 = tmp0 - tmp3
   2535     addu      a3, s1, s2    // tmp11 = tmp1 + tmp2
   2536     subu      v0, s1, s2    // tmp12 = tmp1 - tmp2
   2537     mult      s7, t1        // ac0  = tmp7 * c1
   2538     madd      s4, t0        // ac0 += tmp4 * c0
   2539     madd      s5, t4        // ac0 += tmp5 * c4
   2540     madd      s6, t2        // ac0 += tmp6 * c2
   2541     mult      $ac1, s7, t2  // ac1  = tmp7 * c2
   2542     msub      $ac1, s4, t3  // ac1 -= tmp4 * c3
   2543     msub      $ac1, s5, t6  // ac1 -= tmp5 * c6
   2544     msub      $ac1, s6, t7  // ac1 -= tmp6 * c7
   2545     mult      $ac2, s7, t4  // ac2  = tmp7 * c4
   2546     madd      $ac2, s4, t2  // ac2 += tmp4 * c2
   2547     madd      $ac2, s5, t5  // ac2 += tmp5 * c5
   2548     msub      $ac2, s6, t6  // ac2 -= tmp6 * c6
   2549     mult      $ac3, s7, t0  // ac3  = tmp7 * c0
   2550     msub      $ac3, s4, t1  // ac3 -= tmp4 * c1
   2551     madd      $ac3, s5, t2  // ac3 += tmp5 * c2
   2552     msub      $ac3, s6, t3  // ac3 -= tmp6 * c3
   2553     extr_r.w  s0, $ac0, 15  // tmp0 = (ac0 + 16384) >> 15
   2554     extr_r.w  s1, $ac1, 15  // tmp1 = (ac1 + 16384) >> 15
   2555     extr_r.w  s2, $ac2, 15  // tmp2 = (ac2 + 16384) >> 15
   2556     extr_r.w  s3, $ac3, 15  // tmp3 = (ac3 + 16384) >> 15
   2557     addiu     s8, s8, -1
   2558     addu      s4, a2, a3    // tmp4 = tmp10 + tmp11
   2559     subu      s5, a2, a3    // tmp5 = tmp10 - tmp11
   2560     sh        s0, 16(a0)    // element 8  = tmp0
   2561     sh        s1, 48(a0)    // element 24 = tmp1
   2562     sh        s2, 80(a0)    // element 40 = tmp2
   2563     sh        s3, 112(a0)   // element 56 = tmp3
   2564     mult      v0, t8        // ac0  = tmp12 * c8
   2565     madd      v1, t9        // ac0 += tmp13 * c9
   2566     mult      $ac1, v1, t8  // ac1  = tmp13 * c8
   2567     msub      $ac1, v0, a1  // ac1 -= tmp12 * c10
   2568     addiu     a0, a0, 2     // step to the next halfword; stores below are biased by -2
   2569     extr_r.w  s6, $ac0, 15  // tmp6 = (ac0 + 16384) >> 15
   2570     extr_r.w  s7, $ac1, 15  // tmp7 = (ac1 + 16384) >> 15
   2571     shra_r.w  s4, s4, 2     // tmp4 = (tmp4 + 2) >> 2
   2572     shra_r.w  s5, s5, 2     // tmp5 = (tmp5 + 2) >> 2
   2573     sh        s4, -2(a0)    // element 0  = tmp4
   2574     sh        s5, 62(a0)    // element 32 = tmp5
   2575     sh        s6, 30(a0)    // element 16 = tmp6
   2576     bgtz      s8, 2b
   2577      sh       s7, 94(a0)    // (delay slot) element 48 = tmp7
   2578 
   2579     RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
   2580 
   2581     jr       ra
   2582      nop
   2583 
   2584 END(jsimd_fdct_islow_mips_dspr2)
   2585 
   2586 /*****************************************************************************/
   2587 LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
   2588 /*
   2589  * a0     - data
         *
         * Two-pass, in-place transform of the 64 halfwords (128 bytes) at a0.
         *   Loop 0: v0 walks a0 .. a0+128 in 16-byte steps; each pass reads and
         *           rewrites eight consecutive halfwords using paired-halfword
         *           DSP arithmetic.
         *   Loop 1: v0 walks a0 .. a0+16 in 2-byte steps; each pass reads and
         *           rewrites eight halfwords spaced 16 bytes apart.
         * All products are scaled back with a right shift by 8 (extr.w / sra).
         * Presumably the "ifast" forward DCT of the C reference
         * implementation -- confirm against it.
         *
         * a0 is not modified.  Registers written and not restored here:
         * t0-t9, v0, v1, a1-a3 and accumulators ac0-ac3.  s0 and s1 are passed
         * to SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK (macros from
         * jsimd_mips_dspr2_asm.h, not visible in this file).
   2590  */
   2591     .set at
   2592     SAVE_REGS_ON_STACK 8, s0, s1
   2593     li           a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
   2594     li           a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
   2595     li           a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
   2596     li           s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
   2597 
   2598     move         v0, a0          // v0 = loop-0 cursor
   2599     addiu        v1, v0, 128     // end address
   2600 
   2601 0:
   2602     lw           t0, 0(v0)       // tmp0 = 1|0
   2603     lw           t1, 4(v0)       // tmp1 = 3|2
   2604     lw           t2, 8(v0)       // tmp2 = 5|4
   2605     lw           t3, 12(v0)      // tmp3 = 7|6
   2606     packrl.ph    t1, t1, t1      // tmp1 = 2|3
   2607     packrl.ph    t3, t3, t3      // tmp3 = 6|7
   2608     subq.ph      t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
   2609     subq.ph      t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
   2610     addq.ph      t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
   2611     addq.ph      t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
   2612     addq.ph      t8, t4, t6      // t8 = t1+t2|t0+t3 = t11|t10
   2613     subq.ph      t9, t4, t6      // t9 = t1-t2|t0-t3 = t12|t13
   2614     sra          t4, t8, 16      // tmp4 = t11
   2615     mult         $0, $0          // ac0  = 0
   2616     dpa.w.ph     $ac0, t9, s1    // ac0 += t12*181 + t13*181
   2617     mult         $ac1, $0, $0    // ac1  = 0
   2618     dpa.w.ph     $ac1, t7, a3    // ac1 += t4*98 + t5*98
   2619     dpsx.w.ph    $ac1, t5, a3    // ac1 -= t6*98 + t7*98  (dpsx subtracts the dot product)
   2620     mult         $ac2, $0, $0    // ac2  = 0
   2621     dpa.w.ph     $ac2, t7, a2    // ac2 += t4*139 + t5*139
   2622     mult         $ac3, $0, $0    // ac3  = 0
   2623     dpa.w.ph     $ac3, t5, a1    // ac3 += t6*334 + t7*334
   2624     precrq.ph.w  t0, t5, t7      // t0 = t6|t5 (upper halfwords of t5 and t7)
   2625     addq.ph      t2, t8, t4      // tmp2 = t10 + t11
   2626     subq.ph      t3, t8, t4      // tmp3 = t10 - t11
   2627     extr.w       t4, $ac0, 8     // t4 = z1 = ((t12 + t13) * 181) >> 8
   2628     mult         $0, $0          // ac0  = 0
   2629     dpa.w.ph     $ac0, t0, s1    // ac0 += t5*181 + t6*181
   2630     extr.w       t0, $ac1, 8     // t0 = z5
   2631     extr.w       t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
   2632     extr.w       t7, $ac3, 8     // t7 = MULTIPLY(tmp12, 334)
   2633     extr.w       t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
   2634     add          t6, t1, t0      // t6 = z2
   2635     add          t7, t7, t0      // t7 = z4
   2636     subq.ph      t0, t5, t8      // t0 = z13 = tmp7 - z3
   2637     addq.ph      t8, t5, t8      // t8 = z11 = tmp7 + z3
   2638     addq.ph      t1, t0, t6      // t1 = z13 + z2
   2639     subq.ph      t6, t0, t6      // t6 = z13 - z2
   2640     addq.ph      t0, t8, t7      // t0 = z11 + z4
   2641     subq.ph      t7, t8, t7      // t7 = z11 - z4
   2642     addq.ph      t5, t4, t9      // t5 (low half) = t13 + z1
   2643     subq.ph      t4, t9, t4      // t4 (low half) = t13 - z1
   2644     sh           t2, 0(v0)       // halfword 0 = t10 + t11
   2645     sh           t5, 4(v0)       // halfword 2 = t13 + z1
   2646     sh           t3, 8(v0)       // halfword 4 = t10 - t11
   2647     sh           t4, 12(v0)      // halfword 6 = t13 - z1
   2648     sh           t1, 10(v0)      // halfword 5 = z13 + z2
   2649     sh           t6, 6(v0)       // halfword 3 = z13 - z2
   2650     sh           t0, 2(v0)       // halfword 1 = z11 + z4
   2651     sh           t7, 14(v0)      // halfword 7 = z11 - z4
   2652     addiu        v0, 16
   2653     bne          v1, v0, 0b
   2654      nop
   2655     move         v0, a0          // v0 = loop-1 cursor, restarted at data
   2656     addiu        v1, v0, 16      // end address for loop 1
   2657 
   2658 1:
   2659     lh           t0, 0(v0)       // 0
   2660     lh           t1, 16(v0)      // 8
   2661     lh           t2, 32(v0)      // 16
   2662     lh           t3, 48(v0)      // 24
   2663     lh           t4, 64(v0)      // 32
   2664     lh           t5, 80(v0)      // 40
   2665     lh           t6, 96(v0)      // 48
   2666     lh           t7, 112(v0)     // 56
   2667     add          t8, t0, t7      // t8 = tmp0
   2668     sub          t7, t0, t7      // t7 = tmp7
   2669     add          t0, t1, t6      // t0 = tmp1
   2670     sub          t1, t1, t6      // t1 = tmp6
   2671     add          t6, t2, t5      // t6 = tmp2
   2672     sub          t5, t2, t5      // t5 = tmp5
   2673     add          t2, t3, t4      // t2 = tmp3
   2674     sub          t3, t3, t4      // t3 = tmp4
   2675     add          t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
   2676     sub          t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
   2677     sub          s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
   2678     ins          t8, s0, 16, 16  // t8 = tmp12|tmp13
   2679     add          t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
   2680     mult         $0, $0          // ac0  = 0
   2681     dpa.w.ph     $ac0, t8, s1    // ac0 += t12*181 + t13*181
   2682     add          s0, t4, t2      // s0 = tmp10+tmp11
   2683     sub          t4, t4, t2      // t4 = tmp10-tmp11
   2684     sh           s0, 0(v0)       // element 0  = tmp10 + tmp11
   2685     sh           t4, 64(v0)      // element 32 = tmp10 - tmp11
   2686     extr.w       t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
   2687     addq.ph      t4, t8, t2      // t4 (low half) = tmp13 + z1
   2688     subq.ph      t8, t8, t2      // t8 (low half) = tmp13 - z1
   2689     sh           t4, 32(v0)      // element 16 = tmp13 + z1
   2690     sh           t8, 96(v0)      // element 48 = tmp13 - z1
   2691     add          t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
   2692     add          t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
   2693     add          t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
   2694     andi         t4, a1, 0xffff  // t4 = 334 (low halfword of the packed constant)
   2695     mul          s0, t1, t4
   2696     sra          s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
   2697     ins          t1, t3, 16, 16  // t1 = tmp10|tmp12
   2698     mult         $0, $0          // ac0  = 0
   2699     mulsa.w.ph   $ac0, t1, a3    // ac0 += t10*98 - t12*98
   2700     extr.w       t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
   2701     add          t2, t7, t8      // t2 = tmp7 + z5
   2702     sub          t7, t7, t8      // t7 = tmp7 - z5
   2703     andi         t4, a2, 0xffff  // t4 = 139 (low halfword of the packed constant)
   2704     mul          t8, t3, t4
   2705     sra          t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
   2706     andi         t4, s1, 0xffff  // t4 = 181 (low halfword of the packed constant)
   2707     mul          t6, t0, t4
   2708     sra          t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
   2709     add          t0, t6, t8      // t0 = z3 + z2
   2710     sub          t1, t6, t8      // t1 = z3 - z2
   2711     add          t3, t6, s0      // t3 = z3 + z4
   2712     sub          t4, t6, s0      // t4 = z3 - z4
   2713     sub          t5, t2, t1      // t5 = dataptr[5]
   2714     sub          t6, t7, t0      // t6 = dataptr[3]
   2715     add          t3, t2, t3      // t3 = dataptr[1]
   2716     add          t4, t7, t4      // t4 = dataptr[7]
   2717     sh           t5, 80(v0)      // element 40
   2718     sh           t6, 48(v0)      // element 24
   2719     sh           t3, 16(v0)      // element 8
   2720     sh           t4, 112(v0)     // element 56
   2721     addiu        v0, 2
   2722     bne          v0, v1, 1b
   2723      nop
   2724 
   2725     RESTORE_REGS_FROM_STACK 8, s0, s1
   2726 
   2727     j            ra
   2728      nop
   2729 END(jsimd_fdct_ifast_mips_dspr2)
   2730 
   2731 /*****************************************************************************/
   2732 LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
   2733 /*
   2734  * a0     - coef_block
   2735  * a1     - divisors
   2736  * a2     - workspace
         *
         * For each of 64 halfwords w read from workspace, with the divisors
         * halfwords d0 (offset +0), d1 (offset +128 bytes) and d3 (offset
         * +384 bytes) at the same index, stores to coef_block:
         *
         *   sign(w) * ((((|w| + d1) & 0xffff) * (d0 & 0xffff)) >> (d3 + 16))
         *
         * where the shift is arithmetic (srav) and sign(w) is +1 for w >= 0,
         * -1 otherwise.  Presumably d0/d1/d3 are the reciprocal, correction
         * and shift tables of the integer quantizer -- confirm against the
         * code that builds the divisors table.
         *
         * The loop is software-pipelined and handles two halfwords per pass:
         * operands for the next pair are loaded while the current pair is
         * finished.  31 passes store elements 0..61; the code after the loop
         * stores elements 62 and 63.
         *
         * Registers written and not restored here: t0-t9, v0, v1, a0-a2.
         * s0-s2 are passed to SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK
         * (macros from jsimd_mips_dspr2_asm.h, not visible in this file).
   2737  */
   2738 
   2739     .set at
   2740 
   2741     SAVE_REGS_ON_STACK 16, s0, s1, s2
   2742 
   2743     addiu   v0, a2, 124  // v0 = address of the last halfword pair in workspace
   2744     lh      t0, 0(a2)    // t0 = w (even element of the pair)
   2745     lh      t1, 0(a1)    // t1 = d0 for the even element
   2746     lh      t2, 128(a1)  // t2 = d1 for the even element
   2747     sra     t3, t0, 15   // t3 = 0 if w >= 0, else -1
   2748     sll     t3, t3, 1
   2749     addiu   t3, t3, 1    // t3 = +1 or -1 (sign of w)
   2750     mul     t0, t0, t3   // t0 = |w|
   2751     lh      t4, 384(a1)  // t4 = d3 for the even element
   2752     lh      t5, 130(a1)  // t5 = d1 for the odd element
   2753     lh      t6, 2(a2)    // t6 = w (odd element of the pair)
   2754     lh      t7, 2(a1)    // t7 = d0 for the odd element
   2755     lh      t8, 386(a1)  // t8 = d3 for the odd element
   2756 
   2757 1:
   2758     andi    t1, 0xffff   // treat d0 as unsigned 16-bit
   2759     add     t9, t0, t2   // t9 = |w| + d1
   2760     andi    t9, 0xffff
   2761     mul     v1, t9, t1   // v1 = product for the even element
   2762     sra     s0, t6, 15
   2763     sll     s0, s0, 1
   2764     addiu   s0, s0, 1    // s0 = +1 or -1 (sign of the odd element)
   2765     addiu   t9, t4, 16   // t9 = shift count (d3 + 16)
   2766     srav    v1, v1, t9
   2767     mul     v1, v1, t3   // re-apply the sign of the even element
   2768     mul     t6, t6, s0   // t6 = |w| for the odd element
   2769     andi    t7, 0xffff
   2770     addiu   a2, a2, 4    // advance workspace to the next pair
   2771     addiu   a1, a1, 4    // advance divisors to the next pair
   2772     add     s1, t6, t5   // s1 = |w| + d1 (odd element)
   2773     andi    s1, 0xffff
   2774     sh      v1, 0(a0)    // store even element
   2775 
   2776     mul     s2, s1, t7   // s2 = product for the odd element
   2777     addiu   s1, t8, 16   // s1 = shift count (d3 + 16)
   2778     srav    s2, s2, s1
   2779     mul     s2,s2, s0    // re-apply the sign of the odd element
   2780     lh      t0, 0(a2)    // preload operands for the next pair
   2781     lh      t1, 0(a1)
   2782     sra     t3, t0, 15
   2783     sll     t3, t3, 1
   2784     addiu   t3, t3, 1
   2785     mul     t0, t0, t3
   2786     lh      t2, 128(a1)
   2787     lh      t4, 384(a1)
   2788     lh      t5, 130(a1)
   2789     lh      t8, 386(a1)
   2790     lh      t6, 2(a2)
   2791     lh      t7, 2(a1)
   2792     sh      s2, 2(a0)    // store odd element
            // NOTE(review): the next five instructions repeat the load /
            // sign / absolute-value sequence performed a few lines above with
            // the same a2.  They look redundant unless coef_block may overlap
            // workspace -- confirm before removing.
   2793     lh      t0, 0(a2)
   2794     sra     t3, t0, 15
   2795     sll     t3, t3, 1
   2796     addiu   t3, t3, 1
   2797     mul     t0, t0,t3
   2798     bne     a2, v0, 1b
   2799      addiu  a0, a0, 4    // (delay slot) advance coef_block to the next pair
   2800 
            // Drain the pipeline: finish the pair loaded by the last loop pass.
   2801     andi    t1, 0xffff
   2802     add     t9, t0, t2
   2803     andi    t9, 0xffff
   2804     mul     v1, t9, t1
   2805     sra     s0, t6, 15
   2806     sll     s0, s0, 1
   2807     addiu   s0, s0, 1
   2808     addiu   t9, t4, 16
   2809     srav    v1, v1, t9
   2810     mul     v1, v1, t3
   2811     mul     t6, t6, s0
   2812     andi    t7, 0xffff
   2813     sh      v1, 0(a0)    // store element 62
   2814     add     s1, t6, t5
   2815     andi    s1, 0xffff
   2816     mul     s2, s1, t7
   2817     addiu   s1, t8, 16
   2818     addiu   a2, a2, 4    // NOTE(review): a2 is not read again in this function
   2819     addiu   a1, a1, 4    // NOTE(review): a1 is not read again in this function
   2820     srav    s2, s2, s1
   2821     mul     s2, s2, s0
   2822     sh      s2, 2(a0)    // store element 63
   2823 
   2824     RESTORE_REGS_FROM_STACK 16, s0, s1, s2
   2825 
   2826     j       ra
   2827      nop
   2828 
   2829 END(jsimd_quantize_mips_dspr2)
   2830 
   2831 /*****************************************************************************/
   2832 LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
   2833 /*
   2834  * a0     - coef_block
   2835  * a1     - divisors
   2836  * a2     - workspace
         *
         * For each of 64 single-precision values, stores to coef_block the
         * halfword
         *
         *   trunc(workspace[i] * divisors[i] + 16384.5) - 16384
         *
         * The +16384.5 / -16384 pair turns the truncating conversion into
         * rounding; this presumably relies on the product staying above
         * -16384.5 -- confirm against the C reference.
         *
         * Eight elements are handled per loop pass (two interleaved groups of
         * four), eight passes in total.
         *
         * No registers are saved.  Written here: t0-t4, a0-a2 and the
         * floating-point registers f0-f16 (even-numbered only).
   2837  */
   2838 
   2839     .set at
   2840 
   2841     li         t1, 0x46800100     // IEEE-754 single-precision bit pattern of 16384.5
   2842     mtc1       t1, f0             // f0 = 16384.5 for the whole loop
   2843     li         t0, 63             // t0 counts down by 8; loop runs while t0 >= 0 (8 passes)
   2844 0:
   2845     lwc1       f2, 0(a2)          // workspace[0..3] -> f2, f4, f6, f8
   2846     lwc1       f10, 0(a1)         // divisors[0..3]  -> f10, f12, f14, f16
   2847     lwc1       f4, 4(a2)
   2848     lwc1       f12, 4(a1)
   2849     lwc1       f6, 8(a2)
   2850     lwc1       f14, 8(a1)
   2851     lwc1       f8, 12(a2)
   2852     lwc1       f16, 12(a1)
   2853     madd.s     f2, f0, f2, f10    // f2 = f2 * f10 + f0
   2854     madd.s     f4, f0, f4, f12    // f4 = f4 * f12 + f0
   2855     madd.s     f6, f0, f6, f14    // f6 = f6 * f14 + f0
   2856     madd.s     f8, f0, f8, f16    // f8 = f8 * f16 + f0
   2857     lwc1       f10, 16(a1)        // start loading divisors[4..7]
   2858     lwc1       f12, 20(a1)
   2859     trunc.w.s  f2, f2             // convert to integer, rounding toward zero
   2860     trunc.w.s  f4, f4
   2861     trunc.w.s  f6, f6
   2862     trunc.w.s  f8, f8
   2863     lwc1       f14, 24(a1)
   2864     lwc1       f16, 28(a1)
   2865     mfc1       t1, f2             // move the four integer results to t1-t4
   2866     mfc1       t2, f4
   2867     mfc1       t3, f6
   2868     mfc1       t4, f8
   2869     lwc1       f2, 16(a2)         // workspace[4..7] -> f2, f4, f6, f8
   2870     lwc1       f4, 20(a2)
   2871     lwc1       f6, 24(a2)
   2872     lwc1       f8, 28(a2)
   2873     madd.s     f2, f0, f2, f10
   2874     madd.s     f4, f0, f4, f12
   2875     madd.s     f6, f0, f6, f14
   2876     madd.s     f8, f0, f8, f16
   2877     addiu      t1, t1, -16384     // remove the integer part of the 16384.5 bias
   2878     addiu      t2, t2, -16384
   2879     addiu      t3, t3, -16384
   2880     addiu      t4, t4, -16384
   2881     trunc.w.s  f2, f2
   2882     trunc.w.s  f4, f4
   2883     trunc.w.s  f6, f6
   2884     trunc.w.s  f8, f8
   2885     sh         t1, 0(a0)          // coef_block[0..3]
   2886     sh         t2, 2(a0)
   2887     sh         t3, 4(a0)
   2888     sh         t4, 6(a0)
   2889     mfc1       t1, f2
   2890     mfc1       t2, f4
   2891     mfc1       t3, f6
   2892     mfc1       t4, f8
   2893     addiu      t0, t0, -8
   2894     addiu      a2, a2, 32         // advance workspace by eight floats
   2895     addiu      a1, a1, 32         // advance divisors by eight floats
   2896     addiu      t1, t1, -16384
   2897     addiu      t2, t2, -16384
   2898     addiu      t3, t3, -16384
   2899     addiu      t4, t4, -16384
   2900     sh         t1, 8(a0)          // coef_block[4..7]
   2901     sh         t2, 10(a0)
   2902     sh         t3, 12(a0)
   2903     sh         t4, 14(a0)
   2904     bgez       t0, 0b
   2905      addiu     a0, a0, 16         // (delay slot) advance coef_block by eight halfwords
   2906 
   2907     j          ra
   2908      nop
   2909 
   2910 END(jsimd_quantize_float_mips_dspr2)
   2911 /*****************************************************************************/
   2911 LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
   2912 /*
   2913  * a0     - compptr->dct_table
   2914  * a1     - coef_block
   2915  * a2     - output_buf
   2916  * a3     - output_col
         *
         * Produces a 2x2 block of output bytes from one coefficient block.
         *
         * Pass 1 (fully unrolled) handles the halfword columns at byte offsets
         * 0, 2, 6, 10 and 14.  For each column it multiplies the coefficients
         * in rows 0, 1, 3, 5 and 7 (row stride 16 bytes) by the matching
         * dct_table entries and forms
         *     tmp10 = row0 << 15
         *     tmp0  = row1*29692 - row3*10426 + row5*6967 - row7*5906
         * then stores (tmp10 + tmp0) and (tmp10 - tmp0), each rounded and
         * shifted right by 13, into a 40-byte scratch area on the stack:
         * sums in words 0..4, differences in words 5..9.
         *
         * Pass 2 applies the same formula across each group of five scratch
         * words, rounds and shifts right by 20, saturates to a signed byte,
         * adds 128, and writes two bytes at output_buf[0] + output_col and two
         * at output_buf[1] + output_col.
         *
         * Presumably the reduced-size 2x2 inverse DCT of the C reference
         * implementation -- confirm against it.
         *
         * Registers written and not restored here: t0-t9, v0, ac0, ac1.
         * s0-s5 are passed to SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK
         * (macros from jsimd_mips_dspr2_asm.h, not visible in this file).
   2917  */
   2918     .set at
   2919 
   2920     SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
   2921 
   2922     addiu     sp, sp, -40       // reserve 40 bytes (10 words) of scratch
   2923     move      v0, sp            // v0 = scratch base
   2924     addiu     s2, zero, 29692   // scalar multipliers used by pass 2
   2925     addiu     s3, zero, -10426
   2926     addiu     s4, zero, 6967
   2927     addiu     s5, zero, -5906
            // ---- Pass 1, column at byte offset 0 ----
   2928     lh        t0, 0(a1)         // t0 = inptr[DCTSIZE*0]
   2929     lh        t5, 0(a0)         // t5 = quantptr[DCTSIZE*0]
   2930     lh        t1, 48(a1)        // t1 = inptr[DCTSIZE*3]
   2931     lh        t6, 48(a0)        // t6 = quantptr[DCTSIZE*3]
   2932     mul       t4, t5, t0        // t4 = dequantized row 0
   2933     lh        t0, 16(a1)        // t0 = inptr[DCTSIZE*1]
   2934     lh        t5, 16(a0)        // t5 = quantptr[DCTSIZE*1]
   2935     mul       t6, t6, t1        // t6 = dequantized row 3
   2936     mul       t5, t5, t0        // t5 = dequantized row 1
   2937     lh        t2, 80(a1)        // t2 = inptr[DCTSIZE*5]
   2938     lh        t7, 80(a0)        // t7 = quantptr[DCTSIZE*5]
   2939     lh        t3, 112(a1)       // t3 = inptr[DCTSIZE*7]
   2940     lh        t8, 112(a0)       // t8 = quantptr[DCTSIZE*7]
   2941     mul       t7, t7, t2        // t7 = dequantized row 5
   2942     mult      zero, zero        // ac0 = 0
   2943     mul       t8, t8, t3        // t8 = dequantized row 7
   2944     li        s0, 0x73FCD746    // s0 = (29692 << 16) | (-10426 & 0xffff)
   2945     li        s1, 0x1B37E8EE    // s1 = (6967 << 16) | (-5906 & 0xffff)
   2946     ins       t6, t5, 16, 16    // t6 = t5|t6
   2947     sll       t4, t4, 15        // t4 = tmp10
   2948     dpa.w.ph  $ac0, t6, s0      // ac0 += row1*29692 + row3*-10426
   2949     lh        t1, 2(a1)         // begin loading the column at byte offset 2
   2950     lh        t6, 2(a0)
   2951     ins       t8, t7, 16, 16    // t8 = t7|t8
   2952     dpa.w.ph  $ac0, t8, s1      // ac0 += row5*6967 + row7*-5906
   2953     mflo      t0, $ac0          // t0 = tmp0 (column at offset 0)
   2954     mul       t5, t6, t1        // t5 = dequantized row 0 of the next column
   2955     lh        t1, 18(a1)
   2956     lh        t6, 18(a0)
   2957     lh        t2, 50(a1)
   2958     lh        t7, 50(a0)
   2959     mul       t6, t6, t1        // t6 = dequantized row 1
   2960     subu      t8, t4, t0        // t8 = tmp10 - tmp0
   2961     mul       t7, t7, t2        // t7 = dequantized row 3
   2962     addu      t0, t4, t0        // t0 = tmp10 + tmp0
   2963     shra_r.w  t0, t0, 13
   2964     lh        t1, 82(a1)
   2965     lh        t2, 82(a0)
   2966     lh        t3, 114(a1)
   2967     lh        t4, 114(a0)
   2968     shra_r.w  t8, t8, 13
   2969     mul       t1, t1, t2        // t1 = dequantized row 5
   2970     mul       t3, t3, t4        // t3 = dequantized row 7
   2971     sw        t0, 0(v0)         // scratch word 0
   2972     sw        t8, 20(v0)        // scratch word 5
            // ---- Pass 1, column at byte offset 2 ----
   2973     sll       t4, t5, 15        // t4 = tmp10
   2974     ins       t7, t6, 16, 16    // t7 = row1|row3
   2975     mult      zero, zero        // ac0 = 0
   2976     dpa.w.ph  $ac0, t7, s0
   2977     ins       t3, t1, 16, 16    // t3 = row5|row7
   2978     lh        t1, 6(a1)         // begin loading the column at byte offset 6
   2979     lh        t6, 6(a0)
   2980     dpa.w.ph  $ac0, t3, s1
   2981     mflo      t0, $ac0          // t0 = tmp0 (column at offset 2)
   2982     mul       t5, t6, t1
   2983     lh        t1, 22(a1)
   2984     lh        t6, 22(a0)
   2985     lh        t2, 54(a1)
   2986     lh        t7, 54(a0)
   2987     mul       t6, t6, t1
   2988     subu      t8, t4, t0
   2989     mul       t7, t7, t2
   2990     addu      t0, t4, t0
   2991     shra_r.w  t0, t0, 13
   2992     lh        t1, 86(a1)
   2993     lh        t2, 86(a0)
   2994     lh        t3, 118(a1)
   2995     lh        t4, 118(a0)
   2996     shra_r.w  t8, t8, 13
   2997     mul       t1, t1, t2
   2998     mul       t3, t3, t4
   2999     sw        t0, 4(v0)         // scratch word 1
   3000     sw        t8, 24(v0)        // scratch word 6
            // ---- Pass 1, column at byte offset 6 ----
   3001     sll       t4, t5, 15
   3002     ins       t7, t6, 16, 16
   3003     mult      zero, zero        // ac0 = 0
   3004     dpa.w.ph  $ac0, t7, s0
   3005     ins       t3, t1, 16, 16
   3006     lh        t1, 10(a1)        // begin loading the column at byte offset 10
   3007     lh        t6, 10(a0)
   3008     dpa.w.ph  $ac0, t3, s1
   3009     mflo      t0, $ac0          // t0 = tmp0 (column at offset 6)
   3010     mul       t5, t6, t1
   3011     lh        t1, 26(a1)
   3012     lh        t6, 26(a0)
   3013     lh        t2, 58(a1)
   3014     lh        t7, 58(a0)
   3015     mul       t6, t6, t1
   3016     subu      t8, t4, t0
   3017     mul       t7, t7, t2
   3018     addu      t0, t4, t0
   3019     shra_r.w  t0, t0, 13
   3020     lh        t1, 90(a1)
   3021     lh        t2, 90(a0)
   3022     lh        t3, 122(a1)
   3023     lh        t4, 122(a0)
   3024     shra_r.w  t8, t8, 13
   3025     mul       t1, t1, t2
   3026     mul       t3, t3, t4
   3027     sw        t0, 8(v0)         // scratch word 2
   3028     sw        t8, 28(v0)        // scratch word 7
            // ---- Pass 1, column at byte offset 10 ----
   3029     sll       t4, t5, 15
   3030     ins       t7, t6, 16, 16
   3031     mult      zero, zero        // ac0 = 0
   3032     dpa.w.ph  $ac0, t7, s0
   3033     ins       t3, t1, 16, 16
   3034     lh        t1, 14(a1)        // begin loading the column at byte offset 14
   3035     lh        t6, 14(a0)
   3036     dpa.w.ph  $ac0, t3, s1
   3037     mflo      t0, $ac0          // t0 = tmp0 (column at offset 10)
   3038     mul       t5, t6, t1
   3039     lh        t1, 30(a1)
   3040     lh        t6, 30(a0)
   3041     lh        t2, 62(a1)
   3042     lh        t7, 62(a0)
   3043     mul       t6, t6, t1
   3044     subu      t8, t4, t0
   3045     mul       t7, t7, t2
   3046     addu      t0, t4, t0
   3047     shra_r.w  t0, t0, 13
   3048     lh        t1, 94(a1)
   3049     lh        t2, 94(a0)
   3050     lh        t3, 126(a1)
   3051     lh        t4, 126(a0)
   3052     shra_r.w  t8, t8, 13
   3053     mul       t1, t1, t2
   3054     mul       t3, t3, t4
   3055     sw        t0, 12(v0)        // scratch word 3
   3056     sw        t8, 32(v0)        // scratch word 8
            // ---- Pass 1, column at byte offset 14 ----
   3057     sll       t4, t5, 15
   3058     ins       t7, t6, 16, 16
   3059     mult      zero, zero        // ac0 = 0
   3060     dpa.w.ph  $ac0, t7, s0
   3061     ins       t3, t1, 16, 16
   3062     dpa.w.ph  $ac0, t3, s1
   3063     mflo      t0, $ac0          // t0 = tmp0 (column at offset 14)
   3064     lw        t9, 0(a2)         // t9 = output_buf[0]
   3065     lw        t3, 0(v0)         // pass 2 operands are loaded while pass 1 finishes
   3066     lw        t7, 4(v0)
   3067     lw        t1, 8(v0)
   3068     addu      t9, t9, a3        // t9 = output_buf[0] + output_col
   3069     sll       t3, t3, 15        // t3 = tmp10 for output row 0
   3070     subu      t8, t4, t0
   3071     addu      t0, t4, t0
   3072     shra_r.w  t0, t0, 13
   3073     shra_r.w  t8, t8, 13
   3074     sw        t0, 16(v0)        // scratch word 4
   3075     sw        t8, 36(v0)        // scratch word 9
            // ---- Pass 2, output row 0 (scratch words 0..4) ----
   3076     lw        t5, 12(v0)
   3077     lw        t6, 16(v0)
   3078     mult      t7, s2            // ac0  = word1 * 29692
   3079     madd      t1, s3            // ac0 += word2 * -10426
   3080     madd      t5, s4            // ac0 += word3 * 6967
   3081     madd      t6, s5            // ac0 += word4 * -5906
   3082     lw        t5, 24(v0)        // start loading scratch words 6..9 for output row 1
   3083     lw        t7, 28(v0)
   3084     mflo      t0, $ac0          // t0 = tmp0 for output row 0
   3085     lw        t8, 32(v0)
   3086     lw        t2, 36(v0)
   3087     mult      $ac1, t5, s2      // ac1  = word6 * 29692
   3088     madd      $ac1, t7, s3      // ac1 += word7 * -10426
   3089     madd      $ac1, t8, s4      // ac1 += word8 * 6967
   3090     madd      $ac1, t2, s5      // ac1 += word9 * -5906
   3091     addu      t1, t3, t0        // tmp10 + tmp0
   3092     subu      t6, t3, t0        // tmp10 - tmp0
   3093     shra_r.w  t1, t1, 20
   3094     shra_r.w  t6, t6, 20
   3095     mflo      t4, $ac1          // t4 = tmp0 for output row 1
   3096     shll_s.w  t1, t1, 24        // saturating shift left, then sra: clamp to a signed byte
   3097     shll_s.w  t6, t6, 24
   3098     sra       t1, t1, 24
   3099     sra       t6, t6, 24
   3100     addiu     t1, t1, 128       // re-centre to the unsigned byte range
   3101     addiu     t6, t6, 128
   3102     lw        t0, 20(v0)        // scratch word 5
   3103     sb        t1, 0(t9)         // output row 0, first byte
   3104     sb        t6, 1(t9)         // output row 0, second byte
            // ---- Pass 2, output row 1 (scratch words 5..9) ----
   3105     sll       t0, t0, 15        // t0 = tmp10 for output row 1
   3106     lw        t9, 4(a2)         // t9 = output_buf[1]
   3107     addu      t1, t0, t4
   3108     subu      t6, t0, t4
   3109     addu      t9, t9, a3        // t9 = output_buf[1] + output_col
   3110     shra_r.w  t1, t1, 20
   3111     shra_r.w  t6, t6, 20
   3112     shll_s.w  t1, t1, 24
   3113     shll_s.w  t6, t6, 24
   3114     sra       t1, t1, 24
   3115     sra       t6, t6, 24
   3116     addiu     t1, t1, 128
   3117     addiu     t6, t6, 128
   3118     sb        t1, 0(t9)         // output row 1, first byte
   3119     sb        t6, 1(t9)         // output row 1, second byte
   3120     addiu     sp, sp, 40        // release the scratch area
   3121 
   3122     RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
   3123 
   3124     j         ra
   3125      nop
   3126 
   3127 END(jsimd_idct_2x2_mips_dspr2)
   3129 
   3130 /*****************************************************************************/
   3131 LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
   3132 /*
   3133  * a0     - compptr->dct_table
   3134  * a1     - coef_block
   3135  * a2     - output_buf
   3136  * a3     - output_col
   3137  * 16(sp) - workspace[DCTSIZE*4];  // buffers data between passes
   3138  */
   3139 
   3140     .set at
   3141     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   3142 
   3143     lw        v1, 48(sp)
   3144     move      t0, a1
   3145     move      t1, v1
   3146     li        t9, 4
   3147     li        s0, 0x2e75f93e
   3148     li        s1, 0x21f9ba79
   3149     li        s2, 0xecc2efb0
   3150     li        s3, 0x52031ccd
   3151 
   3152 0:
   3153     lh        s6, 32(t0)        // inptr[DCTSIZE*2]
   3154     lh        t6, 32(a0)        // quantptr[DCTSIZE*2]
   3155     lh        s7, 96(t0)        // inptr[DCTSIZE*6]
   3156     lh        t7, 96(a0)        // quantptr[DCTSIZE*6]
   3157     mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
   3158     lh        s4, 0(t0)         // inptr[DCTSIZE*0]
   3159     mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
   3160     lh        s5, 0(a0)         // quantptr[0]
   3161     li        s6, 15137
   3162     li        s7, 6270
   3163     mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
   3164     mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
   3165     lh        t5, 112(t0)       // inptr[DCTSIZE*7]
   3166     mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
   3167     lh        s4, 112(a0)       // quantptr[DCTSIZE*7]
   3168     lh        v0, 80(t0)        // inptr[DCTSIZE*5]
   3169     lh        s5, 80(a0)        // quantptr[DCTSIZE*5]
   3170     lh        s6, 48(a0)        // quantptr[DCTSIZE*3]
   3171     sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
   3172     lh        s7, 16(a0)        // quantptr[DCTSIZE*1]
   3173     lh        t8, 16(t0)        // inptr[DCTSIZE*1]
   3174     subu      t6, t6, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
   3175     lh        t7, 48(t0)        // inptr[DCTSIZE*3]
   3176     mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
   3177     mul       v0, s5, v0        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
   3178     mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
   3179     mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
   3180     addu      t3, t2, t6        // tmp10 = tmp0 + z2
   3181     subu      t4, t2, t6        // tmp10 = tmp0 - z2
   3182     mult      $ac0, zero, zero
   3183     mult      $ac1, zero, zero
   3184     ins       t5, v0, 16, 16
   3185     ins       t7, t8, 16, 16
   3186     addiu     t9, t9, -1
   3187     dpa.w.ph  $ac0, t5, s0
   3188     dpa.w.ph  $ac0, t7, s1
   3189     dpa.w.ph  $ac1, t5, s2
   3190     dpa.w.ph  $ac1, t7, s3
   3191     mflo      s4, $ac0
   3192     mflo      s5, $ac1
   3193     addiu     a0, a0, 2
   3194     addiu     t1, t1, 4
   3195     addiu     t0, t0, 2
   3196     addu      t6, t4, s4
   3197     subu      t5, t4, s4
   3198     addu      s6, t3, s5
   3199     subu      s7, t3, s5
   3200     shra_r.w  t6, t6, 12        // DESCALE(tmp12 + temp1, 12)
   3201     shra_r.w  t5, t5, 12        // DESCALE(tmp12 - temp1, 12)
   3202     shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
   3203     shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
   3204     sw        t6, 28(t1)
   3205     sw        t5, 60(t1)
   3206     sw        s6, -4(t1)
   3207     bgtz      t9, 0b
   3208      sw       s7, 92(t1)
   3209     // second loop three pass
   3210     li        t9, 3
   3211 1:
   3212     lh        s6, 34(t0)        // inptr[DCTSIZE*2]
   3213     lh        t6, 34(a0)        // quantptr[DCTSIZE*2]
   3214     lh        s7, 98(t0)        // inptr[DCTSIZE*6]
   3215     lh        t7, 98(a0)        // quantptr[DCTSIZE*6]
   3216     mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
   3217     lh        s4, 2(t0)         // inptr[DCTSIZE*0]
   3218     mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
   3219     lh        s5, 2(a0)         // quantptr[DCTSIZE*0]
   3220     li        s6, 15137
   3221     li        s7, 6270
   3222     mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
   3223     mul       v0, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
   3224     lh        t5, 114(t0)       // inptr[DCTSIZE*7]
   3225     mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
   3226     lh        s4, 114(a0)       // quantptr[DCTSIZE*7]
   3227     lh        s5, 82(a0)        // quantptr[DCTSIZE*5]
   3228     lh        t6, 82(t0)        // inptr[DCTSIZE*5]
   3229     sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
   3230     lh        s6, 50(a0)        // quantptr[DCTSIZE*3]
   3231     lh        t8, 18(t0)        // inptr[DCTSIZE*1]
   3232     subu      v0, v0, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
   3233     lh        t7, 50(t0)        // inptr[DCTSIZE*3]
   3234     lh        s7, 18(a0)        // quantptr[DCTSIZE*1]
   3235     mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
   3236     mul       t6, s5, t6        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
   3237     mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
   3238     mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
   3239     addu      t3, t2, v0        // tmp10 = tmp0 + z2
   3240     subu      t4, t2, v0        // tmp10 = tmp0 - z2
   3241     mult      $ac0, zero, zero
   3242     mult      $ac1, zero, zero
   3243     ins       t5, t6, 16, 16
   3244     ins       t7, t8, 16, 16
   3245     dpa.w.ph  $ac0, t5, s0
   3246     dpa.w.ph  $ac0, t7, s1
   3247     dpa.w.ph  $ac1, t5, s2
   3248     dpa.w.ph  $ac1, t7, s3
   3249     mflo      t5, $ac0
   3250     mflo      t6, $ac1
   3251     addiu     t9, t9, -1
   3252     addiu     t0, t0, 2
   3253     addiu     a0, a0, 2
   3254     addiu     t1, t1, 4
   3255     addu      s5, t4, t5
   3256     subu      s4, t4, t5
   3257     addu      s6, t3, t6
   3258     subu      s7, t3, t6
   3259     shra_r.w  s5, s5, 12        // DESCALE(tmp12 + temp1, 12)
   3260     shra_r.w  s4, s4, 12        // DESCALE(tmp12 - temp1, 12)
   3261     shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
   3262     shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
   3263     sw        s5, 32(t1)
   3264     sw        s4, 64(t1)
   3265     sw        s6, 0(t1)
   3266     bgtz      t9, 1b
   3267      sw       s7, 96(t1)
   3268     move      t1, v1
   3269     li        s4, 15137
   3270     lw        s6, 8(t1)         // wsptr[2]
   3271     li        s5, 6270
   3272     lw        s7, 24(t1)        // wsptr[6]
   3273     mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
   3274     lw        t2, 0(t1)         // wsptr[0]
   3275     mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
   3276     lh        t5, 28(t1)        // wsptr[7]
   3277     lh        t6, 20(t1)        // wsptr[5]
   3278     lh        t7, 12(t1)        // wsptr[3]
   3279     lh        t8, 4(t1)         // wsptr[1]
   3280     ins       t5, t6, 16, 16
   3281     ins       t7, t8, 16, 16
   3282     mult      $ac0, zero, zero
   3283     dpa.w.ph  $ac0, t5, s0
   3284     dpa.w.ph  $ac0, t7, s1
   3285     mult      $ac1, zero, zero
   3286     dpa.w.ph  $ac1, t5, s2
   3287     dpa.w.ph  $ac1, t7, s3
   3288     sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
   3289     mflo      s6, $ac0
   3290     // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
   3291     subu      s4, s4, s5
   3292     addu      t3, t2, s4        // tmp10 = tmp0 + z2
   3293     mflo      s7, $ac1
   3294     subu      t4, t2, s4        // tmp10 = tmp0 - z2
   3295     addu      t7, t4, s6
   3296     subu      t8, t4, s6
   3297     addu      t5, t3, s7
   3298     subu      t6, t3, s7
   3299     shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
   3300     shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
   3301     shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
   3302     shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
   3303     sll       s4, t9, 2
   3304     lw        v0, 0(a2)         // output_buf[ctr]
   3305     shll_s.w  t5, t5, 24
   3306     shll_s.w  t6, t6, 24
   3307     shll_s.w  t7, t7, 24
   3308     shll_s.w  t8, t8, 24
   3309     sra       t5, t5, 24
   3310     sra       t6, t6, 24
   3311     sra       t7, t7, 24
   3312     sra       t8, t8, 24
   3313     addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
   3314     addiu     t5, t5, 128
   3315     addiu     t6, t6, 128
   3316     addiu     t7, t7, 128
   3317     addiu     t8, t8, 128
   3318     sb        t5, 0(v0)
   3319     sb        t7, 1(v0)
   3320     sb        t8, 2(v0)
   3321     sb        t6, 3(v0)
   3322     // 2
   3323     li        s4, 15137
   3324     lw        s6, 40(t1)        // wsptr[2]
   3325     li        s5, 6270
   3326     lw        s7, 56(t1)        // wsptr[6]
   3327     mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
   3328     lw        t2, 32(t1)        // wsptr[0]
   3329     mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
   3330     lh        t5, 60(t1)        // wsptr[7]
   3331     lh        t6, 52(t1)        // wsptr[5]
   3332     lh        t7, 44(t1)        // wsptr[3]
   3333     lh        t8, 36(t1)        // wsptr[1]
   3334     ins       t5, t6, 16, 16
   3335     ins       t7, t8, 16, 16
   3336     mult      $ac0, zero, zero
   3337     dpa.w.ph  $ac0, t5, s0
   3338     dpa.w.ph  $ac0, t7, s1
   3339     mult      $ac1, zero, zero
   3340     dpa.w.ph  $ac1, t5, s2
   3341     dpa.w.ph  $ac1, t7, s3
   3342     sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
   3343     mflo      s6, $ac0
   3344     // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
   3345     subu      s4, s4, s5
   3346     addu      t3, t2, s4        // tmp10 = tmp0 + z2
   3347     mflo      s7, $ac1
   3348     subu      t4, t2, s4        // tmp10 = tmp0 - z2
   3349     addu      t7, t4, s6
   3350     subu      t8, t4, s6
   3351     addu      t5, t3, s7
   3352     subu      t6, t3, s7
   3353     shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
   3354     shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
   3355     shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
   3356     shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
   3357     sll       s4, t9, 2
   3358     lw        v0, 4(a2)         // output_buf[ctr]
   3359     shll_s.w  t5, t5, 24
   3360     shll_s.w  t6, t6, 24
   3361     shll_s.w  t7, t7, 24
   3362     shll_s.w  t8, t8, 24
   3363     sra       t5, t5, 24
   3364     sra       t6, t6, 24
   3365     sra       t7, t7, 24
   3366     sra       t8, t8, 24
   3367     addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
   3368     addiu     t5, t5, 128
   3369     addiu     t6, t6, 128
   3370     addiu     t7, t7, 128
   3371     addiu     t8, t8, 128
   3372     sb        t5, 0(v0)
   3373     sb        t7, 1(v0)
   3374     sb        t8, 2(v0)
   3375     sb        t6, 3(v0)
   3376     // 3
   3377     li        s4, 15137
   3378     lw        s6, 72(t1)        // wsptr[2]
   3379     li        s5, 6270
   3380     lw        s7, 88(t1)        // wsptr[6]
   3381     mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
   3382     lw        t2, 64(t1)        // wsptr[0]
   3383     mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
   3384     lh        t5, 92(t1)        // wsptr[7]
   3385     lh        t6, 84(t1)        // wsptr[5]
   3386     lh        t7, 76(t1)        // wsptr[3]
   3387     lh        t8, 68(t1)        // wsptr[1]
   3388     ins       t5, t6, 16, 16
   3389     ins       t7, t8, 16, 16
   3390     mult      $ac0, zero, zero
   3391     dpa.w.ph  $ac0, t5, s0
   3392     dpa.w.ph  $ac0, t7, s1
   3393     mult      $ac1, zero, zero
   3394     dpa.w.ph  $ac1, t5, s2
   3395     dpa.w.ph  $ac1, t7, s3
   3396     sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
   3397     mflo      s6, $ac0
   3398     // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
   3399     subu      s4, s4, s5
   3400     addu      t3, t2, s4        // tmp10 = tmp0 + z2
   3401     mflo      s7, $ac1
   3402     subu      t4, t2, s4        // tmp10 = tmp0 - z2
   3403     addu      t7, t4, s6
   3404     subu      t8, t4, s6
   3405     addu      t5, t3, s7
   3406     subu      t6, t3, s7
   3407     shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
   3408     shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
   3409     shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
   3410     shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
   3411     sll       s4, t9, 2
   3412     lw        v0, 8(a2)         // output_buf[ctr]
   3413     shll_s.w  t5, t5, 24
   3414     shll_s.w  t6, t6, 24
   3415     shll_s.w  t7, t7, 24
   3416     shll_s.w  t8, t8, 24
   3417     sra       t5, t5, 24
   3418     sra       t6, t6, 24
   3419     sra       t7, t7, 24
   3420     sra       t8, t8, 24
   3421     addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
   3422     addiu     t5, t5, 128
   3423     addiu     t6, t6, 128
   3424     addiu     t7, t7, 128
   3425     addiu     t8, t8, 128
   3426     sb        t5, 0(v0)
   3427     sb        t7, 1(v0)
   3428     sb        t8, 2(v0)
   3429     sb        t6, 3(v0)
   3430     li        s4, 15137
   3431     lw        s6, 104(t1)       // wsptr[2]
   3432     li        s5, 6270
   3433     lw        s7, 120(t1)       // wsptr[6]
   3434     mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
   3435     lw        t2, 96(t1)        // wsptr[0]
   3436     mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
   3437     lh        t5, 124(t1)       // wsptr[7]
   3438     lh        t6, 116(t1)       // wsptr[5]
   3439     lh        t7, 108(t1)       // wsptr[3]
   3440     lh        t8, 100(t1)       // wsptr[1]
   3441     ins       t5, t6, 16, 16
   3442     ins       t7, t8, 16, 16
   3443     mult      $ac0, zero, zero
   3444     dpa.w.ph  $ac0, t5, s0
   3445     dpa.w.ph  $ac0, t7, s1
   3446     mult      $ac1, zero, zero
   3447     dpa.w.ph  $ac1, t5, s2
   3448     dpa.w.ph  $ac1, t7, s3
   3449     sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
   3450     mflo      s6, $ac0
   3451     // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
   3452     subu      s4, s4, s5
   3453     addu      t3, t2, s4        // tmp10 = tmp0 + z2;
   3454     mflo      s7, $ac1
   3455     subu      t4, t2, s4        // tmp10 = tmp0 - z2;
   3456     addu      t7, t4, s6
   3457     subu      t8, t4, s6
   3458     addu      t5, t3, s7
   3459     subu      t6, t3, s7
   3460     shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
   3461     shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
   3462     shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
   3463     shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
   3464     sll       s4, t9, 2
   3465     lw        v0, 12(a2)        // output_buf[ctr]
   3466     shll_s.w  t5, t5, 24
   3467     shll_s.w  t6, t6, 24
   3468     shll_s.w  t7, t7, 24
   3469     shll_s.w  t8, t8, 24
   3470     sra       t5, t5, 24
   3471     sra       t6, t6, 24
   3472     sra       t7, t7, 24
   3473     sra       t8, t8, 24
   3474     addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
   3475     addiu     t5, t5, 128
   3476     addiu     t6, t6, 128
   3477     addiu     t7, t7, 128
   3478     addiu     t8, t8, 128
   3479     sb        t5, 0(v0)
   3480     sb        t7, 1(v0)
   3481     sb        t8, 2(v0)
   3482     sb        t6, 3(v0)
   3483 
   3484     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   3485 
   3486     j         ra
   3487      nop
   3488 END(jsimd_idct_4x4_mips_dspr2)
   3489 
   3490 /*****************************************************************************/
   3491 LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
   3492 /*
   3493  * a0     - compptr->dct_table
   3494  * a1     - coef_block
   3495  * a2     - output_buf
   3496  * a3     - output_col
         *
         * 6x6 inverse DCT with dequantization (going by the routine name).
         *
         * Pass 1 runs six times, once per column.  It multiplies the 16-bit
         * coefficients at rows 0..5 of the column by the matching dct_table
         * entries and stores six 32-bit results into a 6x6 work array (144
         * bytes) that is allocated on the stack below the saved registers.
         *
         * Pass 2 runs six times, once per work-array row.  It loads the row
         * pointer from output_buf, adds output_col, and stores six bytes.
         * Each result is clamped with a saturating shift and biased by +128.
         *
         * Fixed-point multipliers (value * 8192, rounded):
         *   t9 = 5793  ~ 0.707106781
         *   s0 = 10033 ~ 1.224744871
         *   s1 = 2998  ~ 0.366025404
   3497  */
   3498     .set at                             // let the assembler use $at when expanding macros
   3499 
   3500     SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7  // macro from the included header; presumably spills s0-s7
   3501 
   3502     addiu     sp, sp, -144              // reserve the 6x6 word work array
   3503     move      v0, sp                    // v0 = work-array write pointer (column 0)
   3504     addiu     v1, v0, 24                // v1 = v0 + 6 columns * 4 bytes: pass-1 end marker
   3505     addiu     t9, zero, 5793            // see multiplier table above
   3506     addiu     s0, zero, 10033
   3507     addiu     s1, zero, 2998
   3508 
   3509 1:                                      // Pass 1: one iteration per column
   3510     lh        s2, 0(a0)   // q0 = quantptr[ 0]
   3511     lh        s3, 32(a0)  // q1 = quantptr[16]
   3512     lh        s4, 64(a0)  // q2 = quantptr[32]
   3513     lh        t2, 64(a1)  // tmp2 = inptr[32]
   3514     lh        t1, 32(a1)  // tmp1 = inptr[16]
   3515     lh        t0, 0(a1)   // tmp0 = inptr[ 0]
   3516     mul       t2, t2, s4  // tmp2 = tmp2 * q2
   3517     mul       t1, t1, s3  // tmp1 = tmp1 * q1
   3518     mul       t0, t0, s2  // tmp0 = tmp0 * q0
   3519     lh        t6, 16(a1)  // z1 = inptr[ 8]
   3520     lh        t8, 80(a1)  // z3 = inptr[40]
   3521     lh        t7, 48(a1)  // z2 = inptr[24]
   3522     lh        s2, 16(a0)  // q0 = quantptr[ 8]
   3523     lh        s4, 80(a0)  // q2 = quantptr[40]
   3524     lh        s3, 48(a0)  // q1 = quantptr[24]
   3525     mul       t2, t2, t9  // tmp2 = tmp2 * 5793
   3526     mul       t1, t1, s0  // tmp1 = tmp1 * 10033
   3527     sll       t0, t0, 13  // tmp0 = tmp0 << 13
   3528     mul       t6, t6, s2  // z1 = z1 * q0
   3529     mul       t8, t8, s4  // z3 = z3 * q2
   3530     mul       t7, t7, s3  // z2 = z2 * q1
   3531     addu      t3, t0, t2  // tmp10 = tmp0 + tmp2
   3532     sll       t2, t2, 1   // tmp2 = tmp2 << 1 (doubled for the next subtraction)
   3533     subu      t4, t0, t2  // tmp11 = tmp0 - tmp2;
   3534     subu      t5, t3, t1  // tmp12 = tmp10 - tmp1
   3535     addu      t3, t3, t1  // tmp10 = tmp10 + tmp1
   3536     addu      t1, t6, t8  // tmp1 = z1 + z3
   3537     mul       t1, t1, s1  // tmp1 = tmp1 * 2998
   3538     shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
   3539     subu      t2, t6, t8  // tmp2 = z1 - z3
   3540     subu      t2, t2, t7  // tmp2 = tmp2 - z2
   3541     sll       t2, t2, 2   // tmp2 = tmp2 << 2
   3542     addu      t0, t6, t7  // tmp0 = z1 + z2
   3543     sll       t0, t0, 13  // tmp0 = tmp0 << 13
   3544     subu      s2, t8, t7  // q0 = z3 - z2
   3545     sll       s2, s2, 13  // q0 = q0 << 13
   3546     addu      t0, t0, t1  // tmp0 = tmp0 + tmp1
   3547     addu      t1, s2, t1  // tmp1 = q0 + tmp1
   3548     addu      s2, t4, t2  // q0 = tmp11 + tmp2
   3549     subu      s3, t4, t2  // q1 = tmp11 - tmp2
   3550     addu      t6, t3, t0  // z1 = tmp10 + tmp0
   3551     subu      t7, t3, t0  // z2 = tmp10 - tmp0
   3552     addu      t4, t5, t1  // tmp11 = tmp12 + tmp1
   3553     subu      t5, t5, t1  // tmp12 = tmp12 - tmp1
   3554     shra_r.w  t6, t6, 11  // z1 = (z1 + 1024) >> 11
   3555     shra_r.w  t7, t7, 11  // z2 = (z2 + 1024) >> 11
   3556     shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
   3557     shra_r.w  t5, t5, 11  // tmp12 = (tmp12 + 1024) >> 11
   3558     sw        s2, 24(v0)  // wsptr[6*1]
   3559     sw        s3, 96(v0)  // wsptr[6*4]
   3560     sw        t6, 0(v0)   // wsptr[6*0]
   3561     sw        t7, 120(v0) // wsptr[6*5]
   3562     sw        t4, 48(v0)  // wsptr[6*2]
   3563     sw        t5, 72(v0)  // wsptr[6*3]
   3564     addiu     v0, v0, 4   // next work-array column
   3565     addiu     a1, a1, 2   // next coefficient column
   3566     bne       v0, v1, 1b  // repeat until all six columns are done
   3567      addiu    a0, a0, 2   // next dct_table column (indented: branch delay slot, assuming noreorder)
   3568 
   3569     /* Pass 2: process 6 rows from work array, store into output array.
             * v0 walks the work array one 24-byte row at a time and v1 marks
             * its end.  a2 advances by one pointer per row.
             */
   3570     move      v0, sp
   3571     addiu     v1, v0, 144
   3572 
   3573 2:
   3574     lw        t0, 0(v0)         // wsptr[0]
   3575     lw        t2, 16(v0)        // wsptr[4]
   3576     lw        s5, 0(a2)         // output_buf[row]
   3577     addiu     t0, t0, 16        // rounding: (16 << 13) is half of the final 1 << 18 scale
   3578     sll       t0, t0, 13        // tmp0 = (wsptr[0] + 16) << 13
   3579     mul       t3, t2, t9        // t3 = wsptr[4] * 5793
   3580     lw        t6, 4(v0)         // z1 = wsptr[1]
   3581     lw        t8, 20(v0)        // z3 = wsptr[5]
   3582     lw        t7, 12(v0)        // z2 = wsptr[3]
   3583     addu      s5, s5, a3        // outptr = output_buf[row] + output_col
   3584     addu      s6, t6, t8        // z1 + z3
   3585     mul       s6, s6, s1        // s6 = (z1 + z3) * 2998
   3586     addu      t1, t0, t3        // t1 = tmp0 + t3
   3587     subu      t4, t0, t3
   3588     subu      t4, t4, t3        // t4 = tmp0 - 2 * t3
   3589     lw        t3, 8(v0)         // wsptr[2]
   3590     mul       t0, t3, s0        // t0 = wsptr[2] * 10033
   3591     addu      s7, t6, t7        // z1 + z2
   3592     sll       s7, s7, 13
   3593     addu      s7, s6, s7        // s7 = s6 + ((z1 + z2) << 13)
   3594     subu      t2, t8, t7        // z3 - z2
   3595     sll       t2, t2, 13
   3596     addu      t2, s6, t2        // t2 = s6 + ((z3 - z2) << 13)
   3597     subu      s6, t6, t7        // s6 is reused from here on
   3598     subu      s6, s6, t8
   3599     sll       s6, s6, 13        // s6 = (z1 - z2 - z3) << 13
   3600     addu      t3, t1, t0        // t3 = t1 + t0
   3601     subu      t5, t1, t0        // t5 = t1 - t0
   3602     addu      t6, t3, s7        // value for outptr[0]
   3603     subu      t3, t3, s7        // value for outptr[5]
   3604     addu      t7, t4, s6        // value for outptr[1]
   3605     subu      t4, t4, s6        // value for outptr[4]
   3606     addu      t8, t5, t2        // value for outptr[2]
   3607     subu      t5, t5, t2        // value for outptr[3]
   3608     shll_s.w  t6, t6, 6         // saturating << 6; with the sra 24 below this is a
   3609     shll_s.w  t3, t3, 6         // descale by 18 bits clamped to -128..127
   3610     shll_s.w  t7, t7, 6
   3611     shll_s.w  t4, t4, 6
   3612     shll_s.w  t8, t8, 6
   3613     shll_s.w  t5, t5, 6
   3614     sra       t6, t6, 24
   3615     addiu     t6, t6, 128       // recentre to 0..255
   3616     sra       t3, t3, 24
   3617     addiu     t3, t3, 128
   3618     sb        t6, 0(s5)
   3619     sra       t7, t7, 24
   3620     addiu     t7, t7, 128
   3621     sb        t3, 5(s5)
   3622     sra       t4, t4, 24
   3623     addiu     t4, t4, 128
   3624     sb        t7, 1(s5)
   3625     sra       t8, t8, 24
   3626     addiu     t8, t8, 128
   3627     sb        t4, 4(s5)
   3628     addiu     v0, v0, 24        // next work-array row
   3629     sra       t5, t5, 24
   3630     addiu     t5, t5, 128
   3631     sb        t8, 2(s5)
   3632     addiu     a2, a2,  4        // next output_buf entry
   3633     bne       v0, v1, 2b        // repeat until all six rows are done
   3634      sb       t5, 3(s5)         // indented: branch delay slot, assuming noreorder
   3635 
   3636     addiu     sp, sp, 144       // release the work array
   3637 
   3638     RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
   3639 
   3640     j         ra
   3641      nop
   3642 
   3643 END(jsimd_idct_6x6_mips_dspr2)
   3644 
   3645 /*****************************************************************************/
   3646 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
   3647 /*
   3648  * a0     - compptr->dct_table
   3649  * a1     - coef_block
   3650  * a2     - workspace
         *
         * Column pass of the 12x12 inverse DCT (going by the routine name).
         *
         * The loop runs eight times, once per input column.  Each iteration
         * reads the eight 16-bit coefficients of the column (inptr[DCTSIZE*0..7],
         * DCTSIZE = 8) together with the matching dct_table entries, multiplies
         * them, and writes twelve 32-bit results to workspace at a 32-byte
         * stride (wsptr[8*0] .. wsptr[8*11]).
         *
         * Results are scaled down by 11 bits with a plain arithmetic shift;
         * the rounding constant 1024 is folded into z3 beforehand.
         *
         * a3 is used as the loop counter; v0 and t0-t9 are scratch.
   3651  */
   3652 
   3653     SAVE_REGS_ON_STACK 16, s0, s1, s2, s3  // macro from the included header; presumably spills s0-s3
   3654 
   3655     li         a3, 8         // columns left to process
   3656 
   3657 1:
   3658     // odd part
   3659     lh         t0, 48(a1)    // inptr[DCTSIZE*3]
   3660     lh         t1, 48(a0)    // quantptr[DCTSIZE*3]
   3661     lh         t2, 16(a1)    // inptr[DCTSIZE*1]
   3662     lh         t3, 16(a0)    // quantptr[DCTSIZE*1]
   3663     lh         t4, 80(a1)    // inptr[DCTSIZE*5]
   3664     lh         t5, 80(a0)    // quantptr[DCTSIZE*5]
   3665     lh         t6, 112(a1)   // inptr[DCTSIZE*7]
   3666     lh         t7, 112(a0)   // quantptr[DCTSIZE*7]
   3667     mul        t0, t0, t1    // z2
   3668     mul        t1, t2, t3    // z1
   3669     mul        t2, t4, t5    // z3
   3670     mul        t3, t6, t7    // z4
   3671     li         t4, 10703     // FIX(1.306562965)
   3672     li         t5, 4433      // FIX_0_541196100
   3673     li         t6, 7053      // FIX(0.860918669)
   3674     mul        t4, t0,t4     // tmp11
   3675     mul        t5, t0,t5     // -tmp14
   3676     addu       t7, t1,t2     // tmp10
   3677     addu       t8, t7,t3     // tmp10 + z4
   3678     mul        t6, t6, t8    // tmp15
   3679     li         t8, 2139      // FIX(0.261052384)
   3680     mul        t8, t7, t8    // MULTIPLY(tmp10, FIX(0.261052384))
   3681     li         t7, 2295      // FIX(0.280143716)
   3682     mul        t7, t1, t7    // MULTIPLY(z1, FIX(0.280143716))
   3683     addu       t9, t2, t3    // z3 + z4
   3684     li         s0, 8565      // FIX(1.045510580)
   3685     mul        t9, t9, s0    // -tmp13
   3686     li         s0, 12112     // FIX(1.478575242)
   3687     mul        s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
   3688     li         s1, 12998     // FIX(1.586706681)
   3689     mul        s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
   3690     li         s2, 5540      // FIX(0.676326758)
   3691     mul        s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
   3692     li         s3, 16244     // FIX(1.982889723)
   3693     mul        s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
   3694     subu       t1, t1, t3    // z1-=z4
   3695     subu       t0, t0, t2    // z2-=z3
   3696     addu       t2, t0, t1    // z1+z2
   3697     li         t3, 4433      // FIX_0_541196100
   3698     mul        t2, t2, t3    // z3
   3699     li         t3, 6270      // FIX_0_765366865
   3700     mul        t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
   3701     li         t3, 15137     // FIX_1_847759065
   3702     mul        t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
   3703     addu       t8, t6, t8    // tmp12
   3704     addu       t3, t8, t4    // tmp12 + tmp11
   3705     addu       t3, t3, t7    // tmp10
   3706     subu       t8, t8, t9    // tmp12 + tmp13
   3707     addu       s0, t5, s0    // -tmp14 + MULTIPLY(z3, FIX(1.478575242))
   3708     subu       t8, t8, s0    // tmp12
   3709     subu       t9, t6, t9    // tmp15 + tmp13
   3710     subu       s1, s1, t4    // MULTIPLY(z4, FIX(1.586706681)) - tmp11
   3711     addu       t9, t9, s1    // tmp13
   3712     subu       t6, t6, t5    // tmp15 + tmp14
   3713     subu       t6, t6, s2
   3714     subu       t6, t6, s3    // tmp15
   3715     // even part start
   3716     lh         t4, 64(a1)    // inptr[DCTSIZE*4]
   3717     lh         t5, 64(a0)    // quantptr[DCTSIZE*4]
   3718     lh         t7, 32(a1)    // inptr[DCTSIZE*2]
   3719     lh         s0, 32(a0)    // quantptr[DCTSIZE*2]
   3720     lh         s1, 0(a1)     // inptr[DCTSIZE*0]
   3721     lh         s2, 0(a0)     // quantptr[DCTSIZE*0]
   3722     lh         s3, 96(a1)    // inptr[DCTSIZE*6]
   3723     lh         v0, 96(a0)    // quantptr[DCTSIZE*6]
   3724     mul        t4, t4, t5    // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
   3725     mul        t5, t7, s0    // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
   3726     mul        t7, s1, s2    // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
   3727     mul        s0, s3, v0    // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
   3728     // odd part end
   3729     addu       t1, t2, t1    // tmp11
   3730     subu       t0, t2, t0    // tmp14
   3731     // update counter and pointers
   3732     addiu      a3, a3, -1
   3733     addiu      a0, a0, 2
   3734     addiu      a1, a1, 2
   3735     // even part rest
   3736     li         s1, 10033     // FIX(1.224744871)
   3737     li         s2, 11190     // FIX(1.366025404)
   3738     mul        t4, t4, s1    // z4 = MULTIPLY(inptr[DCTSIZE*4] term, 10033)
   3739     mul        s1, t5, s2    // z4 = MULTIPLY(z1, 11190); z1 is the inptr[DCTSIZE*2] term
   3740     sll        t5, t5, 13    // z1
   3741     sll        t7, t7, 13
   3742     addiu      t7, t7, 1024  // z3, including the rounding term for the >> 11 below
   3743     sll        s0, s0, 13    // z2
   3744     addu       s2, t7, t4    // tmp10
   3745     subu       t4, t7, t4    // tmp11
   3746     subu       s3, t5, s0    // tmp12
   3747     addu       t2, t7, s3    // tmp21
   3748     subu       s3, t7, s3    // tmp24
   3749     addu       t7, s1, s0    // tmp12
   3750     addu       v0, s2, t7    // tmp20
   3751     subu       s2, s2, t7    // tmp25
   3752     subu       s1, s1, t5    // z4 - z1
   3753     subu       s1, s1, s0    // tmp12
   3754     addu       s0, t4, s1    // tmp22
   3755     subu       t4, t4, s1    // tmp23
   3756     // final output stage
   3757     addu       t5, v0, t3    // tmp20 + tmp10
   3758     subu       v0, v0, t3    // tmp20 - tmp10
   3759     addu       t3, t2, t1    // tmp21 + tmp11
   3760     subu       t2, t2, t1    // tmp21 - tmp11
   3761     addu       t1, s0, t8    // tmp22 + tmp12
   3762     subu       s0, s0, t8    // tmp22 - tmp12
   3763     addu       t8, t4, t9    // tmp23 + tmp13
   3764     subu       t4, t4, t9    // tmp23 - tmp13
   3765     addu       t9, s3, t0    // tmp24 + tmp14
   3766     subu       s3, s3, t0    // tmp24 - tmp14
   3767     addu       t0, s2, t6    // tmp25 + tmp15
   3768     subu       s2, s2, t6    // tmp25 - tmp15
   3769     sra        t5, t5, 11    // plain shifts: rounding was already added via z3
   3770     sra        t3, t3, 11
   3771     sra        t1, t1, 11
   3772     sra        t8, t8, 11
   3773     sra        t9, t9, 11
   3774     sra        t0, t0, 11
   3775     sra        s2, s2, 11
   3776     sra        s3, s3, 11
   3777     sra        t4, t4, 11
   3778     sra        s0, s0, 11
   3779     sra        t2, t2, 11
   3780     sra        v0, v0, 11
   3781     sw         t5, 0(a2)     // wsptr[8*0]
   3782     sw         t3, 32(a2)    // wsptr[8*1]
   3783     sw         t1, 64(a2)    // wsptr[8*2]
   3784     sw         t8, 96(a2)    // wsptr[8*3]
   3785     sw         t9, 128(a2)   // wsptr[8*4]
   3786     sw         t0, 160(a2)   // wsptr[8*5]
   3787     sw         s2, 192(a2)   // wsptr[8*6]
   3788     sw         s3, 224(a2)   // wsptr[8*7]
   3789     sw         t4, 256(a2)   // wsptr[8*8]
   3790     sw         s0, 288(a2)   // wsptr[8*9]
   3791     sw         t2, 320(a2)   // wsptr[8*10]
   3792     sw         v0, 352(a2)   // wsptr[8*11]
   3793     bgtz       a3, 1b        // repeat while columns remain
   3794      addiu     a2, a2, 4     // next workspace column (indented: branch delay slot, assuming noreorder)
   3795 
   3796     RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
   3797 
   3798     j          ra
   3799      nop
   3800 
   3801 END(jsimd_idct_12x12_pass1_mips_dspr2)
   3802 
   3803 /*****************************************************************************/
   3804 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
   3805 /*
   3806  * a0     - workspace
   3807  * a1     - output
         *
         * Row pass of the 12x12 inverse DCT (going by the routine name).
         *
         * The loop runs twelve times.  Each iteration reads eight 32-bit words
         * from workspace (wsptr[0..7]; a0 then advances by 32 bytes), loads one
         * destination pointer from 0(a1) (a1 then advances by 4 bytes), and
         * stores twelve bytes through that pointer.
         *
         * The pointer loaded from a1 is used as is: no column offset is added
         * here, so the caller presumably supplies final row addresses.
         *
         * Each result is scaled by sll 4, saturating shll_s.w 2 and srl 24
         * (a descale by 18 bits in total), then biased by 0x80 before the
         * low byte is stored.
         *
         * a3 is used as the loop counter; v0 and t0-t9 are scratch.
   3808  */
   3809 
   3810     SAVE_REGS_ON_STACK 16, s0, s1, s2, s3  // macro from the included header; presumably spills s0-s3
   3811 
   3812     li        a3, 12        // rows left to process
   3813 
   3814 1:
   3815     // Odd part
   3816     lw        t0, 12(a0)    // z2 = wsptr[3]
   3817     lw        t1, 4(a0)     // z1 = wsptr[1]
   3818     lw        t2, 20(a0)    // z3 = wsptr[5]
   3819     lw        t3, 28(a0)    // z4 = wsptr[7]
   3820     li        t4, 10703     // FIX(1.306562965)
   3821     li        t5, 4433      // FIX_0_541196100
   3822     mul       t4, t0, t4    // tmp11
   3823     mul       t5, t0, t5    // -tmp14
   3824     addu      t6, t1, t2    // tmp10
   3825     li        t7, 2139      // FIX(0.261052384)
   3826     mul       t7, t6, t7    // MULTIPLY(tmp10, FIX(0.261052384))
   3827     addu      t6, t6, t3    // tmp10 + z4
   3828     li        t8, 7053      // FIX(0.860918669)
   3829     mul       t6, t6, t8    // tmp15
   3830     li        t8, 2295      // FIX(0.280143716)
   3831     mul       t8, t1, t8    // MULTIPLY(z1, FIX(0.280143716))
   3832     addu      t9, t2, t3    // z3 + z4
   3833     li        s0, 8565      // FIX(1.045510580)
   3834     mul       t9, t9, s0    // -tmp13
   3835     li        s0, 12112     // FIX(1.478575242)
   3836     mul       s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
   3837     li        s1, 12998     // FIX(1.586706681)
   3838     mul       s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
   3839     li        s2, 5540      // FIX(0.676326758)
   3840     mul       s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
   3841     li        s3, 16244     // FIX(1.982889723)
   3842     mul       s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
   3843     subu      t1, t1, t3    // z1 -= z4
   3844     subu      t0, t0, t2    // z2 -= z3
   3845     addu      t2, t1, t0    // z1 + z2
   3846     li        t3, 4433      // FIX_0_541196100
   3847     mul       t2, t2, t3    // z3
   3848     li        t3, 6270      // FIX_0_765366865
   3849     mul       t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
   3850     li        t3, 15137     // FIX_1_847759065
   3851     mul       t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
   3852     addu      t3, t6, t7    // tmp12
   3853     addu      t7, t3, t4    // tmp12 + tmp11
   3854     addu      t7, t7, t8    // tmp10
   3855     subu      t3, t3, t9    // tmp12 + tmp13
   3856     subu      t3, t3, t5    // ... + tmp14
   3857     subu      t3, t3, s0    // tmp12
   3858     subu      t9, t6, t9    // tmp15 + tmp13
   3859     subu      t9, t9, t4    // ... - tmp11
   3860     addu      t9, t9, s1    // tmp13
   3861     subu      t6, t6, t5    // tmp15 + tmp14
   3862     subu      t6, t6, s2
   3863     subu      t6, t6, s3    // tmp15
   3864     addu      t1, t2, t1    // tmp11
   3865     subu      t0, t2, t0    // tmp14
   3866     // even part
   3867     lw        t2, 16(a0)    // z4
   3868     lw        t4, 8(a0)     // z1
   3869     lw        t5, 0(a0)     // z3
   3870     lw        t8, 24(a0)    // z2
   3871     li        s0, 10033     // FIX(1.224744871)
   3872     li        s1, 11190     // FIX(1.366025404)
   3873     mul       t2, t2, s0    // z4
   3874     mul       s0, t4, s1    // z4
   3875     addiu     t5, t5, 0x10  // rounding: (16 << 13) is half of the final 1 << 18 scale
   3876     sll       t5, t5, 13    // z3
   3877     sll       t4, t4, 13    // z1
   3878     sll       t8, t8, 13    // z2
   3879     subu      s1, t4, t8    // tmp12
   3880     addu      s2, t5, t2    // tmp10
   3881     subu      t2, t5, t2    // tmp11
   3882     addu      s3, t5, s1    // tmp21
   3883     subu      s1, t5, s1    // tmp24
   3884     addu      t5, s0, t8    // tmp12
   3885     addu      v0, s2, t5    // tmp20
   3886     subu      t5, s2, t5    // tmp25
   3887     subu      t4, s0, t4    // z4 - z1
   3888     subu      t4, t4, t8    // tmp12
   3889     addu      t8, t2, t4    // tmp22
   3890     subu      t2, t2, t4    // tmp23
   3891     // increment counter and pointers
   3892     addiu     a3, a3, -1
   3893     addiu     a0, a0, 32
   3894     // Final stage
   3895     addu      t4, v0, t7    // tmp20 + tmp10 -> outptr[0]
   3896     subu      v0, v0, t7    // tmp20 - tmp10 -> outptr[11]
   3897     addu      t7, s3, t1    // tmp21 + tmp11 -> outptr[1]
   3898     subu      s3, s3, t1    // tmp21 - tmp11 -> outptr[10]
   3899     addu      t1, t8, t3    // tmp22 + tmp12 -> outptr[2]
   3900     subu      t8, t8, t3    // tmp22 - tmp12 -> outptr[9]
   3901     addu      t3, t2, t9    // tmp23 + tmp13 -> outptr[3]
   3902     subu      t2, t2, t9    // tmp23 - tmp13 -> outptr[8]
   3903     addu      t9, s1, t0    // tmp24 + tmp14 -> outptr[4]
   3904     subu      s1, s1, t0    // tmp24 - tmp14 -> outptr[7]
   3905     addu      t0, t5, t6    // tmp25 + tmp15 -> outptr[5]
   3906     subu      t5, t5, t6    // tmp25 - tmp15 -> outptr[6]
   3907     sll       t4, t4, 4     // plain << 4 first ...
   3908     sll       t7, t7, 4
   3909     sll       t1, t1, 4
   3910     sll       t3, t3, 4
   3911     sll       t9, t9, 4
   3912     sll       t0, t0, 4
   3913     sll       t5, t5, 4
   3914     sll       s1, s1, 4
   3915     sll       t2, t2, 4
   3916     sll       t8, t8, 4
   3917     sll       s3, s3, 4
   3918     sll       v0, v0, 4
   3919     shll_s.w  t4, t4, 2     // ... then saturating << 2 (left shift of 6 in total)
   3920     shll_s.w  t7, t7, 2
   3921     shll_s.w  t1, t1, 2
   3922     shll_s.w  t3, t3, 2
   3923     shll_s.w  t9, t9, 2
   3924     shll_s.w  t0, t0, 2
   3925     shll_s.w  t5, t5, 2
   3926     shll_s.w  s1, s1, 2
   3927     shll_s.w  t2, t2, 2
   3928     shll_s.w  t8, t8, 2
   3929     shll_s.w  s3, s3, 2
   3930     shll_s.w  v0, v0, 2
   3931     srl       t4, t4, 24    // keep the top byte: the sum descaled by 18 bits
   3932     srl       t7, t7, 24
   3933     srl       t1, t1, 24
   3934     srl       t3, t3, 24
   3935     srl       t9, t9, 24
   3936     srl       t0, t0, 24
   3937     srl       t5, t5, 24
   3938     srl       s1, s1, 24
   3939     srl       t2, t2, 24
   3940     srl       t8, t8, 24
   3941     srl       s3, s3, 24
   3942     srl       v0, v0, 24
   3943     lw        t6, 0(a1)     // outptr for this row
   3944     addiu     t4, t4, 0x80  // bias by 128; only the low byte is stored below
   3945     addiu     t7, t7, 0x80
   3946     addiu     t1, t1, 0x80
   3947     addiu     t3, t3, 0x80
   3948     addiu     t9, t9, 0x80
   3949     addiu     t0, t0, 0x80
   3950     addiu     t5, t5, 0x80
   3951     addiu     s1, s1, 0x80
   3952     addiu     t2, t2, 0x80
   3953     addiu     t8, t8, 0x80
   3954     addiu     s3, s3, 0x80
   3955     addiu     v0, v0, 0x80
   3956     sb        t4, 0(t6)
   3957     sb        t7, 1(t6)
   3958     sb        t1, 2(t6)
   3959     sb        t3, 3(t6)
   3960     sb        t9, 4(t6)
   3961     sb        t0, 5(t6)
   3962     sb        t5, 6(t6)
   3963     sb        s1, 7(t6)
   3964     sb        t2, 8(t6)
   3965     sb        t8, 9(t6)
   3966     sb        s3, 10(t6)
   3967     sb        v0, 11(t6)
   3968     bgtz      a3, 1b        // repeat while rows remain
   3969      addiu    a1, a1, 4     // next output pointer (indented: branch delay slot, assuming noreorder)
   3970 
   3971     RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
   3972 
   3973     jr        ra
   3974      nop
   3975 
   3976 END(jsimd_idct_12x12_pass2_mips_dspr2)
   3977 
   3978 /*****************************************************************************/
   3979 LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
   3980 /*
   3981  * a0     - sample_data
   3982  * a1     - start_col
   3983  * a2     - workspace
         *
         * Sample conversion for one 8x8 block.  a0 is read as an array of eight
         * word-sized row pointers (offsets 0..28).  For each row, 8 bytes are
         * read starting at (row pointer + a1), every byte is zero-extended to a
         * 16-bit halfword, 0xff80 (-128 as a signed halfword) is added, and the
         * eight halfwords are written to a2.  Rows are 16 bytes apart in the
         * workspace, 128 bytes in total.
         *
         * The code is software-pipelined: the pointer and byte loads for row N+1
         * are issued between the arithmetic and the stores of row N.
         * t0-t7 are the only registers named as destinations.
         *
         * NOTE(review): the store order (qbr half before qbl half) looks like it
         * assumes a little-endian target - confirm.
   3984  */
   3985 
   3986     lw             t0, 0(a0)                // t0 = row pointer 0
   3987     li             t7, 0xff80ff80           // two halfwords of 0xff80 (-128 each)
   3988     addu           t0, t0, a1               // t0 = row 0 + start_col
   3989     ulw            t1, 0(t0)                // unaligned load: row 0 bytes 0..3
   3990     ulw            t2, 4(t0)                // unaligned load: row 0 bytes 4..7
   3991     preceu.ph.qbr  t3, t1                   // two right bytes of t1 -> two zero-extended halfwords
   3992     preceu.ph.qbl  t4, t1                   // two left bytes of t1 -> two zero-extended halfwords
   3993     lw             t0, 4(a0)                // t0 = row pointer 1 (fetched early)
   3994     preceu.ph.qbr  t5, t2
   3995     preceu.ph.qbl  t6, t2
   3996     addu           t0, t0, a1               // t0 = row 1 + start_col
   3997     addu.ph        t3, t3, t7               // per-halfword add of 0xff80, i.e. sample - 128
   3998     addu.ph        t4, t4, t7
   3999     ulw            t1, 0(t0)                // row 1 bytes 0..3
   4000     ulw            t2, 4(t0)                // row 1 bytes 4..7
   4001     addu.ph        t5, t5, t7
   4002     addu.ph        t6, t6, t7
   4003     usw            t3, 0(a2)                // row 0 results -> workspace bytes 0..15
   4004     usw            t4, 4(a2)
   4005     preceu.ph.qbr  t3, t1                   // start expanding row 1
   4006     preceu.ph.qbl  t4, t1
   4007     usw            t5, 8(a2)
   4008     usw            t6, 12(a2)
   4009 
            // Finish and store row 1 (workspace bytes 16..31); fetch row 2.
   4010     lw             t0, 8(a0)
   4011     preceu.ph.qbr  t5, t2
   4012     preceu.ph.qbl  t6, t2
   4013     addu           t0, t0, a1
   4014     addu.ph        t3, t3, t7
   4015     addu.ph        t4, t4, t7
   4016     ulw            t1, 0(t0)
   4017     ulw            t2, 4(t0)
   4018     addu.ph        t5, t5, t7
   4019     addu.ph        t6, t6, t7
   4020     usw            t3, 16(a2)
   4021     usw            t4, 20(a2)
   4022     preceu.ph.qbr  t3, t1
   4023     preceu.ph.qbl  t4, t1
   4024     usw            t5, 24(a2)
   4025     usw            t6, 28(a2)
   4026 
            // Finish and store row 2 (workspace bytes 32..47); fetch row 3.
   4027     lw             t0, 12(a0)
   4028     preceu.ph.qbr  t5, t2
   4029     preceu.ph.qbl  t6, t2
   4030     addu           t0, t0, a1
   4031     addu.ph        t3, t3, t7
   4032     addu.ph        t4, t4, t7
   4033     ulw            t1, 0(t0)
   4034     ulw            t2, 4(t0)
   4035     addu.ph        t5, t5, t7
   4036     addu.ph        t6, t6, t7
   4037     usw            t3, 32(a2)
   4038     usw            t4, 36(a2)
   4039     preceu.ph.qbr  t3, t1
   4040     preceu.ph.qbl  t4, t1
   4041     usw            t5, 40(a2)
   4042     usw            t6, 44(a2)
   4043 
            // Finish and store row 3 (workspace bytes 48..63); fetch row 4.
   4044     lw             t0, 16(a0)
   4045     preceu.ph.qbr  t5, t2
   4046     preceu.ph.qbl  t6, t2
   4047     addu           t0, t0, a1
   4048     addu.ph        t3, t3, t7
   4049     addu.ph        t4, t4, t7
   4050     ulw            t1, 0(t0)
   4051     ulw            t2, 4(t0)
   4052     addu.ph        t5, t5, t7
   4053     addu.ph        t6, t6, t7
   4054     usw            t3, 48(a2)
   4055     usw            t4, 52(a2)
   4056     preceu.ph.qbr  t3, t1
   4057     preceu.ph.qbl  t4, t1
   4058     usw            t5, 56(a2)
   4059     usw            t6, 60(a2)
   4060 
            // Finish and store row 4 (workspace bytes 64..79); fetch row 5.
   4061     lw             t0, 20(a0)
   4062     preceu.ph.qbr  t5, t2
   4063     preceu.ph.qbl  t6, t2
   4064     addu           t0, t0, a1
   4065     addu.ph        t3, t3, t7
   4066     addu.ph        t4, t4, t7
   4067     ulw            t1, 0(t0)
   4068     ulw            t2, 4(t0)
   4069     addu.ph        t5, t5, t7
   4070     addu.ph        t6, t6, t7
   4071     usw            t3, 64(a2)
   4072     usw            t4, 68(a2)
   4073     preceu.ph.qbr  t3, t1
   4074     preceu.ph.qbl  t4, t1
   4075     usw            t5, 72(a2)
   4076     usw            t6, 76(a2)
   4077 
            // Finish and store row 5 (workspace bytes 80..95); fetch row 6.
   4078     lw             t0, 24(a0)
   4079     preceu.ph.qbr  t5, t2
   4080     preceu.ph.qbl  t6, t2
   4081     addu           t0, t0, a1
   4082     addu.ph        t3, t3, t7
   4083     addu.ph        t4, t4, t7
   4084     ulw            t1, 0(t0)
   4085     ulw            t2, 4(t0)
   4086     addu.ph        t5, t5, t7
   4087     addu.ph        t6, t6, t7
   4088     usw            t3, 80(a2)
   4089     usw            t4, 84(a2)
   4090     preceu.ph.qbr  t3, t1
   4091     preceu.ph.qbl  t4, t1
   4092     usw            t5, 88(a2)
   4093     usw            t6, 92(a2)
   4094 
            // Finish and store row 6 (workspace bytes 96..111); fetch row 7.
   4095     lw             t0, 28(a0)
   4096     preceu.ph.qbr  t5, t2
   4097     preceu.ph.qbl  t6, t2
   4098     addu           t0, t0, a1
   4099     addu.ph        t3, t3, t7
   4100     addu.ph        t4, t4, t7
   4101     ulw            t1, 0(t0)
   4102     ulw            t2, 4(t0)
   4103     addu.ph        t5, t5, t7
   4104     addu.ph        t6, t6, t7
   4105     usw            t3, 96(a2)
   4106     usw            t4, 100(a2)
   4107     preceu.ph.qbr  t3, t1
   4108     preceu.ph.qbl  t4, t1
   4109     usw            t5, 104(a2)
   4110     usw            t6, 108(a2)
            // Pipeline drain: finish and store row 7 (workspace bytes 112..127).
   4111     preceu.ph.qbr  t5, t2
   4112     preceu.ph.qbl  t6, t2
   4113     addu.ph        t3, t3, t7
   4114     addu.ph        t4, t4, t7
   4115     addu.ph        t5, t5, t7
   4116     addu.ph        t6, t6, t7
   4117     usw            t3, 112(a2)
   4118     usw            t4, 116(a2)
   4119     usw            t5, 120(a2)
   4120     usw            t6, 124(a2)
   4121 
   4122     j              ra                       // jump to the address in ra (other routines here spell this `jr ra`)
   4123      nop                                    // branch delay slot
   4124 
   4125 END(jsimd_convsamp_mips_dspr2)
   4126 
   4127 /*****************************************************************************/
   4128 LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
   4129 /*
   4130  * a0     - sample_data
   4131  * a1     - start_col
   4132  * a2     - workspace
         *
         * Floating-point sample conversion for one 8x8 block.  a0 is read as an
         * array of eight word-sized row pointers (offsets 0..28).  For each row,
         * 8 unsigned bytes are loaded starting at (row pointer + a1), 128 is
         * subtracted from each, and the results are converted to single-precision
         * floats and stored to a2.  Rows are 32 bytes apart in the workspace
         * (8 floats), 256 bytes in total.
         *
         * The pointer for row N+1 is loaded in the middle of row N's store
         * sequence.  Integer registers t0-t8 and the even-numbered FP registers
         * f2-f16 are the only registers named as destinations.
   4133  */
   4134 
            // Allow the assembler to use $at from here on.
            // NOTE(review): presumably the LEAF_MIPS_DSPR2 macro selects `noat`
            // and END() restores the previous state - confirm in
            // jsimd_mips_dspr2_asm.h.
   4135     .set at
   4136 
            // row 0: load 8 samples, subtract 128, convert to float, store
   4137     lw       t0, 0(a0)              // t0 = row pointer 0
   4138     addu     t0, t0, a1             // t0 = row 0 + start_col
   4139     lbu      t1, 0(t0)              // t1..t8 = 8 consecutive unsigned bytes
   4140     lbu      t2, 1(t0)
   4141     lbu      t3, 2(t0)
   4142     lbu      t4, 3(t0)
   4143     lbu      t5, 4(t0)
   4144     lbu      t6, 5(t0)
   4145     lbu      t7, 6(t0)
   4146     lbu      t8, 7(t0)
   4147     addiu    t1, t1, -128           // subtract 128 from each sample
   4148     addiu    t2, t2, -128
   4149     addiu    t3, t3, -128
   4150     addiu    t4, t4, -128
   4151     addiu    t5, t5, -128
   4152     addiu    t6, t6, -128
   4153     addiu    t7, t7, -128
   4154     addiu    t8, t8, -128
   4155     mtc1     t1, f2                 // move the integer values into f2..f16
   4156     mtc1     t2, f4
   4157     mtc1     t3, f6
   4158     mtc1     t4, f8
   4159     mtc1     t5, f10
   4160     mtc1     t6, f12
   4161     mtc1     t7, f14
   4162     mtc1     t8, f16
   4163     cvt.s.w  f2, f2                 // 32-bit integer -> single-precision float, in place
   4164     cvt.s.w  f4, f4
   4165     cvt.s.w  f6, f6
   4166     cvt.s.w  f8, f8
   4167     cvt.s.w  f10, f10
   4168     cvt.s.w  f12, f12
   4169     cvt.s.w  f14, f14
   4170     cvt.s.w  f16, f16
   4171     lw       t0, 4(a0)              // t0 = row pointer 1, fetched before row 0 is stored
   4172     swc1     f2, 0(a2)              // row 0 floats -> workspace bytes 0..31
   4173     swc1     f4, 4(a2)
   4174     swc1     f6, 8(a2)
   4175     addu     t0, t0, a1             // t0 = row 1 + start_col
   4176     swc1     f8, 12(a2)
   4177     swc1     f10, 16(a2)
   4178     swc1     f12, 20(a2)
   4179     swc1     f14, 24(a2)
   4180     swc1     f16, 28(a2)
   4181     // row 1: same sequence, results stored to workspace bytes 32..63
   4182     lbu      t1, 0(t0)
   4183     lbu      t2, 1(t0)
   4184     lbu      t3, 2(t0)
   4185     lbu      t4, 3(t0)
   4186     lbu      t5, 4(t0)
   4187     lbu      t6, 5(t0)
   4188     lbu      t7, 6(t0)
   4189     lbu      t8, 7(t0)
   4190     addiu    t1, t1, -128
   4191     addiu    t2, t2, -128
   4192     addiu    t3, t3, -128
   4193     addiu    t4, t4, -128
   4194     addiu    t5, t5, -128
   4195     addiu    t6, t6, -128
   4196     addiu    t7, t7, -128
   4197     addiu    t8, t8, -128
   4198     mtc1     t1, f2
   4199     mtc1     t2, f4
   4200     mtc1     t3, f6
   4201     mtc1     t4, f8
   4202     mtc1     t5, f10
   4203     mtc1     t6, f12
   4204     mtc1     t7, f14
   4205     mtc1     t8, f16
   4206     cvt.s.w  f2, f2
   4207     cvt.s.w  f4, f4
   4208     cvt.s.w  f6, f6
   4209     cvt.s.w  f8, f8
   4210     cvt.s.w  f10, f10
   4211     cvt.s.w  f12, f12
   4212     cvt.s.w  f14, f14
   4213     cvt.s.w  f16, f16
   4214     lw       t0, 8(a0)              // t0 = row pointer 2
   4215     swc1     f2, 32(a2)
   4216     swc1     f4, 36(a2)
   4217     swc1     f6, 40(a2)
   4218     addu     t0, t0, a1
   4219     swc1     f8, 44(a2)
   4220     swc1     f10, 48(a2)
   4221     swc1     f12, 52(a2)
   4222     swc1     f14, 56(a2)
   4223     swc1     f16, 60(a2)
   4224     // row 2: results stored to workspace bytes 64..95
   4225     lbu      t1, 0(t0)
   4226     lbu      t2, 1(t0)
   4227     lbu      t3, 2(t0)
   4228     lbu      t4, 3(t0)
   4229     lbu      t5, 4(t0)
   4230     lbu      t6, 5(t0)
   4231     lbu      t7, 6(t0)
   4232     lbu      t8, 7(t0)
   4233     addiu    t1, t1, -128
   4234     addiu    t2, t2, -128
   4235     addiu    t3, t3, -128
   4236     addiu    t4, t4, -128
   4237     addiu    t5, t5, -128
   4238     addiu    t6, t6, -128
   4239     addiu    t7, t7, -128
   4240     addiu    t8, t8, -128
   4241     mtc1     t1, f2
   4242     mtc1     t2, f4
   4243     mtc1     t3, f6
   4244     mtc1     t4, f8
   4245     mtc1     t5, f10
   4246     mtc1     t6, f12
   4247     mtc1     t7, f14
   4248     mtc1     t8, f16
   4249     cvt.s.w  f2, f2
   4250     cvt.s.w  f4, f4
   4251     cvt.s.w  f6, f6
   4252     cvt.s.w  f8, f8
   4253     cvt.s.w  f10, f10
   4254     cvt.s.w  f12, f12
   4255     cvt.s.w  f14, f14
   4256     cvt.s.w  f16, f16
   4257     lw       t0, 12(a0)             // t0 = row pointer 3
   4258     swc1     f2, 64(a2)
   4259     swc1     f4, 68(a2)
   4260     swc1     f6, 72(a2)
   4261     addu     t0, t0, a1
   4262     swc1     f8, 76(a2)
   4263     swc1     f10, 80(a2)
   4264     swc1     f12, 84(a2)
   4265     swc1     f14, 88(a2)
   4266     swc1     f16, 92(a2)
   4267     // row 3: results stored to workspace bytes 96..127
   4268     lbu      t1, 0(t0)
   4269     lbu      t2, 1(t0)
   4270     lbu      t3, 2(t0)
   4271     lbu      t4, 3(t0)
   4272     lbu      t5, 4(t0)
   4273     lbu      t6, 5(t0)
   4274     lbu      t7, 6(t0)
   4275     lbu      t8, 7(t0)
   4276     addiu    t1, t1, -128
   4277     addiu    t2, t2, -128
   4278     addiu    t3, t3, -128
   4279     addiu    t4, t4, -128
   4280     addiu    t5, t5, -128
   4281     addiu    t6, t6, -128
   4282     addiu    t7, t7, -128
   4283     addiu    t8, t8, -128
   4284     mtc1     t1, f2
   4285     mtc1     t2, f4
   4286     mtc1     t3, f6
   4287     mtc1     t4, f8
   4288     mtc1     t5, f10
   4289     mtc1     t6, f12
   4290     mtc1     t7, f14
   4291     mtc1     t8, f16
   4292     cvt.s.w  f2, f2
   4293     cvt.s.w  f4, f4
   4294     cvt.s.w  f6, f6
   4295     cvt.s.w  f8, f8
   4296     cvt.s.w  f10, f10
   4297     cvt.s.w  f12, f12
   4298     cvt.s.w  f14, f14
   4299     cvt.s.w  f16, f16
   4300     lw       t0, 16(a0)             // t0 = row pointer 4
   4301     swc1     f2, 96(a2)
   4302     swc1     f4, 100(a2)
   4303     swc1     f6, 104(a2)
   4304     addu     t0, t0, a1
   4305     swc1     f8, 108(a2)
   4306     swc1     f10, 112(a2)
   4307     swc1     f12, 116(a2)
   4308     swc1     f14, 120(a2)
   4309     swc1     f16, 124(a2)
   4310     // row 4: results stored to workspace bytes 128..159
   4311     lbu      t1, 0(t0)
   4312     lbu      t2, 1(t0)
   4313     lbu      t3, 2(t0)
   4314     lbu      t4, 3(t0)
   4315     lbu      t5, 4(t0)
   4316     lbu      t6, 5(t0)
   4317     lbu      t7, 6(t0)
   4318     lbu      t8, 7(t0)
   4319     addiu    t1, t1, -128
   4320     addiu    t2, t2, -128
   4321     addiu    t3, t3, -128
   4322     addiu    t4, t4, -128
   4323     addiu    t5, t5, -128
   4324     addiu    t6, t6, -128
   4325     addiu    t7, t7, -128
   4326     addiu    t8, t8, -128
   4327     mtc1     t1, f2
   4328     mtc1     t2, f4
   4329     mtc1     t3, f6
   4330     mtc1     t4, f8
   4331     mtc1     t5, f10
   4332     mtc1     t6, f12
   4333     mtc1     t7, f14
   4334     mtc1     t8, f16
   4335     cvt.s.w  f2, f2
   4336     cvt.s.w  f4, f4
   4337     cvt.s.w  f6, f6
   4338     cvt.s.w  f8, f8
   4339     cvt.s.w  f10, f10
   4340     cvt.s.w  f12, f12
   4341     cvt.s.w  f14, f14
   4342     cvt.s.w  f16, f16
   4343     lw       t0, 20(a0)             // t0 = row pointer 5
   4344     swc1     f2, 128(a2)
   4345     swc1     f4, 132(a2)
   4346     swc1     f6, 136(a2)
   4347     addu     t0, t0, a1
   4348     swc1     f8, 140(a2)
   4349     swc1     f10, 144(a2)
   4350     swc1     f12, 148(a2)
   4351     swc1     f14, 152(a2)
   4352     swc1     f16, 156(a2)
   4353     // row 5: results stored to workspace bytes 160..191
   4354     lbu      t1, 0(t0)
   4355     lbu      t2, 1(t0)
   4356     lbu      t3, 2(t0)
   4357     lbu      t4, 3(t0)
   4358     lbu      t5, 4(t0)
   4359     lbu      t6, 5(t0)
   4360     lbu      t7, 6(t0)
   4361     lbu      t8, 7(t0)
   4362     addiu    t1, t1, -128
   4363     addiu    t2, t2, -128
   4364     addiu    t3, t3, -128
   4365     addiu    t4, t4, -128
   4366     addiu    t5, t5, -128
   4367     addiu    t6, t6, -128
   4368     addiu    t7, t7, -128
   4369     addiu    t8, t8, -128
   4370     mtc1     t1, f2
   4371     mtc1     t2, f4
   4372     mtc1     t3, f6
   4373     mtc1     t4, f8
   4374     mtc1     t5, f10
   4375     mtc1     t6, f12
   4376     mtc1     t7, f14
   4377     mtc1     t8, f16
   4378     cvt.s.w  f2, f2
   4379     cvt.s.w  f4, f4
   4380     cvt.s.w  f6, f6
   4381     cvt.s.w  f8, f8
   4382     cvt.s.w  f10, f10
   4383     cvt.s.w  f12, f12
   4384     cvt.s.w  f14, f14
   4385     cvt.s.w  f16, f16
   4386     lw       t0, 24(a0)             // t0 = row pointer 6
   4387     swc1     f2, 160(a2)
   4388     swc1     f4, 164(a2)
   4389     swc1     f6, 168(a2)
   4390     addu     t0, t0, a1
   4391     swc1     f8, 172(a2)
   4392     swc1     f10, 176(a2)
   4393     swc1     f12, 180(a2)
   4394     swc1     f14, 184(a2)
   4395     swc1     f16, 188(a2)
   4396     // row 6: results stored to workspace bytes 192..223
   4397     lbu      t1, 0(t0)
   4398     lbu      t2, 1(t0)
   4399     lbu      t3, 2(t0)
   4400     lbu      t4, 3(t0)
   4401     lbu      t5, 4(t0)
   4402     lbu      t6, 5(t0)
   4403     lbu      t7, 6(t0)
   4404     lbu      t8, 7(t0)
   4405     addiu    t1, t1, -128
   4406     addiu    t2, t2, -128
   4407     addiu    t3, t3, -128
   4408     addiu    t4, t4, -128
   4409     addiu    t5, t5, -128
   4410     addiu    t6, t6, -128
   4411     addiu    t7, t7, -128
   4412     addiu    t8, t8, -128
   4413     mtc1     t1, f2
   4414     mtc1     t2, f4
   4415     mtc1     t3, f6
   4416     mtc1     t4, f8
   4417     mtc1     t5, f10
   4418     mtc1     t6, f12
   4419     mtc1     t7, f14
   4420     mtc1     t8, f16
   4421     cvt.s.w  f2, f2
   4422     cvt.s.w  f4, f4
   4423     cvt.s.w  f6, f6
   4424     cvt.s.w  f8, f8
   4425     cvt.s.w  f10, f10
   4426     cvt.s.w  f12, f12
   4427     cvt.s.w  f14, f14
   4428     cvt.s.w  f16, f16
   4429     lw       t0, 28(a0)             // t0 = row pointer 7 (last row)
   4430     swc1     f2, 192(a2)
   4431     swc1     f4, 196(a2)
   4432     swc1     f6, 200(a2)
   4433     addu     t0, t0, a1
   4434     swc1     f8, 204(a2)
   4435     swc1     f10, 208(a2)
   4436     swc1     f12, 212(a2)
   4437     swc1     f14, 216(a2)
   4438     swc1     f16, 220(a2)
   4439     // row 7: results stored to workspace bytes 224..255; no further pointer load
   4440     lbu      t1, 0(t0)
   4441     lbu      t2, 1(t0)
   4442     lbu      t3, 2(t0)
   4443     lbu      t4, 3(t0)
   4444     lbu      t5, 4(t0)
   4445     lbu      t6, 5(t0)
   4446     lbu      t7, 6(t0)
   4447     lbu      t8, 7(t0)
   4448     addiu    t1, t1, -128
   4449     addiu    t2, t2, -128
   4450     addiu    t3, t3, -128
   4451     addiu    t4, t4, -128
   4452     addiu    t5, t5, -128
   4453     addiu    t6, t6, -128
   4454     addiu    t7, t7, -128
   4455     addiu    t8, t8, -128
   4456     mtc1     t1, f2
   4457     mtc1     t2, f4
   4458     mtc1     t3, f6
   4459     mtc1     t4, f8
   4460     mtc1     t5, f10
   4461     mtc1     t6, f12
   4462     mtc1     t7, f14
   4463     mtc1     t8, f16
   4464     cvt.s.w  f2, f2
   4465     cvt.s.w  f4, f4
   4466     cvt.s.w  f6, f6
   4467     cvt.s.w  f8, f8
   4468     cvt.s.w  f10, f10
   4469     cvt.s.w  f12, f12
   4470     cvt.s.w  f14, f14
   4471     cvt.s.w  f16, f16
   4472     swc1     f2, 224(a2)
   4473     swc1     f4, 228(a2)
   4474     swc1     f6, 232(a2)
   4475     swc1     f8, 236(a2)
   4476     swc1     f10, 240(a2)
   4477     swc1     f12, 244(a2)
   4478     swc1     f14, 248(a2)
   4479     swc1     f16, 252(a2)
   4480 
   4481     j        ra                     // jump to the address in ra (other routines here spell this `jr ra`)
   4482      nop                            // branch delay slot
   4483 
   4484 END(jsimd_convsamp_float_mips_dspr2)
   4485 
   4486 /*****************************************************************************/
   4487 
   4488