Home | History | Annotate | Download | only in jpeg
      1 #
      2 # Copyright (C) 2011 The Android Open Source Project
      3 #
      4 # Licensed under the Apache License, Version 2.0 (the "License");
      5 # you may not use this file except in compliance with the License.
      6 # You may obtain a copy of the License at
      7 #
      8 #      http://www.apache.org/licenses/LICENSE-2.0
      9 #
     10 # Unless required by applicable law or agreed to in writing, software
     11 # distributed under the License is distributed on an "AS IS" BASIS,
     12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 # See the License for the specific language governing permissions and
     14 # limitations under the License.
     15 
     16 
     17 # IDCT implementation using the MIPS DSP ASE (little endian version)
     18 #
     19 # See MIPS Technologies Inc documents:
     20 # "JPEG Decoder Optimization for MIPS32(R) Cores"  MD00483
     21 #
     22 # "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP
     23 #       Application Specific Extension to the MIPS32(R) Architecture" MD00374
     24 #
     25 
     26         .set            noreorder               # branch delay slots in this file are filled by hand
     27         .set            nomacro                 # no instruction may expand into a multi-instruction macro
     28         .set            noat                    # $at is used explicitly as an ordinary register below
     29 
     30 # This table has been moved to mips_jidctfst.c to avoid having to mess
     31 # with the global pointer to make this code PIC.
        # Both routines in this file receive the table address as their fourth
        # argument and read the words at byte offsets 0, 4, 8 and 12.
        # As listed here, each word repeats one 16-bit constant in both halfwords,
        # and the constants are stored divided by 2 (or by 4 for the last one);
        # the code compensates with a saturating left shift after each multiply.
     32 #       .rdata
     33 #
     34 # mips_idct_coefs:
     35 #       # Constant table of scaled IDCT coefficients.
     36 #
     37 #       .word           0x45464546              # FIX( 1.082392200 / 2) =  17734 = 0x4546
     38 #       .word           0x5A825A82              # FIX( 1.414213562 / 2) =  23170 = 0x5A82
     39 #       .word           0x76427642              # FIX( 1.847759065 / 2) =  30274 = 0x7642
     40 #       .word           0xAC61AC61              # FIX(-2.613125930 / 4) = -21407 = 0xAC61
     41 
     42         .text
     43 
     44         .global         mips_idct_columns
     45         .ent            mips_idct_columns
     46 
     47 # void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr,
     48 #                        DCTELEM * wsptr, const int * mips_idct_coefs);
        #
        # Column pass of the IDCT.  Every coefficient is multiplied by its
        # quantization factor, the 1-D transform is applied down the column and
        # the eight results are stored to wsptr with the same 16-byte row stride
        # as the input.  Two adjacent columns are processed per loop iteration,
        # one 16-bit value in each half of a 32-bit register; the pointers
        # advance by 4 bytes per iteration and the loop ends at inptr + 16.
        #
        # Shortcut: when rows 1..7 of both columns contain only zeros, the
        # dequantized row-0 value is copied to all eight output rows.
        #
        # $s0-$s7 are saved on entry and restored on exit.  $at, $v0-$v1 and
        # $t0-$t9 are overwritten, and $a0-$a2 are advanced.
     49 
     50 mips_idct_columns:
     51 
     52 # $a0   - inptr
     53 # $a1   - quantptr
     54 # $a2   - wsptr
     55 # $a3, $at   - mips_idct_coefs ($a3 on entry, copied to $at)
     56 # $t0:7 - simd data (rows 0..7 of the current column pair)
     57 # $t8   - coefficients, temp
     58 # $t9   - loop end address
     59 # $s0:3 - simd quantization factors, later the final sums to store
     60 # $s4:7 - temp results
     61 # $v0:1 - temp results
     62 
     63         addiu           $sp, $sp, -32           # reserve stack space for s0-s7
     64 
     65         sw              $s0, 28($sp)
     66         sw              $s1, 24($sp)
     67         sw              $s2, 20($sp)
     68         sw              $s3, 16($sp)
     69         sw              $s4, 12($sp)
     70         sw              $s5,  8($sp)
     71         sw              $s6,  4($sp)
     72         sw              $s7,  0($sp)
     73 
     74         addiu           $t9, $a0, 16            # end address: one 16-byte row past inptr
     75 
     76         #lui            $at, %hi(mips_idct_coefs)
     77         #ori            $at, %lo(mips_idct_coefs)
     78         # move mips_idct_coefs address from $a3 into $at where the rest of this code expects it
     79         or              $at, $a3, $zero
     80 
     81 loop_columns:
     82 
     83         lw              $s0, 0($a1)             # quantptr[DCTSIZE*0]
     84 
     85         lw              $t0, 0($a0)             # inptr[DCTSIZE*0]
     86         lw              $t1, 16($a0)            # inptr[DCTSIZE*1]
     87 
     88         muleq_s.w.phl   $v0, $t0, $s0           # tmp0 ... (32-bit product of the upper halfwords)
     89 
     90         lw              $t2, 32($a0)            # inptr[DCTSIZE*2]
     91         lw              $t3, 48($a0)            # inptr[DCTSIZE*3]
     92         lw              $t4, 64($a0)            # inptr[DCTSIZE*4]
     93         lw              $t5, 80($a0)            # inptr[DCTSIZE*5]
     94 
     95         muleq_s.w.phr   $t0, $t0, $s0           # ... tmp0 ... (32-bit product of the lower halfwords)
     96 
     97         lw              $t6, 96($a0)            # inptr[DCTSIZE*6]
     98         lw              $t7, 112($a0)           # inptr[DCTSIZE*7]
     99 
    100         or              $s4, $t1, $t2           # nonzero if row 1 or 2 is nonzero in either column
    101         or              $s5, $t3, $t4           # same for rows 3 and 4
    102 
    103         bnez            $s4, full_column
    104         ins             $t0, $v0, 16, 16        # ... tmp0 (delay slot: low 16 bits of both products, re-paired)
    105 
    106         bnez            $s5, full_column
    107         or              $s6, $t5, $t6           # delay slot: rows 5 and 6
    108         or              $s6, $s6, $t7           # ... and row 7
    109         bnez            $s6, full_column
    110 
        # The first store below sits in the delay slot of the branch above, so it
        # also executes when full_column is taken; full_column stores
        # wsptr[DCTSIZE*0] again with the real result.
    111         sw              $t0, 0($a2)             # wsptr[DCTSIZE*0]
    112         sw              $t0, 16($a2)            # wsptr[DCTSIZE*1]
    113         sw              $t0, 32($a2)            # wsptr[DCTSIZE*2]
    114         sw              $t0, 48($a2)            # wsptr[DCTSIZE*3]
    115         sw              $t0, 64($a2)            # wsptr[DCTSIZE*4]
    116         sw              $t0, 80($a2)            # wsptr[DCTSIZE*5]
    117         sw              $t0, 96($a2)            # wsptr[DCTSIZE*6]
    118         sw              $t0, 112($a2)           # wsptr[DCTSIZE*7]
    119 
    120         addiu           $a0, $a0, 4             # next column pair of inptr
    121 
    122         b               continue_columns
    123         addiu           $a1, $a1, 4             # delay slot: next column pair of quantptr
    124 
    125 
    126 full_column:
    127 
        # Even part: rows 0, 2, 4 and 6.  Loads and multiplies are interleaved
        # by hand; each muleq pair is followed later by an ins that re-pairs
        # the two products into one register.
    128         lw              $s1, 32($a1)            # quantptr[DCTSIZE*2]
    129         lw              $s2, 64($a1)            # quantptr[DCTSIZE*4]
    130 
    131         muleq_s.w.phl   $v0, $t2, $s1           # tmp1 ...
    132         muleq_s.w.phr   $t2, $t2, $s1           # ... tmp1 ...
    133 
    134         lw              $s0, 16($a1)            # quantptr[DCTSIZE*1]
    135         lw              $s1, 48($a1)            # quantptr[DCTSIZE*3]
    136         lw              $s3, 96($a1)            # quantptr[DCTSIZE*6]
    137 
    138         muleq_s.w.phl   $v1, $t4, $s2           # tmp2 ...
    139         muleq_s.w.phr   $t4, $t4, $s2           # ... tmp2 ...
    140 
    141         lw              $s2, 80($a1)            # quantptr[DCTSIZE*5]
    142         lw              $t8, 4($at)             # FIX(1.414213562)
    143         ins             $t2, $v0, 16, 16        # ... tmp1
    144 
    145         muleq_s.w.phl   $v0, $t6, $s3           # tmp3 ...
    146         muleq_s.w.phr   $t6, $t6, $s3           # ... tmp3 ...
    147 
    148         ins             $t4, $v1, 16, 16        # ... tmp2
    149 
    150         addq.ph         $s4, $t0, $t4           # tmp10
    151         subq.ph         $s5, $t0, $t4           # tmp11
    152 
    153         ins             $t6, $v0, 16, 16        # ... tmp3
    154 
    155         subq.ph         $s6, $t2, $t6           # tmp12 ...
    156         addq.ph         $s7, $t2, $t6           # tmp13
    157 
    158         mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...
    159 
    160         addq.ph         $t0, $s4, $s7           # tmp0
    161         subq.ph         $t6, $s4, $s7           # tmp3
    162 
    163 ################ odd part: rows 1, 3, 5 and 7
    164 
    165         muleq_s.w.phl   $v0, $t1, $s0           # tmp4 ...
    166         muleq_s.w.phr   $t1, $t1, $s0           # ... tmp4 ...
    167 
    168         shll_s.ph       $s6, $s6, 1             # x2 (the table listing above stores half the constant)
    169 
    170         lw              $s3, 112($a1)           # quantptr[DCTSIZE*7]
    171 
    172         subq.ph         $s6, $s6, $s7           # ... tmp12
    173 
    174         muleq_s.w.phl   $v1, $t7, $s3           # tmp7 ...
    175         muleq_s.w.phr   $t7, $t7, $s3           # ... tmp7 ...
    176 
    177         ins             $t1, $v0, 16, 16        # ... tmp4
    178 
    179         addq.ph         $t2, $s5, $s6           # tmp1
    180         subq.ph         $t4, $s5, $s6           # tmp2
    181 
    182         muleq_s.w.phl   $v0, $t5, $s2           # tmp6 ...
    183         muleq_s.w.phr   $t5, $t5, $s2           # ... tmp6 ...
    184 
    185         ins             $t7, $v1, 16, 16        # ... tmp7
    186 
    187         addq.ph         $s5, $t1, $t7           # z11
    188         subq.ph         $s6, $t1, $t7           # z12
    189 
    190         muleq_s.w.phl   $v1, $t3, $s1           # tmp5 ...
    191         muleq_s.w.phr   $t3, $t3, $s1           # ... tmp5 ...
    192 
    193         ins             $t5, $v0, 16, 16        # ... tmp6
    194 
    195 # stalls
    196 
    197         ins             $t3, $v1, 16, 16        # ... tmp5
    198 
    199 
    200         addq.ph         $s7, $t5, $t3           # z13
    201         subq.ph         $v0, $t5, $t3           # z10
    202 
    203         addq.ph         $t7, $s5, $s7           # tmp7
    204         subq.ph         $s5, $s5, $s7           # tmp11 ...
    205 
    206         addq.ph         $v1, $v0, $s6           # z5 ...
    207 
    208         mulq_rs.ph      $s5, $s5, $t8           # ... tmp11 ($t8 still holds FIX(1.414213562))
    209 
    210         lw              $t8, 8($at)             # FIX(1.847759065)
    211         lw              $s4, 0($at)             # FIX(1.082392200)
    212 
    213         addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
    214         subq.ph         $s1, $t0, $t7           # tmp0 - tmp7
    215 
    216         mulq_rs.ph      $v1, $v1, $t8           # ... z5
    217 
    218         shll_s.ph       $s5, $s5, 1             # x2
    219 
    220         lw              $t8, 12($at)            # FIX(-2.613125930)
    221         sw              $s0, 0($a2)             # wsptr[DCTSIZE*0]
    222 
    223         mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
    224         mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...
    225 
    226         shll_s.ph       $v1, $v1, 1             # x2
    227 
    228         addiu           $a0, $a0, 4             # next column pair of inptr
    229         addiu           $a1, $a1, 4             # next column pair of quantptr
    230 
    231         sw              $s1, 112($a2)           # wsptr[DCTSIZE*7]
    232 
    233         shll_s.ph       $s6, $v0, 2             # x4
    234         shll_s.ph       $s4, $s4, 1             # x2
    235         addq.ph         $s6, $s6, $v1           # ... tmp12
    236 
    237         subq.ph         $t5, $s6, $t7           # tmp6
    238         subq.ph         $s4, $s4, $v1           # ... tmp10
    239         subq.ph         $t3, $s5, $t5           # tmp5
    240         addq.ph         $s2, $t2, $t5           # tmp1 + tmp6
    241         addq.ph         $t1, $s4, $t3           # tmp4
    242         subq.ph         $s3, $t2, $t5           # tmp1 - tmp6
    243 
    244         sw              $s2, 16($a2)            # wsptr[DCTSIZE*1]
    245         sw              $s3, 96($a2)            # wsptr[DCTSIZE*6]
    246 
    247         addq.ph         $v0, $t4, $t3           # tmp2 + tmp5
    248         subq.ph         $v1, $t4, $t3           # tmp2 - tmp5
    249 
    250         sw              $v0, 32($a2)            # wsptr[DCTSIZE*2]
    251         sw              $v1, 80($a2)            # wsptr[DCTSIZE*5]
    252 
    253         addq.ph         $v0, $t6, $t1           # tmp3 + tmp4
    254         subq.ph         $v1, $t6, $t1           # tmp3 - tmp4
    255 
    256         sw              $v0, 64($a2)            # wsptr[DCTSIZE*4]
    257         sw              $v1, 48($a2)            # wsptr[DCTSIZE*3]
    258 
    259 continue_columns:
    260 
    261         bne             $a0, $t9, loop_columns
    262         addiu           $a2, $a2, 4             # delay slot: next column pair of wsptr
    263 
    264 
    265         lw              $s0, 28($sp)
    266         lw              $s1, 24($sp)
    267         lw              $s2, 20($sp)
    268         lw              $s3, 16($sp)
    269         lw              $s4, 12($sp)
    270         lw              $s5,  8($sp)
    271         lw              $s6,  4($sp)
    272         lw              $s7,  0($sp)
    273 
    274         jr              $ra
    275         addiu           $sp, $sp, 32            # delay slot: release the stack frame
    276 
    277 
    278         .end            mips_idct_columns
    279 
    280 
    281 ##################################################################
    282 
    283 
    284         .global         mips_idct_rows
    285         .ent            mips_idct_rows
    286 
    287 # void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf,
    288 #                     JDIMENSION output_col, const int * mips_idct_coefs);
        #
        # Row pass of the IDCT.  Each loop iteration reads two consecutive
        # 16-byte rows of 16-bit values from wsptr and transforms both at once,
        # with the first row in the lower halfword of each register and the
        # second row in the upper halfword.  Every result is shifted left by
        # SHIFT with saturation, the high byte of each halfword is kept, 0x80 is
        # added to each byte, and 8 bytes are stored to each of the two output
        # rows at output_buf[i] + output_col.  wsptr advances by 32 bytes and
        # output_buf by two pointers per iteration; the loop ends at wsptr + 128.
        #
        # Shortcut: when elements 1..7 of both rows are zero, element 0 alone
        # supplies all eight output bytes of its row.
        #
        # $s0-$s8 are saved on entry and restored on exit.  $at, $v0-$v1,
        # $t0-$t9 and $a3 are overwritten, and $a0-$a1 are advanced.
    289 
    290 mips_idct_rows:
    291 
    292 # $a0   - wsptr
    293 # $a1   - output_buf
    294 # $a2   - output_col
    295 # $a3   - mips_idct_coefs on entry (saved at 36($sp)), afterwards outptr
    296 # $at   - mips_idct_coefs, reloaded every iteration; 2nd outptr in the shortcut path
    297 # $t0:7 - simd data
    298 # $t8   - coefficients, temp
    299 # $t9   - loop end address
    300 # $s0:3 - simd data, temp results (no quantization factors are used in this routine)
    301 # $s4:7 - simd data, temp results
    302 # $s8   - const 0x80808080
    303 # $v0:1 - temp results
    304 
    305 SHIFT   =               2                       # saturating left shift applied before the high bytes are extracted
    306 
    307         addiu           $sp, $sp, -48           # reserve stack space for s0-s8 and the saved $a3 (40 of 48 bytes used)
    308 
    309         # save $a3 (mips_idct_coefs) because it might get clobbered below
    310         sw              $a3, 36($sp)
    311 
    312         sw              $s0, 32($sp)
    313         sw              $s1, 28($sp)
    314         sw              $s2, 24($sp)
    315         sw              $s3, 20($sp)
    316         sw              $s4, 16($sp)
    317         sw              $s5, 12($sp)
    318         sw              $s6,  8($sp)
    319         sw              $s7,  4($sp)
    320         sw              $s8,  0($sp)
    321 
    322         addiu           $t9, $a0, 128           # end address: wsptr + 128 bytes (32 bytes per iteration)
    323 
    324         lui             $s8, 0x8080
    325         ori             $s8, $s8, 0x8080        # $s8 = 0x80808080, added to every output byte
    326 
    327 loop_rows:
    328 
    329         lw              $at, 36($sp)            # restore saved $a3 (mips_idct_coefs)
    330 
    331         lw              $t0, 0+0($a0)           # wsptr[DCTSIZE*0+0/1]  b a
    332         lw              $s0, 16+0($a0)          # wsptr[DCTSIZE*1+0/1]  B A
    333         lw              $t2, 0+4($a0)           # wsptr[DCTSIZE*0+2/3]  d c
    334         lw              $s2, 16+4($a0)          # wsptr[DCTSIZE*1+2/3]  D C
    335         lw              $t4, 0+8($a0)           # wsptr[DCTSIZE*0+4/5]  f e
    336         lw              $s4, 16+8($a0)          # wsptr[DCTSIZE*1+4/5]  F E
    337         lw              $t6, 0+12($a0)          # wsptr[DCTSIZE*0+6/7]  h g
    338         lw              $s6, 16+12($a0)         # wsptr[DCTSIZE*1+6/7]  H G
    339 
        # Regroup so that each register holds the same element of both rows:
        # upper halfword = second row, lower halfword = first row.
    340         precrq.ph.w     $t1, $s0, $t0           # B b
    341         ins             $t0, $s0, 16, 16        # A a
    342 
        # Test elements 1..7 of both rows for zero.  Every instruction that
        # follows a bnez is its delay slot and executes on both paths; the
        # values written to $s0 there are not read by full_row.
    343         bnez            $t1, full_row
    344         or              $s0, $t2, $s2           # elements 2/3 of both rows
    345         bnez            $s0, full_row
    346         or              $s0, $t4, $s4           # elements 4/5 of both rows
    347         bnez            $s0, full_row
    348         or              $s0, $t6, $s6           # elements 6/7 of both rows
    349         bnez            $s0, full_row
    350 
    351         shll_s.ph       $s0, $t0, SHIFT         # A a (delay slot of the branch above)
    352 
    353         lw              $a3, 0($a1)             # output_buf[0]
    354         lw              $at, 4($a1)             # output_buf[1] ($at is reloaded at loop_rows)
    355 
    356         precrq.ph.w     $t0, $s0, $s0           # A A
    357         ins             $s0, $s0, 16, 16        # a a
    358 
    359         addu            $a3, $a3, $a2           # outptr of the first row  = output_buf[0] + output_col
    360         addu            $at, $at, $a2           # outptr of the second row = output_buf[1] + output_col
    361 
    362         precrq.qb.ph    $t0, $t0, $t0           # A A A A
    363         precrq.qb.ph    $s0, $s0, $s0           # a a a a
    364 
    365 
    366         addu.qb         $s0, $s0, $s8           # add 0x80 to each byte
    367         addu.qb         $t0, $t0, $s8
    368 
    369 
    370         sw              $s0, 0($a3)             # first row, bytes 0..3
    371         sw              $s0, 4($a3)             # first row, bytes 4..7
    372 
    373         sw              $t0, 0($at)             # second row, bytes 0..3
    374         sw              $t0, 4($at)             # second row, bytes 4..7
    375 
    376 
    377         addiu           $a0, $a0, 32            # next pair of wsptr rows
    378 
    379         bne             $a0, $t9, loop_rows
    380         addiu           $a1, $a1, 8             # delay slot: next pair of output row pointers
    381 
    382         b               exit_rows
    383         nop
    384 
    385 
    386 full_row:
    387 
        # Finish regrouping: odd elements go to $t3/$t5/$t7 and even elements
        # stay in $t2/$t4/$t6, each holding both rows.
    388         precrq.ph.w     $t3, $s2, $t2           # D d
    389         ins             $t2, $s2, 16, 16        # C c
    390 
    391         precrq.ph.w     $t5, $s4, $t4           # F f
    392         ins             $t4, $s4, 16, 16        # E e
    393 
    394         precrq.ph.w     $t7, $s6, $t6           # H h
    395         ins             $t6, $s6, 16, 16        # G g
    396 
    397 
        # Even part: elements 0, 2, 4 and 6.
    398         lw              $t8, 4($at)             # FIX(1.414213562)
    399 
    400         addq.ph         $s4, $t0, $t4           # tmp10
    401         subq.ph         $s5, $t0, $t4           # tmp11
    402 
    403         subq.ph         $s6, $t2, $t6           # tmp12 ...
    404         addq.ph         $s7, $t2, $t6           # tmp13
    405 
    406         mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...
    407 
    408         addq.ph         $t0, $s4, $s7           # tmp0
    409         subq.ph         $t6, $s4, $s7           # tmp3
    410 
    411         shll_s.ph       $s6, $s6, 1             # x2 (the table listing above stores half the constant)
    412 
    413         subq.ph         $s6, $s6, $s7           # ... tmp12
    414 
    415         addq.ph         $t2, $s5, $s6           # tmp1
    416         subq.ph         $t4, $s5, $s6           # tmp2
    417 
    418 ################ odd part: elements 1, 3, 5 and 7
    419 
    420         addq.ph         $s5, $t1, $t7           # z11
    421         subq.ph         $s6, $t1, $t7           # z12
    422 
    423         addq.ph         $s7, $t5, $t3           # z13
    424         subq.ph         $v0, $t5, $t3           # z10
    425 
    426         addq.ph         $t7, $s5, $s7           # tmp7
    427         subq.ph         $s5, $s5, $s7           # tmp11 ...
    428 
    429         addq.ph         $v1, $v0, $s6           # z5 ...
    430 
    431         mulq_rs.ph      $s5, $s5, $t8           # ... tmp11 ($t8 still holds FIX(1.414213562))
    432 
    433         lw              $t8, 8($at)             # FIX(1.847759065)
    434         lw              $s4, 0($at)             # FIX(1.082392200)
    435 
    436         addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
    437         subq.ph         $s7, $t0, $t7           # tmp0 - tmp7
    438 
    439         mulq_rs.ph      $v1, $v1, $t8           # ... z5
    440 
    441         lw              $a3, 0($a1)             # output_buf[0] (overwrites $a3, hence the copy on the stack)
    442         lw              $t8, 12($at)            # FIX(-2.613125930)
    443 
    444         shll_s.ph       $s5, $s5, 1             # x2
    445 
    446         addu            $a3, $a3, $a2           # outptr of the first row = output_buf[0] + output_col
    447 
    448         mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
    449         mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...
    450 
    451         shll_s.ph       $v1, $v1, 1             # x2
    452 
    453         addiu           $a0, $a0, 32            # next pair of wsptr rows
    454         addiu           $a1, $a1, 8             # next pair of output row pointers
    455 
    456 
    457         shll_s.ph       $s6, $v0, 2             # x4
    458         shll_s.ph       $s4, $s4, 1             # x2
    459         addq.ph         $s6, $s6, $v1           # ... tmp12
    460 
    461         shll_s.ph       $s0, $s0, SHIFT         # scale output element 0
    462 
    463         subq.ph         $t5, $s6, $t7           # tmp6
    464         subq.ph         $s4, $s4, $v1           # ... tmp10
    465         subq.ph         $t3, $s5, $t5           # tmp5
    466 
    467         shll_s.ph       $s7, $s7, SHIFT         # scale output element 7
    468 
    469         addq.ph         $t1, $s4, $t3           # tmp4
    470 
    471 
    472         addq.ph         $s1, $t2, $t5           # tmp1 + tmp6
    473         subq.ph         $s6, $t2, $t5           # tmp1 - tmp6
    474 
    475         addq.ph         $s2, $t4, $t3           # tmp2 + tmp5
    476         subq.ph         $s5, $t4, $t3           # tmp2 - tmp5
    477 
    478         addq.ph         $s4, $t6, $t1           # tmp3 + tmp4
    479         subq.ph         $s3, $t6, $t1           # tmp3 - tmp4
    480 
    481 
        # $s0..$s7 now hold output elements 0..7 of both rows; scale 1..6.
    482         shll_s.ph       $s1, $s1, SHIFT
    483         shll_s.ph       $s2, $s2, SHIFT
    484         shll_s.ph       $s3, $s3, SHIFT
    485         shll_s.ph       $s4, $s4, SHIFT
    486         shll_s.ph       $s5, $s5, SHIFT
    487         shll_s.ph       $s6, $s6, SHIFT
    488 
    489 
        # Separate the two rows again: upper halfwords (second row) go to
        # $t0/$t2/$t4/$t6, lower halfwords (first row) stay in $s0/$s2/$s4/$s6.
    490         precrq.ph.w     $t0, $s1, $s0           # B A
    491         ins             $s0, $s1, 16, 16        # b a
    492 
    493         precrq.ph.w     $t2, $s3, $s2           # D C
    494         ins             $s2, $s3, 16, 16        # d c
    495 
    496         precrq.ph.w     $t4, $s5, $s4           # F E
    497         ins             $s4, $s5, 16, 16        # f e
    498 
    499         precrq.ph.w     $t6, $s7, $s6           # H G
    500         ins             $s6, $s7, 16, 16        # h g
    501 
        # Keep only the high byte of every halfword, four bytes per register.
    502         precrq.qb.ph    $t0, $t2, $t0           # D C B A
    503         precrq.qb.ph    $s0, $s2, $s0           # d c b a
    504 
    505         precrq.qb.ph    $t4, $t6, $t4           # H G F E
    506         precrq.qb.ph    $s4, $s6, $s4           # h g f e
    507 
    508 
    509         addu.qb         $s0, $s0, $s8           # add 0x80 to each byte
    510         addu.qb         $s4, $s4, $s8
    511 
    512 
    513         sw              $s0, 0($a3)             # outptr[0/1/2/3]       d c b a
    514         sw              $s4, 4($a3)             # outptr[4/5/6/7]       h g f e
    515 
    516         lw              $a3, -4($a1)            # second row pointer: $a1 was already advanced by 8
    517 
    518         addu.qb         $t0, $t0, $s8
    519 
    520         addu            $a3, $a3, $a2           # outptr of the second row
    521 
    522         addu.qb         $t4, $t4, $s8
    523 
    524 
    525         sw              $t0, 0($a3)             # outptr[0/1/2/3]       D C B A
    526 
    527         bne             $a0, $t9, loop_rows
    528         sw              $t4, 4($a3)             # outptr[4/5/6/7]       H G F E (delay slot)
    529 
    530 
    531 exit_rows:
    532 
    533         lw              $s0, 32($sp)
    534         lw              $s1, 28($sp)
    535         lw              $s2, 24($sp)
    536         lw              $s3, 20($sp)
    537         lw              $s4, 16($sp)
    538         lw              $s5, 12($sp)
    539         lw              $s6,  8($sp)
    540         lw              $s7,  4($sp)
    541         lw              $s8,  0($sp)
    542 
    543         jr              $ra
    544         addiu           $sp, $sp, 48            # delay slot: release the stack frame
    545 
    546 
    547         .end            mips_idct_rows
    548