/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Syntax: GNU as, 32-bit ARM state, pre-UAL mnemonics (e.g. "orreqs"),
// run through the C preprocessor (PLD and the feature macros come from
// <machine/cpu-features.h>).

#include <machine/cpu-features.h>

    .text
    .arm
    .align

    .global jpeg_idct_ifast
    // Mark the symbol as a function (STT_FUNC) so the linker can apply
    // ARM/Thumb interworking to calls; .func alone only emits debug info.
    .type   jpeg_idct_ifast, %function
    .func   jpeg_idct_ifast

// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15

// jpeg_idct_ifast (j_decompress_ptr       cinfo,
//                 jpeg_component_info *   compptr,
//                 short*                  coef_block,
//                 unsigned char*          output_buf,
//                 int                     output_col)
//
// Dequantizes one 8x8 block of 16-bit coefficients and applies a
// fixed-point inverse DCT in two passes:
//   pass 1 (VLoop*): one column per iteration, coef_block -> 8x8 table of
//                    32-bit ints kept on the stack (the "workspace");
//   pass 2 (HLoop*): one workspace row per iteration -> 8 output bytes.
//
// In:    r0     = cinfo       (not read by this routine)
//        r1     = compptr     (quant table pointer read at
//                              [r1, #off_COMPINFO_quanttable])
//        r2     = coef_block  (8x8 halfwords, read with ldrsh)
//        r3     = output_buf  (array of row pointers, one per output row)
//        [sp]   = output_col  (byte offset added to each row pointer)
// Out:   nothing; 8 bytes are stored to each of 8 output rows.
// Saves: r4-r12 and lr are pushed on entry and restored on exit.
// Stack: 10 words of saved registers + local_SIZE bytes of locals.
//
// NOTE(review): each output row is written with "stmia fp, {r0, r1}", so
// output_buf[row] + output_col is presumably expected to be word aligned;
// confirm against the callers.

// Frame layout (offsets from sp after "sub sp, #local_SIZE")
#define  local_TMP0123       sp
#define  local_TMP0          [sp, #0]
#define  local_TMP1          [sp, #4]
#define  local_TMP2          [sp, #8]
#define  local_TMP3          [sp, #12]
// [sp, #16] is a spare slot, kept so that the offsets below stay unchanged
#define  local_OUTPUT_COL    [sp, #20]
#define  local_OUTPUT_BUF    [sp, #24]
#define  local_UNUSED        [sp, #28]
#define  off_WORKSPACE       32
#define  local_WORKSPACE     [sp, #off_WORKSPACE]
#define  local_SIZE          (off_WORKSPACE + 8*8*4)

// Hard-coded byte offset of the quantization table pointer inside
// jpeg_component_info; must match the C structure layout in use.
#define  off_COMPINFO_quanttable          80

#define  DCTSIZE   8
// Byte offset of row x in the coefficient block (16-bit entries) and in
// the quant table / workspace (32-bit entries)
#define  VY(x)   ((x)*DCTSIZE*2)
#define  QY(x)   ((x)*DCTSIZE*4)

// Byte offset of column x for 16-bit and 32-bit entries
#define  VX(x)   ((x)*2)
#define  QX(x)   ((x)*4)

// Fixed-point constants scaled by 256 (products are shifted with asr #8).
// Kept for reference only: the code below materializes these values with
// mov/add pairs or mla instead of expanding the macros.
#define  FIX_1_414213562    #362
#define  FIX_1_082392200    #277
#define  FIX_1_847759065    #473
#define  FIX_2_613125930    #669

// Kept for reference only (appears in the quoted C statements of pass 2)
#define  RANGE_MASK   1023



jpeg_idct_ifast:
    PLD     (r2, #0)
    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    ldr     r4, [sp, #4*10]                             // r4 = output_col (5th argument, above the 10 saved words)
    sub     sp, #local_SIZE

    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
    str     r4, local_OUTPUT_COL
    str     r3, local_OUTPUT_BUF
    mov     fp, r2                                      // fp = coef_block
    add     ip, sp, #off_WORKSPACE                      // ip = workspace

// ---------------------------------------------------------------------
// Pass 1: columns.  Per iteration: fp = input column, r10 = quant table
// column, ip = workspace column; all three advance by one entry.
// ---------------------------------------------------------------------
VLoopTail:
    ldrsh    r0, [fp, #VY(0)]
    ldrsh    r1, [fp, #VY(1)]
    ldrsh    r2, [fp, #VY(2)]
    ldrsh    r3, [fp, #VY(3)]
    ldrsh    r4, [fp, #VY(4)]
    ldrsh    r5, [fp, #VY(5)]
    ldrsh    r6, [fp, #VY(6)]
    ldrsh    r7, [fp, #VY(7)]

    // Z is still set after this chain only if r1..r7 are all zero
    cmp      r1, #0
    orreqs   r8, r2, r3
    orreqs   r8, r4, r5
    orreqs   r8, r6, r7
    beq      VLoopHeadZero

VLoopHead:
    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])   (r0)
    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])   (r4)
    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])   (r2)
    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])   (r6)
    // tmp10 = tmp0 + tmp2   (r0)
    // tmp11 = tmp0 - tmp2   (r4)

    ldr      r9, [r10, #QY(4)]
    ldr      r8, [r10, #QY(0)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb   r4, r9, r4
    smlabb   r0, r8, r0, r4
#else
    mul      r4, r9, r4
    mul      r0, r8, r0
    add      r0, r4
#endif
    ldr      r9, [r10, #QY(6)]
    ldr      r8, [r10, #QY(2)]
    sub      r4, r0, r4, lsl #1                         // (a + b) - 2*b = a - b
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb   r6, r9, r6
    smlabb   r2, r8, r2, r6
#else
    mul      r6, r9, r6
    mul      r2, r8, r2
    add      r2, r6
#endif

    // tmp13 = tmp1 + tmp3                                       (r2)
    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
    // FIX_1_4142... = 362 = 45*8 + 2
    sub      r6, r2, r6, lsl #1
    mov      r8, #360
    add      r8, r8, #2
    mul      r9, r6, r8

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r9, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    // spill tmp0..tmp3; reloaded into r0..r3 once the odd part is done
    stmia   local_TMP0123, {r0, r4, r6, r8}

    // NOTE: be sure to not use r0,r4,r6,r8 soon after stm above

    // odd part
    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
    // z13 = tmp6 + tmp5;  (r0)
    // z10 = tmp6 - tmp5;  (r2)
    // z11 = tmp4 + tmp7;  (r4)
    // z12 = tmp4 - tmp7;  (r6)

    ldr     r2, [r10, #QY(1)]
    ldr     r9, [r10, #QY(5)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r1, r2, r1
#else
    mul     r1, r2, r1
#endif
    ldr     r2, [r10, #QY(3)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r5, r9, r5
#else
    mul     r5, r9, r5
#endif
    ldr     r9, [r10, #QY(7)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smlabb  r0, r2, r3, r5
    smlabb  r4, r9, r7, r1
#else
    mul     r0, r2, r3
    add     r0, r5
    mul     r4, r9, r7
    add     r4, r1
#endif
    rsb  r2, r0, r5, lsl #1                             // 2*tmp6 - z13 = tmp6 - tmp5
    rsb  r6, r4, r1, lsl #1                             // 2*tmp4 - z11 = tmp4 - tmp7

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add   r7, r4, r0
    sub   r1, r4, r0
    mov   r8, #360
    add   r8, r8, #2
    mul   r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // (x*(K-1) + x is done in a single mla; products stay unshifted until
    //  the asr #8 operands below)
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    rsb     r2, r2, r8

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb  r6, r7, r2, asr #8
    rsb  r5, r6, r1, asr #8
    add  r4, r5, r0, asr #8

    ldmia local_TMP0123, {r0, r1, r2, r3}               // r0..r3 = tmp0..tmp3

    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);

    add   r0, r0, r7
    sub   r7, r0, r7, lsl #1
    add   r1, r1, r6
    sub   r6, r1, r6, lsl #1
    add   r2, r2, r5
    sub   r5, r2, r5, lsl #1
    sub   r3, r3, r4
    add   r4, r3, r4, lsl #1

    str   r0, [ip, #QY(0)]
    str   r1, [ip, #QY(1)]
    str   r2, [ip, #QY(2)]
    str   r3, [ip, #QY(3)]
    str   r4, [ip, #QY(4)]
    str   r5, [ip, #QY(5)]
    str   r6, [ip, #QY(6)]
    str   r7, [ip, #QY(7)]

    // inptr++;                    /* advance pointers to next column */
    // quantptr++;
    // wsptr++;
    add  fp, fp, #2
    add  r10, r10, #4
    add  ip, ip, #4
    add  r0, sp, #(off_WORKSPACE + 4*8)                 // one past the last column of workspace row 0
    cmp  ip, r0
    bne  VLoopTail



// ---------------------------------------------------------------------
// Pass 2: rows.  ip walks the workspace one row (8 words) per iteration.
// ---------------------------------------------------------------------
HLoopStart:
    // reset pointers
    PLD     (sp, #off_WORKSPACE)
    add     ip, sp, #off_WORKSPACE

HLoopTail:
    // output = *output_buf++ + output_col
    ldr      r0, local_OUTPUT_BUF
    ldr      r1, local_OUTPUT_COL
    ldr      r2, [r0], #4
    str      r0, local_OUTPUT_BUF
    add      fp, r2, r1                                 // fp = destination of this row

    PLD      (ip, #32)
    ldmia    ip!, {r0-r7}                               // r0..r7 = wsptr[0..7]; ip -> next row

    // Z is still set after this chain only if r1..r7 are all zero
    cmp      r1, #0
    orreqs   r8, r2, r3
    orreqs   r8, r4, r5
    orreqs   r8, r6, r7
    beq      HLoopTailZero

HLoopHead:
    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
    add     r0, r0, r4
    sub     r4, r0, r4, lsl #1

    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
    // FIX_... = 360 + 2
    add     r2, r2, r6
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r6, r8, r6

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r6, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    // spill tmp0..tmp3; reloaded into r0..r3 once the odd part is done
    stmia   local_TMP0123, {r0, r4, r6, r8}

    // Odd part

    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
    add  r0, r5, r3
    sub  r2, r5, r3
    add  r4, r1, r7
    sub  r6, r1, r7

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add   r7, r4, r0
    sub   r1, r4, r0
    mov   r8, #360
    add   r8, r8, #2
    mul   r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    add  r8, r2, r6
    mov  r9, #472
    mla  r8, r9, r8, r8
    mov  r9, #276
    mla  r0, r6, r9, r6
    mov  r9, #668
    mla  r2, r9, r2, r2
    sub  r0, r0, r8
    sub  r2, r8, r2

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb  r6, r7, r2, asr #8
    rsb  r5, r6, r1, asr #8
    add  r4, r5, r0, asr #8

    ldmia local_TMP0123, {r0, r1, r2, r3}               // r0..r3 = tmp0..tmp3

    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
    //
    // No table lookup is performed here: each value is shifted right by 5,
    // biased by 128 and then clamped to 0..255 directly.

    mov    r8, #128
    add    r0, r0, r7
    sub    r7, r0, r7, lsl #1
    add    r0, r8, r0, asr #5
    add    r7, r8, r7, asr #5
    add    r1, r1, r6
    sub    r6, r1, r6, lsl #1
    add    r1, r8, r1, asr #5
    add    r6, r8, r6, asr #5
    add    r2, r2, r5
    sub    r5, r2, r5, lsl #1
    add    r2, r8, r2, asr #5
    add    r5, r8, r5, asr #5
    sub    r3, r3, r4
    add    r4, r3, r4, lsl #1
    add    r3, r8, r3, asr #5
    add    r4, r8, r4, asr #5

#if __ARM_ARCH__ >= 6
    usat   r0, #8, r0
    usat   r1, #8, r1
    usat   r2, #8, r2
    usat   r3, #8, r3
    usat   r4, #8, r4
    usat   r5, #8, r5
    usat   r6, #8, r6
    usat   r7, #8, r7
#else
    // Manual clamp: "hi" (unsigned >255) is taken for both negative and
    // too-large values.  mvn of (x asr #31) yields 0 for a negative x and
    // all ones for a positive x; the and then turns all ones into 255.
    // r3 and r7 skip the and on purpose: they are packed with lsl #24
    // below, which discards their upper 24 bits anyway.
    cmp    r0, #255
    mvnhi  r0, r0, asr #31
    andhi  r0, #255
    cmp    r7, #255
    mvnhi  r7, r7, asr #31
    cmp    r1, #255
    mvnhi  r1, r1, asr #31
    andhi  r1, #255
    cmp    r6, #255
    mvnhi  r6, r6, asr #31
    andhi  r6, #255
    cmp    r2, #255
    mvnhi  r2, r2, asr #31
    andhi  r2, #255
    cmp    r5, #255
    mvnhi  r5, r5, asr #31
    andhi  r5, #255
    cmp    r3, #255
    mvnhi  r3, r3, asr #31
    cmp    r4, #255
    mvnhi  r4, r4, asr #31
    andhi  r4, #255
#endif

    // r3 r2 r1 r0
    orr    r0, r0, r1, lsl #8
    orr    r0, r0, r2, lsl #16
    orr    r0, r0, r3, lsl #24

    // r7 r6 r5 r4
    orr    r1, r4, r5, lsl #8
    orr    r1, r1, r6, lsl #16
    orr    r1, r1, r7, lsl #24
    stmia  fp, {r0, r1}

    add    r0, sp, #(off_WORKSPACE + 8*8*4)             // one past the end of the workspace
    cmp    ip, r0
    bne    HLoopTail

Exit:
    add    sp, sp, #local_SIZE
    ldmia  sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    bx     lr


// Pass 1 shortcut: every workspace entry of the column gets the
// dequantized DC term.
VLoopHeadZero:
// ok, all AC coefficients are 0
    ldr      r1, [r10, #QY(0)]
    add      fp, fp, #2
    add      r10, r10, #4
    mul      r0, r1, r0
    str      r0, [ip, #QY(0)]
    str      r0, [ip, #QY(1)]
    str      r0, [ip, #QY(2)]
    str      r0, [ip, #QY(3)]
    str      r0, [ip, #QY(4)]
    str      r0, [ip, #QY(5)]
    str      r0, [ip, #QY(6)]
    str      r0, [ip, #QY(7)]
    add      ip, ip, #4
    add      r0, sp, #(off_WORKSPACE + 4*8)
    cmp      ip, r0
    beq      HLoopStart
    b        VLoopTail

// Pass 2 shortcut: wsptr[1..7] are all zero, so all 8 output bytes of the
// row equal the descaled, biased and clamped wsptr[0].
HLoopTailZero:
    mov      r0, r0, asr #5
    add      r0, #128

#if __ARM_ARCH__ >= 6
    usat     r0, #8, r0
#else
    cmp      r0, #255
    mvnhi    r0, r0, asr #31
    andhi    r0, r0, #255
#endif

    // replicate the byte into all four lanes of r0, then duplicate the word
    orr      r0, r0, r0, lsl #8
    orr      r0, r0, r0, lsl #16
    mov      r1, r0
    stmia    fp, {r0, r1}

    add      r0, sp, #(off_WORKSPACE + 64*4)            // one past the end of the workspace
    cmp      ip, r0
    beq      Exit
    b        HLoopTail

    .size   jpeg_idct_ifast, . - jpeg_idct_ifast
    .endfunc