Home | History | Annotate | Download | only in libjpeg-turbo
      1 /*
      2  * jidctint.c
      3  *
      4  * This file was part of the Independent JPEG Group's software.
      5  * Copyright (C) 1991-1998, Thomas G. Lane.
      6  * Modification developed 2002-2009 by Guido Vollbeding.
      7  * libjpeg-turbo Modifications:
      8  * Copyright (C) 2015, D. R. Commander.
      9  * For conditions of distribution and use, see the accompanying README.ijg
     10  * file.
     11  *
     12  * This file contains a slow-but-accurate integer implementation of the
     13  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
     14  * must also perform dequantization of the input coefficients.
     15  *
     16  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
     17  * on each row (or vice versa, but it's more convenient to emit a row at
     18  * a time).  Direct algorithms are also available, but they are much more
     19  * complex and seem not to be any faster when reduced to code.
     20  *
     21  * This implementation is based on an algorithm described in
     22  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
     23  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
     24  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
     25  * The primary algorithm described there uses 11 multiplies and 29 adds.
     26  * We use their alternate method with 12 multiplies and 32 adds.
     27  * The advantage of this method is that no data path contains more than one
     28  * multiplication; this allows a very simple and accurate implementation in
     29  * scaled fixed-point arithmetic, with a minimal number of shifts.
     30  *
     31  * We also provide IDCT routines with various output sample block sizes for
     32  * direct resolution reduction or enlargement without additional resampling:
     33  * NxN (N=1...16) pixels for one 8x8 input DCT block.
     34  *
     35  * For N<8 we simply take the corresponding low-frequency coefficients of
     36  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
     37  * to yield the downscaled outputs.
     38  * This can be seen as direct low-pass downsampling from the DCT domain
     39  * point of view rather than the usual spatial domain point of view,
     40  * yielding significant computational savings and results at least
     41  * as good as common bilinear (averaging) spatial downsampling.
     42  *
     43  * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as
     44  * lower frequencies and higher frequencies assumed to be zero.
     45  * It turns out that the computational effort is similar to the 8x8 IDCT
     46  * regarding the output size.
     47  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
     48  *
     49  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
     50  * since there would be too many additional constants to pre-calculate.
     51  */
     52 
     53 #define JPEG_INTERNALS
     54 #include "jinclude.h"
     55 #include "jpeglib.h"
     56 #include "jdct.h"               /* Private declarations for DCT subsystem */
     57 
     58 #ifdef DCT_ISLOW_SUPPORTED
     59 
     60 
     61 /*
     62  * This module is specialized to the case DCTSIZE = 8.
     63  */
     64 
     65 #if DCTSIZE != 8
     66   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
     67 #endif
     68 
     69 
     70 /*
     71  * The poop on this scaling stuff is as follows:
     72  *
     73  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
     74  * larger than the true IDCT outputs.  The final outputs are therefore
     75  * a factor of N larger than desired; since N=8 this can be cured by
     76  * a simple right shift at the end of the algorithm.  The advantage of
     77  * this arrangement is that we save two multiplications per 1-D IDCT,
     78  * because the y0 and y4 inputs need not be divided by sqrt(N).
     79  *
     80  * We have to do addition and subtraction of the integer inputs, which
     81  * is no problem, and multiplication by fractional constants, which is
     82  * a problem to do in integer arithmetic.  We multiply all the constants
     83  * by CONST_SCALE and convert them to integer constants (thus retaining
     84  * CONST_BITS bits of precision in the constants).  After doing a
     85  * multiplication we have to divide the product by CONST_SCALE, with proper
     86  * rounding, to produce the correct output.  This division can be done
     87  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
     88  * as long as possible so that partial sums can be added together with
     89  * full fractional precision.
     90  *
     91  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
     92  * they are represented to better-than-integral precision.  These outputs
     93  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
     94  * with the recommended scaling.  (To scale up 12-bit sample data further, an
     95  * intermediate JLONG array would be needed.)
     96  *
     97  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
     98  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
     99  * shows that the values given below are the most effective.
    100  */
    101 
    102 #if BITS_IN_JSAMPLE == 8
    103 #define CONST_BITS  13
    104 #define PASS1_BITS  2
    105 #else
    106 #define CONST_BITS  13
    107 #define PASS1_BITS  1           /* lose a little precision to avoid overflow */
    108 #endif
    109 
    110 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
    111  * causing a lot of useless floating-point operations at run time.
    112  * To get around this we use the following pre-calculated constants.
    113  * If you change CONST_BITS you may want to add appropriate values.
    114  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
    115  */
    116 
    117 #if CONST_BITS == 13
    118 #define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
    119 #define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
    120 #define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
    121 #define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
    122 #define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
    123 #define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
    124 #define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
    125 #define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
    126 #define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
    127 #define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
    128 #define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
    129 #define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
    130 #else
    131 #define FIX_0_298631336  FIX(0.298631336)
    132 #define FIX_0_390180644  FIX(0.390180644)
    133 #define FIX_0_541196100  FIX(0.541196100)
    134 #define FIX_0_765366865  FIX(0.765366865)
    135 #define FIX_0_899976223  FIX(0.899976223)
    136 #define FIX_1_175875602  FIX(1.175875602)
    137 #define FIX_1_501321110  FIX(1.501321110)
    138 #define FIX_1_847759065  FIX(1.847759065)
    139 #define FIX_1_961570560  FIX(1.961570560)
    140 #define FIX_2_053119869  FIX(2.053119869)
    141 #define FIX_2_562915447  FIX(2.562915447)
    142 #define FIX_3_072711026  FIX(3.072711026)
    143 #endif
    144 
    145 
    146 /* Multiply an JLONG variable by an JLONG constant to yield an JLONG result.
    147  * For 8-bit samples with the recommended scaling, all the variable
    148  * and constant values involved are no more than 16 bits wide, so a
    149  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
    150  * For 12-bit samples, a full 32-bit multiplication will be needed.
    151  */
    152 
    153 #if BITS_IN_JSAMPLE == 8
    154 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
    155 #else
    156 #define MULTIPLY(var,const)  ((var) * (const))
    157 #endif
    158 
    159 
    160 /* Dequantize a coefficient by multiplying it by the multiplier-table
    161  * entry; produce an int result.  In this module, both inputs and result
    162  * are 16 bits or less, so either int or short multiply will work.
    163  */
    164 
    165 #define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
    166 
    167 
    168 /*
    169  * Perform dequantization and inverse DCT on one block of coefficients.
    170  */
    171 
    172 GLOBAL(void)
    173 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
    174                  JCOEFPTR coef_block,
    175                  JSAMPARRAY output_buf, JDIMENSION output_col)
    176 {
    177   JLONG tmp0, tmp1, tmp2, tmp3;
    178   JLONG tmp10, tmp11, tmp12, tmp13;
    179   JLONG z1, z2, z3, z4, z5;
    180   JCOEFPTR inptr;
    181   ISLOW_MULT_TYPE *quantptr;
    182   int *wsptr;
    183   JSAMPROW outptr;
    184   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    185   int ctr;
    186   int workspace[DCTSIZE2];      /* buffers data between passes */
    187   SHIFT_TEMPS
    188 
    189   /* Pass 1: process columns from input, store into work array. */
    190   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
    191   /* furthermore, we scale the results by 2**PASS1_BITS. */
    192 
    193   inptr = coef_block;
    194   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    195   wsptr = workspace;
    196   for (ctr = DCTSIZE; ctr > 0; ctr--) {
    197     /* Due to quantization, we will usually find that many of the input
    198      * coefficients are zero, especially the AC terms.  We can exploit this
    199      * by short-circuiting the IDCT calculation for any column in which all
    200      * the AC terms are zero.  In that case each output is equal to the
    201      * DC coefficient (with scale factor as needed).
    202      * With typical images and quantization tables, half or more of the
    203      * column DCT calculations can be simplified this way.
    204      */
    205 
    206     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
    207         inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
    208         inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
    209         inptr[DCTSIZE*7] == 0) {
    210       /* AC terms all zero */
    211       int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
    212                              PASS1_BITS);
    213 
    214       wsptr[DCTSIZE*0] = dcval;
    215       wsptr[DCTSIZE*1] = dcval;
    216       wsptr[DCTSIZE*2] = dcval;
    217       wsptr[DCTSIZE*3] = dcval;
    218       wsptr[DCTSIZE*4] = dcval;
    219       wsptr[DCTSIZE*5] = dcval;
    220       wsptr[DCTSIZE*6] = dcval;
    221       wsptr[DCTSIZE*7] = dcval;
    222 
    223       inptr++;                  /* advance pointers to next column */
    224       quantptr++;
    225       wsptr++;
    226       continue;
    227     }
    228 
    229     /* Even part: reverse the even part of the forward DCT. */
    230     /* The rotator is sqrt(2)*c(-6). */
    231 
    232     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    233     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
    234 
    235     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    236     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
    237     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
    238 
    239     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    240     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    241 
    242     tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
    243     tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
    244 
    245     tmp10 = tmp0 + tmp3;
    246     tmp13 = tmp0 - tmp3;
    247     tmp11 = tmp1 + tmp2;
    248     tmp12 = tmp1 - tmp2;
    249 
    250     /* Odd part per figure 8; the matrix is unitary and hence its
    251      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    252      */
    253 
    254     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
    255     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    256     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    257     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    258 
    259     z1 = tmp0 + tmp3;
    260     z2 = tmp1 + tmp2;
    261     z3 = tmp0 + tmp2;
    262     z4 = tmp1 + tmp3;
    263     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
    264 
    265     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    266     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    267     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    268     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    269     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    270     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    271     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    272     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    273 
    274     z3 += z5;
    275     z4 += z5;
    276 
    277     tmp0 += z1 + z3;
    278     tmp1 += z2 + z4;
    279     tmp2 += z2 + z3;
    280     tmp3 += z1 + z4;
    281 
    282     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    283 
    284     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
    285     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
    286     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
    287     wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
    288     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
    289     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
    290     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
    291     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
    292 
    293     inptr++;                    /* advance pointers to next column */
    294     quantptr++;
    295     wsptr++;
    296   }
    297 
    298   /* Pass 2: process rows from work array, store into output array. */
    299   /* Note that we must descale the results by a factor of 8 == 2**3, */
    300   /* and also undo the PASS1_BITS scaling. */
    301 
    302   wsptr = workspace;
    303   for (ctr = 0; ctr < DCTSIZE; ctr++) {
    304     outptr = output_buf[ctr] + output_col;
    305     /* Rows of zeroes can be exploited in the same way as we did with columns.
    306      * However, the column calculation has created many nonzero AC terms, so
    307      * the simplification applies less often (typically 5% to 10% of the time).
    308      * On machines with very fast multiplication, it's possible that the
    309      * test takes more time than it's worth.  In that case this section
    310      * may be commented out.
    311      */
    312 
    313 #ifndef NO_ZERO_ROW_TEST
    314     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
    315         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
    316       /* AC terms all zero */
    317       JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
    318                                   & RANGE_MASK];
    319 
    320       outptr[0] = dcval;
    321       outptr[1] = dcval;
    322       outptr[2] = dcval;
    323       outptr[3] = dcval;
    324       outptr[4] = dcval;
    325       outptr[5] = dcval;
    326       outptr[6] = dcval;
    327       outptr[7] = dcval;
    328 
    329       wsptr += DCTSIZE;         /* advance pointer to next row */
    330       continue;
    331     }
    332 #endif
    333 
    334     /* Even part: reverse the even part of the forward DCT. */
    335     /* The rotator is sqrt(2)*c(-6). */
    336 
    337     z2 = (JLONG) wsptr[2];
    338     z3 = (JLONG) wsptr[6];
    339 
    340     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    341     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
    342     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
    343 
    344     tmp0 = LEFT_SHIFT((JLONG) wsptr[0] + (JLONG) wsptr[4], CONST_BITS);
    345     tmp1 = LEFT_SHIFT((JLONG) wsptr[0] - (JLONG) wsptr[4], CONST_BITS);
    346 
    347     tmp10 = tmp0 + tmp3;
    348     tmp13 = tmp0 - tmp3;
    349     tmp11 = tmp1 + tmp2;
    350     tmp12 = tmp1 - tmp2;
    351 
    352     /* Odd part per figure 8; the matrix is unitary and hence its
    353      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    354      */
    355 
    356     tmp0 = (JLONG) wsptr[7];
    357     tmp1 = (JLONG) wsptr[5];
    358     tmp2 = (JLONG) wsptr[3];
    359     tmp3 = (JLONG) wsptr[1];
    360 
    361     z1 = tmp0 + tmp3;
    362     z2 = tmp1 + tmp2;
    363     z3 = tmp0 + tmp2;
    364     z4 = tmp1 + tmp3;
    365     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
    366 
    367     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    368     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    369     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    370     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    371     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    372     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    373     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    374     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    375 
    376     z3 += z5;
    377     z4 += z5;
    378 
    379     tmp0 += z1 + z3;
    380     tmp1 += z2 + z4;
    381     tmp2 += z2 + z3;
    382     tmp3 += z1 + z4;
    383 
    384     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    385 
    386     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
    387                                           CONST_BITS+PASS1_BITS+3)
    388                             & RANGE_MASK];
    389     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
    390                                           CONST_BITS+PASS1_BITS+3)
    391                             & RANGE_MASK];
    392     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
    393                                           CONST_BITS+PASS1_BITS+3)
    394                             & RANGE_MASK];
    395     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
    396                                           CONST_BITS+PASS1_BITS+3)
    397                             & RANGE_MASK];
    398     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
    399                                           CONST_BITS+PASS1_BITS+3)
    400                             & RANGE_MASK];
    401     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
    402                                           CONST_BITS+PASS1_BITS+3)
    403                             & RANGE_MASK];
    404     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
    405                                           CONST_BITS+PASS1_BITS+3)
    406                             & RANGE_MASK];
    407     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
    408                                           CONST_BITS+PASS1_BITS+3)
    409                             & RANGE_MASK];
    410 
    411     wsptr += DCTSIZE;           /* advance pointer to next row */
    412   }
    413 }
    414 
    415 #ifdef IDCT_SCALING_SUPPORTED
    416 
    417 
    418 /*
    419  * Perform dequantization and inverse DCT on one block of coefficients,
    420  * producing a 7x7 output block.
    421  *
    422  * Optimized algorithm with 12 multiplications in the 1-D kernel.
    423  * cK represents sqrt(2) * cos(K*pi/14).
    424  */
    425 
    426 GLOBAL(void)
    427 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
    428                JCOEFPTR coef_block,
    429                JSAMPARRAY output_buf, JDIMENSION output_col)
    430 {
    431   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
    432   JLONG z1, z2, z3;
    433   JCOEFPTR inptr;
    434   ISLOW_MULT_TYPE *quantptr;
    435   int *wsptr;
    436   JSAMPROW outptr;
    437   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    438   int ctr;
    439   int workspace[7*7];   /* buffers data between passes */
    440   SHIFT_TEMPS
    441 
    442   /* Pass 1: process columns from input, store into work array. */
    443 
    444   inptr = coef_block;
    445   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    446   wsptr = workspace;
    447   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
    448     /* Even part */
    449 
    450     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    451     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
    452     /* Add fudge factor here for final descale. */
    453     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
    454 
    455     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    456     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    457     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
    458 
    459     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
    460     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
    461     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
    462     tmp0 = z1 + z3;
    463     z2 -= tmp0;
    464     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
    465     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
    466     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
    467     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
    468 
    469     /* Odd part */
    470 
    471     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    472     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    473     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    474 
    475     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
    476     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
    477     tmp0 = tmp1 - tmp2;
    478     tmp1 += tmp2;
    479     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
    480     tmp1 += tmp2;
    481     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
    482     tmp0 += z2;
    483     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
    484 
    485     /* Final output stage */
    486 
    487     wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    488     wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    489     wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
    490     wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
    491     wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
    492     wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
    493     wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
    494   }
    495 
    496   /* Pass 2: process 7 rows from work array, store into output array. */
    497 
    498   wsptr = workspace;
    499   for (ctr = 0; ctr < 7; ctr++) {
    500     outptr = output_buf[ctr] + output_col;
    501 
    502     /* Even part */
    503 
    504     /* Add fudge factor here for final descale. */
    505     tmp13 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
    506     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
    507 
    508     z1 = (JLONG) wsptr[2];
    509     z2 = (JLONG) wsptr[4];
    510     z3 = (JLONG) wsptr[6];
    511 
    512     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
    513     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
    514     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
    515     tmp0 = z1 + z3;
    516     z2 -= tmp0;
    517     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
    518     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
    519     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
    520     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
    521 
    522     /* Odd part */
    523 
    524     z1 = (JLONG) wsptr[1];
    525     z2 = (JLONG) wsptr[3];
    526     z3 = (JLONG) wsptr[5];
    527 
    528     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
    529     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
    530     tmp0 = tmp1 - tmp2;
    531     tmp1 += tmp2;
    532     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
    533     tmp1 += tmp2;
    534     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
    535     tmp0 += z2;
    536     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
    537 
    538     /* Final output stage */
    539 
    540     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    541                                               CONST_BITS+PASS1_BITS+3)
    542                             & RANGE_MASK];
    543     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    544                                               CONST_BITS+PASS1_BITS+3)
    545                             & RANGE_MASK];
    546     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    547                                               CONST_BITS+PASS1_BITS+3)
    548                             & RANGE_MASK];
    549     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    550                                               CONST_BITS+PASS1_BITS+3)
    551                             & RANGE_MASK];
    552     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
    553                                               CONST_BITS+PASS1_BITS+3)
    554                             & RANGE_MASK];
    555     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
    556                                               CONST_BITS+PASS1_BITS+3)
    557                             & RANGE_MASK];
    558     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
    559                                               CONST_BITS+PASS1_BITS+3)
    560                             & RANGE_MASK];
    561 
    562     wsptr += 7;         /* advance pointer to next row */
    563   }
    564 }
    565 
    566 
    567 /*
    568  * Perform dequantization and inverse DCT on one block of coefficients,
    569  * producing a reduced-size 6x6 output block.
    570  *
    571  * Optimized algorithm with 3 multiplications in the 1-D kernel.
    572  * cK represents sqrt(2) * cos(K*pi/12).
    573  */
    574 
    575 GLOBAL(void)
    576 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
    577                JCOEFPTR coef_block,
    578                JSAMPARRAY output_buf, JDIMENSION output_col)
    579 {
    580   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
    581   JLONG z1, z2, z3;
    582   JCOEFPTR inptr;
    583   ISLOW_MULT_TYPE *quantptr;
    584   int *wsptr;
    585   JSAMPROW outptr;
    586   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    587   int ctr;
    588   int workspace[6*6];   /* buffers data between passes */
    589   SHIFT_TEMPS
    590 
    591   /* Pass 1: process columns from input, store into work array. */
    592 
    593   inptr = coef_block;
    594   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    595   wsptr = workspace;
    596   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
    597     /* Even part */
    598 
    599     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    600     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    601     /* Add fudge factor here for final descale. */
    602     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
    603     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    604     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
    605     tmp1 = tmp0 + tmp10;
    606     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
    607     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    608     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
    609     tmp10 = tmp1 + tmp0;
    610     tmp12 = tmp1 - tmp0;
    611 
    612     /* Odd part */
    613 
    614     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    615     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    616     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    617     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
    618     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
    619     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
    620     tmp1 = LEFT_SHIFT(z1 - z2 - z3, PASS1_BITS);
    621 
    622     /* Final output stage */
    623 
    624     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    625     wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    626     wsptr[6*1] = (int) (tmp11 + tmp1);
    627     wsptr[6*4] = (int) (tmp11 - tmp1);
    628     wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
    629     wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
    630   }
    631 
    632   /* Pass 2: process 6 rows from work array, store into output array. */
    633 
    634   wsptr = workspace;
    635   for (ctr = 0; ctr < 6; ctr++) {
    636     outptr = output_buf[ctr] + output_col;
    637 
    638     /* Even part */
    639 
    640     /* Add fudge factor here for final descale. */
    641     tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
    642     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    643     tmp2 = (JLONG) wsptr[4];
    644     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
    645     tmp1 = tmp0 + tmp10;
    646     tmp11 = tmp0 - tmp10 - tmp10;
    647     tmp10 = (JLONG) wsptr[2];
    648     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
    649     tmp10 = tmp1 + tmp0;
    650     tmp12 = tmp1 - tmp0;
    651 
    652     /* Odd part */
    653 
    654     z1 = (JLONG) wsptr[1];
    655     z2 = (JLONG) wsptr[3];
    656     z3 = (JLONG) wsptr[5];
    657     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
    658     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
    659     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
    660     tmp1 = LEFT_SHIFT(z1 - z2 - z3, CONST_BITS);
    661 
    662     /* Final output stage */
    663 
    664     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    665                                               CONST_BITS+PASS1_BITS+3)
    666                             & RANGE_MASK];
    667     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    668                                               CONST_BITS+PASS1_BITS+3)
    669                             & RANGE_MASK];
    670     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    671                                               CONST_BITS+PASS1_BITS+3)
    672                             & RANGE_MASK];
    673     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    674                                               CONST_BITS+PASS1_BITS+3)
    675                             & RANGE_MASK];
    676     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
    677                                               CONST_BITS+PASS1_BITS+3)
    678                             & RANGE_MASK];
    679     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
    680                                               CONST_BITS+PASS1_BITS+3)
    681                             & RANGE_MASK];
    682 
    683     wsptr += 6;         /* advance pointer to next row */
    684   }
    685 }
    686 
    687 
    688 /*
    689  * Perform dequantization and inverse DCT on one block of coefficients,
    690  * producing a reduced-size 5x5 output block.
    691  *
    692  * Optimized algorithm with 5 multiplications in the 1-D kernel.
    693  * cK represents sqrt(2) * cos(K*pi/10).
    694  */
    695 
    696 GLOBAL(void)
    697 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
    698                JCOEFPTR coef_block,
    699                JSAMPARRAY output_buf, JDIMENSION output_col)
    700 {
    701   JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
    702   JLONG z1, z2, z3;
    703   JCOEFPTR inptr;
    704   ISLOW_MULT_TYPE *quantptr;
    705   int *wsptr;
    706   JSAMPROW outptr;
    707   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    708   int ctr;
    709   int workspace[5*5];   /* buffers data between passes */
    710   SHIFT_TEMPS
    711 
    712   /* Pass 1: process columns from input, store into work array. */
    713 
    714   inptr = coef_block;
    715   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    716   wsptr = workspace;
    717   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
    718     /* Even part */
    719 
    720     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    721     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
    722     /* Add fudge factor here for final descale. */
    723     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
    724     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    725     tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    726     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
    727     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
    728     z3 = tmp12 + z2;
    729     tmp10 = z3 + z1;
    730     tmp11 = z3 - z1;
    731     tmp12 -= LEFT_SHIFT(z2, 2);
    732 
    733     /* Odd part */
    734 
    735     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    736     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    737 
    738     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
    739     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
    740     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
    741 
    742     /* Final output stage */
    743 
    744     wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    745     wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    746     wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
    747     wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
    748     wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
    749   }
    750 
    751   /* Pass 2: process 5 rows from work array, store into output array. */
    752 
    753   wsptr = workspace;
    754   for (ctr = 0; ctr < 5; ctr++) {
    755     outptr = output_buf[ctr] + output_col;
    756 
    757     /* Even part */
    758 
    759     /* Add fudge factor here for final descale. */
    760     tmp12 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
    761     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
    762     tmp0 = (JLONG) wsptr[2];
    763     tmp1 = (JLONG) wsptr[4];
    764     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
    765     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
    766     z3 = tmp12 + z2;
    767     tmp10 = z3 + z1;
    768     tmp11 = z3 - z1;
    769     tmp12 -= LEFT_SHIFT(z2, 2);
    770 
    771     /* Odd part */
    772 
    773     z2 = (JLONG) wsptr[1];
    774     z3 = (JLONG) wsptr[3];
    775 
    776     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
    777     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
    778     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
    779 
    780     /* Final output stage */
    781 
    782     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    783                                               CONST_BITS+PASS1_BITS+3)
    784                             & RANGE_MASK];
    785     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    786                                               CONST_BITS+PASS1_BITS+3)
    787                             & RANGE_MASK];
    788     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    789                                               CONST_BITS+PASS1_BITS+3)
    790                             & RANGE_MASK];
    791     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    792                                               CONST_BITS+PASS1_BITS+3)
    793                             & RANGE_MASK];
    794     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
    795                                               CONST_BITS+PASS1_BITS+3)
    796                             & RANGE_MASK];
    797 
    798     wsptr += 5;         /* advance pointer to next row */
    799   }
    800 }
    801 
    802 
    803 /*
    804  * Perform dequantization and inverse DCT on one block of coefficients,
    805  * producing a reduced-size 3x3 output block.
    806  *
    807  * Optimized algorithm with 2 multiplications in the 1-D kernel.
    808  * cK represents sqrt(2) * cos(K*pi/6).
    809  */
    810 
    811 GLOBAL(void)
    812 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
    813                JCOEFPTR coef_block,
    814                JSAMPARRAY output_buf, JDIMENSION output_col)
    815 {
    816   JLONG tmp0, tmp2, tmp10, tmp12;
    817   JCOEFPTR inptr;
    818   ISLOW_MULT_TYPE *quantptr;
    819   int *wsptr;
    820   JSAMPROW outptr;
    821   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    822   int ctr;
    823   int workspace[3*3];   /* buffers data between passes */
    824   SHIFT_TEMPS
    825 
    826   /* Pass 1: process columns from input, store into work array. */
    827 
    828   inptr = coef_block;
    829   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    830   wsptr = workspace;
    831   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
    832     /* Even part */
    833 
    834     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    835     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    836     /* Add fudge factor here for final descale. */
    837     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
    838     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    839     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
    840     tmp10 = tmp0 + tmp12;
    841     tmp2 = tmp0 - tmp12 - tmp12;
    842 
    843     /* Odd part */
    844 
    845     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    846     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
    847 
    848     /* Final output stage */
    849 
    850     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    851     wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    852     wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
    853   }
    854 
    855   /* Pass 2: process 3 rows from work array, store into output array. */
    856 
    857   wsptr = workspace;
    858   for (ctr = 0; ctr < 3; ctr++) {
    859     outptr = output_buf[ctr] + output_col;
    860 
    861     /* Even part */
    862 
    863     /* Add fudge factor here for final descale. */
    864     tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
    865     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    866     tmp2 = (JLONG) wsptr[2];
    867     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
    868     tmp10 = tmp0 + tmp12;
    869     tmp2 = tmp0 - tmp12 - tmp12;
    870 
    871     /* Odd part */
    872 
    873     tmp12 = (JLONG) wsptr[1];
    874     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
    875 
    876     /* Final output stage */
    877 
    878     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    879                                               CONST_BITS+PASS1_BITS+3)
    880                             & RANGE_MASK];
    881     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    882                                               CONST_BITS+PASS1_BITS+3)
    883                             & RANGE_MASK];
    884     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
    885                                               CONST_BITS+PASS1_BITS+3)
    886                             & RANGE_MASK];
    887 
    888     wsptr += 3;         /* advance pointer to next row */
    889   }
    890 }
    891 
    892 
    893 /*
    894  * Perform dequantization and inverse DCT on one block of coefficients,
    895  * producing a 9x9 output block.
    896  *
    897  * Optimized algorithm with 10 multiplications in the 1-D kernel.
    898  * cK represents sqrt(2) * cos(K*pi/18).
    899  */
    900 
    901 GLOBAL(void)
    902 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
    903                JCOEFPTR coef_block,
    904                JSAMPARRAY output_buf, JDIMENSION output_col)
    905 {
    906   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
    907   JLONG z1, z2, z3, z4;
    908   JCOEFPTR inptr;
    909   ISLOW_MULT_TYPE *quantptr;
    910   int *wsptr;
    911   JSAMPROW outptr;
    912   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    913   int ctr;
    914   int workspace[8*9];   /* buffers data between passes */
    915   SHIFT_TEMPS
    916 
    917   /* Pass 1: process columns from input, store into work array. */
    918 
    919   inptr = coef_block;
    920   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    921   wsptr = workspace;
    922   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
    923     /* Even part */
    924 
    925     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    926     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    927     /* Add fudge factor here for final descale. */
    928     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
    929 
    930     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    931     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    932     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
    933 
    934     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
    935     tmp1 = tmp0 + tmp3;
    936     tmp2 = tmp0 - tmp3 - tmp3;
    937 
    938     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
    939     tmp11 = tmp2 + tmp0;
    940     tmp14 = tmp2 - tmp0 - tmp0;
    941 
    942     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
    943     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
    944     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
    945 
    946     tmp10 = tmp1 + tmp0 - tmp3;
    947     tmp12 = tmp1 - tmp0 + tmp2;
    948     tmp13 = tmp1 - tmp2 + tmp3;
    949 
    950     /* Odd part */
    951 
    952     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    953     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    954     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    955     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
    956 
    957     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
    958 
    959     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
    960     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
    961     tmp0 = tmp2 + tmp3 - z2;
    962     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
    963     tmp2 += z2 - tmp1;
    964     tmp3 += z2 + tmp1;
    965     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
    966 
    967     /* Final output stage */
    968 
    969     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    970     wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    971     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
    972     wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
    973     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
    974     wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
    975     wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
    976     wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
    977     wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
    978   }
    979 
    980   /* Pass 2: process 9 rows from work array, store into output array. */
    981 
    982   wsptr = workspace;
    983   for (ctr = 0; ctr < 9; ctr++) {
    984     outptr = output_buf[ctr] + output_col;
    985 
    986     /* Even part */
    987 
    988     /* Add fudge factor here for final descale. */
    989     tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
    990     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    991 
    992     z1 = (JLONG) wsptr[2];
    993     z2 = (JLONG) wsptr[4];
    994     z3 = (JLONG) wsptr[6];
    995 
    996     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
    997     tmp1 = tmp0 + tmp3;
    998     tmp2 = tmp0 - tmp3 - tmp3;
    999 
   1000     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
   1001     tmp11 = tmp2 + tmp0;
   1002     tmp14 = tmp2 - tmp0 - tmp0;
   1003 
   1004     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
   1005     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
   1006     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
   1007 
   1008     tmp10 = tmp1 + tmp0 - tmp3;
   1009     tmp12 = tmp1 - tmp0 + tmp2;
   1010     tmp13 = tmp1 - tmp2 + tmp3;
   1011 
   1012     /* Odd part */
   1013 
   1014     z1 = (JLONG) wsptr[1];
   1015     z2 = (JLONG) wsptr[3];
   1016     z3 = (JLONG) wsptr[5];
   1017     z4 = (JLONG) wsptr[7];
   1018 
   1019     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
   1020 
   1021     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
   1022     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
   1023     tmp0 = tmp2 + tmp3 - z2;
   1024     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
   1025     tmp2 += z2 - tmp1;
   1026     tmp3 += z2 + tmp1;
   1027     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
   1028 
   1029     /* Final output stage */
   1030 
   1031     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   1032                                               CONST_BITS+PASS1_BITS+3)
   1033                             & RANGE_MASK];
   1034     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   1035                                               CONST_BITS+PASS1_BITS+3)
   1036                             & RANGE_MASK];
   1037     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
   1038                                               CONST_BITS+PASS1_BITS+3)
   1039                             & RANGE_MASK];
   1040     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
   1041                                               CONST_BITS+PASS1_BITS+3)
   1042                             & RANGE_MASK];
   1043     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
   1044                                               CONST_BITS+PASS1_BITS+3)
   1045                             & RANGE_MASK];
   1046     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
   1047                                               CONST_BITS+PASS1_BITS+3)
   1048                             & RANGE_MASK];
   1049     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
   1050                                               CONST_BITS+PASS1_BITS+3)
   1051                             & RANGE_MASK];
   1052     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
   1053                                               CONST_BITS+PASS1_BITS+3)
   1054                             & RANGE_MASK];
   1055     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
   1056                                               CONST_BITS+PASS1_BITS+3)
   1057                             & RANGE_MASK];
   1058 
   1059     wsptr += 8;         /* advance pointer to next row */
   1060   }
   1061 }
   1062 
   1063 
   1064 /*
   1065  * Perform dequantization and inverse DCT on one block of coefficients,
   1066  * producing a 10x10 output block.
   1067  *
   1068  * Optimized algorithm with 12 multiplications in the 1-D kernel.
   1069  * cK represents sqrt(2) * cos(K*pi/20).
   1070  */
   1071 
   1072 GLOBAL(void)
   1073 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1074                  JCOEFPTR coef_block,
   1075                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1076 {
   1077   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   1078   JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
   1079   JLONG z1, z2, z3, z4, z5;
   1080   JCOEFPTR inptr;
   1081   ISLOW_MULT_TYPE *quantptr;
   1082   int *wsptr;
   1083   JSAMPROW outptr;
   1084   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1085   int ctr;
   1086   int workspace[8*10];  /* buffers data between passes */
   1087   SHIFT_TEMPS
   1088 
   1089   /* Pass 1: process columns from input, store into work array. */
   1090 
   1091   inptr = coef_block;
   1092   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1093   wsptr = workspace;
   1094   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1095     /* Even part */
   1096 
   1097     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1098     z3 = LEFT_SHIFT(z3, CONST_BITS);
   1099     /* Add fudge factor here for final descale. */
   1100     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
   1101     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1102     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   1103     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   1104     tmp10 = z3 + z1;
   1105     tmp11 = z3 - z2;
   1106 
   1107     tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
   1108                         CONST_BITS-PASS1_BITS);  /* c0 = (c4-c8)*2 */
   1109 
   1110     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1111     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1112 
   1113     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   1114     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   1115     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   1116 
   1117     tmp20 = tmp10 + tmp12;
   1118     tmp24 = tmp10 - tmp12;
   1119     tmp21 = tmp11 + tmp13;
   1120     tmp23 = tmp11 - tmp13;
   1121 
   1122     /* Odd part */
   1123 
   1124     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1125     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1126     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1127     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1128 
   1129     tmp11 = z2 + z4;
   1130     tmp13 = z2 - z4;
   1131 
   1132     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
   1133     z5 = LEFT_SHIFT(z3, CONST_BITS);
   1134 
   1135     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   1136     z4 = z5 + tmp12;
   1137 
   1138     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   1139     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   1140 
   1141     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   1142     z4 = z5 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1);
   1143 
   1144     tmp12 = LEFT_SHIFT(z1 - tmp13 - z3, PASS1_BITS);
   1145 
   1146     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   1147     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   1148 
   1149     /* Final output stage */
   1150 
   1151     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1152     wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1153     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1154     wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1155     wsptr[8*2] = (int) (tmp22 + tmp12);
   1156     wsptr[8*7] = (int) (tmp22 - tmp12);
   1157     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1158     wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1159     wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1160     wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1161   }
   1162 
   1163   /* Pass 2: process 10 rows from work array, store into output array. */
   1164 
   1165   wsptr = workspace;
   1166   for (ctr = 0; ctr < 10; ctr++) {
   1167     outptr = output_buf[ctr] + output_col;
   1168 
   1169     /* Even part */
   1170 
   1171     /* Add fudge factor here for final descale. */
   1172     z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
   1173     z3 = LEFT_SHIFT(z3, CONST_BITS);
   1174     z4 = (JLONG) wsptr[4];
   1175     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   1176     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   1177     tmp10 = z3 + z1;
   1178     tmp11 = z3 - z2;
   1179 
   1180     tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1);         /* c0 = (c4-c8)*2 */
   1181 
   1182     z2 = (JLONG) wsptr[2];
   1183     z3 = (JLONG) wsptr[6];
   1184 
   1185     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   1186     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   1187     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   1188 
   1189     tmp20 = tmp10 + tmp12;
   1190     tmp24 = tmp10 - tmp12;
   1191     tmp21 = tmp11 + tmp13;
   1192     tmp23 = tmp11 - tmp13;
   1193 
   1194     /* Odd part */
   1195 
   1196     z1 = (JLONG) wsptr[1];
   1197     z2 = (JLONG) wsptr[3];
   1198     z3 = (JLONG) wsptr[5];
   1199     z3 = LEFT_SHIFT(z3, CONST_BITS);
   1200     z4 = (JLONG) wsptr[7];
   1201 
   1202     tmp11 = z2 + z4;
   1203     tmp13 = z2 - z4;
   1204 
   1205     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
   1206 
   1207     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   1208     z4 = z3 + tmp12;
   1209 
   1210     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   1211     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   1212 
   1213     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   1214     z4 = z3 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1);
   1215 
   1216     tmp12 = LEFT_SHIFT(z1 - tmp13, CONST_BITS) - z3;
   1217 
   1218     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   1219     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   1220 
   1221     /* Final output stage */
   1222 
   1223     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1224                                               CONST_BITS+PASS1_BITS+3)
   1225                             & RANGE_MASK];
   1226     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1227                                               CONST_BITS+PASS1_BITS+3)
   1228                             & RANGE_MASK];
   1229     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1230                                               CONST_BITS+PASS1_BITS+3)
   1231                             & RANGE_MASK];
   1232     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1233                                               CONST_BITS+PASS1_BITS+3)
   1234                             & RANGE_MASK];
   1235     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1236                                               CONST_BITS+PASS1_BITS+3)
   1237                             & RANGE_MASK];
   1238     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1239                                               CONST_BITS+PASS1_BITS+3)
   1240                             & RANGE_MASK];
   1241     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1242                                               CONST_BITS+PASS1_BITS+3)
   1243                             & RANGE_MASK];
   1244     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1245                                               CONST_BITS+PASS1_BITS+3)
   1246                             & RANGE_MASK];
   1247     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1248                                               CONST_BITS+PASS1_BITS+3)
   1249                             & RANGE_MASK];
   1250     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1251                                               CONST_BITS+PASS1_BITS+3)
   1252                             & RANGE_MASK];
   1253 
   1254     wsptr += 8;         /* advance pointer to next row */
   1255   }
   1256 }
   1257 
   1258 
   1259 /*
   1260  * Perform dequantization and inverse DCT on one block of coefficients,
   1261  * producing a 11x11 output block.
   1262  *
   1263  * Optimized algorithm with 24 multiplications in the 1-D kernel.
   1264  * cK represents sqrt(2) * cos(K*pi/22).
   1265  */
   1266 
   1267 GLOBAL(void)
   1268 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1269                  JCOEFPTR coef_block,
   1270                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1271 {
   1272   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   1273   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   1274   JLONG z1, z2, z3, z4;
   1275   JCOEFPTR inptr;
   1276   ISLOW_MULT_TYPE *quantptr;
   1277   int *wsptr;
   1278   JSAMPROW outptr;
   1279   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1280   int ctr;
   1281   int workspace[8*11];  /* buffers data between passes */
   1282   SHIFT_TEMPS
   1283 
   1284   /* Pass 1: process columns from input, store into work array. */
   1285 
   1286   inptr = coef_block;
   1287   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1288   wsptr = workspace;
   1289   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1290     /* Even part */
   1291 
   1292     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1293     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
   1294     /* Add fudge factor here for final descale. */
   1295     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
   1296 
   1297     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1298     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1299     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1300 
   1301     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
   1302     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
   1303     z4 = z1 + z3;
   1304     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
   1305     z4 -= z2;
   1306     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
   1307     tmp21 = tmp20 + tmp23 + tmp25 -
   1308             MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
   1309     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
   1310     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
   1311     tmp24 += tmp25;
   1312     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
   1313     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
   1314              MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
   1315     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
   1316 
   1317     /* Odd part */
   1318 
   1319     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1320     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1321     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1322     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1323 
   1324     tmp11 = z1 + z2;
   1325     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
   1326     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
   1327     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
   1328     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
   1329     tmp10 = tmp11 + tmp12 + tmp13 -
   1330             MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
   1331     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
   1332     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
   1333     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
   1334     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
   1335     tmp11 += z1;
   1336     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
   1337     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
   1338              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
   1339              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
   1340 
   1341     /* Final output stage */
   1342 
   1343     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1344     wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1345     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1346     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1347     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1348     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1349     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1350     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1351     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1352     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1353     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
   1354   }
   1355 
   1356   /* Pass 2: process 11 rows from work array, store into output array. */
   1357 
   1358   wsptr = workspace;
   1359   for (ctr = 0; ctr < 11; ctr++) {
   1360     outptr = output_buf[ctr] + output_col;
   1361 
   1362     /* Even part */
   1363 
   1364     /* Add fudge factor here for final descale. */
   1365     tmp10 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
   1366     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
   1367 
   1368     z1 = (JLONG) wsptr[2];
   1369     z2 = (JLONG) wsptr[4];
   1370     z3 = (JLONG) wsptr[6];
   1371 
   1372     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
   1373     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
   1374     z4 = z1 + z3;
   1375     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
   1376     z4 -= z2;
   1377     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
   1378     tmp21 = tmp20 + tmp23 + tmp25 -
   1379             MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
   1380     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
   1381     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
   1382     tmp24 += tmp25;
   1383     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
   1384     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
   1385              MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
   1386     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
   1387 
   1388     /* Odd part */
   1389 
   1390     z1 = (JLONG) wsptr[1];
   1391     z2 = (JLONG) wsptr[3];
   1392     z3 = (JLONG) wsptr[5];
   1393     z4 = (JLONG) wsptr[7];
   1394 
   1395     tmp11 = z1 + z2;
   1396     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
   1397     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
   1398     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
   1399     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
   1400     tmp10 = tmp11 + tmp12 + tmp13 -
   1401             MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
   1402     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
   1403     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
   1404     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
   1405     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
   1406     tmp11 += z1;
   1407     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
   1408     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
   1409              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
   1410              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
   1411 
   1412     /* Final output stage */
   1413 
   1414     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1415                                                CONST_BITS+PASS1_BITS+3)
   1416                              & RANGE_MASK];
   1417     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1418                                                CONST_BITS+PASS1_BITS+3)
   1419                              & RANGE_MASK];
   1420     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1421                                                CONST_BITS+PASS1_BITS+3)
   1422                              & RANGE_MASK];
   1423     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1424                                                CONST_BITS+PASS1_BITS+3)
   1425                              & RANGE_MASK];
   1426     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1427                                                CONST_BITS+PASS1_BITS+3)
   1428                              & RANGE_MASK];
   1429     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1430                                                CONST_BITS+PASS1_BITS+3)
   1431                              & RANGE_MASK];
   1432     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1433                                                CONST_BITS+PASS1_BITS+3)
   1434                              & RANGE_MASK];
   1435     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1436                                                CONST_BITS+PASS1_BITS+3)
   1437                              & RANGE_MASK];
   1438     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1439                                                CONST_BITS+PASS1_BITS+3)
   1440                              & RANGE_MASK];
   1441     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1442                                                CONST_BITS+PASS1_BITS+3)
   1443                              & RANGE_MASK];
   1444     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
   1445                                                CONST_BITS+PASS1_BITS+3)
   1446                              & RANGE_MASK];
   1447 
   1448     wsptr += 8;         /* advance pointer to next row */
   1449   }
   1450 }
   1451 
   1452 
   1453 /*
   1454  * Perform dequantization and inverse DCT on one block of coefficients,
   1455  * producing a 12x12 output block.
   1456  *
   1457  * Optimized algorithm with 15 multiplications in the 1-D kernel.
   1458  * cK represents sqrt(2) * cos(K*pi/24).
   1459  */
   1460 
   1461 GLOBAL(void)
   1462 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1463                  JCOEFPTR coef_block,
   1464                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1465 {
   1466   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   1467   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   1468   JLONG z1, z2, z3, z4;
   1469   JCOEFPTR inptr;
   1470   ISLOW_MULT_TYPE *quantptr;
   1471   int *wsptr;
   1472   JSAMPROW outptr;
   1473   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1474   int ctr;
   1475   int workspace[8*12];  /* buffers data between passes */
   1476   SHIFT_TEMPS
   1477 
   1478   /* Pass 1: process columns from input, store into work array. */
   1479 
   1480   inptr = coef_block;
   1481   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1482   wsptr = workspace;
   1483   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1484     /* Even part */
   1485 
   1486     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1487     z3 = LEFT_SHIFT(z3, CONST_BITS);
   1488     /* Add fudge factor here for final descale. */
   1489     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
   1490 
   1491     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1492     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   1493 
   1494     tmp10 = z3 + z4;
   1495     tmp11 = z3 - z4;
   1496 
   1497     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1498     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   1499     z1 = LEFT_SHIFT(z1, CONST_BITS);
   1500     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1501     z2 = LEFT_SHIFT(z2, CONST_BITS);
   1502 
   1503     tmp12 = z1 - z2;
   1504 
   1505     tmp21 = z3 + tmp12;
   1506     tmp24 = z3 - tmp12;
   1507 
   1508     tmp12 = z4 + z2;
   1509 
   1510     tmp20 = tmp10 + tmp12;
   1511     tmp25 = tmp10 - tmp12;
   1512 
   1513     tmp12 = z4 - z1 - z2;
   1514 
   1515     tmp22 = tmp11 + tmp12;
   1516     tmp23 = tmp11 - tmp12;
   1517 
   1518     /* Odd part */
   1519 
   1520     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1521     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1522     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1523     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1524 
   1525     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   1526     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
   1527 
   1528     tmp10 = z1 + z3;
   1529     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   1530     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   1531     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   1532     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
   1533     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   1534     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   1535     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   1536              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   1537 
   1538     z1 -= z4;
   1539     z2 -= z3;
   1540     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   1541     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   1542     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   1543 
   1544     /* Final output stage */
   1545 
   1546     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1547     wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1548     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1549     wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1550     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1551     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1552     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1553     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1554     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1555     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1556     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   1557     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   1558   }
   1559 
   1560   /* Pass 2: process 12 rows from work array, store into output array. */
   1561 
   1562   wsptr = workspace;
   1563   for (ctr = 0; ctr < 12; ctr++) {
   1564     outptr = output_buf[ctr] + output_col;
   1565 
   1566     /* Even part */
   1567 
   1568     /* Add fudge factor here for final descale. */
   1569     z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
   1570     z3 = LEFT_SHIFT(z3, CONST_BITS);
   1571 
   1572     z4 = (JLONG) wsptr[4];
   1573     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   1574 
   1575     tmp10 = z3 + z4;
   1576     tmp11 = z3 - z4;
   1577 
   1578     z1 = (JLONG) wsptr[2];
   1579     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   1580     z1 = LEFT_SHIFT(z1, CONST_BITS);
   1581     z2 = (JLONG) wsptr[6];
   1582     z2 = LEFT_SHIFT(z2, CONST_BITS);
   1583 
   1584     tmp12 = z1 - z2;
   1585 
   1586     tmp21 = z3 + tmp12;
   1587     tmp24 = z3 - tmp12;
   1588 
   1589     tmp12 = z4 + z2;
   1590 
   1591     tmp20 = tmp10 + tmp12;
   1592     tmp25 = tmp10 - tmp12;
   1593 
   1594     tmp12 = z4 - z1 - z2;
   1595 
   1596     tmp22 = tmp11 + tmp12;
   1597     tmp23 = tmp11 - tmp12;
   1598 
   1599     /* Odd part */
   1600 
   1601     z1 = (JLONG) wsptr[1];
   1602     z2 = (JLONG) wsptr[3];
   1603     z3 = (JLONG) wsptr[5];
   1604     z4 = (JLONG) wsptr[7];
   1605 
   1606     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   1607     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
   1608 
   1609     tmp10 = z1 + z3;
   1610     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   1611     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   1612     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   1613     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
   1614     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   1615     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   1616     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   1617              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   1618 
   1619     z1 -= z4;
   1620     z2 -= z3;
   1621     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   1622     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   1623     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   1624 
   1625     /* Final output stage */
   1626 
   1627     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1628                                                CONST_BITS+PASS1_BITS+3)
   1629                              & RANGE_MASK];
   1630     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1631                                                CONST_BITS+PASS1_BITS+3)
   1632                              & RANGE_MASK];
   1633     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1634                                                CONST_BITS+PASS1_BITS+3)
   1635                              & RANGE_MASK];
   1636     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1637                                                CONST_BITS+PASS1_BITS+3)
   1638                              & RANGE_MASK];
   1639     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1640                                                CONST_BITS+PASS1_BITS+3)
   1641                              & RANGE_MASK];
   1642     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1643                                                CONST_BITS+PASS1_BITS+3)
   1644                              & RANGE_MASK];
   1645     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1646                                                CONST_BITS+PASS1_BITS+3)
   1647                              & RANGE_MASK];
   1648     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1649                                                CONST_BITS+PASS1_BITS+3)
   1650                              & RANGE_MASK];
   1651     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1652                                                CONST_BITS+PASS1_BITS+3)
   1653                              & RANGE_MASK];
   1654     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1655                                                CONST_BITS+PASS1_BITS+3)
   1656                              & RANGE_MASK];
   1657     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   1658                                                CONST_BITS+PASS1_BITS+3)
   1659                              & RANGE_MASK];
   1660     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   1661                                                CONST_BITS+PASS1_BITS+3)
   1662                              & RANGE_MASK];
   1663 
   1664     wsptr += 8;         /* advance pointer to next row */
   1665   }
   1666 }
   1667 
   1668 
   1669 /*
   1670  * Perform dequantization and inverse DCT on one block of coefficients,
   1671  * producing a 13x13 output block.
   1672  *
   1673  * Optimized algorithm with 29 multiplications in the 1-D kernel.
   1674  * cK represents sqrt(2) * cos(K*pi/26).
   1675  */
   1676 
   1677 GLOBAL(void)
   1678 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1679                  JCOEFPTR coef_block,
   1680                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1681 {
   1682   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   1683   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   1684   JLONG z1, z2, z3, z4;
   1685   JCOEFPTR inptr;
   1686   ISLOW_MULT_TYPE *quantptr;
   1687   int *wsptr;
   1688   JSAMPROW outptr;
   1689   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1690   int ctr;
   1691   int workspace[8*13];  /* buffers data between passes */
   1692   SHIFT_TEMPS
   1693 
   1694   /* Pass 1: process columns from input, store into work array. */
   1695 
   1696   inptr = coef_block;
   1697   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1698   wsptr = workspace;
   1699   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1700     /* Even part */
   1701 
   1702     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1703     z1 = LEFT_SHIFT(z1, CONST_BITS);
   1704     /* Add fudge factor here for final descale. */
   1705     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   1706 
   1707     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1708     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1709     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1710 
   1711     tmp10 = z3 + z4;
   1712     tmp11 = z3 - z4;
   1713 
   1714     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
   1715     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
   1716 
   1717     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
   1718     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
   1719 
   1720     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
   1721     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
   1722 
   1723     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
   1724     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
   1725 
   1726     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
   1727     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
   1728 
   1729     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
   1730     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
   1731 
   1732     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
   1733 
   1734     /* Odd part */
   1735 
   1736     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1737     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1738     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1739     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1740 
   1741     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
   1742     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
   1743     tmp15 = z1 + z4;
   1744     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
   1745     tmp10 = tmp11 + tmp12 + tmp13 -
   1746             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
   1747     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
   1748     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
   1749     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
   1750     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
   1751     tmp11 += tmp14;
   1752     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
   1753     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
   1754     tmp12 += tmp14;
   1755     tmp13 += tmp14;
   1756     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
   1757     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
   1758             MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
   1759     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
   1760     tmp14 += z1;
   1761     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
   1762              MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
   1763 
   1764     /* Final output stage */
   1765 
   1766     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1767     wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1768     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1769     wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1770     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1771     wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1772     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1773     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1774     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1775     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1776     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   1777     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   1778     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
   1779   }
   1780 
   1781   /* Pass 2: process 13 rows from work array, store into output array. */
   1782 
   1783   wsptr = workspace;
   1784   for (ctr = 0; ctr < 13; ctr++) {
   1785     outptr = output_buf[ctr] + output_col;
   1786 
   1787     /* Even part */
   1788 
   1789     /* Add fudge factor here for final descale. */
   1790     z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
   1791     z1 = LEFT_SHIFT(z1, CONST_BITS);
   1792 
   1793     z2 = (JLONG) wsptr[2];
   1794     z3 = (JLONG) wsptr[4];
   1795     z4 = (JLONG) wsptr[6];
   1796 
   1797     tmp10 = z3 + z4;
   1798     tmp11 = z3 - z4;
   1799 
   1800     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
   1801     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
   1802 
   1803     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
   1804     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
   1805 
   1806     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
   1807     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
   1808 
   1809     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
   1810     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
   1811 
   1812     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
   1813     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
   1814 
   1815     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
   1816     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
   1817 
   1818     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
   1819 
   1820     /* Odd part */
   1821 
   1822     z1 = (JLONG) wsptr[1];
   1823     z2 = (JLONG) wsptr[3];
   1824     z3 = (JLONG) wsptr[5];
   1825     z4 = (JLONG) wsptr[7];
   1826 
   1827     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
   1828     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
   1829     tmp15 = z1 + z4;
   1830     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
   1831     tmp10 = tmp11 + tmp12 + tmp13 -
   1832             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
   1833     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
   1834     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
   1835     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
   1836     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
   1837     tmp11 += tmp14;
   1838     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
   1839     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
   1840     tmp12 += tmp14;
   1841     tmp13 += tmp14;
   1842     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
   1843     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
   1844             MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
   1845     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
   1846     tmp14 += z1;
   1847     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
   1848              MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
   1849 
   1850     /* Final output stage */
   1851 
   1852     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1853                                                CONST_BITS+PASS1_BITS+3)
   1854                              & RANGE_MASK];
   1855     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1856                                                CONST_BITS+PASS1_BITS+3)
   1857                              & RANGE_MASK];
   1858     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1859                                                CONST_BITS+PASS1_BITS+3)
   1860                              & RANGE_MASK];
   1861     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1862                                                CONST_BITS+PASS1_BITS+3)
   1863                              & RANGE_MASK];
   1864     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1865                                                CONST_BITS+PASS1_BITS+3)
   1866                              & RANGE_MASK];
   1867     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1868                                                CONST_BITS+PASS1_BITS+3)
   1869                              & RANGE_MASK];
   1870     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1871                                                CONST_BITS+PASS1_BITS+3)
   1872                              & RANGE_MASK];
   1873     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1874                                                CONST_BITS+PASS1_BITS+3)
   1875                              & RANGE_MASK];
   1876     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1877                                                CONST_BITS+PASS1_BITS+3)
   1878                              & RANGE_MASK];
   1879     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1880                                                CONST_BITS+PASS1_BITS+3)
   1881                              & RANGE_MASK];
   1882     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   1883                                                CONST_BITS+PASS1_BITS+3)
   1884                              & RANGE_MASK];
   1885     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   1886                                                CONST_BITS+PASS1_BITS+3)
   1887                              & RANGE_MASK];
   1888     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
   1889                                                CONST_BITS+PASS1_BITS+3)
   1890                              & RANGE_MASK];
   1891 
   1892     wsptr += 8;         /* advance pointer to next row */
   1893   }
   1894 }
   1895 
   1896 
   1897 /*
   1898  * Perform dequantization and inverse DCT on one block of coefficients,
   1899  * producing a 14x14 output block.
   1900  *
   1901  * Optimized algorithm with 20 multiplications in the 1-D kernel.
   1902  * cK represents sqrt(2) * cos(K*pi/28).
   1903  */
   1904 
   1905 GLOBAL(void)
   1906 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1907                  JCOEFPTR coef_block,
   1908                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1909 {
   1910   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   1911   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   1912   JLONG z1, z2, z3, z4;
   1913   JCOEFPTR inptr;
   1914   ISLOW_MULT_TYPE *quantptr;
   1915   int *wsptr;
   1916   JSAMPROW outptr;
   1917   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1918   int ctr;
   1919   int workspace[8*14];  /* buffers data between passes */
   1920   SHIFT_TEMPS
   1921 
   1922   /* Pass 1: process columns from input, store into work array. */
   1923 
   1924   inptr = coef_block;
   1925   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1926   wsptr = workspace;
   1927   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1928     /* Even part */
   1929 
   1930     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1931     z1 = LEFT_SHIFT(z1, CONST_BITS);
   1932     /* Add fudge factor here for final descale. */
   1933     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   1934     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1935     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   1936     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   1937     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   1938 
   1939     tmp10 = z1 + z2;
   1940     tmp11 = z1 + z3;
   1941     tmp12 = z1 - z4;
   1942 
   1943     tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
   1944                         CONST_BITS-PASS1_BITS);  /* c0 = (c4+c12-c8)*2 */
   1945 
   1946     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1947     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1948 
   1949     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   1950 
   1951     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   1952     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   1953     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   1954             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   1955 
   1956     tmp20 = tmp10 + tmp13;
   1957     tmp26 = tmp10 - tmp13;
   1958     tmp21 = tmp11 + tmp14;
   1959     tmp25 = tmp11 - tmp14;
   1960     tmp22 = tmp12 + tmp15;
   1961     tmp24 = tmp12 - tmp15;
   1962 
   1963     /* Odd part */
   1964 
   1965     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1966     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1967     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1968     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1969     tmp13 = LEFT_SHIFT(z4, CONST_BITS);
   1970 
   1971     tmp14 = z1 + z3;
   1972     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   1973     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   1974     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   1975     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   1976     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
   1977     z1    -= z2;
   1978     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
   1979     tmp16 += tmp15;
   1980     z1    += z4;
   1981     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
   1982     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
   1983     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
   1984     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   1985     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   1986     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
   1987 
   1988     tmp13 = LEFT_SHIFT(z1 - z3, PASS1_BITS);
   1989 
   1990     /* Final output stage */
   1991 
   1992     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1993     wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1994     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1995     wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1996     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1997     wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1998     wsptr[8*3]  = (int) (tmp23 + tmp13);
   1999     wsptr[8*10] = (int) (tmp23 - tmp13);
   2000     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   2001     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   2002     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   2003     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   2004     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
   2005     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
   2006   }
   2007 
   2008   /* Pass 2: process 14 rows from work array, store into output array. */
   2009 
   2010   wsptr = workspace;
   2011   for (ctr = 0; ctr < 14; ctr++) {
   2012     outptr = output_buf[ctr] + output_col;
   2013 
   2014     /* Even part */
   2015 
   2016     /* Add fudge factor here for final descale. */
   2017     z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
   2018     z1 = LEFT_SHIFT(z1, CONST_BITS);
   2019     z4 = (JLONG) wsptr[4];
   2020     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   2021     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   2022     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   2023 
   2024     tmp10 = z1 + z2;
   2025     tmp11 = z1 + z3;
   2026     tmp12 = z1 - z4;
   2027 
   2028     tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1);    /* c0 = (c4+c12-c8)*2 */
   2029 
   2030     z1 = (JLONG) wsptr[2];
   2031     z2 = (JLONG) wsptr[6];
   2032 
   2033     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   2034 
   2035     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   2036     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   2037     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   2038             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   2039 
   2040     tmp20 = tmp10 + tmp13;
   2041     tmp26 = tmp10 - tmp13;
   2042     tmp21 = tmp11 + tmp14;
   2043     tmp25 = tmp11 - tmp14;
   2044     tmp22 = tmp12 + tmp15;
   2045     tmp24 = tmp12 - tmp15;
   2046 
   2047     /* Odd part */
   2048 
   2049     z1 = (JLONG) wsptr[1];
   2050     z2 = (JLONG) wsptr[3];
   2051     z3 = (JLONG) wsptr[5];
   2052     z4 = (JLONG) wsptr[7];
   2053     z4 = LEFT_SHIFT(z4, CONST_BITS);
   2054 
   2055     tmp14 = z1 + z3;
   2056     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   2057     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   2058     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   2059     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   2060     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
   2061     z1    -= z2;
   2062     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
   2063     tmp16 += tmp15;
   2064     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
   2065     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
   2066     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
   2067     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   2068     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   2069     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
   2070 
   2071     tmp13 = LEFT_SHIFT(z1 - z3, CONST_BITS) + z4;
   2072 
   2073     /* Final output stage */
   2074 
   2075     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   2076                                                CONST_BITS+PASS1_BITS+3)
   2077                              & RANGE_MASK];
   2078     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   2079                                                CONST_BITS+PASS1_BITS+3)
   2080                              & RANGE_MASK];
   2081     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   2082                                                CONST_BITS+PASS1_BITS+3)
   2083                              & RANGE_MASK];
   2084     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   2085                                                CONST_BITS+PASS1_BITS+3)
   2086                              & RANGE_MASK];
   2087     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   2088                                                CONST_BITS+PASS1_BITS+3)
   2089                              & RANGE_MASK];
   2090     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   2091                                                CONST_BITS+PASS1_BITS+3)
   2092                              & RANGE_MASK];
   2093     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   2094                                                CONST_BITS+PASS1_BITS+3)
   2095                              & RANGE_MASK];
   2096     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   2097                                                CONST_BITS+PASS1_BITS+3)
   2098                              & RANGE_MASK];
   2099     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   2100                                                CONST_BITS+PASS1_BITS+3)
   2101                              & RANGE_MASK];
   2102     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   2103                                                CONST_BITS+PASS1_BITS+3)
   2104                              & RANGE_MASK];
   2105     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   2106                                                CONST_BITS+PASS1_BITS+3)
   2107                              & RANGE_MASK];
   2108     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   2109                                                CONST_BITS+PASS1_BITS+3)
   2110                              & RANGE_MASK];
   2111     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
   2112                                                CONST_BITS+PASS1_BITS+3)
   2113                              & RANGE_MASK];
   2114     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
   2115                                                CONST_BITS+PASS1_BITS+3)
   2116                              & RANGE_MASK];
   2117 
   2118     wsptr += 8;         /* advance pointer to next row */
   2119   }
   2120 }
   2121 
   2122 
   2123 /*
   2124  * Perform dequantization and inverse DCT on one block of coefficients,
   2125  * producing a 15x15 output block.
   2126  *
   2127  * Optimized algorithm with 22 multiplications in the 1-D kernel.
   2128  * cK represents sqrt(2) * cos(K*pi/30).
   2129  */
   2130 
   2131 GLOBAL(void)
   2132 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   2133                  JCOEFPTR coef_block,
   2134                  JSAMPARRAY output_buf, JDIMENSION output_col)
   2135 {
   2136   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   2137   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2138   JLONG z1, z2, z3, z4;
   2139   JCOEFPTR inptr;
   2140   ISLOW_MULT_TYPE *quantptr;
   2141   int *wsptr;
   2142   JSAMPROW outptr;
   2143   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2144   int ctr;
   2145   int workspace[8*15];  /* buffers data between passes */
   2146   SHIFT_TEMPS
   2147 
   2148   /* Pass 1: process columns from input, store into work array. */
   2149 
   2150   inptr = coef_block;
   2151   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   2152   wsptr = workspace;
   2153   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2154     /* Even part */
   2155 
   2156     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   2157     z1 = LEFT_SHIFT(z1, CONST_BITS);
   2158     /* Add fudge factor here for final descale. */
   2159     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   2160 
   2161     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   2162     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   2163     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   2164 
   2165     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
   2166     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
   2167 
   2168     tmp12 = z1 - tmp10;
   2169     tmp13 = z1 + tmp11;
   2170     z1 -= LEFT_SHIFT(tmp11 - tmp10, 1);     /* c0 = (c6-c12)*2 */
   2171 
   2172     z4 = z2 - z3;
   2173     z3 += z2;
   2174     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
   2175     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
   2176     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
   2177 
   2178     tmp20 = tmp13 + tmp10 + tmp11;
   2179     tmp23 = tmp12 - tmp10 + tmp11 + z2;
   2180 
   2181     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
   2182     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
   2183 
   2184     tmp25 = tmp13 - tmp10 - tmp11;
   2185     tmp26 = tmp12 + tmp10 - tmp11 - z2;
   2186 
   2187     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
   2188     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
   2189 
   2190     tmp21 = tmp12 + tmp10 + tmp11;
   2191     tmp24 = tmp13 - tmp10 + tmp11;
   2192     tmp11 += tmp11;
   2193     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
   2194     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
   2195 
   2196     /* Odd part */
   2197 
   2198     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   2199     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   2200     z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   2201     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
   2202     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   2203 
   2204     tmp13 = z2 - z4;
   2205     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
   2206     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
   2207     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
   2208 
   2209     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
   2210     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
   2211     z2 = z1 - z4;
   2212     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
   2213 
   2214     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
   2215     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
   2216     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
   2217     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
   2218     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
   2219     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
   2220 
   2221     /* Final output stage */
   2222 
   2223     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   2224     wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   2225     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   2226     wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   2227     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   2228     wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   2229     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   2230     wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   2231     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   2232     wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   2233     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   2234     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   2235     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
   2236     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
   2237     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
   2238   }
   2239 
   2240   /* Pass 2: process 15 rows from work array, store into output array. */
   2241 
   2242   wsptr = workspace;
   2243   for (ctr = 0; ctr < 15; ctr++) {
   2244     outptr = output_buf[ctr] + output_col;
   2245 
   2246     /* Even part */
   2247 
   2248     /* Add fudge factor here for final descale. */
   2249     z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
   2250     z1 = LEFT_SHIFT(z1, CONST_BITS);
   2251 
   2252     z2 = (JLONG) wsptr[2];
   2253     z3 = (JLONG) wsptr[4];
   2254     z4 = (JLONG) wsptr[6];
   2255 
   2256     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
   2257     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
   2258 
   2259     tmp12 = z1 - tmp10;
   2260     tmp13 = z1 + tmp11;
   2261     z1 -= LEFT_SHIFT(tmp11 - tmp10, 1);     /* c0 = (c6-c12)*2 */
   2262 
   2263     z4 = z2 - z3;
   2264     z3 += z2;
   2265     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
   2266     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
   2267     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
   2268 
   2269     tmp20 = tmp13 + tmp10 + tmp11;
   2270     tmp23 = tmp12 - tmp10 + tmp11 + z2;
   2271 
   2272     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
   2273     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
   2274 
   2275     tmp25 = tmp13 - tmp10 - tmp11;
   2276     tmp26 = tmp12 + tmp10 - tmp11 - z2;
   2277 
   2278     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
   2279     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
   2280 
   2281     tmp21 = tmp12 + tmp10 + tmp11;
   2282     tmp24 = tmp13 - tmp10 + tmp11;
   2283     tmp11 += tmp11;
   2284     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
   2285     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
   2286 
   2287     /* Odd part */
   2288 
   2289     z1 = (JLONG) wsptr[1];
   2290     z2 = (JLONG) wsptr[3];
   2291     z4 = (JLONG) wsptr[5];
   2292     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
   2293     z4 = (JLONG) wsptr[7];
   2294 
   2295     tmp13 = z2 - z4;
   2296     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
   2297     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
   2298     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
   2299 
   2300     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
   2301     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
   2302     z2 = z1 - z4;
   2303     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
   2304 
   2305     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
   2306     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
   2307     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
   2308     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
   2309     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
   2310     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
   2311 
   2312     /* Final output stage */
   2313 
   2314     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   2315                                                CONST_BITS+PASS1_BITS+3)
   2316                              & RANGE_MASK];
   2317     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   2318                                                CONST_BITS+PASS1_BITS+3)
   2319                              & RANGE_MASK];
   2320     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   2321                                                CONST_BITS+PASS1_BITS+3)
   2322                              & RANGE_MASK];
   2323     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   2324                                                CONST_BITS+PASS1_BITS+3)
   2325                              & RANGE_MASK];
   2326     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   2327                                                CONST_BITS+PASS1_BITS+3)
   2328                              & RANGE_MASK];
   2329     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   2330                                                CONST_BITS+PASS1_BITS+3)
   2331                              & RANGE_MASK];
   2332     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   2333                                                CONST_BITS+PASS1_BITS+3)
   2334                              & RANGE_MASK];
   2335     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   2336                                                CONST_BITS+PASS1_BITS+3)
   2337                              & RANGE_MASK];
   2338     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   2339                                                CONST_BITS+PASS1_BITS+3)
   2340                              & RANGE_MASK];
   2341     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   2342                                                CONST_BITS+PASS1_BITS+3)
   2343                              & RANGE_MASK];
   2344     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   2345                                                CONST_BITS+PASS1_BITS+3)
   2346                              & RANGE_MASK];
   2347     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   2348                                                CONST_BITS+PASS1_BITS+3)
   2349                              & RANGE_MASK];
   2350     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
   2351                                                CONST_BITS+PASS1_BITS+3)
   2352                              & RANGE_MASK];
   2353     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
   2354                                                CONST_BITS+PASS1_BITS+3)
   2355                              & RANGE_MASK];
   2356     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
   2357                                                CONST_BITS+PASS1_BITS+3)
   2358                              & RANGE_MASK];
   2359 
   2360     wsptr += 8;         /* advance pointer to next row */
   2361   }
   2362 }
   2363 
   2364 
   2365 /*
   2366  * Perform dequantization and inverse DCT on one block of coefficients,
   2367  * producing a 16x16 output block.
   2368  *
   2369  * Optimized algorithm with 28 multiplications in the 1-D kernel.
   2370  * cK represents sqrt(2) * cos(K*pi/32).
   2371  */
   2372 
   2373 GLOBAL(void)
   2374 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   2375                  JCOEFPTR coef_block,
   2376                  JSAMPARRAY output_buf, JDIMENSION output_col)
   2377 {
   2378   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   2379   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2380   JLONG z1, z2, z3, z4;
   2381   JCOEFPTR inptr;
   2382   ISLOW_MULT_TYPE *quantptr;
   2383   int *wsptr;
   2384   JSAMPROW outptr;
   2385   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2386   int ctr;
   2387   int workspace[8*16];  /* buffers data between passes */
   2388   SHIFT_TEMPS
   2389 
   2390   /* Pass 1: process columns from input, store into work array. */
   2391 
   2392   inptr = coef_block;
   2393   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   2394   wsptr = workspace;
   2395   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2396     /* Even part */
   2397 
   2398     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   2399     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
   2400     /* Add fudge factor here for final descale. */
   2401     tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
   2402 
   2403     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   2404     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2405     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2406 
   2407     tmp10 = tmp0 + tmp1;
   2408     tmp11 = tmp0 - tmp1;
   2409     tmp12 = tmp0 + tmp2;
   2410     tmp13 = tmp0 - tmp2;
   2411 
   2412     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   2413     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   2414     z3 = z1 - z2;
   2415     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2416     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2417 
   2418     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2419     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2420     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2421     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2422 
   2423     tmp20 = tmp10 + tmp0;
   2424     tmp27 = tmp10 - tmp0;
   2425     tmp21 = tmp12 + tmp1;
   2426     tmp26 = tmp12 - tmp1;
   2427     tmp22 = tmp13 + tmp2;
   2428     tmp25 = tmp13 - tmp2;
   2429     tmp23 = tmp11 + tmp3;
   2430     tmp24 = tmp11 - tmp3;
   2431 
   2432     /* Odd part */
   2433 
   2434     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   2435     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   2436     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   2437     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   2438 
   2439     tmp11 = z1 + z3;
   2440 
   2441     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   2442     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   2443     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   2444     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   2445     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   2446     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   2447     tmp0  = tmp1 + tmp2 + tmp3 -
   2448             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   2449     tmp13 = tmp10 + tmp11 + tmp12 -
   2450             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   2451     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   2452     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   2453     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   2454     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   2455     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   2456     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   2457     z2    += z4;
   2458     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
   2459     tmp1  += z1;
   2460     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   2461     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
   2462     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   2463     tmp12 += z2;
   2464     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
   2465     tmp2  += z2;
   2466     tmp3  += z2;
   2467     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   2468     tmp10 += z2;
   2469     tmp11 += z2;
   2470 
   2471     /* Final output stage */
   2472 
   2473     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
   2474     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
   2475     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
   2476     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
   2477     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
   2478     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
   2479     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
   2480     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
   2481     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
   2482     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
   2483     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
   2484     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
   2485     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
   2486     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
   2487     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
   2488     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
   2489   }
   2490 
   2491   /* Pass 2: process 16 rows from work array, store into output array. */
   2492 
   2493   wsptr = workspace;
   2494   for (ctr = 0; ctr < 16; ctr++) {
   2495     outptr = output_buf[ctr] + output_col;
   2496 
   2497     /* Even part */
   2498 
   2499     /* Add fudge factor here for final descale. */
   2500     tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
   2501     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
   2502 
   2503     z1 = (JLONG) wsptr[4];
   2504     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2505     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2506 
   2507     tmp10 = tmp0 + tmp1;
   2508     tmp11 = tmp0 - tmp1;
   2509     tmp12 = tmp0 + tmp2;
   2510     tmp13 = tmp0 - tmp2;
   2511 
   2512     z1 = (JLONG) wsptr[2];
   2513     z2 = (JLONG) wsptr[6];
   2514     z3 = z1 - z2;
   2515     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2516     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2517 
   2518     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2519     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2520     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2521     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2522 
   2523     tmp20 = tmp10 + tmp0;
   2524     tmp27 = tmp10 - tmp0;
   2525     tmp21 = tmp12 + tmp1;
   2526     tmp26 = tmp12 - tmp1;
   2527     tmp22 = tmp13 + tmp2;
   2528     tmp25 = tmp13 - tmp2;
   2529     tmp23 = tmp11 + tmp3;
   2530     tmp24 = tmp11 - tmp3;
   2531 
   2532     /* Odd part */
   2533 
   2534     z1 = (JLONG) wsptr[1];
   2535     z2 = (JLONG) wsptr[3];
   2536     z3 = (JLONG) wsptr[5];
   2537     z4 = (JLONG) wsptr[7];
   2538 
   2539     tmp11 = z1 + z3;
   2540 
   2541     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   2542     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   2543     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   2544     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   2545     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   2546     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   2547     tmp0  = tmp1 + tmp2 + tmp3 -
   2548             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   2549     tmp13 = tmp10 + tmp11 + tmp12 -
   2550             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   2551     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   2552     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   2553     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   2554     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   2555     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   2556     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   2557     z2    += z4;
   2558     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
   2559     tmp1  += z1;
   2560     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   2561     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
   2562     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   2563     tmp12 += z2;
   2564     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
   2565     tmp2  += z2;
   2566     tmp3  += z2;
   2567     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   2568     tmp10 += z2;
   2569     tmp11 += z2;
   2570 
   2571     /* Final output stage */
   2572 
   2573     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
   2574                                                CONST_BITS+PASS1_BITS+3)
   2575                              & RANGE_MASK];
   2576     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
   2577                                                CONST_BITS+PASS1_BITS+3)
   2578                              & RANGE_MASK];
   2579     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
   2580                                                CONST_BITS+PASS1_BITS+3)
   2581                              & RANGE_MASK];
   2582     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
   2583                                                CONST_BITS+PASS1_BITS+3)
   2584                              & RANGE_MASK];
   2585     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
   2586                                                CONST_BITS+PASS1_BITS+3)
   2587                              & RANGE_MASK];
   2588     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
   2589                                                CONST_BITS+PASS1_BITS+3)
   2590                              & RANGE_MASK];
   2591     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
   2592                                                CONST_BITS+PASS1_BITS+3)
   2593                              & RANGE_MASK];
   2594     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
   2595                                                CONST_BITS+PASS1_BITS+3)
   2596                              & RANGE_MASK];
   2597     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
   2598                                                CONST_BITS+PASS1_BITS+3)
   2599                              & RANGE_MASK];
   2600     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
   2601                                                CONST_BITS+PASS1_BITS+3)
   2602                              & RANGE_MASK];
   2603     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
   2604                                                CONST_BITS+PASS1_BITS+3)
   2605                              & RANGE_MASK];
   2606     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
   2607                                                CONST_BITS+PASS1_BITS+3)
   2608                              & RANGE_MASK];
   2609     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
   2610                                                CONST_BITS+PASS1_BITS+3)
   2611                              & RANGE_MASK];
   2612     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
   2613                                                CONST_BITS+PASS1_BITS+3)
   2614                              & RANGE_MASK];
   2615     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
   2616                                                CONST_BITS+PASS1_BITS+3)
   2617                              & RANGE_MASK];
   2618     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
   2619                                                CONST_BITS+PASS1_BITS+3)
   2620                              & RANGE_MASK];
   2621 
   2622     wsptr += 8;         /* advance pointer to next row */
   2623   }
   2624 }
   2625 
   2626 #endif /* IDCT_SCALING_SUPPORTED */
   2627 #endif /* DCT_ISLOW_SUPPORTED */
   2628