Home | History | Annotate | Download | only in libjpeg
      1 /*
      2  * jidctint.c
      3  *
      4  * Copyright (C) 1991-1998, Thomas G. Lane.
      5  * Modification developed 2002-2009 by Guido Vollbeding.
      6  * This file is part of the Independent JPEG Group's software.
      7  * For conditions of distribution and use, see the accompanying README file.
      8  *
      9  * This file contains a slow-but-accurate integer implementation of the
     10  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
     11  * must also perform dequantization of the input coefficients.
     12  *
     13  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
     14  * on each row (or vice versa, but it's more convenient to emit a row at
     15  * a time).  Direct algorithms are also available, but they are much more
     16  * complex and seem not to be any faster when reduced to code.
     17  *
     18  * This implementation is based on an algorithm described in
     19  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
     20  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
     21  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
     22  * The primary algorithm described there uses 11 multiplies and 29 adds.
     23  * We use their alternate method with 12 multiplies and 32 adds.
     24  * The advantage of this method is that no data path contains more than one
     25  * multiplication; this allows a very simple and accurate implementation in
     26  * scaled fixed-point arithmetic, with a minimal number of shifts.
     27  *
     28  * We also provide IDCT routines with various output sample block sizes for
     29  * direct resolution reduction or enlargement and for directly resolving the
     30  * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
     31  * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
     32  *
     33  * For N<8 we simply take the corresponding low-frequency coefficients of
     34  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
     35  * to yield the downscaled outputs.
     36  * This can be seen as direct low-pass downsampling from the DCT domain
     37  * point of view rather than the usual spatial domain point of view,
     38  * yielding significant computational savings and results at least
     39  * as good as common bilinear (averaging) spatial downsampling.
     40  *
     41  * For N>8 we apply a partial NxN IDCT, treating the 8 input coefficients
     42  * as the lower frequencies and assuming the higher frequencies are zero.
     43  * It turns out that the computational effort is similar to the 8x8 IDCT
     44  * regarding the output size.
     45  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
     46  *
     47  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
     48  * since there would be too many additional constants to pre-calculate.
     49  */
     50 
     51 #define JPEG_INTERNALS
     52 #include "jinclude.h"
     53 #include "jpeglib.h"
     54 #include "jdct.h"		/* Private declarations for DCT subsystem */
     55 
     56 #ifdef DCT_ISLOW_SUPPORTED
     57 
     58 
     59 /*
     60  * This module is specialized to the case DCTSIZE = 8.
     61  */
     62 
     63 #if DCTSIZE != 8
     64   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax error: this non-C text stops the build when DCTSIZE != 8 */
     65 #endif
     66 
     67 
     68 /*
     69  * The poop on this scaling stuff is as follows:
     70  *
     71  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
     72  * larger than the true IDCT outputs.  The final outputs are therefore
     73  * a factor of N larger than desired; since N=8 this can be cured by
     74  * a simple right shift at the end of the algorithm.  The advantage of
     75  * this arrangement is that we save two multiplications per 1-D IDCT,
     76  * because the y0 and y4 inputs need not be divided by sqrt(N).
     77  *
     78  * We have to do addition and subtraction of the integer inputs, which
     79  * is no problem, and multiplication by fractional constants, which is
     80  * a problem to do in integer arithmetic.  We multiply all the constants
     81  * by CONST_SCALE and convert them to integer constants (thus retaining
     82  * CONST_BITS bits of precision in the constants).  After doing a
     83  * multiplication we have to divide the product by CONST_SCALE, with proper
     84  * rounding, to produce the correct output.  This division can be done
     85  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
     86  * as long as possible so that partial sums can be added together with
     87  * full fractional precision.
     88  *
     89  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
     90  * they are represented to better-than-integral precision.  These outputs
     91  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
     92  * with the recommended scaling.  (To scale up 12-bit sample data further, an
     93  * intermediate INT32 array would be needed.)
     94  *
     95  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
     96  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
     97  * shows that the values given below are the most effective.
     98  */
     99 
    100 #if BITS_IN_JSAMPLE == 8
    101 #define CONST_BITS  13		/* fractional bits kept in the fixed-point constants */
    102 #define PASS1_BITS  2		/* extra scaling bits carried by the pass-1 outputs */
    103 #else
    104 #define CONST_BITS  13		/* same constant precision as the 8-bit case */
    105 #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
    106 #endif
    107 
    108 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
    109  * causing a lot of useless floating-point operations at run time.
    110  * To get around this we use the following pre-calculated constants.
    111  * If you change CONST_BITS you may want to add appropriate values.
    112  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
    113  */
    114 
    115 #if CONST_BITS == 13
        /* Each value below is the named constant multiplied by 2**13 (8192)
         * and rounded to the nearest integer.
         */
    116 #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
    117 #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
    118 #define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
    119 #define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
    120 #define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
    121 #define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
    122 #define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
    123 #define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
    124 #define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
    125 #define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
    126 #define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
    127 #define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
    128 #else
        /* Any other CONST_BITS: fall back on the FIX() macro (defined outside
         * this file section) to derive the constants.
         */
    129 #define FIX_0_298631336  FIX(0.298631336)
    130 #define FIX_0_390180644  FIX(0.390180644)
    131 #define FIX_0_541196100  FIX(0.541196100)
    132 #define FIX_0_765366865  FIX(0.765366865)
    133 #define FIX_0_899976223  FIX(0.899976223)
    134 #define FIX_1_175875602  FIX(1.175875602)
    135 #define FIX_1_501321110  FIX(1.501321110)
    136 #define FIX_1_847759065  FIX(1.847759065)
    137 #define FIX_1_961570560  FIX(1.961570560)
    138 #define FIX_2_053119869  FIX(2.053119869)
    139 #define FIX_2_562915447  FIX(2.562915447)
    140 #define FIX_3_072711026  FIX(3.072711026)
    141 #endif
    142 
    143 
    144 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
    145  * For 8-bit samples with the recommended scaling, all the variable
    146  * and constant values involved are no more than 16 bits wide, so a
    147  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
    148  * For 12-bit samples, a full 32-bit multiplication will be needed.
    149  */
    150 
    151 #if BITS_IN_JSAMPLE == 8
    152 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)	/* 8-bit samples: delegate to MULTIPLY16C16 (defined elsewhere) */
    153 #else
    154 #define MULTIPLY(var,const)  ((var) * (const))	/* otherwise: plain multiplication of the two operands */
    155 #endif
    156 
    157 
    158 /* Dequantize a coefficient by multiplying it by the multiplier-table
    159  * entry; produce an int result.  In this module, both inputs and result
    160  * are 16 bits or less, so either int or short multiply will work.
    161  */
    162 
    163 #define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))	/* cast the coefficient to the multiplier type, then scale it by the table entry */
    164 
    165 
    166 /*
    167  * Perform dequantization and inverse DCT on one block of coefficients.
         *
         * Parameters, as they are used by this function:
         *   cinfo      - only handed to IDCT_range_limit() to obtain the
         *                range-limit lookup table used for the final samples.
         *   compptr    - supplies dct_table, which is read here as an array of
         *                ISLOW_MULT_TYPE dequantization multipliers laid out
         *                like coef_block.
         *   coef_block - quantized coefficients; the entry for row k of column c
         *                is read as coef_block[DCTSIZE*k + c].
         *   output_buf - output rows; rows 0..DCTSIZE-1 are written.
         *   output_col - index of the first sample written in each output row;
         *                DCTSIZE consecutive samples are stored per row.
         *
         * The 2-D transform is done as two 1-D passes.  Pass 1 dequantizes and
         * transforms each column into the local workspace, leaving the results
         * scaled up by 2**PASS1_BITS.  Pass 2 transforms each workspace row,
         * removes all scaling, and maps the result to a sample through the
         * range-limit table.
    168  */
    169 
    170 GLOBAL(void)
    171 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    172                  JCOEFPTR coef_block,
    173                  JSAMPARRAY output_buf, JDIMENSION output_col)
    174 {
    175   INT32 tmp0, tmp1, tmp2, tmp3;
    176   INT32 tmp10, tmp11, tmp12, tmp13;
    177   INT32 z1, z2, z3;
    178   JCOEFPTR inptr;
    179   ISLOW_MULT_TYPE * quantptr;
    180   int * wsptr;
    181   JSAMPROW outptr;
    182   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    183   int ctr;
    184   int workspace[DCTSIZE2];	/* buffers data between passes */
    185   SHIFT_TEMPS
    186 
    187   /* Pass 1: process columns from input, store into work array. */
    188   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
    189   /* furthermore, we scale the results by 2**PASS1_BITS. */
    190 
    191   inptr = coef_block;
    192   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    193   wsptr = workspace;
    194   for (ctr = DCTSIZE; ctr > 0; ctr--) {
    195     /* Due to quantization, we will usually find that many of the input
    196      * coefficients are zero, especially the AC terms.  We can exploit this
    197      * by short-circuiting the IDCT calculation for any column in which all
    198      * the AC terms are zero.  In that case each output is equal to the
    199      * DC coefficient (with scale factor as needed).
    200      * With typical images and quantization tables, half or more of the
    201      * column DCT calculations can be simplified this way.
    202      */
    203 
    204     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
    205         inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
    206         inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
    207         inptr[DCTSIZE*7] == 0) {
    208       /* AC terms all zero */
              /* The dequantized DC term is pre-scaled by 2**PASS1_BITS so that it
               * carries the same scaling as the outputs of the full path below.
               * NOTE(review): the product may be negative, and left-shifting a
               * negative signed value is undefined behavior in ISO C - confirm
               * that the supported compilers behave as expected here.
               */
    209       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
    210 
    211       wsptr[DCTSIZE*0] = dcval;
    212       wsptr[DCTSIZE*1] = dcval;
    213       wsptr[DCTSIZE*2] = dcval;
    214       wsptr[DCTSIZE*3] = dcval;
    215       wsptr[DCTSIZE*4] = dcval;
    216       wsptr[DCTSIZE*5] = dcval;
    217       wsptr[DCTSIZE*6] = dcval;
    218       wsptr[DCTSIZE*7] = dcval;
    219 
    220       inptr++;			/* advance pointers to next column */
    221       quantptr++;
    222       wsptr++;
    223       continue;
    224     }
    225 
    226     /* Even part: reverse the even part of the forward DCT. */
    227     /* The rotator is sqrt(2)*c(-6). */
    228 
    229     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    230     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
    231 
    232     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    233     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
    234     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
    235 
    236     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    237     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
            /* Rows 0 and 4 are not multiplied by a constant, so shift them up to
             * the CONST_BITS scaling that the MULTIPLY products above carry.
             * NOTE(review): z2/z3 may be negative; see the left-shift caveat in
             * ISO C (undefined for negative signed operands) - confirm.
             */
    238     z2 <<= CONST_BITS;
    239     z3 <<= CONST_BITS;
    240     /* Add fudge factor here for final descale. */
            /* The fudge is half of 2**(CONST_BITS-PASS1_BITS), the divisor applied
             * by the RIGHT_SHIFTs at the end of this pass; adding it to z2 carries
             * it into all eight outputs through tmp0 and tmp1.
             */
    241     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
    242 
    243     tmp0 = z2 + z3;
    244     tmp1 = z2 - z3;
    245 
    246     tmp10 = tmp0 + tmp2;
    247     tmp13 = tmp0 - tmp2;
    248     tmp11 = tmp1 + tmp3;
    249     tmp12 = tmp1 - tmp3;
    250 
    251     /* Odd part per figure 8; the matrix is unitary and hence its
    252      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    253      */
    254 
            /* tmp0..tmp3 are reused from here on; the even-part results are kept
             * in tmp10..tmp13.
             */
    255     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
    256     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    257     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    258     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    259 
    260     z2 = tmp0 + tmp2;
    261     z3 = tmp1 + tmp3;
    262 
    263     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
    264     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    265     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    266     z2 += z1;
    267     z3 += z1;
    268 
    269     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    270     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    271     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    272     tmp0 += z1 + z2;
    273     tmp3 += z1 + z3;
    274 
    275     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    276     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    277     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    278     tmp1 += z1 + z3;
    279     tmp2 += z1 + z2;
    280 
    281     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
            /* Shifting by CONST_BITS-PASS1_BITS (rather than CONST_BITS) leaves the
             * stored values scaled up by 2**PASS1_BITS.
             */
    282 
    283     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
    284     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
    285     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
    286     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
    287     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
    288     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
    289     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
    290     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
    291 
    292     inptr++;			/* advance pointers to next column */
    293     quantptr++;
    294     wsptr++;
    295   }
    296 
    297   /* Pass 2: process rows from work array, store into output array. */
    298   /* Note that we must descale the results by a factor of 8 == 2**3, */
    299   /* and also undo the PASS1_BITS scaling. */
    300 
    301   wsptr = workspace;
    302   for (ctr = 0; ctr < DCTSIZE; ctr++) {
    303     outptr = output_buf[ctr] + output_col;
    304     /* Rows of zeroes can be exploited in the same way as we did with columns.
    305      * However, the column calculation has created many nonzero AC terms, so
    306      * the simplification applies less often (typically 5% to 10% of the time).
    307      * On machines with very fast multiplication, it's possible that the
    308      * test takes more time than it's worth.  In that case this section
    309      * may be commented out.
    310      */
    311 
    312 #ifndef NO_ZERO_ROW_TEST
    313     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
    314         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
    315       /* AC terms all zero */
              /* DESCALE and RANGE_MASK are defined outside this file section;
               * presumably DESCALE is a rounding right shift - confirm.  The
               * shift count PASS1_BITS+3 removes the pass-1 scaling and the
               * factor of 8 mentioned above.
               */
    316       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
    317                                   & RANGE_MASK];
    318 
    319       outptr[0] = dcval;
    320       outptr[1] = dcval;
    321       outptr[2] = dcval;
    322       outptr[3] = dcval;
    323       outptr[4] = dcval;
    324       outptr[5] = dcval;
    325       outptr[6] = dcval;
    326       outptr[7] = dcval;
    327 
    328       wsptr += DCTSIZE;		/* advance pointer to next row */
    329       continue;
    330     }
    331 #endif
    332 
    333     /* Even part: reverse the even part of the forward DCT. */
    334     /* The rotator is sqrt(2)*c(-6). */
    335 
    336     z2 = (INT32) wsptr[2];
    337     z3 = (INT32) wsptr[6];
    338 
    339     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    340     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
    341     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
    342 
    343     /* Add fudge factor here for final descale. */
            /* After the << CONST_BITS below the fudge becomes
             * 2**(CONST_BITS+PASS1_BITS+2), i.e. half of the divisor
             * 2**(CONST_BITS+PASS1_BITS+3) used in the final output stage.
             */
    344     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    345     z3 = (INT32) wsptr[4];
    346 
    347     tmp0 = (z2 + z3) << CONST_BITS;
    348     tmp1 = (z2 - z3) << CONST_BITS;
    349 
    350     tmp10 = tmp0 + tmp2;
    351     tmp13 = tmp0 - tmp2;
    352     tmp11 = tmp1 + tmp3;
    353     tmp12 = tmp1 - tmp3;
    354 
    355     /* Odd part per figure 8; the matrix is unitary and hence its
    356      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    357      */
    358 
    359     tmp0 = (INT32) wsptr[7];
    360     tmp1 = (INT32) wsptr[5];
    361     tmp2 = (INT32) wsptr[3];
    362     tmp3 = (INT32) wsptr[1];
    363 
    364     z2 = tmp0 + tmp2;
    365     z3 = tmp1 + tmp3;
    366 
    367     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
    368     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    369     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    370     z2 += z1;
    371     z3 += z1;
    372 
    373     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    374     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    375     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    376     tmp0 += z1 + z2;
    377     tmp3 += z1 + z3;
    378 
    379     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    380     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    381     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    382     tmp1 += z1 + z3;
    383     tmp2 += z1 + z2;
    384 
    385     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
            /* Each value is shifted down by CONST_BITS+PASS1_BITS+3, masked with
             * RANGE_MASK, and used as an index into the range-limit table to
             * obtain the output sample.
             */
    386 
    387     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
    388                                               CONST_BITS+PASS1_BITS+3)
    389                             & RANGE_MASK];
    390     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
    391                                               CONST_BITS+PASS1_BITS+3)
    392                             & RANGE_MASK];
    393     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
    394                                               CONST_BITS+PASS1_BITS+3)
    395                             & RANGE_MASK];
    396     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
    397                                               CONST_BITS+PASS1_BITS+3)
    398                             & RANGE_MASK];
    399     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
    400                                               CONST_BITS+PASS1_BITS+3)
    401                             & RANGE_MASK];
    402     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
    403                                               CONST_BITS+PASS1_BITS+3)
    404                             & RANGE_MASK];
    405     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
    406                                               CONST_BITS+PASS1_BITS+3)
    407                             & RANGE_MASK];
    408     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
    409                                               CONST_BITS+PASS1_BITS+3)
    410                             & RANGE_MASK];
    411 
    412     wsptr += DCTSIZE;		/* advance pointer to next row */
    413   }
    414 }
    415 
    416 #ifdef IDCT_SCALING_SUPPORTED
    417 
    418 
    419 /*
    420  * Perform dequantization and inverse DCT on one block of coefficients,
    421  * producing a reduced-size 7x7 output block.
    422  *
    423  * Optimized algorithm with 12 multiplications in the 1-D kernel.
    424  * cK represents sqrt(2) * cos(K*pi/14).
         *
         * Only rows 0..6 of columns 0..6 of coef_block (row stride DCTSIZE) and
         * of compptr->dct_table are read.  Rows 0..6 of output_buf are written,
         * 7 samples each, starting at output_col.  cinfo is only used to obtain
         * the range-limit table.  Unlike jpeg_idct_islow, there is no shortcut
         * for columns or rows whose AC terms are all zero.
    425  */
    426 
    427 GLOBAL(void)
    428 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    429                JCOEFPTR coef_block,
    430                JSAMPARRAY output_buf, JDIMENSION output_col)
    431 {
    432   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
    433   INT32 z1, z2, z3;
    434   JCOEFPTR inptr;
    435   ISLOW_MULT_TYPE * quantptr;
    436   int * wsptr;
    437   JSAMPROW outptr;
    438   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    439   int ctr;
    440   int workspace[7*7];	/* buffers data between passes */
    441   SHIFT_TEMPS
    442 
    443   /* Pass 1: process columns from input, store into work array. */
          /* The workspace is 7 wide, so consecutive rows of a column are 7 apart. */
    444 
    445   inptr = coef_block;
    446   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    447   wsptr = workspace;
    448   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
    449     /* Even part */
    450 
            /* NOTE(review): the dequantized DC term may be negative; left-shifting
             * a negative signed value is undefined behavior in ISO C - confirm.
             */
    451     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    452     tmp13 <<= CONST_BITS;
    453     /* Add fudge factor here for final descale. */
            /* Half of 2**(CONST_BITS-PASS1_BITS), the divisor applied by the
             * RIGHT_SHIFTs in this pass's final output stage.
             */
    454     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
    455 
    456     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    457     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    458     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
    459 
    460     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
    461     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
    462     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
    463     tmp0 = z1 + z3;
    464     z2 -= tmp0;
    465     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
    466     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
    467     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
    468     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
    469 
    470     /* Odd part */
    471 
            /* z1..z3 and tmp0 are reused here; the even-part results are kept in
             * tmp10..tmp13.
             */
    472     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    473     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    474     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    475 
    476     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
    477     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
    478     tmp0 = tmp1 - tmp2;
    479     tmp1 += tmp2;
    480     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
    481     tmp1 += tmp2;
    482     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
    483     tmp0 += z2;
    484     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
    485 
    486     /* Final output stage */
            /* The middle output (row 3) takes the even part only. */
    487 
    488     wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    489     wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    490     wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
    491     wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
    492     wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
    493     wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
    494     wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
    495   }
    496 
    497   /* Pass 2: process 7 rows from work array, store into output array. */
    498 
    499   wsptr = workspace;
    500   for (ctr = 0; ctr < 7; ctr++) {
    501     outptr = output_buf[ctr] + output_col;
    502 
    503     /* Even part */
    504 
    505     /* Add fudge factor here for final descale. */
            /* After the shift below the fudge is 2**(CONST_BITS+PASS1_BITS+2),
             * half of the divisor used in the final output stage.
             */
    506     tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    507     tmp13 <<= CONST_BITS;
    508 
    509     z1 = (INT32) wsptr[2];
    510     z2 = (INT32) wsptr[4];
    511     z3 = (INT32) wsptr[6];
    512 
    513     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
    514     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
    515     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
    516     tmp0 = z1 + z3;
    517     z2 -= tmp0;
    518     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
    519     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
    520     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
    521     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
    522 
    523     /* Odd part */
    524 
    525     z1 = (INT32) wsptr[1];
    526     z2 = (INT32) wsptr[3];
    527     z3 = (INT32) wsptr[5];
    528 
    529     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
    530     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
    531     tmp0 = tmp1 - tmp2;
    532     tmp1 += tmp2;
    533     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
    534     tmp1 += tmp2;
    535     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
    536     tmp0 += z2;
    537     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
    538 
    539     /* Final output stage */
            /* Shift down by CONST_BITS+PASS1_BITS+3, mask with RANGE_MASK, then
             * look the sample up in the range-limit table.
             */
    540 
    541     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    542                                               CONST_BITS+PASS1_BITS+3)
    543                             & RANGE_MASK];
    544     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    545                                               CONST_BITS+PASS1_BITS+3)
    546                             & RANGE_MASK];
    547     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    548                                               CONST_BITS+PASS1_BITS+3)
    549                             & RANGE_MASK];
    550     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    551                                               CONST_BITS+PASS1_BITS+3)
    552                             & RANGE_MASK];
    553     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
    554                                               CONST_BITS+PASS1_BITS+3)
    555                             & RANGE_MASK];
    556     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
    557                                               CONST_BITS+PASS1_BITS+3)
    558                             & RANGE_MASK];
    559     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
    560                                               CONST_BITS+PASS1_BITS+3)
    561                             & RANGE_MASK];
    562 
    563     wsptr += 7;		/* advance pointer to next row */
    564   }
    565 }
    566 
    567 
    568 /*
    569  * Perform dequantization and inverse DCT on one block of coefficients,
    570  * producing a reduced-size 6x6 output block.
    571  *
    572  * Optimized algorithm with 3 multiplications in the 1-D kernel.
    573  * cK represents sqrt(2) * cos(K*pi/12).
         *
         * Only rows 0..5 of columns 0..5 of coef_block (row stride DCTSIZE) and
         * of compptr->dct_table are read.  Rows 0..5 of output_buf are written,
         * 6 samples each, starting at output_col.  cinfo is only used to obtain
         * the range-limit table.
    574  */
    575 
    576 GLOBAL(void)
    577 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    578                JCOEFPTR coef_block,
    579                JSAMPARRAY output_buf, JDIMENSION output_col)
    580 {
    581   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
    582   INT32 z1, z2, z3;
    583   JCOEFPTR inptr;
    584   ISLOW_MULT_TYPE * quantptr;
    585   int * wsptr;
    586   JSAMPROW outptr;
    587   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    588   int ctr;
    589   int workspace[6*6];	/* buffers data between passes */
    590   SHIFT_TEMPS
    591 
    592   /* Pass 1: process columns from input, store into work array. */
          /* The workspace is 6 wide, so consecutive rows of a column are 6 apart. */
    593 
    594   inptr = coef_block;
    595   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    596   wsptr = workspace;
    597   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
    598     /* Even part */
    599 
            /* NOTE(review): the dequantized DC term may be negative; left-shifting
             * a negative signed value is undefined behavior in ISO C - confirm.
             */
    600     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    601     tmp0 <<= CONST_BITS;
    602     /* Add fudge factor here for final descale. */
    603     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
    604     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    605     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
    606     tmp1 = tmp0 + tmp10;
            /* tmp11 is descaled right here (the rounding fudge is already part of
             * tmp0), so it holds a value scaled by 2**PASS1_BITS from now on.
             */
    607     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
    608     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    609     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
    610     tmp10 = tmp1 + tmp0;
    611     tmp12 = tmp1 - tmp0;
    612 
    613     /* Odd part */
    614 
    615     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    616     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    617     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    618     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
    619     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
    620     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
            /* tmp1 involves no multiplication, so it is scaled directly by
             * 2**PASS1_BITS to match the already-descaled tmp11.
             */
    621     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
    622 
    623     /* Final output stage */
            /* Rows 1 and 4 combine the pre-descaled tmp11 and tmp1 and therefore
             * need no further shift; the other rows are descaled here.
             */
    624 
    625     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    626     wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    627     wsptr[6*1] = (int) (tmp11 + tmp1);
    628     wsptr[6*4] = (int) (tmp11 - tmp1);
    629     wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
    630     wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
    631   }
    632 
    633   /* Pass 2: process 6 rows from work array, store into output array. */
    634 
    635   wsptr = workspace;
    636   for (ctr = 0; ctr < 6; ctr++) {
    637     outptr = output_buf[ctr] + output_col;
    638 
    639     /* Even part */
    640 
    641     /* Add fudge factor here for final descale. */
            /* After the shift below the fudge is 2**(CONST_BITS+PASS1_BITS+2),
             * half of the divisor used in the final output stage.
             */
    642     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    643     tmp0 <<= CONST_BITS;
    644     tmp2 = (INT32) wsptr[4];
    645     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
    646     tmp1 = tmp0 + tmp10;
            /* Unlike pass 1, tmp11 keeps its CONST_BITS scaling here and is
             * descaled together with the other outputs.
             */
    647     tmp11 = tmp0 - tmp10 - tmp10;
    648     tmp10 = (INT32) wsptr[2];
    649     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
    650     tmp10 = tmp1 + tmp0;
    651     tmp12 = tmp1 - tmp0;
    652 
    653     /* Odd part */
    654 
    655     z1 = (INT32) wsptr[1];
    656     z2 = (INT32) wsptr[3];
    657     z3 = (INT32) wsptr[5];
    658     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
    659     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
    660     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
            /* Scaled by 2**CONST_BITS (not PASS1_BITS as in pass 1) to match the
             * undescaled tmp11 above.
             */
    661     tmp1 = (z1 - z2 - z3) << CONST_BITS;
    662 
    663     /* Final output stage */
            /* Shift down by CONST_BITS+PASS1_BITS+3, mask with RANGE_MASK, then
             * look the sample up in the range-limit table.
             */
    664 
    665     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    666                                               CONST_BITS+PASS1_BITS+3)
    667                             & RANGE_MASK];
    668     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    669                                               CONST_BITS+PASS1_BITS+3)
    670                             & RANGE_MASK];
    671     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    672                                               CONST_BITS+PASS1_BITS+3)
    673                             & RANGE_MASK];
    674     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    675                                               CONST_BITS+PASS1_BITS+3)
    676                             & RANGE_MASK];
    677     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
    678                                               CONST_BITS+PASS1_BITS+3)
    679                             & RANGE_MASK];
    680     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
    681                                               CONST_BITS+PASS1_BITS+3)
    682                             & RANGE_MASK];
    683 
    684     wsptr += 6;		/* advance pointer to next row */
    685   }
    686 }
    687 
    688 
    689 /*
    690  * Perform dequantization and inverse DCT on one block of coefficients,
    691  * producing a reduced-size 5x5 output block.
    692  *
    693  * Optimized algorithm with 5 multiplications in the 1-D kernel.
    694  * cK represents sqrt(2) * cos(K*pi/10).
         *
         * Only rows 0..4 of columns 0..4 of coef_block are read (element index
         * DCTSIZE*row + column); every other coefficient is ignored.
         *
         * Arguments, as used here:
         *   cinfo      - handed to IDCT_range_limit() to obtain the output table.
         *   compptr    - its dct_table member is used as the ISLOW_MULT_TYPE
         *                multiplier table for DEQUANTIZE().
         *   coef_block - input coefficients.
         *   output_buf - output rows; rows 0..4 are written.
         *   output_col - offset of the first of the 5 samples written per row.
    695  */
    696 
    697 GLOBAL(void)
    698 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    699                JCOEFPTR coef_block,
    700                JSAMPARRAY output_buf, JDIMENSION output_col)
    701 {
    702   INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
    703   INT32 z1, z2, z3;
    704   JCOEFPTR inptr;
    705   ISLOW_MULT_TYPE * quantptr;
    706   int * wsptr;
    707   JSAMPROW outptr;
    708   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    709   int ctr;
    710   int workspace[5*5];	/* buffers data between passes */
    711   SHIFT_TEMPS
    712 
    713   /* Pass 1: process columns from input, store into work array. */
          /* Each iteration handles one input column; inptr, quantptr and wsptr
           * advance together, and the 5 results are stored down the matching
           * workspace column (stride 5).
           */
    714 
    715   inptr = coef_block;
    716   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    717   wsptr = workspace;
    718   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
    719     /* Even part */
    720 
    721     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    722     tmp12 <<= CONST_BITS;
    723     /* Add fudge factor here for final descale. */
            /* The term is half of the 2^(CONST_BITS-PASS1_BITS) step removed by
             * the RIGHT_SHIFTs below (RIGHT_SHIFT is defined outside this chunk).
             * tmp12 enters each of the 5 outputs exactly once, so a single
             * addition here biases all of them.
             */
    724     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
    725     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    726     tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    727     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
    728     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
    729     z3 = tmp12 + z2;
    730     tmp10 = z3 + z1;
    731     tmp11 = z3 - z1;
            /* z3 already holds tmp12 + z2; tmp12 now becomes the index-2 term. */
    732     tmp12 -= z2 << 2;
    733 
    734     /* Odd part */
            /* tmp0 and tmp1 are reused here for the odd-part terms. */
    735 
    736     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    737     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    738 
    739     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
    740     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
    741     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
    742 
    743     /* Final output stage */
            /* Indices k and 4-k share an even-part term and differ only in the
             * sign of the odd-part term; index 2 has no odd-part contribution.
             */
    744 
    745     wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    746     wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    747     wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
    748     wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
    749     wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
    750   }
    751 
    752   /* Pass 2: process 5 rows from work array, store into output array. */
          /* wsptr steps through the workspace one 5-element row per iteration. */
    753 
    754   wsptr = workspace;
    755   for (ctr = 0; ctr < 5; ctr++) {
    756     outptr = output_buf[ctr] + output_col;
    757 
    758     /* Even part */
    759 
    760     /* Add fudge factor here for final descale. */
            /* After the << CONST_BITS below the term equals
             * 2^(CONST_BITS+PASS1_BITS+2), half of the step removed by the final
             * RIGHT_SHIFT by CONST_BITS+PASS1_BITS+3.
             */
    761     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    762     tmp12 <<= CONST_BITS;
    763     tmp0 = (INT32) wsptr[2];
    764     tmp1 = (INT32) wsptr[4];
    765     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
    766     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
    767     z3 = tmp12 + z2;
    768     tmp10 = z3 + z1;
    769     tmp11 = z3 - z1;
            /* Same role change as in pass 1: tmp12 becomes the index-2 term. */
    770     tmp12 -= z2 << 2;
    771 
    772     /* Odd part */
    773 
    774     z2 = (INT32) wsptr[1];
    775     z3 = (INT32) wsptr[3];
    776 
    777     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
    778     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
    779     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
    780 
    781     /* Final output stage */
            /* Each descaled value is masked with RANGE_MASK and used as an index
             * into range_limit; the table entry is the sample that gets stored.
             * NOTE(review): both are defined outside this chunk; presumably the
             * table clamps to the valid sample range - confirm.
             */
    782 
    783     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    784                                               CONST_BITS+PASS1_BITS+3)
    785                             & RANGE_MASK];
    786     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    787                                               CONST_BITS+PASS1_BITS+3)
    788                             & RANGE_MASK];
    789     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    790                                               CONST_BITS+PASS1_BITS+3)
    791                             & RANGE_MASK];
    792     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    793                                               CONST_BITS+PASS1_BITS+3)
    794                             & RANGE_MASK];
    795     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
    796                                               CONST_BITS+PASS1_BITS+3)
    797                             & RANGE_MASK];
    798 
    799     wsptr += 5;		/* advance pointer to next row */
    800   }
    801 }
    802 
    803 
    804 /*
    805  * Perform dequantization and inverse DCT on one block of coefficients,
    806  * producing a reduced-size 4x4 output block.
    807  *
    808  * Optimized algorithm with 3 multiplications in the 1-D kernel.
    809  * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
         *
         * Only rows 0..3 of columns 0..3 of coef_block are read (element index
         * DCTSIZE*row + column); every other coefficient is ignored.
         *
         * Arguments, as used here:
         *   cinfo      - handed to IDCT_range_limit() to obtain the output table.
         *   compptr    - its dct_table member is used as the ISLOW_MULT_TYPE
         *                multiplier table for DEQUANTIZE().
         *   coef_block - input coefficients.
         *   output_buf - output rows; rows 0..3 are written.
         *   output_col - offset of the first of the 4 samples written per row.
    810  */
    811 
    812 GLOBAL(void)
    813 jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    814                JCOEFPTR coef_block,
    815                JSAMPARRAY output_buf, JDIMENSION output_col)
    816 {
    817   INT32 tmp0, tmp2, tmp10, tmp12;
    818   INT32 z1, z2, z3;
    819   JCOEFPTR inptr;
    820   ISLOW_MULT_TYPE * quantptr;
    821   int * wsptr;
    822   JSAMPROW outptr;
    823   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    824   int ctr;
    825   int workspace[4*4];	/* buffers data between passes */
    826   SHIFT_TEMPS
    827 
    828   /* Pass 1: process columns from input, store into work array. */
          /* Each iteration handles one input column; inptr, quantptr and wsptr
           * advance together, and the 4 results are stored down the matching
           * workspace column (stride 4).
           */
    829 
    830   inptr = coef_block;
    831   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    832   wsptr = workspace;
    833   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
    834     /* Even part */
            /* No multiplication is used here: the sum and difference are shifted
             * up by PASS1_BITS directly, which is the scale the odd part is
             * brought down to below.
             */
    835 
    836     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    837     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    838 
    839     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
    840     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
    841 
    842     /* Odd part */
    843     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
            /* tmp0 and tmp2 are reused for the odd-part terms. */
    844 
    845     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    846     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    847 
    848     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
    849     /* Add fudge factor here for final descale. */
            /* z1 feeds both tmp0 and tmp2, so the rounding term (half of the
             * 2^(CONST_BITS-PASS1_BITS) step removed by the two RIGHT_SHIFTs
             * below) is added once here.
             */
    850     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
    851     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
    852                        CONST_BITS-PASS1_BITS);
    853     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
    854                        CONST_BITS-PASS1_BITS);
    855 
    856     /* Final output stage */
            /* Even and odd terms are already at the same scale, so the results
             * are stored after a plain add/subtract with no further shift.
             */
    857 
    858     wsptr[4*0] = (int) (tmp10 + tmp0);
    859     wsptr[4*3] = (int) (tmp10 - tmp0);
    860     wsptr[4*1] = (int) (tmp12 + tmp2);
    861     wsptr[4*2] = (int) (tmp12 - tmp2);
    862   }
    863 
    864   /* Pass 2: process 4 rows from work array, store into output array. */
          /* wsptr steps through the workspace one 4-element row per iteration. */
    865 
    866   wsptr = workspace;
    867   for (ctr = 0; ctr < 4; ctr++) {
    868     outptr = output_buf[ctr] + output_col;
    869 
    870     /* Even part */
    871 
    872     /* Add fudge factor here for final descale. */
            /* tmp0 enters both tmp10 and tmp12; after their << CONST_BITS the
             * term equals 2^(CONST_BITS+PASS1_BITS+2), half of the step removed
             * by the final RIGHT_SHIFT by CONST_BITS+PASS1_BITS+3.
             */
    873     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    874     tmp2 = (INT32) wsptr[2];
    875 
    876     tmp10 = (tmp0 + tmp2) << CONST_BITS;
    877     tmp12 = (tmp0 - tmp2) << CONST_BITS;
    878 
    879     /* Odd part */
    880     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
            /* Unlike pass 1, the products are kept at full scale here; the
             * descale happens once per output in the final stage.
             */
    881 
    882     z2 = (INT32) wsptr[1];
    883     z3 = (INT32) wsptr[3];
    884 
    885     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
    886     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
    887     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
    888 
    889     /* Final output stage */
            /* Each descaled value is masked with RANGE_MASK and used as an index
             * into range_limit; the table entry is the sample that gets stored.
             * NOTE(review): both are defined outside this chunk; presumably the
             * table clamps to the valid sample range - confirm.
             */
    890 
    891     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    892                                               CONST_BITS+PASS1_BITS+3)
    893                             & RANGE_MASK];
    894     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    895                                               CONST_BITS+PASS1_BITS+3)
    896                             & RANGE_MASK];
    897     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
    898                                               CONST_BITS+PASS1_BITS+3)
    899                             & RANGE_MASK];
    900     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
    901                                               CONST_BITS+PASS1_BITS+3)
    902                             & RANGE_MASK];
    903 
    904     wsptr += 4;		/* advance pointer to next row */
    905   }
    906 }
    907 
    908 
    909 /*
    910  * Perform dequantization and inverse DCT on one block of coefficients,
    911  * producing a reduced-size 3x3 output block.
    912  *
    913  * Optimized algorithm with 2 multiplications in the 1-D kernel.
    914  * cK represents sqrt(2) * cos(K*pi/6).
         *
         * Only rows 0..2 of columns 0..2 of coef_block are read (element index
         * DCTSIZE*row + column); every other coefficient is ignored.
         *
         * Arguments, as used here:
         *   cinfo      - handed to IDCT_range_limit() to obtain the output table.
         *   compptr    - its dct_table member is used as the ISLOW_MULT_TYPE
         *                multiplier table for DEQUANTIZE().
         *   coef_block - input coefficients.
         *   output_buf - output rows; rows 0..2 are written.
         *   output_col - offset of the first of the 3 samples written per row.
    915  */
    916 
    917 GLOBAL(void)
    918 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    919                JCOEFPTR coef_block,
    920                JSAMPARRAY output_buf, JDIMENSION output_col)
    921 {
    922   INT32 tmp0, tmp2, tmp10, tmp12;
    923   JCOEFPTR inptr;
    924   ISLOW_MULT_TYPE * quantptr;
    925   int * wsptr;
    926   JSAMPROW outptr;
    927   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    928   int ctr;
    929   int workspace[3*3];	/* buffers data between passes */
    930   SHIFT_TEMPS
    931 
    932   /* Pass 1: process columns from input, store into work array. */
          /* Each iteration handles one input column; inptr, quantptr and wsptr
           * advance together, and the 3 results are stored down the matching
           * workspace column (stride 3).
           */
    933 
    934   inptr = coef_block;
    935   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    936   wsptr = workspace;
    937   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
    938     /* Even part */
    939 
    940     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    941     tmp0 <<= CONST_BITS;
    942     /* Add fudge factor here for final descale. */
            /* Half of the 2^(CONST_BITS-PASS1_BITS) step removed by the
             * RIGHT_SHIFTs below; tmp0 enters both tmp10 and tmp2, so all three
             * outputs receive it.
             */
    943     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
    944     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    945     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
    946     tmp10 = tmp0 + tmp12;
            /* tmp2 changes role: from the row-2 coefficient to the index-1 term. */
    947     tmp2 = tmp0 - tmp12 - tmp12;
    948 
    949     /* Odd part */
            /* tmp12 and tmp0 are reused: row-1 coefficient and its product. */
    950 
    951     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    952     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
    953 
    954     /* Final output stage */
            /* Indices 0 and 2 differ only in the sign of the odd-part term;
             * index 1 has no odd-part contribution.
             */
    955 
    956     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    957     wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    958     wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
    959   }
    960 
    961   /* Pass 2: process 3 rows from work array, store into output array. */
          /* wsptr steps through the workspace one 3-element row per iteration. */
    962 
    963   wsptr = workspace;
    964   for (ctr = 0; ctr < 3; ctr++) {
    965     outptr = output_buf[ctr] + output_col;
    966 
    967     /* Even part */
    968 
    969     /* Add fudge factor here for final descale. */
            /* After the << CONST_BITS below the term equals
             * 2^(CONST_BITS+PASS1_BITS+2), half of the step removed by the final
             * RIGHT_SHIFT by CONST_BITS+PASS1_BITS+3.
             */
    970     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    971     tmp0 <<= CONST_BITS;
    972     tmp2 = (INT32) wsptr[2];
    973     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
    974     tmp10 = tmp0 + tmp12;
            /* Same role change as in pass 1: tmp2 becomes the index-1 term. */
    975     tmp2 = tmp0 - tmp12 - tmp12;
    976 
    977     /* Odd part */
    978 
    979     tmp12 = (INT32) wsptr[1];
    980     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
    981 
    982     /* Final output stage */
            /* Each descaled value is masked with RANGE_MASK and used as an index
             * into range_limit; the table entry is the sample that gets stored.
             * NOTE(review): both are defined outside this chunk; presumably the
             * table clamps to the valid sample range - confirm.
             */
    983 
    984     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    985                                               CONST_BITS+PASS1_BITS+3)
    986                             & RANGE_MASK];
    987     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    988                                               CONST_BITS+PASS1_BITS+3)
    989                             & RANGE_MASK];
    990     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
    991                                               CONST_BITS+PASS1_BITS+3)
    992                             & RANGE_MASK];
    993 
    994     wsptr += 3;		/* advance pointer to next row */
    995   }
    996 }
    997 
    998 
    999 /*
   1000  * Perform dequantization and inverse DCT on one block of coefficients,
   1001  * producing a reduced-size 2x2 output block.
   1002  *
   1003  * Multiplication-less algorithm.
   1004  */
   1005 
   1006 GLOBAL(void)
   1007 jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1008                JCOEFPTR coef_block,
   1009                JSAMPARRAY output_buf, JDIMENSION output_col)
   1010 {
   1011   INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
   1012   ISLOW_MULT_TYPE * quantptr;
   1013   JSAMPROW outptr;
   1014   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1015   SHIFT_TEMPS
   1016 
   1017   /* Pass 1: process columns from input. */
   1018 
   1019   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1020 
   1021   /* Column 0 */
   1022   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1023   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1024   /* Add fudge factor here for final descale. */
   1025   tmp4 += ONE << 2;
   1026 
   1027   tmp0 = tmp4 + tmp5;
   1028   tmp2 = tmp4 - tmp5;
   1029 
   1030   /* Column 1 */
   1031   tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
   1032   tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
   1033 
   1034   tmp1 = tmp4 + tmp5;
   1035   tmp3 = tmp4 - tmp5;
   1036 
   1037   /* Pass 2: process 2 rows, store into output array. */
   1038 
   1039   /* Row 0 */
   1040   outptr = output_buf[0] + output_col;
   1041 
   1042   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
   1043   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
   1044 
   1045   /* Row 1 */
   1046   outptr = output_buf[1] + output_col;
   1047 
   1048   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
   1049   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
   1050 }
   1051 
   1052 
   1053 /*
   1054  * Perform dequantization and inverse DCT on one block of coefficients,
   1055  * producing a reduced-size 1x1 output block.
   1056  *
   1057  * We hardly need an inverse DCT routine for this: just take the
   1058  * average pixel value, which is one-eighth of the DC coefficient.
   1059  */
   1060 
   1061 GLOBAL(void)
   1062 jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1063                JCOEFPTR coef_block,
   1064                JSAMPARRAY output_buf, JDIMENSION output_col)
   1065 {
   1066   int dcval;
   1067   ISLOW_MULT_TYPE * quantptr;
   1068   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1069   SHIFT_TEMPS
   1070 
   1071   /* 1x1 is trivial: just take the DC coefficient divided by 8. */
   1072   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1073   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
   1074   dcval = (int) DESCALE((INT32) dcval, 3);
   1075 
   1076   output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
   1077 }
   1078 
   1079 
   1080 /*
   1081  * Perform dequantization and inverse DCT on one block of coefficients,
   1082  * producing a 9x9 output block.
   1083  *
   1084  * Optimized algorithm with 10 multiplications in the 1-D kernel.
   1085  * cK represents sqrt(2) * cos(K*pi/18).
         *
         * Rows 0..7 of columns 0..7 of coef_block are read (element index
         * DCTSIZE*row + column).  Pass 1 turns each input column into 9 values;
         * pass 2 turns each 8-value workspace row into 9 output samples.
         *
         * Arguments, as used here:
         *   cinfo      - handed to IDCT_range_limit() to obtain the output table.
         *   compptr    - its dct_table member is used as the ISLOW_MULT_TYPE
         *                multiplier table for DEQUANTIZE().
         *   coef_block - input coefficients.
         *   output_buf - output rows; rows 0..8 are written.
         *   output_col - offset of the first of the 9 samples written per row.
   1086  */
   1087 
   1088 GLOBAL(void)
   1089 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1090                JCOEFPTR coef_block,
   1091                JSAMPARRAY output_buf, JDIMENSION output_col)
   1092 {
   1093   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
   1094   INT32 z1, z2, z3, z4;
   1095   JCOEFPTR inptr;
   1096   ISLOW_MULT_TYPE * quantptr;
   1097   int * wsptr;
   1098   JSAMPROW outptr;
   1099   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1100   int ctr;
   1101   int workspace[8*9];	/* buffers data between passes */
   1102   SHIFT_TEMPS
   1103 
   1104   /* Pass 1: process columns from input, store into work array. */
          /* All 8 input columns are processed; each yields 9 values that are
           * stored down the matching workspace column (stride 8).
           */
   1105 
   1106   inptr = coef_block;
   1107   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1108   wsptr = workspace;
   1109   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1110     /* Even part */
   1111 
   1112     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1113     tmp0 <<= CONST_BITS;
   1114     /* Add fudge factor here for final descale. */
            /* Half of the 2^(CONST_BITS-PASS1_BITS) step removed by the
             * RIGHT_SHIFTs at the end of this pass.
             */
   1115     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
   1116 
   1117     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1118     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1119     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1120 
   1121     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
   1122     tmp1 = tmp0 + tmp3;
   1123     tmp2 = tmp0 - tmp3 - tmp3;
            /* tmp1 and tmp2 now carry the DC term together with its rounding
             * bias; tmp0, and later tmp2 and tmp3, are reused as scratch for the
             * products below.
             */
   1124 
   1125     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
   1126     tmp11 = tmp2 + tmp0;
   1127     tmp14 = tmp2 - tmp0 - tmp0;
   1128 
   1129     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
   1130     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
   1131     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
   1132 
   1133     tmp10 = tmp1 + tmp0 - tmp3;
   1134     tmp12 = tmp1 - tmp0 + tmp2;
   1135     tmp13 = tmp1 - tmp2 + tmp3;
   1136 
   1137     /* Odd part */
            /* tmp0..tmp3 are reused for the odd-part terms; z2 is replaced by its
             * own product with -c3 before it is combined with the others.
             */
   1138 
   1139     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1140     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1141     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1142     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1143 
   1144     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
   1145 
   1146     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
   1147     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
   1148     tmp0 = tmp2 + tmp3 - z2;
   1149     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
   1150     tmp2 += z2 - tmp1;
   1151     tmp3 += z2 + tmp1;
            /* tmp1 is overwritten only after its last use in the two lines above. */
   1152     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
   1153 
   1154     /* Final output stage */
            /* Indices k and 8-k share an even-part term and differ only in the
             * sign of the odd-part term; index 4 has no odd-part contribution.
             */
   1155 
   1156     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
   1157     wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
   1158     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
   1159     wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
   1160     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
   1161     wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
   1162     wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
   1163     wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
   1164     wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
   1165   }
   1166 
   1167   /* Pass 2: process 9 rows from work array, store into output array. */
          /* wsptr steps through the workspace one 8-element row per iteration;
           * each row produces 9 output samples.
           */
   1168 
   1169   wsptr = workspace;
   1170   for (ctr = 0; ctr < 9; ctr++) {
   1171     outptr = output_buf[ctr] + output_col;
   1172 
   1173     /* Even part */
   1174 
   1175     /* Add fudge factor here for final descale. */
            /* After the << CONST_BITS below the term equals
             * 2^(CONST_BITS+PASS1_BITS+2), half of the step removed by the final
             * RIGHT_SHIFT by CONST_BITS+PASS1_BITS+3.
             */
   1176     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1177     tmp0 <<= CONST_BITS;
   1178 
   1179     z1 = (INT32) wsptr[2];
   1180     z2 = (INT32) wsptr[4];
   1181     z3 = (INT32) wsptr[6];
   1182 
   1183     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
   1184     tmp1 = tmp0 + tmp3;
   1185     tmp2 = tmp0 - tmp3 - tmp3;
   1186 
   1187     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
   1188     tmp11 = tmp2 + tmp0;
   1189     tmp14 = tmp2 - tmp0 - tmp0;
   1190 
   1191     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
   1192     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
   1193     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
   1194 
   1195     tmp10 = tmp1 + tmp0 - tmp3;
   1196     tmp12 = tmp1 - tmp0 + tmp2;
   1197     tmp13 = tmp1 - tmp2 + tmp3;
   1198 
   1199     /* Odd part */
            /* Same sequence as the odd part of pass 1, fed from the workspace. */
   1200 
   1201     z1 = (INT32) wsptr[1];
   1202     z2 = (INT32) wsptr[3];
   1203     z3 = (INT32) wsptr[5];
   1204     z4 = (INT32) wsptr[7];
   1205 
   1206     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
   1207 
   1208     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
   1209     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
   1210     tmp0 = tmp2 + tmp3 - z2;
   1211     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
   1212     tmp2 += z2 - tmp1;
   1213     tmp3 += z2 + tmp1;
   1214     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
   1215 
   1216     /* Final output stage */
            /* Each descaled value is masked with RANGE_MASK and used as an index
             * into range_limit; the table entry is the sample that gets stored.
             * NOTE(review): both are defined outside this chunk; presumably the
             * table clamps to the valid sample range - confirm.
             */
   1217 
   1218     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   1219                                               CONST_BITS+PASS1_BITS+3)
   1220                             & RANGE_MASK];
   1221     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   1222                                               CONST_BITS+PASS1_BITS+3)
   1223                             & RANGE_MASK];
   1224     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
   1225                                               CONST_BITS+PASS1_BITS+3)
   1226                             & RANGE_MASK];
   1227     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
   1228                                               CONST_BITS+PASS1_BITS+3)
   1229                             & RANGE_MASK];
   1230     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
   1231                                               CONST_BITS+PASS1_BITS+3)
   1232                             & RANGE_MASK];
   1233     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
   1234                                               CONST_BITS+PASS1_BITS+3)
   1235                             & RANGE_MASK];
   1236     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
   1237                                               CONST_BITS+PASS1_BITS+3)
   1238                             & RANGE_MASK];
   1239     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
   1240                                               CONST_BITS+PASS1_BITS+3)
   1241                             & RANGE_MASK];
   1242     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
   1243                                               CONST_BITS+PASS1_BITS+3)
   1244                             & RANGE_MASK];
   1245 
   1246     wsptr += 8;		/* advance pointer to next row */
   1247   }
   1248 }
   1249 
   1250 
   1251 /*
   1252  * Perform dequantization and inverse DCT on one block of coefficients,
   1253  * producing a 10x10 output block.
   1254  *
   1255  * Optimized algorithm with 12 multiplications in the 1-D kernel.
   1256  * cK represents sqrt(2) * cos(K*pi/20).
         *
         * Rows 0..7 of columns 0..7 of coef_block are read (element index
         * DCTSIZE*row + column).  Pass 1 turns each input column into 10 values;
         * pass 2 turns each 8-value workspace row into 10 output samples.
         *
         * Arguments, as used here:
         *   cinfo      - handed to IDCT_range_limit() to obtain the output table.
         *   compptr    - its dct_table member is used as the ISLOW_MULT_TYPE
         *                multiplier table for DEQUANTIZE().
         *   coef_block - input coefficients.
         *   output_buf - output rows; rows 0..9 are written.
         *   output_col - offset of the first of the 10 samples written per row.
   1257  */
   1258 
   1259 GLOBAL(void)
   1260 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1261                  JCOEFPTR coef_block,
   1262                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1263 {
   1264   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
   1265   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
   1266   INT32 z1, z2, z3, z4, z5;
   1267   JCOEFPTR inptr;
   1268   ISLOW_MULT_TYPE * quantptr;
   1269   int * wsptr;
   1270   JSAMPROW outptr;
   1271   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1272   int ctr;
   1273   int workspace[8*10];	/* buffers data between passes */
   1274   SHIFT_TEMPS
   1275 
   1276   /* Pass 1: process columns from input, store into work array. */
          /* All 8 input columns are processed; each yields 10 values that are
           * stored down the matching workspace column (stride 8).
           */
   1277 
   1278   inptr = coef_block;
   1279   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1280   wsptr = workspace;
   1281   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1282     /* Even part */
   1283 
   1284     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1285     z3 <<= CONST_BITS;
   1286     /* Add fudge factor here for final descale. */
            /* Half of the 2^(CONST_BITS-PASS1_BITS) step removed by the
             * RIGHT_SHIFTs in this pass; z3 enters tmp10, tmp11 and tmp22.
             */
   1287     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
   1288     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1289     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   1290     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   1291     tmp10 = z3 + z1;
   1292     tmp11 = z3 - z2;
   1293 
            /* tmp22 is descaled immediately: its odd-part partner tmp12 is built
             * below at the PASS1_BITS scale without a multiplication, so outputs
             * 2 and 7 are formed by a plain add/subtract.
             */
   1294     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
   1295                         CONST_BITS-PASS1_BITS);
   1296 
   1297     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1298     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1299 
   1300     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   1301     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   1302     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   1303 
   1304     tmp20 = tmp10 + tmp12;
   1305     tmp24 = tmp10 - tmp12;
   1306     tmp21 = tmp11 + tmp13;
   1307     tmp23 = tmp11 - tmp13;
   1308 
   1309     /* Odd part */
            /* tmp10..tmp14 are reused for the odd part; the even-part results
             * live on in tmp20..tmp24.
             */
   1310 
   1311     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1312     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1313     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1314     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1315 
            /* tmp11 and tmp13 first hold z2+z4 and z2-z4; they are overwritten
             * with odd-part outputs only after their last use as inputs.
             */
   1316     tmp11 = z2 + z4;
   1317     tmp13 = z2 - z4;
   1318 
   1319     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
            /* z5 is the row-5 coefficient scaled up by CONST_BITS; the unscaled
             * z3 is still needed for tmp12 below.
             */
   1320     z5 = z3 << CONST_BITS;
   1321 
   1322     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   1323     z4 = z5 + tmp12;
   1324 
   1325     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   1326     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   1327 
   1328     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   1329     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
   1330 
            /* Built directly at the PASS1_BITS scale; no descale is applied. */
   1331     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
   1332 
   1333     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   1334     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   1335 
   1336     /* Final output stage */
            /* Indices k and 9-k share an even-part term and differ only in the
             * sign of the odd-part term.  Indices 2 and 7 are stored without a
             * shift because tmp22 and tmp12 are already at the workspace scale.
             */
   1337 
   1338     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1339     wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1340     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1341     wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1342     wsptr[8*2] = (int) (tmp22 + tmp12);
   1343     wsptr[8*7] = (int) (tmp22 - tmp12);
   1344     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1345     wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1346     wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1347     wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1348   }
   1349 
   1350   /* Pass 2: process 10 rows from work array, store into output array. */
          /* wsptr steps through the workspace one 8-element row per iteration;
           * each row produces 10 output samples.
           */
   1351 
   1352   wsptr = workspace;
   1353   for (ctr = 0; ctr < 10; ctr++) {
   1354     outptr = output_buf[ctr] + output_col;
   1355 
   1356     /* Even part */
   1357 
   1358     /* Add fudge factor here for final descale. */
            /* After the << CONST_BITS below the term equals
             * 2^(CONST_BITS+PASS1_BITS+2), half of the step removed by the final
             * RIGHT_SHIFT by CONST_BITS+PASS1_BITS+3.
             */
   1359     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1360     z3 <<= CONST_BITS;
   1361     z4 = (INT32) wsptr[4];
   1362     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   1363     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   1364     tmp10 = z3 + z1;
   1365     tmp11 = z3 - z2;
   1366 
            /* Unlike pass 1, tmp22 stays at full scale here; outputs 2 and 7 go
             * through the same final shift as all the others.
             */
   1367     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
   1368 
   1369     z2 = (INT32) wsptr[2];
   1370     z3 = (INT32) wsptr[6];
   1371 
   1372     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   1373     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   1374     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   1375 
   1376     tmp20 = tmp10 + tmp12;
   1377     tmp24 = tmp10 - tmp12;
   1378     tmp21 = tmp11 + tmp13;
   1379     tmp23 = tmp11 - tmp13;
   1380 
   1381     /* Odd part */
   1382 
   1383     z1 = (INT32) wsptr[1];
   1384     z2 = (INT32) wsptr[3];
   1385     z3 = (INT32) wsptr[5];
            /* z3 is scaled up in place and takes the role z5 has in pass 1. */
   1386     z3 <<= CONST_BITS;
   1387     z4 = (INT32) wsptr[7];
   1388 
   1389     tmp11 = z2 + z4;
   1390     tmp13 = z2 - z4;
   1391 
   1392     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
   1393 
   1394     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   1395     z4 = z3 + tmp12;
   1396 
   1397     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   1398     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   1399 
   1400     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   1401     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
   1402 
            /* z3 is already scaled by CONST_BITS, so only z1 - tmp13 is shifted. */
   1403     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
   1404 
   1405     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   1406     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   1407 
   1408     /* Final output stage */
            /* Each descaled value is masked with RANGE_MASK and used as an index
             * into range_limit; the table entry is the sample that gets stored.
             * NOTE(review): both are defined outside this chunk; presumably the
             * table clamps to the valid sample range - confirm.
             */
   1409 
   1410     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1411                                               CONST_BITS+PASS1_BITS+3)
   1412                             & RANGE_MASK];
   1413     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1414                                               CONST_BITS+PASS1_BITS+3)
   1415                             & RANGE_MASK];
   1416     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1417                                               CONST_BITS+PASS1_BITS+3)
   1418                             & RANGE_MASK];
   1419     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1420                                               CONST_BITS+PASS1_BITS+3)
   1421                             & RANGE_MASK];
   1422     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1423                                               CONST_BITS+PASS1_BITS+3)
   1424                             & RANGE_MASK];
   1425     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1426                                               CONST_BITS+PASS1_BITS+3)
   1427                             & RANGE_MASK];
   1428     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1429                                               CONST_BITS+PASS1_BITS+3)
   1430                             & RANGE_MASK];
   1431     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1432                                               CONST_BITS+PASS1_BITS+3)
   1433                             & RANGE_MASK];
   1434     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1435                                               CONST_BITS+PASS1_BITS+3)
   1436                             & RANGE_MASK];
   1437     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1438                                               CONST_BITS+PASS1_BITS+3)
   1439                             & RANGE_MASK];
   1440 
   1441     wsptr += 8;		/* advance pointer to next row */
   1442   }
   1443 }
   1444 
   1445 
   1446 /*
   1447  * Perform dequantization and inverse DCT on one block of coefficients,
   1448  * producing an 11x11 output block.
   1449  *
   1450  * Optimized algorithm with 24 multiplications in the 1-D kernel.
   1451  * cK represents sqrt(2) * cos(K*pi/22).
         *
         * Parameters, as used by the code below:
         *   cinfo       handed to IDCT_range_limit() to obtain the range_limit table.
         *   compptr     compptr->dct_table supplies the dequantization multipliers.
         *   coef_block  input coefficients; 8 columns are read, each at row
         *               offsets DCTSIZE*0 .. DCTSIZE*7.
         *   output_buf, output_col
         *               result row ctr is stored at output_buf[ctr] + output_col;
         *               11 rows of 11 samples each are written.
   1452  */
   1453 
   1454 GLOBAL(void)
   1455 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1456                  JCOEFPTR coef_block,
   1457                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1458 {
   1459   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
   1460   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   1461   INT32 z1, z2, z3, z4;
   1462   JCOEFPTR inptr;
   1463   ISLOW_MULT_TYPE * quantptr;
   1464   int * wsptr;
   1465   JSAMPROW outptr;
   1466   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1467   int ctr;
   1468   int workspace[8*11];	/* buffers data between passes */
   1469   SHIFT_TEMPS
   1470 
   1471   /* Pass 1: process columns from input, store into work array. */
          /* Each iteration turns one input column (8 coefficients) into 11
           * intermediate values, stored down one workspace column with a row
           * stride of 8 ints.
           */
   1472 
   1473   inptr = coef_block;
   1474   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1475   wsptr = workspace;
   1476   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1477     /* Even part */
            /* tmp10 carries the row-0 (DC) term.  It enters each of
             * tmp20..tmp25 exactly once, so the rounding constant added below
             * reaches every output of this column.
             */
   1478 
   1479     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
            /* NOTE(review): if the dequantized value can be negative, this
             * left-shifts a negative signed value, which ISO C leaves
             * undefined; presumably the supported compilers treat it as a
             * multiplication by 2^CONST_BITS -- confirm.  The same pattern
             * appears in pass 2.
             */
   1480     tmp10 <<= CONST_BITS;
   1481     /* Add fudge factor here for final descale. */
            /* The constant is half of 2^(CONST_BITS-PASS1_BITS), the amount
             * shifted out in this pass's output stage, so that shift rounds
             * instead of truncating (assuming RIGHT_SHIFT is an arithmetic
             * right shift).
             */
   1482     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
   1483 
   1484     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1485     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1486     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1487 
   1488     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
   1489     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
   1490     z4 = z1 + z3;
   1491     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
   1492     z4 -= z2;
            /* Until its final assignment below, tmp25 is a shared partial sum
             * (DC term plus c2 * (z1 + z3 - z2)) folded into tmp20..tmp24.
             */
   1493     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
   1494     tmp21 = tmp20 + tmp23 + tmp25 -
   1495             MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
   1496     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
   1497     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
   1498     tmp24 += tmp25;
   1499     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
   1500     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
   1501              MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
            /* tmp25 now becomes the even term of the middle output (index 5). */
   1502     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
   1503 
   1504     /* Odd part */
            /* tmp10 is no longer needed as the DC term and is reused below as
             * the odd term paired with tmp20.  z1 is overwritten with scratch
             * products once its input value has been consumed.
             */
   1505 
   1506     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1507     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1508     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1509     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1510 
   1511     tmp11 = z1 + z2;
   1512     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
   1513     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
   1514     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
   1515     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
   1516     tmp10 = tmp11 + tmp12 + tmp13 -
   1517             MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
   1518     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
   1519     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
   1520     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
   1521     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
   1522     tmp11 += z1;
   1523     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
   1524     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
   1525              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
   1526              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
   1527 
   1528     /* Final output stage */
            /* Row k is even+odd and row 10-k is even-odd; the middle row 5
             * has no odd contribution.
             */
   1529 
   1530     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1531     wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1532     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1533     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1534     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1535     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1536     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1537     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1538     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1539     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1540     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
   1541   }
   1542 
   1543   /* Pass 2: process 11 rows from work array, store into output array. */
          /* Each iteration reads the 8 values of one workspace row
           * (wsptr[0..7]) and writes 11 samples to one output row, using the
           * same 1-D kernel as pass 1.
           */
   1544 
   1545   wsptr = workspace;
   1546   for (ctr = 0; ctr < 11; ctr++) {
   1547     outptr = output_buf[ctr] + output_col;
   1548 
   1549     /* Even part */
   1550 
   1551     /* Add fudge factor here for final descale. */
            /* After the shift below the constant equals
             * 2^(CONST_BITS+PASS1_BITS+2), i.e. half of the
             * 2^(CONST_BITS+PASS1_BITS+3) removed in the final output stage.
             */
   1552     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1553     tmp10 <<= CONST_BITS;
   1554 
   1555     z1 = (INT32) wsptr[2];
   1556     z2 = (INT32) wsptr[4];
   1557     z3 = (INT32) wsptr[6];
   1558 
   1559     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
   1560     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
   1561     z4 = z1 + z3;
   1562     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
   1563     z4 -= z2;
   1564     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
   1565     tmp21 = tmp20 + tmp23 + tmp25 -
   1566             MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
   1567     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
   1568     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
   1569     tmp24 += tmp25;
   1570     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
   1571     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
   1572              MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
   1573     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
   1574 
   1575     /* Odd part */
   1576 
   1577     z1 = (INT32) wsptr[1];
   1578     z2 = (INT32) wsptr[3];
   1579     z3 = (INT32) wsptr[5];
   1580     z4 = (INT32) wsptr[7];
   1581 
   1582     tmp11 = z1 + z2;
   1583     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
   1584     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
   1585     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
   1586     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
   1587     tmp10 = tmp11 + tmp12 + tmp13 -
   1588             MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
   1589     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
   1590     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
   1591     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
   1592     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
   1593     tmp11 += z1;
   1594     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
   1595     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
   1596              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
   1597              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
   1598 
   1599     /* Final output stage */
            /* Each descaled value is masked with RANGE_MASK and used as an
             * index into range_limit; presumably that table clamps to the
             * valid sample range -- confirm against IDCT_range_limit().
             * Sample k is even+odd, sample 10-k is even-odd, sample 5 is
             * even only.
             */
   1600 
   1601     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1602                                                CONST_BITS+PASS1_BITS+3)
   1603                              & RANGE_MASK];
   1604     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1605                                                CONST_BITS+PASS1_BITS+3)
   1606                              & RANGE_MASK];
   1607     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1608                                                CONST_BITS+PASS1_BITS+3)
   1609                              & RANGE_MASK];
   1610     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1611                                                CONST_BITS+PASS1_BITS+3)
   1612                              & RANGE_MASK];
   1613     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1614                                                CONST_BITS+PASS1_BITS+3)
   1615                              & RANGE_MASK];
   1616     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1617                                                CONST_BITS+PASS1_BITS+3)
   1618                              & RANGE_MASK];
   1619     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1620                                                CONST_BITS+PASS1_BITS+3)
   1621                              & RANGE_MASK];
   1622     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1623                                                CONST_BITS+PASS1_BITS+3)
   1624                              & RANGE_MASK];
   1625     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1626                                                CONST_BITS+PASS1_BITS+3)
   1627                              & RANGE_MASK];
   1628     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1629                                                CONST_BITS+PASS1_BITS+3)
   1630                              & RANGE_MASK];
   1631     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
   1632                                                CONST_BITS+PASS1_BITS+3)
   1633                              & RANGE_MASK];
   1634 
   1635     wsptr += 8;		/* advance pointer to next row */
   1636   }
   1637 }
   1638 
   1639 
   1640 /*
   1641  * Perform dequantization and inverse DCT on one block of coefficients,
   1642  * producing a 12x12 output block.
   1643  *
   1644  * Optimized algorithm with 15 multiplications in the 1-D kernel.
   1645  * cK represents sqrt(2) * cos(K*pi/24).
         *
         * Parameters, as used by the code below:
         *   cinfo       handed to IDCT_range_limit() to obtain the range_limit table.
         *   compptr     compptr->dct_table supplies the dequantization multipliers.
         *   coef_block  input coefficients; 8 columns are read, each at row
         *               offsets DCTSIZE*0 .. DCTSIZE*7.
         *   output_buf, output_col
         *               result row ctr is stored at output_buf[ctr] + output_col;
         *               12 rows of 12 samples each are written.
   1646  */
   1647 
   1648 GLOBAL(void)
   1649 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1650                  JCOEFPTR coef_block,
   1651                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1652 {
   1653   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   1654   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   1655   INT32 z1, z2, z3, z4;
   1656   JCOEFPTR inptr;
   1657   ISLOW_MULT_TYPE * quantptr;
   1658   int * wsptr;
   1659   JSAMPROW outptr;
   1660   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1661   int ctr;
   1662   int workspace[8*12];	/* buffers data between passes */
   1663   SHIFT_TEMPS
   1664 
   1665   /* Pass 1: process columns from input, store into work array. */
          /* Each iteration turns one input column (8 coefficients) into 12
           * intermediate values, stored down one workspace column with a row
           * stride of 8 ints.
           */
   1666 
   1667   inptr = coef_block;
   1668   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1669   wsptr = workspace;
   1670   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1671     /* Even part */
            /* z3 carries the row-0 (DC) term.  It enters tmp10, tmp11, tmp21
             * and tmp24 directly, and tmp20/tmp25 and tmp22/tmp23 through
             * tmp10 and tmp11, so every even term contains it exactly once
             * together with the rounding constant added below.
             */
   1672 
   1673     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
            /* NOTE(review): if the dequantized value can be negative, this
             * (and the z1/z2 shifts below) left-shifts a negative signed
             * value, which ISO C leaves undefined; presumably the supported
             * compilers treat it as a multiplication by 2^CONST_BITS --
             * confirm.  The same pattern appears in pass 2.
             */
   1674     z3 <<= CONST_BITS;
   1675     /* Add fudge factor here for final descale. */
            /* The constant is half of 2^(CONST_BITS-PASS1_BITS), the amount
             * shifted out in this pass's output stage, so that shift rounds
             * instead of truncating (assuming RIGHT_SHIFT is an arithmetic
             * right shift).
             */
   1676     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
   1677 
            /* z4 is scratch: first the row-4 term times c4, later (below) the
             * row-2 term times c2.
             */
   1678     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1679     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   1680 
   1681     tmp10 = z3 + z4;
   1682     tmp11 = z3 - z4;
   1683 
   1684     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1685     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
            /* z1 and z2 are only shifted, not multiplied; presumably this puts
             * them on the same 2^CONST_BITS scale as the FIX() products.
             */
   1686     z1 <<= CONST_BITS;
   1687     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1688     z2 <<= CONST_BITS;
   1689 
            /* tmp12 is reused three times as scratch, once per output pair. */
   1690     tmp12 = z1 - z2;
   1691 
   1692     tmp21 = z3 + tmp12;
   1693     tmp24 = z3 - tmp12;
   1694 
   1695     tmp12 = z4 + z2;
   1696 
   1697     tmp20 = tmp10 + tmp12;
   1698     tmp25 = tmp10 - tmp12;
   1699 
   1700     tmp12 = z4 - z1 - z2;
   1701 
   1702     tmp22 = tmp11 + tmp12;
   1703     tmp23 = tmp11 - tmp12;
   1704 
   1705     /* Odd part */
            /* tmp10..tmp12 are free again here and are reused for the odd
             * terms.  tmp11 and tmp14 first hold products of z2 that feed
             * tmp10, tmp12, tmp13 and tmp15; they receive their final values
             * only at the end, after z1 and z2 have been modified in place.
             */
   1706 
   1707     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1708     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1709     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1710     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1711 
   1712     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   1713     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
   1714 
   1715     tmp10 = z1 + z3;
   1716     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   1717     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   1718     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   1719     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
   1720     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   1721     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   1722     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   1723              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   1724 
            /* From here z1 and z2 hold differences of the inputs, and z3 is
             * scratch for the shared c9 product.
             */
   1725     z1 -= z4;
   1726     z2 -= z3;
   1727     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   1728     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   1729     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   1730 
   1731     /* Final output stage */
            /* Row k is even+odd and row 11-k is even-odd. */
   1732 
   1733     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1734     wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1735     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1736     wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1737     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1738     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1739     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1740     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1741     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1742     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1743     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   1744     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   1745   }
   1746 
   1747   /* Pass 2: process 12 rows from work array, store into output array. */
          /* Each iteration reads the 8 values of one workspace row
           * (wsptr[0..7]) and writes 12 samples to one output row, using the
           * same 1-D kernel as pass 1.
           */
   1748 
   1749   wsptr = workspace;
   1750   for (ctr = 0; ctr < 12; ctr++) {
   1751     outptr = output_buf[ctr] + output_col;
   1752 
   1753     /* Even part */
   1754 
   1755     /* Add fudge factor here for final descale. */
            /* After the shift below the constant equals
             * 2^(CONST_BITS+PASS1_BITS+2), i.e. half of the
             * 2^(CONST_BITS+PASS1_BITS+3) removed in the final output stage.
             */
   1756     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1757     z3 <<= CONST_BITS;
   1758 
   1759     z4 = (INT32) wsptr[4];
   1760     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   1761 
   1762     tmp10 = z3 + z4;
   1763     tmp11 = z3 - z4;
   1764 
   1765     z1 = (INT32) wsptr[2];
   1766     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   1767     z1 <<= CONST_BITS;
   1768     z2 = (INT32) wsptr[6];
   1769     z2 <<= CONST_BITS;
   1770 
   1771     tmp12 = z1 - z2;
   1772 
   1773     tmp21 = z3 + tmp12;
   1774     tmp24 = z3 - tmp12;
   1775 
   1776     tmp12 = z4 + z2;
   1777 
   1778     tmp20 = tmp10 + tmp12;
   1779     tmp25 = tmp10 - tmp12;
   1780 
   1781     tmp12 = z4 - z1 - z2;
   1782 
   1783     tmp22 = tmp11 + tmp12;
   1784     tmp23 = tmp11 - tmp12;
   1785 
   1786     /* Odd part */
   1787 
   1788     z1 = (INT32) wsptr[1];
   1789     z2 = (INT32) wsptr[3];
   1790     z3 = (INT32) wsptr[5];
   1791     z4 = (INT32) wsptr[7];
   1792 
   1793     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   1794     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
   1795 
   1796     tmp10 = z1 + z3;
   1797     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   1798     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   1799     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   1800     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
   1801     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   1802     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   1803     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   1804              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   1805 
   1806     z1 -= z4;
   1807     z2 -= z3;
   1808     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   1809     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   1810     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   1811 
   1812     /* Final output stage */
            /* Each descaled value is masked with RANGE_MASK and used as an
             * index into range_limit; presumably that table clamps to the
             * valid sample range -- confirm against IDCT_range_limit().
             * Sample k is even+odd, sample 11-k is even-odd.
             */
   1813 
   1814     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1815                                                CONST_BITS+PASS1_BITS+3)
   1816                              & RANGE_MASK];
   1817     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1818                                                CONST_BITS+PASS1_BITS+3)
   1819                              & RANGE_MASK];
   1820     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1821                                                CONST_BITS+PASS1_BITS+3)
   1822                              & RANGE_MASK];
   1823     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1824                                                CONST_BITS+PASS1_BITS+3)
   1825                              & RANGE_MASK];
   1826     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1827                                                CONST_BITS+PASS1_BITS+3)
   1828                              & RANGE_MASK];
   1829     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1830                                                CONST_BITS+PASS1_BITS+3)
   1831                              & RANGE_MASK];
   1832     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1833                                                CONST_BITS+PASS1_BITS+3)
   1834                              & RANGE_MASK];
   1835     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1836                                                CONST_BITS+PASS1_BITS+3)
   1837                              & RANGE_MASK];
   1838     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1839                                                CONST_BITS+PASS1_BITS+3)
   1840                              & RANGE_MASK];
   1841     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1842                                                CONST_BITS+PASS1_BITS+3)
   1843                              & RANGE_MASK];
   1844     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   1845                                                CONST_BITS+PASS1_BITS+3)
   1846                              & RANGE_MASK];
   1847     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   1848                                                CONST_BITS+PASS1_BITS+3)
   1849                              & RANGE_MASK];
   1850 
   1851     wsptr += 8;		/* advance pointer to next row */
   1852   }
   1853 }
   1854 
   1855 
   1856 /*
   1857  * Perform dequantization and inverse DCT on one block of coefficients,
   1858  * producing a 13x13 output block.
   1859  *
   1860  * Optimized algorithm with 29 multiplications in the 1-D kernel.
   1861  * cK represents sqrt(2) * cos(K*pi/26).
         *
         * Parameters, as used by the code below:
         *   cinfo       handed to IDCT_range_limit() to obtain the range_limit table.
         *   compptr     compptr->dct_table supplies the dequantization multipliers.
         *   coef_block  input coefficients; 8 columns are read, each at row
         *               offsets DCTSIZE*0 .. DCTSIZE*7.
         *   output_buf, output_col
         *               result row ctr is stored at output_buf[ctr] + output_col;
         *               13 rows of 13 samples each are written.
   1862  */
   1863 
   1864 GLOBAL(void)
   1865 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1866                  JCOEFPTR coef_block,
   1867                  JSAMPARRAY output_buf, JDIMENSION output_col)
   1868 {
   1869   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   1870   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   1871   INT32 z1, z2, z3, z4;
   1872   JCOEFPTR inptr;
   1873   ISLOW_MULT_TYPE * quantptr;
   1874   int * wsptr;
   1875   JSAMPROW outptr;
   1876   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1877   int ctr;
   1878   int workspace[8*13];	/* buffers data between passes */
   1879   SHIFT_TEMPS
   1880 
   1881   /* Pass 1: process columns from input, store into work array. */
          /* Each iteration turns one input column (8 coefficients) into 13
           * intermediate values, stored down one workspace column with a row
           * stride of 8 ints.
           */
   1882 
   1883   inptr = coef_block;
   1884   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1885   wsptr = workspace;
   1886   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1887     /* Even part */
            /* z1 carries the row-0 (DC) term plus the rounding constant; it is
             * folded into every even term tmp20..tmp26 exactly once, either
             * through tmp13 or directly (tmp26).
             */
   1888 
   1889     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
            /* NOTE(review): if the dequantized value can be negative, this
             * left-shifts a negative signed value, which ISO C leaves
             * undefined; presumably the supported compilers treat it as a
             * multiplication by 2^CONST_BITS -- confirm.  The same pattern
             * appears in pass 2.
             */
   1890     z1 <<= CONST_BITS;
   1891     /* Add fudge factor here for final descale. */
            /* The constant is half of 2^(CONST_BITS-PASS1_BITS), the amount
             * shifted out in this pass's output stage, so that shift rounds
             * instead of truncating (assuming RIGHT_SHIFT is an arithmetic
             * right shift).
             */
   1892     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   1893 
   1894     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1895     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1896     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1897 
   1898     tmp10 = z3 + z4;
   1899     tmp11 = z3 - z4;
   1900 
            /* tmp12 and tmp13 are scratch, recomputed for each of the three
             * pairs of even terms that follow.
             */
   1901     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
   1902     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
   1903 
   1904     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
   1905     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
   1906 
   1907     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
   1908     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
   1909 
   1910     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
   1911     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
   1912 
            /* Here tmp13 holds "product - z1" and is subtracted below, so z1
             * still enters tmp23 and tmp24 with a positive sign.
             */
   1913     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
   1914     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
   1915 
   1916     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
   1917     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
   1918 
            /* Even term of the middle output (index 6). */
   1919     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
   1920 
   1921     /* Odd part */
            /* tmp10..tmp13 are free again and become the odd terms.  tmp14 is
             * scratch for three shared products before it receives its final
             * value; tmp15 first holds z1 + z4 and is then scaled by c11.
             */
   1922 
   1923     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1924     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1925     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1926     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1927 
   1928     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
   1929     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
   1930     tmp15 = z1 + z4;
   1931     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
   1932     tmp10 = tmp11 + tmp12 + tmp13 -
   1933             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
   1934     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
   1935     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
   1936     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
   1937     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
   1938     tmp11 += tmp14;
   1939     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
   1940     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
   1941     tmp12 += tmp14;
   1942     tmp13 += tmp14;
   1943     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
   1944     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
   1945             MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
            /* z1's input value is no longer needed; reuse it as scratch. */
   1946     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
   1947     tmp14 += z1;
   1948     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
   1949              MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
   1950 
   1951     /* Final output stage */
            /* Row k is even+odd and row 12-k is even-odd; the middle row 6
             * has no odd contribution.
             */
   1952 
   1953     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1954     wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1955     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1956     wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1957     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1958     wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1959     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1960     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1961     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1962     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1963     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   1964     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   1965     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
   1966   }
   1967 
   1968   /* Pass 2: process 13 rows from work array, store into output array. */
          /* Each iteration reads the 8 values of one workspace row
           * (wsptr[0..7]) and writes 13 samples to one output row, using the
           * same 1-D kernel as pass 1.
           */
   1969 
   1970   wsptr = workspace;
   1971   for (ctr = 0; ctr < 13; ctr++) {
   1972     outptr = output_buf[ctr] + output_col;
   1973 
   1974     /* Even part */
   1975 
   1976     /* Add fudge factor here for final descale. */
            /* After the shift below the constant equals
             * 2^(CONST_BITS+PASS1_BITS+2), i.e. half of the
             * 2^(CONST_BITS+PASS1_BITS+3) removed in the final output stage.
             */
   1977     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1978     z1 <<= CONST_BITS;
   1979 
   1980     z2 = (INT32) wsptr[2];
   1981     z3 = (INT32) wsptr[4];
   1982     z4 = (INT32) wsptr[6];
   1983 
   1984     tmp10 = z3 + z4;
   1985     tmp11 = z3 - z4;
   1986 
   1987     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
   1988     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
   1989 
   1990     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
   1991     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
   1992 
   1993     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
   1994     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
   1995 
   1996     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
   1997     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
   1998 
   1999     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
   2000     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
   2001 
   2002     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
   2003     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
   2004 
   2005     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
   2006 
   2007     /* Odd part */
   2008 
   2009     z1 = (INT32) wsptr[1];
   2010     z2 = (INT32) wsptr[3];
   2011     z3 = (INT32) wsptr[5];
   2012     z4 = (INT32) wsptr[7];
   2013 
   2014     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
   2015     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
   2016     tmp15 = z1 + z4;
   2017     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
   2018     tmp10 = tmp11 + tmp12 + tmp13 -
   2019             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
   2020     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
   2021     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
   2022     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
   2023     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
   2024     tmp11 += tmp14;
   2025     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
   2026     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
   2027     tmp12 += tmp14;
   2028     tmp13 += tmp14;
   2029     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
   2030     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
   2031             MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
   2032     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
   2033     tmp14 += z1;
   2034     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
   2035              MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
   2036 
   2037     /* Final output stage */
            /* Each descaled value is masked with RANGE_MASK and used as an
             * index into range_limit; presumably that table clamps to the
             * valid sample range -- confirm against IDCT_range_limit().
             * Sample k is even+odd, sample 12-k is even-odd, sample 6 is
             * even only.
             */
   2038 
   2039     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   2040                                                CONST_BITS+PASS1_BITS+3)
   2041                              & RANGE_MASK];
   2042     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   2043                                                CONST_BITS+PASS1_BITS+3)
   2044                              & RANGE_MASK];
   2045     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   2046                                                CONST_BITS+PASS1_BITS+3)
   2047                              & RANGE_MASK];
   2048     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   2049                                                CONST_BITS+PASS1_BITS+3)
   2050                              & RANGE_MASK];
   2051     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   2052                                                CONST_BITS+PASS1_BITS+3)
   2053                              & RANGE_MASK];
   2054     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   2055                                                CONST_BITS+PASS1_BITS+3)
   2056                              & RANGE_MASK];
   2057     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   2058                                                CONST_BITS+PASS1_BITS+3)
   2059                              & RANGE_MASK];
   2060     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   2061                                                CONST_BITS+PASS1_BITS+3)
   2062                              & RANGE_MASK];
   2063     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   2064                                                CONST_BITS+PASS1_BITS+3)
   2065                              & RANGE_MASK];
   2066     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   2067                                                CONST_BITS+PASS1_BITS+3)
   2068                              & RANGE_MASK];
   2069     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   2070                                                CONST_BITS+PASS1_BITS+3)
   2071                              & RANGE_MASK];
   2072     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   2073                                                CONST_BITS+PASS1_BITS+3)
   2074                              & RANGE_MASK];
   2075     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
   2076                                                CONST_BITS+PASS1_BITS+3)
   2077                              & RANGE_MASK];
   2078 
   2079     wsptr += 8;		/* advance pointer to next row */
   2080   }
   2081 }
   2082 
   2083 
   2084 /*
   2085  * Perform dequantization and inverse DCT on one block of coefficients,
   2086  * producing a 14x14 output block.
   2087  *
   2088  * Optimized algorithm with 20 multiplications in the 1-D kernel.
   2089  * cK represents sqrt(2) * cos(K*pi/28).
         *
         * The work is split into two 1-D passes joined by a local workspace:
         *   Pass 1 walks the 8 input columns, dequantizes 8 coefficients per
         *          column and writes 14 intermediate values per column into
         *          the workspace (laid out as 14 rows of 8 ints).
         *   Pass 2 walks the 14 workspace rows and writes 14 samples per row
         *          to output_buf[row] + output_col.
         *
         * Parameters:
         *   cinfo       used here only to obtain the table returned by
         *               IDCT_range_limit().
         *   compptr     its dct_table member supplies the multipliers handed
         *               to DEQUANTIZE.
         *   coef_block  input coefficients, read as inptr[DCTSIZE*k] for
         *               k = 0..7 in each of the 8 columns.
         *   output_buf  array of output row pointers; rows 0..13 are written.
         *   output_col  offset added to every row pointer before the 14
         *               samples of that row are stored.
   2090  */
   2091 
   2092 GLOBAL(void)
   2093 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2094                  JCOEFPTR coef_block,
   2095                  JSAMPARRAY output_buf, JDIMENSION output_col)
   2096 {
   2097   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   2098   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   2099   INT32 z1, z2, z3, z4;
   2100   JCOEFPTR inptr;
   2101   ISLOW_MULT_TYPE * quantptr;
   2102   int * wsptr;
   2103   JSAMPROW outptr;
   2104   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2105   int ctr;
   2106   int workspace[8*14];	/* buffers data between passes */
   2107   SHIFT_TEMPS
   2108 
   2109   /* Pass 1: process columns from input, store into work array. */
   2110 
   2111   inptr = coef_block;
   2112   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   2113   wsptr = workspace;
   2114   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2115     /* Even part */
   2116 
            /* NOTE(review): if the dequantized DC term can be negative, the
             * left shift below is a shift of a negative signed value, which
             * ISO C leaves undefined -- confirm the targeted compilers treat
             * it as a plain arithmetic shift.
             */
   2117     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   2118     z1 <<= CONST_BITS;
   2119     /* Add fudge factor here for final descale. */
            /* The constant is half of 1 << (CONST_BITS-PASS1_BITS), the amount
             * shifted out when this pass stores its results.  z1 feeds every
             * even-part term below, so adding it once here biases all 14
             * outputs of the column (presumably so that the shift rounds
             * rather than truncates; RIGHT_SHIFT is defined outside this
             * section).
             */
   2120     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   2121     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   2122     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   2123     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   2124     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   2125 
   2126     tmp10 = z1 + z2;
   2127     tmp11 = z1 + z3;
   2128     tmp12 = z1 - z4;
   2129 
            /* tmp23 is descaled immediately; its odd-part partner tmp13 is
             * built at the same reduced scale further down, which is why
             * outputs 3 and 10 are stored without a shift.
             */
   2130     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
   2131                         CONST_BITS-PASS1_BITS);
   2132 
   2133     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   2134     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   2135 
   2136     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   2137 
   2138     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   2139     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   2140     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   2141             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   2142 
            /* Butterflies: tmp2x and tmp2(6-x) share one z1-based term and
             * differ only in the sign of the coefficient 2/6 contribution.
             */
   2143     tmp20 = tmp10 + tmp13;
   2144     tmp26 = tmp10 - tmp13;
   2145     tmp21 = tmp11 + tmp14;
   2146     tmp25 = tmp11 - tmp14;
   2147     tmp22 = tmp12 + tmp15;
   2148     tmp24 = tmp12 - tmp15;
   2149 
   2150     /* Odd part */
   2151 
            /* tmp10..tmp16 are reused from here on for the odd-part terms;
             * the even-part results live in tmp20..tmp26.
             */
   2152     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   2153     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   2154     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   2155     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
            /* c7 = sqrt(2)*cos(pi/4) = 1, so coefficient 7 only needs the
             * fixed-point scale, not a multiplication.
             */
   2156     tmp13 = z4 << CONST_BITS;
   2157 
   2158     tmp14 = z1 + z3;
   2159     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   2160     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   2161     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   2162     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   2163     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
            /* z1 becomes (coef1 - coef3) here and (coef1 - coef3 + coef7)
             * two statements later; z4 is then free to serve as scratch.
             */
   2164     z1    -= z2;
   2165     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
   2166     tmp16 += tmp15;
   2167     z1    += z4;
   2168     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
   2169     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
   2170     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
   2171     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   2172     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   2173     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
   2174 
            /* (coef1 - coef3 - coef5 + coef7) at the already-descaled scale
             * of tmp23; this overwrites the scaled coef7 held in tmp13, which
             * is no longer needed.
             */
   2175     tmp13 = (z1 - z3) << PASS1_BITS;
   2176 
   2177     /* Final output stage */
   2178 
            /* Workspace row k lives at wsptr[8*k]; rows k and 13-k are the
             * sum and difference of the same even/odd pair.
             */
   2179     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   2180     wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   2181     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   2182     wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   2183     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   2184     wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   2185     wsptr[8*3]  = (int) (tmp23 + tmp13);
   2186     wsptr[8*10] = (int) (tmp23 - tmp13);
   2187     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   2188     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   2189     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   2190     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   2191     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
   2192     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
   2193   }
   2194 
   2195   /* Pass 2: process 14 rows from work array, store into output array. */
   2196 
   2197   wsptr = workspace;
   2198   for (ctr = 0; ctr < 14; ctr++) {
   2199     outptr = output_buf[ctr] + output_col;
   2200 
   2201     /* Even part */
   2202 
   2203     /* Add fudge factor here for final descale. */
            /* Once shifted left by CONST_BITS on the next line, the constant
             * equals half of 1 << (CONST_BITS+PASS1_BITS+3), the amount
             * shifted out in the final output stage of this pass.
             */
   2204     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   2205     z1 <<= CONST_BITS;
   2206     z4 = (INT32) wsptr[4];
   2207     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   2208     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   2209     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   2210 
   2211     tmp10 = z1 + z2;
   2212     tmp11 = z1 + z3;
   2213     tmp12 = z1 - z4;
   2214 
            /* Unlike pass 1, tmp23 stays at full scale: every output of this
             * pass goes through the same final shift.
             */
   2215     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
   2216 
   2217     z1 = (INT32) wsptr[2];
   2218     z2 = (INT32) wsptr[6];
   2219 
   2220     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   2221 
   2222     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   2223     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   2224     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   2225             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   2226 
   2227     tmp20 = tmp10 + tmp13;
   2228     tmp26 = tmp10 - tmp13;
   2229     tmp21 = tmp11 + tmp14;
   2230     tmp25 = tmp11 - tmp14;
   2231     tmp22 = tmp12 + tmp15;
   2232     tmp24 = tmp12 - tmp15;
   2233 
   2234     /* Odd part */
   2235 
   2236     z1 = (INT32) wsptr[1];
   2237     z2 = (INT32) wsptr[3];
   2238     z3 = (INT32) wsptr[5];
   2239     z4 = (INT32) wsptr[7];
            /* c7 = 1: element 7 is only brought to the fixed-point scale.
             * In this pass z4 itself carries the scaled value, so tmp13 is
             * free to act as scratch below.
             */
   2240     z4 <<= CONST_BITS;
   2241 
   2242     tmp14 = z1 + z3;
   2243     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   2244     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   2245     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   2246     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   2247     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
   2248     z1    -= z2;
   2249     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
   2250     tmp16 += tmp15;
   2251     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
   2252     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
   2253     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
   2254     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   2255     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   2256     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
   2257 
            /* z1 is (w1 - w3) and z4 is (w7 << CONST_BITS) at this point, so
             * this is (w1 - w3 - w5 + w7) at full scale: the same combination
             * pass 1 forms for its tmp13, just not pre-descaled.
             */
   2258     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
   2259 
   2260     /* Final output stage */
   2261 
            /* Each sample is descaled, masked with RANGE_MASK so the index
             * stays inside the table, and translated through range_limit
             * (presumably the clamp to the legal sample range; the table is
             * built outside this section).
             */
   2262     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   2263                                                CONST_BITS+PASS1_BITS+3)
   2264                              & RANGE_MASK];
   2265     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   2266                                                CONST_BITS+PASS1_BITS+3)
   2267                              & RANGE_MASK];
   2268     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   2269                                                CONST_BITS+PASS1_BITS+3)
   2270                              & RANGE_MASK];
   2271     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   2272                                                CONST_BITS+PASS1_BITS+3)
   2273                              & RANGE_MASK];
   2274     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   2275                                                CONST_BITS+PASS1_BITS+3)
   2276                              & RANGE_MASK];
   2277     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   2278                                                CONST_BITS+PASS1_BITS+3)
   2279                              & RANGE_MASK];
   2280     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   2281                                                CONST_BITS+PASS1_BITS+3)
   2282                              & RANGE_MASK];
   2283     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   2284                                                CONST_BITS+PASS1_BITS+3)
   2285                              & RANGE_MASK];
   2286     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   2287                                                CONST_BITS+PASS1_BITS+3)
   2288                              & RANGE_MASK];
   2289     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   2290                                                CONST_BITS+PASS1_BITS+3)
   2291                              & RANGE_MASK];
   2292     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   2293                                                CONST_BITS+PASS1_BITS+3)
   2294                              & RANGE_MASK];
   2295     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   2296                                                CONST_BITS+PASS1_BITS+3)
   2297                              & RANGE_MASK];
   2298     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
   2299                                                CONST_BITS+PASS1_BITS+3)
   2300                              & RANGE_MASK];
   2301     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
   2302                                                CONST_BITS+PASS1_BITS+3)
   2303                              & RANGE_MASK];
   2304 
   2305     wsptr += 8;		/* advance pointer to next row */
   2306   }
   2307 }
   2308 
   2309 
   2310 /*
   2311  * Perform dequantization and inverse DCT on one block of coefficients,
   2312  * producing a 15x15 output block.
   2313  *
   2314  * Optimized algorithm with 22 multiplications in the 1-D kernel.
   2315  * cK represents sqrt(2) * cos(K*pi/30).
         *
         * The work is split into two 1-D passes joined by a local workspace:
         *   Pass 1 walks the 8 input columns, dequantizes 8 coefficients per
         *          column and writes 15 intermediate values per column into
         *          the workspace (laid out as 15 rows of 8 ints).
         *   Pass 2 walks the 15 workspace rows and writes 15 samples per row
         *          to output_buf[row] + output_col.
         * Because the output length is odd, the middle output (index 7) has
         * no mirror partner and is produced from the even part alone.
         *
         * Parameters:
         *   cinfo       used here only to obtain the table returned by
         *               IDCT_range_limit().
         *   compptr     its dct_table member supplies the multipliers handed
         *               to DEQUANTIZE.
         *   coef_block  input coefficients, read as inptr[DCTSIZE*k] for
         *               k = 0..7 in each of the 8 columns.
         *   output_buf  array of output row pointers; rows 0..14 are written.
         *   output_col  offset added to every row pointer before the 15
         *               samples of that row are stored.
   2316  */
   2317 
   2318 GLOBAL(void)
   2319 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2320                  JCOEFPTR coef_block,
   2321                  JSAMPARRAY output_buf, JDIMENSION output_col)
   2322 {
   2323   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   2324   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2325   INT32 z1, z2, z3, z4;
   2326   JCOEFPTR inptr;
   2327   ISLOW_MULT_TYPE * quantptr;
   2328   int * wsptr;
   2329   JSAMPROW outptr;
   2330   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2331   int ctr;
   2332   int workspace[8*15];	/* buffers data between passes */
   2333   SHIFT_TEMPS
   2334 
   2335   /* Pass 1: process columns from input, store into work array. */
   2336 
   2337   inptr = coef_block;
   2338   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   2339   wsptr = workspace;
   2340   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2341     /* Even part */
   2342 
            /* NOTE(review): if the dequantized DC term can be negative, the
             * left shift below is a shift of a negative signed value, which
             * ISO C leaves undefined -- confirm the targeted compilers treat
             * it as a plain arithmetic shift.
             */
   2343     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   2344     z1 <<= CONST_BITS;
   2345     /* Add fudge factor here for final descale. */
            /* The constant is half of 1 << (CONST_BITS-PASS1_BITS), the amount
             * shifted out when this pass stores its results.  z1 feeds every
             * even-part term below, so adding it once here biases all 15
             * outputs of the column (presumably so that the shift rounds
             * rather than truncates; RIGHT_SHIFT is defined outside this
             * section).
             */
   2346     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   2347 
   2348     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   2349     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   2350     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   2351 
            /* tmp10/tmp11 are scratch throughout the even part: they are
             * reloaded three more times below with different products.
             */
   2352     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
   2353     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
   2354 
   2355     tmp12 = z1 - tmp10;
   2356     tmp13 = z1 + tmp11;
   2357     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
   2358 
            /* From here z4 = coef2 - coef4 and z3 = coef2 + coef4, while z2
             * still holds coef2 until it is scaled by (c4+c14) below.
             */
   2359     z4 = z2 - z3;
   2360     z3 += z2;
   2361     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
   2362     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
   2363     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
   2364 
   2365     tmp20 = tmp13 + tmp10 + tmp11;
   2366     tmp23 = tmp12 - tmp10 + tmp11 + z2;
   2367 
   2368     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
   2369     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
   2370 
   2371     tmp25 = tmp13 - tmp10 - tmp11;
   2372     tmp26 = tmp12 + tmp10 - tmp11 - z2;
   2373 
   2374     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
   2375     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
   2376 
   2377     tmp21 = tmp12 + tmp10 + tmp11;
   2378     tmp24 = tmp13 - tmp10 + tmp11;
            /* Doubling turns the half-weight product into z4 * (c6-c12),
             * which is then applied once for tmp22 and twice for tmp27.
             * tmp27 is the middle output (index 7).
             */
   2379     tmp11 += tmp11;
   2380     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
   2381     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
   2382 
   2383     /* Odd part */
   2384 
            /* z4 briefly carries coefficient 5, which is only ever needed as
             * the product z3 = c5 * coef5, and is then reloaded with
             * coefficient 7.  tmp10..tmp16 are reused for odd-part terms.
             */
   2385     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   2386     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   2387     z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   2388     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
   2389     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   2390 
   2391     tmp13 = z2 - z4;
   2392     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
   2393     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
   2394     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
   2395 
   2396     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
   2397     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
            /* coef3 has been folded into tmp13/tmp15, so z2 is recycled as
             * (coef1 - coef7) and, further down, as c11 * (coef1 + coef7).
             */
   2398     z2 = z1 - z4;
   2399     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
   2400 
   2401     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
   2402     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
   2403     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
   2404     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
   2405     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
   2406     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
   2407 
   2408     /* Final output stage */
   2409 
            /* Workspace row k lives at wsptr[8*k]; rows k and 14-k are the
             * sum and difference of the same even/odd pair, and row 7 is the
             * even-only middle term.
             */
   2410     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   2411     wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   2412     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   2413     wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   2414     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   2415     wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   2416     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   2417     wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   2418     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   2419     wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   2420     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   2421     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   2422     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
   2423     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
   2424     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
   2425   }
   2426 
   2427   /* Pass 2: process 15 rows from work array, store into output array. */
   2428 
   2429   wsptr = workspace;
   2430   for (ctr = 0; ctr < 15; ctr++) {
   2431     outptr = output_buf[ctr] + output_col;
   2432 
   2433     /* Even part */
   2434 
   2435     /* Add fudge factor here for final descale. */
            /* Once shifted left by CONST_BITS on the next line, the constant
             * equals half of 1 << (CONST_BITS+PASS1_BITS+3), the amount
             * shifted out in the final output stage of this pass.
             */
   2436     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   2437     z1 <<= CONST_BITS;
   2438 
   2439     z2 = (INT32) wsptr[2];
   2440     z3 = (INT32) wsptr[4];
   2441     z4 = (INT32) wsptr[6];
   2442 
            /* The arithmetic from here to the end of the odd part is the same
             * statement sequence as in pass 1, applied to workspace values
             * instead of dequantized coefficients.
             */
   2443     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
   2444     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
   2445 
   2446     tmp12 = z1 - tmp10;
   2447     tmp13 = z1 + tmp11;
   2448     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
   2449 
   2450     z4 = z2 - z3;
   2451     z3 += z2;
   2452     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
   2453     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
   2454     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
   2455 
   2456     tmp20 = tmp13 + tmp10 + tmp11;
   2457     tmp23 = tmp12 - tmp10 + tmp11 + z2;
   2458 
   2459     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
   2460     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
   2461 
   2462     tmp25 = tmp13 - tmp10 - tmp11;
   2463     tmp26 = tmp12 + tmp10 - tmp11 - z2;
   2464 
   2465     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
   2466     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
   2467 
   2468     tmp21 = tmp12 + tmp10 + tmp11;
   2469     tmp24 = tmp13 - tmp10 + tmp11;
   2470     tmp11 += tmp11;
   2471     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
   2472     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
   2473 
   2474     /* Odd part */
   2475 
            /* As in pass 1, z4 first carries element 5 (consumed into z3) and
             * is then reloaded with element 7.
             */
   2476     z1 = (INT32) wsptr[1];
   2477     z2 = (INT32) wsptr[3];
   2478     z4 = (INT32) wsptr[5];
   2479     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
   2480     z4 = (INT32) wsptr[7];
   2481 
   2482     tmp13 = z2 - z4;
   2483     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
   2484     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
   2485     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
   2486 
   2487     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
   2488     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
   2489     z2 = z1 - z4;
   2490     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
   2491 
   2492     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
   2493     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
   2494     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
   2495     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
   2496     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
   2497     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
   2498 
   2499     /* Final output stage */
   2500 
            /* Each sample is descaled, masked with RANGE_MASK so the index
             * stays inside the table, and translated through range_limit
             * (presumably the clamp to the legal sample range; the table is
             * built outside this section).
             */
   2501     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   2502                                                CONST_BITS+PASS1_BITS+3)
   2503                              & RANGE_MASK];
   2504     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   2505                                                CONST_BITS+PASS1_BITS+3)
   2506                              & RANGE_MASK];
   2507     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   2508                                                CONST_BITS+PASS1_BITS+3)
   2509                              & RANGE_MASK];
   2510     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   2511                                                CONST_BITS+PASS1_BITS+3)
   2512                              & RANGE_MASK];
   2513     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   2514                                                CONST_BITS+PASS1_BITS+3)
   2515                              & RANGE_MASK];
   2516     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   2517                                                CONST_BITS+PASS1_BITS+3)
   2518                              & RANGE_MASK];
   2519     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   2520                                                CONST_BITS+PASS1_BITS+3)
   2521                              & RANGE_MASK];
   2522     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   2523                                                CONST_BITS+PASS1_BITS+3)
   2524                              & RANGE_MASK];
   2525     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   2526                                                CONST_BITS+PASS1_BITS+3)
   2527                              & RANGE_MASK];
   2528     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   2529                                                CONST_BITS+PASS1_BITS+3)
   2530                              & RANGE_MASK];
   2531     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   2532                                                CONST_BITS+PASS1_BITS+3)
   2533                              & RANGE_MASK];
   2534     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   2535                                                CONST_BITS+PASS1_BITS+3)
   2536                              & RANGE_MASK];
   2537     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
   2538                                                CONST_BITS+PASS1_BITS+3)
   2539                              & RANGE_MASK];
   2540     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
   2541                                                CONST_BITS+PASS1_BITS+3)
   2542                              & RANGE_MASK];
   2543     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
   2544                                                CONST_BITS+PASS1_BITS+3)
   2545                              & RANGE_MASK];
   2546 
   2547     wsptr += 8;		/* advance pointer to next row */
   2548   }
   2549 }
   2550 
   2551 
   2552 /*
   2553  * Perform dequantization and inverse DCT on one block of coefficients,
   2554  * producing a 16x16 output block.
   2555  *
   2556  * Optimized algorithm with 28 multiplications in the 1-D kernel.
   2557  * cK represents sqrt(2) * cos(K*pi/32).
   2558  */
   2559 
   2560 GLOBAL(void)
   2561 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2562                  JCOEFPTR coef_block,
   2563                  JSAMPARRAY output_buf, JDIMENSION output_col)
   2564 {
   2565   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   2566   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2567   INT32 z1, z2, z3, z4;
   2568   JCOEFPTR inptr;
   2569   ISLOW_MULT_TYPE * quantptr;
   2570   int * wsptr;
   2571   JSAMPROW outptr;
   2572   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2573   int ctr;
   2574   int workspace[8*16];	/* buffers data between passes */
   2575   SHIFT_TEMPS
   2576 
   2577   /* Pass 1: process columns from input, store into work array. */
   2578 
   2579   inptr = coef_block;
   2580   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   2581   wsptr = workspace;
   2582   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2583     /* Even part */
   2584 
   2585     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   2586     tmp0 <<= CONST_BITS;
   2587     /* Add fudge factor here for final descale. */
   2588     tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
   2589 
   2590     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   2591     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2592     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2593 
   2594     tmp10 = tmp0 + tmp1;
   2595     tmp11 = tmp0 - tmp1;
   2596     tmp12 = tmp0 + tmp2;
   2597     tmp13 = tmp0 - tmp2;
   2598 
   2599     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   2600     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   2601     z3 = z1 - z2;
   2602     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2603     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2604 
   2605     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2606     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2607     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2608     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2609 
   2610     tmp20 = tmp10 + tmp0;
   2611     tmp27 = tmp10 - tmp0;
   2612     tmp21 = tmp12 + tmp1;
   2613     tmp26 = tmp12 - tmp1;
   2614     tmp22 = tmp13 + tmp2;
   2615     tmp25 = tmp13 - tmp2;
   2616     tmp23 = tmp11 + tmp3;
   2617     tmp24 = tmp11 - tmp3;
   2618 
   2619     /* Odd part */
   2620 
   2621     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   2622     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   2623     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   2624     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   2625 
   2626     tmp11 = z1 + z3;
   2627 
   2628     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   2629     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   2630     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   2631     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   2632     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   2633     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   2634     tmp0  = tmp1 + tmp2 + tmp3 -
   2635             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   2636     tmp13 = tmp10 + tmp11 + tmp12 -
   2637             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   2638     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   2639     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   2640     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   2641     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   2642     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   2643     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   2644     z2    += z4;
   2645     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
   2646     tmp1  += z1;
   2647     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   2648     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
   2649     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   2650     tmp12 += z2;
   2651     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
   2652     tmp2  += z2;
   2653     tmp3  += z2;
   2654     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   2655     tmp10 += z2;
   2656     tmp11 += z2;
   2657 
   2658     /* Final output stage */
   2659 
   2660     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
   2661     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
   2662     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
   2663     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
   2664     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
   2665     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
   2666     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
   2667     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
   2668     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
   2669     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
   2670     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
   2671     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
   2672     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
   2673     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
   2674     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
   2675     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
   2676   }
   2677 
   2678   /* Pass 2: process 16 rows from work array, store into output array. */
   2679 
   2680   wsptr = workspace;
   2681   for (ctr = 0; ctr < 16; ctr++) {
   2682     outptr = output_buf[ctr] + output_col;
   2683 
   2684     /* Even part */
   2685 
   2686     /* Add fudge factor here for final descale. */
   2687     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   2688     tmp0 <<= CONST_BITS;
   2689 
   2690     z1 = (INT32) wsptr[4];
   2691     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2692     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2693 
   2694     tmp10 = tmp0 + tmp1;
   2695     tmp11 = tmp0 - tmp1;
   2696     tmp12 = tmp0 + tmp2;
   2697     tmp13 = tmp0 - tmp2;
   2698 
   2699     z1 = (INT32) wsptr[2];
   2700     z2 = (INT32) wsptr[6];
   2701     z3 = z1 - z2;
   2702     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2703     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2704 
   2705     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2706     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2707     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2708     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2709 
   2710     tmp20 = tmp10 + tmp0;
   2711     tmp27 = tmp10 - tmp0;
   2712     tmp21 = tmp12 + tmp1;
   2713     tmp26 = tmp12 - tmp1;
   2714     tmp22 = tmp13 + tmp2;
   2715     tmp25 = tmp13 - tmp2;
   2716     tmp23 = tmp11 + tmp3;
   2717     tmp24 = tmp11 - tmp3;
   2718 
   2719     /* Odd part */
   2720 
   2721     z1 = (INT32) wsptr[1];
   2722     z2 = (INT32) wsptr[3];
   2723     z3 = (INT32) wsptr[5];
   2724     z4 = (INT32) wsptr[7];
   2725 
   2726     tmp11 = z1 + z3;
   2727 
   2728     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   2729     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   2730     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   2731     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   2732     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   2733     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   2734     tmp0  = tmp1 + tmp2 + tmp3 -
   2735             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   2736     tmp13 = tmp10 + tmp11 + tmp12 -
   2737             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   2738     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   2739     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   2740     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   2741     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   2742     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   2743     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   2744     z2    += z4;
   2745     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
   2746     tmp1  += z1;
   2747     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   2748     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
   2749     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   2750     tmp12 += z2;
   2751     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
   2752     tmp2  += z2;
   2753     tmp3  += z2;
   2754     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   2755     tmp10 += z2;
   2756     tmp11 += z2;
   2757 
   2758     /* Final output stage */
   2759 
   2760     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
   2761                                                CONST_BITS+PASS1_BITS+3)
   2762                              & RANGE_MASK];
   2763     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
   2764                                                CONST_BITS+PASS1_BITS+3)
   2765                              & RANGE_MASK];
   2766     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
   2767                                                CONST_BITS+PASS1_BITS+3)
   2768                              & RANGE_MASK];
   2769     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
   2770                                                CONST_BITS+PASS1_BITS+3)
   2771                              & RANGE_MASK];
   2772     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
   2773                                                CONST_BITS+PASS1_BITS+3)
   2774                              & RANGE_MASK];
   2775     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
   2776                                                CONST_BITS+PASS1_BITS+3)
   2777                              & RANGE_MASK];
   2778     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
   2779                                                CONST_BITS+PASS1_BITS+3)
   2780                              & RANGE_MASK];
   2781     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
   2782                                                CONST_BITS+PASS1_BITS+3)
   2783                              & RANGE_MASK];
   2784     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
   2785                                                CONST_BITS+PASS1_BITS+3)
   2786                              & RANGE_MASK];
   2787     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
   2788                                                CONST_BITS+PASS1_BITS+3)
   2789                              & RANGE_MASK];
   2790     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
   2791                                                CONST_BITS+PASS1_BITS+3)
   2792                              & RANGE_MASK];
   2793     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
   2794                                                CONST_BITS+PASS1_BITS+3)
   2795                              & RANGE_MASK];
   2796     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
   2797                                                CONST_BITS+PASS1_BITS+3)
   2798                              & RANGE_MASK];
   2799     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
   2800                                                CONST_BITS+PASS1_BITS+3)
   2801                              & RANGE_MASK];
   2802     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
   2803                                                CONST_BITS+PASS1_BITS+3)
   2804                              & RANGE_MASK];
   2805     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
   2806                                                CONST_BITS+PASS1_BITS+3)
   2807                              & RANGE_MASK];
   2808 
   2809     wsptr += 8;		/* advance pointer to next row */
   2810   }
   2811 }
   2812 
   2813 
   2814 /*
   2815  * Perform dequantization and inverse DCT on one block of coefficients,
   2816  * producing a 16x8 output block.
   2817  *
   2818  * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
   2819  */
   2820 
   2821 GLOBAL(void)
   2822 jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2823                 JCOEFPTR coef_block,
   2824                 JSAMPARRAY output_buf, JDIMENSION output_col)
   2825 {
   2826   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   2827   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2828   INT32 z1, z2, z3, z4;
   2829   JCOEFPTR inptr;
   2830   ISLOW_MULT_TYPE * quantptr;
   2831   int * wsptr;
   2832   JSAMPROW outptr;
   2833   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2834   int ctr;
   2835   int workspace[8*8];	/* buffers data between passes */
   2836   SHIFT_TEMPS
   2837 
   2838   /* Pass 1: process columns from input, store into work array. */
   2839   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
   2840   /* furthermore, we scale the results by 2**PASS1_BITS. */
   2841 
   2842   inptr = coef_block;
   2843   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   2844   wsptr = workspace;
   2845   for (ctr = DCTSIZE; ctr > 0; ctr--) {
   2846     /* Due to quantization, we will usually find that many of the input
   2847      * coefficients are zero, especially the AC terms.  We can exploit this
   2848      * by short-circuiting the IDCT calculation for any column in which all
   2849      * the AC terms are zero.  In that case each output is equal to the
   2850      * DC coefficient (with scale factor as needed).
   2851      * With typical images and quantization tables, half or more of the
   2852      * column DCT calculations can be simplified this way.
   2853      */
   2854 
   2855     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
   2856         inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
   2857         inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
   2858         inptr[DCTSIZE*7] == 0) {
   2859       /* AC terms all zero */
   2860       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
   2861 
   2862       wsptr[DCTSIZE*0] = dcval;
   2863       wsptr[DCTSIZE*1] = dcval;
   2864       wsptr[DCTSIZE*2] = dcval;
   2865       wsptr[DCTSIZE*3] = dcval;
   2866       wsptr[DCTSIZE*4] = dcval;
   2867       wsptr[DCTSIZE*5] = dcval;
   2868       wsptr[DCTSIZE*6] = dcval;
   2869       wsptr[DCTSIZE*7] = dcval;
   2870 
   2871       inptr++;			/* advance pointers to next column */
   2872       quantptr++;
   2873       wsptr++;
   2874       continue;
   2875     }
   2876 
   2877     /* Even part: reverse the even part of the forward DCT. */
   2878     /* The rotator is sqrt(2)*c(-6). */
   2879 
   2880     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   2881     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   2882 
   2883     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
   2884     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
   2885     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
   2886 
   2887     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   2888     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   2889     z2 <<= CONST_BITS;
   2890     z3 <<= CONST_BITS;
   2891     /* Add fudge factor here for final descale. */
   2892     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
   2893 
   2894     tmp0 = z2 + z3;
   2895     tmp1 = z2 - z3;
   2896 
   2897     tmp10 = tmp0 + tmp2;
   2898     tmp13 = tmp0 - tmp2;
   2899     tmp11 = tmp1 + tmp3;
   2900     tmp12 = tmp1 - tmp3;
   2901 
   2902     /* Odd part per figure 8; the matrix is unitary and hence its
   2903      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
   2904      */
   2905 
   2906     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   2907     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   2908     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   2909     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   2910 
   2911     z2 = tmp0 + tmp2;
   2912     z3 = tmp1 + tmp3;
   2913 
   2914     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
   2915     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   2916     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
   2917     z2 += z1;
   2918     z3 += z1;
   2919 
   2920     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   2921     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
   2922     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
   2923     tmp0 += z1 + z2;
   2924     tmp3 += z1 + z3;
   2925 
   2926     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   2927     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   2928     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   2929     tmp1 += z1 + z3;
   2930     tmp2 += z1 + z2;
   2931 
   2932     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   2933 
   2934     wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
   2935     wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
   2936     wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
   2937     wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
   2938     wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
   2939     wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
   2940     wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
   2941     wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
   2942 
   2943     inptr++;			/* advance pointers to next column */
   2944     quantptr++;
   2945     wsptr++;
   2946   }
   2947 
   2948   /* Pass 2: process 8 rows from work array, store into output array.
   2949    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
   2950    */
   2951   wsptr = workspace;
   2952   for (ctr = 0; ctr < 8; ctr++) {
   2953     outptr = output_buf[ctr] + output_col;
   2954 
   2955     /* Even part */
   2956 
   2957     /* Add fudge factor here for final descale. */
   2958     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   2959     tmp0 <<= CONST_BITS;
   2960 
   2961     z1 = (INT32) wsptr[4];
   2962     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2963     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2964 
   2965     tmp10 = tmp0 + tmp1;
   2966     tmp11 = tmp0 - tmp1;
   2967     tmp12 = tmp0 + tmp2;
   2968     tmp13 = tmp0 - tmp2;
   2969 
   2970     z1 = (INT32) wsptr[2];
   2971     z2 = (INT32) wsptr[6];
   2972     z3 = z1 - z2;
   2973     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2974     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2975 
   2976     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2977     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2978     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2979     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2980 
   2981     tmp20 = tmp10 + tmp0;
   2982     tmp27 = tmp10 - tmp0;
   2983     tmp21 = tmp12 + tmp1;
   2984     tmp26 = tmp12 - tmp1;
   2985     tmp22 = tmp13 + tmp2;
   2986     tmp25 = tmp13 - tmp2;
   2987     tmp23 = tmp11 + tmp3;
   2988     tmp24 = tmp11 - tmp3;
   2989 
   2990     /* Odd part */
   2991 
   2992     z1 = (INT32) wsptr[1];
   2993     z2 = (INT32) wsptr[3];
   2994     z3 = (INT32) wsptr[5];
   2995     z4 = (INT32) wsptr[7];
   2996 
   2997     tmp11 = z1 + z3;
   2998 
   2999     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   3000     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   3001     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   3002     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   3003     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   3004     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   3005     tmp0  = tmp1 + tmp2 + tmp3 -
   3006             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   3007     tmp13 = tmp10 + tmp11 + tmp12 -
   3008             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   3009     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   3010     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   3011     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   3012     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   3013     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   3014     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   3015     z2    += z4;
   3016     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
   3017     tmp1  += z1;
   3018     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   3019     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
   3020     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   3021     tmp12 += z2;
   3022     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
   3023     tmp2  += z2;
   3024     tmp3  += z2;
   3025     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   3026     tmp10 += z2;
   3027     tmp11 += z2;
   3028 
   3029     /* Final output stage */
   3030 
   3031     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
   3032                                                CONST_BITS+PASS1_BITS+3)
   3033                              & RANGE_MASK];
   3034     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
   3035                                                CONST_BITS+PASS1_BITS+3)
   3036                              & RANGE_MASK];
   3037     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
   3038                                                CONST_BITS+PASS1_BITS+3)
   3039                              & RANGE_MASK];
   3040     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
   3041                                                CONST_BITS+PASS1_BITS+3)
   3042                              & RANGE_MASK];
   3043     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
   3044                                                CONST_BITS+PASS1_BITS+3)
   3045                              & RANGE_MASK];
   3046     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
   3047                                                CONST_BITS+PASS1_BITS+3)
   3048                              & RANGE_MASK];
   3049     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
   3050                                                CONST_BITS+PASS1_BITS+3)
   3051                              & RANGE_MASK];
   3052     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
   3053                                                CONST_BITS+PASS1_BITS+3)
   3054                              & RANGE_MASK];
   3055     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
   3056                                                CONST_BITS+PASS1_BITS+3)
   3057                              & RANGE_MASK];
   3058     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
   3059                                                CONST_BITS+PASS1_BITS+3)
   3060                              & RANGE_MASK];
   3061     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
   3062                                                CONST_BITS+PASS1_BITS+3)
   3063                              & RANGE_MASK];
   3064     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
   3065                                                CONST_BITS+PASS1_BITS+3)
   3066                              & RANGE_MASK];
   3067     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
   3068                                                CONST_BITS+PASS1_BITS+3)
   3069                              & RANGE_MASK];
   3070     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
   3071                                                CONST_BITS+PASS1_BITS+3)
   3072                              & RANGE_MASK];
   3073     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
   3074                                                CONST_BITS+PASS1_BITS+3)
   3075                              & RANGE_MASK];
   3076     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
   3077                                                CONST_BITS+PASS1_BITS+3)
   3078                              & RANGE_MASK];
   3079 
   3080     wsptr += 8;		/* advance pointer to next row */
   3081   }
   3082 }
   3083 
   3084 
   3085 /*
   3086  * Perform dequantization and inverse DCT on one block of coefficients,
   3087  * producing a 14x7 output block.
   3088  *
   3089  * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
   3090  */
   3091 
   3092 GLOBAL(void)
   3093 jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   3094                 JCOEFPTR coef_block,
   3095                 JSAMPARRAY output_buf, JDIMENSION output_col)
   3096 {
   3097   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   3098   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   3099   INT32 z1, z2, z3, z4;
   3100   JCOEFPTR inptr;
   3101   ISLOW_MULT_TYPE * quantptr;
   3102   int * wsptr;
   3103   JSAMPROW outptr;
   3104   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   3105   int ctr;
   3106   int workspace[8*7];	/* buffers data between passes */
   3107   SHIFT_TEMPS
   3108 
   3109   /* Pass 1: process columns from input, store into work array.
   3110    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
   3111    */
   3112   inptr = coef_block;
   3113   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   3114   wsptr = workspace;
   3115   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   3116     /* Even part */
   3117 
   3118     tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   3119     tmp23 <<= CONST_BITS;
   3120     /* Add fudge factor here for final descale. */
   3121     tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
   3122 
   3123     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   3124     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   3125     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   3126 
   3127     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
   3128     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
   3129     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
   3130     tmp10 = z1 + z3;
   3131     z2 -= tmp10;
   3132     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
   3133     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
   3134     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
   3135     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
   3136 
   3137     /* Odd part */
   3138 
   3139     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   3140     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   3141     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   3142 
   3143     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
   3144     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
   3145     tmp10 = tmp11 - tmp12;
   3146     tmp11 += tmp12;
   3147     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
   3148     tmp11 += tmp12;
   3149     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
   3150     tmp10 += z2;
   3151     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
   3152 
   3153     /* Final output stage */
   3154 
   3155     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   3156     wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   3157     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   3158     wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   3159     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   3160     wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   3161     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
   3162   }
   3163 
   3164   /* Pass 2: process 7 rows from work array, store into output array.
   3165    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
   3166    */
   3167   wsptr = workspace;
   3168   for (ctr = 0; ctr < 7; ctr++) {
   3169     outptr = output_buf[ctr] + output_col;
   3170 
   3171     /* Even part */
   3172 
   3173     /* Add fudge factor here for final descale. */
   3174     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   3175     z1 <<= CONST_BITS;
   3176     z4 = (INT32) wsptr[4];
   3177     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   3178     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   3179     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   3180 
   3181     tmp10 = z1 + z2;
   3182     tmp11 = z1 + z3;
   3183     tmp12 = z1 - z4;
   3184 
   3185     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
   3186 
   3187     z1 = (INT32) wsptr[2];
   3188     z2 = (INT32) wsptr[6];
   3189 
   3190     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   3191 
   3192     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   3193     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   3194     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   3195             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   3196 
   3197     tmp20 = tmp10 + tmp13;
   3198     tmp26 = tmp10 - tmp13;
   3199     tmp21 = tmp11 + tmp14;
   3200     tmp25 = tmp11 - tmp14;
   3201     tmp22 = tmp12 + tmp15;
   3202     tmp24 = tmp12 - tmp15;
   3203 
   3204     /* Odd part */
   3205 
   3206     z1 = (INT32) wsptr[1];
   3207     z2 = (INT32) wsptr[3];
   3208     z3 = (INT32) wsptr[5];
   3209     z4 = (INT32) wsptr[7];
   3210     z4 <<= CONST_BITS;
   3211 
   3212     tmp14 = z1 + z3;
   3213     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   3214     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   3215     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   3216     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   3217     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
   3218     z1    -= z2;
   3219     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
   3220     tmp16 += tmp15;
   3221     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
   3222     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
   3223     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
   3224     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   3225     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   3226     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
   3227 
   3228     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
   3229 
   3230     /* Final output stage */
   3231 
   3232     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   3233                                                CONST_BITS+PASS1_BITS+3)
   3234                              & RANGE_MASK];
   3235     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   3236                                                CONST_BITS+PASS1_BITS+3)
   3237                              & RANGE_MASK];
   3238     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   3239                                                CONST_BITS+PASS1_BITS+3)
   3240                              & RANGE_MASK];
   3241     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   3242                                                CONST_BITS+PASS1_BITS+3)
   3243                              & RANGE_MASK];
   3244     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   3245                                                CONST_BITS+PASS1_BITS+3)
   3246                              & RANGE_MASK];
   3247     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   3248                                                CONST_BITS+PASS1_BITS+3)
   3249                              & RANGE_MASK];
   3250     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   3251                                                CONST_BITS+PASS1_BITS+3)
   3252                              & RANGE_MASK];
   3253     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   3254                                                CONST_BITS+PASS1_BITS+3)
   3255                              & RANGE_MASK];
   3256     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   3257                                                CONST_BITS+PASS1_BITS+3)
   3258                              & RANGE_MASK];
   3259     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   3260                                                CONST_BITS+PASS1_BITS+3)
   3261                              & RANGE_MASK];
   3262     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   3263                                                CONST_BITS+PASS1_BITS+3)
   3264                              & RANGE_MASK];
   3265     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   3266                                                CONST_BITS+PASS1_BITS+3)
   3267                              & RANGE_MASK];
   3268     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
   3269                                                CONST_BITS+PASS1_BITS+3)
   3270                              & RANGE_MASK];
   3271     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
   3272                                                CONST_BITS+PASS1_BITS+3)
   3273                              & RANGE_MASK];
   3274 
   3275     wsptr += 8;		/* advance pointer to next row */
   3276   }
   3277 }
   3278 
   3279 
   3280 /*
   3281  * Perform dequantization and inverse DCT on one block of coefficients,
   3282  * producing a 12x6 output block.
   3283  *
   3284  * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
   3285  */
   3286 
   3287 GLOBAL(void)
   3288 jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   3289                 JCOEFPTR coef_block,
   3290                 JSAMPARRAY output_buf, JDIMENSION output_col)
   3291 {
   3292   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   3293   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   3294   INT32 z1, z2, z3, z4;
   3295   JCOEFPTR inptr;
   3296   ISLOW_MULT_TYPE * quantptr;
   3297   int * wsptr;
   3298   JSAMPROW outptr;
   3299   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   3300   int ctr;
   3301   int workspace[8*6];	/* buffers data between passes */
   3302   SHIFT_TEMPS
   3303 
   3304   /* Pass 1: process columns from input, store into work array.
   3305    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
   3306    */
   3307   inptr = coef_block;
   3308   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   3309   wsptr = workspace;
   3310   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   3311     /* Even part */
   3312 
   3313     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   3314     tmp10 <<= CONST_BITS;
   3315     /* Add fudge factor here for final descale. */
   3316     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
   3317     tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   3318     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
   3319     tmp11 = tmp10 + tmp20;
   3320     tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
   3321     tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   3322     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
   3323     tmp20 = tmp11 + tmp10;
   3324     tmp22 = tmp11 - tmp10;
   3325 
   3326     /* Odd part */
   3327 
   3328     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   3329     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   3330     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   3331     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
   3332     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
   3333     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
   3334     tmp11 = (z1 - z2 - z3) << PASS1_BITS;
   3335 
   3336     /* Final output stage */
   3337 
   3338     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   3339     wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   3340     wsptr[8*1] = (int) (tmp21 + tmp11);
   3341     wsptr[8*4] = (int) (tmp21 - tmp11);
   3342     wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   3343     wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   3344   }
   3345 
   3346   /* Pass 2: process 6 rows from work array, store into output array.
   3347    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
   3348    */
   3349   wsptr = workspace;
   3350   for (ctr = 0; ctr < 6; ctr++) {
   3351     outptr = output_buf[ctr] + output_col;
   3352 
   3353     /* Even part */
   3354 
   3355     /* Add fudge factor here for final descale. */
   3356     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   3357     z3 <<= CONST_BITS;
   3358 
   3359     z4 = (INT32) wsptr[4];
   3360     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   3361 
   3362     tmp10 = z3 + z4;
   3363     tmp11 = z3 - z4;
   3364 
   3365     z1 = (INT32) wsptr[2];
   3366     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   3367     z1 <<= CONST_BITS;
   3368     z2 = (INT32) wsptr[6];
   3369     z2 <<= CONST_BITS;
   3370 
   3371     tmp12 = z1 - z2;
   3372 
   3373     tmp21 = z3 + tmp12;
   3374     tmp24 = z3 - tmp12;
   3375 
   3376     tmp12 = z4 + z2;
   3377 
   3378     tmp20 = tmp10 + tmp12;
   3379     tmp25 = tmp10 - tmp12;
   3380 
   3381     tmp12 = z4 - z1 - z2;
   3382 
   3383     tmp22 = tmp11 + tmp12;
   3384     tmp23 = tmp11 - tmp12;
   3385 
   3386     /* Odd part */
   3387 
   3388     z1 = (INT32) wsptr[1];
   3389     z2 = (INT32) wsptr[3];
   3390     z3 = (INT32) wsptr[5];
   3391     z4 = (INT32) wsptr[7];
   3392 
   3393     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   3394     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
   3395 
   3396     tmp10 = z1 + z3;
   3397     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   3398     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   3399     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   3400     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
   3401     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   3402     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   3403     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   3404              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   3405 
   3406     z1 -= z4;
   3407     z2 -= z3;
   3408     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   3409     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   3410     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   3411 
   3412     /* Final output stage */
   3413 
   3414     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   3415                                                CONST_BITS+PASS1_BITS+3)
   3416                              & RANGE_MASK];
   3417     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   3418                                                CONST_BITS+PASS1_BITS+3)
   3419                              & RANGE_MASK];
   3420     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   3421                                                CONST_BITS+PASS1_BITS+3)
   3422                              & RANGE_MASK];
   3423     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   3424                                                CONST_BITS+PASS1_BITS+3)
   3425                              & RANGE_MASK];
   3426     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   3427                                                CONST_BITS+PASS1_BITS+3)
   3428                              & RANGE_MASK];
   3429     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   3430                                                CONST_BITS+PASS1_BITS+3)
   3431                              & RANGE_MASK];
   3432     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   3433                                                CONST_BITS+PASS1_BITS+3)
   3434                              & RANGE_MASK];
   3435     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   3436                                                CONST_BITS+PASS1_BITS+3)
   3437                              & RANGE_MASK];
   3438     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   3439                                                CONST_BITS+PASS1_BITS+3)
   3440                              & RANGE_MASK];
   3441     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   3442                                                CONST_BITS+PASS1_BITS+3)
   3443                              & RANGE_MASK];
   3444     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   3445                                                CONST_BITS+PASS1_BITS+3)
   3446                              & RANGE_MASK];
   3447     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   3448                                                CONST_BITS+PASS1_BITS+3)
   3449                              & RANGE_MASK];
   3450 
   3451     wsptr += 8;		/* advance pointer to next row */
   3452   }
   3453 }
   3454 
   3455 
   3456 /*
   3457  * Perform dequantization and inverse DCT on one block of coefficients,
   3458  * producing a 10x5 output block.
   3459  *
   3460  * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
   3461  */
   3462 
   3463 GLOBAL(void)
   3464 jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   3465                 JCOEFPTR coef_block,
   3466                 JSAMPARRAY output_buf, JDIMENSION output_col)
   3467 {
   3468   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
   3469   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
   3470   INT32 z1, z2, z3, z4;
   3471   JCOEFPTR inptr;
   3472   ISLOW_MULT_TYPE * quantptr;
   3473   int * wsptr;
   3474   JSAMPROW outptr;
   3475   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   3476   int ctr;
   3477   int workspace[8*5];	/* buffers data between passes */
   3478   SHIFT_TEMPS
   3479 
   3480   /* Pass 1: process columns from input, store into work array.
   3481    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
   3482    */
   3483   inptr = coef_block;
   3484   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   3485   wsptr = workspace;
   3486   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   3487     /* Even part */
   3488 
   3489     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   3490     tmp12 <<= CONST_BITS;
   3491     /* Add fudge factor here for final descale. */
   3492     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
   3493     tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   3494     tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   3495     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
   3496     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
   3497     z3 = tmp12 + z2;
   3498     tmp10 = z3 + z1;
   3499     tmp11 = z3 - z1;
   3500     tmp12 -= z2 << 2;
   3501 
   3502     /* Odd part */
   3503 
   3504     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   3505     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   3506 
   3507     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
   3508     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
   3509     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
   3510 
   3511     /* Final output stage */
   3512 
   3513     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
   3514     wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
   3515     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
   3516     wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
   3517     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
   3518   }
   3519 
   3520   /* Pass 2: process 5 rows from work array, store into output array.
   3521    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
   3522    */
   3523   wsptr = workspace;
   3524   for (ctr = 0; ctr < 5; ctr++) {
   3525     outptr = output_buf[ctr] + output_col;
   3526 
   3527     /* Even part */
   3528 
   3529     /* Add fudge factor here for final descale. */
   3530     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   3531     z3 <<= CONST_BITS;
   3532     z4 = (INT32) wsptr[4];
   3533     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   3534     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   3535     tmp10 = z3 + z1;
   3536     tmp11 = z3 - z2;
   3537 
   3538     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
   3539 
   3540     z2 = (INT32) wsptr[2];
   3541     z3 = (INT32) wsptr[6];
   3542 
   3543     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   3544     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   3545     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   3546 
   3547     tmp20 = tmp10 + tmp12;
   3548     tmp24 = tmp10 - tmp12;
   3549     tmp21 = tmp11 + tmp13;
   3550     tmp23 = tmp11 - tmp13;
   3551 
   3552     /* Odd part */
   3553 
   3554     z1 = (INT32) wsptr[1];
   3555     z2 = (INT32) wsptr[3];
   3556     z3 = (INT32) wsptr[5];
   3557     z3 <<= CONST_BITS;
   3558     z4 = (INT32) wsptr[7];
   3559 
   3560     tmp11 = z2 + z4;
   3561     tmp13 = z2 - z4;
   3562 
   3563     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
   3564 
   3565     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   3566     z4 = z3 + tmp12;
   3567 
   3568     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   3569     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   3570 
   3571     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   3572     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
   3573 
   3574     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
   3575 
   3576     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   3577     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   3578 
   3579     /* Final output stage */
   3580 
   3581     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   3582                                               CONST_BITS+PASS1_BITS+3)
   3583                             & RANGE_MASK];
   3584     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   3585                                               CONST_BITS+PASS1_BITS+3)
   3586                             & RANGE_MASK];
   3587     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   3588                                               CONST_BITS+PASS1_BITS+3)
   3589                             & RANGE_MASK];
   3590     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   3591                                               CONST_BITS+PASS1_BITS+3)
   3592                             & RANGE_MASK];
   3593     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   3594                                               CONST_BITS+PASS1_BITS+3)
   3595                             & RANGE_MASK];
   3596     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   3597                                               CONST_BITS+PASS1_BITS+3)
   3598                             & RANGE_MASK];
   3599     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   3600                                               CONST_BITS+PASS1_BITS+3)
   3601                             & RANGE_MASK];
   3602     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   3603                                               CONST_BITS+PASS1_BITS+3)
   3604                             & RANGE_MASK];
   3605     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   3606                                               CONST_BITS+PASS1_BITS+3)
   3607                             & RANGE_MASK];
   3608     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   3609                                               CONST_BITS+PASS1_BITS+3)
   3610                             & RANGE_MASK];
   3611 
   3612     wsptr += 8;		/* advance pointer to next row */
   3613   }
   3614 }
   3615 
   3616 
   3617 /*
   3618  * Perform dequantization and inverse DCT on one block of coefficients,
   3619  * producing a 8x4 output block.
   3620  *
   3621  * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
   3622  */
   3623 
   3624 GLOBAL(void)
   3625 jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   3626                JCOEFPTR coef_block,
   3627                JSAMPARRAY output_buf, JDIMENSION output_col)
   3628 {
   3629   INT32 tmp0, tmp1, tmp2, tmp3;
   3630   INT32 tmp10, tmp11, tmp12, tmp13;
   3631   INT32 z1, z2, z3;
   3632   JCOEFPTR inptr;
   3633   ISLOW_MULT_TYPE * quantptr;
   3634   int * wsptr;
   3635   JSAMPROW outptr;
   3636   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   3637   int ctr;
   3638   int workspace[8*4];	/* buffers data between passes */
   3639   SHIFT_TEMPS
   3640 
   3641   /* Pass 1: process columns from input, store into work array.
   3642    * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
   3643    */
   3644   inptr = coef_block;
   3645   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   3646   wsptr = workspace;
   3647   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   3648     /* Even part */
   3649 
   3650     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   3651     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   3652 
   3653     tmp10 = (tmp0 + tmp2) << PASS1_BITS;
   3654     tmp12 = (tmp0 - tmp2) << PASS1_BITS;
   3655 
   3656     /* Odd part */
   3657     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
   3658 
   3659     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   3660     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   3661 
   3662     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
   3663     /* Add fudge factor here for final descale. */
   3664     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   3665     tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
   3666                        CONST_BITS-PASS1_BITS);
   3667     tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
   3668                        CONST_BITS-PASS1_BITS);
   3669 
   3670     /* Final output stage */
   3671 
   3672     wsptr[8*0] = (int) (tmp10 + tmp0);
   3673     wsptr[8*3] = (int) (tmp10 - tmp0);
   3674     wsptr[8*1] = (int) (tmp12 + tmp2);
   3675     wsptr[8*2] = (int) (tmp12 - tmp2);
   3676   }
   3677 
   3678   /* Pass 2: process rows from work array, store into output array. */
   3679   /* Note that we must descale the results by a factor of 8 == 2**3, */
   3680   /* and also undo the PASS1_BITS scaling. */
   3681 
   3682   wsptr = workspace;
   3683   for (ctr = 0; ctr < 4; ctr++) {
   3684     outptr = output_buf[ctr] + output_col;
   3685 
   3686     /* Even part: reverse the even part of the forward DCT. */
   3687     /* The rotator is sqrt(2)*c(-6). */
   3688 
   3689     z2 = (INT32) wsptr[2];
   3690     z3 = (INT32) wsptr[6];
   3691 
   3692     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
   3693     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
   3694     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
   3695 
   3696     /* Add fudge factor here for final descale. */
   3697     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   3698     z3 = (INT32) wsptr[4];
   3699 
   3700     tmp0 = (z2 + z3) << CONST_BITS;
   3701     tmp1 = (z2 - z3) << CONST_BITS;
   3702 
   3703     tmp10 = tmp0 + tmp2;
   3704     tmp13 = tmp0 - tmp2;
   3705     tmp11 = tmp1 + tmp3;
   3706     tmp12 = tmp1 - tmp3;
   3707 
   3708     /* Odd part per figure 8; the matrix is unitary and hence its
   3709      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
   3710      */
   3711 
   3712     tmp0 = (INT32) wsptr[7];
   3713     tmp1 = (INT32) wsptr[5];
   3714     tmp2 = (INT32) wsptr[3];
   3715     tmp3 = (INT32) wsptr[1];
   3716 
   3717     z2 = tmp0 + tmp2;
   3718     z3 = tmp1 + tmp3;
   3719 
   3720     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
   3721     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   3722     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
   3723     z2 += z1;
   3724     z3 += z1;
   3725 
   3726     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   3727     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
   3728     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
   3729     tmp0 += z1 + z2;
   3730     tmp3 += z1 + z3;
   3731 
   3732     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   3733     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   3734     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   3735     tmp1 += z1 + z3;
   3736     tmp2 += z1 + z2;
   3737 
   3738     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   3739 
   3740     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
   3741                                               CONST_BITS+PASS1_BITS+3)
   3742                             & RANGE_MASK];
   3743     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
   3744                                               CONST_BITS+PASS1_BITS+3)
   3745                             & RANGE_MASK];
   3746     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
   3747                                               CONST_BITS+PASS1_BITS+3)
   3748                             & RANGE_MASK];
   3749     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
   3750                                               CONST_BITS+PASS1_BITS+3)
   3751                             & RANGE_MASK];
   3752     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
   3753                                               CONST_BITS+PASS1_BITS+3)
   3754                             & RANGE_MASK];
   3755     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
   3756                                               CONST_BITS+PASS1_BITS+3)
   3757                             & RANGE_MASK];
   3758     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
   3759                                               CONST_BITS+PASS1_BITS+3)
   3760                             & RANGE_MASK];
   3761     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
   3762                                               CONST_BITS+PASS1_BITS+3)
   3763                             & RANGE_MASK];
   3764 
   3765     wsptr += DCTSIZE;		/* advance pointer to next row */
   3766   }
   3767 }
   3768 
   3769 
   3770 /*
   3771  * Perform dequantization and inverse DCT on one block of coefficients,
   3772  * producing a reduced-size 6x3 output block.
   3773  *
   3774  * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
   3775  */
   3776 
   3777 GLOBAL(void)
   3778 jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   3779                JCOEFPTR coef_block,
   3780                JSAMPARRAY output_buf, JDIMENSION output_col)
   3781 {
   3782   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
   3783   INT32 z1, z2, z3;
   3784   JCOEFPTR inptr;
   3785   ISLOW_MULT_TYPE * quantptr;
   3786   int * wsptr;
   3787   JSAMPROW outptr;
   3788   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   3789   int ctr;
   3790   int workspace[6*3];	/* buffers data between passes */
   3791   SHIFT_TEMPS
   3792 
   3793   /* Pass 1: process columns from input, store into work array.
   3794    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
   3795    */
   3796   inptr = coef_block;
   3797   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   3798   wsptr = workspace;
   3799   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
   3800     /* Even part */
   3801 
   3802     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   3803     tmp0 <<= CONST_BITS;
   3804     /* Add fudge factor here for final descale. */
   3805     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
   3806     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   3807     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
   3808     tmp10 = tmp0 + tmp12;
   3809     tmp2 = tmp0 - tmp12 - tmp12;
   3810 
   3811     /* Odd part */
   3812 
   3813     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   3814     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
   3815 
   3816     /* Final output stage */
   3817 
   3818     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
   3819     wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
   3820     wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
   3821   }
   3822 
   3823   /* Pass 2: process 3 rows from work array, store into output array.
   3824    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
   3825    */
   3826   wsptr = workspace;
   3827   for (ctr = 0; ctr < 3; ctr++) {
   3828     outptr = output_buf[ctr] + output_col;
   3829 
   3830     /* Even part */
   3831 
   3832     /* Add fudge factor here for final descale. */
   3833     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   3834     tmp0 <<= CONST_BITS;
   3835     tmp2 = (INT32) wsptr[4];
   3836     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
   3837     tmp1 = tmp0 + tmp10;
   3838     tmp11 = tmp0 - tmp10 - tmp10;
   3839     tmp10 = (INT32) wsptr[2];
   3840     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
   3841     tmp10 = tmp1 + tmp0;
   3842     tmp12 = tmp1 - tmp0;
   3843 
   3844     /* Odd part */
   3845 
   3846     z1 = (INT32) wsptr[1];
   3847     z2 = (INT32) wsptr[3];
   3848     z3 = (INT32) wsptr[5];
   3849     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
   3850     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
   3851     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
   3852     tmp1 = (z1 - z2 - z3) << CONST_BITS;
   3853 
   3854     /* Final output stage */
   3855 
   3856     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   3857                                               CONST_BITS+PASS1_BITS+3)
   3858                             & RANGE_MASK];
   3859     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   3860                                               CONST_BITS+PASS1_BITS+3)
   3861                             & RANGE_MASK];
   3862     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
   3863                                               CONST_BITS+PASS1_BITS+3)
   3864                             & RANGE_MASK];
   3865     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
   3866                                               CONST_BITS+PASS1_BITS+3)
   3867                             & RANGE_MASK];
   3868     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
   3869                                               CONST_BITS+PASS1_BITS+3)
   3870                             & RANGE_MASK];
   3871     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
   3872                                               CONST_BITS+PASS1_BITS+3)
   3873                             & RANGE_MASK];
   3874 
   3875     wsptr += 6;		/* advance pointer to next row */
   3876   }
   3877 }
   3878 
   3879 
   3880 /*
   3881  * Perform dequantization and inverse DCT on one block of coefficients,
   3882  * producing a 4x2 output block.
   3883  *
   3884  * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
   3885  */
   3886 
   3887 GLOBAL(void)
   3888 jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   3889                JCOEFPTR coef_block,
   3890                JSAMPARRAY output_buf, JDIMENSION output_col)
   3891 {
   3892   INT32 tmp0, tmp2, tmp10, tmp12;
   3893   INT32 z1, z2, z3;
   3894   JCOEFPTR inptr;
   3895   ISLOW_MULT_TYPE * quantptr;
   3896   INT32 * wsptr;
   3897   JSAMPROW outptr;
   3898   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   3899   int ctr;
   3900   INT32 workspace[4*2];	/* buffers data between passes */
   3901   SHIFT_TEMPS
   3902 
   3903   /* Pass 1: process columns from input, store into work array. */
   3904 
   3905   inptr = coef_block;
   3906   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   3907   wsptr = workspace;
   3908   for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
   3909     /* Even part */
   3910 
   3911     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   3912 
   3913     /* Odd part */
   3914 
   3915     tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   3916 
   3917     /* Final output stage */
   3918 
   3919     wsptr[4*0] = tmp10 + tmp0;
   3920     wsptr[4*1] = tmp10 - tmp0;
   3921   }
   3922 
   3923   /* Pass 2: process 2 rows from work array, store into output array.
   3924    * 4-point IDCT kernel,
   3925    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
   3926    */
   3927   wsptr = workspace;
   3928   for (ctr = 0; ctr < 2; ctr++) {
   3929     outptr = output_buf[ctr] + output_col;
   3930 
   3931     /* Even part */
   3932 
   3933     /* Add fudge factor here for final descale. */
   3934     tmp0 = wsptr[0] + (ONE << 2);
   3935     tmp2 = wsptr[2];
   3936 
   3937     tmp10 = (tmp0 + tmp2) << CONST_BITS;
   3938     tmp12 = (tmp0 - tmp2) << CONST_BITS;
   3939 
   3940     /* Odd part */
   3941     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
   3942 
   3943     z2 = wsptr[1];
   3944     z3 = wsptr[3];
   3945 
   3946     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
   3947     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
   3948     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
   3949 
   3950     /* Final output stage */
   3951 
   3952     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   3953                                               CONST_BITS+3)
   3954                             & RANGE_MASK];
   3955     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   3956                                               CONST_BITS+3)
   3957                             & RANGE_MASK];
   3958     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
   3959                                               CONST_BITS+3)
   3960                             & RANGE_MASK];
   3961     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
   3962                                               CONST_BITS+3)
   3963                             & RANGE_MASK];
   3964 
   3965     wsptr += 4;		/* advance pointer to next row */
   3966   }
   3967 }
   3968 
   3969 
   3970 /*
   3971  * Perform dequantization and inverse DCT on one block of coefficients,
   3972  * producing a 2x1 output block.
   3973  *
   3974  * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
   3975  */
   3976 
   3977 GLOBAL(void)
   3978 jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   3979                JCOEFPTR coef_block,
   3980                JSAMPARRAY output_buf, JDIMENSION output_col)
   3981 {
   3982   INT32 tmp0, tmp10;
   3983   ISLOW_MULT_TYPE * quantptr;
   3984   JSAMPROW outptr;
   3985   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   3986   SHIFT_TEMPS
   3987 
   3988   /* Pass 1: empty. */
   3989 
   3990   /* Pass 2: process 1 row from input, store into output array. */
   3991 
   3992   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   3993   outptr = output_buf[0] + output_col;
   3994 
   3995   /* Even part */
   3996 
   3997   tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]);
   3998   /* Add fudge factor here for final descale. */
   3999   tmp10 += ONE << 2;
   4000 
   4001   /* Odd part */
   4002 
   4003   tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]);
   4004 
   4005   /* Final output stage */
   4006 
   4007   outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) & RANGE_MASK];
   4008   outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) & RANGE_MASK];
   4009 }
   4010 
   4011 
   4012 /*
   4013  * Perform dequantization and inverse DCT on one block of coefficients,
   4014  * producing a 8x16 output block.
   4015  *
   4016  * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
   4017  */
   4018 
   4019 GLOBAL(void)
   4020 jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   4021                 JCOEFPTR coef_block,
   4022                 JSAMPARRAY output_buf, JDIMENSION output_col)
   4023 {
   4024   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   4025   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   4026   INT32 z1, z2, z3, z4;
   4027   JCOEFPTR inptr;
   4028   ISLOW_MULT_TYPE * quantptr;
   4029   int * wsptr;
   4030   JSAMPROW outptr;
   4031   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   4032   int ctr;
   4033   int workspace[8*16];	/* buffers data between passes */
   4034   SHIFT_TEMPS
   4035 
   4036   /* Pass 1: process columns from input, store into work array.
   4037    * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
   4038    */
   4039   inptr = coef_block;
   4040   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   4041   wsptr = workspace;
   4042   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   4043     /* Even part */
   4044 
   4045     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   4046     tmp0 <<= CONST_BITS;
   4047     /* Add fudge factor here for final descale. */
   4048     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
   4049 
   4050     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   4051     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   4052     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   4053 
   4054     tmp10 = tmp0 + tmp1;
   4055     tmp11 = tmp0 - tmp1;
   4056     tmp12 = tmp0 + tmp2;
   4057     tmp13 = tmp0 - tmp2;
   4058 
   4059     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   4060     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   4061     z3 = z1 - z2;
   4062     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   4063     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   4064 
   4065     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   4066     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   4067     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   4068     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   4069 
   4070     tmp20 = tmp10 + tmp0;
   4071     tmp27 = tmp10 - tmp0;
   4072     tmp21 = tmp12 + tmp1;
   4073     tmp26 = tmp12 - tmp1;
   4074     tmp22 = tmp13 + tmp2;
   4075     tmp25 = tmp13 - tmp2;
   4076     tmp23 = tmp11 + tmp3;
   4077     tmp24 = tmp11 - tmp3;
   4078 
   4079     /* Odd part */
   4080 
   4081     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   4082     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   4083     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   4084     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   4085 
   4086     tmp11 = z1 + z3;
   4087 
   4088     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   4089     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   4090     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   4091     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   4092     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   4093     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   4094     tmp0  = tmp1 + tmp2 + tmp3 -
   4095             MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   4096     tmp13 = tmp10 + tmp11 + tmp12 -
   4097             MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   4098     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   4099     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   4100     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   4101     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   4102     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   4103     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   4104     z2    += z4;
   4105     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
   4106     tmp1  += z1;
   4107     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   4108     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
   4109     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   4110     tmp12 += z2;
   4111     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
   4112     tmp2  += z2;
   4113     tmp3  += z2;
   4114     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   4115     tmp10 += z2;
   4116     tmp11 += z2;
   4117 
   4118     /* Final output stage */
   4119 
   4120     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
   4121     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
   4122     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
   4123     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
   4124     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
   4125     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
   4126     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
   4127     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
   4128     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
   4129     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
   4130     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
   4131     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
   4132     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
   4133     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
   4134     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
   4135     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
   4136   }
   4137 
   4138   /* Pass 2: process rows from work array, store into output array. */
   4139   /* Note that we must descale the results by a factor of 8 == 2**3, */
   4140   /* and also undo the PASS1_BITS scaling. */
   4141 
   4142   wsptr = workspace;
   4143   for (ctr = 0; ctr < 16; ctr++) {
   4144     outptr = output_buf[ctr] + output_col;
   4145 
   4146     /* Even part: reverse the even part of the forward DCT. */
   4147     /* The rotator is sqrt(2)*c(-6). */
   4148 
   4149     z2 = (INT32) wsptr[2];
   4150     z3 = (INT32) wsptr[6];
   4151 
   4152     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
   4153     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
   4154     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
   4155 
   4156     /* Add fudge factor here for final descale. */
   4157     z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   4158     z3 = (INT32) wsptr[4];
   4159 
   4160     tmp0 = (z2 + z3) << CONST_BITS;
   4161     tmp1 = (z2 - z3) << CONST_BITS;
   4162 
   4163     tmp10 = tmp0 + tmp2;
   4164     tmp13 = tmp0 - tmp2;
   4165     tmp11 = tmp1 + tmp3;
   4166     tmp12 = tmp1 - tmp3;
   4167 
   4168     /* Odd part per figure 8; the matrix is unitary and hence its
   4169      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
   4170      */
   4171 
   4172     tmp0 = (INT32) wsptr[7];
   4173     tmp1 = (INT32) wsptr[5];
   4174     tmp2 = (INT32) wsptr[3];
   4175     tmp3 = (INT32) wsptr[1];
   4176 
   4177     z2 = tmp0 + tmp2;
   4178     z3 = tmp1 + tmp3;
   4179 
   4180     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
   4181     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   4182     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
   4183     z2 += z1;
   4184     z3 += z1;
   4185 
   4186     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   4187     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
   4188     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
   4189     tmp0 += z1 + z2;
   4190     tmp3 += z1 + z3;
   4191 
   4192     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   4193     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   4194     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   4195     tmp1 += z1 + z3;
   4196     tmp2 += z1 + z2;
   4197 
   4198     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   4199 
   4200     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
   4201                                               CONST_BITS+PASS1_BITS+3)
   4202                             & RANGE_MASK];
   4203     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
   4204                                               CONST_BITS+PASS1_BITS+3)
   4205                             & RANGE_MASK];
   4206     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
   4207                                               CONST_BITS+PASS1_BITS+3)
   4208                             & RANGE_MASK];
   4209     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
   4210                                               CONST_BITS+PASS1_BITS+3)
   4211                             & RANGE_MASK];
   4212     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
   4213                                               CONST_BITS+PASS1_BITS+3)
   4214                             & RANGE_MASK];
   4215     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
   4216                                               CONST_BITS+PASS1_BITS+3)
   4217                             & RANGE_MASK];
   4218     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
   4219                                               CONST_BITS+PASS1_BITS+3)
   4220                             & RANGE_MASK];
   4221     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
   4222                                               CONST_BITS+PASS1_BITS+3)
   4223                             & RANGE_MASK];
   4224 
   4225     wsptr += DCTSIZE;		/* advance pointer to next row */
   4226   }
   4227 }
   4228 
   4229 
   4230 /*
   4231  * Perform dequantization and inverse DCT on one block of coefficients,
   4232  * producing a 7x14 output block.
   4233  *
   4234  * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
         *
         * As used below:
         *   cinfo      - handed to IDCT_range_limit() to obtain the table that
         *                the final samples are looked up in.
         *   compptr    - its dct_table is read as an array of ISLOW_MULT_TYPE
         *                and passed, entry by entry, to DEQUANTIZE.
         *   coef_block - input coefficients; the first 7 columns are read,
         *                eight entries each at stride DCTSIZE.
         *   output_buf - array of output rows; rows 0..13 are written.
         *   output_col - offset added to each row pointer; 7 samples are
         *                stored per row starting at that position.
         *
         * DEQUANTIZE, MULTIPLY, FIX, RIGHT_SHIFT, ONE, CONST_BITS, PASS1_BITS
         * and RANGE_MASK are defined outside this function.
         *
         * NOTE(review): several temporaries are shifted left with "<<".  If
         * INT32 is a signed type and the value is negative, ISO C leaves that
         * shift undefined -- confirm the supported compilers treat it as a
         * multiplication by a power of two.
   4235  */
   4236 
   4237 GLOBAL(void)
   4238 jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   4239                 JCOEFPTR coef_block,
   4240                 JSAMPARRAY output_buf, JDIMENSION output_col)
   4241 {
   4242   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   4243   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   4244   INT32 z1, z2, z3, z4;
   4245   JCOEFPTR inptr;
   4246   ISLOW_MULT_TYPE * quantptr;
   4247   int * wsptr;
   4248   JSAMPROW outptr;
   4249   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   4250   int ctr;
   4251   int workspace[7*14];	/* buffers data between passes */
   4252   SHIFT_TEMPS
   4253 
   4254   /* Pass 1: process columns from input, store into work array.
   4255    * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
   4256    */
   4257   inptr = coef_block;
   4258   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   4259   wsptr = workspace;
          /* One iteration per input column.  The 14 results of a column are
           * stored down the same workspace column, i.e. at stride 7.
           */
   4260   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
   4261     /* Even part */
   4262 
   4263     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   4264     z1 <<= CONST_BITS;
   4265     /* Add fudge factor here for final descale. */
            /* The constant is half of 2**(CONST_BITS-PASS1_BITS), the amount
             * shifted out again in the output stage of this pass.
             */
   4266     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   4267     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   4268     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   4269     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   4270     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   4271 
   4272     tmp10 = z1 + z2;
   4273     tmp11 = z1 + z3;
   4274     tmp12 = z1 - z4;
   4275 
            /* tmp23 is descaled right here: it only feeds workspace rows 3 and
             * 10, whose odd-part term (tmp13 at the end of the odd part) is
             * produced without any multiplication.
             */
   4276     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
   4277                         CONST_BITS-PASS1_BITS);
   4278 
   4279     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   4280     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   4281 
   4282     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   4283 
   4284     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   4285     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   4286     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   4287             MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   4288 
            /* tmp20..tmp26 (tmp23 set above) are the even-part terms for
             * workspace rows 0..6; tmp10..tmp15 are free for reuse afterwards.
             */
   4289     tmp20 = tmp10 + tmp13;
   4290     tmp26 = tmp10 - tmp13;
   4291     tmp21 = tmp11 + tmp14;
   4292     tmp25 = tmp11 - tmp14;
   4293     tmp22 = tmp12 + tmp15;
   4294     tmp24 = tmp12 - tmp15;
   4295 
   4296     /* Odd part */
   4297 
   4298     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   4299     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   4300     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   4301     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
            /* Until its reassignment at the end of this part, tmp13 holds the
             * row-7 coefficient shifted up by CONST_BITS.
             */
   4302     tmp13 = z4 << CONST_BITS;
   4303 
   4304     tmp14 = z1 + z3;
   4305     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   4306     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   4307     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   4308     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   4309     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
            /* z1 is modified in place from here on (first -z2, then +z4), and
             * z4 is overwritten with products only after z1 has absorbed it;
             * the statement order matters.
             */
   4310     z1    -= z2;
   4311     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
   4312     tmp16 += tmp15;
   4313     z1    += z4;
   4314     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
   4315     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
   4316     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
   4317     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   4318     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   4319     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
   4320 
            /* z1 now equals in1 - in3 + in7 and z3 is still in5 (the
             * dequantized odd inputs), so tmp13 becomes (in1 - in3 - in5 + in7)
             * at PASS1_BITS scale: the odd term for rows 3 and 10.
             */
   4321     tmp13 = (z1 - z3) << PASS1_BITS;
   4322 
   4323     /* Final output stage */
            /* Rows k and 13-k receive even+odd and even-odd respectively. */
   4324 
   4325     wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   4326     wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   4327     wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   4328     wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   4329     wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   4330     wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
            /* No shift here: tmp23 and tmp13 are already at workspace scale. */
   4331     wsptr[7*3]  = (int) (tmp23 + tmp13);
   4332     wsptr[7*10] = (int) (tmp23 - tmp13);
   4333     wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   4334     wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   4335     wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   4336     wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   4337     wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
   4338     wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
   4339   }
   4340 
   4341   /* Pass 2: process 14 rows from work array, store into output array.
   4342    * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
   4343    */
   4344   wsptr = workspace;
          /* Each workspace row of 7 ints yields one output row of 7 samples. */
   4345   for (ctr = 0; ctr < 14; ctr++) {
   4346     outptr = output_buf[ctr] + output_col;
   4347 
   4348     /* Even part */
   4349 
   4350     /* Add fudge factor here for final descale. */
            /* After the shift by CONST_BITS the added constant equals
             * 2**(CONST_BITS+PASS1_BITS+2), half the divisor implied by the
             * final RIGHT_SHIFT of CONST_BITS+PASS1_BITS+3 bits.
             */
   4351     tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   4352     tmp23 <<= CONST_BITS;
   4353 
   4354     z1 = (INT32) wsptr[2];
   4355     z2 = (INT32) wsptr[4];
   4356     z3 = (INT32) wsptr[6];
   4357 
   4358     tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
   4359     tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
   4360     tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
   4361     tmp10 = z1 + z3;
            /* z2 becomes in4 - in2 - in6; it is used only for tmp23 below. */
   4362     z2 -= tmp10;
   4363     tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
   4364     tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
   4365     tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
   4366     tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
   4367 
   4368     /* Odd part */
   4369 
   4370     z1 = (INT32) wsptr[1];
   4371     z2 = (INT32) wsptr[3];
   4372     z3 = (INT32) wsptr[5];
   4373 
   4374     tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
   4375     tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
   4376     tmp10 = tmp11 - tmp12;
   4377     tmp11 += tmp12;
   4378     tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
   4379     tmp11 += tmp12;
   4380     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
   4381     tmp10 += z2;
   4382     tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
   4383 
   4384     /* Final output stage */
            /* Samples k and 6-k take even+odd and even-odd; the centre sample
             * (index 3) has no odd contribution.  Every value is shifted down,
             * masked with RANGE_MASK and looked up in range_limit.
             */
   4385 
   4386     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   4387                                               CONST_BITS+PASS1_BITS+3)
   4388                             & RANGE_MASK];
   4389     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   4390                                               CONST_BITS+PASS1_BITS+3)
   4391                             & RANGE_MASK];
   4392     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   4393                                               CONST_BITS+PASS1_BITS+3)
   4394                             & RANGE_MASK];
   4395     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   4396                                               CONST_BITS+PASS1_BITS+3)
   4397                             & RANGE_MASK];
   4398     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   4399                                               CONST_BITS+PASS1_BITS+3)
   4400                             & RANGE_MASK];
   4401     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   4402                                               CONST_BITS+PASS1_BITS+3)
   4403                             & RANGE_MASK];
   4404     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
   4405                                               CONST_BITS+PASS1_BITS+3)
   4406                             & RANGE_MASK];
   4407 
   4408     wsptr += 7;		/* advance pointer to next row */
   4409   }
   4410 }
   4411 
   4412 
   4413 /*
   4414  * Perform dequantization and inverse DCT on one block of coefficients,
   4415  * producing a 6x12 output block.
   4416  *
   4417  * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
         *
         * As used below:
         *   cinfo      - handed to IDCT_range_limit() to obtain the table that
         *                the final samples are looked up in.
         *   compptr    - its dct_table is read as an array of ISLOW_MULT_TYPE
         *                and passed, entry by entry, to DEQUANTIZE.
         *   coef_block - input coefficients; the first 6 columns are read,
         *                eight entries each at stride DCTSIZE.
         *   output_buf - array of output rows; rows 0..11 are written.
         *   output_col - offset added to each row pointer; 6 samples are
         *                stored per row starting at that position.
         *
         * DEQUANTIZE, MULTIPLY, FIX, the FIX_n_nnn constants, RIGHT_SHIFT, ONE,
         * CONST_BITS, PASS1_BITS and RANGE_MASK are defined outside this
         * function.
         *
         * NOTE(review): several temporaries are shifted left with "<<".  If
         * INT32 is a signed type and the value is negative, ISO C leaves that
         * shift undefined -- confirm the supported compilers treat it as a
         * multiplication by a power of two.
   4418  */
   4419 
   4420 GLOBAL(void)
   4421 jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   4422                 JCOEFPTR coef_block,
   4423                 JSAMPARRAY output_buf, JDIMENSION output_col)
   4424 {
   4425   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   4426   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   4427   INT32 z1, z2, z3, z4;
   4428   JCOEFPTR inptr;
   4429   ISLOW_MULT_TYPE * quantptr;
   4430   int * wsptr;
   4431   JSAMPROW outptr;
   4432   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   4433   int ctr;
   4434   int workspace[6*12];	/* buffers data between passes */
   4435   SHIFT_TEMPS
   4436 
   4437   /* Pass 1: process columns from input, store into work array.
   4438    * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
   4439    */
   4440   inptr = coef_block;
   4441   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   4442   wsptr = workspace;
          /* One iteration per input column.  The 12 results of a column are
           * stored down the same workspace column, i.e. at stride 6.
           */
   4443   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
   4444     /* Even part */
   4445 
   4446     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   4447     z3 <<= CONST_BITS;
   4448     /* Add fudge factor here for final descale. */
            /* The constant is half of 2**(CONST_BITS-PASS1_BITS), the amount
             * shifted out again in the output stage of this pass.
             */
   4449     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
   4450 
   4451     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   4452     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   4453 
   4454     tmp10 = z3 + z4;
   4455     tmp11 = z3 - z4;
   4456 
            /* z4 is reused: from here it holds the c2 product of input 2,
             * while z1 and z2 hold inputs 2 and 6 shifted up by CONST_BITS.
             */
   4457     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   4458     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   4459     z1 <<= CONST_BITS;
   4460     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   4461     z2 <<= CONST_BITS;
   4462 
            /* tmp12 is a scratch value that is recomputed three times below;
             * each value is consumed by the two assignments that follow it.
             */
   4463     tmp12 = z1 - z2;
   4464 
   4465     tmp21 = z3 + tmp12;
   4466     tmp24 = z3 - tmp12;
   4467 
   4468     tmp12 = z4 + z2;
   4469 
   4470     tmp20 = tmp10 + tmp12;
   4471     tmp25 = tmp10 - tmp12;
   4472 
   4473     tmp12 = z4 - z1 - z2;
   4474 
   4475     tmp22 = tmp11 + tmp12;
   4476     tmp23 = tmp11 - tmp12;
   4477 
   4478     /* Odd part */
   4479 
   4480     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   4481     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   4482     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   4483     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   4484 
            /* tmp11 and tmp14 first hold two products of input 3 that are
             * shared by tmp10, tmp12, tmp13 and tmp15; both are recomputed at
             * the end of this part as the odd terms for rows 1/10 and 4/7.
             */
   4485     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   4486     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
   4487 
   4488     tmp10 = z1 + z3;
   4489     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   4490     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   4491     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   4492     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
   4493     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   4494     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   4495     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   4496              MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   4497 
            /* z1 becomes in1 - in7 and z2 becomes in3 - in5; z3 is then
             * overwritten with their shared c9 product.
             */
   4498     z1 -= z4;
   4499     z2 -= z3;
   4500     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   4501     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   4502     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   4503 
   4504     /* Final output stage */
            /* Rows k and 11-k receive even+odd and even-odd respectively. */
   4505 
   4506     wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   4507     wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   4508     wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   4509     wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   4510     wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   4511     wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   4512     wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   4513     wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   4514     wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   4515     wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   4516     wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   4517     wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   4518   }
   4519 
   4520   /* Pass 2: process 12 rows from work array, store into output array.
   4521    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
   4522    */
   4523   wsptr = workspace;
          /* Each workspace row of 6 ints yields one output row of 6 samples. */
   4524   for (ctr = 0; ctr < 12; ctr++) {
   4525     outptr = output_buf[ctr] + output_col;
   4526 
   4527     /* Even part */
   4528 
   4529     /* Add fudge factor here for final descale. */
            /* After the shift by CONST_BITS the added constant equals
             * 2**(CONST_BITS+PASS1_BITS+2), half the divisor implied by the
             * final RIGHT_SHIFT of CONST_BITS+PASS1_BITS+3 bits.
             */
   4530     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   4531     tmp10 <<= CONST_BITS;
   4532     tmp12 = (INT32) wsptr[4];
   4533     tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
   4534     tmp11 = tmp10 + tmp20;
            /* i.e. tmp10 minus twice the c4 product */
   4535     tmp21 = tmp10 - tmp20 - tmp20;
            /* tmp20 and tmp10 are reused as scratch for the wsptr[2] term
             * before tmp20 receives its final even-part value.
             */
   4536     tmp20 = (INT32) wsptr[2];
   4537     tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
   4538     tmp20 = tmp11 + tmp10;
   4539     tmp22 = tmp11 - tmp10;
   4540 
   4541     /* Odd part */
   4542 
   4543     z1 = (INT32) wsptr[1];
   4544     z2 = (INT32) wsptr[3];
   4545     z3 = (INT32) wsptr[5];
            /* tmp11 holds the shared c5 product until tmp10 and tmp12 have
             * used it, then is overwritten with the middle odd term.
             */
   4546     tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
   4547     tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
   4548     tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
   4549     tmp11 = (z1 - z2 - z3) << CONST_BITS;
   4550 
   4551     /* Final output stage */
            /* Samples k and 5-k take even+odd and even-odd.  Every value is
             * shifted down, masked with RANGE_MASK and looked up in range_limit.
             */
   4552 
   4553     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   4554                                               CONST_BITS+PASS1_BITS+3)
   4555                             & RANGE_MASK];
   4556     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   4557                                               CONST_BITS+PASS1_BITS+3)
   4558                             & RANGE_MASK];
   4559     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   4560                                               CONST_BITS+PASS1_BITS+3)
   4561                             & RANGE_MASK];
   4562     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   4563                                               CONST_BITS+PASS1_BITS+3)
   4564                             & RANGE_MASK];
   4565     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   4566                                               CONST_BITS+PASS1_BITS+3)
   4567                             & RANGE_MASK];
   4568     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   4569                                               CONST_BITS+PASS1_BITS+3)
   4570                             & RANGE_MASK];
   4571 
   4572     wsptr += 6;		/* advance pointer to next row */
   4573   }
   4574 }
   4575 
   4576 
   4577 /*
   4578  * Perform dequantization and inverse DCT on one block of coefficients,
   4579  * producing a 5x10 output block.
   4580  *
   4581  * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
         *
         * As used below:
         *   cinfo      - handed to IDCT_range_limit() to obtain the table that
         *                the final samples are looked up in.
         *   compptr    - its dct_table is read as an array of ISLOW_MULT_TYPE
         *                and passed, entry by entry, to DEQUANTIZE.
         *   coef_block - input coefficients; the first 5 columns are read,
         *                eight entries each at stride DCTSIZE.
         *   output_buf - array of output rows; rows 0..9 are written.
         *   output_col - offset added to each row pointer; 5 samples are
         *                stored per row starting at that position.
         *
         * DEQUANTIZE, MULTIPLY, FIX, RIGHT_SHIFT, ONE, CONST_BITS, PASS1_BITS
         * and RANGE_MASK are defined outside this function.
         *
         * NOTE(review): several temporaries are shifted left with "<<".  If
         * INT32 is a signed type and the value is negative, ISO C leaves that
         * shift undefined -- confirm the supported compilers treat it as a
         * multiplication by a power of two.
   4582  */
   4583 
   4584 GLOBAL(void)
   4585 jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   4586                 JCOEFPTR coef_block,
   4587                 JSAMPARRAY output_buf, JDIMENSION output_col)
   4588 {
   4589   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
   4590   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
   4591   INT32 z1, z2, z3, z4, z5;
   4592   JCOEFPTR inptr;
   4593   ISLOW_MULT_TYPE * quantptr;
   4594   int * wsptr;
   4595   JSAMPROW outptr;
   4596   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   4597   int ctr;
   4598   int workspace[5*10];	/* buffers data between passes */
   4599   SHIFT_TEMPS
   4600 
   4601   /* Pass 1: process columns from input, store into work array.
   4602    * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
   4603    */
   4604   inptr = coef_block;
   4605   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   4606   wsptr = workspace;
          /* One iteration per input column.  The 10 results of a column are
           * stored down the same workspace column, i.e. at stride 5.
           */
   4607   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
   4608     /* Even part */
   4609 
   4610     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   4611     z3 <<= CONST_BITS;
   4612     /* Add fudge factor here for final descale. */
            /* The constant is half of 2**(CONST_BITS-PASS1_BITS), the amount
             * shifted out again in the output stage of this pass.
             */
   4613     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
   4614     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   4615     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   4616     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   4617     tmp10 = z3 + z1;
   4618     tmp11 = z3 - z2;
   4619 
            /* tmp22 is descaled right here: it only feeds workspace rows 2 and
             * 7, whose odd-part term (tmp12 in the odd part) is produced
             * without any multiplication.
             */
   4620     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
   4621                         CONST_BITS-PASS1_BITS);
   4622 
   4623     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   4624     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   4625 
   4626     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   4627     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   4628     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   4629 
            /* tmp20..tmp24 (tmp22 set above) are the even-part terms for
             * workspace rows 0..4; tmp10..tmp13 are free for reuse afterwards.
             */
   4630     tmp20 = tmp10 + tmp12;
   4631     tmp24 = tmp10 - tmp12;
   4632     tmp21 = tmp11 + tmp13;
   4633     tmp23 = tmp11 - tmp13;
   4634 
   4635     /* Odd part */
   4636 
   4637     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   4638     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   4639     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   4640     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   4641 
            /* tmp11 = in3 + in7 and tmp13 = in3 - in7 stay live until tmp12 has
             * been recomputed below; z2 and z4 are recycled as scratch right
             * after this point.
             */
   4642     tmp11 = z2 + z4;
   4643     tmp13 = z2 - z4;
   4644 
   4645     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
            /* z5 is input 5 shifted up by CONST_BITS. */
   4646     z5 = z3 << CONST_BITS;
   4647 
   4648     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   4649     z4 = z5 + tmp12;
   4650 
   4651     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   4652     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   4653 
   4654     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   4655     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
   4656 
            /* tmp12 is reused: it becomes (in1 - in3 - in5 + in7) at PASS1_BITS
             * scale, the odd term for rows 2 and 7.  This must precede the
             * reassignment of tmp13 two statements below.
             */
   4657     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
   4658 
   4659     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   4660     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   4661 
   4662     /* Final output stage */
            /* Rows k and 9-k receive even+odd and even-odd respectively; rows 2
             * and 7 need no shift because tmp22 and tmp12 are already at
             * workspace scale.
             */
   4663 
   4664     wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   4665     wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   4666     wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   4667     wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   4668     wsptr[5*2] = (int) (tmp22 + tmp12);
   4669     wsptr[5*7] = (int) (tmp22 - tmp12);
   4670     wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   4671     wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   4672     wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   4673     wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   4674   }
   4675 
   4676   /* Pass 2: process 10 rows from work array, store into output array.
   4677    * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
   4678    */
   4679   wsptr = workspace;
          /* Each workspace row of 5 ints yields one output row of 5 samples. */
   4680   for (ctr = 0; ctr < 10; ctr++) {
   4681     outptr = output_buf[ctr] + output_col;
   4682 
   4683     /* Even part */
   4684 
   4685     /* Add fudge factor here for final descale. */
            /* After the shift by CONST_BITS the added constant equals
             * 2**(CONST_BITS+PASS1_BITS+2), half the divisor implied by the
             * final RIGHT_SHIFT of CONST_BITS+PASS1_BITS+3 bits.
             */
   4686     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   4687     tmp12 <<= CONST_BITS;
   4688     tmp13 = (INT32) wsptr[2];
   4689     tmp14 = (INT32) wsptr[4];
   4690     z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
   4691     z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
   4692     z3 = tmp12 + z2;
   4693     tmp10 = z3 + z1;
   4694     tmp11 = z3 - z1;
            /* Centre term: the DC value minus four times the (c2-c4)/2 product. */
   4695     tmp12 -= z2 << 2;
   4696 
   4697     /* Odd part */
   4698 
   4699     z2 = (INT32) wsptr[1];
   4700     z3 = (INT32) wsptr[3];
   4701 
            /* tmp13 and tmp14 are reused here as the two odd-part terms. */
   4702     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
   4703     tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
   4704     tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
   4705 
   4706     /* Final output stage */
            /* Samples k and 4-k take even+odd and even-odd; the centre sample
             * (index 2) has no odd contribution.  Every value is shifted down,
             * masked with RANGE_MASK and looked up in range_limit.
             */
   4707 
   4708     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
   4709                                               CONST_BITS+PASS1_BITS+3)
   4710                             & RANGE_MASK];
   4711     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
   4712                                               CONST_BITS+PASS1_BITS+3)
   4713                             & RANGE_MASK];
   4714     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
   4715                                               CONST_BITS+PASS1_BITS+3)
   4716                             & RANGE_MASK];
   4717     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
   4718                                               CONST_BITS+PASS1_BITS+3)
   4719                             & RANGE_MASK];
   4720     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
   4721                                               CONST_BITS+PASS1_BITS+3)
   4722                             & RANGE_MASK];
   4723 
   4724     wsptr += 5;		/* advance pointer to next row */
   4725   }
   4726 }
   4727 
   4728 
   4729 /*
   4730  * Perform dequantization and inverse DCT on one block of coefficients,
   4731  * producing a 4x8 output block.
   4732  *
   4733  * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
   4734  */
   4735 
   4736 GLOBAL(void)
   4737 jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   4738                JCOEFPTR coef_block,
   4739                JSAMPARRAY output_buf, JDIMENSION output_col)
   4740 {
   4741   INT32 tmp0, tmp1, tmp2, tmp3;
   4742   INT32 tmp10, tmp11, tmp12, tmp13;
   4743   INT32 z1, z2, z3;
   4744   JCOEFPTR inptr;
   4745   ISLOW_MULT_TYPE * quantptr;
   4746   int * wsptr;
   4747   JSAMPROW outptr;
   4748   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   4749   int ctr;
   4750   int workspace[4*8];	/* buffers data between passes */
   4751   SHIFT_TEMPS
   4752 
   4753   /* Pass 1: process columns from input, store into work array. */
   4754   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
   4755   /* furthermore, we scale the results by 2**PASS1_BITS. */
   4756 
   4757   inptr = coef_block;
   4758   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   4759   wsptr = workspace;
   4760   for (ctr = 4; ctr > 0; ctr--) {
   4761     /* Due to quantization, we will usually find that many of the input
   4762      * coefficients are zero, especially the AC terms.  We can exploit this
   4763      * by short-circuiting the IDCT calculation for any column in which all
   4764      * the AC terms are zero.  In that case each output is equal to the
   4765      * DC coefficient (with scale factor as needed).
   4766      * With typical images and quantization tables, half or more of the
   4767      * column DCT calculations can be simplified this way.
   4768      */
   4769 
   4770     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
   4771         inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
   4772         inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
   4773         inptr[DCTSIZE*7] == 0) {
   4774       /* AC terms all zero */
   4775       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
   4776 
   4777       wsptr[4*0] = dcval;
   4778       wsptr[4*1] = dcval;
   4779       wsptr[4*2] = dcval;
   4780       wsptr[4*3] = dcval;
   4781       wsptr[4*4] = dcval;
   4782       wsptr[4*5] = dcval;
   4783       wsptr[4*6] = dcval;
   4784       wsptr[4*7] = dcval;
   4785 
   4786       inptr++;			/* advance pointers to next column */
   4787       quantptr++;
   4788       wsptr++;
   4789       continue;
   4790     }
   4791 
   4792     /* Even part: reverse the even part of the forward DCT. */
   4793     /* The rotator is sqrt(2)*c(-6). */
   4794 
   4795     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   4796     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   4797 
   4798     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
   4799     tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
   4800     tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
   4801 
   4802     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   4803     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   4804     z2 <<= CONST_BITS;
   4805     z3 <<= CONST_BITS;
   4806     /* Add fudge factor here for final descale. */
   4807     z2 += ONE << (CONST_BITS-PASS1_BITS-1);
   4808 
   4809     tmp0 = z2 + z3;
   4810     tmp1 = z2 - z3;
   4811 
   4812     tmp10 = tmp0 + tmp2;
   4813     tmp13 = tmp0 - tmp2;
   4814     tmp11 = tmp1 + tmp3;
   4815     tmp12 = tmp1 - tmp3;
   4816 
   4817     /* Odd part per figure 8; the matrix is unitary and hence its
   4818      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
   4819      */
   4820 
   4821     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   4822     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   4823     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   4824     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   4825 
   4826     z2 = tmp0 + tmp2;
   4827     z3 = tmp1 + tmp3;
   4828 
   4829     z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
   4830     z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
   4831     z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
   4832     z2 += z1;
   4833     z3 += z1;
   4834 
   4835     z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
   4836     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
   4837     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
   4838     tmp0 += z1 + z2;
   4839     tmp3 += z1 + z3;
   4840 
   4841     z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
   4842     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
   4843     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
   4844     tmp1 += z1 + z3;
   4845     tmp2 += z1 + z2;
   4846 
   4847     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   4848 
   4849     wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
   4850     wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
   4851     wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
   4852     wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
   4853     wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
   4854     wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
   4855     wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
   4856     wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
   4857 
   4858     inptr++;			/* advance pointers to next column */
   4859     quantptr++;
   4860     wsptr++;
   4861   }
   4862 
   4863   /* Pass 2: process 8 rows from work array, store into output array.
   4864    * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
   4865    */
   4866   wsptr = workspace;
   4867   for (ctr = 0; ctr < 8; ctr++) {
   4868     outptr = output_buf[ctr] + output_col;
   4869 
   4870     /* Even part */
   4871 
   4872     /* Add fudge factor here for final descale. */
   4873     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   4874     tmp2 = (INT32) wsptr[2];
   4875 
   4876     tmp10 = (tmp0 + tmp2) << CONST_BITS;
   4877     tmp12 = (tmp0 - tmp2) << CONST_BITS;
   4878 
   4879     /* Odd part */
   4880     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
   4881 
   4882     z2 = (INT32) wsptr[1];
   4883     z3 = (INT32) wsptr[3];
   4884 
   4885     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
   4886     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
   4887     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
   4888 
   4889     /* Final output stage */
   4890 
   4891     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   4892                                               CONST_BITS+PASS1_BITS+3)
   4893                             & RANGE_MASK];
   4894     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   4895                                               CONST_BITS+PASS1_BITS+3)
   4896                             & RANGE_MASK];
   4897     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
   4898                                               CONST_BITS+PASS1_BITS+3)
   4899                             & RANGE_MASK];
   4900     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
   4901                                               CONST_BITS+PASS1_BITS+3)
   4902                             & RANGE_MASK];
   4903 
   4904     wsptr += 4;		/* advance pointer to next row */
   4905   }
   4906 }
   4907 
   4908 
   4909 /*
   4910  * Perform dequantization and inverse DCT on one block of coefficients,
   4911  * producing a reduced-size 3x6 output block.
   4912  *
   4913  * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
   4914  */
   4915 
   4916 GLOBAL(void)
   4917 jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   4918                JCOEFPTR coef_block,
   4919                JSAMPARRAY output_buf, JDIMENSION output_col)
   4920 {
   4921   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
   4922   INT32 z1, z2, z3;
   4923   JCOEFPTR inptr;
   4924   ISLOW_MULT_TYPE * quantptr;
   4925   int * wsptr;
   4926   JSAMPROW outptr;
   4927   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   4928   int ctr;
   4929   int workspace[3*6];	/* buffers data between passes */
   4930   SHIFT_TEMPS
   4931 
   4932   /* Pass 1: process columns from input, store into work array.
   4933    * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
   4934    */
   4935   inptr = coef_block;
   4936   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   4937   wsptr = workspace;
   4938   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
   4939     /* Even part */
   4940 
   4941     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   4942     tmp0 <<= CONST_BITS;
   4943     /* Add fudge factor here for final descale. */
   4944     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
   4945     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   4946     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
   4947     tmp1 = tmp0 + tmp10;
   4948     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
   4949     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   4950     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
   4951     tmp10 = tmp1 + tmp0;
   4952     tmp12 = tmp1 - tmp0;
   4953 
   4954     /* Odd part */
   4955 
   4956     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   4957     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   4958     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   4959     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
   4960     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
   4961     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
   4962     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
   4963 
   4964     /* Final output stage */
   4965 
   4966     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
   4967     wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
   4968     wsptr[3*1] = (int) (tmp11 + tmp1);
   4969     wsptr[3*4] = (int) (tmp11 - tmp1);
   4970     wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
   4971     wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
   4972   }
   4973 
   4974   /* Pass 2: process 6 rows from work array, store into output array.
   4975    * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
   4976    */
   4977   wsptr = workspace;
   4978   for (ctr = 0; ctr < 6; ctr++) {
   4979     outptr = output_buf[ctr] + output_col;
   4980 
   4981     /* Even part */
   4982 
   4983     /* Add fudge factor here for final descale. */
   4984     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   4985     tmp0 <<= CONST_BITS;
   4986     tmp2 = (INT32) wsptr[2];
   4987     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
   4988     tmp10 = tmp0 + tmp12;
   4989     tmp2 = tmp0 - tmp12 - tmp12;
   4990 
   4991     /* Odd part */
   4992 
   4993     tmp12 = (INT32) wsptr[1];
   4994     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
   4995 
   4996     /* Final output stage */
   4997 
   4998     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   4999                                               CONST_BITS+PASS1_BITS+3)
   5000                             & RANGE_MASK];
   5001     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   5002                                               CONST_BITS+PASS1_BITS+3)
   5003                             & RANGE_MASK];
   5004     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
   5005                                               CONST_BITS+PASS1_BITS+3)
   5006                             & RANGE_MASK];
   5007 
   5008     wsptr += 3;		/* advance pointer to next row */
   5009   }
   5010 }
   5011 
   5012 
   5013 /*
   5014  * Perform dequantization and inverse DCT on one block of coefficients,
   5015  * producing a 2x4 output block.
   5016  *
   5017  * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
   5018  */
   5019 
   5020 GLOBAL(void)
   5021 jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   5022                JCOEFPTR coef_block,
   5023                JSAMPARRAY output_buf, JDIMENSION output_col)
   5024 {
   5025   INT32 tmp0, tmp2, tmp10, tmp12;
   5026   INT32 z1, z2, z3;
   5027   JCOEFPTR inptr;
   5028   ISLOW_MULT_TYPE * quantptr;
   5029   INT32 * wsptr;
   5030   JSAMPROW outptr;
   5031   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   5032   int ctr;
   5033   INT32 workspace[2*4];	/* buffers data between passes */
   5034   SHIFT_TEMPS
   5035 
   5036   /* Pass 1: process columns from input, store into work array.
   5037    * 4-point IDCT kernel,
   5038    * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
   5039    */
   5040   inptr = coef_block;
   5041   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   5042   wsptr = workspace;
   5043   for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
   5044     /* Even part */
   5045 
   5046     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   5047     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   5048 
   5049     tmp10 = (tmp0 + tmp2) << CONST_BITS;
   5050     tmp12 = (tmp0 - tmp2) << CONST_BITS;
   5051 
   5052     /* Odd part */
   5053     /* Same rotation as in the even part of the 8x8 LL&M IDCT */
   5054 
   5055     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   5056     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   5057 
   5058     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
   5059     tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
   5060     tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
   5061 
   5062     /* Final output stage */
   5063 
   5064     wsptr[2*0] = tmp10 + tmp0;
   5065     wsptr[2*3] = tmp10 - tmp0;
   5066     wsptr[2*1] = tmp12 + tmp2;
   5067     wsptr[2*2] = tmp12 - tmp2;
   5068   }
   5069 
   5070   /* Pass 2: process 4 rows from work array, store into output array. */
   5071 
   5072   wsptr = workspace;
   5073   for (ctr = 0; ctr < 4; ctr++) {
   5074     outptr = output_buf[ctr] + output_col;
   5075 
   5076     /* Even part */
   5077 
   5078     /* Add fudge factor here for final descale. */
   5079     tmp10 = wsptr[0] + (ONE << (CONST_BITS+2));
   5080 
   5081     /* Odd part */
   5082 
   5083     tmp0 = wsptr[1];
   5084 
   5085     /* Final output stage */
   5086 
   5087     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
   5088                             & RANGE_MASK];
   5089     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
   5090                             & RANGE_MASK];
   5091 
   5092     wsptr += 2;		/* advance pointer to next row */
   5093   }
   5094 }
   5095 
   5096 
   5097 /*
   5098  * Perform dequantization and inverse DCT on one block of coefficients,
   5099  * producing a 1x2 output block.
   5100  *
   5101  * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
   5102  */
   5103 
   5104 GLOBAL(void)
   5105 jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   5106                JCOEFPTR coef_block,
   5107                JSAMPARRAY output_buf, JDIMENSION output_col)
   5108 {
   5109   INT32 tmp0, tmp10;
   5110   ISLOW_MULT_TYPE * quantptr;
   5111   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   5112   SHIFT_TEMPS
   5113 
   5114   /* Process 1 column from input, store into output array. */
   5115 
   5116   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   5117 
   5118   /* Even part */
   5119 
   5120   tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
   5121   /* Add fudge factor here for final descale. */
   5122   tmp10 += ONE << 2;
   5123 
   5124   /* Odd part */
   5125 
   5126   tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
   5127 
   5128   /* Final output stage */
   5129 
   5130   output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3)
   5131                                           & RANGE_MASK];
   5132   output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3)
   5133                                           & RANGE_MASK];
   5134 }
   5135 
   5136 #endif /* IDCT_SCALING_SUPPORTED */
   5137 #endif /* DCT_ISLOW_SUPPORTED */
   5138