Home | History | Annotate | Download | only in libjpeg_turbo
      1 /*
      2  * jidctint.c
      3  *
      4  * Copyright (C) 1991-1998, Thomas G. Lane.
      5  * Modification developed 2002-2009 by Guido Vollbeding.
      6  * This file is part of the Independent JPEG Group's software.
      7  * For conditions of distribution and use, see the accompanying README file.
      8  *
      9  * This file contains a slow-but-accurate integer implementation of the
     10  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
     11  * must also perform dequantization of the input coefficients.
     12  *
     13  * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
     14  * on each row (or vice versa, but it's more convenient to emit a row at
     15  * a time).  Direct algorithms are also available, but they are much more
     16  * complex and seem not to be any faster when reduced to code.
     17  *
     18  * This implementation is based on an algorithm described in
     19  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
     20  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
     21  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
     22  * The primary algorithm described there uses 11 multiplies and 29 adds.
     23  * We use their alternate method with 12 multiplies and 32 adds.
     24  * The advantage of this method is that no data path contains more than one
     25  * multiplication; this allows a very simple and accurate implementation in
     26  * scaled fixed-point arithmetic, with a minimal number of shifts.
     27  *
     28  * We also provide IDCT routines with various output sample block sizes for
     29  * direct resolution reduction or enlargement without additional resampling:
     30  * NxN (N=1...16) pixels for one 8x8 input DCT block.
     31  *
     32  * For N<8 we simply take the corresponding low-frequency coefficients of
     33  * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
     34  * to yield the downscaled outputs.
     35  * This can be seen as direct low-pass downsampling from the DCT domain
     36  * point of view rather than the usual spatial domain point of view,
     37  * yielding significant computational savings and results at least
     38  * as good as common bilinear (averaging) spatial downsampling.
     39  *
     40  * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as
     41  * lower frequencies and higher frequencies assumed to be zero.
     42  * It turns out that the computational effort is similar to the 8x8 IDCT
     43  * regarding the output size.
     44  * Furthermore, the scaling and descaling is the same for all IDCT sizes.
     45  *
     46  * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
     47  * since there would be too many additional constants to pre-calculate.
     48  */
     49 
     50 #define JPEG_INTERNALS
     51 #include "jinclude.h"
     52 #include "jpeglib.h"
     53 #include "jdct.h"		/* Private declarations for DCT subsystem */
     54 
     55 #ifdef DCT_ISLOW_SUPPORTED
     56 
     57 
     58 /*
     59  * This module is specialized to the case DCTSIZE = 8.
     60  */
     61 
     62 #if DCTSIZE != 8
     63   Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
     64 #endif
     65 
     66 
     67 /*
     68  * The poop on this scaling stuff is as follows:
     69  *
     70  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
     71  * larger than the true IDCT outputs.  The final outputs are therefore
     72  * a factor of N larger than desired; since N=8 this can be cured by
     73  * a simple right shift at the end of the algorithm.  The advantage of
     74  * this arrangement is that we save two multiplications per 1-D IDCT,
     75  * because the y0 and y4 inputs need not be divided by sqrt(N).
     76  *
     77  * We have to do addition and subtraction of the integer inputs, which
     78  * is no problem, and multiplication by fractional constants, which is
     79  * a problem to do in integer arithmetic.  We multiply all the constants
     80  * by CONST_SCALE and convert them to integer constants (thus retaining
     81  * CONST_BITS bits of precision in the constants).  After doing a
     82  * multiplication we have to divide the product by CONST_SCALE, with proper
     83  * rounding, to produce the correct output.  This division can be done
     84  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
     85  * as long as possible so that partial sums can be added together with
     86  * full fractional precision.
     87  *
     88  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
     89  * they are represented to better-than-integral precision.  These outputs
     90  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
     91  * with the recommended scaling.  (To scale up 12-bit sample data further, an
     92  * intermediate INT32 array would be needed.)
     93  *
     94  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
     95  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
     96  * shows that the values given below are the most effective.
     97  */
     98 
     99 #if BITS_IN_JSAMPLE == 8
    100 #define CONST_BITS  13
    101 #define PASS1_BITS  2
    102 #else
    103 #define CONST_BITS  13
    104 #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
    105 #endif
    106 
    107 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
    108  * causing a lot of useless floating-point operations at run time.
    109  * To get around this we use the following pre-calculated constants.
    110  * If you change CONST_BITS you may want to add appropriate values.
    111  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
    112  */
    113 
    114 #if CONST_BITS == 13
    115 #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
    116 #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
    117 #define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
    118 #define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
    119 #define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
    120 #define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
    121 #define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
    122 #define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
    123 #define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
    124 #define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
    125 #define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
    126 #define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
    127 #else
    128 #define FIX_0_298631336  FIX(0.298631336)
    129 #define FIX_0_390180644  FIX(0.390180644)
    130 #define FIX_0_541196100  FIX(0.541196100)
    131 #define FIX_0_765366865  FIX(0.765366865)
    132 #define FIX_0_899976223  FIX(0.899976223)
    133 #define FIX_1_175875602  FIX(1.175875602)
    134 #define FIX_1_501321110  FIX(1.501321110)
    135 #define FIX_1_847759065  FIX(1.847759065)
    136 #define FIX_1_961570560  FIX(1.961570560)
    137 #define FIX_2_053119869  FIX(2.053119869)
    138 #define FIX_2_562915447  FIX(2.562915447)
    139 #define FIX_3_072711026  FIX(3.072711026)
    140 #endif
    141 
    142 
    143 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
    144  * For 8-bit samples with the recommended scaling, all the variable
    145  * and constant values involved are no more than 16 bits wide, so a
    146  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
    147  * For 12-bit samples, a full 32-bit multiplication will be needed.
    148  */
    149 
    150 #if BITS_IN_JSAMPLE == 8
    151 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
    152 #else
    153 #define MULTIPLY(var,const)  ((var) * (const))
    154 #endif
    155 
    156 
    157 /* Dequantize a coefficient by multiplying it by the multiplier-table
    158  * entry; produce an int result.  In this module, both inputs and result
    159  * are 16 bits or less, so either int or short multiply will work.
    160  */
    161 
    162 #define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
    163 
    164 
    165 /*
    166  * Perform dequantization and inverse DCT on one block of coefficients.
    167  */
    168 
    169 GLOBAL(void)
    170 jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    171 		 JCOEFPTR coef_block,
    172 		 JSAMPARRAY output_buf, JDIMENSION output_col)
    173 {
    174   INT32 tmp0, tmp1, tmp2, tmp3;
    175   INT32 tmp10, tmp11, tmp12, tmp13;
    176   INT32 z1, z2, z3, z4, z5;
    177   JCOEFPTR inptr;
    178   ISLOW_MULT_TYPE * quantptr;
    179   int * wsptr;
    180   JSAMPROW outptr;
    181   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    182   int ctr;
    183   int workspace[DCTSIZE2];	/* buffers data between passes */
    184   SHIFT_TEMPS
    185 
    186   /* Pass 1: process columns from input, store into work array. */
    187   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
    188   /* furthermore, we scale the results by 2**PASS1_BITS. */
    189 
    190   inptr = coef_block;
    191   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    192   wsptr = workspace;
    193   for (ctr = DCTSIZE; ctr > 0; ctr--) {
    194     /* Due to quantization, we will usually find that many of the input
    195      * coefficients are zero, especially the AC terms.  We can exploit this
    196      * by short-circuiting the IDCT calculation for any column in which all
    197      * the AC terms are zero.  In that case each output is equal to the
    198      * DC coefficient (with scale factor as needed).
    199      * With typical images and quantization tables, half or more of the
    200      * column DCT calculations can be simplified this way.
    201      */
    202 
    203     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
    204 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
    205 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
    206 	inptr[DCTSIZE*7] == 0) {
    207       /* AC terms all zero */
    208       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
    209 
    210       wsptr[DCTSIZE*0] = dcval;
    211       wsptr[DCTSIZE*1] = dcval;
    212       wsptr[DCTSIZE*2] = dcval;
    213       wsptr[DCTSIZE*3] = dcval;
    214       wsptr[DCTSIZE*4] = dcval;
    215       wsptr[DCTSIZE*5] = dcval;
    216       wsptr[DCTSIZE*6] = dcval;
    217       wsptr[DCTSIZE*7] = dcval;
    218 
    219       inptr++;			/* advance pointers to next column */
    220       quantptr++;
    221       wsptr++;
    222       continue;
    223     }
    224 
    225     /* Even part: reverse the even part of the forward DCT. */
    226     /* The rotator is sqrt(2)*c(-6). */
    227 
    228     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    229     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
    230 
    231     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    232     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
    233     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
    234 
    235     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    236     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    237 
    238     tmp0 = (z2 + z3) << CONST_BITS;
    239     tmp1 = (z2 - z3) << CONST_BITS;
    240 
    241     tmp10 = tmp0 + tmp3;
    242     tmp13 = tmp0 - tmp3;
    243     tmp11 = tmp1 + tmp2;
    244     tmp12 = tmp1 - tmp2;
    245 
    246     /* Odd part per figure 8; the matrix is unitary and hence its
    247      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    248      */
    249 
    250     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
    251     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    252     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    253     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    254 
    255     z1 = tmp0 + tmp3;
    256     z2 = tmp1 + tmp2;
    257     z3 = tmp0 + tmp2;
    258     z4 = tmp1 + tmp3;
    259     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
    260 
    261     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    262     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    263     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    264     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    265     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    266     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    267     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    268     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    269 
    270     z3 += z5;
    271     z4 += z5;
    272 
    273     tmp0 += z1 + z3;
    274     tmp1 += z2 + z4;
    275     tmp2 += z2 + z3;
    276     tmp3 += z1 + z4;
    277 
    278     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    279 
    280     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
    281     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
    282     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
    283     wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
    284     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
    285     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
    286     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
    287     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
    288 
    289     inptr++;			/* advance pointers to next column */
    290     quantptr++;
    291     wsptr++;
    292   }
    293 
    294   /* Pass 2: process rows from work array, store into output array. */
    295   /* Note that we must descale the results by a factor of 8 == 2**3, */
    296   /* and also undo the PASS1_BITS scaling. */
    297 
    298   wsptr = workspace;
    299   for (ctr = 0; ctr < DCTSIZE; ctr++) {
    300     outptr = output_buf[ctr] + output_col;
    301     /* Rows of zeroes can be exploited in the same way as we did with columns.
    302      * However, the column calculation has created many nonzero AC terms, so
    303      * the simplification applies less often (typically 5% to 10% of the time).
    304      * On machines with very fast multiplication, it's possible that the
    305      * test takes more time than it's worth.  In that case this section
    306      * may be commented out.
    307      */
    308 
    309 #ifndef NO_ZERO_ROW_TEST
    310     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
    311 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
    312       /* AC terms all zero */
    313       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
    314 				  & RANGE_MASK];
    315 
    316       outptr[0] = dcval;
    317       outptr[1] = dcval;
    318       outptr[2] = dcval;
    319       outptr[3] = dcval;
    320       outptr[4] = dcval;
    321       outptr[5] = dcval;
    322       outptr[6] = dcval;
    323       outptr[7] = dcval;
    324 
    325       wsptr += DCTSIZE;		/* advance pointer to next row */
    326       continue;
    327     }
    328 #endif
    329 
    330     /* Even part: reverse the even part of the forward DCT. */
    331     /* The rotator is sqrt(2)*c(-6). */
    332 
    333     z2 = (INT32) wsptr[2];
    334     z3 = (INT32) wsptr[6];
    335 
    336     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    337     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
    338     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
    339 
    340     tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
    341     tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
    342 
    343     tmp10 = tmp0 + tmp3;
    344     tmp13 = tmp0 - tmp3;
    345     tmp11 = tmp1 + tmp2;
    346     tmp12 = tmp1 - tmp2;
    347 
    348     /* Odd part per figure 8; the matrix is unitary and hence its
    349      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    350      */
    351 
    352     tmp0 = (INT32) wsptr[7];
    353     tmp1 = (INT32) wsptr[5];
    354     tmp2 = (INT32) wsptr[3];
    355     tmp3 = (INT32) wsptr[1];
    356 
    357     z1 = tmp0 + tmp3;
    358     z2 = tmp1 + tmp2;
    359     z3 = tmp0 + tmp2;
    360     z4 = tmp1 + tmp3;
    361     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
    362 
    363     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    364     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    365     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    366     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    367     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    368     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    369     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    370     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
    371 
    372     z3 += z5;
    373     z4 += z5;
    374 
    375     tmp0 += z1 + z3;
    376     tmp1 += z2 + z4;
    377     tmp2 += z2 + z3;
    378     tmp3 += z1 + z4;
    379 
    380     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    381 
    382     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
    383 					  CONST_BITS+PASS1_BITS+3)
    384 			    & RANGE_MASK];
    385     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
    386 					  CONST_BITS+PASS1_BITS+3)
    387 			    & RANGE_MASK];
    388     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
    389 					  CONST_BITS+PASS1_BITS+3)
    390 			    & RANGE_MASK];
    391     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
    392 					  CONST_BITS+PASS1_BITS+3)
    393 			    & RANGE_MASK];
    394     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
    395 					  CONST_BITS+PASS1_BITS+3)
    396 			    & RANGE_MASK];
    397     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
    398 					  CONST_BITS+PASS1_BITS+3)
    399 			    & RANGE_MASK];
    400     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
    401 					  CONST_BITS+PASS1_BITS+3)
    402 			    & RANGE_MASK];
    403     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
    404 					  CONST_BITS+PASS1_BITS+3)
    405 			    & RANGE_MASK];
    406 
    407     wsptr += DCTSIZE;		/* advance pointer to next row */
    408   }
    409 }
    410 
    411 #ifdef IDCT_SCALING_SUPPORTED
    412 
    413 
    414 /*
    415  * Perform dequantization and inverse DCT on one block of coefficients,
    416  * producing a 7x7 output block.
    417  *
    418  * Optimized algorithm with 12 multiplications in the 1-D kernel.
    419  * cK represents sqrt(2) * cos(K*pi/14).
    420  */
    421 
    422 GLOBAL(void)
    423 jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    424 	       JCOEFPTR coef_block,
    425 	       JSAMPARRAY output_buf, JDIMENSION output_col)
    426 {
    427   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
    428   INT32 z1, z2, z3;
    429   JCOEFPTR inptr;
    430   ISLOW_MULT_TYPE * quantptr;
    431   int * wsptr;
    432   JSAMPROW outptr;
    433   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    434   int ctr;
    435   int workspace[7*7];	/* buffers data between passes */
    436   SHIFT_TEMPS
    437 
    438   /* Pass 1: process columns from input, store into work array. */
    439 
    440   inptr = coef_block;
    441   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    442   wsptr = workspace;
    443   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
    444     /* Even part */
    445 
    446     tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    447     tmp13 <<= CONST_BITS;
    448     /* Add fudge factor here for final descale. */
    449     tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
    450 
    451     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    452     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    453     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
    454 
    455     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
    456     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
    457     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
    458     tmp0 = z1 + z3;
    459     z2 -= tmp0;
    460     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
    461     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
    462     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
    463     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
    464 
    465     /* Odd part */
    466 
    467     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    468     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    469     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    470 
    471     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
    472     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
    473     tmp0 = tmp1 - tmp2;
    474     tmp1 += tmp2;
    475     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
    476     tmp1 += tmp2;
    477     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
    478     tmp0 += z2;
    479     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
    480 
    481     /* Final output stage */
    482 
    483     wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    484     wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    485     wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
    486     wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
    487     wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
    488     wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
    489     wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
    490   }
    491 
    492   /* Pass 2: process 7 rows from work array, store into output array. */
    493 
    494   wsptr = workspace;
    495   for (ctr = 0; ctr < 7; ctr++) {
    496     outptr = output_buf[ctr] + output_col;
    497 
    498     /* Even part */
    499 
    500     /* Add fudge factor here for final descale. */
    501     tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    502     tmp13 <<= CONST_BITS;
    503 
    504     z1 = (INT32) wsptr[2];
    505     z2 = (INT32) wsptr[4];
    506     z3 = (INT32) wsptr[6];
    507 
    508     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
    509     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
    510     tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
    511     tmp0 = z1 + z3;
    512     z2 -= tmp0;
    513     tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
    514     tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
    515     tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
    516     tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
    517 
    518     /* Odd part */
    519 
    520     z1 = (INT32) wsptr[1];
    521     z2 = (INT32) wsptr[3];
    522     z3 = (INT32) wsptr[5];
    523 
    524     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
    525     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
    526     tmp0 = tmp1 - tmp2;
    527     tmp1 += tmp2;
    528     tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
    529     tmp1 += tmp2;
    530     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
    531     tmp0 += z2;
    532     tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
    533 
    534     /* Final output stage */
    535 
    536     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    537 					      CONST_BITS+PASS1_BITS+3)
    538 			    & RANGE_MASK];
    539     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    540 					      CONST_BITS+PASS1_BITS+3)
    541 			    & RANGE_MASK];
    542     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    543 					      CONST_BITS+PASS1_BITS+3)
    544 			    & RANGE_MASK];
    545     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    546 					      CONST_BITS+PASS1_BITS+3)
    547 			    & RANGE_MASK];
    548     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
    549 					      CONST_BITS+PASS1_BITS+3)
    550 			    & RANGE_MASK];
    551     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
    552 					      CONST_BITS+PASS1_BITS+3)
    553 			    & RANGE_MASK];
    554     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
    555 					      CONST_BITS+PASS1_BITS+3)
    556 			    & RANGE_MASK];
    557 
    558     wsptr += 7;		/* advance pointer to next row */
    559   }
    560 }
    561 
    562 
    563 /*
    564  * Perform dequantization and inverse DCT on one block of coefficients,
    565  * producing a reduced-size 6x6 output block.
    566  *
    567  * Optimized algorithm with 3 multiplications in the 1-D kernel.
    568  * cK represents sqrt(2) * cos(K*pi/12).
    569  */
    570 
    571 GLOBAL(void)
    572 jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    573 	       JCOEFPTR coef_block,
    574 	       JSAMPARRAY output_buf, JDIMENSION output_col)
    575 {
    576   INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
    577   INT32 z1, z2, z3;
    578   JCOEFPTR inptr;
    579   ISLOW_MULT_TYPE * quantptr;
    580   int * wsptr;
    581   JSAMPROW outptr;
    582   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    583   int ctr;
    584   int workspace[6*6];	/* buffers data between passes */
    585   SHIFT_TEMPS
    586 
    587   /* Pass 1: process columns from input, store into work array. */
    588 
    589   inptr = coef_block;
    590   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    591   wsptr = workspace;
    592   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
    593     /* Even part */
    594 
    595     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    596     tmp0 <<= CONST_BITS;
    597     /* Add fudge factor here for final descale. */
    598     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
    599     tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    600     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
    601     tmp1 = tmp0 + tmp10;
    602     tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
    603     tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    604     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
    605     tmp10 = tmp1 + tmp0;
    606     tmp12 = tmp1 - tmp0;
    607 
    608     /* Odd part */
    609 
    610     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    611     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    612     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    613     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
    614     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
    615     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
    616     tmp1 = (z1 - z2 - z3) << PASS1_BITS;
    617 
    618     /* Final output stage */
    619 
    620     wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    621     wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    622     wsptr[6*1] = (int) (tmp11 + tmp1);
    623     wsptr[6*4] = (int) (tmp11 - tmp1);
    624     wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
    625     wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
    626   }
    627 
    628   /* Pass 2: process 6 rows from work array, store into output array. */
    629 
    630   wsptr = workspace;
    631   for (ctr = 0; ctr < 6; ctr++) {
    632     outptr = output_buf[ctr] + output_col;
    633 
    634     /* Even part */
    635 
    636     /* Add fudge factor here for final descale. */
    637     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    638     tmp0 <<= CONST_BITS;
    639     tmp2 = (INT32) wsptr[4];
    640     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
    641     tmp1 = tmp0 + tmp10;
    642     tmp11 = tmp0 - tmp10 - tmp10;
    643     tmp10 = (INT32) wsptr[2];
    644     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
    645     tmp10 = tmp1 + tmp0;
    646     tmp12 = tmp1 - tmp0;
    647 
    648     /* Odd part */
    649 
    650     z1 = (INT32) wsptr[1];
    651     z2 = (INT32) wsptr[3];
    652     z3 = (INT32) wsptr[5];
    653     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
    654     tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
    655     tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
    656     tmp1 = (z1 - z2 - z3) << CONST_BITS;
    657 
    658     /* Final output stage */
    659 
    660     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    661 					      CONST_BITS+PASS1_BITS+3)
    662 			    & RANGE_MASK];
    663     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    664 					      CONST_BITS+PASS1_BITS+3)
    665 			    & RANGE_MASK];
    666     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    667 					      CONST_BITS+PASS1_BITS+3)
    668 			    & RANGE_MASK];
    669     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    670 					      CONST_BITS+PASS1_BITS+3)
    671 			    & RANGE_MASK];
    672     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
    673 					      CONST_BITS+PASS1_BITS+3)
    674 			    & RANGE_MASK];
    675     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
    676 					      CONST_BITS+PASS1_BITS+3)
    677 			    & RANGE_MASK];
    678 
    679     wsptr += 6;		/* advance pointer to next row */
    680   }
    681 }
    682 
    683 
    684 /*
    685  * Perform dequantization and inverse DCT on one block of coefficients,
    686  * producing a reduced-size 5x5 output block.
    687  *
    688  * Optimized algorithm with 5 multiplications in the 1-D kernel.
    689  * cK represents sqrt(2) * cos(K*pi/10).
    690  */
    691 
    692 GLOBAL(void)
    693 jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    694 	       JCOEFPTR coef_block,
    695 	       JSAMPARRAY output_buf, JDIMENSION output_col)
    696 {
    697   INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
    698   INT32 z1, z2, z3;
    699   JCOEFPTR inptr;
    700   ISLOW_MULT_TYPE * quantptr;
    701   int * wsptr;
    702   JSAMPROW outptr;
    703   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    704   int ctr;
    705   int workspace[5*5];	/* buffers data between passes */
    706   SHIFT_TEMPS
    707 
    708   /* Pass 1: process columns from input, store into work array. */
    709 
    710   inptr = coef_block;
    711   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    712   wsptr = workspace;
    713   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
    714     /* Even part */
    715 
    716     tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    717     tmp12 <<= CONST_BITS;
    718     /* Add fudge factor here for final descale. */
    719     tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
    720     tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    721     tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    722     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
    723     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
    724     z3 = tmp12 + z2;
    725     tmp10 = z3 + z1;
    726     tmp11 = z3 - z1;
    727     tmp12 -= z2 << 2;
    728 
    729     /* Odd part */
    730 
    731     z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    732     z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    733 
    734     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
    735     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
    736     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
    737 
    738     /* Final output stage */
    739 
    740     wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    741     wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    742     wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
    743     wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
    744     wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
    745   }
    746 
    747   /* Pass 2: process 5 rows from work array, store into output array. */
    748 
    749   wsptr = workspace;
    750   for (ctr = 0; ctr < 5; ctr++) {
    751     outptr = output_buf[ctr] + output_col;
    752 
    753     /* Even part */
    754 
    755     /* Add fudge factor here for final descale. */
    756     tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    757     tmp12 <<= CONST_BITS;
    758     tmp0 = (INT32) wsptr[2];
    759     tmp1 = (INT32) wsptr[4];
    760     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
    761     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
    762     z3 = tmp12 + z2;
    763     tmp10 = z3 + z1;
    764     tmp11 = z3 - z1;
    765     tmp12 -= z2 << 2;
    766 
    767     /* Odd part */
    768 
    769     z2 = (INT32) wsptr[1];
    770     z3 = (INT32) wsptr[3];
    771 
    772     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
    773     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
    774     tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
    775 
    776     /* Final output stage */
    777 
    778     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    779 					      CONST_BITS+PASS1_BITS+3)
    780 			    & RANGE_MASK];
    781     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    782 					      CONST_BITS+PASS1_BITS+3)
    783 			    & RANGE_MASK];
    784     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
    785 					      CONST_BITS+PASS1_BITS+3)
    786 			    & RANGE_MASK];
    787     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
    788 					      CONST_BITS+PASS1_BITS+3)
    789 			    & RANGE_MASK];
    790     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
    791 					      CONST_BITS+PASS1_BITS+3)
    792 			    & RANGE_MASK];
    793 
    794     wsptr += 5;		/* advance pointer to next row */
    795   }
    796 }
    797 
    798 
    799 /*
    800  * Perform dequantization and inverse DCT on one block of coefficients,
    801  * producing a reduced-size 3x3 output block.
    802  *
    803  * Optimized algorithm with 2 multiplications in the 1-D kernel.
    804  * cK represents sqrt(2) * cos(K*pi/6).
    805  */
    806 
    807 GLOBAL(void)
    808 jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    809 	       JCOEFPTR coef_block,
    810 	       JSAMPARRAY output_buf, JDIMENSION output_col)
    811 {
    812   INT32 tmp0, tmp2, tmp10, tmp12;
    813   JCOEFPTR inptr;
    814   ISLOW_MULT_TYPE * quantptr;
    815   int * wsptr;
    816   JSAMPROW outptr;
    817   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    818   int ctr;
    819   int workspace[3*3];	/* buffers data between passes */
    820   SHIFT_TEMPS
    821 
    822   /* Pass 1: process columns from input, store into work array. */
    823 
    824   inptr = coef_block;
    825   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    826   wsptr = workspace;
    827   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
    828     /* Even part */
    829 
    830     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    831     tmp0 <<= CONST_BITS;
    832     /* Add fudge factor here for final descale. */
    833     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
    834     tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    835     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
    836     tmp10 = tmp0 + tmp12;
    837     tmp2 = tmp0 - tmp12 - tmp12;
    838 
    839     /* Odd part */
    840 
    841     tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    842     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
    843 
    844     /* Final output stage */
    845 
    846     wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    847     wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    848     wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
    849   }
    850 
    851   /* Pass 2: process 3 rows from work array, store into output array. */
    852 
    853   wsptr = workspace;
    854   for (ctr = 0; ctr < 3; ctr++) {
    855     outptr = output_buf[ctr] + output_col;
    856 
    857     /* Even part */
    858 
    859     /* Add fudge factor here for final descale. */
    860     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    861     tmp0 <<= CONST_BITS;
    862     tmp2 = (INT32) wsptr[2];
    863     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
    864     tmp10 = tmp0 + tmp12;
    865     tmp2 = tmp0 - tmp12 - tmp12;
    866 
    867     /* Odd part */
    868 
    869     tmp12 = (INT32) wsptr[1];
    870     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
    871 
    872     /* Final output stage */
    873 
    874     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
    875 					      CONST_BITS+PASS1_BITS+3)
    876 			    & RANGE_MASK];
    877     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
    878 					      CONST_BITS+PASS1_BITS+3)
    879 			    & RANGE_MASK];
    880     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
    881 					      CONST_BITS+PASS1_BITS+3)
    882 			    & RANGE_MASK];
    883 
    884     wsptr += 3;		/* advance pointer to next row */
    885   }
    886 }
    887 
    888 
    889 /*
    890  * Perform dequantization and inverse DCT on one block of coefficients,
    891  * producing a 9x9 output block.
    892  *
    893  * Optimized algorithm with 10 multiplications in the 1-D kernel.
    894  * cK represents sqrt(2) * cos(K*pi/18).
    895  */
    896 
    897 GLOBAL(void)
    898 jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    899 	       JCOEFPTR coef_block,
    900 	       JSAMPARRAY output_buf, JDIMENSION output_col)
    901 {
    902   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
    903   INT32 z1, z2, z3, z4;
    904   JCOEFPTR inptr;
    905   ISLOW_MULT_TYPE * quantptr;
    906   int * wsptr;
    907   JSAMPROW outptr;
    908   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    909   int ctr;
    910   int workspace[8*9];	/* buffers data between passes */
    911   SHIFT_TEMPS
    912 
    913   /* Pass 1: process columns from input, store into work array. */
    914 
    915   inptr = coef_block;
    916   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
    917   wsptr = workspace;
    918   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
    919     /* Even part */
    920 
    921     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
    922     tmp0 <<= CONST_BITS;
    923     /* Add fudge factor here for final descale. */
    924     tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
    925 
    926     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
    927     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
    928     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
    929 
    930     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
    931     tmp1 = tmp0 + tmp3;
    932     tmp2 = tmp0 - tmp3 - tmp3;
    933 
    934     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
    935     tmp11 = tmp2 + tmp0;
    936     tmp14 = tmp2 - tmp0 - tmp0;
    937 
    938     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
    939     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
    940     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
    941 
    942     tmp10 = tmp1 + tmp0 - tmp3;
    943     tmp12 = tmp1 - tmp0 + tmp2;
    944     tmp13 = tmp1 - tmp2 + tmp3;
    945 
    946     /* Odd part */
    947 
    948     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
    949     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
    950     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
    951     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
    952 
    953     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
    954 
    955     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
    956     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
    957     tmp0 = tmp2 + tmp3 - z2;
    958     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
    959     tmp2 += z2 - tmp1;
    960     tmp3 += z2 + tmp1;
    961     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
    962 
    963     /* Final output stage */
    964 
    965     wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
    966     wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
    967     wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
    968     wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
    969     wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
    970     wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
    971     wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
    972     wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
    973     wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
    974   }
    975 
    976   /* Pass 2: process 9 rows from work array, store into output array. */
    977 
    978   wsptr = workspace;
    979   for (ctr = 0; ctr < 9; ctr++) {
    980     outptr = output_buf[ctr] + output_col;
    981 
    982     /* Even part */
    983 
    984     /* Add fudge factor here for final descale. */
    985     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
    986     tmp0 <<= CONST_BITS;
    987 
    988     z1 = (INT32) wsptr[2];
    989     z2 = (INT32) wsptr[4];
    990     z3 = (INT32) wsptr[6];
    991 
    992     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
    993     tmp1 = tmp0 + tmp3;
    994     tmp2 = tmp0 - tmp3 - tmp3;
    995 
    996     tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
    997     tmp11 = tmp2 + tmp0;
    998     tmp14 = tmp2 - tmp0 - tmp0;
    999 
   1000     tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
   1001     tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
   1002     tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
   1003 
   1004     tmp10 = tmp1 + tmp0 - tmp3;
   1005     tmp12 = tmp1 - tmp0 + tmp2;
   1006     tmp13 = tmp1 - tmp2 + tmp3;
   1007 
   1008     /* Odd part */
   1009 
   1010     z1 = (INT32) wsptr[1];
   1011     z2 = (INT32) wsptr[3];
   1012     z3 = (INT32) wsptr[5];
   1013     z4 = (INT32) wsptr[7];
   1014 
   1015     z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
   1016 
   1017     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
   1018     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
   1019     tmp0 = tmp2 + tmp3 - z2;
   1020     tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
   1021     tmp2 += z2 - tmp1;
   1022     tmp3 += z2 + tmp1;
   1023     tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
   1024 
   1025     /* Final output stage */
   1026 
   1027     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
   1028 					      CONST_BITS+PASS1_BITS+3)
   1029 			    & RANGE_MASK];
   1030     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
   1031 					      CONST_BITS+PASS1_BITS+3)
   1032 			    & RANGE_MASK];
   1033     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
   1034 					      CONST_BITS+PASS1_BITS+3)
   1035 			    & RANGE_MASK];
   1036     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
   1037 					      CONST_BITS+PASS1_BITS+3)
   1038 			    & RANGE_MASK];
   1039     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
   1040 					      CONST_BITS+PASS1_BITS+3)
   1041 			    & RANGE_MASK];
   1042     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
   1043 					      CONST_BITS+PASS1_BITS+3)
   1044 			    & RANGE_MASK];
   1045     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
   1046 					      CONST_BITS+PASS1_BITS+3)
   1047 			    & RANGE_MASK];
   1048     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
   1049 					      CONST_BITS+PASS1_BITS+3)
   1050 			    & RANGE_MASK];
   1051     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
   1052 					      CONST_BITS+PASS1_BITS+3)
   1053 			    & RANGE_MASK];
   1054 
   1055     wsptr += 8;		/* advance pointer to next row */
   1056   }
   1057 }
   1058 
   1059 
   1060 /*
   1061  * Perform dequantization and inverse DCT on one block of coefficients,
   1062  * producing a 10x10 output block.
   1063  *
   1064  * Optimized algorithm with 12 multiplications in the 1-D kernel.
   1065  * cK represents sqrt(2) * cos(K*pi/20).
   1066  */
   1067 
   1068 GLOBAL(void)
   1069 jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1070 		 JCOEFPTR coef_block,
   1071 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   1072 {
   1073   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
   1074   INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
   1075   INT32 z1, z2, z3, z4, z5;
   1076   JCOEFPTR inptr;
   1077   ISLOW_MULT_TYPE * quantptr;
   1078   int * wsptr;
   1079   JSAMPROW outptr;
   1080   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1081   int ctr;
   1082   int workspace[8*10];	/* buffers data between passes */
   1083   SHIFT_TEMPS
   1084 
   1085   /* Pass 1: process columns from input, store into work array. */
   1086 
   1087   inptr = coef_block;
   1088   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1089   wsptr = workspace;
   1090   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1091     /* Even part */
   1092 
   1093     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1094     z3 <<= CONST_BITS;
   1095     /* Add fudge factor here for final descale. */
   1096     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
   1097     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1098     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   1099     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   1100     tmp10 = z3 + z1;
   1101     tmp11 = z3 - z2;
   1102 
   1103     tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
   1104 			CONST_BITS-PASS1_BITS);
   1105 
   1106     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1107     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1108 
   1109     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   1110     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   1111     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   1112 
   1113     tmp20 = tmp10 + tmp12;
   1114     tmp24 = tmp10 - tmp12;
   1115     tmp21 = tmp11 + tmp13;
   1116     tmp23 = tmp11 - tmp13;
   1117 
   1118     /* Odd part */
   1119 
   1120     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1121     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1122     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1123     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1124 
   1125     tmp11 = z2 + z4;
   1126     tmp13 = z2 - z4;
   1127 
   1128     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
   1129     z5 = z3 << CONST_BITS;
   1130 
   1131     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   1132     z4 = z5 + tmp12;
   1133 
   1134     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   1135     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   1136 
   1137     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   1138     z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
   1139 
   1140     tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
   1141 
   1142     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   1143     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   1144 
   1145     /* Final output stage */
   1146 
   1147     wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1148     wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1149     wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1150     wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1151     wsptr[8*2] = (int) (tmp22 + tmp12);
   1152     wsptr[8*7] = (int) (tmp22 - tmp12);
   1153     wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1154     wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1155     wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1156     wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1157   }
   1158 
   1159   /* Pass 2: process 10 rows from work array, store into output array. */
   1160 
   1161   wsptr = workspace;
   1162   for (ctr = 0; ctr < 10; ctr++) {
   1163     outptr = output_buf[ctr] + output_col;
   1164 
   1165     /* Even part */
   1166 
   1167     /* Add fudge factor here for final descale. */
   1168     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1169     z3 <<= CONST_BITS;
   1170     z4 = (INT32) wsptr[4];
   1171     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   1172     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   1173     tmp10 = z3 + z1;
   1174     tmp11 = z3 - z2;
   1175 
   1176     tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
   1177 
   1178     z2 = (INT32) wsptr[2];
   1179     z3 = (INT32) wsptr[6];
   1180 
   1181     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   1182     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   1183     tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   1184 
   1185     tmp20 = tmp10 + tmp12;
   1186     tmp24 = tmp10 - tmp12;
   1187     tmp21 = tmp11 + tmp13;
   1188     tmp23 = tmp11 - tmp13;
   1189 
   1190     /* Odd part */
   1191 
   1192     z1 = (INT32) wsptr[1];
   1193     z2 = (INT32) wsptr[3];
   1194     z3 = (INT32) wsptr[5];
   1195     z3 <<= CONST_BITS;
   1196     z4 = (INT32) wsptr[7];
   1197 
   1198     tmp11 = z2 + z4;
   1199     tmp13 = z2 - z4;
   1200 
   1201     tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
   1202 
   1203     z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   1204     z4 = z3 + tmp12;
   1205 
   1206     tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   1207     tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   1208 
   1209     z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   1210     z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
   1211 
   1212     tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
   1213 
   1214     tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   1215     tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   1216 
   1217     /* Final output stage */
   1218 
   1219     outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1220 					      CONST_BITS+PASS1_BITS+3)
   1221 			    & RANGE_MASK];
   1222     outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1223 					      CONST_BITS+PASS1_BITS+3)
   1224 			    & RANGE_MASK];
   1225     outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1226 					      CONST_BITS+PASS1_BITS+3)
   1227 			    & RANGE_MASK];
   1228     outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1229 					      CONST_BITS+PASS1_BITS+3)
   1230 			    & RANGE_MASK];
   1231     outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1232 					      CONST_BITS+PASS1_BITS+3)
   1233 			    & RANGE_MASK];
   1234     outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1235 					      CONST_BITS+PASS1_BITS+3)
   1236 			    & RANGE_MASK];
   1237     outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1238 					      CONST_BITS+PASS1_BITS+3)
   1239 			    & RANGE_MASK];
   1240     outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1241 					      CONST_BITS+PASS1_BITS+3)
   1242 			    & RANGE_MASK];
   1243     outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1244 					      CONST_BITS+PASS1_BITS+3)
   1245 			    & RANGE_MASK];
   1246     outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1247 					      CONST_BITS+PASS1_BITS+3)
   1248 			    & RANGE_MASK];
   1249 
   1250     wsptr += 8;		/* advance pointer to next row */
   1251   }
   1252 }
   1253 
   1254 
   1255 /*
   1256  * Perform dequantization and inverse DCT on one block of coefficients,
   1257  * producing a 11x11 output block.
   1258  *
   1259  * Optimized algorithm with 24 multiplications in the 1-D kernel.
   1260  * cK represents sqrt(2) * cos(K*pi/22).
   1261  */
   1262 
   1263 GLOBAL(void)
   1264 jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1265 		 JCOEFPTR coef_block,
   1266 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   1267 {
   1268   INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
   1269   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   1270   INT32 z1, z2, z3, z4;
   1271   JCOEFPTR inptr;
   1272   ISLOW_MULT_TYPE * quantptr;
   1273   int * wsptr;
   1274   JSAMPROW outptr;
   1275   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1276   int ctr;
   1277   int workspace[8*11];	/* buffers data between passes */
   1278   SHIFT_TEMPS
   1279 
   1280   /* Pass 1: process columns from input, store into work array. */
   1281 
   1282   inptr = coef_block;
   1283   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1284   wsptr = workspace;
   1285   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1286     /* Even part */
   1287 
   1288     tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1289     tmp10 <<= CONST_BITS;
   1290     /* Add fudge factor here for final descale. */
   1291     tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
   1292 
   1293     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1294     z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1295     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1296 
   1297     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
   1298     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
   1299     z4 = z1 + z3;
   1300     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
   1301     z4 -= z2;
   1302     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
   1303     tmp21 = tmp20 + tmp23 + tmp25 -
   1304 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
   1305     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
   1306     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
   1307     tmp24 += tmp25;
   1308     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
   1309     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
   1310 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
   1311     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
   1312 
   1313     /* Odd part */
   1314 
   1315     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1316     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1317     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1318     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1319 
   1320     tmp11 = z1 + z2;
   1321     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
   1322     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
   1323     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
   1324     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
   1325     tmp10 = tmp11 + tmp12 + tmp13 -
   1326 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
   1327     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
   1328     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
   1329     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
   1330     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
   1331     tmp11 += z1;
   1332     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
   1333     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
   1334 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
   1335 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
   1336 
   1337     /* Final output stage */
   1338 
   1339     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1340     wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1341     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1342     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1343     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1344     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1345     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1346     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1347     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1348     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1349     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
   1350   }
   1351 
   1352   /* Pass 2: process 11 rows from work array, store into output array. */
   1353 
   1354   wsptr = workspace;
   1355   for (ctr = 0; ctr < 11; ctr++) {
   1356     outptr = output_buf[ctr] + output_col;
   1357 
   1358     /* Even part */
   1359 
   1360     /* Add fudge factor here for final descale. */
   1361     tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1362     tmp10 <<= CONST_BITS;
   1363 
   1364     z1 = (INT32) wsptr[2];
   1365     z2 = (INT32) wsptr[4];
   1366     z3 = (INT32) wsptr[6];
   1367 
   1368     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
   1369     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
   1370     z4 = z1 + z3;
   1371     tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
   1372     z4 -= z2;
   1373     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
   1374     tmp21 = tmp20 + tmp23 + tmp25 -
   1375 	    MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
   1376     tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
   1377     tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
   1378     tmp24 += tmp25;
   1379     tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
   1380     tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
   1381 	     MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
   1382     tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
   1383 
   1384     /* Odd part */
   1385 
   1386     z1 = (INT32) wsptr[1];
   1387     z2 = (INT32) wsptr[3];
   1388     z3 = (INT32) wsptr[5];
   1389     z4 = (INT32) wsptr[7];
   1390 
   1391     tmp11 = z1 + z2;
   1392     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
   1393     tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
   1394     tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
   1395     tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
   1396     tmp10 = tmp11 + tmp12 + tmp13 -
   1397 	    MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
   1398     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
   1399     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
   1400     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
   1401     z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
   1402     tmp11 += z1;
   1403     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
   1404     tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
   1405 	     MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
   1406 	     MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
   1407 
   1408     /* Final output stage */
   1409 
   1410     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1411 					       CONST_BITS+PASS1_BITS+3)
   1412 			     & RANGE_MASK];
   1413     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1414 					       CONST_BITS+PASS1_BITS+3)
   1415 			     & RANGE_MASK];
   1416     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1417 					       CONST_BITS+PASS1_BITS+3)
   1418 			     & RANGE_MASK];
   1419     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1420 					       CONST_BITS+PASS1_BITS+3)
   1421 			     & RANGE_MASK];
   1422     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1423 					       CONST_BITS+PASS1_BITS+3)
   1424 			     & RANGE_MASK];
   1425     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1426 					       CONST_BITS+PASS1_BITS+3)
   1427 			     & RANGE_MASK];
   1428     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1429 					       CONST_BITS+PASS1_BITS+3)
   1430 			     & RANGE_MASK];
   1431     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1432 					       CONST_BITS+PASS1_BITS+3)
   1433 			     & RANGE_MASK];
   1434     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1435 					       CONST_BITS+PASS1_BITS+3)
   1436 			     & RANGE_MASK];
   1437     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1438 					       CONST_BITS+PASS1_BITS+3)
   1439 			     & RANGE_MASK];
   1440     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
   1441 					       CONST_BITS+PASS1_BITS+3)
   1442 			     & RANGE_MASK];
   1443 
   1444     wsptr += 8;		/* advance pointer to next row */
   1445   }
   1446 }
   1447 
   1448 
   1449 /*
   1450  * Perform dequantization and inverse DCT on one block of coefficients,
   1451  * producing a 12x12 output block.
   1452  *
   1453  * Optimized algorithm with 15 multiplications in the 1-D kernel.
   1454  * cK represents sqrt(2) * cos(K*pi/24).
   1455  */
   1456 
   1457 GLOBAL(void)
   1458 jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1459 		 JCOEFPTR coef_block,
   1460 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   1461 {
   1462   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   1463   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   1464   INT32 z1, z2, z3, z4;
   1465   JCOEFPTR inptr;
   1466   ISLOW_MULT_TYPE * quantptr;
   1467   int * wsptr;
   1468   JSAMPROW outptr;
   1469   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1470   int ctr;
   1471   int workspace[8*12];	/* buffers data between passes */
   1472   SHIFT_TEMPS
   1473 
   1474   /* Pass 1: process columns from input, store into work array. */
   1475 
   1476   inptr = coef_block;
   1477   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1478   wsptr = workspace;
   1479   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1480     /* Even part */
   1481 
   1482     z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1483     z3 <<= CONST_BITS;
   1484     /* Add fudge factor here for final descale. */
   1485     z3 += ONE << (CONST_BITS-PASS1_BITS-1);
   1486 
   1487     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1488     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   1489 
   1490     tmp10 = z3 + z4;
   1491     tmp11 = z3 - z4;
   1492 
   1493     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1494     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   1495     z1 <<= CONST_BITS;
   1496     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1497     z2 <<= CONST_BITS;
   1498 
   1499     tmp12 = z1 - z2;
   1500 
   1501     tmp21 = z3 + tmp12;
   1502     tmp24 = z3 - tmp12;
   1503 
   1504     tmp12 = z4 + z2;
   1505 
   1506     tmp20 = tmp10 + tmp12;
   1507     tmp25 = tmp10 - tmp12;
   1508 
   1509     tmp12 = z4 - z1 - z2;
   1510 
   1511     tmp22 = tmp11 + tmp12;
   1512     tmp23 = tmp11 - tmp12;
   1513 
   1514     /* Odd part */
   1515 
   1516     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1517     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1518     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1519     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1520 
   1521     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   1522     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
   1523 
   1524     tmp10 = z1 + z3;
   1525     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   1526     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   1527     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   1528     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
   1529     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   1530     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   1531     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   1532 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   1533 
   1534     z1 -= z4;
   1535     z2 -= z3;
   1536     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   1537     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   1538     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   1539 
   1540     /* Final output stage */
   1541 
   1542     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1543     wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1544     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1545     wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1546     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1547     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1548     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1549     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1550     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1551     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1552     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   1553     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   1554   }
   1555 
   1556   /* Pass 2: process 12 rows from work array, store into output array. */
   1557 
   1558   wsptr = workspace;
   1559   for (ctr = 0; ctr < 12; ctr++) {
   1560     outptr = output_buf[ctr] + output_col;
   1561 
   1562     /* Even part */
   1563 
   1564     /* Add fudge factor here for final descale. */
   1565     z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1566     z3 <<= CONST_BITS;
   1567 
   1568     z4 = (INT32) wsptr[4];
   1569     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   1570 
   1571     tmp10 = z3 + z4;
   1572     tmp11 = z3 - z4;
   1573 
   1574     z1 = (INT32) wsptr[2];
   1575     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   1576     z1 <<= CONST_BITS;
   1577     z2 = (INT32) wsptr[6];
   1578     z2 <<= CONST_BITS;
   1579 
   1580     tmp12 = z1 - z2;
   1581 
   1582     tmp21 = z3 + tmp12;
   1583     tmp24 = z3 - tmp12;
   1584 
   1585     tmp12 = z4 + z2;
   1586 
   1587     tmp20 = tmp10 + tmp12;
   1588     tmp25 = tmp10 - tmp12;
   1589 
   1590     tmp12 = z4 - z1 - z2;
   1591 
   1592     tmp22 = tmp11 + tmp12;
   1593     tmp23 = tmp11 - tmp12;
   1594 
   1595     /* Odd part */
   1596 
   1597     z1 = (INT32) wsptr[1];
   1598     z2 = (INT32) wsptr[3];
   1599     z3 = (INT32) wsptr[5];
   1600     z4 = (INT32) wsptr[7];
   1601 
   1602     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   1603     tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
   1604 
   1605     tmp10 = z1 + z3;
   1606     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   1607     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   1608     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   1609     tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
   1610     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   1611     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   1612     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   1613 	     MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   1614 
   1615     z1 -= z4;
   1616     z2 -= z3;
   1617     z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   1618     tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   1619     tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   1620 
   1621     /* Final output stage */
   1622 
   1623     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1624 					       CONST_BITS+PASS1_BITS+3)
   1625 			     & RANGE_MASK];
   1626     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1627 					       CONST_BITS+PASS1_BITS+3)
   1628 			     & RANGE_MASK];
   1629     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1630 					       CONST_BITS+PASS1_BITS+3)
   1631 			     & RANGE_MASK];
   1632     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1633 					       CONST_BITS+PASS1_BITS+3)
   1634 			     & RANGE_MASK];
   1635     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1636 					       CONST_BITS+PASS1_BITS+3)
   1637 			     & RANGE_MASK];
   1638     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1639 					       CONST_BITS+PASS1_BITS+3)
   1640 			     & RANGE_MASK];
   1641     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1642 					       CONST_BITS+PASS1_BITS+3)
   1643 			     & RANGE_MASK];
   1644     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1645 					       CONST_BITS+PASS1_BITS+3)
   1646 			     & RANGE_MASK];
   1647     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1648 					       CONST_BITS+PASS1_BITS+3)
   1649 			     & RANGE_MASK];
   1650     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1651 					       CONST_BITS+PASS1_BITS+3)
   1652 			     & RANGE_MASK];
   1653     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   1654 					       CONST_BITS+PASS1_BITS+3)
   1655 			     & RANGE_MASK];
   1656     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   1657 					       CONST_BITS+PASS1_BITS+3)
   1658 			     & RANGE_MASK];
   1659 
   1660     wsptr += 8;		/* advance pointer to next row */
   1661   }
   1662 }
   1663 
   1664 
   1665 /*
   1666  * Perform dequantization and inverse DCT on one block of coefficients,
   1667  * producing a 13x13 output block.
   1668  *
   1669  * Optimized algorithm with 29 multiplications in the 1-D kernel.
   1670  * cK represents sqrt(2) * cos(K*pi/26).
   1671  */
   1672 
   1673 GLOBAL(void)
   1674 jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1675 		 JCOEFPTR coef_block,
   1676 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   1677 {
   1678   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   1679   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   1680   INT32 z1, z2, z3, z4;
   1681   JCOEFPTR inptr;
   1682   ISLOW_MULT_TYPE * quantptr;
   1683   int * wsptr;
   1684   JSAMPROW outptr;
   1685   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1686   int ctr;
   1687   int workspace[8*13];	/* buffers data between passes */
   1688   SHIFT_TEMPS
   1689 
   1690   /* Pass 1: process columns from input, store into work array. */
   1691 
   1692   inptr = coef_block;
   1693   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1694   wsptr = workspace;
   1695   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1696     /* Even part */
   1697 
   1698     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1699     z1 <<= CONST_BITS;
   1700     /* Add fudge factor here for final descale. */
   1701     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   1702 
   1703     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1704     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1705     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1706 
   1707     tmp10 = z3 + z4;
   1708     tmp11 = z3 - z4;
   1709 
   1710     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
   1711     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
   1712 
   1713     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
   1714     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
   1715 
   1716     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
   1717     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
   1718 
   1719     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
   1720     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
   1721 
   1722     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
   1723     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
   1724 
   1725     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
   1726     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
   1727 
   1728     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
   1729 
   1730     /* Odd part */
   1731 
   1732     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1733     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1734     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1735     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1736 
   1737     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
   1738     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
   1739     tmp15 = z1 + z4;
   1740     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
   1741     tmp10 = tmp11 + tmp12 + tmp13 -
   1742 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
   1743     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
   1744     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
   1745     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
   1746     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
   1747     tmp11 += tmp14;
   1748     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
   1749     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
   1750     tmp12 += tmp14;
   1751     tmp13 += tmp14;
   1752     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
   1753     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
   1754 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
   1755     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
   1756     tmp14 += z1;
   1757     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
   1758 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
   1759 
   1760     /* Final output stage */
   1761 
   1762     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1763     wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1764     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1765     wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1766     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1767     wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1768     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   1769     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   1770     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1771     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1772     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   1773     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   1774     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
   1775   }
   1776 
   1777   /* Pass 2: process 13 rows from work array, store into output array. */
   1778 
   1779   wsptr = workspace;
   1780   for (ctr = 0; ctr < 13; ctr++) {
   1781     outptr = output_buf[ctr] + output_col;
   1782 
   1783     /* Even part */
   1784 
   1785     /* Add fudge factor here for final descale. */
   1786     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   1787     z1 <<= CONST_BITS;
   1788 
   1789     z2 = (INT32) wsptr[2];
   1790     z3 = (INT32) wsptr[4];
   1791     z4 = (INT32) wsptr[6];
   1792 
   1793     tmp10 = z3 + z4;
   1794     tmp11 = z3 - z4;
   1795 
   1796     tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
   1797     tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
   1798 
   1799     tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
   1800     tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
   1801 
   1802     tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
   1803     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
   1804 
   1805     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
   1806     tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
   1807 
   1808     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
   1809     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
   1810 
   1811     tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
   1812     tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
   1813 
   1814     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
   1815 
   1816     /* Odd part */
   1817 
   1818     z1 = (INT32) wsptr[1];
   1819     z2 = (INT32) wsptr[3];
   1820     z3 = (INT32) wsptr[5];
   1821     z4 = (INT32) wsptr[7];
   1822 
   1823     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
   1824     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
   1825     tmp15 = z1 + z4;
   1826     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
   1827     tmp10 = tmp11 + tmp12 + tmp13 -
   1828 	    MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
   1829     tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
   1830     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
   1831     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
   1832     tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
   1833     tmp11 += tmp14;
   1834     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
   1835     tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
   1836     tmp12 += tmp14;
   1837     tmp13 += tmp14;
   1838     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
   1839     tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
   1840 	    MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
   1841     z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
   1842     tmp14 += z1;
   1843     tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
   1844 	     MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
   1845 
   1846     /* Final output stage */
   1847 
   1848     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   1849 					       CONST_BITS+PASS1_BITS+3)
   1850 			     & RANGE_MASK];
   1851     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   1852 					       CONST_BITS+PASS1_BITS+3)
   1853 			     & RANGE_MASK];
   1854     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   1855 					       CONST_BITS+PASS1_BITS+3)
   1856 			     & RANGE_MASK];
   1857     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   1858 					       CONST_BITS+PASS1_BITS+3)
   1859 			     & RANGE_MASK];
   1860     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   1861 					       CONST_BITS+PASS1_BITS+3)
   1862 			     & RANGE_MASK];
   1863     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   1864 					       CONST_BITS+PASS1_BITS+3)
   1865 			     & RANGE_MASK];
   1866     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   1867 					       CONST_BITS+PASS1_BITS+3)
   1868 			     & RANGE_MASK];
   1869     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   1870 					       CONST_BITS+PASS1_BITS+3)
   1871 			     & RANGE_MASK];
   1872     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   1873 					       CONST_BITS+PASS1_BITS+3)
   1874 			     & RANGE_MASK];
   1875     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   1876 					       CONST_BITS+PASS1_BITS+3)
   1877 			     & RANGE_MASK];
   1878     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   1879 					       CONST_BITS+PASS1_BITS+3)
   1880 			     & RANGE_MASK];
   1881     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   1882 					       CONST_BITS+PASS1_BITS+3)
   1883 			     & RANGE_MASK];
   1884     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
   1885 					       CONST_BITS+PASS1_BITS+3)
   1886 			     & RANGE_MASK];
   1887 
   1888     wsptr += 8;		/* advance pointer to next row */
   1889   }
   1890 }
   1891 
   1892 
   1893 /*
   1894  * Perform dequantization and inverse DCT on one block of coefficients,
   1895  * producing a 14x14 output block.
   1896  *
   1897  * Optimized algorithm with 20 multiplications in the 1-D kernel.
   1898  * cK represents sqrt(2) * cos(K*pi/28).
   1899  */
   1900 
   1901 GLOBAL(void)
   1902 jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   1903 		 JCOEFPTR coef_block,
   1904 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   1905 {
   1906   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   1907   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   1908   INT32 z1, z2, z3, z4;
   1909   JCOEFPTR inptr;
   1910   ISLOW_MULT_TYPE * quantptr;
   1911   int * wsptr;
   1912   JSAMPROW outptr;
   1913   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1914   int ctr;
   1915   int workspace[8*14];	/* buffers data between passes */
   1916   SHIFT_TEMPS
   1917 
   1918   /* Pass 1: process columns from input, store into work array. */
   1919 
   1920   inptr = coef_block;
   1921   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   1922   wsptr = workspace;
   1923   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1924     /* Even part */
   1925 
   1926     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   1927     z1 <<= CONST_BITS;
   1928     /* Add fudge factor here for final descale. */
   1929     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   1930     z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   1931     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   1932     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   1933     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   1934 
   1935     tmp10 = z1 + z2;
   1936     tmp11 = z1 + z3;
   1937     tmp12 = z1 - z4;
   1938 
   1939     tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
   1940 			CONST_BITS-PASS1_BITS);
   1941 
   1942     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   1943     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   1944 
   1945     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   1946 
   1947     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   1948     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   1949     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   1950 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   1951 
   1952     tmp20 = tmp10 + tmp13;
   1953     tmp26 = tmp10 - tmp13;
   1954     tmp21 = tmp11 + tmp14;
   1955     tmp25 = tmp11 - tmp14;
   1956     tmp22 = tmp12 + tmp15;
   1957     tmp24 = tmp12 - tmp15;
   1958 
   1959     /* Odd part */
   1960 
   1961     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   1962     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   1963     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   1964     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   1965     tmp13 = z4 << CONST_BITS;
   1966 
   1967     tmp14 = z1 + z3;
   1968     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   1969     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   1970     tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   1971     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   1972     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
   1973     z1    -= z2;
   1974     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
   1975     tmp16 += tmp15;
   1976     z1    += z4;
   1977     z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
   1978     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
   1979     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
   1980     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   1981     tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   1982     tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
   1983 
   1984     tmp13 = (z1 - z3) << PASS1_BITS;
   1985 
   1986     /* Final output stage */
   1987 
   1988     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   1989     wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   1990     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   1991     wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   1992     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   1993     wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   1994     wsptr[8*3]  = (int) (tmp23 + tmp13);
   1995     wsptr[8*10] = (int) (tmp23 - tmp13);
   1996     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   1997     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   1998     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   1999     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   2000     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
   2001     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
   2002   }
   2003 
   2004   /* Pass 2: process 14 rows from work array, store into output array. */
   2005 
   2006   wsptr = workspace;
   2007   for (ctr = 0; ctr < 14; ctr++) {
   2008     outptr = output_buf[ctr] + output_col;
   2009 
   2010     /* Even part */
   2011 
   2012     /* Add fudge factor here for final descale. */
   2013     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   2014     z1 <<= CONST_BITS;
   2015     z4 = (INT32) wsptr[4];
   2016     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   2017     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   2018     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   2019 
   2020     tmp10 = z1 + z2;
   2021     tmp11 = z1 + z3;
   2022     tmp12 = z1 - z4;
   2023 
   2024     tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
   2025 
   2026     z1 = (INT32) wsptr[2];
   2027     z2 = (INT32) wsptr[6];
   2028 
   2029     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   2030 
   2031     tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   2032     tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   2033     tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   2034 	    MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   2035 
   2036     tmp20 = tmp10 + tmp13;
   2037     tmp26 = tmp10 - tmp13;
   2038     tmp21 = tmp11 + tmp14;
   2039     tmp25 = tmp11 - tmp14;
   2040     tmp22 = tmp12 + tmp15;
   2041     tmp24 = tmp12 - tmp15;
   2042 
   2043     /* Odd part */
   2044 
   2045     z1 = (INT32) wsptr[1];
   2046     z2 = (INT32) wsptr[3];
   2047     z3 = (INT32) wsptr[5];
   2048     z4 = (INT32) wsptr[7];
   2049     z4 <<= CONST_BITS;
   2050 
   2051     tmp14 = z1 + z3;
   2052     tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   2053     tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   2054     tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   2055     tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   2056     tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
   2057     z1    -= z2;
   2058     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
   2059     tmp16 += tmp15;
   2060     tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
   2061     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
   2062     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
   2063     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   2064     tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   2065     tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
   2066 
   2067     tmp13 = ((z1 - z3) << CONST_BITS) + z4;
   2068 
   2069     /* Final output stage */
   2070 
   2071     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   2072 					       CONST_BITS+PASS1_BITS+3)
   2073 			     & RANGE_MASK];
   2074     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   2075 					       CONST_BITS+PASS1_BITS+3)
   2076 			     & RANGE_MASK];
   2077     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   2078 					       CONST_BITS+PASS1_BITS+3)
   2079 			     & RANGE_MASK];
   2080     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   2081 					       CONST_BITS+PASS1_BITS+3)
   2082 			     & RANGE_MASK];
   2083     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   2084 					       CONST_BITS+PASS1_BITS+3)
   2085 			     & RANGE_MASK];
   2086     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   2087 					       CONST_BITS+PASS1_BITS+3)
   2088 			     & RANGE_MASK];
   2089     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   2090 					       CONST_BITS+PASS1_BITS+3)
   2091 			     & RANGE_MASK];
   2092     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   2093 					       CONST_BITS+PASS1_BITS+3)
   2094 			     & RANGE_MASK];
   2095     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   2096 					       CONST_BITS+PASS1_BITS+3)
   2097 			     & RANGE_MASK];
   2098     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   2099 					       CONST_BITS+PASS1_BITS+3)
   2100 			     & RANGE_MASK];
   2101     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   2102 					       CONST_BITS+PASS1_BITS+3)
   2103 			     & RANGE_MASK];
   2104     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   2105 					       CONST_BITS+PASS1_BITS+3)
   2106 			     & RANGE_MASK];
   2107     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
   2108 					       CONST_BITS+PASS1_BITS+3)
   2109 			     & RANGE_MASK];
   2110     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
   2111 					       CONST_BITS+PASS1_BITS+3)
   2112 			     & RANGE_MASK];
   2113 
   2114     wsptr += 8;		/* advance pointer to next row */
   2115   }
   2116 }
   2117 
   2118 
   2119 /*
   2120  * Perform dequantization and inverse DCT on one block of coefficients,
   2121  * producing a 15x15 output block.
   2122  *
   2123  * Optimized algorithm with 22 multiplications in the 1-D kernel.
   2124  * cK represents sqrt(2) * cos(K*pi/30).
   2125  */
   2126 
   2127 GLOBAL(void)
   2128 jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2129 		 JCOEFPTR coef_block,
   2130 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   2131 {
   2132   INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   2133   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2134   INT32 z1, z2, z3, z4;
   2135   JCOEFPTR inptr;
   2136   ISLOW_MULT_TYPE * quantptr;
   2137   int * wsptr;
   2138   JSAMPROW outptr;
   2139   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2140   int ctr;
   2141   int workspace[8*15];	/* buffers data between passes */
   2142   SHIFT_TEMPS
   2143 
   2144   /* Pass 1: process columns from input, store into work array. */
   2145 
   2146   inptr = coef_block;
   2147   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   2148   wsptr = workspace;
   2149   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2150     /* Even part */
   2151 
   2152     z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   2153     z1 <<= CONST_BITS;
   2154     /* Add fudge factor here for final descale. */
   2155     z1 += ONE << (CONST_BITS-PASS1_BITS-1);
   2156 
   2157     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   2158     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   2159     z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   2160 
   2161     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
   2162     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
   2163 
   2164     tmp12 = z1 - tmp10;
   2165     tmp13 = z1 + tmp11;
   2166     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
   2167 
   2168     z4 = z2 - z3;
   2169     z3 += z2;
   2170     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
   2171     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
   2172     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
   2173 
   2174     tmp20 = tmp13 + tmp10 + tmp11;
   2175     tmp23 = tmp12 - tmp10 + tmp11 + z2;
   2176 
   2177     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
   2178     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
   2179 
   2180     tmp25 = tmp13 - tmp10 - tmp11;
   2181     tmp26 = tmp12 + tmp10 - tmp11 - z2;
   2182 
   2183     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
   2184     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
   2185 
   2186     tmp21 = tmp12 + tmp10 + tmp11;
   2187     tmp24 = tmp13 - tmp10 + tmp11;
   2188     tmp11 += tmp11;
   2189     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
   2190     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
   2191 
   2192     /* Odd part */
   2193 
   2194     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   2195     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   2196     z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   2197     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
   2198     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   2199 
   2200     tmp13 = z2 - z4;
   2201     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
   2202     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
   2203     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
   2204 
   2205     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
   2206     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
   2207     z2 = z1 - z4;
   2208     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
   2209 
   2210     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
   2211     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
   2212     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
   2213     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
   2214     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
   2215     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
   2216 
   2217     /* Final output stage */
   2218 
   2219     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
   2220     wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
   2221     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
   2222     wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
   2223     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
   2224     wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
   2225     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
   2226     wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
   2227     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
   2228     wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
   2229     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
   2230     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
   2231     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
   2232     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
   2233     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
   2234   }
   2235 
   2236   /* Pass 2: process 15 rows from work array, store into output array. */
   2237 
   2238   wsptr = workspace;
   2239   for (ctr = 0; ctr < 15; ctr++) {
   2240     outptr = output_buf[ctr] + output_col;
   2241 
   2242     /* Even part */
   2243 
   2244     /* Add fudge factor here for final descale. */
   2245     z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   2246     z1 <<= CONST_BITS;
   2247 
   2248     z2 = (INT32) wsptr[2];
   2249     z3 = (INT32) wsptr[4];
   2250     z4 = (INT32) wsptr[6];
   2251 
   2252     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
   2253     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
   2254 
   2255     tmp12 = z1 - tmp10;
   2256     tmp13 = z1 + tmp11;
   2257     z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
   2258 
   2259     z4 = z2 - z3;
   2260     z3 += z2;
   2261     tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
   2262     tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
   2263     z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
   2264 
   2265     tmp20 = tmp13 + tmp10 + tmp11;
   2266     tmp23 = tmp12 - tmp10 + tmp11 + z2;
   2267 
   2268     tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
   2269     tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
   2270 
   2271     tmp25 = tmp13 - tmp10 - tmp11;
   2272     tmp26 = tmp12 + tmp10 - tmp11 - z2;
   2273 
   2274     tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
   2275     tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
   2276 
   2277     tmp21 = tmp12 + tmp10 + tmp11;
   2278     tmp24 = tmp13 - tmp10 + tmp11;
   2279     tmp11 += tmp11;
   2280     tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
   2281     tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
   2282 
   2283     /* Odd part */
   2284 
   2285     z1 = (INT32) wsptr[1];
   2286     z2 = (INT32) wsptr[3];
   2287     z4 = (INT32) wsptr[5];
   2288     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
   2289     z4 = (INT32) wsptr[7];
   2290 
   2291     tmp13 = z2 - z4;
   2292     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
   2293     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
   2294     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
   2295 
   2296     tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
   2297     tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
   2298     z2 = z1 - z4;
   2299     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
   2300 
   2301     tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
   2302     tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
   2303     tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
   2304     z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
   2305     tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
   2306     tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
   2307 
   2308     /* Final output stage */
   2309 
   2310     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
   2311 					       CONST_BITS+PASS1_BITS+3)
   2312 			     & RANGE_MASK];
   2313     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
   2314 					       CONST_BITS+PASS1_BITS+3)
   2315 			     & RANGE_MASK];
   2316     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
   2317 					       CONST_BITS+PASS1_BITS+3)
   2318 			     & RANGE_MASK];
   2319     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
   2320 					       CONST_BITS+PASS1_BITS+3)
   2321 			     & RANGE_MASK];
   2322     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
   2323 					       CONST_BITS+PASS1_BITS+3)
   2324 			     & RANGE_MASK];
   2325     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
   2326 					       CONST_BITS+PASS1_BITS+3)
   2327 			     & RANGE_MASK];
   2328     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
   2329 					       CONST_BITS+PASS1_BITS+3)
   2330 			     & RANGE_MASK];
   2331     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
   2332 					       CONST_BITS+PASS1_BITS+3)
   2333 			     & RANGE_MASK];
   2334     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
   2335 					       CONST_BITS+PASS1_BITS+3)
   2336 			     & RANGE_MASK];
   2337     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
   2338 					       CONST_BITS+PASS1_BITS+3)
   2339 			     & RANGE_MASK];
   2340     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
   2341 					       CONST_BITS+PASS1_BITS+3)
   2342 			     & RANGE_MASK];
   2343     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
   2344 					       CONST_BITS+PASS1_BITS+3)
   2345 			     & RANGE_MASK];
   2346     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
   2347 					       CONST_BITS+PASS1_BITS+3)
   2348 			     & RANGE_MASK];
   2349     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
   2350 					       CONST_BITS+PASS1_BITS+3)
   2351 			     & RANGE_MASK];
   2352     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
   2353 					       CONST_BITS+PASS1_BITS+3)
   2354 			     & RANGE_MASK];
   2355 
   2356     wsptr += 8;		/* advance pointer to next row */
   2357   }
   2358 }
   2359 
   2360 
   2361 /*
   2362  * Perform dequantization and inverse DCT on one block of coefficients,
   2363  * producing a 16x16 output block.
   2364  *
   2365  * Optimized algorithm with 28 multiplications in the 1-D kernel.
   2366  * cK represents sqrt(2) * cos(K*pi/32).
   2367  */
   2368 
   2369 GLOBAL(void)
   2370 jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2371 		 JCOEFPTR coef_block,
   2372 		 JSAMPARRAY output_buf, JDIMENSION output_col)
   2373 {
   2374   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   2375   INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2376   INT32 z1, z2, z3, z4;
   2377   JCOEFPTR inptr;
   2378   ISLOW_MULT_TYPE * quantptr;
   2379   int * wsptr;
   2380   JSAMPROW outptr;
   2381   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2382   int ctr;
   2383   int workspace[8*16];	/* buffers data between passes */
   2384   SHIFT_TEMPS
   2385 
   2386   /* Pass 1: process columns from input, store into work array. */
   2387 
   2388   inptr = coef_block;
   2389   quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
   2390   wsptr = workspace;
   2391   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2392     /* Even part */
   2393 
   2394     tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
   2395     tmp0 <<= CONST_BITS;
   2396     /* Add fudge factor here for final descale. */
   2397     tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
   2398 
   2399     z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
   2400     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2401     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2402 
   2403     tmp10 = tmp0 + tmp1;
   2404     tmp11 = tmp0 - tmp1;
   2405     tmp12 = tmp0 + tmp2;
   2406     tmp13 = tmp0 - tmp2;
   2407 
   2408     z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
   2409     z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
   2410     z3 = z1 - z2;
   2411     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2412     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2413 
   2414     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2415     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2416     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2417     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2418 
   2419     tmp20 = tmp10 + tmp0;
   2420     tmp27 = tmp10 - tmp0;
   2421     tmp21 = tmp12 + tmp1;
   2422     tmp26 = tmp12 - tmp1;
   2423     tmp22 = tmp13 + tmp2;
   2424     tmp25 = tmp13 - tmp2;
   2425     tmp23 = tmp11 + tmp3;
   2426     tmp24 = tmp11 - tmp3;
   2427 
   2428     /* Odd part */
   2429 
   2430     z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
   2431     z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
   2432     z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
   2433     z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
   2434 
   2435     tmp11 = z1 + z3;
   2436 
   2437     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   2438     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   2439     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   2440     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   2441     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   2442     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   2443     tmp0  = tmp1 + tmp2 + tmp3 -
   2444 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   2445     tmp13 = tmp10 + tmp11 + tmp12 -
   2446 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   2447     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   2448     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   2449     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   2450     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   2451     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   2452     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   2453     z2    += z4;
   2454     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
   2455     tmp1  += z1;
   2456     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   2457     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
   2458     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   2459     tmp12 += z2;
   2460     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
   2461     tmp2  += z2;
   2462     tmp3  += z2;
   2463     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   2464     tmp10 += z2;
   2465     tmp11 += z2;
   2466 
   2467     /* Final output stage */
   2468 
   2469     wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
   2470     wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
   2471     wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
   2472     wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
   2473     wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
   2474     wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
   2475     wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
   2476     wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
   2477     wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
   2478     wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
   2479     wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
   2480     wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
   2481     wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
   2482     wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
   2483     wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
   2484     wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
   2485   }
   2486 
   2487   /* Pass 2: process 16 rows from work array, store into output array. */
   2488 
   2489   wsptr = workspace;
   2490   for (ctr = 0; ctr < 16; ctr++) {
   2491     outptr = output_buf[ctr] + output_col;
   2492 
   2493     /* Even part */
   2494 
   2495     /* Add fudge factor here for final descale. */
   2496     tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
   2497     tmp0 <<= CONST_BITS;
   2498 
   2499     z1 = (INT32) wsptr[4];
   2500     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2501     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2502 
   2503     tmp10 = tmp0 + tmp1;
   2504     tmp11 = tmp0 - tmp1;
   2505     tmp12 = tmp0 + tmp2;
   2506     tmp13 = tmp0 - tmp2;
   2507 
   2508     z1 = (INT32) wsptr[2];
   2509     z2 = (INT32) wsptr[6];
   2510     z3 = z1 - z2;
   2511     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2512     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2513 
   2514     tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2515     tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2516     tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2517     tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2518 
   2519     tmp20 = tmp10 + tmp0;
   2520     tmp27 = tmp10 - tmp0;
   2521     tmp21 = tmp12 + tmp1;
   2522     tmp26 = tmp12 - tmp1;
   2523     tmp22 = tmp13 + tmp2;
   2524     tmp25 = tmp13 - tmp2;
   2525     tmp23 = tmp11 + tmp3;
   2526     tmp24 = tmp11 - tmp3;
   2527 
   2528     /* Odd part */
   2529 
   2530     z1 = (INT32) wsptr[1];
   2531     z2 = (INT32) wsptr[3];
   2532     z3 = (INT32) wsptr[5];
   2533     z4 = (INT32) wsptr[7];
   2534 
   2535     tmp11 = z1 + z3;
   2536 
   2537     tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   2538     tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   2539     tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   2540     tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   2541     tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   2542     tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   2543     tmp0  = tmp1 + tmp2 + tmp3 -
   2544 	    MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   2545     tmp13 = tmp10 + tmp11 + tmp12 -
   2546 	    MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   2547     z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   2548     tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   2549     tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   2550     z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   2551     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   2552     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   2553     z2    += z4;
   2554     z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
   2555     tmp1  += z1;
   2556     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   2557     z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
   2558     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   2559     tmp12 += z2;
   2560     z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
   2561     tmp2  += z2;
   2562     tmp3  += z2;
   2563     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   2564     tmp10 += z2;
   2565     tmp11 += z2;
   2566 
   2567     /* Final output stage */
   2568 
   2569     outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
   2570 					       CONST_BITS+PASS1_BITS+3)
   2571 			     & RANGE_MASK];
   2572     outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
   2573 					       CONST_BITS+PASS1_BITS+3)
   2574 			     & RANGE_MASK];
   2575     outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
   2576 					       CONST_BITS+PASS1_BITS+3)
   2577 			     & RANGE_MASK];
   2578     outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
   2579 					       CONST_BITS+PASS1_BITS+3)
   2580 			     & RANGE_MASK];
   2581     outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
   2582 					       CONST_BITS+PASS1_BITS+3)
   2583 			     & RANGE_MASK];
   2584     outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
   2585 					       CONST_BITS+PASS1_BITS+3)
   2586 			     & RANGE_MASK];
   2587     outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
   2588 					       CONST_BITS+PASS1_BITS+3)
   2589 			     & RANGE_MASK];
   2590     outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
   2591 					       CONST_BITS+PASS1_BITS+3)
   2592 			     & RANGE_MASK];
   2593     outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
   2594 					       CONST_BITS+PASS1_BITS+3)
   2595 			     & RANGE_MASK];
   2596     outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
   2597 					       CONST_BITS+PASS1_BITS+3)
   2598 			     & RANGE_MASK];
   2599     outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
   2600 					       CONST_BITS+PASS1_BITS+3)
   2601 			     & RANGE_MASK];
   2602     outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
   2603 					       CONST_BITS+PASS1_BITS+3)
   2604 			     & RANGE_MASK];
   2605     outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
   2606 					       CONST_BITS+PASS1_BITS+3)
   2607 			     & RANGE_MASK];
   2608     outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
   2609 					       CONST_BITS+PASS1_BITS+3)
   2610 			     & RANGE_MASK];
   2611     outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
   2612 					       CONST_BITS+PASS1_BITS+3)
   2613 			     & RANGE_MASK];
   2614     outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
   2615 					       CONST_BITS+PASS1_BITS+3)
   2616 			     & RANGE_MASK];
   2617 
   2618     wsptr += 8;		/* advance pointer to next row */
   2619   }
   2620 }
   2621 
   2622 #endif /* IDCT_SCALING_SUPPORTED */
   2623 #endif /* DCT_ISLOW_SUPPORTED */
   2624