Home | History | Annotate | Download | only in libjpeg_turbo
      1 /*
      2  * jcdctmgr.c
      3  *
      4  * This file was part of the Independent JPEG Group's software:
      5  * Copyright (C) 1994-1996, Thomas G. Lane.
      6  * libjpeg-turbo Modifications:
      7  * Copyright (C) 1999-2006, MIYASAKA Masaru.
      8  * Copyright 2009 Pierre Ossman <ossman (at) cendio.se> for Cendio AB
      9  * Copyright (C) 2011 D. R. Commander
     10  * For conditions of distribution and use, see the accompanying README file.
     11  *
     12  * This file contains the forward-DCT management logic.
     13  * This code selects a particular DCT implementation to be used,
     14  * and it performs related housekeeping chores including coefficient
     15  * quantization.
     16  */
     17 
     18 #define JPEG_INTERNALS
     19 #include "jinclude.h"
     20 #include "jpeglib.h"
     21 #include "jdct.h"		/* Private declarations for DCT subsystem */
     22 #include "jsimddct.h"
     23 
     24 
     25 /* Private subobject for this module */
     26 
/*
 * Method-pointer types for the three stages that forward_DCT() and
 * forward_DCT_float() run on every block: sample conversion, the DCT
 * itself, and quantization.  Each stage exists in an integer (DCTELEM)
 * and a floating-point (FAST_FLOAT) flavor.
 */

/* The DCT proper: transforms the block held in `data`; the callers in this
 * file pass the same buffer on to the quantize stage afterwards.
 */
typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));

/* Sample conversion: loads one block of samples, starting at column
 * `start_col` of the rows in `sample_data`, into `workspace`.
 */
typedef JMETHOD(void, convsamp_method_ptr,
                (JSAMPARRAY sample_data, JDIMENSION start_col,
                 DCTELEM * workspace));
typedef JMETHOD(void, float_convsamp_method_ptr,
                (JSAMPARRAY sample_data, JDIMENSION start_col,
                 FAST_FLOAT *workspace));

/* Quantization: scales the coefficients in `workspace` using the
 * precomputed `divisors` table and stores the results in `coef_block`.
 */
typedef JMETHOD(void, quantize_method_ptr,
                (JCOEFPTR coef_block, DCTELEM * divisors,
                 DCTELEM * workspace));
typedef JMETHOD(void, float_quantize_method_ptr,
                (JCOEFPTR coef_block, FAST_FLOAT * divisors,
                 FAST_FLOAT * workspace));

/* Declared ahead of its definition because start_pass_fdctmgr() refers to
 * it when falling back from jsimd_quantize.
 */
METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
     45 
/*
 * Private state of the forward-DCT manager.  jinit_forward_dct() allocates
 * one of these and installs it as cinfo->fdct, so the public part has to
 * stay the first member.
 */
typedef struct {
  struct jpeg_forward_dct pub;	/* public fields */

  /* Pointer to the DCT routine actually in use */
  forward_DCT_method_ptr dct;
  /* Sample-conversion and quantization routines in use; each is either
   * the jsimd_* version or the C version defined in this file.
   */
  convsamp_method_ptr convsamp;
  quantize_method_ptr quantize;

  /* The actual post-DCT divisors --- not identical to the quant table
   * entries, because of scaling (especially for an unnormalized DCT).
   * Each table is given in normal array order.
   * An entry stays NULL until start_pass_fdctmgr() first needs that table;
   * each allocated entry holds four consecutive DCTSIZE2-element
   * sub-tables filled in by compute_reciprocal().
   */
  DCTELEM * divisors[NUM_QUANT_TBLS];

  /* work area for FDCT subroutine */
  DCTELEM * workspace;

#ifdef DCT_FLOAT_SUPPORTED
  /* Same as above for the floating-point case. */
  float_DCT_method_ptr float_dct;
  float_convsamp_method_ptr float_convsamp;
  float_quantize_method_ptr float_quantize;
  /* One DCTSIZE2-element table of reciprocal divisors per quant table */
  FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
  FAST_FLOAT * float_workspace;
#endif
} my_fdct_controller;

/* Convenience pointer type used when casting cinfo->fdct back */
typedef my_fdct_controller * my_fdct_ptr;
     74 
     75 
     76 /*
     77  * Find the highest bit in an integer through binary search.
     78  */
     79 LOCAL(int)
     80 flss (UINT16 val)
     81 {
     82   int bit;
     83 
     84   bit = 16;
     85 
     86   if (!val)
     87     return 0;
     88 
     89   if (!(val & 0xff00)) {
     90     bit -= 8;
     91     val <<= 8;
     92   }
     93   if (!(val & 0xf000)) {
     94     bit -= 4;
     95     val <<= 4;
     96   }
     97   if (!(val & 0xc000)) {
     98     bit -= 2;
     99     val <<= 2;
    100   }
    101   if (!(val & 0x8000)) {
    102     bit -= 1;
    103     val <<= 1;
    104   }
    105 
    106   return bit;
    107 }
    108 
    109 /*
    110  * Compute values to do a division using reciprocal.
    111  *
    112  * This implementation is based on an algorithm described in
    113  *   "How to optimize for the Pentium family of microprocessors"
    114  *   (http://www.agner.org/assem/).
    115  * More information about the basic algorithm can be found in
    116  * the paper "Integer Division Using Reciprocals" by Robert Alverson.
    117  *
    118  * The basic idea is to replace x/d by x * d^-1. In order to store
    119  * d^-1 with enough precision we shift it left a few places. It turns
    120  * out that this algoright gives just enough precision, and also fits
    121  * into DCTELEM:
    122  *
    123  *   b = (the number of significant bits in divisor) - 1
    124  *   r = (word size) + b
    125  *   f = 2^r / divisor
    126  *
    127  * f will not be an integer for most cases, so we need to compensate
    128  * for the rounding error introduced:
    129  *
    130  *   no fractional part:
    131  *
    132  *       result = input >> r
    133  *
    134  *   fractional part of f < 0.5:
    135  *
    136  *       round f down to nearest integer
    137  *       result = ((input + 1) * f) >> r
    138  *
    139  *   fractional part of f > 0.5:
    140  *
    141  *       round f up to nearest integer
    142  *       result = (input * f) >> r
    143  *
    144  * This is the original algorithm that gives truncated results. But we
    145  * want properly rounded results, so we replace "input" with
    146  * "input + divisor/2".
    147  *
    148  * In order to allow SIMD implementations we also tweak the values to
    149  * allow the same calculation to be made at all times:
    150  *
    151  *   dctbl[0] = f rounded to nearest integer
    152  *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
    153  *   dctbl[2] = 1 << ((word size) * 2 - r)
    154  *   dctbl[3] = r - (word size)
    155  *
    156  * dctbl[2] is for stupid instruction sets where the shift operation
    157  * isn't member wise (e.g. MMX).
    158  *
    159  * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
    160  * is that most SIMD implementations have a "multiply and store top
    161  * half" operation.
    162  *
    163  * Lastly, we store each of the values in their own table instead
    164  * of in a consecutive manner, yet again in order to allow SIMD
    165  * routines.
    166  */
    167 LOCAL(int)
    168 compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
    169 {
    170   UDCTELEM2 fq, fr;
    171   UDCTELEM c;
    172   int b, r;
    173 
    174   b = flss(divisor) - 1;
    175   r  = sizeof(DCTELEM) * 8 + b;
    176 
    177   fq = ((UDCTELEM2)1 << r) / divisor;
    178   fr = ((UDCTELEM2)1 << r) % divisor;
    179 
    180   c = divisor / 2; /* for rounding */
    181 
    182   if (fr == 0) { /* divisor is power of two */
    183     /* fq will be one bit too large to fit in DCTELEM, so adjust */
    184     fq >>= 1;
    185     r--;
    186   } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
    187     c++;
    188   } else { /* fractional part is > 0.5 */
    189     fq++;
    190   }
    191 
    192   dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
    193   dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
    194   dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
    195   dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
    196 
    197   if(r <= 16) return 0;
    198   else return 1;
    199 }
    200 
    201 /*
    202  * Initialize for a processing pass.
    203  * Verify that all referenced Q-tables are present, and set up
    204  * the divisor table for each one.
    205  * In the current implementation, DCT of all components is done during
    206  * the first pass, even if only some components will be output in the
    207  * first scan.  Hence all components should be examined here.
    208  */
    209 
    210 METHODDEF(void)
    211 start_pass_fdctmgr (j_compress_ptr cinfo)
    212 {
    213   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
    214   int ci, qtblno, i;
    215   jpeg_component_info *compptr;
    216   JQUANT_TBL * qtbl;
    217   DCTELEM * dtbl;
    218 
    219   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
    220        ci++, compptr++) {
    221     qtblno = compptr->quant_tbl_no;
    222     /* Make sure specified quantization table is present */
    223     if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
    224 	cinfo->quant_tbl_ptrs[qtblno] == NULL)
    225       ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
    226     qtbl = cinfo->quant_tbl_ptrs[qtblno];
    227     /* Compute divisors for this quant table */
    228     /* We may do this more than once for same table, but it's not a big deal */
    229     switch (cinfo->dct_method) {
    230 #ifdef DCT_ISLOW_SUPPORTED
    231     case JDCT_ISLOW:
    232       /* For LL&M IDCT method, divisors are equal to raw quantization
    233        * coefficients multiplied by 8 (to counteract scaling).
    234        */
    235       if (fdct->divisors[qtblno] == NULL) {
    236 	fdct->divisors[qtblno] = (DCTELEM *)
    237 	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
    238 				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
    239       }
    240       dtbl = fdct->divisors[qtblno];
    241       for (i = 0; i < DCTSIZE2; i++) {
    242 	if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
    243 	  && fdct->quantize == jsimd_quantize)
    244 	  fdct->quantize = quantize;
    245       }
    246       break;
    247 #endif
    248 #ifdef DCT_IFAST_SUPPORTED
    249     case JDCT_IFAST:
    250       {
    251 	/* For AA&N IDCT method, divisors are equal to quantization
    252 	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
    253 	 *   scalefactor[0] = 1
    254 	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
    255 	 * We apply a further scale factor of 8.
    256 	 */
    257 #define CONST_BITS 14
    258 	static const INT16 aanscales[DCTSIZE2] = {
    259 	  /* precomputed values scaled up by 14 bits */
    260 	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
    261 	  22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
    262 	  21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
    263 	  19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
    264 	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
    265 	  12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
    266 	   8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
    267 	   4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
    268 	};
    269 	SHIFT_TEMPS
    270 
    271 	if (fdct->divisors[qtblno] == NULL) {
    272 	  fdct->divisors[qtblno] = (DCTELEM *)
    273 	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
    274 					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
    275 	}
    276 	dtbl = fdct->divisors[qtblno];
    277 	for (i = 0; i < DCTSIZE2; i++) {
    278 	  if(!compute_reciprocal(
    279 	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
    280 				  (INT32) aanscales[i]),
    281 		    CONST_BITS-3), &dtbl[i])
    282 	    && fdct->quantize == jsimd_quantize)
    283 	    fdct->quantize = quantize;
    284 	}
    285       }
    286       break;
    287 #endif
    288 #ifdef DCT_FLOAT_SUPPORTED
    289     case JDCT_FLOAT:
    290       {
    291 	/* For float AA&N IDCT method, divisors are equal to quantization
    292 	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
    293 	 *   scalefactor[0] = 1
    294 	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
    295 	 * We apply a further scale factor of 8.
    296 	 * What's actually stored is 1/divisor so that the inner loop can
    297 	 * use a multiplication rather than a division.
    298 	 */
    299 	FAST_FLOAT * fdtbl;
    300 	int row, col;
    301 	static const double aanscalefactor[DCTSIZE] = {
    302 	  1.0, 1.387039845, 1.306562965, 1.175875602,
    303 	  1.0, 0.785694958, 0.541196100, 0.275899379
    304 	};
    305 
    306 	if (fdct->float_divisors[qtblno] == NULL) {
    307 	  fdct->float_divisors[qtblno] = (FAST_FLOAT *)
    308 	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
    309 					DCTSIZE2 * SIZEOF(FAST_FLOAT));
    310 	}
    311 	fdtbl = fdct->float_divisors[qtblno];
    312 	i = 0;
    313 	for (row = 0; row < DCTSIZE; row++) {
    314 	  for (col = 0; col < DCTSIZE; col++) {
    315 	    fdtbl[i] = (FAST_FLOAT)
    316 	      (1.0 / (((double) qtbl->quantval[i] *
    317 		       aanscalefactor[row] * aanscalefactor[col] * 8.0)));
    318 	    i++;
    319 	  }
    320 	}
    321       }
    322       break;
    323 #endif
    324     default:
    325       ERREXIT(cinfo, JERR_NOT_COMPILED);
    326       break;
    327     }
    328   }
    329 }
    330 
    331 
    332 /*
    333  * Load data into workspace, applying unsigned->signed conversion.
    334  */
    335 
    336 METHODDEF(void)
    337 convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
    338 {
    339   register DCTELEM *workspaceptr;
    340   register JSAMPROW elemptr;
    341   register int elemr;
    342 
    343   workspaceptr = workspace;
    344   for (elemr = 0; elemr < DCTSIZE; elemr++) {
    345     elemptr = sample_data[elemr] + start_col;
    346 
    347 #if DCTSIZE == 8		/* unroll the inner loop */
    348     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    349     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    350     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    351     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    352     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    353     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    354     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    355     *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    356 #else
    357     {
    358       register int elemc;
    359       for (elemc = DCTSIZE; elemc > 0; elemc--)
    360         *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
    361     }
    362 #endif
    363   }
    364 }
    365 
    366 
    367 /*
    368  * Quantize/descale the coefficients, and store into coef_blocks[].
    369  */
    370 
    371 METHODDEF(void)
    372 quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
    373 {
    374   int i;
    375   DCTELEM temp;
    376   UDCTELEM recip, corr, shift;
    377   UDCTELEM2 product;
    378   JCOEFPTR output_ptr = coef_block;
    379 
    380   for (i = 0; i < DCTSIZE2; i++) {
    381     temp = workspace[i];
    382     recip = divisors[i + DCTSIZE2 * 0];
    383     corr =  divisors[i + DCTSIZE2 * 1];
    384     shift = divisors[i + DCTSIZE2 * 3];
    385 
    386     if (temp < 0) {
    387       temp = -temp;
    388       product = (UDCTELEM2)(temp + corr) * recip;
    389       product >>= shift + sizeof(DCTELEM)*8;
    390       temp = product;
    391       temp = -temp;
    392     } else {
    393       product = (UDCTELEM2)(temp + corr) * recip;
    394       product >>= shift + sizeof(DCTELEM)*8;
    395       temp = product;
    396     }
    397 
    398     output_ptr[i] = (JCOEF) temp;
    399   }
    400 }
    401 
    402 
    403 /*
    404  * Perform forward DCT on one or more blocks of a component.
    405  *
    406  * The input samples are taken from the sample_data[] array starting at
    407  * position start_row/start_col, and moving to the right for any additional
    408  * blocks. The quantized coefficients are returned in coef_blocks[].
    409  */
    410 
    411 METHODDEF(void)
    412 forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
    413 	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
    414 	     JDIMENSION start_row, JDIMENSION start_col,
    415 	     JDIMENSION num_blocks)
    416 /* This version is used for integer DCT implementations. */
    417 {
    418   /* This routine is heavily used, so it's worth coding it tightly. */
    419   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
    420   DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
    421   DCTELEM * workspace;
    422   JDIMENSION bi;
    423 
    424   /* Make sure the compiler doesn't look up these every pass */
    425   forward_DCT_method_ptr do_dct = fdct->dct;
    426   convsamp_method_ptr do_convsamp = fdct->convsamp;
    427   quantize_method_ptr do_quantize = fdct->quantize;
    428   workspace = fdct->workspace;
    429 
    430   sample_data += start_row;	/* fold in the vertical offset once */
    431 
    432   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
    433     /* Load data into workspace, applying unsigned->signed conversion */
    434     (*do_convsamp) (sample_data, start_col, workspace);
    435 
    436     /* Perform the DCT */
    437     (*do_dct) (workspace);
    438 
    439     /* Quantize/descale the coefficients, and store into coef_blocks[] */
    440     (*do_quantize) (coef_blocks[bi], divisors, workspace);
    441   }
    442 }
    443 
    444 
    445 #ifdef DCT_FLOAT_SUPPORTED
    446 
    447 
    448 METHODDEF(void)
    449 convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
    450 {
    451   register FAST_FLOAT *workspaceptr;
    452   register JSAMPROW elemptr;
    453   register int elemr;
    454 
    455   workspaceptr = workspace;
    456   for (elemr = 0; elemr < DCTSIZE; elemr++) {
    457     elemptr = sample_data[elemr] + start_col;
    458 #if DCTSIZE == 8		/* unroll the inner loop */
    459     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    460     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    461     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    462     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    463     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    464     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    465     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    466     *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    467 #else
    468     {
    469       register int elemc;
    470       for (elemc = DCTSIZE; elemc > 0; elemc--)
    471         *workspaceptr++ = (FAST_FLOAT)
    472                           (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
    473     }
    474 #endif
    475   }
    476 }
    477 
    478 
    479 METHODDEF(void)
    480 quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
    481 {
    482   register FAST_FLOAT temp;
    483   register int i;
    484   register JCOEFPTR output_ptr = coef_block;
    485 
    486   for (i = 0; i < DCTSIZE2; i++) {
    487     /* Apply the quantization and scaling factor */
    488     temp = workspace[i] * divisors[i];
    489 
    490     /* Round to nearest integer.
    491      * Since C does not specify the direction of rounding for negative
    492      * quotients, we have to force the dividend positive for portability.
    493      * The maximum coefficient size is +-16K (for 12-bit data), so this
    494      * code should work for either 16-bit or 32-bit ints.
    495      */
    496     output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
    497   }
    498 }
    499 
    500 
    501 METHODDEF(void)
    502 forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
    503 		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
    504 		   JDIMENSION start_row, JDIMENSION start_col,
    505 		   JDIMENSION num_blocks)
    506 /* This version is used for floating-point DCT implementations. */
    507 {
    508   /* This routine is heavily used, so it's worth coding it tightly. */
    509   my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
    510   FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
    511   FAST_FLOAT * workspace;
    512   JDIMENSION bi;
    513 
    514 
    515   /* Make sure the compiler doesn't look up these every pass */
    516   float_DCT_method_ptr do_dct = fdct->float_dct;
    517   float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
    518   float_quantize_method_ptr do_quantize = fdct->float_quantize;
    519   workspace = fdct->float_workspace;
    520 
    521   sample_data += start_row;	/* fold in the vertical offset once */
    522 
    523   for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
    524     /* Load data into workspace, applying unsigned->signed conversion */
    525     (*do_convsamp) (sample_data, start_col, workspace);
    526 
    527     /* Perform the DCT */
    528     (*do_dct) (workspace);
    529 
    530     /* Quantize/descale the coefficients, and store into coef_blocks[] */
    531     (*do_quantize) (coef_blocks[bi], divisors, workspace);
    532   }
    533 }
    534 
    535 #endif /* DCT_FLOAT_SUPPORTED */
    536 
    537 
    538 /*
    539  * Initialize FDCT manager.
    540  */
    541 
    542 GLOBAL(void)
    543 jinit_forward_dct (j_compress_ptr cinfo)
    544 {
    545   my_fdct_ptr fdct;
    546   int i;
    547 
    548   fdct = (my_fdct_ptr)
    549     (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
    550 				SIZEOF(my_fdct_controller));
    551   cinfo->fdct = (struct jpeg_forward_dct *) fdct;
    552   fdct->pub.start_pass = start_pass_fdctmgr;
    553 
    554   /* First determine the DCT... */
    555   switch (cinfo->dct_method) {
    556 #ifdef DCT_ISLOW_SUPPORTED
    557   case JDCT_ISLOW:
    558     fdct->pub.forward_DCT = forward_DCT;
    559     if (jsimd_can_fdct_islow())
    560       fdct->dct = jsimd_fdct_islow;
    561     else
    562       fdct->dct = jpeg_fdct_islow;
    563     break;
    564 #endif
    565 #ifdef DCT_IFAST_SUPPORTED
    566   case JDCT_IFAST:
    567     fdct->pub.forward_DCT = forward_DCT;
    568     if (jsimd_can_fdct_ifast())
    569       fdct->dct = jsimd_fdct_ifast;
    570     else
    571       fdct->dct = jpeg_fdct_ifast;
    572     break;
    573 #endif
    574 #ifdef DCT_FLOAT_SUPPORTED
    575   case JDCT_FLOAT:
    576     fdct->pub.forward_DCT = forward_DCT_float;
    577     if (jsimd_can_fdct_float())
    578       fdct->float_dct = jsimd_fdct_float;
    579     else
    580       fdct->float_dct = jpeg_fdct_float;
    581     break;
    582 #endif
    583   default:
    584     ERREXIT(cinfo, JERR_NOT_COMPILED);
    585     break;
    586   }
    587 
    588   /* ...then the supporting stages. */
    589   switch (cinfo->dct_method) {
    590 #ifdef DCT_ISLOW_SUPPORTED
    591   case JDCT_ISLOW:
    592 #endif
    593 #ifdef DCT_IFAST_SUPPORTED
    594   case JDCT_IFAST:
    595 #endif
    596 #if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
    597     if (jsimd_can_convsamp())
    598       fdct->convsamp = jsimd_convsamp;
    599     else
    600       fdct->convsamp = convsamp;
    601     if (jsimd_can_quantize())
    602       fdct->quantize = jsimd_quantize;
    603     else
    604       fdct->quantize = quantize;
    605     break;
    606 #endif
    607 #ifdef DCT_FLOAT_SUPPORTED
    608   case JDCT_FLOAT:
    609     if (jsimd_can_convsamp_float())
    610       fdct->float_convsamp = jsimd_convsamp_float;
    611     else
    612       fdct->float_convsamp = convsamp_float;
    613     if (jsimd_can_quantize_float())
    614       fdct->float_quantize = jsimd_quantize_float;
    615     else
    616       fdct->float_quantize = quantize_float;
    617     break;
    618 #endif
    619   default:
    620     ERREXIT(cinfo, JERR_NOT_COMPILED);
    621     break;
    622   }
    623 
    624   /* Allocate workspace memory */
    625 #ifdef DCT_FLOAT_SUPPORTED
    626   if (cinfo->dct_method == JDCT_FLOAT)
    627     fdct->float_workspace = (FAST_FLOAT *)
    628       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
    629 				  SIZEOF(FAST_FLOAT) * DCTSIZE2);
    630   else
    631 #endif
    632     fdct->workspace = (DCTELEM *)
    633       (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
    634 				  SIZEOF(DCTELEM) * DCTSIZE2);
    635 
    636   /* Mark divisor tables unallocated */
    637   for (i = 0; i < NUM_QUANT_TBLS; i++) {
    638     fdct->divisors[i] = NULL;
    639 #ifdef DCT_FLOAT_SUPPORTED
    640     fdct->float_divisors[i] = NULL;
    641 #endif
    642   }
    643 }
    644