Home | History | Annotate | Download | only in decoder
      1 /******************************************************************************
      2  *                                                                            *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 #include <stdlib.h>
     21 #include <stdio.h>
     22 
     23 #include <ixheaacd_type_def.h>
     24 #include "ixheaacd_interface.h"
     25 #include "ixheaacd_constants.h"
     26 #include <ixheaacd_basic_ops32.h>
     27 #include "ixheaacd_function_selector.h"
     28 
     29 extern const WORD32 ixheaacd_twiddle_table_fft_32x32[514];
     30 extern const WORD32 ixheaacd_twiddle_table_3pr[1155];
     31 extern const WORD32 ixheaacd_twiddle_table_3pi[1155];
     32 extern const WORD8 ixheaacd_mps_dig_rev[16];
     33 
     34 #define PLATFORM_INLINE __inline
     35 
     36 #define DIG_REV(i, m, j)                                    \
     37   do {                                                      \
     38     unsigned _ = (i);                                       \
     39     _ = ((_ & 0x33333333) << 2) | ((_ & ~0x33333333) >> 2); \
     40     _ = ((_ & 0x0F0F0F0F) << 4) | ((_ & ~0x0F0F0F0F) >> 4); \
     41     _ = ((_ & 0x00FF00FF) << 8) | ((_ & ~0x00FF00FF) >> 8); \
     42     (j) = _ >> (m);                                         \
     43   } while (0)
     44 
     45 static PLATFORM_INLINE WORD32 ixheaacd_mult32(WORD32 a, WORD32 b) {
     46   WORD32 result;
     47   WORD64 temp_result;
     48 
     49   temp_result = (WORD64)a * (WORD64)b;
     50   result = (WORD32)(temp_result >> 31);
     51 
     52   return (result);
     53 }
     54 
     55 static PLATFORM_INLINE WORD32 ixheaacd_mac32(WORD32 a, WORD32 b, WORD32 c) {
     56   WORD32 result;
     57 
     58   result = a + ixheaacd_mult32(b, c);
     59 
     60   return (result);
     61 }
     62 
     63 static PLATFORM_INLINE WORD32 ixheaacd_mult32_shl(WORD32 a, WORD32 b) {
     64   WORD32 result;
     65   WORD64 temp_result;
     66 
     67   temp_result = (WORD64)a * (WORD64)b;
     68   result = (WORD32)(temp_result >> 32);
     69 
     70   return (result << 1);
     71 }
     72 
     73 VOID ixheaacd_mps_complex_fft_64_dec(WORD32 *ptr_x, WORD32 *fin_re,
     74                                      WORD32 *fin_im, WORD32 nlength) {
     75   WORD32 i, j, k, n_stages;
     76   WORD32 h2, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
     77   WORD32 del, nodespacing, in_loop_cnt;
     78   WORD32 y[128];
     79   WORD32 npoints = nlength;
     80   WORD32 *ptr_y = y;
     81   const WORD32 *ptr_w;
     82   n_stages = 30 - ixheaacd_norm32(npoints);
     83 
     84   n_stages = n_stages >> 1;
     85 
     86   ptr_w = ixheaacd_twiddle_table_fft_32x32;
     87 
     88   for (i = 0; i < npoints; i += 4) {
     89     WORD32 *inp = ptr_x;
     90     h2 = ixheaacd_mps_dig_rev[i >> 2];
     91     inp += (h2);
     92 
     93     x0r = *inp;
     94     x0i = *(inp + 1);
     95     inp += (npoints >> 1);
     96 
     97     x1r = *inp;
     98     x1i = *(inp + 1);
     99     inp += (npoints >> 1);
    100 
    101     x2r = *inp;
    102     x2i = *(inp + 1);
    103     inp += (npoints >> 1);
    104 
    105     x3r = *inp;
    106     x3i = *(inp + 1);
    107 
    108     x0r = x0r + x2r;
    109     x0i = x0i + x2i;
    110     x2r = x0r - (x2r << 1);
    111     x2i = x0i - (x2i << 1);
    112     x1r = x1r + x3r;
    113     x1i = x1i + x3i;
    114     x3r = x1r - (x3r << 1);
    115     x3i = x1i - (x3i << 1);
    116 
    117     x0r = x0r + x1r;
    118     x0i = x0i + x1i;
    119     x1r = x0r - (x1r << 1);
    120     x1i = x0i - (x1i << 1);
    121     x2r = x2r + x3i;
    122     x2i = x2i - x3r;
    123     x3i = x2r - (x3i << 1);
    124     x3r = x2i + (x3r << 1);
    125 
    126     *ptr_y++ = x0r;
    127     *ptr_y++ = x0i;
    128     *ptr_y++ = x2r;
    129     *ptr_y++ = x2i;
    130     *ptr_y++ = x1r;
    131     *ptr_y++ = x1i;
    132     *ptr_y++ = x3i;
    133     *ptr_y++ = x3r;
    134   }
    135   ptr_y -= 2 * npoints;
    136   del = 4;
    137   nodespacing = 64;
    138   in_loop_cnt = npoints >> 4;
    139   for (i = n_stages - 1; i > 0; i--) {
    140     const WORD32 *twiddles = ptr_w;
    141     WORD32 *data = ptr_y;
    142     WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
    143     WORD32 sec_loop_cnt;
    144 
    145     for (k = in_loop_cnt; k != 0; k--) {
    146       x0r = (*data);
    147       x0i = (*(data + 1));
    148       data += (del << 1);
    149 
    150       x1r = (*data);
    151       x1i = (*(data + 1));
    152       data += (del << 1);
    153 
    154       x2r = (*data);
    155       x2i = (*(data + 1));
    156       data += (del << 1);
    157 
    158       x3r = (*data);
    159       x3i = (*(data + 1));
    160       data -= 3 * (del << 1);
    161 
    162       x0r = x0r + x2r;
    163       x0i = x0i + x2i;
    164       x2r = x0r - (x2r << 1);
    165       x2i = x0i - (x2i << 1);
    166       x1r = x1r + x3r;
    167       x1i = x1i + x3i;
    168       x3r = x1r - (x3r << 1);
    169       x3i = x1i - (x3i << 1);
    170 
    171       x0r = x0r + x1r;
    172       x0i = x0i + x1i;
    173       x1r = x0r - (x1r << 1);
    174       x1i = x0i - (x1i << 1);
    175       x2r = x2r + x3i;
    176       x2i = x2i - x3r;
    177       x3i = x2r - (x3i << 1);
    178       x3r = x2i + (x3r << 1);
    179 
    180       *data = x0r;
    181       *(data + 1) = x0i;
    182       data += (del << 1);
    183 
    184       *data = x2r;
    185       *(data + 1) = x2i;
    186       data += (del << 1);
    187 
    188       *data = x1r;
    189       *(data + 1) = x1i;
    190       data += (del << 1);
    191 
    192       *data = x3i;
    193       *(data + 1) = x3r;
    194       data += (del << 1);
    195     }
    196     data = ptr_y + 2;
    197 
    198     sec_loop_cnt = (nodespacing * del);
    199     sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
    200                    (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
    201                    (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
    202                    (sec_loop_cnt / 256);
    203     j = nodespacing;
    204 
    205     for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
    206       w1h = *(twiddles + 2 * j);
    207       w1l = *(twiddles + 2 * j + 1);
    208       w2h = *(twiddles + 2 * (j << 1));
    209       w2l = *(twiddles + 2 * (j << 1) + 1);
    210       w3h = *(twiddles + 2 * j + 2 * (j << 1));
    211       w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
    212 
    213       for (k = in_loop_cnt; k != 0; k--) {
    214         WORD32 tmp;
    215         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    216 
    217         data += (del << 1);
    218 
    219         x1r = *data;
    220         x1i = *(data + 1);
    221         data += (del << 1);
    222 
    223         x2r = *data;
    224         x2i = *(data + 1);
    225         data += (del << 1);
    226 
    227         x3r = *data;
    228         x3i = *(data + 1);
    229         data -= 3 * (del << 1);
    230 
    231         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
    232         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
    233         x1r = tmp;
    234 
    235         tmp = (ixheaacd_mult32(x2r, w2l) - ixheaacd_mult32(x2i, w2h));
    236         x2i = ixheaacd_mac32(ixheaacd_mult32(x2r, w2h), x2i, w2l);
    237         x2r = tmp;
    238 
    239         tmp = (ixheaacd_mult32(x3r, w3l) - ixheaacd_mult32(x3i, w3h));
    240         x3i = ixheaacd_mac32(ixheaacd_mult32(x3r, w3h), x3i, w3l);
    241         x3r = tmp;
    242 
    243         x0r = (*data);
    244         x0i = (*(data + 1));
    245 
    246         x0r = x0r + (x2r);
    247         x0i = x0i + (x2i);
    248         x2r = x0r - (x2r << 1);
    249         x2i = x0i - (x2i << 1);
    250         x1r = x1r + x3r;
    251         x1i = x1i + x3i;
    252         x3r = x1r - (x3r << 1);
    253         x3i = x1i - (x3i << 1);
    254 
    255         x0r = x0r + (x1r);
    256         x0i = x0i + (x1i);
    257         x1r = x0r - (x1r << 1);
    258         x1i = x0i - (x1i << 1);
    259         x2r = x2r + (x3i);
    260         x2i = x2i - (x3r);
    261         x3i = x2r - (x3i << 1);
    262         x3r = x2i + (x3r << 1);
    263 
    264         *data = x0r;
    265         *(data + 1) = x0i;
    266         data += (del << 1);
    267 
    268         *data = x2r;
    269         *(data + 1) = x2i;
    270         data += (del << 1);
    271 
    272         *data = x1r;
    273         *(data + 1) = x1i;
    274         data += (del << 1);
    275 
    276         *data = x3i;
    277         *(data + 1) = x3r;
    278         data += (del << 1);
    279       }
    280       data -= 2 * npoints;
    281       data += 2;
    282     }
    283     for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
    284       w1h = *(twiddles + 2 * j);
    285       w2h = *(twiddles + 2 * (j << 1));
    286       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
    287       w1l = *(twiddles + 2 * j + 1);
    288       w2l = *(twiddles + 2 * (j << 1) + 1);
    289       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
    290 
    291       for (k = in_loop_cnt; k != 0; k--) {
    292         WORD32 tmp;
    293         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    294 
    295         data += (del << 1);
    296 
    297         x1r = *data;
    298         x1i = *(data + 1);
    299         data += (del << 1);
    300 
    301         x2r = *data;
    302         x2i = *(data + 1);
    303         data += (del << 1);
    304 
    305         x3r = *data;
    306         x3i = *(data + 1);
    307         data -= 3 * (del << 1);
    308 
    309         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
    310         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
    311         x1r = tmp;
    312 
    313         tmp = (ixheaacd_mult32(x2r, w2l) - ixheaacd_mult32(x2i, w2h));
    314         x2i = ixheaacd_mac32(ixheaacd_mult32(x2r, w2h), x2i, w2l);
    315         x2r = tmp;
    316 
    317         tmp = (ixheaacd_mult32(x3r, w3h) + ixheaacd_mult32(x3i, w3l));
    318         x3i = -ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
    319         x3r = tmp;
    320 
    321         x0r = (*data);
    322         x0i = (*(data + 1));
    323 
    324         x0r = x0r + (x2r);
    325         x0i = x0i + (x2i);
    326         x2r = x0r - (x2r << 1);
    327         x2i = x0i - (x2i << 1);
    328         x1r = x1r + x3r;
    329         x1i = x1i + x3i;
    330         x3r = x1r - (x3r << 1);
    331         x3i = x1i - (x3i << 1);
    332 
    333         x0r = x0r + (x1r);
    334         x0i = x0i + (x1i);
    335         x1r = x0r - (x1r << 1);
    336         x1i = x0i - (x1i << 1);
    337         x2r = x2r + (x3i);
    338         x2i = x2i - (x3r);
    339         x3i = x2r - (x3i << 1);
    340         x3r = x2i + (x3r << 1);
    341 
    342         *data = x0r;
    343         *(data + 1) = x0i;
    344         data += (del << 1);
    345 
    346         *data = x2r;
    347         *(data + 1) = x2i;
    348         data += (del << 1);
    349 
    350         *data = x1r;
    351         *(data + 1) = x1i;
    352         data += (del << 1);
    353 
    354         *data = x3i;
    355         *(data + 1) = x3r;
    356         data += (del << 1);
    357       }
    358       data -= 2 * npoints;
    359       data += 2;
    360     }
    361     for (; j <= sec_loop_cnt * 2; j += nodespacing) {
    362       w1h = *(twiddles + 2 * j);
    363       w2h = *(twiddles + 2 * (j << 1) - 512);
    364       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
    365       w1l = *(twiddles + 2 * j + 1);
    366       w2l = *(twiddles + 2 * (j << 1) - 511);
    367       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
    368 
    369       for (k = in_loop_cnt; k != 0; k--) {
    370         WORD32 tmp;
    371         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    372 
    373         data += (del << 1);
    374 
    375         x1r = *data;
    376         x1i = *(data + 1);
    377         data += (del << 1);
    378 
    379         x2r = *data;
    380         x2i = *(data + 1);
    381         data += (del << 1);
    382 
    383         x3r = *data;
    384         x3i = *(data + 1);
    385         data -= 3 * (del << 1);
    386 
    387         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
    388         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
    389         x1r = tmp;
    390 
    391         tmp = (ixheaacd_mult32(x2r, w2h) + ixheaacd_mult32(x2i, w2l));
    392         x2i = -ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
    393         x2r = tmp;
    394 
    395         tmp = (ixheaacd_mult32(x3r, w3h) + ixheaacd_mult32(x3i, w3l));
    396         x3i = -ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
    397         x3r = tmp;
    398 
    399         x0r = (*data);
    400         x0i = (*(data + 1));
    401 
    402         x0r = x0r + (x2r);
    403         x0i = x0i + (x2i);
    404         x2r = x0r - (x2r << 1);
    405         x2i = x0i - (x2i << 1);
    406         x1r = x1r + x3r;
    407         x1i = x1i + x3i;
    408         x3r = x1r - (x3r << 1);
    409         x3i = x1i - (x3i << 1);
    410 
    411         x0r = x0r + (x1r);
    412         x0i = x0i + (x1i);
    413         x1r = x0r - (x1r << 1);
    414         x1i = x0i - (x1i << 1);
    415         x2r = x2r + (x3i);
    416         x2i = x2i - (x3r);
    417         x3i = x2r - (x3i << 1);
    418         x3r = x2i + (x3r << 1);
    419 
    420         *data = x0r;
    421         *(data + 1) = x0i;
    422         data += (del << 1);
    423 
    424         *data = x2r;
    425         *(data + 1) = x2i;
    426         data += (del << 1);
    427 
    428         *data = x1r;
    429         *(data + 1) = x1i;
    430         data += (del << 1);
    431 
    432         *data = x3i;
    433         *(data + 1) = x3r;
    434         data += (del << 1);
    435       }
    436       data -= 2 * npoints;
    437       data += 2;
    438     }
    439     for (; j < nodespacing * del; j += nodespacing) {
    440       w1h = *(twiddles + 2 * j);
    441       w2h = *(twiddles + 2 * (j << 1) - 512);
    442       w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
    443       w1l = *(twiddles + 2 * j + 1);
    444       w2l = *(twiddles + 2 * (j << 1) - 511);
    445       w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
    446 
    447       for (k = in_loop_cnt; k != 0; k--) {
    448         WORD32 tmp;
    449         WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    450 
    451         data += (del << 1);
    452 
    453         x1r = *data;
    454         x1i = *(data + 1);
    455         data += (del << 1);
    456 
    457         x2r = *data;
    458         x2i = *(data + 1);
    459         data += (del << 1);
    460 
    461         x3r = *data;
    462         x3i = *(data + 1);
    463         data -= 3 * (del << 1);
    464 
    465         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
    466         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
    467         x1r = tmp;
    468 
    469         tmp = (ixheaacd_mult32(x2r, w2h) + ixheaacd_mult32(x2i, w2l));
    470         x2i = -ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
    471         x2r = tmp;
    472 
    473         tmp = (-ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h));
    474         x3i = ixheaacd_mac32(ixheaacd_mult32(x3r, w3h), x3i, w3l);
    475         x3r = tmp;
    476 
    477         x0r = (*data);
    478         x0i = (*(data + 1));
    479 
    480         x0r = x0r + (x2r);
    481         x0i = x0i + (x2i);
    482         x2r = x0r - (x2r << 1);
    483         x2i = x0i - (x2i << 1);
    484         x1r = x1r + x3r;
    485         x1i = x1i - x3i;
    486         x3r = x1r - (x3r << 1);
    487         x3i = x1i + (x3i << 1);
    488 
    489         x0r = x0r + (x1r);
    490         x0i = x0i + (x1i);
    491         x1r = x0r - (x1r << 1);
    492         x1i = x0i - (x1i << 1);
    493         x2r = x2r + (x3i);
    494         x2i = x2i - (x3r);
    495         x3i = x2r - (x3i << 1);
    496         x3r = x2i + (x3r << 1);
    497 
    498         *data = x0r;
    499         *(data + 1) = x0i;
    500         data += (del << 1);
    501 
    502         *data = x2r;
    503         *(data + 1) = x2i;
    504         data += (del << 1);
    505 
    506         *data = x1r;
    507         *(data + 1) = x1i;
    508         data += (del << 1);
    509 
    510         *data = x3i;
    511         *(data + 1) = x3r;
    512         data += (del << 1);
    513       }
    514       data -= 2 * npoints;
    515       data += 2;
    516     }
    517     nodespacing >>= 2;
    518     del <<= 2;
    519     in_loop_cnt >>= 2;
    520   }
    521 
    522   for (i = 0; i < 2 * nlength; i += 2) {
    523     fin_re[i] = y[i];
    524     fin_im[i] = y[i + 1];
    525   }
    526 
    527   return;
    528 }
    529 
    530 VOID ixheaacd_complex_fft_p2_dec(WORD32 *xr, WORD32 *xi, WORD32 nlength,
    531                                  WORD32 fft_mode, WORD32 *preshift) {
    532   WORD32 i, j, k, n_stages;
    533   WORD32 h2, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    534   WORD32 del, nodespacing, in_loop_cnt;
    535   WORD32 not_power_4;
    536   WORD32 npts, shift;
    537   WORD32 dig_rev_shift;
    538   WORD32 ptr_x[1024];
    539   WORD32 y[1024];
    540   WORD32 npoints = nlength;
    541   WORD32 n = 0;
    542   WORD32 *ptr_y = y;
    543   const WORD32 *ptr_w;
    544   dig_rev_shift = ixheaacd_norm32(npoints) + 1 - 16;
    545   n_stages = 30 - ixheaacd_norm32(npoints);
    546   not_power_4 = n_stages & 1;
    547 
    548   n_stages = n_stages >> 1;
    549 
    550   npts = npoints;
    551   while (npts >> 1) {
    552     n++;
    553     npts = npts >> 1;
    554   }
    555 
    556   if (n % 2 == 0)
    557     shift = ((n + 4)) / 2;
    558   else
    559     shift = ((n + 3) / 2);
    560 
    561   for (i = 0; i < nlength; i++) {
    562     ptr_x[2 * i] = (xr[i] / (1 << (shift)));
    563     ptr_x[2 * i + 1] = (xi[i] / (1 << (shift)));
    564   }
    565 
    566   if (fft_mode == -1) {
    567     ptr_w = ixheaacd_twiddle_table_fft_32x32;
    568 
    569     for (i = 0; i < npoints; i += 4) {
    570       WORD32 *inp = ptr_x;
    571 
    572       DIG_REV(i, dig_rev_shift, h2);
    573       if (not_power_4) {
    574         h2 += 1;
    575         h2 &= ~1;
    576       }
    577       inp += (h2);
    578 
    579       x0r = *inp;
    580       x0i = *(inp + 1);
    581       inp += (npoints >> 1);
    582 
    583       x1r = *inp;
    584       x1i = *(inp + 1);
    585       inp += (npoints >> 1);
    586 
    587       x2r = *inp;
    588       x2i = *(inp + 1);
    589       inp += (npoints >> 1);
    590 
    591       x3r = *inp;
    592       x3i = *(inp + 1);
    593 
    594       x0r = x0r + x2r;
    595       x0i = x0i + x2i;
    596       x2r = x0r - (x2r << 1);
    597       x2i = x0i - (x2i << 1);
    598       x1r = x1r + x3r;
    599       x1i = x1i + x3i;
    600       x3r = x1r - (x3r << 1);
    601       x3i = x1i - (x3i << 1);
    602 
    603       x0r = x0r + x1r;
    604       x0i = x0i + x1i;
    605       x1r = x0r - (x1r << 1);
    606       x1i = x0i - (x1i << 1);
    607       x2r = x2r + x3i;
    608       x2i = x2i - x3r;
    609       x3i = x2r - (x3i << 1);
    610       x3r = x2i + (x3r << 1);
    611 
    612       *ptr_y++ = x0r;
    613       *ptr_y++ = x0i;
    614       *ptr_y++ = x2r;
    615       *ptr_y++ = x2i;
    616       *ptr_y++ = x1r;
    617       *ptr_y++ = x1i;
    618       *ptr_y++ = x3i;
    619       *ptr_y++ = x3r;
    620     }
    621     ptr_y -= 2 * npoints;
    622     del = 4;
    623     nodespacing = 64;
    624     in_loop_cnt = npoints >> 4;
    625     for (i = n_stages - 1; i > 0; i--) {
    626       const WORD32 *twiddles = ptr_w;
    627       WORD32 *data = ptr_y;
    628       WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
    629       WORD32 sec_loop_cnt;
    630 
    631       for (k = in_loop_cnt; k != 0; k--) {
    632         x0r = (*data);
    633         x0i = (*(data + 1));
    634         data += (del << 1);
    635 
    636         x1r = (*data);
    637         x1i = (*(data + 1));
    638         data += (del << 1);
    639 
    640         x2r = (*data);
    641         x2i = (*(data + 1));
    642         data += (del << 1);
    643 
    644         x3r = (*data);
    645         x3i = (*(data + 1));
    646         data -= 3 * (del << 1);
    647 
    648         x0r = x0r + x2r;
    649         x0i = x0i + x2i;
    650         x2r = x0r - (x2r << 1);
    651         x2i = x0i - (x2i << 1);
    652         x1r = x1r + x3r;
    653         x1i = x1i + x3i;
    654         x3r = x1r - (x3r << 1);
    655         x3i = x1i - (x3i << 1);
    656 
    657         x0r = x0r + x1r;
    658         x0i = x0i + x1i;
    659         x1r = x0r - (x1r << 1);
    660         x1i = x0i - (x1i << 1);
    661         x2r = x2r + x3i;
    662         x2i = x2i - x3r;
    663         x3i = x2r - (x3i << 1);
    664         x3r = x2i + (x3r << 1);
    665 
    666         *data = x0r;
    667         *(data + 1) = x0i;
    668         data += (del << 1);
    669 
    670         *data = x2r;
    671         *(data + 1) = x2i;
    672         data += (del << 1);
    673 
    674         *data = x1r;
    675         *(data + 1) = x1i;
    676         data += (del << 1);
    677 
    678         *data = x3i;
    679         *(data + 1) = x3r;
    680         data += (del << 1);
    681       }
    682       data = ptr_y + 2;
    683 
    684       sec_loop_cnt = (nodespacing * del);
    685       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
    686                      (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
    687                      (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
    688                      (sec_loop_cnt / 256);
    689       j = nodespacing;
    690 
    691       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
    692         w1h = *(twiddles + 2 * j);
    693         w1l = *(twiddles + 2 * j + 1);
    694         w2h = *(twiddles + 2 * (j << 1));
    695         w2l = *(twiddles + 2 * (j << 1) + 1);
    696         w3h = *(twiddles + 2 * j + 2 * (j << 1));
    697         w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
    698 
    699         for (k = in_loop_cnt; k != 0; k--) {
    700           WORD32 tmp;
    701           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    702 
    703           data += (del << 1);
    704 
    705           x1r = *data;
    706           x1i = *(data + 1);
    707           data += (del << 1);
    708 
    709           x2r = *data;
    710           x2i = *(data + 1);
    711           data += (del << 1);
    712 
    713           x3r = *data;
    714           x3i = *(data + 1);
    715           data -= 3 * (del << 1);
    716 
    717           tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
    718           x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
    719           x1r = tmp;
    720 
    721           tmp = (ixheaacd_mult32(x2r, w2l) - ixheaacd_mult32(x2i, w2h));
    722           x2i = ixheaacd_mac32(ixheaacd_mult32(x2r, w2h), x2i, w2l);
    723           x2r = tmp;
    724 
    725           tmp = (ixheaacd_mult32(x3r, w3l) - ixheaacd_mult32(x3i, w3h));
    726           x3i = ixheaacd_mac32(ixheaacd_mult32(x3r, w3h), x3i, w3l);
    727           x3r = tmp;
    728 
    729           x0r = (*data);
    730           x0i = (*(data + 1));
    731 
    732           x0r = x0r + (x2r);
    733           x0i = x0i + (x2i);
    734           x2r = x0r - (x2r << 1);
    735           x2i = x0i - (x2i << 1);
    736           x1r = x1r + x3r;
    737           x1i = x1i + x3i;
    738           x3r = x1r - (x3r << 1);
    739           x3i = x1i - (x3i << 1);
    740 
    741           x0r = x0r + (x1r);
    742           x0i = x0i + (x1i);
    743           x1r = x0r - (x1r << 1);
    744           x1i = x0i - (x1i << 1);
    745           x2r = x2r + (x3i);
    746           x2i = x2i - (x3r);
    747           x3i = x2r - (x3i << 1);
    748           x3r = x2i + (x3r << 1);
    749 
    750           *data = x0r;
    751           *(data + 1) = x0i;
    752           data += (del << 1);
    753 
    754           *data = x2r;
    755           *(data + 1) = x2i;
    756           data += (del << 1);
    757 
    758           *data = x1r;
    759           *(data + 1) = x1i;
    760           data += (del << 1);
    761 
    762           *data = x3i;
    763           *(data + 1) = x3r;
    764           data += (del << 1);
    765         }
    766         data -= 2 * npoints;
    767         data += 2;
    768       }
    769       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
    770         w1h = *(twiddles + 2 * j);
    771         w2h = *(twiddles + 2 * (j << 1));
    772         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
    773         w1l = *(twiddles + 2 * j + 1);
    774         w2l = *(twiddles + 2 * (j << 1) + 1);
    775         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
    776 
    777         for (k = in_loop_cnt; k != 0; k--) {
    778           WORD32 tmp;
    779           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    780           data += (del << 1);
    781 
    782           x1r = *data;
    783           x1i = *(data + 1);
    784           data += (del << 1);
    785 
    786           x2r = *data;
    787           x2i = *(data + 1);
    788           data += (del << 1);
    789 
    790           x3r = *data;
    791           x3i = *(data + 1);
    792           data -= 3 * (del << 1);
    793 
    794           tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
    795           x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
    796           x1r = tmp;
    797 
    798           tmp = (ixheaacd_mult32(x2r, w2l) - ixheaacd_mult32(x2i, w2h));
    799           x2i = ixheaacd_mac32(ixheaacd_mult32(x2r, w2h), x2i, w2l);
    800           x2r = tmp;
    801 
    802           tmp = (ixheaacd_mult32(x3r, w3h) + ixheaacd_mult32(x3i, w3l));
    803           x3i = -ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
    804           x3r = tmp;
    805 
    806           x0r = (*data);
    807           x0i = (*(data + 1));
    808 
    809           x0r = x0r + (x2r);
    810           x0i = x0i + (x2i);
    811           x2r = x0r - (x2r << 1);
    812           x2i = x0i - (x2i << 1);
    813           x1r = x1r + x3r;
    814           x1i = x1i + x3i;
    815           x3r = x1r - (x3r << 1);
    816           x3i = x1i - (x3i << 1);
    817 
    818           x0r = x0r + (x1r);
    819           x0i = x0i + (x1i);
    820           x1r = x0r - (x1r << 1);
    821           x1i = x0i - (x1i << 1);
    822           x2r = x2r + (x3i);
    823           x2i = x2i - (x3r);
    824           x3i = x2r - (x3i << 1);
    825           x3r = x2i + (x3r << 1);
    826 
    827           *data = x0r;
    828           *(data + 1) = x0i;
    829           data += (del << 1);
    830 
    831           *data = x2r;
    832           *(data + 1) = x2i;
    833           data += (del << 1);
    834 
    835           *data = x1r;
    836           *(data + 1) = x1i;
    837           data += (del << 1);
    838 
    839           *data = x3i;
    840           *(data + 1) = x3r;
    841           data += (del << 1);
    842         }
    843         data -= 2 * npoints;
    844         data += 2;
    845       }
    846       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
    847         w1h = *(twiddles + 2 * j);
    848         w2h = *(twiddles + 2 * (j << 1) - 512);
    849         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
    850         w1l = *(twiddles + 2 * j + 1);
    851         w2l = *(twiddles + 2 * (j << 1) - 511);
    852         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
    853 
    854         for (k = in_loop_cnt; k != 0; k--) {
    855           WORD32 tmp;
    856           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    857 
    858           data += (del << 1);
    859 
    860           x1r = *data;
    861           x1i = *(data + 1);
    862           data += (del << 1);
    863 
    864           x2r = *data;
    865           x2i = *(data + 1);
    866           data += (del << 1);
    867 
    868           x3r = *data;
    869           x3i = *(data + 1);
    870           data -= 3 * (del << 1);
    871 
    872           tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
    873           x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
    874           x1r = tmp;
    875 
    876           tmp = (ixheaacd_mult32(x2r, w2h) + ixheaacd_mult32(x2i, w2l));
    877           x2i = -ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
    878           x2r = tmp;
    879 
    880           tmp = (ixheaacd_mult32(x3r, w3h) + ixheaacd_mult32(x3i, w3l));
    881           x3i = -ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
    882           x3r = tmp;
    883 
    884           x0r = (*data);
    885           x0i = (*(data + 1));
    886 
    887           x0r = x0r + (x2r);
    888           x0i = x0i + (x2i);
    889           x2r = x0r - (x2r << 1);
    890           x2i = x0i - (x2i << 1);
    891           x1r = x1r + x3r;
    892           x1i = x1i + x3i;
    893           x3r = x1r - (x3r << 1);
    894           x3i = x1i - (x3i << 1);
    895 
    896           x0r = x0r + (x1r);
    897           x0i = x0i + (x1i);
    898           x1r = x0r - (x1r << 1);
    899           x1i = x0i - (x1i << 1);
    900           x2r = x2r + (x3i);
    901           x2i = x2i - (x3r);
    902           x3i = x2r - (x3i << 1);
    903           x3r = x2i + (x3r << 1);
    904 
    905           *data = x0r;
    906           *(data + 1) = x0i;
    907           data += (del << 1);
    908 
    909           *data = x2r;
    910           *(data + 1) = x2i;
    911           data += (del << 1);
    912 
    913           *data = x1r;
    914           *(data + 1) = x1i;
    915           data += (del << 1);
    916 
    917           *data = x3i;
    918           *(data + 1) = x3r;
    919           data += (del << 1);
    920         }
    921         data -= 2 * npoints;
    922         data += 2;
    923       }
    924       for (; j < nodespacing * del; j += nodespacing) {
    925         w1h = *(twiddles + 2 * j);
    926         w2h = *(twiddles + 2 * (j << 1) - 512);
    927         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
    928         w1l = *(twiddles + 2 * j + 1);
    929         w2l = *(twiddles + 2 * (j << 1) - 511);
    930         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
    931 
    932         for (k = in_loop_cnt; k != 0; k--) {
    933           WORD32 tmp;
    934           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    935 
    936           data += (del << 1);
    937 
    938           x1r = *data;
    939           x1i = *(data + 1);
    940           data += (del << 1);
    941 
    942           x2r = *data;
    943           x2i = *(data + 1);
    944           data += (del << 1);
    945 
    946           x3r = *data;
    947           x3i = *(data + 1);
    948           data -= 3 * (del << 1);
    949 
    950           tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
    951           x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
    952           x1r = tmp;
    953 
    954           tmp = (ixheaacd_mult32(x2r, w2h) + ixheaacd_mult32(x2i, w2l));
    955           x2i = -ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
    956           x2r = tmp;
    957 
    958           tmp = (-ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h));
    959           x3i = ixheaacd_mac32(ixheaacd_mult32(x3r, w3h), x3i, w3l);
    960           x3r = tmp;
    961 
    962           x0r = (*data);
    963           x0i = (*(data + 1));
    964 
    965           x0r = x0r + (x2r);
    966           x0i = x0i + (x2i);
    967           x2r = x0r - (x2r << 1);
    968           x2i = x0i - (x2i << 1);
    969           x1r = x1r + x3r;
    970           x1i = x1i - x3i;
    971           x3r = x1r - (x3r << 1);
    972           x3i = x1i + (x3i << 1);
    973 
    974           x0r = x0r + (x1r);
    975           x0i = x0i + (x1i);
    976           x1r = x0r - (x1r << 1);
    977           x1i = x0i - (x1i << 1);
    978           x2r = x2r + (x3i);
    979           x2i = x2i - (x3r);
    980           x3i = x2r - (x3i << 1);
    981           x3r = x2i + (x3r << 1);
    982 
    983           *data = x0r;
    984           *(data + 1) = x0i;
    985           data += (del << 1);
    986 
    987           *data = x2r;
    988           *(data + 1) = x2i;
    989           data += (del << 1);
    990 
    991           *data = x1r;
    992           *(data + 1) = x1i;
    993           data += (del << 1);
    994 
    995           *data = x3i;
    996           *(data + 1) = x3r;
    997           data += (del << 1);
    998         }
    999         data -= 2 * npoints;
   1000         data += 2;
   1001       }
   1002       nodespacing >>= 2;
   1003       del <<= 2;
   1004       in_loop_cnt >>= 2;
   1005     }
   1006     if (not_power_4) {
   1007       const WORD32 *twiddles = ptr_w;
   1008       nodespacing <<= 1;
   1009       shift += 1;
   1010 
   1011       for (j = del / 2; j != 0; j--) {
   1012         WORD32 w1h = *twiddles;
   1013         WORD32 w1l = *(twiddles + 1);
   1014         WORD32 tmp;
   1015         twiddles += nodespacing * 2;
   1016 
   1017         x0r = *ptr_y;
   1018         x0i = *(ptr_y + 1);
   1019         ptr_y += (del << 1);
   1020 
   1021         x1r = *ptr_y;
   1022         x1i = *(ptr_y + 1);
   1023 
   1024         tmp = (ixheaacd_mult32(x1r, w1l) - ixheaacd_mult32(x1i, w1h));
   1025         x1i = ixheaacd_mac32(ixheaacd_mult32(x1r, w1h), x1i, w1l);
   1026         x1r = tmp;
   1027 
   1028         *ptr_y = (x0r) / 2 - (x1r) / 2;
   1029         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
   1030         ptr_y -= (del << 1);
   1031 
   1032         *ptr_y = (x0r) / 2 + (x1r) / 2;
   1033         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
   1034         ptr_y += 2;
   1035       }
   1036       twiddles = ptr_w;
   1037       for (j = del / 2; j != 0; j--) {
   1038         WORD32 w1h = *twiddles;
   1039         WORD32 w1l = *(twiddles + 1);
   1040         WORD32 tmp;
   1041         twiddles += nodespacing * 2;
   1042 
   1043         x0r = *ptr_y;
   1044         x0i = *(ptr_y + 1);
   1045         ptr_y += (del << 1);
   1046 
   1047         x1r = *ptr_y;
   1048         x1i = *(ptr_y + 1);
   1049 
   1050         tmp = (ixheaacd_mult32(x1r, w1h) + ixheaacd_mult32(x1i, w1l));
   1051         x1i = -ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h);
   1052         x1r = tmp;
   1053 
   1054         *ptr_y = (x0r) / 2 - (x1r) / 2;
   1055         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
   1056         ptr_y -= (del << 1);
   1057 
   1058         *ptr_y = (x0r) / 2 + (x1r) / 2;
   1059         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
   1060         ptr_y += 2;
   1061       }
   1062     }
   1063 
   1064   }
   1065 
   1066   else {
   1067     ptr_w = ixheaacd_twiddle_table_fft_32x32;
   1068 
   1069     for (i = 0; i < npoints; i += 4) {
   1070       WORD32 *inp = ptr_x;
   1071 
   1072       DIG_REV(i, dig_rev_shift, h2);
   1073       if (not_power_4) {
   1074         h2 += 1;
   1075         h2 &= ~1;
   1076       }
   1077       inp += (h2);
   1078 
   1079       x0r = *inp;
   1080       x0i = *(inp + 1);
   1081       inp += (npoints >> 1);
   1082 
   1083       x1r = *inp;
   1084       x1i = *(inp + 1);
   1085       inp += (npoints >> 1);
   1086 
   1087       x2r = *inp;
   1088       x2i = *(inp + 1);
   1089       inp += (npoints >> 1);
   1090 
   1091       x3r = *inp;
   1092       x3i = *(inp + 1);
   1093 
   1094       x0r = x0r + x2r;
   1095       x0i = x0i + x2i;
   1096       x2r = x0r - (x2r << 1);
   1097       x2i = x0i - (x2i << 1);
   1098       x1r = x1r + x3r;
   1099       x1i = x1i + x3i;
   1100       x3r = x1r - (x3r << 1);
   1101       x3i = x1i - (x3i << 1);
   1102 
   1103       x0r = x0r + x1r;
   1104       x0i = x0i + x1i;
   1105       x1r = x0r - (x1r << 1);
   1106       x1i = x0i - (x1i << 1);
   1107       x2r = x2r - x3i;
   1108       x2i = x2i + x3r;
   1109       x3i = x2r + (x3i << 1);
   1110       x3r = x2i - (x3r << 1);
   1111 
   1112       *ptr_y++ = x0r;
   1113       *ptr_y++ = x0i;
   1114       *ptr_y++ = x2r;
   1115       *ptr_y++ = x2i;
   1116       *ptr_y++ = x1r;
   1117       *ptr_y++ = x1i;
   1118       *ptr_y++ = x3i;
   1119       *ptr_y++ = x3r;
   1120     }
   1121     ptr_y -= 2 * npoints;
   1122     del = 4;
   1123     nodespacing = 64;
   1124     in_loop_cnt = npoints >> 4;
   1125     for (i = n_stages - 1; i > 0; i--) {
   1126       const WORD32 *twiddles = ptr_w;
   1127       WORD32 *data = ptr_y;
   1128       WORD32 w1h, w2h, w3h, w1l, w2l, w3l;
   1129       WORD32 sec_loop_cnt;
   1130 
   1131       for (k = in_loop_cnt; k != 0; k--) {
   1132         x0r = (*data);
   1133         x0i = (*(data + 1));
   1134         data += (del << 1);
   1135 
   1136         x1r = (*data);
   1137         x1i = (*(data + 1));
   1138         data += (del << 1);
   1139 
   1140         x2r = (*data);
   1141         x2i = (*(data + 1));
   1142         data += (del << 1);
   1143 
   1144         x3r = (*data);
   1145         x3i = (*(data + 1));
   1146         data -= 3 * (del << 1);
   1147 
   1148         x0r = x0r + x2r;
   1149         x0i = x0i + x2i;
   1150         x2r = x0r - (x2r << 1);
   1151         x2i = x0i - (x2i << 1);
   1152         x1r = x1r + x3r;
   1153         x1i = x1i + x3i;
   1154         x3r = x1r - (x3r << 1);
   1155         x3i = x1i - (x3i << 1);
   1156 
   1157         x0r = ixheaacd_add32_sat(x0r, x1r);
   1158         x0i = ixheaacd_add32_sat(x0i, x1i);
   1159         x1r = ixheaacd_sub32_sat(x0r, (x1r << 1));
   1160         x1i = ixheaacd_sub32_sat(x0i, (x1i << 1));
   1161         x2r = ixheaacd_sub32_sat(x2r, x3i);
   1162         x2i = ixheaacd_add32_sat(x2i, x3r);
   1163         x3i = ixheaacd_add32_sat(x2r, (x3i << 1));
   1164         x3r = ixheaacd_sub32_sat(x2i, (x3r << 1));
   1165 
   1166         *data = x0r;
   1167         *(data + 1) = x0i;
   1168         data += (del << 1);
   1169 
   1170         *data = x2r;
   1171         *(data + 1) = x2i;
   1172         data += (del << 1);
   1173 
   1174         *data = x1r;
   1175         *(data + 1) = x1i;
   1176         data += (del << 1);
   1177 
   1178         *data = x3i;
   1179         *(data + 1) = x3r;
   1180         data += (del << 1);
   1181       }
   1182       data = ptr_y + 2;
   1183 
   1184       sec_loop_cnt = (nodespacing * del);
   1185       sec_loop_cnt = (sec_loop_cnt / 4) + (sec_loop_cnt / 8) -
   1186                      (sec_loop_cnt / 16) + (sec_loop_cnt / 32) -
   1187                      (sec_loop_cnt / 64) + (sec_loop_cnt / 128) -
   1188                      (sec_loop_cnt / 256);
   1189       j = nodespacing;
   1190 
   1191       for (j = nodespacing; j <= sec_loop_cnt; j += nodespacing) {
   1192         w1h = *(twiddles + 2 * j);
   1193         w2h = *(twiddles + 2 * (j << 1));
   1194         w3h = *(twiddles + 2 * j + 2 * (j << 1));
   1195         w1l = *(twiddles + 2 * j + 1);
   1196         w2l = *(twiddles + 2 * (j << 1) + 1);
   1197         w3l = *(twiddles + 2 * j + 2 * (j << 1) + 1);
   1198 
   1199         for (k = in_loop_cnt; k != 0; k--) {
   1200           WORD32 tmp;
   1201           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
   1202 
   1203           data += (del << 1);
   1204 
   1205           x1r = *data;
   1206           x1i = *(data + 1);
   1207           data += (del << 1);
   1208 
   1209           x2r = *data;
   1210           x2i = *(data + 1);
   1211           data += (del << 1);
   1212 
   1213           x3r = *data;
   1214           x3i = *(data + 1);
   1215           data -= 3 * (del << 1);
   1216 
   1217           tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
   1218           x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
   1219           x1r = tmp;
   1220 
   1221           tmp = (ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h));
   1222           x2i = ixheaacd_mac32(-ixheaacd_mult32(x2r, w2h), x2i, w2l);
   1223           x2r = tmp;
   1224 
   1225           tmp = (ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h));
   1226           x3i = ixheaacd_mac32(-ixheaacd_mult32(x3r, w3h), x3i, w3l);
   1227           x3r = tmp;
   1228 
   1229           x0r = (*data);
   1230           x0i = (*(data + 1));
   1231 
   1232           x0r = x0r + (x2r);
   1233           x0i = x0i + (x2i);
   1234           x2r = x0r - (x2r << 1);
   1235           x2i = x0i - (x2i << 1);
   1236           x1r = x1r + x3r;
   1237           x1i = x1i + x3i;
   1238           x3r = x1r - (x3r << 1);
   1239           x3i = x1i - (x3i << 1);
   1240 
   1241           x0r = x0r + (x1r);
   1242           x0i = x0i + (x1i);
   1243           x1r = x0r - (x1r << 1);
   1244           x1i = x0i - (x1i << 1);
   1245           x2r = x2r - (x3i);
   1246           x2i = x2i + (x3r);
   1247           x3i = x2r + (x3i << 1);
   1248           x3r = x2i - (x3r << 1);
   1249 
   1250           *data = x0r;
   1251           *(data + 1) = x0i;
   1252           data += (del << 1);
   1253 
   1254           *data = x2r;
   1255           *(data + 1) = x2i;
   1256           data += (del << 1);
   1257 
   1258           *data = x1r;
   1259           *(data + 1) = x1i;
   1260           data += (del << 1);
   1261 
   1262           *data = x3i;
   1263           *(data + 1) = x3r;
   1264           data += (del << 1);
   1265         }
   1266         data -= 2 * npoints;
   1267         data += 2;
   1268       }
   1269       for (; j <= (nodespacing * del) >> 1; j += nodespacing) {
   1270         w1h = *(twiddles + 2 * j);
   1271         w2h = *(twiddles + 2 * (j << 1));
   1272         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
   1273         w1l = *(twiddles + 2 * j + 1);
   1274         w2l = *(twiddles + 2 * (j << 1) + 1);
   1275         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
   1276 
   1277         for (k = in_loop_cnt; k != 0; k--) {
   1278           WORD32 tmp;
   1279           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
   1280 
   1281           data += (del << 1);
   1282 
   1283           x1r = *data;
   1284           x1i = *(data + 1);
   1285           data += (del << 1);
   1286 
   1287           x2r = *data;
   1288           x2i = *(data + 1);
   1289           data += (del << 1);
   1290 
   1291           x3r = *data;
   1292           x3i = *(data + 1);
   1293           data -= 3 * (del << 1);
   1294 
   1295           tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
   1296           x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
   1297           x1r = tmp;
   1298 
   1299           tmp = (ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h));
   1300           x2i = ixheaacd_mac32(-ixheaacd_mult32(x2r, w2h), x2i, w2l);
   1301           x2r = tmp;
   1302 
   1303           tmp = (ixheaacd_mult32(x3r, w3h) - ixheaacd_mult32(x3i, w3l));
   1304           x3i = ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
   1305           x3r = tmp;
   1306 
   1307           x0r = (*data);
   1308           x0i = (*(data + 1));
   1309 
   1310           x0r = x0r + (x2r);
   1311           x0i = x0i + (x2i);
   1312           x2r = x0r - (x2r << 1);
   1313           x2i = x0i - (x2i << 1);
   1314           x1r = x1r + x3r;
   1315           x1i = x1i + x3i;
   1316           x3r = x1r - (x3r << 1);
   1317           x3i = x1i - (x3i << 1);
   1318 
   1319           x0r = x0r + (x1r);
   1320           x0i = x0i + (x1i);
   1321           x1r = x0r - (x1r << 1);
   1322           x1i = x0i - (x1i << 1);
   1323           x2r = x2r - (x3i);
   1324           x2i = x2i + (x3r);
   1325           x3i = x2r + (x3i << 1);
   1326           x3r = x2i - (x3r << 1);
   1327 
   1328           *data = x0r;
   1329           *(data + 1) = x0i;
   1330           data += (del << 1);
   1331 
   1332           *data = x2r;
   1333           *(data + 1) = x2i;
   1334           data += (del << 1);
   1335 
   1336           *data = x1r;
   1337           *(data + 1) = x1i;
   1338           data += (del << 1);
   1339 
   1340           *data = x3i;
   1341           *(data + 1) = x3r;
   1342           data += (del << 1);
   1343         }
   1344         data -= 2 * npoints;
   1345         data += 2;
   1346       }
   1347       for (; j <= sec_loop_cnt * 2; j += nodespacing) {
   1348         w1h = *(twiddles + 2 * j);
   1349         w2h = *(twiddles + 2 * (j << 1) - 512);
   1350         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 512);
   1351         w1l = *(twiddles + 2 * j + 1);
   1352         w2l = *(twiddles + 2 * (j << 1) - 511);
   1353         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 511);
   1354 
   1355         for (k = in_loop_cnt; k != 0; k--) {
   1356           WORD32 tmp;
   1357           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
   1358 
   1359           data += (del << 1);
   1360 
   1361           x1r = *data;
   1362           x1i = *(data + 1);
   1363           data += (del << 1);
   1364 
   1365           x2r = *data;
   1366           x2i = *(data + 1);
   1367           data += (del << 1);
   1368 
   1369           x3r = *data;
   1370           x3i = *(data + 1);
   1371           data -= 3 * (del << 1);
   1372 
   1373           tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
   1374           x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
   1375           x1r = tmp;
   1376 
   1377           tmp = (ixheaacd_mult32(x2r, w2h) - ixheaacd_mult32(x2i, w2l));
   1378           x2i = ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
   1379           x2r = tmp;
   1380 
   1381           tmp = (ixheaacd_mult32(x3r, w3h) - ixheaacd_mult32(x3i, w3l));
   1382           x3i = ixheaacd_mult32(x3r, w3l) + ixheaacd_mult32(x3i, w3h);
   1383           x3r = tmp;
   1384 
   1385           x0r = (*data);
   1386           x0i = (*(data + 1));
   1387 
   1388           x0r = x0r + (x2r);
   1389           x0i = x0i + (x2i);
   1390           x2r = x0r - (x2r << 1);
   1391           x2i = x0i - (x2i << 1);
   1392           x1r = x1r + x3r;
   1393           x1i = x1i + x3i;
   1394           x3r = x1r - (x3r << 1);
   1395           x3i = x1i - (x3i << 1);
   1396 
   1397           x0r = x0r + (x1r);
   1398           x0i = x0i + (x1i);
   1399           x1r = x0r - (x1r << 1);
   1400           x1i = x0i - (x1i << 1);
   1401           x2r = x2r - (x3i);
   1402           x2i = x2i + (x3r);
   1403           x3i = x2r + (x3i << 1);
   1404           x3r = x2i - (x3r << 1);
   1405 
   1406           *data = x0r;
   1407           *(data + 1) = x0i;
   1408           data += (del << 1);
   1409 
   1410           *data = x2r;
   1411           *(data + 1) = x2i;
   1412           data += (del << 1);
   1413 
   1414           *data = x1r;
   1415           *(data + 1) = x1i;
   1416           data += (del << 1);
   1417 
   1418           *data = x3i;
   1419           *(data + 1) = x3r;
   1420           data += (del << 1);
   1421         }
   1422         data -= 2 * npoints;
   1423         data += 2;
   1424       }
   1425       for (; j < nodespacing * del; j += nodespacing) {
   1426         w1h = *(twiddles + 2 * j);
   1427         w2h = *(twiddles + 2 * (j << 1) - 512);
   1428         w3h = *(twiddles + 2 * j + 2 * (j << 1) - 1024);
   1429         w1l = *(twiddles + 2 * j + 1);
   1430         w2l = *(twiddles + 2 * (j << 1) - 511);
   1431         w3l = *(twiddles + 2 * j + 2 * (j << 1) - 1023);
   1432 
   1433         for (k = in_loop_cnt; k != 0; k--) {
   1434           WORD32 tmp;
   1435           WORD32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
   1436 
   1437           data += (del << 1);
   1438 
   1439           x1r = *data;
   1440           x1i = *(data + 1);
   1441           data += (del << 1);
   1442 
   1443           x2r = *data;
   1444           x2i = *(data + 1);
   1445           data += (del << 1);
   1446 
   1447           x3r = *data;
   1448           x3i = *(data + 1);
   1449           data -= 3 * (del << 1);
   1450 
   1451           tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
   1452           x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
   1453           x1r = tmp;
   1454 
   1455           tmp = (ixheaacd_mult32(x2r, w2h) - ixheaacd_mult32(x2i, w2l));
   1456           x2i = ixheaacd_mult32(x2r, w2l) + ixheaacd_mult32(x2i, w2h);
   1457           x2r = tmp;
   1458 
   1459           tmp = (-ixheaacd_mult32(x3r, w3l) - ixheaacd_mult32(x3i, w3h));
   1460           x3i = ixheaacd_mac32(-ixheaacd_mult32(x3r, w3h), x3i, w3l);
   1461           x3r = tmp;
   1462 
   1463           x0r = (*data);
   1464           x0i = (*(data + 1));
   1465 
   1466           x0r = x0r + (x2r);
   1467           x0i = x0i + (x2i);
   1468           x2r = x0r - (x2r << 1);
   1469           x2i = x0i - (x2i << 1);
   1470           x1r = x1r + x3r;
   1471           x1i = x1i - x3i;
   1472           x3r = x1r - (x3r << 1);
   1473           x3i = x1i + (x3i << 1);
   1474 
   1475           x0r = x0r + (x1r);
   1476           x0i = x0i + (x1i);
   1477           x1r = x0r - (x1r << 1);
   1478           x1i = x0i - (x1i << 1);
   1479           x2r = x2r - (x3i);
   1480           x2i = x2i + (x3r);
   1481           x3i = x2r + (x3i << 1);
   1482           x3r = x2i - (x3r << 1);
   1483 
   1484           *data = x0r;
   1485           *(data + 1) = x0i;
   1486           data += (del << 1);
   1487 
   1488           *data = x2r;
   1489           *(data + 1) = x2i;
   1490           data += (del << 1);
   1491 
   1492           *data = x1r;
   1493           *(data + 1) = x1i;
   1494           data += (del << 1);
   1495 
   1496           *data = x3i;
   1497           *(data + 1) = x3r;
   1498           data += (del << 1);
   1499         }
   1500         data -= 2 * npoints;
   1501         data += 2;
   1502       }
   1503       nodespacing >>= 2;
   1504       del <<= 2;
   1505       in_loop_cnt >>= 2;
   1506     }
   1507     if (not_power_4) {
   1508       const WORD32 *twiddles = ptr_w;
   1509       nodespacing <<= 1;
   1510       shift += 1;
   1511       for (j = del / 2; j != 0; j--) {
   1512         WORD32 w1h = *twiddles;
   1513         WORD32 w1l = *(twiddles + 1);
   1514 
   1515         WORD32 tmp;
   1516         twiddles += nodespacing * 2;
   1517 
   1518         x0r = *ptr_y;
   1519         x0i = *(ptr_y + 1);
   1520         ptr_y += (del << 1);
   1521 
   1522         x1r = *ptr_y;
   1523         x1i = *(ptr_y + 1);
   1524 
   1525         tmp = (ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h));
   1526         x1i = ixheaacd_mac32(-ixheaacd_mult32(x1r, w1h), x1i, w1l);
   1527         x1r = tmp;
   1528 
   1529         *ptr_y = (x0r) / 2 - (x1r) / 2;
   1530         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
   1531         ptr_y -= (del << 1);
   1532 
   1533         *ptr_y = (x0r) / 2 + (x1r) / 2;
   1534         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
   1535         ptr_y += 2;
   1536       }
   1537       twiddles = ptr_w;
   1538       for (j = del / 2; j != 0; j--) {
   1539         WORD32 w1h = *twiddles;
   1540         WORD32 w1l = *(twiddles + 1);
   1541         WORD32 tmp;
   1542         twiddles += nodespacing * 2;
   1543 
   1544         x0r = *ptr_y;
   1545         x0i = *(ptr_y + 1);
   1546         ptr_y += (del << 1);
   1547 
   1548         x1r = *ptr_y;
   1549         x1i = *(ptr_y + 1);
   1550 
   1551         tmp = (ixheaacd_mult32(x1r, w1h) - ixheaacd_mult32(x1i, w1l));
   1552         x1i = ixheaacd_mult32(x1r, w1l) + ixheaacd_mult32(x1i, w1h);
   1553         x1r = tmp;
   1554 
   1555         *ptr_y = (x0r) / 2 - (x1r) / 2;
   1556         *(ptr_y + 1) = (x0i) / 2 - (x1i) / 2;
   1557         ptr_y -= (del << 1);
   1558 
   1559         *ptr_y = (x0r) / 2 + (x1r) / 2;
   1560         *(ptr_y + 1) = (x0i) / 2 + (x1i) / 2;
   1561         ptr_y += 2;
   1562       }
   1563     }
   1564   }
   1565 
   1566   for (i = 0; i < nlength; i++) {
   1567     xr[i] = y[2 * i];
   1568     xi[i] = y[2 * i + 1];
   1569   }
   1570 
   1571   *preshift = shift - *preshift;
   1572   return;
   1573 }
   1574 
   1575 static PLATFORM_INLINE void ixheaacd_complex_3point_fft(WORD32 *inp, WORD32 *op,
   1576                                                         WORD32 sign_dir) {
   1577   WORD32 add_r, sub_r;
   1578   WORD32 add_i, sub_i;
   1579   WORD32 temp_real, temp_imag, temp;
   1580 
   1581   WORD32 p1, p2, p3, p4;
   1582 
   1583   WORD32 sinmu;
   1584   sinmu = -1859775393 * sign_dir;
   1585 
   1586   temp_real = ixheaacd_add32_sat(inp[0], inp[2]);
   1587   temp_imag = ixheaacd_add32_sat(inp[1], inp[3]);
   1588 
   1589   add_r = ixheaacd_add32_sat(inp[2], inp[4]);
   1590   add_i = ixheaacd_add32_sat(inp[3], inp[5]);
   1591 
   1592   sub_r = ixheaacd_sub32_sat(inp[2], inp[4]);
   1593   sub_i = ixheaacd_sub32_sat(inp[3], inp[5]);
   1594 
   1595   p1 = add_r >> 1;
   1596   p4 = add_i >> 1;
   1597   p2 = ixheaacd_mult32_shl(sub_i, sinmu);
   1598   p3 = ixheaacd_mult32_shl(sub_r, sinmu);
   1599 
   1600   temp = ixheaacd_sub32(inp[0], p1);
   1601 
   1602   op[0] = ixheaacd_add32_sat(temp_real, inp[4]);
   1603   op[1] = ixheaacd_add32_sat(temp_imag, inp[5]);
   1604   op[2] = ixheaacd_add32_sat(temp, p2);
   1605   op[3] = ixheaacd_sub32_sat(ixheaacd_sub32_sat(inp[1], p3), p4);
   1606   op[4] = ixheaacd_sub32_sat(temp, p2);
   1607   op[5] = ixheaacd_sub32_sat(ixheaacd_add32_sat(inp[1], p3), p4);
   1608 
   1609   return;
   1610 }
   1611 
   1612 VOID ixheaacd_complex_fft_p3(WORD32 *xr, WORD32 *xi, WORD32 nlength,
   1613                              WORD32 fft_mode, WORD32 *preshift) {
   1614   WORD32 i, j;
   1615   WORD32 shift = 0;
   1616   WORD32 xr_3[384];
   1617   WORD32 xi_3[384];
   1618   WORD32 x[1024];
   1619   WORD32 y[1024];
   1620   WORD32 cnfac, npts;
   1621   WORD32 mpass = nlength;
   1622   WORD32 n = 0;
   1623   WORD32 *ptr_x = x;
   1624   WORD32 *ptr_y = y;
   1625 
   1626   cnfac = 0;
   1627   while (mpass % 3 == 0) {
   1628     mpass /= 3;
   1629     cnfac++;
   1630   }
   1631   npts = mpass;
   1632 
   1633   for (i = 0; i < 3 * cnfac; i++) {
   1634     for (j = 0; j < mpass; j++) {
   1635       xr_3[j] = xr[3 * j + i];
   1636       xi_3[j] = xi[3 * j + i];
   1637     }
   1638 
   1639     (*ixheaacd_complex_fft_p2)(xr_3, xi_3, mpass, fft_mode, &shift);
   1640 
   1641     for (j = 0; j < mpass; j++) {
   1642       xr[3 * j + i] = xr_3[j];
   1643       xi[3 * j + i] = xi_3[j];
   1644     }
   1645   }
   1646 
   1647   while (npts >> 1) {
   1648     n++;
   1649     npts = npts >> 1;
   1650   }
   1651 
   1652   if (n % 2 == 0)
   1653     shift = ((n + 4)) / 2;
   1654   else
   1655     shift = ((n + 5) / 2);
   1656 
   1657   *preshift = shift - *preshift + 1;
   1658 
   1659   for (i = 0; i < nlength; i++) {
   1660     ptr_x[2 * i] = (xr[i] >> 1);
   1661     ptr_x[2 * i + 1] = (xi[i] >> 1);
   1662   }
   1663 
   1664   {
   1665     const WORD32 *w1r, *w1i;
   1666     WORD32 tmp;
   1667     w1r = ixheaacd_twiddle_table_3pr;
   1668     w1i = ixheaacd_twiddle_table_3pi;
   1669 
   1670     if (fft_mode < 0) {
   1671       for (i = 0; i < nlength; i += 3) {
   1672         tmp = ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i], (*w1r)),
   1673                                  ixheaacd_mult32(ptr_x[2 * i + 1], (*w1i)));
   1674         ptr_x[2 * i + 1] =
   1675             ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i], (*w1i)),
   1676                                ixheaacd_mult32(ptr_x[2 * i + 1], (*w1r)));
   1677         ptr_x[2 * i] = tmp;
   1678 
   1679         w1r++;
   1680         w1i++;
   1681 
   1682         tmp = ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 2], (*w1r)),
   1683                                  ixheaacd_mult32(ptr_x[2 * i + 3], (*w1i)));
   1684         ptr_x[2 * i + 3] =
   1685             ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i + 2], (*w1i)),
   1686                                ixheaacd_mult32(ptr_x[2 * i + 3], (*w1r)));
   1687         ptr_x[2 * i + 2] = tmp;
   1688 
   1689         w1r++;
   1690         w1i++;
   1691 
   1692         tmp = ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 4], (*w1r)),
   1693                                  ixheaacd_mult32(ptr_x[2 * i + 5], (*w1i)));
   1694         ptr_x[2 * i + 5] =
   1695             ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i + 4], (*w1i)),
   1696                                ixheaacd_mult32(ptr_x[2 * i + 5], (*w1r)));
   1697         ptr_x[2 * i + 4] = tmp;
   1698 
   1699         w1r += 3 * (128 / mpass - 1) + 1;
   1700         w1i += 3 * (128 / mpass - 1) + 1;
   1701       }
   1702     }
   1703 
   1704     else {
   1705       for (i = 0; i < nlength; i += 3) {
   1706         tmp = ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i], (*w1r)),
   1707                                  ixheaacd_mult32(ptr_x[2 * i + 1], (*w1i)));
   1708         ptr_x[2 * i + 1] =
   1709             ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 1], (*w1r)),
   1710                                ixheaacd_mult32(ptr_x[2 * i], (*w1i)));
   1711         ptr_x[2 * i] = tmp;
   1712 
   1713         w1r++;
   1714         w1i++;
   1715 
   1716         tmp = ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i + 2], (*w1r)),
   1717                                  ixheaacd_mult32(ptr_x[2 * i + 3], (*w1i)));
   1718         ptr_x[2 * i + 3] =
   1719             ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 3], (*w1r)),
   1720                                ixheaacd_mult32(ptr_x[2 * i + 2], (*w1i)));
   1721         ptr_x[2 * i + 2] = tmp;
   1722 
   1723         w1r++;
   1724         w1i++;
   1725 
   1726         tmp = ixheaacd_add32_sat(ixheaacd_mult32(ptr_x[2 * i + 4], (*w1r)),
   1727                                  ixheaacd_mult32(ptr_x[2 * i + 5], (*w1i)));
   1728         ptr_x[2 * i + 5] =
   1729             ixheaacd_sub32_sat(ixheaacd_mult32(ptr_x[2 * i + 5], (*w1r)),
   1730                                ixheaacd_mult32(ptr_x[2 * i + 4], (*w1i)));
   1731         ptr_x[2 * i + 4] = tmp;
   1732 
   1733         w1r += 3 * (128 / mpass - 1) + 1;
   1734         w1i += 3 * (128 / mpass - 1) + 1;
   1735       }
   1736     }
   1737   }
   1738 
   1739   for (i = 0; i < mpass; i++) {
   1740     ixheaacd_complex_3point_fft(ptr_x, ptr_y, fft_mode);
   1741 
   1742     ptr_x = ptr_x + 6;
   1743     ptr_y = ptr_y + 6;
   1744   }
   1745 
   1746   for (i = 0; i < mpass; i++) {
   1747     xr[i] = y[6 * i];
   1748     xi[i] = y[6 * i + 1];
   1749   }
   1750 
   1751   for (i = 0; i < mpass; i++) {
   1752     xr[mpass + i] = y[6 * i + 2];
   1753     xi[mpass + i] = y[6 * i + 3];
   1754   }
   1755 
   1756   for (i = 0; i < mpass; i++) {
   1757     xr[2 * mpass + i] = y[6 * i + 4];
   1758     xi[2 * mpass + i] = y[6 * i + 5];
   1759   }
   1760   return;
   1761 }
   1762 
   1763 VOID ixheaacd_complex_fft(WORD32 *data_r, WORD32 *data_i, WORD32 nlength,
   1764                           WORD32 fft_mode, WORD32 *preshift) {
   1765   if (nlength & (nlength - 1)) {
   1766     if ((nlength != 24) && (nlength != 48) && (nlength != 96) &&
   1767         (nlength != 192) && (nlength != 384)) {
   1768       printf("%d point FFT not supported", nlength);
   1769       exit(0);
   1770     }
   1771     ixheaacd_complex_fft_p3(data_r, data_i, nlength, fft_mode, preshift);
   1772   } else
   1773     (*ixheaacd_complex_fft_p2)(data_r, data_i, nlength, fft_mode, preshift);
   1774 
   1775   return;
   1776 }
   1777