Home | History | Annotate | Download | only in aec
      1 /*
      2  * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
      3  * Copyright Takuya OOURA, 1996-2001
      4  *
      5  * You may use, copy, modify and distribute this code for any purpose (include
      6  * commercial use) and without fee. Please refer to this package when you modify
      7  * this code.
      8  *
      9  * Changes by the WebRTC authors:
     10  *    - Trivial type modifications.
     11  *    - Minimal code subset to do rdft of length 128.
     12  *    - Optimizations because of known length.
     13  *
     14  *  All changes are covered by the WebRTC license and IP grant:
     15  *  Use of this source code is governed by a BSD-style license
     16  *  that can be found in the LICENSE file in the root of the source
     17  *  tree. An additional intellectual property rights grant can be found
     18  *  in the file PATENTS.  All contributing project authors may
     19  *  be found in the AUTHORS file in the root of the source tree.
     20  */
     21 
     22 #include "webrtc/modules/audio_processing/aec/aec_rdft.h"
     23 
     24 #include <math.h>
     25 
     26 #include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
     27 #include "webrtc/typedefs.h"
     28 
// These tables used to be computed at run-time. For example, refer to:
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
// to see the initialization code.
//
// rdft_w: 64 precomputed weights used by the C kernels in this file.
//  - rdft_w[0..31] is read as consecutive (real, imaginary) pairs by
//    cft1st_128_C and cftmdl_128_C (indices k1, k1 + 1 and k2 .. k2 + 3).
//  - rdft_w[32..63] is read through the pointer `rdft_w + 32` by
//    rftfsub_128_C and rftbsub_128_C.
const float rdft_w[64] = {
    1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
    0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
    0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
    0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
    0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
    0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
    0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
    0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
    0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
    0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
    0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
    0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
    0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
    0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
    0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
    0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
};
// (real, imaginary) pairs loaded as wk3r/wk3i, at indices k1 and k1 + 1, for
// the first half of each iteration of cft1st_128_C and for the first inner
// loop of cftmdl_128_C.
const float rdft_wk3ri_first[16] = {
    1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
    0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
    0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
    0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
};
// (real, imaginary) pairs loaded as wk3r/wk3i, at indices k1 and k1 + 1, for
// the second half of each iteration of cft1st_128_C and for the second inner
// loop of cftmdl_128_C.
const float rdft_wk3ri_second[16] = {
    -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
    -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
    -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
    -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
};
// Row r (four floats) is {w[k2], w[k2], w[k2 + 2], w[k2 + 2]} with w = rdft_w
// and k2 = 4 * r, to the printed precision.
// Declared with ALIGN16_BEG/ALIGN16_END (macros defined outside this file).
// NOTE(review): not referenced by the C code in this file; presumably read by
// the platform-specific implementations -- confirm against those sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
    0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
    0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
    0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
    0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
    0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
    0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
};
// Row r (four floats) is {w[k1], w[k1], -w[k1 + 1], -w[k1 + 1]} with
// w = rdft_w and k1 = 2 * r, to the printed precision.
// NOTE(review): not referenced by the C code in this file; presumably read by
// the platform-specific implementations -- confirm against those sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
    1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
    0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
    0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
    0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
    0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
    0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
    0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
};
// Row r (four floats) is {f[k1], f[k1], s[k1], s[k1]} with
// f = rdft_wk3ri_first, s = rdft_wk3ri_second and k1 = 2 * r, i.e. the real
// parts of those two tables, each duplicated.
// NOTE(review): not referenced by the C code in this file; presumably read by
// the platform-specific implementations -- confirm against those sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
    1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
    0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
    -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
    0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
    0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
    0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
    -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
};
// Row r (four floats) is {-w[k2 + 1], w[k2 + 1], -w[k2 + 3], w[k2 + 3]} with
// w = rdft_w and k2 = 4 * r, to the printed precision.
// NOTE(review): not referenced by the C code in this file; presumably read by
// the platform-specific implementations -- confirm against those sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
    -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
    -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
    -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
    -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
    -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
    -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
};
// Row r (four floats) is {-w[k1 + 1], w[k1 + 1], -w[k1], w[k1]} with
// w = rdft_w and k1 = 2 * r, to the printed precision.
// NOTE(review): not referenced by the C code in this file; presumably read by
// the platform-specific implementations -- confirm against those sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
    -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
    -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
    -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
    -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
    -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
    -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
    -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
};
// Row r (four floats) is {-f[k1 + 1], f[k1 + 1], -s[k1 + 1], s[k1 + 1]} with
// f = rdft_wk3ri_first, s = rdft_wk3ri_second and k1 = 2 * r, i.e. the
// imaginary parts of those two tables as (negated, plain) pairs.
// NOTE(review): not referenced by the C code in this file; presumably read by
// the platform-specific implementations -- confirm against those sources.
ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
    -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
    -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
    -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
    -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
    -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
    -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
    -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
};
// {w, w, w, -w} where w is the same value as rdft_w[2] to the printed
// precision (the weight cftmdl_128_C loads as wk1r for its second loop).
// NOTE(review): not referenced by the C code in this file; presumably read by
// the platform-specific implementations -- confirm against those sources.
ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
    0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
};
    125 
    126 static void bitrv2_128_C(float* a) {
    127   /*
    128       Following things have been attempted but are no faster:
    129       (a) Storing the swap indexes in a LUT (index calculations are done
    130           for 'free' while waiting on memory/L1).
    131       (b) Consolidate the load/store of two consecutive floats by a 64 bit
    132           integer (execution is memory/L1 bound).
    133       (c) Do a mix of floats and 64 bit integer to maximize register
    134           utilization (execution is memory/L1 bound).
    135       (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
    136       (e) Hard-coding of the offsets to completely eliminates index
    137           calculations.
    138   */
    139 
    140   unsigned int j, j1, k, k1;
    141   float xr, xi, yr, yi;
    142 
    143   static const int ip[4] = {0, 64, 32, 96};
    144   for (k = 0; k < 4; k++) {
    145     for (j = 0; j < k; j++) {
    146       j1 = 2 * j + ip[k];
    147       k1 = 2 * k + ip[j];
    148       xr = a[j1 + 0];
    149       xi = a[j1 + 1];
    150       yr = a[k1 + 0];
    151       yi = a[k1 + 1];
    152       a[j1 + 0] = yr;
    153       a[j1 + 1] = yi;
    154       a[k1 + 0] = xr;
    155       a[k1 + 1] = xi;
    156       j1 += 8;
    157       k1 += 16;
    158       xr = a[j1 + 0];
    159       xi = a[j1 + 1];
    160       yr = a[k1 + 0];
    161       yi = a[k1 + 1];
    162       a[j1 + 0] = yr;
    163       a[j1 + 1] = yi;
    164       a[k1 + 0] = xr;
    165       a[k1 + 1] = xi;
    166       j1 += 8;
    167       k1 -= 8;
    168       xr = a[j1 + 0];
    169       xi = a[j1 + 1];
    170       yr = a[k1 + 0];
    171       yi = a[k1 + 1];
    172       a[j1 + 0] = yr;
    173       a[j1 + 1] = yi;
    174       a[k1 + 0] = xr;
    175       a[k1 + 1] = xi;
    176       j1 += 8;
    177       k1 += 16;
    178       xr = a[j1 + 0];
    179       xi = a[j1 + 1];
    180       yr = a[k1 + 0];
    181       yi = a[k1 + 1];
    182       a[j1 + 0] = yr;
    183       a[j1 + 1] = yi;
    184       a[k1 + 0] = xr;
    185       a[k1 + 1] = xi;
    186     }
    187     j1 = 2 * k + 8 + ip[k];
    188     k1 = j1 + 8;
    189     xr = a[j1 + 0];
    190     xi = a[j1 + 1];
    191     yr = a[k1 + 0];
    192     yi = a[k1 + 1];
    193     a[j1 + 0] = yr;
    194     a[j1 + 1] = yi;
    195     a[k1 + 0] = xr;
    196     a[k1 + 1] = xi;
    197   }
    198 }
    199 
    200 static void cft1st_128_C(float* a) {
    201   const int n = 128;
    202   int j, k1, k2;
    203   float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
    204   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    205 
    206   // The processing of the first set of elements was simplified in C to avoid
    207   // some operations (multiplication by zero or one, addition of two elements
    208   // multiplied by the same weight, ...).
    209   x0r = a[0] + a[2];
    210   x0i = a[1] + a[3];
    211   x1r = a[0] - a[2];
    212   x1i = a[1] - a[3];
    213   x2r = a[4] + a[6];
    214   x2i = a[5] + a[7];
    215   x3r = a[4] - a[6];
    216   x3i = a[5] - a[7];
    217   a[0] = x0r + x2r;
    218   a[1] = x0i + x2i;
    219   a[4] = x0r - x2r;
    220   a[5] = x0i - x2i;
    221   a[2] = x1r - x3i;
    222   a[3] = x1i + x3r;
    223   a[6] = x1r + x3i;
    224   a[7] = x1i - x3r;
    225   wk1r = rdft_w[2];
    226   x0r = a[8] + a[10];
    227   x0i = a[9] + a[11];
    228   x1r = a[8] - a[10];
    229   x1i = a[9] - a[11];
    230   x2r = a[12] + a[14];
    231   x2i = a[13] + a[15];
    232   x3r = a[12] - a[14];
    233   x3i = a[13] - a[15];
    234   a[8] = x0r + x2r;
    235   a[9] = x0i + x2i;
    236   a[12] = x2i - x0i;
    237   a[13] = x0r - x2r;
    238   x0r = x1r - x3i;
    239   x0i = x1i + x3r;
    240   a[10] = wk1r * (x0r - x0i);
    241   a[11] = wk1r * (x0r + x0i);
    242   x0r = x3i + x1r;
    243   x0i = x3r - x1i;
    244   a[14] = wk1r * (x0i - x0r);
    245   a[15] = wk1r * (x0i + x0r);
    246   k1 = 0;
    247   for (j = 16; j < n; j += 16) {
    248     k1 += 2;
    249     k2 = 2 * k1;
    250     wk2r = rdft_w[k1 + 0];
    251     wk2i = rdft_w[k1 + 1];
    252     wk1r = rdft_w[k2 + 0];
    253     wk1i = rdft_w[k2 + 1];
    254     wk3r = rdft_wk3ri_first[k1 + 0];
    255     wk3i = rdft_wk3ri_first[k1 + 1];
    256     x0r = a[j + 0] + a[j + 2];
    257     x0i = a[j + 1] + a[j + 3];
    258     x1r = a[j + 0] - a[j + 2];
    259     x1i = a[j + 1] - a[j + 3];
    260     x2r = a[j + 4] + a[j + 6];
    261     x2i = a[j + 5] + a[j + 7];
    262     x3r = a[j + 4] - a[j + 6];
    263     x3i = a[j + 5] - a[j + 7];
    264     a[j + 0] = x0r + x2r;
    265     a[j + 1] = x0i + x2i;
    266     x0r -= x2r;
    267     x0i -= x2i;
    268     a[j + 4] = wk2r * x0r - wk2i * x0i;
    269     a[j + 5] = wk2r * x0i + wk2i * x0r;
    270     x0r = x1r - x3i;
    271     x0i = x1i + x3r;
    272     a[j + 2] = wk1r * x0r - wk1i * x0i;
    273     a[j + 3] = wk1r * x0i + wk1i * x0r;
    274     x0r = x1r + x3i;
    275     x0i = x1i - x3r;
    276     a[j + 6] = wk3r * x0r - wk3i * x0i;
    277     a[j + 7] = wk3r * x0i + wk3i * x0r;
    278     wk1r = rdft_w[k2 + 2];
    279     wk1i = rdft_w[k2 + 3];
    280     wk3r = rdft_wk3ri_second[k1 + 0];
    281     wk3i = rdft_wk3ri_second[k1 + 1];
    282     x0r = a[j + 8] + a[j + 10];
    283     x0i = a[j + 9] + a[j + 11];
    284     x1r = a[j + 8] - a[j + 10];
    285     x1i = a[j + 9] - a[j + 11];
    286     x2r = a[j + 12] + a[j + 14];
    287     x2i = a[j + 13] + a[j + 15];
    288     x3r = a[j + 12] - a[j + 14];
    289     x3i = a[j + 13] - a[j + 15];
    290     a[j + 8] = x0r + x2r;
    291     a[j + 9] = x0i + x2i;
    292     x0r -= x2r;
    293     x0i -= x2i;
    294     a[j + 12] = -wk2i * x0r - wk2r * x0i;
    295     a[j + 13] = -wk2i * x0i + wk2r * x0r;
    296     x0r = x1r - x3i;
    297     x0i = x1i + x3r;
    298     a[j + 10] = wk1r * x0r - wk1i * x0i;
    299     a[j + 11] = wk1r * x0i + wk1i * x0r;
    300     x0r = x1r + x3i;
    301     x0i = x1i - x3r;
    302     a[j + 14] = wk3r * x0r - wk3i * x0i;
    303     a[j + 15] = wk3r * x0i + wk3i * x0r;
    304   }
    305 }
    306 
    307 static void cftmdl_128_C(float* a) {
    308   const int l = 8;
    309   const int n = 128;
    310   const int m = 32;
    311   int j0, j1, j2, j3, k, k1, k2, m2;
    312   float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
    313   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    314 
    315   for (j0 = 0; j0 < l; j0 += 2) {
    316     j1 = j0 + 8;
    317     j2 = j0 + 16;
    318     j3 = j0 + 24;
    319     x0r = a[j0 + 0] + a[j1 + 0];
    320     x0i = a[j0 + 1] + a[j1 + 1];
    321     x1r = a[j0 + 0] - a[j1 + 0];
    322     x1i = a[j0 + 1] - a[j1 + 1];
    323     x2r = a[j2 + 0] + a[j3 + 0];
    324     x2i = a[j2 + 1] + a[j3 + 1];
    325     x3r = a[j2 + 0] - a[j3 + 0];
    326     x3i = a[j2 + 1] - a[j3 + 1];
    327     a[j0 + 0] = x0r + x2r;
    328     a[j0 + 1] = x0i + x2i;
    329     a[j2 + 0] = x0r - x2r;
    330     a[j2 + 1] = x0i - x2i;
    331     a[j1 + 0] = x1r - x3i;
    332     a[j1 + 1] = x1i + x3r;
    333     a[j3 + 0] = x1r + x3i;
    334     a[j3 + 1] = x1i - x3r;
    335   }
    336   wk1r = rdft_w[2];
    337   for (j0 = m; j0 < l + m; j0 += 2) {
    338     j1 = j0 + 8;
    339     j2 = j0 + 16;
    340     j3 = j0 + 24;
    341     x0r = a[j0 + 0] + a[j1 + 0];
    342     x0i = a[j0 + 1] + a[j1 + 1];
    343     x1r = a[j0 + 0] - a[j1 + 0];
    344     x1i = a[j0 + 1] - a[j1 + 1];
    345     x2r = a[j2 + 0] + a[j3 + 0];
    346     x2i = a[j2 + 1] + a[j3 + 1];
    347     x3r = a[j2 + 0] - a[j3 + 0];
    348     x3i = a[j2 + 1] - a[j3 + 1];
    349     a[j0 + 0] = x0r + x2r;
    350     a[j0 + 1] = x0i + x2i;
    351     a[j2 + 0] = x2i - x0i;
    352     a[j2 + 1] = x0r - x2r;
    353     x0r = x1r - x3i;
    354     x0i = x1i + x3r;
    355     a[j1 + 0] = wk1r * (x0r - x0i);
    356     a[j1 + 1] = wk1r * (x0r + x0i);
    357     x0r = x3i + x1r;
    358     x0i = x3r - x1i;
    359     a[j3 + 0] = wk1r * (x0i - x0r);
    360     a[j3 + 1] = wk1r * (x0i + x0r);
    361   }
    362   k1 = 0;
    363   m2 = 2 * m;
    364   for (k = m2; k < n; k += m2) {
    365     k1 += 2;
    366     k2 = 2 * k1;
    367     wk2r = rdft_w[k1 + 0];
    368     wk2i = rdft_w[k1 + 1];
    369     wk1r = rdft_w[k2 + 0];
    370     wk1i = rdft_w[k2 + 1];
    371     wk3r = rdft_wk3ri_first[k1 + 0];
    372     wk3i = rdft_wk3ri_first[k1 + 1];
    373     for (j0 = k; j0 < l + k; j0 += 2) {
    374       j1 = j0 + 8;
    375       j2 = j0 + 16;
    376       j3 = j0 + 24;
    377       x0r = a[j0 + 0] + a[j1 + 0];
    378       x0i = a[j0 + 1] + a[j1 + 1];
    379       x1r = a[j0 + 0] - a[j1 + 0];
    380       x1i = a[j0 + 1] - a[j1 + 1];
    381       x2r = a[j2 + 0] + a[j3 + 0];
    382       x2i = a[j2 + 1] + a[j3 + 1];
    383       x3r = a[j2 + 0] - a[j3 + 0];
    384       x3i = a[j2 + 1] - a[j3 + 1];
    385       a[j0 + 0] = x0r + x2r;
    386       a[j0 + 1] = x0i + x2i;
    387       x0r -= x2r;
    388       x0i -= x2i;
    389       a[j2 + 0] = wk2r * x0r - wk2i * x0i;
    390       a[j2 + 1] = wk2r * x0i + wk2i * x0r;
    391       x0r = x1r - x3i;
    392       x0i = x1i + x3r;
    393       a[j1 + 0] = wk1r * x0r - wk1i * x0i;
    394       a[j1 + 1] = wk1r * x0i + wk1i * x0r;
    395       x0r = x1r + x3i;
    396       x0i = x1i - x3r;
    397       a[j3 + 0] = wk3r * x0r - wk3i * x0i;
    398       a[j3 + 1] = wk3r * x0i + wk3i * x0r;
    399     }
    400     wk1r = rdft_w[k2 + 2];
    401     wk1i = rdft_w[k2 + 3];
    402     wk3r = rdft_wk3ri_second[k1 + 0];
    403     wk3i = rdft_wk3ri_second[k1 + 1];
    404     for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
    405       j1 = j0 + 8;
    406       j2 = j0 + 16;
    407       j3 = j0 + 24;
    408       x0r = a[j0 + 0] + a[j1 + 0];
    409       x0i = a[j0 + 1] + a[j1 + 1];
    410       x1r = a[j0 + 0] - a[j1 + 0];
    411       x1i = a[j0 + 1] - a[j1 + 1];
    412       x2r = a[j2 + 0] + a[j3 + 0];
    413       x2i = a[j2 + 1] + a[j3 + 1];
    414       x3r = a[j2 + 0] - a[j3 + 0];
    415       x3i = a[j2 + 1] - a[j3 + 1];
    416       a[j0 + 0] = x0r + x2r;
    417       a[j0 + 1] = x0i + x2i;
    418       x0r -= x2r;
    419       x0i -= x2i;
    420       a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
    421       a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
    422       x0r = x1r - x3i;
    423       x0i = x1i + x3r;
    424       a[j1 + 0] = wk1r * x0r - wk1i * x0i;
    425       a[j1 + 1] = wk1r * x0i + wk1i * x0r;
    426       x0r = x1r + x3i;
    427       x0i = x1i - x3r;
    428       a[j3 + 0] = wk3r * x0r - wk3i * x0i;
    429       a[j3 + 1] = wk3r * x0i + wk3i * x0r;
    430     }
    431   }
    432 }
    433 
    434 static void cftfsub_128_C(float* a) {
    435   int j, j1, j2, j3, l;
    436   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    437 
    438   cft1st_128(a);
    439   cftmdl_128(a);
    440   l = 32;
    441   for (j = 0; j < l; j += 2) {
    442     j1 = j + l;
    443     j2 = j1 + l;
    444     j3 = j2 + l;
    445     x0r = a[j] + a[j1];
    446     x0i = a[j + 1] + a[j1 + 1];
    447     x1r = a[j] - a[j1];
    448     x1i = a[j + 1] - a[j1 + 1];
    449     x2r = a[j2] + a[j3];
    450     x2i = a[j2 + 1] + a[j3 + 1];
    451     x3r = a[j2] - a[j3];
    452     x3i = a[j2 + 1] - a[j3 + 1];
    453     a[j] = x0r + x2r;
    454     a[j + 1] = x0i + x2i;
    455     a[j2] = x0r - x2r;
    456     a[j2 + 1] = x0i - x2i;
    457     a[j1] = x1r - x3i;
    458     a[j1 + 1] = x1i + x3r;
    459     a[j3] = x1r + x3i;
    460     a[j3 + 1] = x1i - x3r;
    461   }
    462 }
    463 
    464 static void cftbsub_128_C(float* a) {
    465   int j, j1, j2, j3, l;
    466   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    467 
    468   cft1st_128(a);
    469   cftmdl_128(a);
    470   l = 32;
    471 
    472   for (j = 0; j < l; j += 2) {
    473     j1 = j + l;
    474     j2 = j1 + l;
    475     j3 = j2 + l;
    476     x0r = a[j] + a[j1];
    477     x0i = -a[j + 1] - a[j1 + 1];
    478     x1r = a[j] - a[j1];
    479     x1i = -a[j + 1] + a[j1 + 1];
    480     x2r = a[j2] + a[j3];
    481     x2i = a[j2 + 1] + a[j3 + 1];
    482     x3r = a[j2] - a[j3];
    483     x3i = a[j2 + 1] - a[j3 + 1];
    484     a[j] = x0r + x2r;
    485     a[j + 1] = x0i - x2i;
    486     a[j2] = x0r - x2r;
    487     a[j2 + 1] = x0i + x2i;
    488     a[j1] = x1r - x3i;
    489     a[j1 + 1] = x1i - x3r;
    490     a[j3] = x1r + x3i;
    491     a[j3 + 1] = x1i + x3r;
    492   }
    493 }
    494 
    495 static void rftfsub_128_C(float* a) {
    496   const float* c = rdft_w + 32;
    497   int j1, j2, k1, k2;
    498   float wkr, wki, xr, xi, yr, yi;
    499 
    500   for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
    501     k2 = 128 - j2;
    502     k1 = 32 - j1;
    503     wkr = 0.5f - c[k1];
    504     wki = c[j1];
    505     xr = a[j2 + 0] - a[k2 + 0];
    506     xi = a[j2 + 1] + a[k2 + 1];
    507     yr = wkr * xr - wki * xi;
    508     yi = wkr * xi + wki * xr;
    509     a[j2 + 0] -= yr;
    510     a[j2 + 1] -= yi;
    511     a[k2 + 0] += yr;
    512     a[k2 + 1] -= yi;
    513   }
    514 }
    515 
    516 static void rftbsub_128_C(float* a) {
    517   const float* c = rdft_w + 32;
    518   int j1, j2, k1, k2;
    519   float wkr, wki, xr, xi, yr, yi;
    520 
    521   a[1] = -a[1];
    522   for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
    523     k2 = 128 - j2;
    524     k1 = 32 - j1;
    525     wkr = 0.5f - c[k1];
    526     wki = c[j1];
    527     xr = a[j2 + 0] - a[k2 + 0];
    528     xi = a[j2 + 1] + a[k2 + 1];
    529     yr = wkr * xr + wki * xi;
    530     yi = wkr * xi - wki * xr;
    531     a[j2 + 0] = a[j2 + 0] - yr;
    532     a[j2 + 1] = yi - a[j2 + 1];
    533     a[k2 + 0] = yr + a[k2 + 0];
    534     a[k2 + 1] = yi - a[k2 + 1];
    535   }
    536   a[65] = -a[65];
    537 }
    538 
    539 void aec_rdft_forward_128(float* a) {
    540   float xi;
    541   bitrv2_128(a);
    542   cftfsub_128(a);
    543   rftfsub_128(a);
    544   xi = a[0] - a[1];
    545   a[0] += a[1];
    546   a[1] = xi;
    547 }
    548 
    549 void aec_rdft_inverse_128(float* a) {
    550   a[1] = 0.5f * (a[0] - a[1]);
    551   a[0] -= a[1];
    552   rftbsub_128(a);
    553   bitrv2_128(a);
    554   cftbsub_128(a);
    555 }
    556 
// Code path selection.
// One file-scope object of type RftSub128 (declared outside this file) per
// transform stage. They are invoked as functions taking the float buffer,
// e.g. cft1st_128(a). aec_rdft_init() assigns the C implementation to each of
// them and then, depending on build flags and CPU checks, calls the
// platform-specific initializers.
// NOTE(review): those initializers presumably overwrite some of these
// pointers with optimized versions -- confirm in the platform sources.
RftSub128 cft1st_128;
RftSub128 cftmdl_128;
RftSub128 rftfsub_128;
RftSub128 rftbsub_128;
RftSub128 cftfsub_128;
RftSub128 cftbsub_128;
RftSub128 bitrv2_128;
    565 
// Assigns an implementation to each of the file-scope RftSub128 function
// pointers (cft1st_128, cftmdl_128, rftfsub_128, rftbsub_128, cftfsub_128,
// cftbsub_128, bitrv2_128). aec_rdft_forward_128() and aec_rdft_inverse_128()
// call through these pointers, so this function has to run before them.
void aec_rdft_init(void) {
  // Start from the portable C implementations defined in this file.
  cft1st_128 = cft1st_128_C;
  cftmdl_128 = cftmdl_128_C;
  rftfsub_128 = rftfsub_128_C;
  rftbsub_128 = rftbsub_128_C;
  cftfsub_128 = cftfsub_128_C;
  cftbsub_128 = cftbsub_128_C;
  bitrv2_128 = bitrv2_128_C;
  // The platform initializers below run after the assignments above.
  // NOTE(review): presumably each one replaces a subset of the pointers;
  // their definitions are not in this file.
#if defined(WEBRTC_ARCH_X86_FAMILY)
  // x86 builds: only when the run-time CPU query reports SSE2.
  if (WebRtc_GetCPUInfo(kSSE2)) {
    aec_rdft_init_sse2();
  }
#endif
#if defined(MIPS_FPU_LE)
  // Builds with MIPS_FPU_LE defined: unconditional.
  aec_rdft_init_mips();
#endif
#if defined(WEBRTC_HAS_NEON)
  // NEON enabled at build time: unconditional.
  aec_rdft_init_neon();
#elif defined(WEBRTC_DETECT_NEON)
  // NEON detected at run time through the ARM CPU feature mask.
  if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
    aec_rdft_init_neon();
  }
#endif
}
    590