/*
 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
 * Copyright Takuya OOURA, 1996-2001
 *
 * You may use, copy, modify and distribute this code for any purpose (include
 * commercial use) and without fee. Please refer to this package when you modify
 * this code.
 *
 * Changes by the WebRTC authors:
 *    - Trivial type modifications.
 *    - Minimal code subset to do rdft of length 128.
 *    - Optimizations because of known length.
 *
 *  All changes are covered by the WebRTC license and IP grant:
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/modules/audio_processing/aec/aec_rdft.h"

#include <math.h>

#include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
#include "webrtc/typedefs.h"

// These tables used to be computed at run-time. For example, refer to:
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
// to see the initialization code.
// Constant multiplier table shared by the C code paths in this file.
//   - rdft_w[0..31] is read as consecutive (real, imaginary) pairs by
//     cft1st_128_C() and cftmdl_128_C().
//   - rdft_w + 32 is the 32-entry table `c` read by rftfsub_128_C() and
//     rftbsub_128_C().
const float rdft_w[64] = {
    1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
    0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
    0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
    0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
    0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
    0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
    0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
    0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
    0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
    0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
    0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
    0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
    0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
    0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
    0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
    0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
};
// (real, imaginary) pairs loaded into wk3r/wk3i by cft1st_128_C() and
// cftmdl_128_C() for the first half of each loop iteration.
const float rdft_wk3ri_first[16] = {
    1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
    0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
    0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
    0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
};
// (real, imaginary) pairs loaded into wk3r/wk3i by cft1st_128_C() and
// cftmdl_128_C() for the second half of each loop iteration.
const float rdft_wk3ri_second[16] = {
    -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
    -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
    -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
    -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
};
// The tables from here down to cftmdl_wk1r are not referenced by the C code
// in this file.
// NOTE(review): presumably they hold the same factors rearranged for the
// platform-specific implementations installed by aec_rdft_init() -- confirm
// against those files. ALIGN16_BEG / ALIGN16_END are defined in an included
// header; the names suggest a 16-byte alignment request -- TODO confirm.
ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
    0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
    0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
    0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
    0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
    0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
    0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
    1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
    0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
    0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
    0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
    0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
    0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
    0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
    1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
    0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
    -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
    0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
    0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
    0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
    -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
    -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
    -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
    -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
    -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
    -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
    -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
    -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
    -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
    -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
    -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
    -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
    -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
    -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
};
ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
    -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
    -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
    -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
    -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
    -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
    -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
    -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
};
ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
    0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
};

// Reorders a[] in place by swapping pairs of adjacent floats (a[i], a[i + 1]).
// The index pairs to swap are derived from the small offset table ip[].
// NOTE(review): the name suggests this is the bit-reversal permutation of
// Ooura's FFT -- confirm against the upstream package.
static void bitrv2_128_C(float* a) {
  /*
      Following things have been attempted but are no faster:
      (a) Storing the swap indexes in a LUT (index calculations are done
          for 'free' while waiting on memory/L1).
      (b) Consolidate the load/store of two consecutive floats by a 64 bit
          integer (execution is memory/L1 bound).
      (c) Do a mix of floats and 64 bit integer to maximize register
          utilization (execution is memory/L1 bound).
      (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
      (e) Hard-coding of the offsets to completely eliminate index
          calculations.
  */

  unsigned int j, j1, k, k1;
  float xr, xi, yr, yi;

  static const int ip[4] = {0, 64, 32, 96};
  for (k = 0; k < 4; k++) {
    // For every j < k, four swaps between the pair at j1 and the pair at k1.
    for (j = 0; j < k; j++) {
      j1 = 2 * j + ip[k];
      k1 = 2 * k + ip[j];
      xr = a[j1 + 0];
      xi = a[j1 + 1];
      yr = a[k1 + 0];
      yi = a[k1 + 1];
      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
      j1 += 8;
      k1 += 16;
      xr = a[j1 + 0];
      xi = a[j1 + 1];
      yr = a[k1 + 0];
      yi = a[k1 + 1];
      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
      j1 += 8;
      k1 -= 8;
      xr = a[j1 + 0];
      xi = a[j1 + 1];
      yr = a[k1 + 0];
      yi = a[k1 + 1];
      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
      j1 += 8;
      k1 += 16;
      xr = a[j1 + 0];
      xi = a[j1 + 1];
      yr = a[k1 + 0];
      yi = a[k1 + 1];
      a[j1 + 0] = yr;
      a[j1 + 1] = yi;
      a[k1 + 0] = xr;
      a[k1 + 1] = xi;
    }
    // One additional swap per k, between two pairs that are 8 floats apart.
    j1 = 2 * k + 8 + ip[k];
    k1 = j1 + 8;
    xr = a[j1 + 0];
    xi = a[j1 + 1];
    yr = a[k1 + 0];
    yi = a[k1 + 1];
    a[j1 + 0] = yr;
    a[j1 + 1] = yi;
    a[k1 + 0] = xr;
    a[k1 + 1] = xi;
  }
}

// C implementation of the first butterfly pass. a[0..127] is treated as
// interleaved (real, imaginary) pairs and is processed in place in
// consecutive groups of 16 floats. Each group is handled as two sets of four
// adjacent complex values: every set gets a 4-point butterfly and three of
// its four outputs are then multiplied by complex factors taken from rdft_w,
// rdft_wk3ri_first and rdft_wk3ri_second.
static void cft1st_128_C(float* a) {
  const int n = 128;
  int j, k1, k2;
  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  // The processing of the first set of elements was simplified in C to avoid
  // some operations (multiplication by zero or one, addition of two elements
  // multiplied by the same weight, ...).

  // a[0..7]: plain 4-point butterfly, no multiplication needed.
  x0r = a[0] + a[2];
  x0i = a[1] + a[3];
  x1r = a[0] - a[2];
  x1i = a[1] - a[3];
  x2r = a[4] + a[6];
  x2i = a[5] + a[7];
  x3r = a[4] - a[6];
  x3i = a[5] - a[7];
  a[0] = x0r + x2r;
  a[1] = x0i + x2i;
  a[4] = x0r - x2r;
  a[5] = x0i - x2i;
  a[2] = x1r - x3i;
  a[3] = x1i + x3r;
  a[6] = x1r + x3i;
  a[7] = x1i - x3r;
  // a[8..15]: the only table value needed is wk1r = rdft_w[2].
  wk1r = rdft_w[2];
  x0r = a[8] + a[10];
  x0i = a[9] + a[11];
  x1r = a[8] - a[10];
  x1i = a[9] - a[11];
  x2r = a[12] + a[14];
  x2i = a[13] + a[15];
  x3r = a[12] - a[14];
  x3i = a[13] - a[15];
  a[8] = x0r + x2r;
  a[9] = x0i + x2i;
  // (x0 - x2) multiplied by i.
  a[12] = x2i - x0i;
  a[13] = x0r - x2r;
  // (x1 + i * x3) multiplied by wk1r * (1 + i).
  x0r = x1r - x3i;
  x0i = x1i + x3r;
  a[10] = wk1r * (x0r - x0i);
  a[11] = wk1r * (x0r + x0i);
  // (x1 - i * x3) multiplied by wk1r * (-1 + i); note that x0i holds the
  // negated imaginary part here.
  x0r = x3i + x1r;
  x0i = x3r - x1i;
  a[14] = wk1r * (x0i - x0r);
  a[15] = wk1r * (x0i + x0r);
  // Remaining groups: j = 16, 32, ..., 112 with k1 = 2, 4, ..., 14.
  k1 = 0;
  for (j = 16; j < n; j += 16) {
    k1 += 2;
    k2 = 2 * k1;
    wk2r = rdft_w[k1 + 0];
    wk2i = rdft_w[k1 + 1];
    wk1r = rdft_w[k2 + 0];
    wk1i = rdft_w[k2 + 1];
    wk3r = rdft_wk3ri_first[k1 + 0];
    wk3i = rdft_wk3ri_first[k1 + 1];
    // First half of the group: a[j + 0 .. j + 7].
    x0r = a[j + 0] + a[j + 2];
    x0i = a[j + 1] + a[j + 3];
    x1r = a[j + 0] - a[j + 2];
    x1i = a[j + 1] - a[j + 3];
    x2r = a[j + 4] + a[j + 6];
    x2i = a[j + 5] + a[j + 7];
    x3r = a[j + 4] - a[j + 6];
    x3i = a[j + 5] - a[j + 7];
    a[j + 0] = x0r + x2r;
    a[j + 1] = x0i + x2i;
    x0r -= x2r;
    x0i -= x2i;
    // Complex multiply of (x0 - x2) by (wk2r + i * wk2i).
    a[j + 4] = wk2r * x0r - wk2i * x0i;
    a[j + 5] = wk2r * x0i + wk2i * x0r;
    x0r = x1r - x3i;
    x0i = x1i + x3r;
    // Complex multiply of (x1 + i * x3) by (wk1r + i * wk1i).
    a[j + 2] = wk1r * x0r - wk1i * x0i;
    a[j + 3] = wk1r * x0i + wk1i * x0r;
    x0r = x1r + x3i;
    x0i = x1i - x3r;
    // Complex multiply of (x1 - i * x3) by (wk3r + i * wk3i).
    a[j + 6] = wk3r * x0r - wk3i * x0i;
    a[j + 7] = wk3r * x0i + wk3i * x0r;
    // Second half of the group: a[j + 8 .. j + 15]. wk2r/wk2i are reused,
    // wk1 and wk3 are reloaded.
    wk1r = rdft_w[k2 + 2];
    wk1i = rdft_w[k2 + 3];
    wk3r = rdft_wk3ri_second[k1 + 0];
    wk3i = rdft_wk3ri_second[k1 + 1];
    x0r = a[j + 8] + a[j + 10];
    x0i = a[j + 9] + a[j + 11];
    x1r = a[j + 8] - a[j + 10];
    x1i = a[j + 9] - a[j + 11];
    x2r = a[j + 12] + a[j + 14];
    x2i = a[j + 13] + a[j + 15];
    x3r = a[j + 12] - a[j + 14];
    x3i = a[j + 13] - a[j + 15];
    a[j + 8] = x0r + x2r;
    a[j + 9] = x0i + x2i;
    x0r -= x2r;
    x0i -= x2i;
    // Complex multiply of (x0 - x2) by (-wk2i + i * wk2r).
    a[j + 12] = -wk2i * x0r - wk2r * x0i;
    a[j + 13] = -wk2i * x0i + wk2r * x0r;
    x0r = x1r - x3i;
    x0i = x1i + x3r;
    a[j + 10] = wk1r * x0r - wk1i * x0i;
    a[j + 11] = wk1r * x0i + wk1i * x0r;
    x0r = x1r + x3i;
    x0i = x1i - x3r;
    a[j + 14] = wk3r * x0r - wk3i * x0i;
    a[j + 15] = wk3r * x0i + wk3i * x0r;
  }
}

// C implementation of the second butterfly pass. a[0..127] is processed in
// place in four blocks of 32 floats (starting at 0, 32, 64 and 96). Inside a
// block, each 4-point butterfly combines complex values that are 8 floats
// apart (j0, j0 + 8, j0 + 16, j0 + 24) for j0 = block start + {0, 2, 4, 6}.
// The arithmetic per block mirrors the corresponding case in cft1st_128_C().
static void cftmdl_128_C(float* a) {
  const int l = 8;
  const int n = 128;
  const int m = 32;
  int j0, j1, j2, j3, k, k1, k2, m2;
  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  // Block at 0: plain butterflies, no multiplication.
  for (j0 = 0; j0 < l; j0 += 2) {
    j1 = j0 + 8;
    j2 = j0 + 16;
    j3 = j0 + 24;
    x0r = a[j0 + 0] + a[j1 + 0];
    x0i = a[j0 + 1] + a[j1 + 1];
    x1r = a[j0 + 0] - a[j1 + 0];
    x1i = a[j0 + 1] - a[j1 + 1];
    x2r = a[j2 + 0] + a[j3 + 0];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2 + 0] - a[j3 + 0];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j0 + 0] = x0r + x2r;
    a[j0 + 1] = x0i + x2i;
    a[j2 + 0] = x0r - x2r;
    a[j2 + 1] = x0i - x2i;
    a[j1 + 0] = x1r - x3i;
    a[j1 + 1] = x1i + x3r;
    a[j3 + 0] = x1r + x3i;
    a[j3 + 1] = x1i - x3r;
  }
  // Block at m (32): only the single table value rdft_w[2] is needed.
  wk1r = rdft_w[2];
  for (j0 = m; j0 < l + m; j0 += 2) {
    j1 = j0 + 8;
    j2 = j0 + 16;
    j3 = j0 + 24;
    x0r = a[j0 + 0] + a[j1 + 0];
    x0i = a[j0 + 1] + a[j1 + 1];
    x1r = a[j0 + 0] - a[j1 + 0];
    x1i = a[j0 + 1] - a[j1 + 1];
    x2r = a[j2 + 0] + a[j3 + 0];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2 + 0] - a[j3 + 0];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j0 + 0] = x0r + x2r;
    a[j0 + 1] = x0i + x2i;
    // (x0 - x2) multiplied by i.
    a[j2 + 0] = x2i - x0i;
    a[j2 + 1] = x0r - x2r;
    // (x1 + i * x3) multiplied by wk1r * (1 + i).
    x0r = x1r - x3i;
    x0i = x1i + x3r;
    a[j1 + 0] = wk1r * (x0r - x0i);
    a[j1 + 1] = wk1r * (x0r + x0i);
    // (x1 - i * x3) multiplied by wk1r * (-1 + i); x0i holds the negated
    // imaginary part here.
    x0r = x3i + x1r;
    x0i = x3r - x1i;
    a[j3 + 0] = wk1r * (x0i - x0r);
    a[j3 + 1] = wk1r * (x0i + x0r);
  }
  k1 = 0;
  m2 = 2 * m;
  // With m2 == 64 and n == 128 this loop body runs exactly once (k == 64,
  // k1 == 2, k2 == 4) and covers the blocks at 64 and 96.
  for (k = m2; k < n; k += m2) {
    k1 += 2;
    k2 = 2 * k1;
    wk2r = rdft_w[k1 + 0];
    wk2i = rdft_w[k1 + 1];
    wk1r = rdft_w[k2 + 0];
    wk1i = rdft_w[k2 + 1];
    wk3r = rdft_wk3ri_first[k1 + 0];
    wk3i = rdft_wk3ri_first[k1 + 1];
    // Block at k.
    for (j0 = k; j0 < l + k; j0 += 2) {
      j1 = j0 + 8;
      j2 = j0 + 16;
      j3 = j0 + 24;
      x0r = a[j0 + 0] + a[j1 + 0];
      x0i = a[j0 + 1] + a[j1 + 1];
      x1r = a[j0 + 0] - a[j1 + 0];
      x1i = a[j0 + 1] - a[j1 + 1];
      x2r = a[j2 + 0] + a[j3 + 0];
      x2i = a[j2 + 1] + a[j3 + 1];
      x3r = a[j2 + 0] - a[j3 + 0];
      x3i = a[j2 + 1] - a[j3 + 1];
      a[j0 + 0] = x0r + x2r;
      a[j0 + 1] = x0i + x2i;
      x0r -= x2r;
      x0i -= x2i;
      // Complex multiply of (x0 - x2) by (wk2r + i * wk2i).
      a[j2 + 0] = wk2r * x0r - wk2i * x0i;
      a[j2 + 1] = wk2r * x0i + wk2i * x0r;
      x0r = x1r - x3i;
      x0i = x1i + x3r;
      // Complex multiply of (x1 + i * x3) by (wk1r + i * wk1i).
      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
      x0r = x1r + x3i;
      x0i = x1i - x3r;
      // Complex multiply of (x1 - i * x3) by (wk3r + i * wk3i).
      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
    }
    // Block at k + m: wk2r/wk2i are reused, wk1 and wk3 are reloaded.
    wk1r = rdft_w[k2 + 2];
    wk1i = rdft_w[k2 + 3];
    wk3r = rdft_wk3ri_second[k1 + 0];
    wk3i = rdft_wk3ri_second[k1 + 1];
    for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
      j1 = j0 + 8;
      j2 = j0 + 16;
      j3 = j0 + 24;
      x0r = a[j0 + 0] + a[j1 + 0];
      x0i = a[j0 + 1] + a[j1 + 1];
      x1r = a[j0 + 0] - a[j1 + 0];
      x1i = a[j0 + 1] - a[j1 + 1];
      x2r = a[j2 + 0] + a[j3 + 0];
      x2i = a[j2 + 1] + a[j3 + 1];
      x3r = a[j2 + 0] - a[j3 + 0];
      x3i = a[j2 + 1] - a[j3 + 1];
      a[j0 + 0] = x0r + x2r;
      a[j0 + 1] = x0i + x2i;
      x0r -= x2r;
      x0i -= x2i;
      // Complex multiply of (x0 - x2) by (-wk2i + i * wk2r).
      a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
      a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
      x0r = x1r - x3i;
      x0i = x1i + x3r;
      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
      x0r = x1r + x3i;
      x0i = x1i - x3r;
      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
    }
  }
}

// Runs the three butterfly passes used by aec_rdft_forward_128(): the first
// two through the cft1st_128 / cftmdl_128 function pointers (so a
// platform-specific version may be used), then a final pass done here that
// combines complex values 32 floats apart without any multiplication.
// a[0..127] is updated in place.
static void cftfsub_128_C(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  l = 32;
  for (j = 0; j < l; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    x0i = a[j + 1] + a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = a[j + 1] - a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i + x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i - x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i + x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i - x3r;
  }
}

// Counterpart of cftfsub_128_C() used by aec_rdft_inverse_128(). The first two
// passes are the same calls; the final pass writes the same real parts as
// cftfsub_128_C() but every imaginary part with the opposite sign.
// a[0..127] is updated in place.
static void cftbsub_128_C(float* a) {
  int j, j1, j2, j3, l;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  l = 32;

  for (j = 0; j < l; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    // x0i and x1i carry negated imaginary parts compared to cftfsub_128_C().
    x0i = -a[j + 1] - a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = -a[j + 1] + a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i - x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i + x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i - x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i + x3r;
  }
}

// Post-processing step of aec_rdft_forward_128(). For j2 = 2, 4, ..., 62 the
// complex value at a[j2] and the one at a[128 - j2] are updated together, in
// place, using factors from the second half of rdft_w. a[0], a[1], a[64] and
// a[65] are not touched here.
static void rftfsub_128_C(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;  // Index of the partner pair.
    k1 = 32 - j1;   // The table is read from both ends: c[k1] and c[j1].
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    // (yr, yi) = (wkr + i * wki) * (xr + i * xi).
    yr = wkr * xr - wki * xi;
    yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}

// Pre-processing step of aec_rdft_inverse_128(). Same pairing of a[j2] with
// a[128 - j2] as rftfsub_128_C(), but with the sign of wki flipped in the
// products and with the imaginary parts written back negated. a[1] and a[65]
// are negated as well; a[0] and a[64] are not touched here.
static void rftbsub_128_C(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  a[1] = -a[1];
  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    // (yr, yi) = (wkr - i * wki) * (xr + i * xi).
    yr = wkr * xr + wki * xi;
    yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}

// Forward transform, computed in place on `a`. The C code paths index
// a[0..127], so `a` must provide at least 128 floats. All stages are called
// through the function pointers defined below in this file, so
// aec_rdft_init() has to have been called first.
// NOTE(review): the exact meaning of the output layout (in particular of
// a[0] and a[1]) follows Ooura's rdft convention -- confirm against the
// upstream documentation.
void aec_rdft_forward_128(float* a) {
  float xi;
  bitrv2_128(a);
  cftfsub_128(a);
  rftfsub_128(a);
  // Replace (a[0], a[1]) by (a[0] + a[1], a[0] - a[1]).
  xi = a[0] - a[1];
  a[0] += a[1];
  a[1] = xi;
}

// Inverse transform, computed in place on `a` (at least 128 floats, see
// aec_rdft_forward_128()). aec_rdft_init() has to have been called first.
// No rescaling of the result is performed in this function other than the
// 0.5 factor applied to a[0] and a[1] below.
void aec_rdft_inverse_128(float* a) {
  // Replace (a[0], a[1]) by (0.5 * (a[0] + a[1]), 0.5 * (a[0] - a[1])), which
  // inverts the sum/difference step at the end of aec_rdft_forward_128().
  a[1] = 0.5f * (a[0] - a[1]);
  a[0] -= a[1];
  rftbsub_128(a);
  bitrv2_128(a);
  cftbsub_128(a);
}

// code path selection
// One pointer per transform stage. They carry no initializer here and are
// assigned in aec_rdft_init(), which therefore must run before
// aec_rdft_forward_128() / aec_rdft_inverse_128() are used.
RftSub128 cft1st_128;
RftSub128 cftmdl_128;
RftSub128 rftfsub_128;
RftSub128 rftbsub_128;
RftSub128 cftfsub_128;
RftSub128 cftbsub_128;
RftSub128 bitrv2_128;

// Points every stage at its portable C implementation, then calls the
// platform-specific init routines that are compiled in (and, for SSE2 and
// detected NEON, reported as available at run time).
// NOTE(review): those routines are defined elsewhere; presumably they
// overwrite some of the pointers set here -- confirm in their sources.
void aec_rdft_init(void) {
  cft1st_128 = cft1st_128_C;
  cftmdl_128 = cftmdl_128_C;
  rftfsub_128 = rftfsub_128_C;
  rftbsub_128 = rftbsub_128_C;
  cftfsub_128 = cftfsub_128_C;
  cftbsub_128 = cftbsub_128_C;
  bitrv2_128 = bitrv2_128_C;
#if defined(WEBRTC_ARCH_X86_FAMILY)
  // Run-time check for SSE2.
  if (WebRtc_GetCPUInfo(kSSE2)) {
    aec_rdft_init_sse2();
  }
#endif
#if defined(MIPS_FPU_LE)
  aec_rdft_init_mips();
#endif
#if defined(WEBRTC_HAS_NEON)
  // NEON enabled at build time: no run-time check.
  aec_rdft_init_neon();
#elif defined(WEBRTC_DETECT_NEON)
  // Otherwise NEON is used only when the CPU reports the feature.
  if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
    aec_rdft_init_neon();
  }
#endif
}