/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * fft.c
 *
 * Fast Fourier Transform
 *
 */


#include "fft.h"

/*
 * Output permutation for the 240-point transform.
 * After the butterfly stages, output element ii is taken from position
 * kSortTabFft[ii] of the intermediate result (see the two copy loops at the
 * end of WebRtcIsacfix_FftRadix16Fastest).
 */
const int16_t kSortTabFft[240] = {
  0, 60, 120, 180, 20, 80, 140, 200, 40, 100, 160, 220,
  4, 64, 124, 184, 24, 84, 144, 204, 44, 104, 164, 224,
  8, 68, 128, 188, 28, 88, 148, 208, 48, 108, 168, 228,
  12, 72, 132, 192, 32, 92, 152, 212, 52, 112, 172, 232,
  16, 76, 136, 196, 36, 96, 156, 216, 56, 116, 176, 236,
  1, 61, 121, 181, 21, 81, 141, 201, 41, 101, 161, 221,
  5, 65, 125, 185, 25, 85, 145, 205, 45, 105, 165, 225,
  9, 69, 129, 189, 29, 89, 149, 209, 49, 109, 169, 229,
  13, 73, 133, 193, 33, 93, 153, 213, 53, 113, 173, 233,
  17, 77, 137, 197, 37, 97, 157, 217, 57, 117, 177, 237,
  2, 62, 122, 182, 22, 82, 142, 202, 42, 102, 162, 222,
  6, 66, 126, 186, 26, 86, 146, 206, 46, 106, 166, 226,
  10, 70, 130, 190, 30, 90, 150, 210, 50, 110, 170, 230,
  14, 74, 134, 194, 34, 94, 154, 214, 54, 114, 174, 234,
  18, 78, 138, 198, 38, 98, 158, 218, 58, 118, 178, 238,
  3, 63, 123, 183, 23, 83, 143, 203, 43, 103, 163, 223,
  7, 67, 127, 187, 27, 87, 147, 207, 47, 107, 167, 227,
  11, 71, 131, 191, 31, 91, 151, 211, 51, 111, 171, 231,
  15, 75, 135, 195, 35, 95, 155, 215, 55, 115, 175, 235,
  19, 79, 139, 199, 39, 99, 159, 219, 59, 119, 179, 239
};

/* Cosine table in Q14.
 * The 240 entries cover one full cosine period (entry 0 is 16384, entry 60
 * is 0, entry 120 is -16384, entry 180 is 0), i.e. entry i holds
 * cos(2*pi*i/240) scaled by 2^14. The FFT below also uses it for its
 * sine-role factors by reading 60 entries (a quarter period) further on. */
const int16_t kCosTabFfftQ14[240] = {
  16384,  16378,  16362,   16333,  16294,  16244,  16182,  16110,  16026,  15931,  15826,  15709,
  15582,  15444,  15296,   15137,  14968,  14788,  14598,  14399,  14189,  13970,  13741,  13502,
  13255,  12998,  12733,   12458,  12176,  11885,  11585,  11278,  10963,  10641,  10311,   9974,
  9630,   9280,   8923,    8561,   8192,   7818,   7438,   7053,   6664,   6270,   5872,   5469,
  5063,   4653,   4240,    3825,   3406,   2986,   2563,   2139,   1713,   1285,    857,    429,
  0,   -429,   -857,   -1285,  -1713,  -2139,  -2563,  -2986,  -3406,  -3825,  -4240,  -4653,
  -5063,  -5469,  -5872,   -6270,  -6664,  -7053,  -7438,  -7818,  -8192,  -8561,  -8923,  -9280,
  -9630,  -9974, -10311,  -10641, -10963, -11278, -11585, -11885, -12176, -12458, -12733, -12998,
  -13255, -13502, -13741,  -13970, -14189, -14399, -14598, -14788, -14968, -15137, -15296, -15444,
  -15582, -15709, -15826,  -15931, -16026, -16110, -16182, -16244, -16294, -16333, -16362, -16378,
  -16384, -16378, -16362,  -16333, -16294, -16244, -16182, -16110, -16026, -15931, -15826, -15709,
  -15582, -15444, -15296,  -15137, -14968, -14788, -14598, -14399, -14189, -13970, -13741, -13502,
  -13255, -12998, -12733,  -12458, -12176, -11885, -11585, -11278, -10963, -10641, -10311,  -9974,
  -9630,  -9280,  -8923,   -8561,  -8192,  -7818,  -7438,  -7053,  -6664,  -6270,  -5872,  -5469,
  -5063,  -4653,  -4240,   -3825,  -3406,  -2986,  -2563,  -2139,  -1713,  -1285,   -857,   -429,
  0,    429,    857,    1285,   1713,   2139,   2563,   2986,   3406,   3825,   4240,   4653,
  5063,   5469,   5872,    6270,   6664,   7053,   7438,   7818,   8192,   8561,   8923,   9280,
  9630,   9974,  10311,   10641,  10963,  11278,  11585,  11885,  12176,  12458,  12733,  12998,
  13255,  13502,  13741,   13970,  14189,  14399,  14598,  14788,  14968,  15137,  15296,  15444,
  15582,  15709,  15826,   15931,  16026,  16110,  16182,  16244,  16294,  16333,  16362,  16378
};



/* Uses 16x16 mul, without rounding, which is faster. Uses WEBRTC_SPL_MUL_16_16_RSFT
 *
 * In-place 240-point complex transform on 16-bit fixed-point data, computed
 * as successive stages for the factors 4, 3, 5 and 4 (4*3*5*4 = 240), with
 * twiddle ("rotation") multiplications between stages and a final
 * permutation through kSortTabFft.
 *
 * RexQx  - real parts; elements 0..239 are read and overwritten.
 * ImxQx  - imaginary parts; elements 0..239 are read and overwritten.
 * iSign  - selects the transform direction: the butterflies branch on
 *          (iSign < 0) while the twiddle sines are negated when iSign == 1.
 *          NOTE(review): presumably callers only ever pass +1 or -1; any
 *          other value would mix the two conventions - confirm.
 *
 * Returns 0 unconditionally.
 *
 * All intermediate sums are stored back into int16_t without saturation in
 * this function. NOTE(review): presumably callers scale the input to leave
 * enough headroom - confirm against callers.
 *
 * WEBRTC_SPL_MUL_16_16_RSFT is not defined in this file; presumably it comes
 * in through fft.h. Per the inline comments it is used as
 * (Q14 * Qx) >> 14 = Qx.
 *
 * Two 240-element int16_t scratch arrays are kept on the stack for the final
 * reordering.
 */
int16_t WebRtcIsacfix_FftRadix16Fastest(int16_t RexQx[], int16_t ImxQx[], int16_t iSign) {

  int16_t dd, ee, ff, gg, hh, ii;
  int16_t k0, k1, k2, k3, k4, kk;
  int16_t tmp116, tmp216;

  int16_t ccc1Q14, ccc2Q14, ccc3Q14, sss1Q14, sss2Q14, sss3Q14;
  int16_t sss60Q14, ccc72Q14, sss72Q14;
  int16_t aaQx, ajQx, akQx, ajmQx, ajpQx, akmQx, akpQx;
  int16_t bbQx, bjQx, bkQx, bjmQx, bjpQx, bkmQx, bkpQx;

  int16_t ReDATAQx[240], ImDATAQx[240];

  // Fixed constants for the factor-3 and factor-5 stages, read from the
  // cosine table: entry 20 is cos(30 deg) = sin(60 deg), entry 48 is
  // cos(72 deg), entry 12 is cos(18 deg) = sin(72 deg).
  sss60Q14 = kCosTabFfftQ14[20];
  ccc72Q14 = kCosTabFfftQ14[48];
  sss72Q14 = kCosTabFfftQ14[12];

  if (iSign < 0) {
    sss72Q14 = -sss72Q14;
    sss60Q14 = -sss60Q14;
  }
  /* Complexity is: 10 cycles */

  /* compute fourier transform */

  // transform for factor of 4
  // Each iteration combines the four samples kk, kk+60, kk+120, kk+180 and
  // then multiplies three of the four outputs by twiddle factors.
  for (kk=0; kk<60; kk++) {
    k0 = kk;
    k1 = k0 + 60;
    k2 = k1 + 60;
    k3 = k2 + 60;

    akpQx = RexQx[k0] + RexQx[k2];
    akmQx = RexQx[k0] - RexQx[k2];
    ajpQx = RexQx[k1] + RexQx[k3];
    ajmQx = RexQx[k1] - RexQx[k3];
    bkpQx = ImxQx[k0] + ImxQx[k2];
    bkmQx = ImxQx[k0] - ImxQx[k2];
    bjpQx = ImxQx[k1] + ImxQx[k3];
    bjmQx = ImxQx[k1] - ImxQx[k3];

    // Output 0 needs no twiddle and is written straight back.
    RexQx[k0] = akpQx + ajpQx;
    ImxQx[k0] = bkpQx + bjpQx;
    ajpQx = akpQx - ajpQx;
    bjpQx = bkpQx - bjpQx;
    // The sign of the cross terms depends on the transform direction.
    if (iSign < 0) {
      akpQx = akmQx + bjmQx;
      bkpQx = bkmQx - ajmQx;
      akmQx -= bjmQx;
      bkmQx += ajmQx;
    } else {
      akpQx = akmQx - bjmQx;
      bkpQx = bkmQx + ajmQx;
      akmQx += bjmQx;
      bkmQx -= ajmQx;
    }

    // Twiddle factors for outputs 1..3: table indices kk, 2*kk, 3*kk for the
    // cosine role and the same indices + 60 (a quarter period later) for the
    // sine role. Largest index used is 3*59 + 60 = 237.
    ccc1Q14 = kCosTabFfftQ14[kk];
    ccc2Q14 = kCosTabFfftQ14[2 * kk];
    ccc3Q14 = kCosTabFfftQ14[3 * kk];
    sss1Q14 = kCosTabFfftQ14[kk + 60];
    sss2Q14 = kCosTabFfftQ14[2 * kk + 60];
    sss3Q14 = kCosTabFfftQ14[3 * kk + 60];
    if (iSign==1) {
      sss1Q14 = -sss1Q14;
      sss2Q14 = -sss2Q14;
      sss3Q14 = -sss3Q14;
    }

    //Do several multiplications like Q14*Q16>>14 = Q16
    // RexQ16[k1] = akpQ16 * ccc1Q14 - bkpQ16 * sss1Q14;
    // RexQ16[k2] = ajpQ16 * ccc2Q14 - bjpQ16 * sss2Q14;
    // RexQ16[k3] = akmQ16 * ccc3Q14 - bkmQ16 * sss3Q14;
    // ImxQ16[k1] = akpQ16 * sss1Q14 + bkpQ16 * ccc1Q14;
    // ImxQ16[k2] = ajpQ16 * sss2Q14 + bjpQ16 * ccc2Q14;
    // ImxQ16[k3] = akmQ16 * sss3Q14 + bkmQ16 * ccc3Q14;

    RexQx[k1] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc1Q14, akpQx, 14) -
        (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss1Q14, bkpQx, 14); // 6 non-mul + 2 mul cycles, i.e. 8 cycles (6+2*7=20 cycles if 16x32mul)
    RexQx[k2] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, ajpQx, 14) -
        (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, bjpQx, 14);
    RexQx[k3] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc3Q14, akmQx, 14) -
        (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss3Q14, bkmQx, 14);
    ImxQx[k1] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss1Q14, akpQx, 14) +
        (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc1Q14, bkpQx, 14);
    ImxQx[k2] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, ajpQx, 14) +
        (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, bjpQx, 14);
    ImxQx[k3] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss3Q14, akmQx, 14) +
        (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc3Q14, bkmQx, 14);
    //This mul segment needs 6*8 = 48 cycles for 16x16 muls, but 6*20 = 120 cycles for 16x32 muls


  }
  /* Complexity is: 51+48 = 99 cycles for 16x16 muls, but 51+120 = 171 cycles for 16x32 muls*/

  // transform for factor of 3
  // Works on each of the four 60-sample blocks in turn (outer loop); inside
  // a block it combines the triples (kk, kk+20, kk+40) for 20 consecutive kk.
  kk=0;
  k1=20;
  k2=40;

  for (hh=0; hh<4; hh++) {
    for (ii=0; ii<20; ii++) {
      akQx = RexQx[kk];
      bkQx = ImxQx[kk];
      ajQx = RexQx[k1] + RexQx[k2];
      bjQx = ImxQx[k1] + ImxQx[k2];
      RexQx[kk] = akQx + ajQx;
      ImxQx[kk] = bkQx + bjQx;
      // Half of the pair sum is subtracted from the first sample using a
      // shift instead of a multiplication.
      tmp116 = ajQx >> 1;
      tmp216 = bjQx >> 1;
      akQx = akQx - tmp116;
      bkQx = bkQx - tmp216;
      tmp116 = RexQx[k1] - RexQx[k2];
      tmp216 = ImxQx[k1] - ImxQx[k2];

      ajQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss60Q14, tmp116, 14); // Q14*Qx>>14 = Qx
      bjQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss60Q14, tmp216, 14); // Q14*Qx>>14 = Qx
      RexQx[k1] = akQx - bjQx;
      RexQx[k2] = akQx + bjQx;
      ImxQx[k1] = bkQx + ajQx;
      ImxQx[k2] = bkQx - ajQx;

      kk++;
      k1++;
      k2++;
    }
    /* Complexity : (31+6)*20 = 740 cycles for 16x16 muls, but (31+18)*20 = 980 cycles for 16x32 muls*/
    // Skip the remaining 40 samples of this block to reach the next block.
    kk=kk+40;
    k1=k1+40;
    k2=k2+40;
  }
  /* Complexity : 4*(740+3) = 2972 cycles for 16x16 muls, but 4*(980+3) = 3932 cycles for 16x32 muls*/

  /* multiply by rotation factor for odd factor 3 or 5 (not for 4)
     Same code (duplicated) for both ii=2 and ii=3 */
  // Index walk: for each offset j = gg + 1 (1..19) inside a 60-sample block,
  // sample j+20 is rotated using table index 4*j (hh == 0) and sample j+40
  // using table index 8*j (hh == 1); the innermost loop repeats this for all
  // four blocks by stepping kk in units of 60.
  kk = 1;
  ee = 0;
  ff = 0;

  for (gg=0; gg<19; gg++) {
    kk += 20;
    ff = ff+4;
    for (hh=0; hh<2; hh++) {
      ee = ff + hh * ff;
      dd = ee + 60;
      ccc2Q14 = kCosTabFfftQ14[ee];
      sss2Q14 = kCosTabFfftQ14[dd];
      if (iSign==1) {
        sss2Q14 = -sss2Q14;
      }
      for (ii=0; ii<4; ii++) {
        akQx = RexQx[kk];
        bkQx = ImxQx[kk];
        RexQx[kk] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, akQx, 14) - // Q14*Qx>>14 = Qx
            (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, bkQx, 14);
        ImxQx[kk] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, akQx, 14) + // Q14*Qx>>14 = Qx
            (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, bkQx, 14);


        kk += 60;
      }
      // kk overshot by 4*60 = 240; stepping back 220 lands 20 samples after
      // the position this hh pass started from.
      kk = kk - 220;
    }
    // Complexity: 2*(13+5+4*13+2) = 144 for 16x16 muls, but 2*(13+5+4*33+2) = 304 cycles for 16x32 muls
    // Return to the next offset j+1 at the start of the first block.
    kk = kk - 59;
  }
  // Complexity: 19*144 = 2736 for 16x16 muls, but 19*304 = 5776 cycles for 16x32 muls

  // transform for factor of 5
  // Table entry 96 is cos(144 deg); entry 84 is cos(126 deg), i.e. -sin(144 deg).
  kk = 0;
  ccc2Q14 = kCosTabFfftQ14[96];
  sss2Q14 = kCosTabFfftQ14[84];
  if (iSign==1) {
    sss2Q14 = -sss2Q14;
  }

  // Outer loop: start offsets 0..3. Inner loop: twelve groups of five
  // samples spaced 4 apart (kk, kk+4, ..., kk+16), advancing kk by 20 per
  // group.
  for (hh=0; hh<4; hh++) {
    for (ii=0; ii<12; ii++) {
      k1 = kk + 4;
      k2 = k1 + 4;
      k3 = k2 + 4;
      k4 = k3 + 4;

      akpQx = RexQx[k1] + RexQx[k4];
      akmQx = RexQx[k1] - RexQx[k4];
      bkpQx = ImxQx[k1] + ImxQx[k4];
      bkmQx = ImxQx[k1] - ImxQx[k4];
      ajpQx = RexQx[k2] + RexQx[k3];
      ajmQx = RexQx[k2] - RexQx[k3];
      bjpQx = ImxQx[k2] + ImxQx[k3];
      bjmQx = ImxQx[k2] - ImxQx[k3];
      aaQx = RexQx[kk];
      bbQx = ImxQx[kk];
      RexQx[kk] = aaQx + akpQx + ajpQx;
      ImxQx[kk] = bbQx + bkpQx + bjpQx;

      // Outputs 1 and 4 of the group.
      akQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc72Q14, akpQx, 14) +
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, ajpQx, 14)  + aaQx;
      bkQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc72Q14, bkpQx, 14) +
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, bjpQx, 14)  + bbQx;
      ajQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss72Q14, akmQx, 14) +
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, ajmQx, 14);
      bjQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss72Q14, bkmQx, 14) +
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, bjmQx, 14);
      // 32+4*8=64 or 32+4*20=112

      RexQx[k1] = akQx - bjQx;
      RexQx[k4] = akQx + bjQx;
      ImxQx[k1] = bkQx + ajQx;
      ImxQx[k4] = bkQx - ajQx;

      // Outputs 2 and 3 of the group: same pattern with the two constant
      // pairs swapped and a minus sign in the sine-role terms.
      akQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, akpQx, 14)  +
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc72Q14, ajpQx, 14) + aaQx;
      bkQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, bkpQx, 14)  +
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc72Q14, bjpQx, 14) + bbQx;
      ajQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, akmQx, 14) -
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss72Q14, ajmQx, 14);
      bjQx = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, bkmQx, 14) -
          (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss72Q14, bjmQx, 14);
      // 8+4*8=40 or 8+4*20=88

      RexQx[k2] = akQx - bjQx;
      RexQx[k3] = akQx + bjQx;
      ImxQx[k2] = bkQx + ajQx;
      ImxQx[k3] = bkQx - ajQx;

      kk = k4 + 4;
    }
    // Complexity: 12*(64+40+10) = 1368 for 16x16 muls, but 12*(112+88+10) = 2520 cycles for 16x32 muls
    // After twelve groups kk has advanced by 240; stepping back 239 selects
    // the next start offset.
    kk -= 239;
  }
  // Complexity: 4*1368 = 5472 for 16x16 muls, but 4*2520 = 10080 cycles for 16x32 muls

  /* multiply by rotation factor for odd factor 3 or 5 (not for 4)
     Same code (duplicated) for both ii=2 and ii=3 */
  // Index walk: for each offset j = gg + 1 (1..3) and each q = hh + 1 (1..4),
  // the samples j + 4*q + 20*m (m = 0..11) are rotated using table index
  // 12*j*q (ff) for the cosine role and that index + 60 (ee) for the sine
  // role.
  kk = 1;
  ee=0;

  for (gg=0; gg<3; gg++) {
    kk += 4;
    dd = 12 + 12 * gg;
    ff = 0;
    for (hh=0; hh<4; hh++) {
      ff = ff+dd;
      ee = ff+60;
      for (ii=0; ii<12; ii++) {
        akQx = RexQx[kk];
        bkQx = ImxQx[kk];

        // ff and ee do not change inside this loop, so the same pair of
        // factors is fetched on every iteration.
        ccc2Q14 = kCosTabFfftQ14[ff];
        sss2Q14 = kCosTabFfftQ14[ee];

        if (iSign==1) {
          sss2Q14 = -sss2Q14;
        }

        RexQx[kk] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, akQx, 14) -
            (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, bkQx, 14);
        ImxQx[kk] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(sss2Q14, akQx, 14) +
            (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(ccc2Q14, bkQx, 14);

        kk += 20;
      }
      // kk overshot by 12*20 = 240; stepping back 236 lands 4 samples after
      // the position this hh pass started from.
      kk = kk - 236;
      // Complexity: 12*(12+12) = 288 for 16x16 muls, but 12*(12+32) = 528 cycles for 16x32 muls
    }
    // Return to the next offset j+1 at the start of the first 20-sample block.
    kk = kk - 19;
    // Complexity: 4*288+6 for 16x16 muls, but 4*528+6 cycles for 16x32 muls
  }
  // Complexity: 3*4*288+6 = 3462 for 16x16 muls, but 3*4*528+6 = 6342 cycles for 16x32 muls


  // last transform for factor of 4
  // Combines each run of four consecutive samples (kk .. kk+3); no twiddle
  // multiplications are applied in this stage.
  for (kk=0; kk<240; kk=kk+4) {
    k1 = kk + 1;
    k2 = k1 + 1;
    k3 = k2 + 1;

    akpQx = RexQx[kk] + RexQx[k2];
    akmQx = RexQx[kk] - RexQx[k2];
    ajpQx = RexQx[k1] + RexQx[k3];
    ajmQx = RexQx[k1] - RexQx[k3];
    bkpQx = ImxQx[kk] + ImxQx[k2];
    bkmQx = ImxQx[kk] - ImxQx[k2];
    bjpQx = ImxQx[k1] + ImxQx[k3];
    bjmQx = ImxQx[k1] - ImxQx[k3];
    RexQx[kk] = akpQx + ajpQx;
    ImxQx[kk] = bkpQx + bjpQx;
    ajpQx = akpQx - ajpQx;
    bjpQx = bkpQx - bjpQx;
    if (iSign < 0) {
      akpQx = akmQx + bjmQx;
      bkpQx = bkmQx - ajmQx;
      akmQx -= bjmQx;
      bkmQx += ajmQx;
    } else {
      akpQx = akmQx - bjmQx;
      bkpQx = bkmQx + ajmQx;
      akmQx += bjmQx;
      bkmQx -= ajmQx;
    }
    RexQx[k1] = akpQx;
    RexQx[k2] = ajpQx;
    RexQx[k3] = akmQx;
    ImxQx[k1] = bkpQx;
    ImxQx[k2] = bjpQx;
    ImxQx[k3] = bkmQx;
  }
  // Complexity: 60*45 = 2700 for 16x16 muls, but 60*45 = 2700 cycles for 16x32 muls

  /* permute the results to normal order */
  // First snapshot the in-place result into the scratch arrays ...
  for (ii=0; ii<240; ii++) {
    ReDATAQx[ii]=RexQx[ii];
    ImDATAQx[ii]=ImxQx[ii];
  }
  // Complexity: 240*2=480 cycles

  // ... then write it back through the permutation table.
  for (ii=0; ii<240; ii++) {
    RexQx[ii]=ReDATAQx[kSortTabFft[ii]];
    ImxQx[ii]=ImDATAQx[kSortTabFft[ii]];
  }
  // Complexity: 240*2*2=960 cycles

  // Total complexity:
  //            16x16 16x32
  // Complexity:   10    10
  // Complexity:   99   171
  // Complexity: 2972  3932
  // Complexity: 2736  5776
  // Complexity: 5472 10080
  // Complexity: 3462  6342
  // Complexity: 2700  2700
  // Complexity:  480   480
  // Complexity:  960   960
  // =======================
  //            18891 30451
  //
  // If this FFT is called 2 times each frame, i.e. 67 times per second, it will correspond to
  // a C54 complexity of 67*18891/1000000 = 1.27 MIPS with 16x16-muls, and 67*30451/1000000 =
  // = 2.04 MIPS with 16x32-muls. Note that this routine sometimes is called 6 times during the
  // encoding of a frame, i.e. the max complexity would be 7/2*1.27 = 4.4 MIPS for the 16x16 mul case.


  return 0;
}