/*Copyright (c) 2003-2004, Mark Borgerding
  Lots of modifications by Jean-Marc Valin
  Copyright (c) 2005-2007, Xiph.Org Foundation
  Copyright (c) 2008, Xiph.Org Foundation, CSIRO

  All rights reserved.

  Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
       this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.*/

/* This code is originally from Mark Borgerding's KISS-FFT but has been
   heavily modified to better suit Opus */

#ifndef SKIP_CONFIG_H
#  ifdef HAVE_CONFIG_H
#    include "config.h"
#  endif
#endif

#include "_kiss_fft_guts.h"
#include "arch.h"
#include "os_support.h"
#include "mathops.h"
#include "stack_alloc.h"

/* The guts header contains all the multiplication and addition macros that are defined for
   complex numbers. It also declares the kf_ internal functions.
*/

/* Radix-2 butterfly stage.
   Processes N blocks of the output array in place.  In the normal (non-custom)
   case the radix-2 stage always immediately follows a radix-4 stage, so m==4
   and the four twiddles are 1, (1-j)/sqrt(2), -j and -(1+j)/sqrt(2); they are
   applied with explicit real/imag arithmetic rather than table lookups.
   tw is 1/sqrt(2) in Q15 (see QCONST16(..., 15)).  The ADD32_ovflw/SUB32_ovflw/
   NEG32_ovflw macros perform the additions with wrap-on-overflow semantics. */
static void kf_bfly2(
                     kiss_fft_cpx * Fout,
                     int m,
                     int N
                    )
{
   kiss_fft_cpx * Fout2;
   int i;
   /* m is only read inside CUSTOM_MODES / assert code below; this keeps
      non-assert builds warning-free. */
   (void)m;
#ifdef CUSTOM_MODES
   if (m==1)
   {
      celt_assert(m==1);
      /* m==1: trivial two-point butterflies with unit twiddles. */
      for (i=0;i<N;i++)
      {
         kiss_fft_cpx t;
         Fout2 = Fout + 1;
         t = *Fout2;
         C_SUB( *Fout2 , *Fout , t );
         C_ADDTO( *Fout , t );
         Fout += 2;
      }
   } else
#endif
   {
      opus_val16 tw;
      tw = QCONST16(0.7071067812f, 15);
      /* We know that m==4 here because the radix-2 is just after a radix-4 */
      celt_assert(m==4);
      for (i=0;i<N;i++)
      {
         kiss_fft_cpx t;
         Fout2 = Fout + 4;
         /* Twiddle 1: butterfly with Fout2[0] as-is. */
         t = Fout2[0];
         C_SUB( Fout2[0] , Fout[0] , t );
         C_ADDTO( Fout[0] , t );

         /* Twiddle (1-j)/sqrt(2): rotate Fout2[1] by -45 degrees. */
         t.r = S_MUL(ADD32_ovflw(Fout2[1].r, Fout2[1].i), tw);
         t.i = S_MUL(SUB32_ovflw(Fout2[1].i, Fout2[1].r), tw);
         C_SUB( Fout2[1] , Fout[1] , t );
         C_ADDTO( Fout[1] , t );

         /* Twiddle -j: multiply Fout2[2] by -j, i.e. (r,i) -> (i,-r). */
         t.r = Fout2[2].i;
         t.i = -Fout2[2].r;
         C_SUB( Fout2[2] , Fout[2] , t );
         C_ADDTO( Fout[2] , t );

         /* Twiddle -(1+j)/sqrt(2): rotate Fout2[3] by -135 degrees. */
         t.r = S_MUL(SUB32_ovflw(Fout2[3].i, Fout2[3].r), tw);
         t.i = S_MUL(NEG32_ovflw(ADD32_ovflw(Fout2[3].i, Fout2[3].r)), tw);
         C_SUB( Fout2[3] , Fout[3] , t );
         C_ADDTO( Fout[3] , t );
         Fout += 8;
      }
   }
}

/* Radix-4 butterfly stage.
   fstride scales the step through the twiddle table st->twiddles; m is the
   sub-FFT length for this stage, N the number of sub-FFTs and mm the distance
   between consecutive sub-FFTs in Fout. */
static void kf_bfly4(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
                     const kiss_fft_state *st,
                     int m,
                     int N,
                     int mm
                    )
{
   int i;

   if (m==1)
   {
      /* Degenerate case where all the twiddles are 1. */
      for (i=0;i<N;i++)
      {
         kiss_fft_cpx scratch0, scratch1;

         C_SUB( scratch0 , *Fout, Fout[2] );
         C_ADDTO(*Fout, Fout[2]);
         C_ADD( scratch1 , Fout[1] , Fout[3] );
         C_SUB( Fout[2], *Fout, scratch1 );
         C_ADDTO( *Fout , scratch1 );
         C_SUB( scratch1 , Fout[1] , Fout[3] );

         /* Combine with +/-j*scratch1 (wrap-on-overflow adds). */
         Fout[1].r = ADD32_ovflw(scratch0.r, scratch1.i);
         Fout[1].i = SUB32_ovflw(scratch0.i, scratch1.r);
         Fout[3].r = SUB32_ovflw(scratch0.r, scratch1.i);
         Fout[3].i = ADD32_ovflw(scratch0.i, scratch1.r);
         Fout+=4;
      }
   } else {
      int j;
      kiss_fft_cpx scratch[6];
      const kiss_twiddle_cpx *tw1,*tw2,*tw3;
      const int m2=2*m;
      const int m3=3*m;
      kiss_fft_cpx * Fout_beg = Fout;
      for (i=0;i<N;i++)
      {
         Fout = Fout_beg + i*mm;
         /* tw1/tw2/tw3 step through the table at 1x/2x/3x fstride. */
         tw3 = tw2 = tw1 = st->twiddles;
         /* m is guaranteed to be a multiple of 4. */
         for (j=0;j<m;j++)
         {
            C_MUL(scratch[0],Fout[m] , *tw1 );
            C_MUL(scratch[1],Fout[m2] , *tw2 );
            C_MUL(scratch[2],Fout[m3] , *tw3 );

            C_SUB( scratch[5] , *Fout, scratch[1] );
            C_ADDTO(*Fout, scratch[1]);
            C_ADD( scratch[3] , scratch[0] , scratch[2] );
            C_SUB( scratch[4] , scratch[0] , scratch[2] );
            C_SUB( Fout[m2], *Fout, scratch[3] );
            tw1 += fstride;
            tw2 += fstride*2;
            tw3 += fstride*3;
            C_ADDTO( *Fout , scratch[3] );

            Fout[m].r = ADD32_ovflw(scratch[5].r, scratch[4].i);
            Fout[m].i = SUB32_ovflw(scratch[5].i, scratch[4].r);
            Fout[m3].r = SUB32_ovflw(scratch[5].r, scratch[4].i);
            Fout[m3].i = ADD32_ovflw(scratch[5].i, scratch[4].r);
            ++Fout;
         }
      }
   }
}


#ifndef RADIX_TWO_ONLY

/* Radix-3 butterfly stage.
   epi3 holds exp(-j*2*pi/3); in fixed point only the imaginary part
   (-28378 ~= -sqrt(3)/2 in Q15) is needed, in float it is read from the
   twiddle table at index fstride*m. */
static void kf_bfly3(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
                     const kiss_fft_state *st,
                     int m,
                     int N,
                     int mm
                    )
{
   int i;
   size_t k;
   const size_t m2 = 2*m;
   const kiss_twiddle_cpx *tw1,*tw2;
   kiss_fft_cpx scratch[5];
   kiss_twiddle_cpx epi3;

   kiss_fft_cpx * Fout_beg = Fout;
#ifdef FIXED_POINT
   /*epi3.r = -16384;*/ /* Unused */
   epi3.i = -28378;
#else
   epi3 = st->twiddles[fstride*m];
#endif
   for (i=0;i<N;i++)
   {
      Fout = Fout_beg + i*mm;
      tw1=tw2=st->twiddles;
      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
      k=m;
      do {

         C_MUL(scratch[1],Fout[m] , *tw1);
         C_MUL(scratch[2],Fout[m2] , *tw2);

         C_ADD(scratch[3],scratch[1],scratch[2]);
         C_SUB(scratch[0],scratch[1],scratch[2]);
         tw1 += fstride;
         tw2 += fstride*2;

         /* Real part of epi3 is -1/2, applied here via HALF_OF(). */
         Fout[m].r = SUB32_ovflw(Fout->r, HALF_OF(scratch[3].r));
         Fout[m].i = SUB32_ovflw(Fout->i, HALF_OF(scratch[3].i));

         C_MULBYSCALAR( scratch[0] , epi3.i );

         C_ADDTO(*Fout,scratch[3]);

         /* +/- j*scratch[0] splits the remaining two outputs. */
         Fout[m2].r = ADD32_ovflw(Fout[m].r, scratch[0].i);
         Fout[m2].i = SUB32_ovflw(Fout[m].i, scratch[0].r);

         Fout[m].r = SUB32_ovflw(Fout[m].r, scratch[0].i);
         Fout[m].i = ADD32_ovflw(Fout[m].i, scratch[0].r);

         ++Fout;
      } while(--k);
   }
}


#ifndef OVERRIDE_kf_bfly5
/* Radix-5 butterfly stage.
   ya = exp(-j*2*pi/5) and yb = exp(-j*4*pi/5); fixed-point values are Q15
   constants, float values come from the twiddle table. */
static void kf_bfly5(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
                     const kiss_fft_state *st,
                     int m,
                     int N,
                     int mm
                    )
{
   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
   int i, u;
   kiss_fft_cpx scratch[13];
   const kiss_twiddle_cpx *tw;
   kiss_twiddle_cpx ya,yb;
   kiss_fft_cpx * Fout_beg = Fout;

#ifdef FIXED_POINT
   ya.r = 10126;
   ya.i = -31164;
   yb.r = -26510;
   yb.i = -19261;
#else
   ya = st->twiddles[fstride*m];
   yb = st->twiddles[fstride*2*m];
#endif
   tw=st->twiddles;

   for (i=0;i<N;i++)
   {
      Fout = Fout_beg + i*mm;
      /* Five interleaved output strands, m apart. */
      Fout0=Fout;
      Fout1=Fout0+m;
      Fout2=Fout0+2*m;
      Fout3=Fout0+3*m;
      Fout4=Fout0+4*m;

      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
      for ( u=0; u<m; ++u ) {
         scratch[0] = *Fout0;

         C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
         C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
         C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
         C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);

         C_ADD( scratch[7],scratch[1],scratch[4]);
         C_SUB( scratch[10],scratch[1],scratch[4]);
         C_ADD( scratch[8],scratch[2],scratch[3]);
         C_SUB( scratch[9],scratch[2],scratch[3]);

         /* DC-like term: sum of all five inputs. */
         Fout0->r = ADD32_ovflw(Fout0->r, ADD32_ovflw(scratch[7].r, scratch[8].r));
         Fout0->i = ADD32_ovflw(Fout0->i, ADD32_ovflw(scratch[7].i, scratch[8].i));

         scratch[5].r = ADD32_ovflw(scratch[0].r, ADD32_ovflw(S_MUL(scratch[7].r,ya.r), S_MUL(scratch[8].r,yb.r)));
         scratch[5].i = ADD32_ovflw(scratch[0].i, ADD32_ovflw(S_MUL(scratch[7].i,ya.r), S_MUL(scratch[8].i,yb.r)));

         scratch[6].r = ADD32_ovflw(S_MUL(scratch[10].i,ya.i), S_MUL(scratch[9].i,yb.i));
         scratch[6].i = NEG32_ovflw(ADD32_ovflw(S_MUL(scratch[10].r,ya.i), S_MUL(scratch[9].r,yb.i)));

         C_SUB(*Fout1,scratch[5],scratch[6]);
         C_ADD(*Fout4,scratch[5],scratch[6]);

         scratch[11].r = ADD32_ovflw(scratch[0].r, ADD32_ovflw(S_MUL(scratch[7].r,yb.r), S_MUL(scratch[8].r,ya.r)));
         scratch[11].i = ADD32_ovflw(scratch[0].i, ADD32_ovflw(S_MUL(scratch[7].i,yb.r), S_MUL(scratch[8].i,ya.r)));
         scratch[12].r = SUB32_ovflw(S_MUL(scratch[9].i,ya.i), S_MUL(scratch[10].i,yb.i));
         scratch[12].i = SUB32_ovflw(S_MUL(scratch[10].r,yb.i), S_MUL(scratch[9].r,ya.i));

         C_ADD(*Fout2,scratch[11],scratch[12]);
         C_SUB(*Fout3,scratch[11],scratch[12]);

         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
      }
   }
}
#endif /* OVERRIDE_kf_bfly5 */


#endif


#ifdef CUSTOM_MODES

/* Recursively fills f[] with the bit-reversal (digit-reversal) permutation
   implied by the factor list: f[output position] = input index.
   Fout is the base index contributed by the stages processed so far;
   factors points at the (radix, length/radix) pairs produced by kf_factor(). */
static
void compute_bitrev_table(
         int Fout,
         opus_int16 *f,
         const size_t fstride,
         int in_stride,
         opus_int16 * factors,
         const kiss_fft_state *st
            )
{
   const int p=*factors++; /* the radix */
   const int m=*factors++; /* stage's fft length/p */

   /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
   if (m==1)
   {
      /* Leaf stage: write p consecutive output indices. */
      int j;
      for (j=0;j<p;j++)
      {
         *f = Fout+j;
         f += fstride*in_stride;
      }
   } else {
      /* Recurse on each of the p sub-transforms with a p-times larger stride. */
      int j;
      for (j=0;j<p;j++)
      {
         compute_bitrev_table( Fout , f, fstride*p, in_stride, factors,st);
         f += fstride*in_stride;
         Fout += m;
      }
   }
}

/*  facbuf is populated by p1,m1,p2,m2, ...
    where
    p[i] * m[i] = m[i-1]
    m0 = n                  */
static
int kf_factor(int n,opus_int16 * facbuf)
{
   int p=4;
   int i;
   int stages=0;
   int nbak = n;

   /*factor out powers of 4, powers of 2, then any remaining primes */
   do {
      while (n % p) {
         switch (p) {
            case 4: p = 2; break;
            case 2: p = 3; break;
            default: p += 2; break;
         }
         if (p>32000 || (opus_int32)p*(opus_int32)p > n)
            p = n;          /* no more factors, skip to end */
      }
      n /= p;
#ifdef RADIX_TWO_ONLY
      if (p!=2 && p != 4)
#else
      if (p>5)
#endif
      {
         /* Unsupported radix: signal failure to the caller. */
         return 0;
      }
      facbuf[2*stages] = p;
      /* Prefer a single radix-2 early on, turning a later 2 into a 4. */
      if (p==2 && stages > 1)
      {
         facbuf[2*stages] = 4;
         facbuf[2] = 2;
      }
      stages++;
   } while (n > 1);
   n = nbak;
   /* Reverse the order to get the radix 4 at the end, so we can use the
      fast degenerate case. It turns out that reversing the order also
      improves the noise behaviour. */
   for (i=0;i<stages/2;i++)
   {
      int tmp;
      tmp = facbuf[2*i];
      facbuf[2*i] = facbuf[2*(stages-i-1)];
      facbuf[2*(stages-i-1)] = tmp;
   }
   /* Fill in the m[i] entries: remaining length after each stage. */
   for (i=0;i<stages;i++)
   {
      n /= facbuf[2*i];
      facbuf[2*i+1] = n;
   }
   return 1;
}

/* Fills twiddles[i] with exp(-j*2*pi*i/nfft) for i in [0, nfft). */
static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
{
   int i;
#ifdef FIXED_POINT
   for (i=0;i<nfft;++i) {
      opus_val32 phase = -i;
      /* kf_cexp2 takes the phase scaled so that 2*pi maps onto 1<<17. */
      kf_cexp2(twiddles+i, DIV32(SHL32(phase,17),nfft));
   }
#else
   for (i=0;i<nfft;++i) {
      const double pi=3.14159265358979323846264338327;
      double phase = ( -2*pi /nfft ) * i;
      kf_cexp(twiddles+i, phase );
   }
#endif
}

/* Default (no-op) architecture-specific initialization; returns 0 (success). */
int opus_fft_alloc_arch_c(kiss_fft_state *st) {
   (void)st;
   return 0;
}

/*
 *
 * Allocates all necessary storage space for the fft and ifft.
 * The return value is a contiguous block of memory.  As such,
 * It can be freed with free().
 * If lenmem is NULL, the state is heap-allocated; otherwise *lenmem is
 * checked against (and updated to) the required size and mem is used.
 * When base is non-NULL its twiddle table is shared and st->shift records
 * the power-of-two ratio base->nfft/nfft; otherwise a fresh table is
 * computed and st->shift is set to -1 (owned, freed by opus_fft_free()).
 * Returns NULL on failure.
 * */
kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,
      const kiss_fft_state *base, int arch)
{
    kiss_fft_state *st=NULL;
    size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/

    if ( lenmem==NULL ) {
        st = ( kiss_fft_state*)KISS_FFT_MALLOC( memneeded );
    }else{
        if (mem != NULL && *lenmem >= memneeded)
            st = (kiss_fft_state*)mem;
        *lenmem = memneeded;
    }
    if (st) {
        opus_int16 *bitrev;
        kiss_twiddle_cpx *twiddles;

        st->nfft=nfft;
#ifdef FIXED_POINT
        st->scale_shift = celt_ilog2(st->nfft);
        /* For power-of-two sizes the scale is exactly 1 after the shift. */
        if (st->nfft == 1<<st->scale_shift)
           st->scale = Q15ONE;
        else
           st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
#else
        st->scale = 1.f/nfft;
#endif
        if (base != NULL)
        {
           st->twiddles = base->twiddles;
           st->shift = 0;
           while (st->shift < 32 && nfft<<st->shift != base->nfft)
              st->shift++;
           if (st->shift>=32)
              goto fail;
        } else {
           /* NOTE(review): unlike bitrev below, this allocation is not
              NULL-checked before compute_twiddles() uses it — confirm
              whether KISS_FFT_MALLOC can fail in this configuration. */
           st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft);
           compute_twiddles(twiddles, nfft);
           st->shift = -1;
        }
        if (!kf_factor(nfft,st->factors))
        {
           goto fail;
        }

        /* bitrev */
        st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft);
        if (st->bitrev==NULL)
            goto fail;
        compute_bitrev_table(0, bitrev, 1,1, st->factors,st);

        /* Initialize architecture specific fft parameters */
        if (opus_fft_alloc_arch(st, arch))
            goto fail;
    }
    return st;
fail:
    opus_fft_free(st, arch);
    return NULL;
}

/* Convenience wrapper: allocate a state with its own twiddle table. */
kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch)
{
   return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL, arch);
}

/* Default (no-op) architecture-specific teardown. */
void opus_fft_free_arch_c(kiss_fft_state *st) {
   (void)st;
}

/* Frees a state allocated by opus_fft_alloc*(); the twiddle table is only
   freed when owned (shift < 0, i.e. not shared from a base state). */
void opus_fft_free(const kiss_fft_state *cfg, int arch)
{
   if (cfg)
   {
      opus_fft_free_arch((kiss_fft_state *)cfg, arch);
      opus_free((opus_int16*)cfg->bitrev);
      if (cfg->shift < 0)
         opus_free((kiss_twiddle_cpx*)cfg->twiddles);
      opus_free((kiss_fft_state*)cfg);
   }
}

#endif /* CUSTOM_MODES */

/* In-place FFT on already bit-reversed data: walks the factor list backwards,
   dispatching each stage to the matching radix butterfly. */
void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
{
    int m2, m;
    int p;
    int L;
    int fstride[MAXFACTORS];
    int i;
    int shift;

    /* st->shift can be -1 */
    shift = st->shift>0 ? st->shift : 0;

    /* Accumulate the twiddle strides for each stage. */
    fstride[0] = 1;
    L=0;
    do {
       p = st->factors[2*L];
       m = st->factors[2*L+1];
       fstride[L+1] = fstride[L]*p;
       L++;
    } while(m!=1);
    m = st->factors[2*L-1];
    for (i=L-1;i>=0;i--)
    {
       if (i!=0)
          m2 = st->factors[2*i-1];
       else
          m2 = 1;
       switch (st->factors[2*i])
       {
       case 2:
          kf_bfly2(fout, m, fstride[i]);
          break;
       case 4:
          kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
 #ifndef RADIX_TWO_ONLY
       case 3:
          kf_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
       case 5:
          kf_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
 #endif
       }
       m = m2;
    }
}

/* Forward FFT: scales and bit-reverses fin into fout, then runs the
   in-place stages.  fin and fout must not alias. */
void opus_fft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
   int i;
   opus_val16 scale;
#ifdef FIXED_POINT
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
      MULT16_32_Q15() on ARM. */
   int scale_shift = st->scale_shift-1;
#endif
   scale = st->scale;

   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (i=0;i<st->nfft;i++)
   {
      kiss_fft_cpx x = fin[i];
      /* In float builds SHR32/MULT16_32_Q16 reduce to plain ops and the
         shift argument is unused. */
      fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift);
      fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift);
   }
   opus_fft_impl(st, fout);
}


/* Inverse FFT via the conjugation trick: conjugate the (bit-reversed) input,
   run the forward stages, conjugate the result.  No 1/N scaling here.
   fin and fout must not alias. */
void opus_ifft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
   int i;
   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (i=0;i<st->nfft;i++)
      fout[st->bitrev[i]] = fin[i];
   for (i=0;i<st->nfft;i++)
      fout[i].i = -fout[i].i;
   opus_fft_impl(st, fout);
   for (i=0;i<st->nfft;i++)
      fout[i].i = -fout[i].i;
}