1 /* Copyright (c) 2011 Xiph.Org Foundation 2 Written by Jean-Marc Valin */ 3 /* 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 - Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 11 - Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include "kiss_fft.h" 33 #include "celt.h" 34 #include "modes.h" 35 #include "arch.h" 36 #include "quant_bands.h" 37 #include <stdio.h> 38 #include "analysis.h" 39 #include "mlp.h" 40 #include "stack_alloc.h" 41 42 extern const MLP net; 43 44 #ifndef M_PI 45 #define M_PI 3.141592653 46 #endif 47 48 static const float dct_table[128] = { 49 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 50 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 51 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f, 52 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f, 53 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f, 54 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f, 55 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f, 56 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f, 57 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f, 58 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f, 59 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f, 60 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f, 61 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f, 62 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f, 63 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f, 64 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f, 65 }; 66 67 static const float analysis_window[240] = { 68 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f, 69 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f, 70 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f, 71 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f, 72 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f, 73 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f, 74 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f, 75 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f, 76 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f, 77 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f, 78 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f, 79 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f, 80 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f, 81 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f, 82 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f, 83 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f, 84 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f, 85 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f, 86 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f, 87 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f, 88 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f, 89 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f, 90 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f, 91 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f, 92 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f, 93 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f, 94 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f, 95 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f, 96 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f, 97 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f, 98 }; 99 100 static const int tbands[NB_TBANDS+1] = { 101 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120 102 }; 103 104 static const int extra_bands[NB_TOT_BANDS+1] = { 105 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200 106 }; 107 108 /*static const float tweight[NB_TBANDS+1] = { 109 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 110 };*/ 111 112 #define NB_TONAL_SKIP_BANDS 9 113 114 #define cA 0.43157974f 115 #define cB 0.67848403f 116 #define cC 0.08595542f 117 #define cE ((float)M_PI/2) 118 static inline float fast_atan2f(float y, float x) { 119 float x2, y2; 120 /* Should avoid underflow on the values we'll get */ 121 if (ABS16(x)+ABS16(y)<1e-9f) 122 { 123 x*=1e12f; 124 y*=1e12f; 125 } 126 x2 = x*x; 127 y2 = y*y; 128 if(x2<y2){ 129 float den = (y2 + cB*x2) * (y2 + cC*x2); 130 if (den!=0) 131 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE); 132 else 133 return (y<0 ? -cE : cE); 134 }else{ 135 float den = (x2 + cB*y2) * (x2 + cC*y2); 136 if (den!=0) 137 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); 138 else 139 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE); 140 } 141 } 142 143 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) 144 { 145 int pos; 146 int curr_lookahead; 147 float psum; 148 int i; 149 150 pos = tonal->read_pos; 151 curr_lookahead = tonal->write_pos-tonal->read_pos; 152 if (curr_lookahead<0) 153 curr_lookahead += DETECT_SIZE; 154 155 if (len > 480 && pos != tonal->write_pos) 156 { 157 pos++; 158 if (pos==DETECT_SIZE) 159 pos=0; 160 } 161 if (pos == tonal->write_pos) 162 pos--; 163 if (pos<0) 164 pos = DETECT_SIZE-1; 165 OPUS_COPY(info_out, &tonal->info[pos], 1); 166 tonal->read_subframe += len/120; 167 while (tonal->read_subframe>=4) 168 { 169 tonal->read_subframe -= 4; 170 tonal->read_pos++; 171 } 172 if (tonal->read_pos>=DETECT_SIZE) 173 tonal->read_pos-=DETECT_SIZE; 174 175 /* Compensate for the delay in the features themselves. 176 FIXME: Need a better estimate the 10 I just made up */ 177 curr_lookahead = IMAX(curr_lookahead-10, 0); 178 179 psum=0; 180 /* Summing the probability of transition patterns that involve music at 181 time (DETECT_SIZE-curr_lookahead-1) */ 182 for (i=0;i<DETECT_SIZE-curr_lookahead;i++) 183 psum += tonal->pmusic[i]; 184 for (;i<DETECT_SIZE;i++) 185 psum += tonal->pspeech[i]; 186 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; 187 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ 188 189 info_out->music_prob = psum; 190 } 191 192 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix) 193 { 194 int i, b; 195 const kiss_fft_state *kfft; 196 VARDECL(kiss_fft_cpx, in); 197 VARDECL(kiss_fft_cpx, out); 198 int N = 480, N2=240; 199 float * OPUS_RESTRICT A = tonal->angle; 200 float * OPUS_RESTRICT dA = tonal->d_angle; 201 float * OPUS_RESTRICT d2A = tonal->d2_angle; 202 VARDECL(float, tonality); 203 VARDECL(float, noisiness); 204 float band_tonality[NB_TBANDS]; 205 float logE[NB_TBANDS]; 206 float BFCC[8]; 207 float features[25]; 208 float frame_tonality; 209 float max_frame_tonality; 210 /*float tw_sum=0;*/ 211 float frame_noisiness; 212 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI); 213 float slope=0; 214 float frame_stationarity; 215 float relativeE; 216 float frame_probs[2]; 217 float alpha, alphaE, alphaE2; 218 float frame_loudness; 219 float bandwidth_mask; 220 int bandwidth=0; 221 float maxE = 0; 222 float noise_floor; 223 int remaining; 224 AnalysisInfo *info; 225 SAVE_STACK; 226 227 tonal->last_transition++; 228 alpha = 1.f/IMIN(20, 1+tonal->count); 229 alphaE = 1.f/IMIN(50, 1+tonal->count); 230 alphaE2 = 1.f/IMIN(1000, 1+tonal->count); 231 232 if (tonal->count<4) 233 tonal->music_prob = .5; 234 kfft = celt_mode->mdct.kfft[0]; 235 if (tonal->count==0) 236 tonal->mem_fill = 240; 237 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C); 238 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) 239 { 240 tonal->mem_fill += len; 241 /* Don't have enough to update the analysis */ 242 RESTORE_STACK; 243 return; 244 } 245 info = &tonal->info[tonal->write_pos++]; 246 if (tonal->write_pos>=DETECT_SIZE) 247 tonal->write_pos-=DETECT_SIZE; 248 249 ALLOC(in, 480, kiss_fft_cpx); 250 ALLOC(out, 480, kiss_fft_cpx); 251 ALLOC(tonality, 240, float); 252 ALLOC(noisiness, 240, float); 253 for (i=0;i<N2;i++) 254 { 255 float w = analysis_window[i]; 256 in[i].r = w*tonal->inmem[i]; 257 in[i].i = w*tonal->inmem[N2+i]; 258 in[N-i-1].r = w*tonal->inmem[N-i-1]; 259 in[N-i-1].i = w*tonal->inmem[N+N2-i-1]; 260 } 261 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); 262 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); 263 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C); 264 tonal->mem_fill = 240 + remaining; 265 opus_fft(kfft, in, out); 266 267 for (i=1;i<N2;i++) 268 { 269 float X1r, X2r, X1i, X2i; 270 float angle, d_angle, d2_angle; 271 float angle2, d_angle2, d2_angle2; 272 float mod1, mod2, avg_mod; 273 X1r = out[i].r+out[N-i].r; 274 X1i = out[i].i-out[N-i].i; 275 X2r = out[i].i+out[N-i].i; 276 X2i = out[N-i].r-out[i].r; 277 278 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); 279 d_angle = angle - A[i]; 280 d2_angle = d_angle - dA[i]; 281 282 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); 283 d_angle2 = angle2 - angle; 284 d2_angle2 = d_angle2 - d_angle; 285 286 mod1 = d2_angle - (float)floor(.5+d2_angle); 287 noisiness[i] = ABS16(mod1); 288 mod1 *= mod1; 289 mod1 *= mod1; 290 291 mod2 = d2_angle2 - (float)floor(.5+d2_angle2); 292 noisiness[i] += ABS16(mod2); 293 mod2 *= mod2; 294 mod2 *= mod2; 295 296 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2); 297 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; 298 299 A[i] = angle2; 300 dA[i] = d_angle2; 301 d2A[i] = mod2; 302 } 303 304 frame_tonality = 0; 305 max_frame_tonality = 0; 306 /*tw_sum = 0;*/ 307 info->activity = 0; 308 frame_noisiness = 0; 309 frame_stationarity = 0; 310 if (!tonal->count) 311 { 312 for (b=0;b<NB_TBANDS;b++) 313 { 314 tonal->lowE[b] = 1e10; 315 tonal->highE[b] = -1e10; 316 } 317 } 318 relativeE = 0; 319 frame_loudness = 0; 320 bandwidth_mask = 0; 321 for (b=0;b<NB_TBANDS;b++) 322 { 323 float E=0, tE=0, nE=0; 324 float L1, L2; 325 float stationarity; 326 for (i=tbands[b];i<tbands[b+1];i++) 327 { 328 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 329 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 330 #ifdef FIXED_POINT 331 /* FIXME: It's probably best to change the BFCC filter initial state instead */ 332 binE *= 5.55e-17f; 333 #endif 334 E += binE; 335 tE += binE*tonality[i]; 336 nE += binE*2.f*(.5f-noisiness[i]); 337 } 338 tonal->E[tonal->E_count][b] = E; 339 frame_noisiness += nE/(1e-15f+E); 340 341 frame_loudness += sqrt(E+1e-10f); 342 logE[b] = (float)log(E+1e-10f); 343 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); 344 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); 345 if (tonal->highE[b] < tonal->lowE[b]+1.f) 346 { 347 tonal->highE[b]+=.5f; 348 tonal->lowE[b]-=.5f; 349 } 350 relativeE += (logE[b]-tonal->lowE[b])/(1e-15+tonal->highE[b]-tonal->lowE[b]); 351 352 L1=L2=0; 353 for (i=0;i<NB_FRAMES;i++) 354 { 355 L1 += sqrt(tonal->E[i][b]); 356 L2 += tonal->E[i][b]; 357 } 358 359 stationarity = MIN16(0.99f,L1/sqrt(1e-15+NB_FRAMES*L2)); 360 stationarity *= stationarity; 361 stationarity *= stationarity; 362 frame_stationarity += stationarity; 363 /*band_tonality[b] = tE/(1e-15+E)*/; 364 band_tonality[b] = MAX16(tE/(1e-15+E), stationarity*tonal->prev_band_tonality[b]); 365 #if 0 366 if (b>=NB_TONAL_SKIP_BANDS) 367 { 368 frame_tonality += tweight[b]*band_tonality[b]; 369 tw_sum += tweight[b]; 370 } 371 #else 372 frame_tonality += band_tonality[b]; 373 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) 374 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; 375 #endif 376 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality); 377 slope += band_tonality[b]*(b-8); 378 /*printf("%f %f ", band_tonality[b], stationarity);*/ 379 tonal->prev_band_tonality[b] = band_tonality[b]; 380 } 381 382 bandwidth_mask = 0; 383 bandwidth = 0; 384 maxE = 0; 385 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); 386 #ifdef FIXED_POINT 387 noise_floor *= 1<<(15+SIG_SHIFT); 388 #endif 389 noise_floor *= noise_floor; 390 for (b=0;b<NB_TOT_BANDS;b++) 391 { 392 float E=0; 393 int band_start, band_end; 394 /* Keep a margin of 300 Hz for aliasing */ 395 band_start = extra_bands[b]; 396 band_end = extra_bands[b+1]; 397 for (i=band_start;i<band_end;i++) 398 { 399 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 400 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 401 E += binE; 402 } 403 maxE = MAX32(maxE, E); 404 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); 405 E = MAX32(E, tonal->meanE[b]); 406 /* Use a simple follower with 13 dB/Bark slope for spreading function */ 407 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); 408 /* Consider the band "active" only if all these conditions are met: 409 1) less than 10 dB below the simple follower 410 2) less than 90 dB below the peak band (maximal masking possible considering 411 both the ATH and the loudness-dependent slope of the spreading function) 412 3) above the PCM quantization noise floor 413 */ 414 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start)) 415 bandwidth = b; 416 } 417 if (tonal->count<=2) 418 bandwidth = 20; 419 frame_loudness = 20*(float)log10(frame_loudness); 420 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); 421 tonal->lowECount *= (1-alphaE); 422 if (frame_loudness < tonal->Etracker-30) 423 tonal->lowECount += alphaE; 424 425 for (i=0;i<8;i++) 426 { 427 float sum=0; 428 for (b=0;b<16;b++) 429 sum += dct_table[i*16+b]*logE[b]; 430 BFCC[i] = sum; 431 } 432 433 frame_stationarity /= NB_TBANDS; 434 relativeE /= NB_TBANDS; 435 if (tonal->count<10) 436 relativeE = .5; 437 frame_noisiness /= NB_TBANDS; 438 #if 1 439 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; 440 #else 441 info->activity = .5*(1+frame_noisiness-frame_stationarity); 442 #endif 443 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); 444 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); 445 tonal->prev_tonality = frame_tonality; 446 447 slope /= 8*8; 448 info->tonality_slope = slope; 449 450 tonal->E_count = (tonal->E_count+1)%NB_FRAMES; 451 tonal->count++; 452 info->tonality = frame_tonality; 453 454 for (i=0;i<4;i++) 455 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; 456 457 for (i=0;i<4;i++) 458 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; 459 460 for (i=0;i<4;i++) 461 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]); 462 for (i=0;i<3;i++) 463 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; 464 465 if (tonal->count > 5) 466 { 467 for (i=0;i<9;i++) 468 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i]; 469 } 470 471 for (i=0;i<8;i++) 472 { 473 tonal->mem[i+24] = tonal->mem[i+16]; 474 tonal->mem[i+16] = tonal->mem[i+8]; 475 tonal->mem[i+8] = tonal->mem[i]; 476 tonal->mem[i] = BFCC[i]; 477 } 478 for (i=0;i<9;i++) 479 features[11+i] = sqrt(tonal->std[i]); 480 features[20] = info->tonality; 481 features[21] = info->activity; 482 features[22] = frame_stationarity; 483 features[23] = info->tonality_slope; 484 features[24] = tonal->lowECount; 485 486 #ifndef DISABLE_FLOAT_API 487 mlp_process(&net, features, frame_probs); 488 frame_probs[0] = .5f*(frame_probs[0]+1); 489 /* Curve fitting between the MLP probability and the actual probability */ 490 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10); 491 /* Probability of active audio (as opposed to silence) */ 492 frame_probs[1] = .5f*frame_probs[1]+.5f; 493 /* Consider that silence has a 50-50 probability. */ 494 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f; 495 496 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ 497 { 498 /* Probability of state transition */ 499 float tau; 500 /* Represents independence of the MLP probabilities, where 501 beta=1 means fully independent. */ 502 float beta; 503 /* Denormalized probability of speech (p0) and music (p1) after update */ 504 float p0, p1; 505 /* Probabilities for "all speech" and "all music" */ 506 float s0, m0; 507 /* Probability sum for renormalisation */ 508 float psum; 509 /* Instantaneous probability of speech and music, with beta pre-applied. */ 510 float speech0; 511 float music0; 512 513 /* One transition every 3 minutes of active audio */ 514 tau = .00005f*frame_probs[1]; 515 beta = .05f; 516 if (1) { 517 /* Adapt beta based on how "unexpected" the new prob is */ 518 float p, q; 519 p = MAX16(.05f,MIN16(.95f,frame_probs[0])); 520 q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); 521 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); 522 } 523 /* p0 and p1 are the probabilities of speech and music at this frame 524 using only information from previous frame and applying the 525 state transition model */ 526 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; 527 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; 528 /* We apply the current probability with exponent beta to work around 529 the fact that the probability estimates aren't independent. */ 530 p0 *= (float)pow(1-frame_probs[0], beta); 531 p1 *= (float)pow(frame_probs[0], beta); 532 /* Normalise the probabilities to get the Marokv probability of music. */ 533 tonal->music_prob = p1/(p0+p1); 534 info->music_prob = tonal->music_prob; 535 536 /* This chunk of code deals with delayed decision. */ 537 psum=1e-20f; 538 /* Instantaneous probability of speech and music, with beta pre-applied. */ 539 speech0 = (float)pow(1-frame_probs[0], beta); 540 music0 = (float)pow(frame_probs[0], beta); 541 if (tonal->count==1) 542 { 543 tonal->pspeech[0]=.5; 544 tonal->pmusic [0]=.5; 545 } 546 /* Updated probability of having only speech (s0) or only music (m0), 547 before considering the new observation. */ 548 s0 = tonal->pspeech[0] + tonal->pspeech[1]; 549 m0 = tonal->pmusic [0] + tonal->pmusic [1]; 550 /* Updates s0 and m0 with instantaneous probability. */ 551 tonal->pspeech[0] = s0*(1-tau)*speech0; 552 tonal->pmusic [0] = m0*(1-tau)*music0; 553 /* Propagate the transition probabilities */ 554 for (i=1;i<DETECT_SIZE-1;i++) 555 { 556 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0; 557 tonal->pmusic [i] = tonal->pmusic [i+1]*music0; 558 } 559 /* Probability that the latest frame is speech, when all the previous ones were music. */ 560 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0; 561 /* Probability that the latest frame is music, when all the previous ones were speech. */ 562 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0; 563 564 /* Renormalise probabilities to 1 */ 565 for (i=0;i<DETECT_SIZE;i++) 566 psum += tonal->pspeech[i] + tonal->pmusic[i]; 567 psum = 1.f/psum; 568 for (i=0;i<DETECT_SIZE;i++) 569 { 570 tonal->pspeech[i] *= psum; 571 tonal->pmusic [i] *= psum; 572 } 573 psum = tonal->pmusic[0]; 574 for (i=1;i<DETECT_SIZE;i++) 575 psum += tonal->pspeech[i]; 576 577 /* Estimate our confidence in the speech/music decisions */ 578 if (frame_probs[1]>.75) 579 { 580 if (tonal->music_prob>.9) 581 { 582 float adapt; 583 adapt = 1.f/(++tonal->music_confidence_count); 584 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500); 585 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence); 586 } 587 if (tonal->music_prob<.1) 588 { 589 float adapt; 590 adapt = 1.f/(++tonal->speech_confidence_count); 591 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500); 592 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence); 593 } 594 } else { 595 if (tonal->music_confidence_count==0) 596 tonal->music_confidence = .9f; 597 if (tonal->speech_confidence_count==0) 598 tonal->speech_confidence = .1f; 599 } 600 psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum)); 601 } 602 if (tonal->last_music != (tonal->music_prob>.5f)) 603 tonal->last_transition=0; 604 tonal->last_music = tonal->music_prob>.5f; 605 #else 606 info->music_prob = 0; 607 #endif 608 /*for (i=0;i<25;i++) 609 printf("%f ", features[i]); 610 printf("\n");*/ 611 612 info->bandwidth = bandwidth; 613 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ 614 info->noisiness = frame_noisiness; 615 info->valid = 1; 616 if (info_out!=NULL) 617 OPUS_COPY(info_out, info, 1); 618 RESTORE_STACK; 619 } 620 621 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm, 622 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs, 623 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info) 624 { 625 int offset; 626 int pcm_len; 627 628 if (analysis_pcm != NULL) 629 { 630 /* Avoid overflow/wrap-around of the analysis buffer */ 631 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); 632 633 pcm_len = analysis_frame_size - analysis->analysis_offset; 634 offset = analysis->analysis_offset; 635 do { 636 tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix); 637 offset += 480; 638 pcm_len -= 480; 639 } while (pcm_len>0); 640 analysis->analysis_offset = analysis_frame_size; 641 642 analysis->analysis_offset -= frame_size; 643 } 644 645 analysis_info->valid = 0; 646 tonality_get_info(analysis, analysis_info, frame_size); 647 } 648