Home | History | Annotate | Download | only in src
      1 /* Copyright (c) 2011 Xiph.Org Foundation
      2    Written by Jean-Marc Valin */
      3 /*
      4    Redistribution and use in source and binary forms, with or without
      5    modification, are permitted provided that the following conditions
      6    are met:
      7 
      8    - Redistributions of source code must retain the above copyright
      9    notice, this list of conditions and the following disclaimer.
     10 
     11    - Redistributions in binary form must reproduce the above copyright
     12    notice, this list of conditions and the following disclaimer in the
     13    documentation and/or other materials provided with the distribution.
     14 
     15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
     19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include "kiss_fft.h"
     33 #include "celt.h"
     34 #include "modes.h"
     35 #include "arch.h"
     36 #include "quant_bands.h"
     37 #include <stdio.h>
     38 #include "analysis.h"
     39 #include "mlp.h"
     40 #include "stack_alloc.h"
     41 
     42 extern const MLP net;
     43 
     44 #ifndef M_PI
     45 #define M_PI 3.141592653
     46 #endif
     47 
     48 static const float dct_table[128] = {
     49         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
     50         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
     51         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
     52        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
     53         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
     54        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
     55         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
     56         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
     57         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
     58         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
     59         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
     60        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
     61         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
     62        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
     63         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
     64         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
     65 };
     66 
     67 static const float analysis_window[240] = {
     68       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
     69       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
     70       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
     71       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
     72       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
     73       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
     74       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
     75       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
     76       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
     77       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
     78       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
     79       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
     80       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
     81       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
     82       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
     83       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
     84       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
     85       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
     86       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
     87       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
     88       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
     89       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
     90       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
     91       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
     92       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
     93       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
     94       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
     95       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
     96       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
     97       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
     98 };
     99 
    100 static const int tbands[NB_TBANDS+1] = {
    101        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
    102 };
    103 
    104 static const int extra_bands[NB_TOT_BANDS+1] = {
    105       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
    106 };
    107 
    108 /*static const float tweight[NB_TBANDS+1] = {
    109       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
    110 };*/
    111 
    112 #define NB_TONAL_SKIP_BANDS 9
    113 
    114 #define cA 0.43157974f
    115 #define cB 0.67848403f
    116 #define cC 0.08595542f
    117 #define cE ((float)M_PI/2)
    118 static OPUS_INLINE float fast_atan2f(float y, float x) {
    119    float x2, y2;
    120    /* Should avoid underflow on the values we'll get */
    121    if (ABS16(x)+ABS16(y)<1e-9f)
    122    {
    123       x*=1e12f;
    124       y*=1e12f;
    125    }
    126    x2 = x*x;
    127    y2 = y*y;
    128    if(x2<y2){
    129       float den = (y2 + cB*x2) * (y2 + cC*x2);
    130       if (den!=0)
    131          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
    132       else
    133          return (y<0 ? -cE : cE);
    134    }else{
    135       float den = (x2 + cB*y2) * (x2 + cC*y2);
    136       if (den!=0)
    137          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
    138       else
    139          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
    140    }
    141 }
    142 
    143 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
    144 {
    145    int pos;
    146    int curr_lookahead;
    147    float psum;
    148    int i;
    149 
    150    pos = tonal->read_pos;
    151    curr_lookahead = tonal->write_pos-tonal->read_pos;
    152    if (curr_lookahead<0)
    153       curr_lookahead += DETECT_SIZE;
    154 
    155    if (len > 480 && pos != tonal->write_pos)
    156    {
    157       pos++;
    158       if (pos==DETECT_SIZE)
    159          pos=0;
    160    }
    161    if (pos == tonal->write_pos)
    162       pos--;
    163    if (pos<0)
    164       pos = DETECT_SIZE-1;
    165    OPUS_COPY(info_out, &tonal->info[pos], 1);
    166    tonal->read_subframe += len/120;
    167    while (tonal->read_subframe>=4)
    168    {
    169       tonal->read_subframe -= 4;
    170       tonal->read_pos++;
    171    }
    172    if (tonal->read_pos>=DETECT_SIZE)
    173       tonal->read_pos-=DETECT_SIZE;
    174 
    175    /* Compensate for the delay in the features themselves.
    176       FIXME: Need a better estimate the 10 I just made up */
    177    curr_lookahead = IMAX(curr_lookahead-10, 0);
    178 
    179    psum=0;
    180    /* Summing the probability of transition patterns that involve music at
    181       time (DETECT_SIZE-curr_lookahead-1) */
    182    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
    183       psum += tonal->pmusic[i];
    184    for (;i<DETECT_SIZE;i++)
    185       psum += tonal->pspeech[i];
    186    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
    187    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
    188 
    189    info_out->music_prob = psum;
    190 }
    191 
    192 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
    193 {
    194     int i, b;
    195     const kiss_fft_state *kfft;
    196     VARDECL(kiss_fft_cpx, in);
    197     VARDECL(kiss_fft_cpx, out);
    198     int N = 480, N2=240;
    199     float * OPUS_RESTRICT A = tonal->angle;
    200     float * OPUS_RESTRICT dA = tonal->d_angle;
    201     float * OPUS_RESTRICT d2A = tonal->d2_angle;
    202     VARDECL(float, tonality);
    203     VARDECL(float, noisiness);
    204     float band_tonality[NB_TBANDS];
    205     float logE[NB_TBANDS];
    206     float BFCC[8];
    207     float features[25];
    208     float frame_tonality;
    209     float max_frame_tonality;
    210     /*float tw_sum=0;*/
    211     float frame_noisiness;
    212     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
    213     float slope=0;
    214     float frame_stationarity;
    215     float relativeE;
    216     float frame_probs[2];
    217     float alpha, alphaE, alphaE2;
    218     float frame_loudness;
    219     float bandwidth_mask;
    220     int bandwidth=0;
    221     float maxE = 0;
    222     float noise_floor;
    223     int remaining;
    224     AnalysisInfo *info;
    225     SAVE_STACK;
    226 
    227     tonal->last_transition++;
    228     alpha = 1.f/IMIN(20, 1+tonal->count);
    229     alphaE = 1.f/IMIN(50, 1+tonal->count);
    230     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
    231 
    232     if (tonal->count<4)
    233        tonal->music_prob = .5;
    234     kfft = celt_mode->mdct.kfft[0];
    235     if (tonal->count==0)
    236        tonal->mem_fill = 240;
    237     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
    238     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
    239     {
    240        tonal->mem_fill += len;
    241        /* Don't have enough to update the analysis */
    242        RESTORE_STACK;
    243        return;
    244     }
    245     info = &tonal->info[tonal->write_pos++];
    246     if (tonal->write_pos>=DETECT_SIZE)
    247        tonal->write_pos-=DETECT_SIZE;
    248 
    249     ALLOC(in, 480, kiss_fft_cpx);
    250     ALLOC(out, 480, kiss_fft_cpx);
    251     ALLOC(tonality, 240, float);
    252     ALLOC(noisiness, 240, float);
    253     for (i=0;i<N2;i++)
    254     {
    255        float w = analysis_window[i];
    256        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
    257        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
    258        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
    259        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
    260     }
    261     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
    262     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
    263     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
    264     tonal->mem_fill = 240 + remaining;
    265     opus_fft(kfft, in, out);
    266 
    267     for (i=1;i<N2;i++)
    268     {
    269        float X1r, X2r, X1i, X2i;
    270        float angle, d_angle, d2_angle;
    271        float angle2, d_angle2, d2_angle2;
    272        float mod1, mod2, avg_mod;
    273        X1r = (float)out[i].r+out[N-i].r;
    274        X1i = (float)out[i].i-out[N-i].i;
    275        X2r = (float)out[i].i+out[N-i].i;
    276        X2i = (float)out[N-i].r-out[i].r;
    277 
    278        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
    279        d_angle = angle - A[i];
    280        d2_angle = d_angle - dA[i];
    281 
    282        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
    283        d_angle2 = angle2 - angle;
    284        d2_angle2 = d_angle2 - d_angle;
    285 
    286        mod1 = d2_angle - (float)floor(.5+d2_angle);
    287        noisiness[i] = ABS16(mod1);
    288        mod1 *= mod1;
    289        mod1 *= mod1;
    290 
    291        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
    292        noisiness[i] += ABS16(mod2);
    293        mod2 *= mod2;
    294        mod2 *= mod2;
    295 
    296        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
    297        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
    298 
    299        A[i] = angle2;
    300        dA[i] = d_angle2;
    301        d2A[i] = mod2;
    302     }
    303 
    304     frame_tonality = 0;
    305     max_frame_tonality = 0;
    306     /*tw_sum = 0;*/
    307     info->activity = 0;
    308     frame_noisiness = 0;
    309     frame_stationarity = 0;
    310     if (!tonal->count)
    311     {
    312        for (b=0;b<NB_TBANDS;b++)
    313        {
    314           tonal->lowE[b] = 1e10;
    315           tonal->highE[b] = -1e10;
    316        }
    317     }
    318     relativeE = 0;
    319     frame_loudness = 0;
    320     for (b=0;b<NB_TBANDS;b++)
    321     {
    322        float E=0, tE=0, nE=0;
    323        float L1, L2;
    324        float stationarity;
    325        for (i=tbands[b];i<tbands[b+1];i++)
    326        {
    327           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
    328                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
    329 #ifdef FIXED_POINT
    330           /* FIXME: It's probably best to change the BFCC filter initial state instead */
    331           binE *= 5.55e-17f;
    332 #endif
    333           E += binE;
    334           tE += binE*tonality[i];
    335           nE += binE*2.f*(.5f-noisiness[i]);
    336        }
    337        tonal->E[tonal->E_count][b] = E;
    338        frame_noisiness += nE/(1e-15f+E);
    339 
    340        frame_loudness += (float)sqrt(E+1e-10f);
    341        logE[b] = (float)log(E+1e-10f);
    342        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
    343        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
    344        if (tonal->highE[b] < tonal->lowE[b]+1.f)
    345        {
    346           tonal->highE[b]+=.5f;
    347           tonal->lowE[b]-=.5f;
    348        }
    349        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
    350 
    351        L1=L2=0;
    352        for (i=0;i<NB_FRAMES;i++)
    353        {
    354           L1 += (float)sqrt(tonal->E[i][b]);
    355           L2 += tonal->E[i][b];
    356        }
    357 
    358        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
    359        stationarity *= stationarity;
    360        stationarity *= stationarity;
    361        frame_stationarity += stationarity;
    362        /*band_tonality[b] = tE/(1e-15+E)*/;
    363        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
    364 #if 0
    365        if (b>=NB_TONAL_SKIP_BANDS)
    366        {
    367           frame_tonality += tweight[b]*band_tonality[b];
    368           tw_sum += tweight[b];
    369        }
    370 #else
    371        frame_tonality += band_tonality[b];
    372        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
    373           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
    374 #endif
    375        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
    376        slope += band_tonality[b]*(b-8);
    377        /*printf("%f %f ", band_tonality[b], stationarity);*/
    378        tonal->prev_band_tonality[b] = band_tonality[b];
    379     }
    380 
    381     bandwidth_mask = 0;
    382     bandwidth = 0;
    383     maxE = 0;
    384     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
    385 #ifdef FIXED_POINT
    386     noise_floor *= 1<<(15+SIG_SHIFT);
    387 #endif
    388     noise_floor *= noise_floor;
    389     for (b=0;b<NB_TOT_BANDS;b++)
    390     {
    391        float E=0;
    392        int band_start, band_end;
    393        /* Keep a margin of 300 Hz for aliasing */
    394        band_start = extra_bands[b];
    395        band_end = extra_bands[b+1];
    396        for (i=band_start;i<band_end;i++)
    397        {
    398           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
    399                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
    400           E += binE;
    401        }
    402        maxE = MAX32(maxE, E);
    403        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
    404        E = MAX32(E, tonal->meanE[b]);
    405        /* Use a simple follower with 13 dB/Bark slope for spreading function */
    406        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
    407        /* Consider the band "active" only if all these conditions are met:
    408           1) less than 10 dB below the simple follower
    409           2) less than 90 dB below the peak band (maximal masking possible considering
    410              both the ATH and the loudness-dependent slope of the spreading function)
    411           3) above the PCM quantization noise floor
    412        */
    413        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
    414           bandwidth = b;
    415     }
    416     if (tonal->count<=2)
    417        bandwidth = 20;
    418     frame_loudness = 20*(float)log10(frame_loudness);
    419     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
    420     tonal->lowECount *= (1-alphaE);
    421     if (frame_loudness < tonal->Etracker-30)
    422        tonal->lowECount += alphaE;
    423 
    424     for (i=0;i<8;i++)
    425     {
    426        float sum=0;
    427        for (b=0;b<16;b++)
    428           sum += dct_table[i*16+b]*logE[b];
    429        BFCC[i] = sum;
    430     }
    431 
    432     frame_stationarity /= NB_TBANDS;
    433     relativeE /= NB_TBANDS;
    434     if (tonal->count<10)
    435        relativeE = .5;
    436     frame_noisiness /= NB_TBANDS;
    437 #if 1
    438     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
    439 #else
    440     info->activity = .5*(1+frame_noisiness-frame_stationarity);
    441 #endif
    442     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
    443     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
    444     tonal->prev_tonality = frame_tonality;
    445 
    446     slope /= 8*8;
    447     info->tonality_slope = slope;
    448 
    449     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
    450     tonal->count++;
    451     info->tonality = frame_tonality;
    452 
    453     for (i=0;i<4;i++)
    454        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
    455 
    456     for (i=0;i<4;i++)
    457        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
    458 
    459     for (i=0;i<4;i++)
    460         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
    461     for (i=0;i<3;i++)
    462         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
    463 
    464     if (tonal->count > 5)
    465     {
    466        for (i=0;i<9;i++)
    467           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
    468     }
    469 
    470     for (i=0;i<8;i++)
    471     {
    472        tonal->mem[i+24] = tonal->mem[i+16];
    473        tonal->mem[i+16] = tonal->mem[i+8];
    474        tonal->mem[i+8] = tonal->mem[i];
    475        tonal->mem[i] = BFCC[i];
    476     }
    477     for (i=0;i<9;i++)
    478        features[11+i] = (float)sqrt(tonal->std[i]);
    479     features[20] = info->tonality;
    480     features[21] = info->activity;
    481     features[22] = frame_stationarity;
    482     features[23] = info->tonality_slope;
    483     features[24] = tonal->lowECount;
    484 
    485 #ifndef DISABLE_FLOAT_API
    486     mlp_process(&net, features, frame_probs);
    487     frame_probs[0] = .5f*(frame_probs[0]+1);
    488     /* Curve fitting between the MLP probability and the actual probability */
    489     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
    490     /* Probability of active audio (as opposed to silence) */
    491     frame_probs[1] = .5f*frame_probs[1]+.5f;
    492     /* Consider that silence has a 50-50 probability. */
    493     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
    494 
    495     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
    496     {
    497        /* Probability of state transition */
    498        float tau;
    499        /* Represents independence of the MLP probabilities, where
    500           beta=1 means fully independent. */
    501        float beta;
    502        /* Denormalized probability of speech (p0) and music (p1) after update */
    503        float p0, p1;
    504        /* Probabilities for "all speech" and "all music" */
    505        float s0, m0;
    506        /* Probability sum for renormalisation */
    507        float psum;
    508        /* Instantaneous probability of speech and music, with beta pre-applied. */
    509        float speech0;
    510        float music0;
    511 
    512        /* One transition every 3 minutes of active audio */
    513        tau = .00005f*frame_probs[1];
    514        beta = .05f;
    515        if (1) {
    516           /* Adapt beta based on how "unexpected" the new prob is */
    517           float p, q;
    518           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
    519           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
    520           beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
    521        }
    522        /* p0 and p1 are the probabilities of speech and music at this frame
    523           using only information from previous frame and applying the
    524           state transition model */
    525        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
    526        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
    527        /* We apply the current probability with exponent beta to work around
    528           the fact that the probability estimates aren't independent. */
    529        p0 *= (float)pow(1-frame_probs[0], beta);
    530        p1 *= (float)pow(frame_probs[0], beta);
    531        /* Normalise the probabilities to get the Marokv probability of music. */
    532        tonal->music_prob = p1/(p0+p1);
    533        info->music_prob = tonal->music_prob;
    534 
    535        /* This chunk of code deals with delayed decision. */
    536        psum=1e-20f;
    537        /* Instantaneous probability of speech and music, with beta pre-applied. */
    538        speech0 = (float)pow(1-frame_probs[0], beta);
    539        music0  = (float)pow(frame_probs[0], beta);
    540        if (tonal->count==1)
    541        {
    542           tonal->pspeech[0]=.5;
    543           tonal->pmusic [0]=.5;
    544        }
    545        /* Updated probability of having only speech (s0) or only music (m0),
    546           before considering the new observation. */
    547        s0 = tonal->pspeech[0] + tonal->pspeech[1];
    548        m0 = tonal->pmusic [0] + tonal->pmusic [1];
    549        /* Updates s0 and m0 with instantaneous probability. */
    550        tonal->pspeech[0] = s0*(1-tau)*speech0;
    551        tonal->pmusic [0] = m0*(1-tau)*music0;
    552        /* Propagate the transition probabilities */
    553        for (i=1;i<DETECT_SIZE-1;i++)
    554        {
    555           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
    556           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
    557        }
    558        /* Probability that the latest frame is speech, when all the previous ones were music. */
    559        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
    560        /* Probability that the latest frame is music, when all the previous ones were speech. */
    561        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
    562 
    563        /* Renormalise probabilities to 1 */
    564        for (i=0;i<DETECT_SIZE;i++)
    565           psum += tonal->pspeech[i] + tonal->pmusic[i];
    566        psum = 1.f/psum;
    567        for (i=0;i<DETECT_SIZE;i++)
    568        {
    569           tonal->pspeech[i] *= psum;
    570           tonal->pmusic [i] *= psum;
    571        }
    572        psum = tonal->pmusic[0];
    573        for (i=1;i<DETECT_SIZE;i++)
    574           psum += tonal->pspeech[i];
    575 
    576        /* Estimate our confidence in the speech/music decisions */
    577        if (frame_probs[1]>.75)
    578        {
    579           if (tonal->music_prob>.9)
    580           {
    581              float adapt;
    582              adapt = 1.f/(++tonal->music_confidence_count);
    583              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
    584              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
    585           }
    586           if (tonal->music_prob<.1)
    587           {
    588              float adapt;
    589              adapt = 1.f/(++tonal->speech_confidence_count);
    590              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
    591              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
    592           }
    593        } else {
    594           if (tonal->music_confidence_count==0)
    595              tonal->music_confidence = .9f;
    596           if (tonal->speech_confidence_count==0)
    597              tonal->speech_confidence = .1f;
    598        }
    599     }
    600     if (tonal->last_music != (tonal->music_prob>.5f))
    601        tonal->last_transition=0;
    602     tonal->last_music = tonal->music_prob>.5f;
    603 #else
    604     info->music_prob = 0;
    605 #endif
    606     /*for (i=0;i<25;i++)
    607        printf("%f ", features[i]);
    608     printf("\n");*/
    609 
    610     info->bandwidth = bandwidth;
    611     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
    612     info->noisiness = frame_noisiness;
    613     info->valid = 1;
    614     if (info_out!=NULL)
    615        OPUS_COPY(info_out, info, 1);
    616     RESTORE_STACK;
    617 }
    618 
    619 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
    620                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
    621                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
    622 {
    623    int offset;
    624    int pcm_len;
    625 
    626    if (analysis_pcm != NULL)
    627    {
    628       /* Avoid overflow/wrap-around of the analysis buffer */
    629       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
    630 
    631       pcm_len = analysis_frame_size - analysis->analysis_offset;
    632       offset = analysis->analysis_offset;
    633       do {
    634          tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
    635          offset += 480;
    636          pcm_len -= 480;
    637       } while (pcm_len>0);
    638       analysis->analysis_offset = analysis_frame_size;
    639 
    640       analysis->analysis_offset -= frame_size;
    641    }
    642 
    643    analysis_info->valid = 0;
    644    tonality_get_info(analysis, analysis_info, frame_size);
    645 }
    646