Home | History | Annotate | Download | only in silk
      1 /***********************************************************************
      2 Copyright (c) 2006-2011, Skype Limited. All rights reserved.
      3 Redistribution and use in source and binary forms, with or without
      4 modification, are permitted provided that the following conditions
      5 are met:
      6 - Redistributions of source code must retain the above copyright notice,
      7 this list of conditions and the following disclaimer.
      8 - Redistributions in binary form must reproduce the above copyright
      9 notice, this list of conditions and the following disclaimer in the
     10 documentation and/or other materials provided with the distribution.
     11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
     12 names of specific contributors, may be used to endorse or promote
     13 products derived from this software without specific prior written
     14 permission.
     15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     25 POSSIBILITY OF SUCH DAMAGE.
     26 ***********************************************************************/
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include "main.h"
     33 #include "stack_alloc.h"
     34 
     35 /* Silk VAD noise level estimation */
     36 static OPUS_INLINE void silk_VAD_GetNoiseLevels(
     37     const opus_int32             pX[ VAD_N_BANDS ], /* I    subband energies                            */
     38     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
     39 );
     40 
     41 /**********************************/
     42 /* Initialization of the Silk VAD */
     43 /**********************************/
     44 opus_int silk_VAD_Init(                                         /* O    Return value, 0 if success                  */
     45     silk_VAD_state              *psSilk_VAD                     /* I/O  Pointer to Silk VAD state                   */
     46 )
     47 {
     48     opus_int b, ret = 0;
     49 
     50     /* reset state memory */
     51     silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
     52 
     53     /* init noise levels */
     54     /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
     55     for( b = 0; b < VAD_N_BANDS; b++ ) {
     56         psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
     57     }
     58 
     59     /* Initialize state */
     60     for( b = 0; b < VAD_N_BANDS; b++ ) {
     61         psSilk_VAD->NL[ b ]     = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
     62         psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
     63     }
     64     psSilk_VAD->counter = 15;
     65 
     66     /* init smoothed energy-to-noise ratio*/
     67     for( b = 0; b < VAD_N_BANDS; b++ ) {
     68         psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256;       /* 100 * 256 --> 20 dB SNR */
     69     }
     70 
     71     return( ret );
     72 }
     73 
     74 /* Weighting factors for tilt measure */
     75 static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
     76 
     77 /***************************************/
     78 /* Get the speech activity level in Q8 */
     79 /***************************************/
     80 opus_int silk_VAD_GetSA_Q8(                                     /* O    Return value, 0 if success                  */
     81     silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
     82     const opus_int16            pIn[]                           /* I    PCM input                                   */
     83 )
     84 {
     85     opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
     86     opus_int   decimated_framelength1, decimated_framelength2;
     87     opus_int   decimated_framelength;
     88     opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
     89     opus_int32 sumSquared, smooth_coef_Q16;
     90     opus_int16 HPstateTmp;
     91     VARDECL( opus_int16, X );
     92     opus_int32 Xnrg[ VAD_N_BANDS ];
     93     opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
     94     opus_int32 speech_nrg, x_tmp;
     95     opus_int   X_offset[ VAD_N_BANDS ];
     96     opus_int   ret = 0;
     97     silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
     98     SAVE_STACK;
     99 
    100     /* Safety checks */
    101     silk_assert( VAD_N_BANDS == 4 );
    102     silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
    103     silk_assert( psEncC->frame_length <= 512 );
    104     silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
    105 
    106     /***********************/
    107     /* Filter and Decimate */
    108     /***********************/
    109     decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
    110     decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
    111     decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
    112     /* Decimate into 4 bands:
    113        0       L      3L       L              3L                             5L
    114                -      --       -              --                             --
    115                8       8       2               4                              4
    116 
    117        [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
    118 
    119        They're arranged to allow the minimal ( frame_length / 4 ) extra
    120        scratch space during the downsampling process */
    121     X_offset[ 0 ] = 0;
    122     X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
    123     X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
    124     X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
    125     ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
    126 
    127     /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
    128     silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
    129         X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
    130 
    131     /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
    132     silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
    133         X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
    134 
    135     /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
    136     silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
    137         X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
    138 
    139     /*********************************************/
    140     /* HP filter on lowest band (differentiator) */
    141     /*********************************************/
    142     X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
    143     HPstateTmp = X[ decimated_framelength - 1 ];
    144     for( i = decimated_framelength - 1; i > 0; i-- ) {
    145         X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
    146         X[ i ]     -= X[ i - 1 ];
    147     }
    148     X[ 0 ] -= psSilk_VAD->HPstate;
    149     psSilk_VAD->HPstate = HPstateTmp;
    150 
    151     /*************************************/
    152     /* Calculate the energy in each band */
    153     /*************************************/
    154     for( b = 0; b < VAD_N_BANDS; b++ ) {
    155         /* Find the decimated framelength in the non-uniformly divided bands */
    156         decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
    157 
    158         /* Split length into subframe lengths */
    159         dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
    160         dec_subframe_offset = 0;
    161 
    162         /* Compute energy per sub-frame */
    163         /* initialize with summed energy of last subframe */
    164         Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
    165         for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
    166             sumSquared = 0;
    167             for( i = 0; i < dec_subframe_length; i++ ) {
    168                 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
    169                 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
    170                 x_tmp = silk_RSHIFT(
    171                     X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
    172                 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
    173 
    174                 /* Safety check */
    175                 silk_assert( sumSquared >= 0 );
    176             }
    177 
    178             /* Add/saturate summed energy of current subframe */
    179             if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
    180                 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
    181             } else {
    182                 /* Look-ahead subframe */
    183                 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
    184             }
    185 
    186             dec_subframe_offset += dec_subframe_length;
    187         }
    188         psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
    189     }
    190 
    191     /********************/
    192     /* Noise estimation */
    193     /********************/
    194     silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
    195 
    196     /***********************************************/
    197     /* Signal-plus-noise to noise ratio estimation */
    198     /***********************************************/
    199     sumSquared = 0;
    200     input_tilt = 0;
    201     for( b = 0; b < VAD_N_BANDS; b++ ) {
    202         speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
    203         if( speech_nrg > 0 ) {
    204             /* Divide, with sufficient resolution */
    205             if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
    206                 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
    207             } else {
    208                 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
    209             }
    210 
    211             /* Convert to log domain */
    212             SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
    213 
    214             /* Sum-of-squares */
    215             sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
    216 
    217             /* Tilt measure */
    218             if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
    219                 /* Scale down SNR value for small subband speech energies */
    220                 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
    221             }
    222             input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
    223         } else {
    224             NrgToNoiseRatio_Q8[ b ] = 256;
    225         }
    226     }
    227 
    228     /* Mean-of-squares */
    229     sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
    230 
    231     /* Root-mean-square approximation, scale to dBs, and write to output pointer */
    232     pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
    233 
    234     /*********************************/
    235     /* Speech Probability Estimation */
    236     /*********************************/
    237     SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
    238 
    239     /**************************/
    240     /* Frequency Tilt Measure */
    241     /**************************/
    242     psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
    243 
    244     /**************************************************/
    245     /* Scale the sigmoid output based on power levels */
    246     /**************************************************/
    247     speech_nrg = 0;
    248     for( b = 0; b < VAD_N_BANDS; b++ ) {
    249         /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
    250         speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
    251     }
    252 
    253     /* Power scaling */
    254     if( speech_nrg <= 0 ) {
    255         SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
    256     } else if( speech_nrg < 32768 ) {
    257         if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
    258             speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
    259         } else {
    260             speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
    261         }
    262 
    263         /* square-root */
    264         speech_nrg = silk_SQRT_APPROX( speech_nrg );
    265         SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
    266     }
    267 
    268     /* Copy the resulting speech activity in Q8 */
    269     psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
    270 
    271     /***********************************/
    272     /* Energy Level and SNR estimation */
    273     /***********************************/
    274     /* Smoothing coefficient */
    275     smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
    276 
    277     if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
    278         smooth_coef_Q16 >>= 1;
    279     }
    280 
    281     for( b = 0; b < VAD_N_BANDS; b++ ) {
    282         /* compute smoothed energy-to-noise ratio per band */
    283         psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
    284             NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
    285 
    286         /* signal to noise ratio in dB per band */
    287         SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
    288         /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
    289         psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
    290     }
    291 
    292     RESTORE_STACK;
    293     return( ret );
    294 }
    295 
    296 /**************************/
    297 /* Noise level estimation */
    298 /**************************/
    299 static OPUS_INLINE void silk_VAD_GetNoiseLevels(
    300     const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
    301     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
    302 )
    303 {
    304     opus_int   k;
    305     opus_int32 nl, nrg, inv_nrg;
    306     opus_int   coef, min_coef;
    307 
    308     /* Initially faster smoothing */
    309     if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
    310         min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
    311     } else {
    312         min_coef = 0;
    313     }
    314 
    315     for( k = 0; k < VAD_N_BANDS; k++ ) {
    316         /* Get old noise level estimate for current band */
    317         nl = psSilk_VAD->NL[ k ];
    318         silk_assert( nl >= 0 );
    319 
    320         /* Add bias */
    321         nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
    322         silk_assert( nrg > 0 );
    323 
    324         /* Invert energies */
    325         inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
    326         silk_assert( inv_nrg >= 0 );
    327 
    328         /* Less update when subband energy is high */
    329         if( nrg > silk_LSHIFT( nl, 3 ) ) {
    330             coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
    331         } else if( nrg < nl ) {
    332             coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
    333         } else {
    334             coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
    335         }
    336 
    337         /* Initially faster smoothing */
    338         coef = silk_max_int( coef, min_coef );
    339 
    340         /* Smooth inverse energies */
    341         psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
    342         silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
    343 
    344         /* Compute noise level by inverting again */
    345         nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
    346         silk_assert( nl >= 0 );
    347 
    348         /* Limit noise levels (guarantee 7 bits of head room) */
    349         nl = silk_min( nl, 0x00FFFFFF );
    350 
    351         /* Store as part of state */
    352         psSilk_VAD->NL[ k ] = nl;
    353     }
    354 
    355     /* Increment frame counter */
    356     psSilk_VAD->counter++;
    357 }
    358