Home | History | Annotate | Download | only in silk
      1 /***********************************************************************
      2 Copyright (c) 2006-2011, Skype Limited. All rights reserved.
      3 Redistribution and use in source and binary forms, with or without
      4 modification, are permitted provided that the following conditions
      5 are met:
      6 - Redistributions of source code must retain the above copyright notice,
      7 this list of conditions and the following disclaimer.
      8 - Redistributions in binary form must reproduce the above copyright
      9 notice, this list of conditions and the following disclaimer in the
     10 documentation and/or other materials provided with the distribution.
     11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
     12 names of specific contributors, may be used to endorse or promote
     13 products derived from this software without specific prior written
     14 permission.
     15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     25 POSSIBILITY OF SUCH DAMAGE.
     26 ***********************************************************************/
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include "main.h"
     33 #include "stack_alloc.h"
     34 
     35 /* Silk VAD noise level estimation */
     36 # if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
     37 static OPUS_INLINE void silk_VAD_GetNoiseLevels(
     38     const opus_int32             pX[ VAD_N_BANDS ], /* I    subband energies                            */
     39     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
     40 );
     41 #endif
     42 
     43 /**********************************/
     44 /* Initialization of the Silk VAD */
     45 /**********************************/
     46 opus_int silk_VAD_Init(                                         /* O    Return value, 0 if success                  */
     47     silk_VAD_state              *psSilk_VAD                     /* I/O  Pointer to Silk VAD state                   */
     48 )
     49 {
     50     opus_int b, ret = 0;
     51 
     52     /* reset state memory */
     53     silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
     54 
     55     /* init noise levels */
     56     /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
     57     for( b = 0; b < VAD_N_BANDS; b++ ) {
     58         psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
     59     }
     60 
     61     /* Initialize state */
     62     for( b = 0; b < VAD_N_BANDS; b++ ) {
     63         psSilk_VAD->NL[ b ]     = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
     64         psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
     65     }
     66     psSilk_VAD->counter = 15;
     67 
     68     /* init smoothed energy-to-noise ratio*/
     69     for( b = 0; b < VAD_N_BANDS; b++ ) {
     70         psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256;       /* 100 * 256 --> 20 dB SNR */
     71     }
     72 
     73     return( ret );
     74 }
     75 
     76 /* Weighting factors for tilt measure */
     77 static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
     78 
     79 /***************************************/
     80 /* Get the speech activity level in Q8 */
     81 /***************************************/
     82 opus_int silk_VAD_GetSA_Q8_c(                                   /* O    Return value, 0 if success                  */
     83     silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
     84     const opus_int16            pIn[]                           /* I    PCM input                                   */
     85 )
     86 {
     87     opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
     88     opus_int   decimated_framelength1, decimated_framelength2;
     89     opus_int   decimated_framelength;
     90     opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
     91     opus_int32 sumSquared, smooth_coef_Q16;
     92     opus_int16 HPstateTmp;
     93     VARDECL( opus_int16, X );
     94     opus_int32 Xnrg[ VAD_N_BANDS ];
     95     opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
     96     opus_int32 speech_nrg, x_tmp;
     97     opus_int   X_offset[ VAD_N_BANDS ];
     98     opus_int   ret = 0;
     99     silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
    100     SAVE_STACK;
    101 
    102     /* Safety checks */
    103     silk_assert( VAD_N_BANDS == 4 );
    104     silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
    105     silk_assert( psEncC->frame_length <= 512 );
    106     silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
    107 
    108     /***********************/
    109     /* Filter and Decimate */
    110     /***********************/
    111     decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
    112     decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
    113     decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
    114     /* Decimate into 4 bands:
    115        0       L      3L       L              3L                             5L
    116                -      --       -              --                             --
    117                8       8       2               4                              4
    118 
    119        [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
    120 
    121        They're arranged to allow the minimal ( frame_length / 4 ) extra
    122        scratch space during the downsampling process */
    123     X_offset[ 0 ] = 0;
    124     X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
    125     X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
    126     X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
    127     ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
    128 
    129     /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
    130     silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
    131         X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
    132 
    133     /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
    134     silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
    135         X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
    136 
    137     /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
    138     silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
    139         X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
    140 
    141     /*********************************************/
    142     /* HP filter on lowest band (differentiator) */
    143     /*********************************************/
    144     X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
    145     HPstateTmp = X[ decimated_framelength - 1 ];
    146     for( i = decimated_framelength - 1; i > 0; i-- ) {
    147         X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
    148         X[ i ]     -= X[ i - 1 ];
    149     }
    150     X[ 0 ] -= psSilk_VAD->HPstate;
    151     psSilk_VAD->HPstate = HPstateTmp;
    152 
    153     /*************************************/
    154     /* Calculate the energy in each band */
    155     /*************************************/
    156     for( b = 0; b < VAD_N_BANDS; b++ ) {
    157         /* Find the decimated framelength in the non-uniformly divided bands */
    158         decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
    159 
    160         /* Split length into subframe lengths */
    161         dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
    162         dec_subframe_offset = 0;
    163 
    164         /* Compute energy per sub-frame */
    165         /* initialize with summed energy of last subframe */
    166         Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
    167         for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
    168             sumSquared = 0;
    169             for( i = 0; i < dec_subframe_length; i++ ) {
    170                 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
    171                 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
    172                 x_tmp = silk_RSHIFT(
    173                     X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
    174                 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
    175 
    176                 /* Safety check */
    177                 silk_assert( sumSquared >= 0 );
    178             }
    179 
    180             /* Add/saturate summed energy of current subframe */
    181             if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
    182                 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
    183             } else {
    184                 /* Look-ahead subframe */
    185                 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
    186             }
    187 
    188             dec_subframe_offset += dec_subframe_length;
    189         }
    190         psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
    191     }
    192 
    193     /********************/
    194     /* Noise estimation */
    195     /********************/
    196     silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
    197 
    198     /***********************************************/
    199     /* Signal-plus-noise to noise ratio estimation */
    200     /***********************************************/
    201     sumSquared = 0;
    202     input_tilt = 0;
    203     for( b = 0; b < VAD_N_BANDS; b++ ) {
    204         speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
    205         if( speech_nrg > 0 ) {
    206             /* Divide, with sufficient resolution */
    207             if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
    208                 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
    209             } else {
    210                 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
    211             }
    212 
    213             /* Convert to log domain */
    214             SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
    215 
    216             /* Sum-of-squares */
    217             sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
    218 
    219             /* Tilt measure */
    220             if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
    221                 /* Scale down SNR value for small subband speech energies */
    222                 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
    223             }
    224             input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
    225         } else {
    226             NrgToNoiseRatio_Q8[ b ] = 256;
    227         }
    228     }
    229 
    230     /* Mean-of-squares */
    231     sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
    232 
    233     /* Root-mean-square approximation, scale to dBs, and write to output pointer */
    234     pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
    235 
    236     /*********************************/
    237     /* Speech Probability Estimation */
    238     /*********************************/
    239     SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
    240 
    241     /**************************/
    242     /* Frequency Tilt Measure */
    243     /**************************/
    244     psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
    245 
    246     /**************************************************/
    247     /* Scale the sigmoid output based on power levels */
    248     /**************************************************/
    249     speech_nrg = 0;
    250     for( b = 0; b < VAD_N_BANDS; b++ ) {
    251         /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
    252         speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
    253     }
    254 
    255     /* Power scaling */
    256     if( speech_nrg <= 0 ) {
    257         SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
    258     } else if( speech_nrg < 32768 ) {
    259         if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
    260             speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
    261         } else {
    262             speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
    263         }
    264 
    265         /* square-root */
    266         speech_nrg = silk_SQRT_APPROX( speech_nrg );
    267         SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
    268     }
    269 
    270     /* Copy the resulting speech activity in Q8 */
    271     psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
    272 
    273     /***********************************/
    274     /* Energy Level and SNR estimation */
    275     /***********************************/
    276     /* Smoothing coefficient */
    277     smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
    278 
    279     if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
    280         smooth_coef_Q16 >>= 1;
    281     }
    282 
    283     for( b = 0; b < VAD_N_BANDS; b++ ) {
    284         /* compute smoothed energy-to-noise ratio per band */
    285         psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
    286             NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
    287 
    288         /* signal to noise ratio in dB per band */
    289         SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
    290         /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
    291         psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
    292     }
    293 
    294     RESTORE_STACK;
    295     return( ret );
    296 }
    297 
    298 /**************************/
    299 /* Noise level estimation */
    300 /**************************/
    301 # if  !defined(OPUS_X86_MAY_HAVE_SSE4_1)
    302 static OPUS_INLINE
    303 #endif
    304 void silk_VAD_GetNoiseLevels(
    305     const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
    306     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
    307 )
    308 {
    309     opus_int   k;
    310     opus_int32 nl, nrg, inv_nrg;
    311     opus_int   coef, min_coef;
    312 
    313     /* Initially faster smoothing */
    314     if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
    315         min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
    316     } else {
    317         min_coef = 0;
    318     }
    319 
    320     for( k = 0; k < VAD_N_BANDS; k++ ) {
    321         /* Get old noise level estimate for current band */
    322         nl = psSilk_VAD->NL[ k ];
    323         silk_assert( nl >= 0 );
    324 
    325         /* Add bias */
    326         nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
    327         silk_assert( nrg > 0 );
    328 
    329         /* Invert energies */
    330         inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
    331         silk_assert( inv_nrg >= 0 );
    332 
    333         /* Less update when subband energy is high */
    334         if( nrg > silk_LSHIFT( nl, 3 ) ) {
    335             coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
    336         } else if( nrg < nl ) {
    337             coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
    338         } else {
    339             coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
    340         }
    341 
    342         /* Initially faster smoothing */
    343         coef = silk_max_int( coef, min_coef );
    344 
    345         /* Smooth inverse energies */
    346         psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
    347         silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
    348 
    349         /* Compute noise level by inverting again */
    350         nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
    351         silk_assert( nl >= 0 );
    352 
    353         /* Limit noise levels (guarantee 7 bits of head room) */
    354         nl = silk_min( nl, 0x00FFFFFF );
    355 
    356         /* Store as part of state */
    357         psSilk_VAD->NL[ k ] = nl;
    358     }
    359 
    360     /* Increment frame counter */
    361     psSilk_VAD->counter++;
    362 }
    363