1 /*********************************************************************** 2 Copyright (c) 2006-2011, Skype Limited. All rights reserved. 3 Redistribution and use in source and binary forms, with or without 4 modification, are permitted provided that the following conditions 5 are met: 6 - Redistributions of source code must retain the above copyright notice, 7 this list of conditions and the following disclaimer. 8 - Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 - Neither the name of Internet Society, IETF or IETF Trust, nor the 12 names of specific contributors, may be used to endorse or promote 13 products derived from this software without specific prior written 14 permission. 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS 16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 POSSIBILITY OF SUCH DAMAGE. 26 ***********************************************************************/ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include "main.h" 33 34 /* Silk VAD noise level estimation */ 35 static inline void silk_VAD_GetNoiseLevels( 36 const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */ 37 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ 38 ); 39 40 /**********************************/ 41 /* Initialization of the Silk VAD */ 42 /**********************************/ 43 opus_int silk_VAD_Init( /* O Return value, 0 if success */ 44 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ 45 ) 46 { 47 opus_int b, ret = 0; 48 49 /* reset state memory */ 50 silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) ); 51 52 /* init noise levels */ 53 /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */ 54 for( b = 0; b < VAD_N_BANDS; b++ ) { 55 psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 ); 56 } 57 58 /* Initialize state */ 59 for( b = 0; b < VAD_N_BANDS; b++ ) { 60 psSilk_VAD->NL[ b ] = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] ); 61 psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] ); 62 } 63 psSilk_VAD->counter = 15; 64 65 /* init smoothed energy-to-noise ratio*/ 66 for( b = 0; b < VAD_N_BANDS; b++ ) { 67 psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256; /* 100 * 256 --> 20 dB SNR */ 68 } 69 70 return( ret ); 71 } 72 73 /* Weighting factors for tilt measure */ 74 static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 }; 75 76 /***************************************/ 77 /* Get the speech activity level in Q8 */ 78 /***************************************/ 79 opus_int silk_VAD_GetSA_Q8( /* O Return value, 0 if success */ 80 silk_encoder_state *psEncC, /* I/O Encoder state */ 81 const opus_int16 pIn[] /* I PCM input */ 82 ) 83 { 84 opus_int SA_Q15, pSNR_dB_Q7, input_tilt; 85 opus_int decimated_framelength, dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s; 86 opus_int32 sumSquared, smooth_coef_Q16; 87 opus_int16 HPstateTmp; 88 opus_int16 X[ VAD_N_BANDS ][ MAX_FRAME_LENGTH / 2 ]; 89 opus_int32 Xnrg[ VAD_N_BANDS ]; 90 opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ]; 91 opus_int32 speech_nrg, x_tmp; 92 opus_int ret = 0; 93 silk_VAD_state *psSilk_VAD = &psEncC->sVAD; 94 95 /* Safety checks */ 96 silk_assert( VAD_N_BANDS == 4 ); 97 silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length ); 98 silk_assert( psEncC->frame_length <= 512 ); 99 silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) ); 100 101 /***********************/ 102 /* Filter and Decimate */ 103 /***********************/ 104 /* 0-8 kHz to 0-4 kHz and 4-8 kHz */ 105 silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[ 0 ], &X[ 0 ][ 0 ], &X[ 3 ][ 0 ], psEncC->frame_length ); 106 107 /* 0-4 kHz to 0-2 kHz and 2-4 kHz */ 108 silk_ana_filt_bank_1( &X[ 0 ][ 0 ], &psSilk_VAD->AnaState1[ 0 ], &X[ 0 ][ 0 ], &X[ 2 ][ 0 ], silk_RSHIFT( psEncC->frame_length, 1 ) ); 109 110 /* 0-2 kHz to 0-1 kHz and 1-2 kHz */ 111 silk_ana_filt_bank_1( &X[ 0 ][ 0 ], &psSilk_VAD->AnaState2[ 0 ], &X[ 0 ][ 0 ], &X[ 1 ][ 0 ], silk_RSHIFT( psEncC->frame_length, 2 ) ); 112 113 /*********************************************/ 114 /* HP filter on lowest band (differentiator) */ 115 /*********************************************/ 116 decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 ); 117 X[ 0 ][ decimated_framelength - 1 ] = silk_RSHIFT( X[ 0 ][ decimated_framelength - 1 ], 1 ); 118 HPstateTmp = X[ 0 ][ decimated_framelength - 1 ]; 119 for( i = decimated_framelength - 1; i > 0; i-- ) { 120 X[ 0 ][ i - 1 ] = silk_RSHIFT( X[ 0 ][ i - 1 ], 1 ); 121 X[ 0 ][ i ] -= X[ 0 ][ i - 1 ]; 122 } 123 X[ 0 ][ 0 ] -= psSilk_VAD->HPstate; 124 psSilk_VAD->HPstate = HPstateTmp; 125 126 /*************************************/ 127 /* Calculate the energy in each band */ 128 /*************************************/ 129 for( b = 0; b < VAD_N_BANDS; b++ ) { 130 /* Find the decimated framelength in the non-uniformly divided bands */ 131 decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) ); 132 133 /* Split length into subframe lengths */ 134 dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 ); 135 dec_subframe_offset = 0; 136 137 /* Compute energy per sub-frame */ 138 /* initialize with summed energy of last subframe */ 139 Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ]; 140 for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) { 141 sumSquared = 0; 142 for( i = 0; i < dec_subframe_length; i++ ) { 143 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2. */ 144 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128) */ 145 x_tmp = silk_RSHIFT( X[ b ][ i + dec_subframe_offset ], 3 ); 146 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp ); 147 148 /* Safety check */ 149 silk_assert( sumSquared >= 0 ); 150 } 151 152 /* Add/saturate summed energy of current subframe */ 153 if( s < VAD_INTERNAL_SUBFRAMES - 1 ) { 154 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared ); 155 } else { 156 /* Look-ahead subframe */ 157 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) ); 158 } 159 160 dec_subframe_offset += dec_subframe_length; 161 } 162 psSilk_VAD->XnrgSubfr[ b ] = sumSquared; 163 } 164 165 /********************/ 166 /* Noise estimation */ 167 /********************/ 168 silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD ); 169 170 /***********************************************/ 171 /* Signal-plus-noise to noise ratio estimation */ 172 /***********************************************/ 173 sumSquared = 0; 174 input_tilt = 0; 175 for( b = 0; b < VAD_N_BANDS; b++ ) { 176 speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ]; 177 if( speech_nrg > 0 ) { 178 /* Divide, with sufficient resolution */ 179 if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) { 180 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 ); 181 } else { 182 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 ); 183 } 184 185 /* Convert to log domain */ 186 SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128; 187 188 /* Sum-of-squares */ 189 sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 ); /* Q14 */ 190 191 /* Tilt measure */ 192 if( speech_nrg < ( (opus_int32)1 << 20 ) ) { 193 /* Scale down SNR value for small subband speech energies */ 194 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 ); 195 } 196 input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 ); 197 } else { 198 NrgToNoiseRatio_Q8[ b ] = 256; 199 } 200 } 201 202 /* Mean-of-squares */ 203 sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */ 204 205 /* Root-mean-square approximation, scale to dBs, and write to output pointer */ 206 pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */ 207 208 /*********************************/ 209 /* Speech Probability Estimation */ 210 /*********************************/ 211 SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 ); 212 213 /**************************/ 214 /* Frequency Tilt Measure */ 215 /**************************/ 216 psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 ); 217 218 /**************************************************/ 219 /* Scale the sigmoid output based on power levels */ 220 /**************************************************/ 221 speech_nrg = 0; 222 for( b = 0; b < VAD_N_BANDS; b++ ) { 223 /* Accumulate signal-without-noise energies, higher frequency bands have more weight */ 224 speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 ); 225 } 226 227 /* Power scaling */ 228 if( speech_nrg <= 0 ) { 229 SA_Q15 = silk_RSHIFT( SA_Q15, 1 ); 230 } else if( speech_nrg < 32768 ) { 231 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { 232 speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 ); 233 } else { 234 speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 ); 235 } 236 237 /* square-root */ 238 speech_nrg = silk_SQRT_APPROX( speech_nrg ); 239 SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 ); 240 } 241 242 /* Copy the resulting speech activity in Q8 */ 243 psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX ); 244 245 /***********************************/ 246 /* Energy Level and SNR estimation */ 247 /***********************************/ 248 /* Smoothing coefficient */ 249 smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) ); 250 251 if( psEncC->frame_length == 10 * psEncC->fs_kHz ) { 252 smooth_coef_Q16 >>= 1; 253 } 254 255 for( b = 0; b < VAD_N_BANDS; b++ ) { 256 /* compute smoothed energy-to-noise ratio per band */ 257 psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ], 258 NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 ); 259 260 /* signal to noise ratio in dB per band */ 261 SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 ); 262 /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */ 263 psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) ); 264 } 265 266 return( ret ); 267 } 268 269 /**************************/ 270 /* Noise level estimation */ 271 /**************************/ 272 static inline void silk_VAD_GetNoiseLevels( 273 const opus_int32 pX[ VAD_N_BANDS ], /* I subband energies */ 274 silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */ 275 ) 276 { 277 opus_int k; 278 opus_int32 nl, nrg, inv_nrg; 279 opus_int coef, min_coef; 280 281 /* Initially faster smoothing */ 282 if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */ 283 min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 ); 284 } else { 285 min_coef = 0; 286 } 287 288 for( k = 0; k < VAD_N_BANDS; k++ ) { 289 /* Get old noise level estimate for current band */ 290 nl = psSilk_VAD->NL[ k ]; 291 silk_assert( nl >= 0 ); 292 293 /* Add bias */ 294 nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] ); 295 silk_assert( nrg > 0 ); 296 297 /* Invert energies */ 298 inv_nrg = silk_DIV32( silk_int32_MAX, nrg ); 299 silk_assert( inv_nrg >= 0 ); 300 301 /* Less update when subband energy is high */ 302 if( nrg > silk_LSHIFT( nl, 3 ) ) { 303 coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3; 304 } else if( nrg < nl ) { 305 coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16; 306 } else { 307 coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 ); 308 } 309 310 /* Initially faster smoothing */ 311 coef = silk_max_int( coef, min_coef ); 312 313 /* Smooth inverse energies */ 314 psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef ); 315 silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 ); 316 317 /* Compute noise level by inverting again */ 318 nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] ); 319 silk_assert( nl >= 0 ); 320 321 /* Limit noise levels (guarantee 7 bits of head room) */ 322 nl = silk_min( nl, 0x00FFFFFF ); 323 324 /* Store as part of state */ 325 psSilk_VAD->NL[ k ] = nl; 326 } 327 328 /* Increment frame counter */ 329 psSilk_VAD->counter++; 330 } 331