1 /*---------------------------------------------------------------------------* 2 * swicms.c * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 #include <string.h> 21 #include"swicms.h" 22 #include"srec_sizes.h" 23 #include"prelib.h" 24 25 #include "passert.h" 26 #include "ESR_Session.h" 27 #include "ESR_SessionType.h" 28 #include "IntArrayList.h" 29 #include "portable.h" 30 31 #define printf_vector(HEAD, FMT, PTR, NN) { int i; LCHAR buffer[256]; sprintf(buffer, HEAD); sprintf(buffer + LSTRLEN(buffer), " %x", (int)PTR); for (i=0; i<(NN); ++i) sprintf(buffer + LSTRLEN(buffer), FMT, PTR[i]); PLogMessage(buffer); } 32 33 /* Cross-utterance CMN calculation: 34 We try to normalize the speech frames before they get to the recognizer. 35 The speech frames are LDA-processed mfcc-with-dynamic feature vectors. 36 We collect these speech frames during recognition. At the end of 37 recognition we exclude the silence frames from the collected data, and 38 generate a new channel average based on the previous average and the new 39 data, using an exponential decay formula. 40 41 In-utterance CMN calculation: 42 A new short-term average mechanism was introduced, with faster update, 43 to improve recognition on the very first recognition after init or reset. 44 We wait for a minimum number of new data frames to apply this. We also 45 disable the fast updater after some frames, because we assume the 46 cross-utterance estimator to be more reliable, particularly in its 47 ability to exclude silence frames from the calculation. 48 */ 49 50 /* default settings for cross-utterance cms */ 51 #define SWICMS_FORGET_FACTOR_DEFAULT 400 /* effective frms of history */ 52 #define SWICMS_SBINDEX_DEFAULT 100 /* use speech frames only */ 53 /* #define SWICMS_CACHE_RESOLUTION_DEFAULT see swicms.h */ 54 /* #define SWICMS_CACHE_SIZE_DEFAULT see swicms.h */ 55 56 /* default settings for in-utterance cms */ 57 #define SWICMS_INUTT_FORGET_FACTOR2_DISABLE 65535 /* any large number */ 58 #define SWICMS_INUTT_FORGET_FACTOR2_DEFAULT SWICMS_INUTT_FORGET_FACTOR2_DISABLE 59 /* disable this when cross-utt become more reliable */ 60 #define SWICMS_INUTT_DISABLE_AFTER_FRAMES 200 61 /* wait while the estimate is poor */ 62 #define SWICMS_INUTT_ENABLE_AFTER_FRAMES 10 63 64 /** 65 * Logging Stuff 66 */ 67 #define LOG_LEVEL 2 68 #define MODULE_NAME L("swicms.c") 69 //static const char* MTAG = MODULE_NAME; 70 71 static const char *rcsid = 0 ? (const char *) &rcsid : 72 "$Id: swicms.c,v 1.21.6.16 2008/06/05 19:00:55 stever Exp $"; 73 74 static ESR_BOOL SWICMS_DEBUG = ESR_FALSE; 75 76 /* these are good values from cmn/tmn files */ 77 static const imeldata gswicms_cmn1_8 [MAX_CHAN_DIM] = 78 { 79 158, 141, 99, 125, 101, 162, 113, 138, 128, 143, 123, 141, 80 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 81 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 82 }; 83 84 static const imeldata gswicms_cmn1_11 [MAX_CHAN_DIM] = 85 { 86 163, 121, 120, 114, 124, 139, 144, 108, 150, 119, 146, 124, 87 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 88 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 89 }; 90 91 static const imeldata gswicms_tmn1_8 [MAX_CHAN_DIM] = 92 { 93 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 94 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 95 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 96 }; 97 98 static const imeldata gswicms_tmn1_11 [MAX_CHAN_DIM] = 99 { 100 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 101 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 102 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 103 }; 104 105 static ESR_ReturnCode GetSomeIntsIfAny( const LCHAR* parname, imeldata* parvalue, size_t reqSize) 106 { 107 size_t i, size; 108 ESR_ReturnCode rc; 109 ESR_BOOL exists; 110 IntArrayList* intList = 0; 111 112 CHKLOG(rc, ESR_SessionContains(parname, &exists)); 113 if (exists) { 114 rc = ESR_SessionGetProperty(parname, (void**)&intList, TYPES_INTARRAYLIST); 115 if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) { 116 /* no match will revert to default data already in static array */ 117 PLogError(L("Error reading %s from session: %s"), parname, ESR_rc2str(rc)); 118 return ESR_FATAL_ERROR; 119 } 120 else if (rc == ESR_SUCCESS) { 121 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 122 if(size != reqSize) { 123 PLogError(L("Error reading %s from session, expected len %d: %s"), parname, reqSize, ESR_rc2str(rc)); 124 return ESR_FATAL_ERROR; 125 } 126 if(reqSize == 1) 127 CHKLOG(rc, IntArrayListGet(intList, 0, parvalue)); 128 else { 129 for (i=0; i<size; ++i) 130 CHKLOG(rc, IntArrayListGet(intList, i, &parvalue[i])); 131 } 132 } 133 } 134 return ESR_SUCCESS; 135 CLEANUP: 136 return rc; 137 } 138 139 int swicms_init(swicms_norm_info* swicms) 140 { 141 ESR_ReturnCode rc = ESR_SUCCESS; 142 size_t i; 143 ESR_BOOL exists, sessionExists; 144 size_t sample_rate; 145 146 /* defaults */ 147 swicms->sbindex = SWICMS_SBINDEX_DEFAULT; 148 swicms->cached_num_frames = 0; 149 swicms->forget_factor = SWICMS_FORGET_FACTOR_DEFAULT; 150 swicms->cache_resolution = SWICMS_CACHE_RESOLUTION_DEFAULT; 151 swicms->num_frames_in_cmn = 0; 152 153 CHKLOG(rc, ESR_SessionExists(&sessionExists)); 154 155 if (sessionExists) 156 { /* We'll assume this rate is valid or someone else will be complaining. SteveR */ 157 rc = ESR_SessionGetSize_t ( L ( "CREC.Frontend.samplerate" ), &sample_rate ); 158 159 if ( rc != ESR_SUCCESS ) 160 return ( rc ); 161 } 162 else 163 sample_rate = 11025; 164 165 /* init the data structures by copying the static data so that we can have a copy if we need to reset */ 166 if ( sample_rate == 8000 ) 167 { 168 for ( i = 0; i < MAX_CHAN_DIM; i++ ) 169 { 170 swicms->cmn [i] = gswicms_cmn1_8 [i]; 171 swicms->tmn [i] = gswicms_tmn1_8 [i]; 172 // _lda_*mn below are OK, but are recalculated in swicms_lda_process() 173 swicms->lda_cmn [i] = 0; /* calculated by swicms_lda_process() */ 174 swicms->lda_tmn [i] = 0; /* calculated by swicms_lda_process() */ 175 } 176 } 177 else 178 { 179 for ( i = 0; i < MAX_CHAN_DIM; i++ ) 180 { 181 swicms->cmn [i] = gswicms_cmn1_11 [i]; 182 swicms->tmn [i] = gswicms_tmn1_11 [i]; 183 // _lda_*mn below are OK, but are recalculated in swicms_lda_process() 184 swicms->lda_cmn [i] = 0; /* calculated by swicms_lda_process() */ 185 swicms->lda_tmn [i] = 0; /* calculated by swicms_lda_process() */ 186 } 187 } 188 CHKLOG(rc, ESR_SessionExists(&sessionExists)); 189 190 if (sessionExists) 191 { 192 const LCHAR* parname = L("CREC.Frontend.swicms.debug"); 193 CHKLOG(rc, ESR_SessionContains(parname, &exists)); 194 if (exists) { 195 rc = ESR_SessionGetBool(parname, &SWICMS_DEBUG); 196 if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) { 197 PLOG_DBG_ERROR((L("Error reading %s from session: %s"), parname, ESR_rc2str(rc))); 198 return rc; 199 } 200 } 201 202 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.forget_factor"), 203 &swicms->forget_factor, 1); 204 if(rc != ESR_SUCCESS) return rc; 205 206 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.sbindex"), 207 &swicms->sbindex, 1); 208 if(rc != ESR_SUCCESS) return rc; 209 210 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn"), 211 &swicms->cmn[0], MAX_CHAN_DIM); 212 if(rc != ESR_SUCCESS) return rc; 213 214 if ( sample_rate == 8000 ) 215 { 216 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn8"), &swicms->cmn[0], MAX_CHAN_DIM); 217 218 if(rc != ESR_SUCCESS) 219 return rc; 220 } 221 else 222 { 223 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn11"), &swicms->cmn[0], MAX_CHAN_DIM); 224 225 if(rc != ESR_SUCCESS) 226 return rc; 227 } 228 229 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.tmn"), 230 &swicms->tmn[0], MAX_CHAN_DIM); 231 if(rc != ESR_SUCCESS) return rc; 232 } 233 234 swicms->is_valid = 0; 235 for (i = 0; i < MAX_CHAN_DIM; i++) 236 swicms->adjust[i] = 255; 237 238 #ifdef SREC_ENGINE_VERBOSE_LOGGING 239 PLogMessage("swicms->forget_factor = %d\n", swicms->forget_factor); 240 PLogMessage("swicms->cache_resolution = %d\n", swicms->cache_resolution); 241 PLogMessage("swicms->sbindex = %d\n", swicms->sbindex); 242 #endif 243 244 /* in-utt cms parameters */ 245 swicms->inutt.forget_factor2 = SWICMS_INUTT_FORGET_FACTOR2_DEFAULT; 246 swicms->inutt.disable_after = 200; 247 swicms->inutt.enable_after = 10; /* in-utt is less reliable */ 248 swicms->inutt.num_bou_frames_to_skip = 20; /* silence frames! see windback */ 249 swicms->inutt.num_frames_since_bou = 0; 250 swicms->inutt.num_frames_in_accum = 0; 251 for(i=0; i<MAX_CHAN_DIM; i++) swicms->inutt.accum[i] = 0; 252 253 if (sessionExists) { 254 rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.forget_factor2"), 255 &swicms->inutt.forget_factor2, 1); 256 if(rc != ESR_SUCCESS) return rc; 257 258 rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.disable_after"), 259 &swicms->inutt.disable_after, 1); 260 if(rc != ESR_SUCCESS) return rc; 261 262 rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.enable_after"), 263 &swicms->inutt.enable_after, 1); 264 if(rc != ESR_SUCCESS) return rc; 265 266 /* we need to estimate the in-utt cmn from speech frames only! so let's 267 make sure to skip some frames before collecting data, */ 268 ESR_SessionContains(L("CREC.Frontend.start_windback"), &exists); 269 if (exists) { 270 ESR_BOOL do_skip_even_frames = ESR_TRUE; 271 ESR_SessionGetBool(L("CREC.Frontend.do_skip_even_frames"), &do_skip_even_frames); 272 ESR_SessionGetInt(L("CREC.Frontend.start_windback"), &swicms->inutt.num_bou_frames_to_skip); 273 if( do_skip_even_frames) 274 swicms->inutt.num_bou_frames_to_skip /= 2; 275 swicms->inutt.num_bou_frames_to_skip -= 5; /* ensure spch frames only */ 276 } 277 } 278 279 return 0; 280 CLEANUP: 281 return rc; 282 } 283 284 285 ESR_ReturnCode swicms_get_cmn ( swicms_norm_info* swicms, LCHAR *cmn_params, size_t* len ) 286 { 287 int dim_count; 288 int i; 289 imeldata temp[MAX_CHAN_DIM]; 290 const size_t INT_LENGTH = 12; 291 292 if ( swicms->_prep != NULL ) /* lda exists give them transformed lda. */ 293 { 294 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) 295 temp [dim_count] = swicms->lda_cmn [dim_count]; 296 inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); 297 } 298 else /* lda does not exist give them raw cmn values */ 299 { 300 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) 301 temp [dim_count] = swicms->cmn [dim_count]; 302 } 303 304 for ( dim_count = 0, i = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) 305 { 306 i += sprintf( cmn_params + i, dim_count==0 ? "%d" : ",%d", temp [dim_count] ); 307 if (i + INT_LENGTH >= *len) { 308 *len = MAX_CHAN_DIM * (INT_LENGTH + 2) * sizeof(LCHAR); 309 return ESR_BUFFER_OVERFLOW; 310 } 311 } 312 313 return ESR_SUCCESS; 314 } 315 316 317 ESR_ReturnCode swicms_set_cmn ( swicms_norm_info* swicms, const char *cmn_params ) 318 { 319 ESR_ReturnCode set_status; 320 int length_of_params; 321 int dim_count; 322 int got_word; 323 int current_position; 324 char *copy_of_params; 325 char *parsed_strings [MAX_CHAN_DIM]; 326 int temp_cmn [MAX_CHAN_DIM]; 327 328 length_of_params = strlen ( cmn_params ) + 1; 329 copy_of_params = (char*)MALLOC ( length_of_params, NULL ); 330 331 if ( copy_of_params != NULL ) 332 { 333 set_status = ESR_SUCCESS; 334 memcpy ( copy_of_params, cmn_params, length_of_params ); 335 dim_count = 0; 336 current_position = 0; 337 got_word = 0; 338 parsed_strings [dim_count] = copy_of_params + current_position; 339 340 while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) ) 341 { 342 switch ( *( copy_of_params + current_position ) ) 343 { 344 case '\0': 345 if ( got_word == 1 ) 346 { 347 if ( dim_count == ( MAX_CHAN_DIM - 1 ) ) 348 dim_count++; 349 else 350 { 351 PLogError ( "Channel Normalization : Missing Params Must Contain %d Params\n", MAX_CHAN_DIM ); 352 set_status = ESR_INVALID_ARGUMENT; 353 } 354 } 355 else 356 { 357 PLogError ( "Channel Normalization : Missing Params Mus Contain %d Params\n", MAX_CHAN_DIM ); 358 set_status = ESR_INVALID_ARGUMENT; 359 } 360 break; 361 362 case ',': 363 if ( got_word == 1 ) 364 { 365 if ( dim_count < ( MAX_CHAN_DIM - 1 ) ) 366 { 367 dim_count++; 368 *( copy_of_params + current_position) = '\0'; 369 current_position++; 370 371 if ( current_position == length_of_params ) 372 { 373 PLogError ( "Channel Normalization : Delimiter At End Of Param String\n" ); 374 set_status = ESR_INVALID_ARGUMENT; 375 } 376 parsed_strings [dim_count] = copy_of_params + current_position; 377 got_word = 0; 378 } 379 else 380 { 381 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); 382 set_status = ESR_INVALID_ARGUMENT; 383 } 384 } 385 else 386 { 387 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); 388 set_status = ESR_INVALID_ARGUMENT; 389 } 390 break; 391 392 case '0': 393 case '1': 394 case '2': 395 case '3': 396 case '4': 397 case '5': 398 case '6': 399 case '7': 400 case '8': 401 case '9': 402 got_word = 1; 403 current_position++; 404 405 if ( current_position == length_of_params ) 406 { 407 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); 408 set_status = ESR_INVALID_ARGUMENT; 409 } 410 break; 411 412 default: 413 PLogError ( "Channel Normalization : Invalid Param : %c : Params Must Contain Only Digits\n" ); 414 set_status = ESR_INVALID_ARGUMENT; 415 break; 416 } 417 } 418 if ( set_status == ESR_SUCCESS ) 419 { 420 dim_count = 0; 421 422 while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) ) 423 { 424 temp_cmn [dim_count] = atoi ( parsed_strings [dim_count] ); 425 426 if ( ( temp_cmn [dim_count] < 0 ) || ( temp_cmn [dim_count] > 255 ) ) 427 { 428 set_status = ESR_INVALID_ARGUMENT; 429 } 430 } 431 if ( set_status == ESR_SUCCESS ) 432 { 433 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) 434 swicms->cmn [dim_count] = temp_cmn [dim_count]; 435 if ( swicms->_prep != NULL ) /* Set now if NULL it will automatically be set on first utterance */ 436 linear_transform_frame(swicms->_prep, swicms->lda_cmn, 1 /*do_shift*/); 437 } 438 } 439 FREE ( copy_of_params ); 440 } 441 else 442 { 443 PLogError ( "Channel Normalization Out Of Memory Error\n" ); 444 set_status = ESR_OUT_OF_MEMORY; 445 } 446 swicms->num_frames_in_cmn = 0; 447 return ( set_status ); 448 } 449 450 451 int swicms_cache_frame(swicms_norm_info* swicms, imeldata* frame, int dimen) 452 { 453 int i; 454 imeldata *pcache, *pframe; 455 456 ASSERT(dimen == MAX_CHAN_DIM); 457 i = swicms->cached_num_frames / swicms->cache_resolution; 458 if (i < SWICMS_CACHE_SIZE_DEFAULT) 459 { 460 pcache = swicms->cached_sections[ i]; 461 if (swicms->cached_num_frames % swicms->cache_resolution == 0) 462 { 463 for (i = 0; i < MAX_CHAN_DIM; i++) *pcache++ = 0; 464 pcache -= MAX_CHAN_DIM; 465 } 466 pframe = frame; 467 for (i = 0; i < MAX_CHAN_DIM; i++) *pcache++ += *pframe++; 468 swicms->cached_num_frames++; 469 } 470 471 return 0; 472 } 473 474 int apply_channel_normalization_in_swicms(swicms_norm_info *swicms, 475 imeldata* oframe, 476 imeldata* iframe, int dimen) 477 { 478 int ii; 479 ASSERT(dimen == MAX_CHAN_DIM); 480 481 /* IF inutt is activated at all */ 482 if(swicms->inutt.forget_factor2 != SWICMS_INUTT_FORGET_FACTOR2_DISABLE) { 483 /* AND IF we have not disabled it (due to x-utt more reliable) */ 484 if(swicms->inutt.num_frames_in_accum < swicms->inutt.disable_after) { 485 /* AND IF we have skipped past the silence frames */ 486 if( swicms->inutt.num_frames_since_bou >= swicms->inutt.num_bou_frames_to_skip){ 487 swicms->inutt.num_frames_in_accum++; 488 for(ii=0;ii<dimen;ii++) swicms->inutt.accum[ii] += iframe[ii]; 489 /* AND IF we've already seen at least 10 frames (presumably) of speech */ 490 if(swicms->inutt.num_frames_in_accum>swicms->inutt.enable_after) { 491 /* THEN we update the adjustment in-line with the current utterance! */ 492 for(ii=0;ii<dimen;ii++) { 493 imeldata denom = ( swicms->inutt.forget_factor2 494 + swicms->inutt.num_frames_in_accum ); 495 /* tmp: weighted average of the old lda_cmn and the new accum */ 496 imeldata tmp=(swicms->lda_cmn[ii]*swicms->inutt.forget_factor2 497 + swicms->inutt.accum[ii] + denom/2) / denom; 498 swicms->adjust[ii] = swicms->lda_tmn[ii] - tmp; 499 } 500 //printf_vector("swicms->adjust2 "," %d",swicms->adjust, dimen); 501 } 502 } 503 } 504 swicms->inutt.num_frames_since_bou++; 505 } 506 507 for (ii = 0; ii < dimen; ii++) 508 oframe[ii] = MAKEBYTE(iframe[ii] + swicms->adjust[ii]); 509 return 0; 510 } 511 512 int swicms_update(swicms_norm_info* swicms, int speech_start, int speech_end) 513 { 514 int i, j; 515 asr_int32_t speech_avg[MAX_CHAN_DIM], backgr_avg[MAX_CHAN_DIM], avg[MAX_CHAN_DIM]; 516 int ff; 517 int nn, speech_nn, backgr_nn; 518 int num_frames = swicms->cached_num_frames; 519 int cache_start, cache_end, backgr_cache_end; 520 int sbindex = swicms->sbindex; 521 522 /* init for utterance */ 523 swicms->inutt.num_frames_since_bou = 0; 524 525 swicms->cached_num_frames = 0; 526 cache_start = speech_start; 527 cache_start -= (cache_start % swicms->cache_resolution); 528 cache_start /= swicms->cache_resolution; 529 530 if (speech_end == MAXframeID) 531 { 532 cache_end = SWICMS_CACHE_SIZE_DEFAULT; 533 } 534 else 535 { 536 if (speech_end < num_frames) 537 cache_end = speech_end; 538 else 539 cache_end = num_frames; 540 cache_end -= (cache_end % swicms->cache_resolution); 541 cache_end /= swicms->cache_resolution; 542 } 543 544 if (num_frames == 0 || speech_end == 0 || speech_start == speech_end || speech_end == MAXframeID) 545 { 546 if (speech_end != 0 || speech_start != 0) 547 PLogError("Warning: speech_bounds (%d,%d) swicms->cached_num_frames (%d)\n", 548 speech_start, speech_end, num_frames); 549 if (SWICMS_DEBUG) { 550 //printf_vector("swicms->adjust.rep", " %d", swicms->adjust, MAX_CHAN_DIM); 551 } 552 return 1; 553 } 554 555 backgr_cache_end = (num_frames - num_frames % swicms->cache_resolution) / swicms->cache_resolution; 556 557 speech_nn = (cache_end - cache_start) * swicms->cache_resolution; 558 backgr_nn = backgr_cache_end * swicms->cache_resolution - speech_nn; 559 560 for (i = 0; i < MAX_CHAN_DIM; i++) 561 { 562 speech_avg[i] = 0; 563 backgr_avg[i] = 0; 564 for (j = cache_start; j < cache_end; j++) 565 speech_avg[i] += swicms->cached_sections[j][i]; 566 for (j = 0; j < cache_start; j++) 567 backgr_avg[i] += swicms->cached_sections[j][i]; 568 for (j = cache_end; j < backgr_cache_end; j++) 569 backgr_avg[i] += swicms->cached_sections[j][i]; 570 if (speech_nn == 0 && backgr_nn > 0) 571 { 572 backgr_avg[i] /= backgr_nn; 573 speech_avg[i] = backgr_avg[i]; 574 speech_nn = backgr_nn; 575 } 576 else if (speech_nn > 0 && backgr_nn == 0) 577 { 578 speech_avg[i] /= speech_nn; 579 backgr_avg[i] = speech_avg[i]; 580 backgr_nn = speech_nn; 581 } 582 else if (speech_nn > 0 && backgr_nn > 0) 583 { 584 speech_avg[i] /= speech_nn; 585 backgr_avg[i] /= backgr_nn; 586 } 587 else 588 { 589 return 0; 590 } 591 592 avg[i] = (sbindex * speech_avg[i] + (100 - sbindex) * backgr_avg[i] + 50) / 100; 593 } 594 nn = (sbindex * speech_nn + (100 - sbindex) * backgr_nn + 50) / 100; 595 596 for (i = 0, ff = 0; i < MAX_CHAN_DIM; i++) 597 { 598 ff += (swicms->lda_tmn[i] - avg[i]); 599 } 600 ff /= MAX_CHAN_DIM; /* sum is now the average offset from TMN */ 601 if (ff > 5) 602 { 603 PLogError("Warning: bad utt mean during swicms_update() (moffs=%d)\n", ff); 604 //printf_vector("swicms->adjust.rep", " %d", swicms->adjust, MAX_CHAN_DIM); 605 return 1; 606 } 607 ff = swicms->forget_factor; 608 if (ff < 9999) 609 { 610 for (i = 0; i < MAX_CHAN_DIM; i++) 611 { 612 swicms->lda_cmn[i] = (swicms->lda_cmn[i] * ff + avg[i] * nn + (ff + nn) / 2) / (ff + nn); 613 swicms->adjust[i] = swicms->lda_tmn[i] - swicms->lda_cmn[i]; 614 } 615 } 616 617 if (SWICMS_DEBUG) 618 { 619 imeldata temp[MAX_CHAN_DIM]; 620 PLogMessage("swicms_update() used %d frames (%d-%d)", nn, speech_start, speech_end); 621 622 for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_cmn[i]; 623 inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); 624 /* use this dump, to put back into CREC.Frontend.swicms.cmn */ 625 printf_vector("swicms.cmn(r) ", " %d", temp, MAX_CHAN_DIM); 626 627 //printf_vector("swicms.lda_cmn ", " %d", &swicms.lda_cmn [0], MAX_CHAN_DIM); 628 //printf_vector("swicms.lda_tmn ", " %d", &swicms.lda_tmn [0], MAX_CHAN_DIM); 629 //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); 630 //printf_vector("avg.speech ", " %d", avg, MAX_CHAN_DIM); 631 } 632 else 633 { 634 #ifndef NDEBUG 635 //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); 636 #endif 637 } 638 swicms->num_frames_in_cmn += nn; 639 return 0; 640 } 641 642 int swicms_lda_process(swicms_norm_info* swicms, preprocessed* prep) 643 { 644 int i; 645 646 for (i = 0; i < MAX_CHAN_DIM; i++) swicms->lda_tmn[i] = swicms->tmn[i]; 647 for (i = 0; i < MAX_CHAN_DIM; i++) swicms->lda_cmn[i] = swicms->cmn[i]; 648 linear_transform_frame(prep, swicms->lda_tmn, 1 /*do_shift*/); 649 linear_transform_frame(prep, swicms->lda_cmn, 1 /*do_shift*/); 650 651 for (i = 0; i < MAX_CHAN_DIM; i++) 652 { 653 swicms->adjust[i] = swicms->lda_tmn[i] - swicms->lda_cmn[i]; 654 } 655 656 #ifndef NDEBUG 657 //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); 658 #endif 659 swicms->is_valid = 1; 660 swicms->_prep = prep; 661 662 if(SWICMS_DEBUG) { 663 imeldata temp[MAX_CHAN_DIM]; 664 printf_vector("swicms->cmn ", " %d", swicms->cmn, MAX_CHAN_DIM); 665 printf_vector("swicms->lda_cmn ", " %d", swicms->lda_cmn, MAX_CHAN_DIM); 666 //printf_vector("swicms->tmn ", " %d", swicms->tmn, MAX_CHAN_DIM); 667 //printf_vector("swicms->lda_tmn ", " %d", swicms->lda_tmn, MAX_CHAN_DIM); 668 //printf_vector("swicms->adjust ", " %d", swicms->adjust, MAX_CHAN_DIM); 669 670 //for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_tmn[i]; 671 //inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); 672 //printf_vector("swicms->tmn(r) ", " %d", temp, MAX_CHAN_DIM); 673 674 for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_cmn[i]; 675 inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); 676 printf_vector("swicms->cmn(r) ", " %d", temp, MAX_CHAN_DIM); 677 } 678 return 0; 679 } 680 681 682 683