1 /*---------------------------------------------------------------------------* 2 * swicms.c * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 #include <string.h> 21 #include"swicms.h" 22 #include"srec_sizes.h" 23 #include"prelib.h" 24 25 #include "passert.h" 26 #include "ESR_Session.h" 27 #include "ESR_SessionType.h" 28 #include "IntArrayList.h" 29 #include "portable.h" 30 31 #define printf_vector(HEAD, FMT, PTR, NN) { int i; LCHAR buffer[256]; sprintf(buffer, HEAD); sprintf(buffer + LSTRLEN(buffer), " %p", (void *)PTR); for (i=0; i<(NN); ++i) sprintf(buffer + LSTRLEN(buffer), FMT, PTR[i]); PLogMessage(buffer); } 32 33 /* Cross-utterance CMN calculation: 34 We try to normalize the speech frames before they get to the recognizer. 35 The speech frames are LDA-processed mfcc-with-dynamic feature vectors. 36 We collect these speech frames during recognition. At the end of 37 recognition we exclude the silence frames from the collected data, and 38 generate a new channel average based on the previous average and the new 39 data, using an exponential decay formula. 40 41 In-utterance CMN calculation: 42 A new short-term average mechanism was introduced, with faster update, 43 to improve recognition on the very first recognition after init or reset. 44 We wait for a minimum number of new data frames to apply this. We also 45 disable the fast updater after some frames, because we assume the 46 cross-utterance estimator to be more reliable, particularly in its 47 ability to exclude silence frames from the calculation. 48 */ 49 50 /* default settings for cross-utterance cms */ 51 #define SWICMS_FORGET_FACTOR_DEFAULT 400 /* effective frms of history */ 52 #define SWICMS_SBINDEX_DEFAULT 100 /* use speech frames only */ 53 /* #define SWICMS_CACHE_RESOLUTION_DEFAULT see swicms.h */ 54 /* #define SWICMS_CACHE_SIZE_DEFAULT see swicms.h */ 55 56 /* default settings for in-utterance cms */ 57 #define SWICMS_INUTT_FORGET_FACTOR2_DISABLE 65535 /* any large number */ 58 #define SWICMS_INUTT_FORGET_FACTOR2_DEFAULT SWICMS_INUTT_FORGET_FACTOR2_DISABLE 59 /* disable this when cross-utt become more reliable */ 60 #define SWICMS_INUTT_DISABLE_AFTER_FRAMES 200 61 /* wait while the estimate is poor */ 62 #define SWICMS_INUTT_ENABLE_AFTER_FRAMES 10 63 64 /** 65 * Logging Stuff 66 */ 67 #define LOG_LEVEL 2 68 #define MODULE_NAME L("swicms.c") 69 //static const char* MTAG = MODULE_NAME; 70 71 static const char *rcsid = 0 ? (const char *) &rcsid : 72 "$Id: swicms.c,v 1.21.6.16 2008/06/05 19:00:55 stever Exp $"; 73 74 static ESR_BOOL SWICMS_DEBUG = ESR_FALSE; 75 76 /* these are good values from cmn/tmn files */ 77 static const imeldata gswicms_cmn1_8 [MAX_CHAN_DIM] = 78 { 79 158, 141, 99, 125, 101, 162, 113, 138, 128, 143, 123, 141, 80 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 81 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 82 }; 83 84 static const imeldata gswicms_cmn1_11 [MAX_CHAN_DIM] = 85 { 86 163, 121, 120, 114, 124, 139, 144, 108, 150, 119, 146, 124, 87 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 88 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 89 }; 90 91 static const imeldata gswicms_tmn1_8 [MAX_CHAN_DIM] = 92 { 93 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 94 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 95 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 96 }; 97 98 static const imeldata gswicms_tmn1_11 [MAX_CHAN_DIM] = 99 { 100 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 101 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 102 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127 103 }; 104 105 static ESR_ReturnCode GetSomeIntsIfAny( const LCHAR* parname, imeldata* parvalue, size_t reqSize) 106 { 107 size_t i, size; 108 ESR_ReturnCode rc; 109 ESR_BOOL exists; 110 IntArrayList* intList = 0; 111 112 CHKLOG(rc, ESR_SessionContains(parname, &exists)); 113 if (exists) { 114 rc = ESR_SessionGetProperty(parname, (void**)&intList, TYPES_INTARRAYLIST); 115 if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) { 116 /* no match will revert to default data already in static array */ 117 PLogError(L("Error reading %s from session: %s"), parname, ESR_rc2str(rc)); 118 return ESR_FATAL_ERROR; 119 } 120 else if (rc == ESR_SUCCESS) { 121 CHKLOG(rc, IntArrayListGetSize(intList, &size)); 122 if(size != reqSize) { 123 PLogError(L("Error reading %s from session, expected len %d: %s"), parname, reqSize, ESR_rc2str(rc)); 124 return ESR_FATAL_ERROR; 125 } 126 if(reqSize == 1) 127 CHKLOG(rc, IntArrayListGet(intList, 0, parvalue)); 128 else { 129 for (i=0; i<size; ++i) 130 CHKLOG(rc, IntArrayListGet(intList, i, &parvalue[i])); 131 } 132 } 133 } 134 return ESR_SUCCESS; 135 CLEANUP: 136 return rc; 137 } 138 139 int swicms_init(swicms_norm_info* swicms) 140 { 141 ESR_ReturnCode rc = ESR_SUCCESS; 142 size_t i; 143 ESR_BOOL exists, sessionExists; 144 size_t sample_rate; 145 146 /* defaults */ 147 swicms->sbindex = SWICMS_SBINDEX_DEFAULT; 148 swicms->cached_num_frames = 0; 149 swicms->forget_factor = SWICMS_FORGET_FACTOR_DEFAULT; 150 swicms->cache_resolution = SWICMS_CACHE_RESOLUTION_DEFAULT; 151 swicms->num_frames_in_cmn = 0; 152 153 CHKLOG(rc, ESR_SessionExists(&sessionExists)); 154 155 if (sessionExists) 156 { /* We'll assume this rate is valid or someone else will be complaining. SteveR */ 157 rc = ESR_SessionGetSize_t ( L ( "CREC.Frontend.samplerate" ), &sample_rate ); 158 159 if ( rc != ESR_SUCCESS ) 160 return ( rc ); 161 } 162 else 163 sample_rate = 11025; 164 165 /* init the data structures by copying the static data so that we can have a copy if we need to reset */ 166 if ( sample_rate == 8000 ) 167 { 168 for ( i = 0; i < MAX_CHAN_DIM; i++ ) 169 { 170 swicms->cmn [i] = gswicms_cmn1_8 [i]; 171 swicms->tmn [i] = gswicms_tmn1_8 [i]; 172 // _lda_*mn below are OK, but are recalculated in swicms_lda_process() 173 swicms->lda_cmn [i] = 0; /* calculated by swicms_lda_process() */ 174 swicms->lda_tmn [i] = 0; /* calculated by swicms_lda_process() */ 175 } 176 } 177 else 178 { 179 for ( i = 0; i < MAX_CHAN_DIM; i++ ) 180 { 181 swicms->cmn [i] = gswicms_cmn1_11 [i]; 182 swicms->tmn [i] = gswicms_tmn1_11 [i]; 183 // _lda_*mn below are OK, but are recalculated in swicms_lda_process() 184 swicms->lda_cmn [i] = 0; /* calculated by swicms_lda_process() */ 185 swicms->lda_tmn [i] = 0; /* calculated by swicms_lda_process() */ 186 } 187 } 188 CHKLOG(rc, ESR_SessionExists(&sessionExists)); 189 190 if (sessionExists) 191 { 192 const LCHAR* parname = L("CREC.Frontend.swicms.debug"); 193 CHKLOG(rc, ESR_SessionContains(parname, &exists)); 194 if (exists) { 195 rc = ESR_SessionGetBool(parname, &SWICMS_DEBUG); 196 if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) { 197 PLOG_DBG_ERROR((L("Error reading %s from session: %s"), parname, ESR_rc2str(rc))); 198 return rc; 199 } 200 } 201 202 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.forget_factor"), 203 &swicms->forget_factor, 1); 204 if(rc != ESR_SUCCESS) return rc; 205 206 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.sbindex"), 207 &swicms->sbindex, 1); 208 if(rc != ESR_SUCCESS) return rc; 209 210 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn"), 211 &swicms->cmn[0], MAX_CHAN_DIM); 212 if(rc != ESR_SUCCESS) return rc; 213 214 if ( sample_rate == 8000 ) 215 { 216 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn8"), &swicms->cmn[0], MAX_CHAN_DIM); 217 218 if(rc != ESR_SUCCESS) 219 return rc; 220 } 221 else 222 { 223 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn11"), &swicms->cmn[0], MAX_CHAN_DIM); 224 225 if(rc != ESR_SUCCESS) 226 return rc; 227 } 228 229 rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.tmn"), 230 &swicms->tmn[0], MAX_CHAN_DIM); 231 if(rc != ESR_SUCCESS) return rc; 232 } 233 234 swicms->is_valid = 0; 235 for (i = 0; i < MAX_CHAN_DIM; i++) 236 swicms->adjust[i] = 255; 237 238 #ifdef SREC_ENGINE_VERBOSE_LOGGING 239 PLogMessage("swicms->forget_factor = %d\n", swicms->forget_factor); 240 PLogMessage("swicms->cache_resolution = %d\n", swicms->cache_resolution); 241 PLogMessage("swicms->sbindex = %d\n", swicms->sbindex); 242 #endif 243 244 /* in-utt cms parameters */ 245 swicms->inutt.forget_factor2 = SWICMS_INUTT_FORGET_FACTOR2_DEFAULT; 246 swicms->inutt.disable_after = 200; 247 swicms->inutt.enable_after = 10; /* in-utt is less reliable */ 248 swicms->inutt.num_bou_frames_to_skip = 20; /* silence frames! see windback */ 249 swicms->inutt.num_frames_since_bou = 0; 250 swicms->inutt.num_frames_in_accum = 0; 251 for(i=0; i<MAX_CHAN_DIM; i++) swicms->inutt.accum[i] = 0; 252 253 if (sessionExists) { 254 rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.forget_factor2"), 255 &swicms->inutt.forget_factor2, 1); 256 if(rc != ESR_SUCCESS) return rc; 257 258 rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.disable_after"), 259 &swicms->inutt.disable_after, 1); 260 if(rc != ESR_SUCCESS) return rc; 261 262 rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.enable_after"), 263 &swicms->inutt.enable_after, 1); 264 if(rc != ESR_SUCCESS) return rc; 265 266 /* we need to estimate the in-utt cmn from speech frames only! so let's 267 make sure to skip some frames before collecting data, */ 268 ESR_SessionContains(L("CREC.Frontend.start_windback"), &exists); 269 if (exists) { 270 ESR_BOOL do_skip_even_frames = ESR_TRUE; 271 ESR_SessionGetBool(L("CREC.Frontend.do_skip_even_frames"), &do_skip_even_frames); 272 ESR_SessionGetInt(L("CREC.Frontend.start_windback"), &swicms->inutt.num_bou_frames_to_skip); 273 if( do_skip_even_frames) 274 swicms->inutt.num_bou_frames_to_skip /= 2; 275 swicms->inutt.num_bou_frames_to_skip -= 5; /* ensure spch frames only */ 276 } 277 } 278 279 return 0; 280 CLEANUP: 281 return rc; 282 } 283 284 285 ESR_ReturnCode swicms_get_cmn ( swicms_norm_info* swicms, LCHAR *cmn_params, size_t* len ) 286 { 287 int dim_count; 288 int i; 289 imeldata temp[MAX_CHAN_DIM]; 290 const size_t INT_LENGTH = 12; 291 292 if ( swicms->_prep != NULL ) /* lda exists give them transformed lda. */ 293 { 294 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) 295 temp [dim_count] = swicms->lda_cmn [dim_count]; 296 inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); 297 } 298 else /* lda does not exist give them raw cmn values */ 299 { 300 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) 301 temp [dim_count] = swicms->cmn [dim_count]; 302 } 303 304 for ( dim_count = 0, i = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) 305 { 306 i += sprintf( cmn_params + i, dim_count==0 ? "%d" : ",%d", temp [dim_count] ); 307 if (i + INT_LENGTH >= *len) { 308 *len = MAX_CHAN_DIM * (INT_LENGTH + 2) * sizeof(LCHAR); 309 return ESR_BUFFER_OVERFLOW; 310 } 311 } 312 313 return ESR_SUCCESS; 314 } 315 316 317 ESR_ReturnCode swicms_set_cmn ( swicms_norm_info* swicms, const char *cmn_params ) 318 { 319 ESR_ReturnCode set_status; 320 int length_of_params; 321 int dim_count; 322 int got_word; 323 int current_position; 324 char *copy_of_params; 325 char *parsed_strings [MAX_CHAN_DIM]; 326 int temp_cmn [MAX_CHAN_DIM]; 327 328 length_of_params = strlen ( cmn_params ) + 1; 329 copy_of_params = (char*)MALLOC ( length_of_params, NULL ); 330 331 if ( copy_of_params != NULL ) 332 { 333 set_status = ESR_SUCCESS; 334 memcpy ( copy_of_params, cmn_params, length_of_params ); 335 dim_count = 0; 336 current_position = 0; 337 got_word = 0; 338 parsed_strings [dim_count] = copy_of_params + current_position; 339 340 while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) ) 341 { 342 switch ( *( copy_of_params + current_position ) ) 343 { 344 case '\0': 345 if ( got_word == 1 ) 346 { 347 if ( dim_count == ( MAX_CHAN_DIM - 1 ) ) 348 dim_count++; 349 else 350 { 351 PLogError ( "Channel Normalization : Missing Params Must Contain %d Params\n", MAX_CHAN_DIM ); 352 set_status = ESR_INVALID_ARGUMENT; 353 } 354 } 355 else 356 { 357 PLogError ( "Channel Normalization : Missing Params Mus Contain %d Params\n", MAX_CHAN_DIM ); 358 set_status = ESR_INVALID_ARGUMENT; 359 } 360 break; 361 362 case ',': 363 if ( got_word == 1 ) 364 { 365 if ( dim_count < ( MAX_CHAN_DIM - 1 ) ) 366 { 367 dim_count++; 368 *( copy_of_params + current_position) = '\0'; 369 current_position++; 370 371 if ( current_position == length_of_params ) 372 { 373 PLogError ( "Channel Normalization : Delimiter At End Of Param String\n" ); 374 set_status = ESR_INVALID_ARGUMENT; 375 } 376 parsed_strings [dim_count] = copy_of_params + current_position; 377 got_word = 0; 378 } 379 else 380 { 381 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); 382 set_status = ESR_INVALID_ARGUMENT; 383 } 384 } 385 else 386 { 387 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); 388 set_status = ESR_INVALID_ARGUMENT; 389 } 390 break; 391 392 case '0': 393 case '1': 394 case '2': 395 case '3': 396 case '4': 397 case '5': 398 case '6': 399 case '7': 400 case '8': 401 case '9': 402 got_word = 1; 403 current_position++; 404 405 if ( current_position == length_of_params ) 406 { 407 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM ); 408 set_status = ESR_INVALID_ARGUMENT; 409 } 410 break; 411 412 default: 413 PLogError ( "Channel Normalization : Invalid Param : %c : Params Must Contain Only Digits\n" ); 414 set_status = ESR_INVALID_ARGUMENT; 415 break; 416 } 417 } 418 if ( set_status == ESR_SUCCESS ) 419 { 420 dim_count = 0; 421 422 while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) ) 423 { 424 temp_cmn [dim_count] = atoi ( parsed_strings [dim_count] ); 425 426 if ( ( temp_cmn [dim_count] < 0 ) || ( temp_cmn [dim_count] > 255 ) ) 427 { 428 set_status = ESR_INVALID_ARGUMENT; 429 } 430 431 dim_count++; 432 } 433 if ( set_status == ESR_SUCCESS ) 434 { 435 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ ) 436 swicms->cmn [dim_count] = temp_cmn [dim_count]; 437 if ( swicms->_prep != NULL ) /* Set now if NULL it will automatically be set on first utterance */ 438 linear_transform_frame(swicms->_prep, swicms->lda_cmn, 1 /*do_shift*/); 439 } 440 } 441 FREE ( copy_of_params ); 442 } 443 else 444 { 445 PLogError ( "Channel Normalization Out Of Memory Error\n" ); 446 set_status = ESR_OUT_OF_MEMORY; 447 } 448 swicms->num_frames_in_cmn = 0; 449 return ( set_status ); 450 } 451 452 453 int swicms_cache_frame(swicms_norm_info* swicms, imeldata* frame, int dimen) 454 { 455 int i; 456 imeldata *pcache, *pframe; 457 458 ASSERT(dimen == MAX_CHAN_DIM); 459 i = swicms->cached_num_frames / swicms->cache_resolution; 460 if (i < SWICMS_CACHE_SIZE_DEFAULT) 461 { 462 pcache = swicms->cached_sections[ i]; 463 if (swicms->cached_num_frames % swicms->cache_resolution == 0) 464 { 465 for (i = 0; i < MAX_CHAN_DIM; i++) *pcache++ = 0; 466 pcache -= MAX_CHAN_DIM; 467 } 468 pframe = frame; 469 for (i = 0; i < MAX_CHAN_DIM; i++) *pcache++ += *pframe++; 470 swicms->cached_num_frames++; 471 } 472 473 return 0; 474 } 475 476 int apply_channel_normalization_in_swicms(swicms_norm_info *swicms, 477 imeldata* oframe, 478 imeldata* iframe, int dimen) 479 { 480 int ii; 481 ASSERT(dimen == MAX_CHAN_DIM); 482 483 /* IF inutt is activated at all */ 484 if(swicms->inutt.forget_factor2 != SWICMS_INUTT_FORGET_FACTOR2_DISABLE) { 485 /* AND IF we have not disabled it (due to x-utt more reliable) */ 486 if(swicms->inutt.num_frames_in_accum < swicms->inutt.disable_after) { 487 /* AND IF we have skipped past the silence frames */ 488 if( swicms->inutt.num_frames_since_bou >= swicms->inutt.num_bou_frames_to_skip){ 489 swicms->inutt.num_frames_in_accum++; 490 for(ii=0;ii<dimen;ii++) swicms->inutt.accum[ii] += iframe[ii]; 491 /* AND IF we've already seen at least 10 frames (presumably) of speech */ 492 if(swicms->inutt.num_frames_in_accum>swicms->inutt.enable_after) { 493 /* THEN we update the adjustment in-line with the current utterance! */ 494 for(ii=0;ii<dimen;ii++) { 495 imeldata denom = ( swicms->inutt.forget_factor2 496 + swicms->inutt.num_frames_in_accum ); 497 /* tmp: weighted average of the old lda_cmn and the new accum */ 498 imeldata tmp=(swicms->lda_cmn[ii]*swicms->inutt.forget_factor2 499 + swicms->inutt.accum[ii] + denom/2) / denom; 500 swicms->adjust[ii] = swicms->lda_tmn[ii] - tmp; 501 } 502 //printf_vector("swicms->adjust2 "," %d",swicms->adjust, dimen); 503 } 504 } 505 } 506 swicms->inutt.num_frames_since_bou++; 507 } 508 509 for (ii = 0; ii < dimen; ii++) 510 oframe[ii] = MAKEBYTE(iframe[ii] + swicms->adjust[ii]); 511 return 0; 512 } 513 514 int swicms_update(swicms_norm_info* swicms, int speech_start, int speech_end) 515 { 516 int i, j; 517 asr_int32_t speech_avg[MAX_CHAN_DIM], backgr_avg[MAX_CHAN_DIM], avg[MAX_CHAN_DIM]; 518 int ff; 519 int nn, speech_nn, backgr_nn; 520 int num_frames = swicms->cached_num_frames; 521 int cache_start, cache_end, backgr_cache_end; 522 int sbindex = swicms->sbindex; 523 524 /* init for utterance */ 525 swicms->inutt.num_frames_since_bou = 0; 526 527 swicms->cached_num_frames = 0; 528 cache_start = speech_start; 529 cache_start -= (cache_start % swicms->cache_resolution); 530 cache_start /= swicms->cache_resolution; 531 532 if (speech_end == MAXframeID) 533 { 534 cache_end = SWICMS_CACHE_SIZE_DEFAULT; 535 } 536 else 537 { 538 if (speech_end < num_frames) 539 cache_end = speech_end; 540 else 541 cache_end = num_frames; 542 cache_end -= (cache_end % swicms->cache_resolution); 543 cache_end /= swicms->cache_resolution; 544 } 545 546 if (num_frames == 0 || speech_end == 0 || speech_start == speech_end || speech_end == MAXframeID) 547 { 548 if (speech_end != 0 || speech_start != 0) 549 PLogError("Warning: speech_bounds (%d,%d) swicms->cached_num_frames (%d)\n", 550 speech_start, speech_end, num_frames); 551 if (SWICMS_DEBUG) { 552 //printf_vector("swicms->adjust.rep", " %d", swicms->adjust, MAX_CHAN_DIM); 553 } 554 return 1; 555 } 556 557 backgr_cache_end = (num_frames - num_frames % swicms->cache_resolution) / swicms->cache_resolution; 558 559 speech_nn = (cache_end - cache_start) * swicms->cache_resolution; 560 backgr_nn = backgr_cache_end * swicms->cache_resolution - speech_nn; 561 562 for (i = 0; i < MAX_CHAN_DIM; i++) 563 { 564 speech_avg[i] = 0; 565 backgr_avg[i] = 0; 566 for (j = cache_start; j < cache_end; j++) 567 speech_avg[i] += swicms->cached_sections[j][i]; 568 for (j = 0; j < cache_start; j++) 569 backgr_avg[i] += swicms->cached_sections[j][i]; 570 for (j = cache_end; j < backgr_cache_end; j++) 571 backgr_avg[i] += swicms->cached_sections[j][i]; 572 if (speech_nn == 0 && backgr_nn > 0) 573 { 574 backgr_avg[i] /= backgr_nn; 575 speech_avg[i] = backgr_avg[i]; 576 speech_nn = backgr_nn; 577 } 578 else if (speech_nn > 0 && backgr_nn == 0) 579 { 580 speech_avg[i] /= speech_nn; 581 backgr_avg[i] = speech_avg[i]; 582 backgr_nn = speech_nn; 583 } 584 else if (speech_nn > 0 && backgr_nn > 0) 585 { 586 speech_avg[i] /= speech_nn; 587 backgr_avg[i] /= backgr_nn; 588 } 589 else 590 { 591 return 0; 592 } 593 594 avg[i] = (sbindex * speech_avg[i] + (100 - sbindex) * backgr_avg[i] + 50) / 100; 595 } 596 nn = (sbindex * speech_nn + (100 - sbindex) * backgr_nn + 50) / 100; 597 598 for (i = 0, ff = 0; i < MAX_CHAN_DIM; i++) 599 { 600 ff += (swicms->lda_tmn[i] - avg[i]); 601 } 602 ff /= MAX_CHAN_DIM; /* sum is now the average offset from TMN */ 603 if (ff > 5) 604 { 605 PLogError("Warning: bad utt mean during swicms_update() (moffs=%d)\n", ff); 606 //printf_vector("swicms->adjust.rep", " %d", swicms->adjust, MAX_CHAN_DIM); 607 return 1; 608 } 609 ff = swicms->forget_factor; 610 if (ff < 9999) 611 { 612 for (i = 0; i < MAX_CHAN_DIM; i++) 613 { 614 swicms->lda_cmn[i] = (swicms->lda_cmn[i] * ff + avg[i] * nn + (ff + nn) / 2) / (ff + nn); 615 swicms->adjust[i] = swicms->lda_tmn[i] - swicms->lda_cmn[i]; 616 } 617 } 618 619 if (SWICMS_DEBUG) 620 { 621 imeldata temp[MAX_CHAN_DIM]; 622 PLogMessage("swicms_update() used %d frames (%d-%d)", nn, speech_start, speech_end); 623 624 for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_cmn[i]; 625 inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); 626 /* use this dump, to put back into CREC.Frontend.swicms.cmn */ 627 printf_vector("swicms.cmn(r) ", " %d", temp, MAX_CHAN_DIM); 628 629 //printf_vector("swicms.lda_cmn ", " %d", &swicms.lda_cmn [0], MAX_CHAN_DIM); 630 //printf_vector("swicms.lda_tmn ", " %d", &swicms.lda_tmn [0], MAX_CHAN_DIM); 631 //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); 632 //printf_vector("avg.speech ", " %d", avg, MAX_CHAN_DIM); 633 } 634 else 635 { 636 #ifndef NDEBUG 637 //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); 638 #endif 639 } 640 swicms->num_frames_in_cmn += nn; 641 return 0; 642 } 643 644 int swicms_lda_process(swicms_norm_info* swicms, preprocessed* prep) 645 { 646 int i; 647 648 for (i = 0; i < MAX_CHAN_DIM; i++) swicms->lda_tmn[i] = swicms->tmn[i]; 649 for (i = 0; i < MAX_CHAN_DIM; i++) swicms->lda_cmn[i] = swicms->cmn[i]; 650 linear_transform_frame(prep, swicms->lda_tmn, 1 /*do_shift*/); 651 linear_transform_frame(prep, swicms->lda_cmn, 1 /*do_shift*/); 652 653 for (i = 0; i < MAX_CHAN_DIM; i++) 654 { 655 swicms->adjust[i] = swicms->lda_tmn[i] - swicms->lda_cmn[i]; 656 } 657 658 #ifndef NDEBUG 659 //printf_vector("swicms->adjust", " %d", swicms->adjust, MAX_CHAN_DIM); 660 #endif 661 swicms->is_valid = 1; 662 swicms->_prep = prep; 663 664 if(SWICMS_DEBUG) { 665 imeldata temp[MAX_CHAN_DIM]; 666 printf_vector("swicms->cmn ", " %d", swicms->cmn, MAX_CHAN_DIM); 667 printf_vector("swicms->lda_cmn ", " %d", swicms->lda_cmn, MAX_CHAN_DIM); 668 //printf_vector("swicms->tmn ", " %d", swicms->tmn, MAX_CHAN_DIM); 669 //printf_vector("swicms->lda_tmn ", " %d", swicms->lda_tmn, MAX_CHAN_DIM); 670 //printf_vector("swicms->adjust ", " %d", swicms->adjust, MAX_CHAN_DIM); 671 672 //for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_tmn[i]; 673 //inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); 674 //printf_vector("swicms->tmn(r) ", " %d", temp, MAX_CHAN_DIM); 675 676 for(i=0;i<MAX_CHAN_DIM;i++) temp[i]=swicms->lda_cmn[i]; 677 inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/); 678 printf_vector("swicms->cmn(r) ", " %d", temp, MAX_CHAN_DIM); 679 } 680 return 0; 681 } 682 683 684 685