1 /*---------------------------------------------------------------------------* 2 * swicms.h * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 #ifndef __SWICMS_H__ 21 #define __SWICMS_H__ 22 23 #include"all_defs.h" 24 #include"sizes.h" 25 #include"fronttyp.h" 26 #include"pre_desc.h" 27 28 #define DEBUG_SWICMS 0 29 #define MAX_CACHED_FRAMES 800 30 #define SWICMS_CACHE_RESOLUTION_DEFAULT 8 31 #define SWICMS_CACHE_SIZE_DEFAULT 100 /* equals #frames/resolution */ 32 33 /** 34 * This is used for casting in debugger, just type (imelvec*)tmn. 35 */ 36 typedef struct 37 { 38 imeldata vec[MAX_CHAN_DIM]; 39 } 40 imelvec; 41 42 /** 43 * Does channel normalization without using fine recognition segmenation. It remembers the 44 * frames of speech and uses that as a channel mean for the next utterance. A forget_factor 45 * is used to weigh the new speech mean estimate with an older one. 46 */ 47 typedef struct 48 { 49 imeldata tmn [MAX_CHAN_DIM]; /* target mean */ 50 imeldata cmn [MAX_CHAN_DIM]; /* channel mean */ 51 52 imeldata lda_tmn [MAX_CHAN_DIM]; /* target mean */ 53 imeldata lda_cmn [MAX_CHAN_DIM]; /* channel mean */ 54 55 imeldata adjust[MAX_CHAN_DIM]; /* target less channel */ 56 57 int is_valid; 58 int forget_factor; /* in frames, mass of cmn average */ 59 int sbindex; /* speech to background index 60 100 -> use only speech to calculate CMN 61 000 -> use only background to calculate CMN 62 050 -> use half/half .. 63 all numbers in between are acceptable */ 64 65 int num_frames_in_cmn; /* num frames used to estimate cmn (or lda_cmn) */ 66 67 /* for in-utterance channel normalization */ 68 struct { 69 int forget_factor2; /* cmn is given this weight to start off */ 70 int disable_after; /* we disable in-utt cms after this many fr*/ 71 int enable_after; /* we enable in-utt cms after this many fr*/ 72 int num_bou_frames_to_skip; /* don't start accum 'til this many frames */ 73 int num_frames_since_bou; /* counter for above, bou=begin-of-utt */ 74 int num_frames_in_accum; /* number of frames in accum */ 75 imeldata accum[MAX_CHAN_DIM]; /* accumulates frames of the current utt */ 76 } inutt; 77 78 int cached_num_frames; /* we cache frames, until recognition is done 79 and can calculate speech mean from these */ 80 int cache_resolution; /* we'll avg this many frames per section */ 81 imeldata cached_sections[SWICMS_CACHE_SIZE_DEFAULT][MAX_CHAN_DIM]; 82 /*const*/ preprocessed* _prep; 83 } 84 swicms_norm_info; 85 86 int swicms_init(swicms_norm_info* swicms); 87 int swicms_cache_frame(swicms_norm_info* swicms, imeldata* frame, int dimen); 88 int apply_channel_normalization_in_swicms(swicms_norm_info *swicms, 89 imeldata* oframe, imeldata* iframe, 90 int dimen); 91 int swicms_lda_process(swicms_norm_info* swicms, preprocessed* prep); 92 93 int swicms_update(swicms_norm_info* swicms, int speech_start_frame, int speech_end_frame); 94 95 ESR_ReturnCode swicms_set_cmn(swicms_norm_info *swicms, const LCHAR *new_cmn_params ); 96 ESR_ReturnCode swicms_get_cmn(swicms_norm_info *swicms, LCHAR *cmn_params, size_t* len ); 97 98 #if DEBUG_SWICMS 99 int swicms_compare(swicms_norm_info* swicms, imeldata* imelda_adjust); 100 int swicms_dump_stats(swicms_norm_info* swicms); 101 #else 102 #define swicms_compare(swicms,ia) 103 #define swicms_dump_stats(swicms) 104 #endif 105 106 #endif 107 108