Home | History | Annotate | Download | only in include
      1 /*---------------------------------------------------------------------------*
      2  *  swicms.h                                                                 *
      3  *                                                                           *
      4  *  Copyright 2007, 2008 Nuance Communciations, Inc.                         *
      5  *                                                                           *
      6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
      7  *  you may not use this file except in compliance with the License.         *
      8  *                                                                           *
      9  *  You may obtain a copy of the License at                                  *
     10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
     11  *                                                                           *
     12  *  Unless required by applicable law or agreed to in writing, software      *
     13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
     14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
     15  *  See the License for the specific language governing permissions and      *
     16  *  limitations under the License.                                           *
     17  *                                                                           *
     18  *---------------------------------------------------------------------------*/
     19 
     20 #ifndef __SWICMS_H__
     21 #define __SWICMS_H__
     22 
     23 #include"all_defs.h"
     24 #include"sizes.h"
     25 #include"fronttyp.h"
     26 #include"pre_desc.h"
     27 
     28 #define DEBUG_SWICMS        0 /* non-zero: swicms_compare()/swicms_dump_stats() are declared as functions; 0: they are empty macros */
     29 #define MAX_CACHED_FRAMES 800 /* NOTE(review): equals SWICMS_CACHE_SIZE_DEFAULT * SWICMS_CACHE_RESOLUTION_DEFAULT (100 * 8); not referenced in this header */
     30 #define SWICMS_CACHE_RESOLUTION_DEFAULT   8 /* presumably the default for swicms_norm_info.cache_resolution -- confirm in the implementation */
     31 #define SWICMS_CACHE_SIZE_DEFAULT         100 /* equals #frames/resolution; number of rows in swicms_norm_info.cached_sections */
     32 
     33 /**
     34  * This is used for casting in debugger, just type (imelvec*)tmn.
         * It wraps a single imeldata array of MAX_CHAN_DIM elements, the same
         * shape as the mean vectors (tmn, cmn, ...) held in swicms_norm_info, so
         * that such a vector can be viewed as one object.
     35  */
     36 typedef struct
     37 {
     38   imeldata vec[MAX_CHAN_DIM]; /* the vector's elements */
     39 }
     40 imelvec;
     41 
     42 /**
     43  * Does channel normalization without using fine recognition segmentation.  It remembers the
     44  * frames of speech and uses that as a channel mean for the next utterance.  A forget_factor
     45  * is used to weigh the new speech mean estimate with an older one.
     46  */
     47 typedef struct
     48 {
     49   imeldata tmn [MAX_CHAN_DIM];                 /* target mean */
     50   imeldata cmn [MAX_CHAN_DIM];                 /* channel mean */
     51 
     52   imeldata lda_tmn [MAX_CHAN_DIM];                 /* target mean, lda_ counterpart of tmn (presumably in LDA-transformed space -- confirm) */
     53   imeldata lda_cmn [MAX_CHAN_DIM];                 /* channel mean, lda_ counterpart of cmn (presumably in LDA-transformed space -- confirm) */
     54 
     55   imeldata adjust[MAX_CHAN_DIM]; /* target less channel, i.e. target mean minus channel mean */
     56 
     57   int is_valid;                /* NOTE(review): validity flag; what exactly it guards is not visible in this header */
     58   int forget_factor;           /* in frames, mass of cmn average */
     59   int sbindex;                 /* speech to background index (0..100):
     60         100 -> use only speech to calculate CMN
     61         000 -> use only background to calculate CMN
     62         050 -> use half/half ..
     63         all numbers in between are acceptable */
     64 
     65   int num_frames_in_cmn; /* num frames used to estimate cmn (or lda_cmn) */
     66 
     67   /* for in-utterance channel normalization ("in-utt cms" below) */
     68   struct {
     69     int forget_factor2;     /* cmn is given this weight to start off */
     70     int disable_after;      /* we disable in-utt cms after this many frames */
     71     int enable_after;       /* we enable in-utt cms after this many frames */
     72     int num_bou_frames_to_skip;   /* don't start accumulating until this many frames have passed */
     73     int num_frames_since_bou;     /* counter for above, bou=begin-of-utterance */
     74     int num_frames_in_accum;      /* number of frames in accum */
     75     imeldata accum[MAX_CHAN_DIM]; /* accumulates frames of the current utterance */
     76   } inutt;
     77 
     78   int cached_num_frames;       /* we cache frames, until recognition is done
     79         and can calculate speech mean from these */
     80   int cache_resolution;        /* we'll average this many frames per section */
     81   imeldata cached_sections[SWICMS_CACHE_SIZE_DEFAULT][MAX_CHAN_DIM]; /* frame cache: one MAX_CHAN_DIM vector per section (see cache_resolution) */
     82   /*const*/ preprocessed* _prep; /* NOTE(review): who sets this and who owns it is not visible here; the commented-out const suggests it is meant to be read-only -- confirm */
     83 }
     84 swicms_norm_info;
     85 
     86 int swicms_init(swicms_norm_info* swicms); /* presumably initializes *swicms; NOTE(review): meaning of the int return is not visible in this header */
     87 int swicms_cache_frame(swicms_norm_info* swicms, imeldata* frame, int dimen); /* frame: one feature vector of dimen elements; presumably stored into the frame cache of *swicms -- confirm */
     88 int apply_channel_normalization_in_swicms(swicms_norm_info *swicms,
     89     imeldata* oframe, imeldata* iframe,
     90     int dimen); /* iframe: input vector, oframe: output vector, both of dimen elements; NOTE(review): whether they may alias is not visible here */
     91 int swicms_lda_process(swicms_norm_info* swicms, preprocessed* prep); /* NOTE(review): presumably derives the lda_* members using prep -- confirm in the implementation */
     92 
     93 int swicms_update(swicms_norm_info* swicms, int speech_start_frame, int speech_end_frame); /* start/end: frame indices bounding the speech; presumably index the cached frames -- confirm */
     94 
     95 ESR_ReturnCode swicms_set_cmn(swicms_norm_info *swicms, const LCHAR *new_cmn_params ); /* new_cmn_params: channel-mean parameters as a string; format not visible here */
     96 ESR_ReturnCode swicms_get_cmn(swicms_norm_info *swicms, LCHAR *cmn_params, size_t* len ); /* cmn_params: caller buffer; len: presumably in/out buffer length -- confirm */
     97 
     98 #if DEBUG_SWICMS /* debug-only helpers; DEBUG_SWICMS is defined near the top of this header */
     99 int swicms_compare(swicms_norm_info* swicms, imeldata* imelda_adjust);
    100 int swicms_dump_stats(swicms_norm_info* swicms);
    101 #else /* non-debug: both helpers expand to nothing, so calls vanish and their "result" cannot be used in an expression */
    102 #define swicms_compare(swicms,ia)
    103 #define swicms_dump_stats(swicms)
    104 #endif
    105 
    106 #endif
    107 
    108