Home | History | Annotate | Download | only in include
      1 /*---------------------------------------------------------------------------*
      2  *  srec.h  *
      3  *                                                                           *
 *  Copyright 2007, 2008 Nuance Communications, Inc.                         *
      5  *                                                                           *
      6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
      7  *  you may not use this file except in compliance with the License.         *
      8  *                                                                           *
      9  *  You may obtain a copy of the License at                                  *
     10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
     11  *                                                                           *
     12  *  Unless required by applicable law or agreed to in writing, software      *
     13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
     14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
     15  *  See the License for the specific language governing permissions and      *
     16  *  limitations under the License.                                           *
     17  *                                                                           *
     18  *---------------------------------------------------------------------------*/
     19 
     20 /* this file contains defines needed by the srec search component*/
     21 
     22 #ifndef _h_srec_
     23 #define _h_srec_
     24 
     25 #include "swimodel.h"
     26 #include "hmm_desc.h"
     27 #include "utteranc.h"
     28 #include "hmmlib.h"
     29 #include "srec_sizes.h"
     30 #include "search_network.h"
     31 #include "srec_context.h"
     32 #include "srec_eosd.h"
     33 #include "astar.h"
     34 
     35 #define MAX_HMM 3            /*maximum HMM states in an allophone*/
     36 #define DO_ALLOW_MULTIPLE_MODELS 1
     37 
/* In order to keep data sizes as small as possible, most of the structures
   below use indices into one fsmarc_token array and one word_token array.
   This makes the code a bit confusing (compared to just keeping pointers to
   these structures around) and uses a bit more CPU, but it saves memory and
   gives us more flexibility in the sizes of these data types. */
     43 
     44 /**
     45  * @todo document
     46  */
     47 typedef struct altword_token_t
     48 {
     49   costdata costdelta;        /* cost relative to path being propagated */
     50   wordID word;               /* alternative word, diff from path b.p. */
     51   wtokenID word_backtrace;   /* alternative backtrace, diff from path b.p.*/
     52   struct altword_token_t* next_token; /* todo: change this to indices */
     53   asr_int16_t refcount;
     54   costdata costbasis;        /* cost of best fsmarc_token host */
     55 }
     56 altword_token;
     57 #define AWTNULL 0
     58 /* fsmarc_tokens and fsmnode_tokens point to a batch of altword_tokens
     59    to save memory, many fsmarc_tokens can point to the same altword_token
     60    and these are propagated by reference */
     61 
     62 /**
     63  * @todo document
     64  */
     65 typedef struct fsmarc_token_t
     66 {
     67   frameID num_hmm_states;           /* number of hmm states */
     68   costdata cost[MAX_HMM];           /* cost so far*/
     69   wtokenID word_backtrace[MAX_HMM]; /* index into word tokens*/
     70   wordID word[MAX_HMM];             /* when the path encounters an output
     71              symbol, store it here*/
     72   frameID duration[MAX_HMM];        /* frames observed for this hmm state, todo: pack into char! */
     73   arcID FSMarc_index;               /* index into the FSM arc array */
     74 
     75   stokenID next_token_index;        /* for maintaining linked lists of these
     76              tokens, both in search and in freelist */
     77   altword_token* aword_backtrace[MAX_HMM];
     78 }
     79 fsmarc_token;
     80 /* 30 bytes */
     81 
     82 
     83 /**
     84  * These are used while maximizing into FSM nodes.
     85  */
     86 typedef struct fsmnode_token_t
     87 {
     88   costdata cost;
     89   wtokenID word_backtrace;  /* index into word tokens*/
     90   wordID word;              /* when the path encounters an output*/
     91   nodeID FSMnode_index;
     92   ftokenID next_token_index;
     93   altword_token* aword_backtrace;
     94   frameID silence_duration;
     95 }
     96 fsmnode_token;
     97 /* 10 bytes */
     98 
     99 /**
    100  * @todo document
    101  */
    102 typedef struct word_token_t
    103 {
    104   wordID word;                /* the word just observed */
    105   frameID end_time;           /* end time of the word just observed, includes trailing silence */
    106   nodeID end_node;            /* for backtrace with word graph */
    107   wtokenID backtrace;         /* for backtrace */
    108   costdata cost;              /* cost for path up to this point*/
    109   wtokenID next_token_index;  /* for maintaining linked lists of these tokens
    110        (both in the search and in the freelist) */
    111   frameID _word_end_time;     /* end time of the word just observed, excl trailing silence */
    112   /* since frameID is 16 bit, and 15bits is plenty
    113      (ie 32767 frames * 20ms/frame = 655 sec), we use the high-bit to store
    114 	 whether this word_token represents a homonym, this is used in confidence
    115 	 score fixing! */
    116 #define WORD_TOKEN_GET_HOMONYM(wT)     (wT->_word_end_time & 0x8000)  // 10000000
    117 #define WORD_TOKEN_SET_HOMONYM(wT,hM)  (wT->_word_end_time = (wT->_word_end_time&0x7fff)|(hM?0x8000:0))
    118 #define WORD_TOKEN_GET_WD_ETIME(wT)    (wT->_word_end_time & 0x7fff) // 01111111
    119 #define WORD_TOKEN_SET_WD_ETIME(wT,eT) (wT->_word_end_time = (wT->_word_end_time&0x8000)|(eT))
    120 }
    121 word_token;
    122 /* 12 bytes */
    123 
    124 /**
    125  * Contains what we need for later backtrace, nbest, etc.
    126  */
    127 typedef struct
    128 {
    129   /* there are various arrays below which frame number long - this is the number allocated */
    130   frameID max_frames;
    131 
    132   /* for each frame, head of a linked list of word tokens for that frame */
    133   wtokenID *words_for_frame;
    134   asr_int16_t *whether_sorted;
    135 
    136 }
    137 srec_word_lattice;
    138 
    139 /*This is just implemented as a list so far - use Johan's fancy implementation later*/
    140 
    141 /**
    142  * @todo document
    143  */
    144 typedef struct priority_q_t
    145 {
    146   wtokenID word_token_list;  /* index of head token in queue - keep worst at end
    147       (so we can pop one off) */
    148   costdata max_cost_in_q;
    149   miscdata num_in_q;
    150   miscdata max_in_q;
    151 }
    152 priority_q;
    153 
    154 /*------------------------------------------------------------------*
    155  *                                                                  *
    156  *------------------------------------------------------------------*/
    157 
    158 /* notes ... what needs to be acoustic model specific
    159 
    160    (p)ool it
    161    (1) single  .r but reset
    162    (x) specific
    163 
    164    1 context
    165    1 word_priority_q
    166    x word_lattice
    167    1 prune_delta
    168    1 current_search_frame
    169 
    170    1.r best_token_for_arc[]  max_fsm_arcs
    171    1.r best_token_for_node[]   max_fsm_nodes
    172    1 cost_offset_for_frame MAX_FRAMES
    173    1 accumulated_cost_offset_for_frame MAX_FRAMES
    174 
    175    x active_fsmarc_tokens
    176    num_new_states   ... num in active_fsmarc_tokens
    177    max_new_states   ... same as fsmarc_token_array_size
    178 
    179    x active_fsm_node_tokens
    180 
    181    ? current_model_scores num_model_slots_allocated
    182 
    183    p fsmarc_token_array _size _freelist
    184    p fsmnode_token_array  _size _freelist
    185    x word_token_array _size _freelist
    186    x word_token_array_flags
    187 
    188    ... not used! best_fsmarc_token
    189    srec_ended
    190    astar_stack
    191 */
    192 
    193 struct srec_t
    194 {  /*contains everything needed to run the search*/
    195   asr_int16_t id;                   /*contains an id for this recognizer*/
    196   srec_context *context;      /*contains the recognition context (fst, info about models, etc)*/
    197   priority_q *word_priority_q; /*used to keep track of new word in frame*/
    198   srec_word_lattice *word_lattice;  /*used to keep track of word lattice in utterance*/
    199 
    200   costdata prune_delta;        /* controls the amount of score-based pruning - should this go in the context instead?*/
    201   costdata current_prune_delta; /* when the above changes in mid-frame */
    202   costdata current_best_cost;   /* 0 if single recog */
    203 
    204   frameID current_search_frame;
    205   stokenID *best_token_for_arc;  /* non-owning ptr, see multi_srec below */
    206 
    207   stokenID active_fsmarc_tokens; /*head of list of state tokens for the next frame.  Used during
    208         the search to keep track of new states for new frame.  This
    209         is to allow us to efficently do things like prune, free state arrays, etc*/
    210 
    211 
    212   nodeID num_new_states;
    213   nodeID max_new_states;  /*the num allocated in the new_states array - if the search is exceeding this,
    214          we need to tighten the pruning*/
    215 
    216   ftokenID *best_token_for_node;   /* non-owning ptr, see multi_srec below */
    217 
    218   ftokenID active_fsmnode_tokens;  /* linked list of all fsmnode token (same as ones in
    219            best_state_for_node, just kept as a list)*/
    220 
    221   costdata *current_model_scores;  /* temporary array used by the search to contain model scores -
    222            size is max number of models*/
    223   modelID num_model_slots_allocated;  /*num allocated in above array - search will only
    224        work with models with less than this number of models*/
    225 
    226   /*the following arrays handle all the state and word tokens.  All of them
    227     are allocated to a fixed size at startup time, and the search uses elements
    228     from the first array in the search.  The pruning of the search is used to
    229     make sure that the allocated number is not exceeded*/
    230 
    231 
    232   fsmarc_token *fsmarc_token_array;  /*used for storage of all state tokens
    233            - allocated once at startup time and kept
    234            around.  It's fixed size and the search
    235            pruning must ensure that it is never
    236            exceeded*/
    237   stokenID fsmarc_token_array_size; /*total number of tokens allocated in this array*/
    238   stokenID fsmarc_token_freelist;   /*index to head of state token freelist*/
    239 
    240   fsmnode_token *fsmnode_token_array;  /*used for storage of all fsmnode tokens
    241            - allocated once at startup time and kept
    242            around.  It's fixed size and the search
    243            pruning must ensure that it is never
    244            exceeded*/
    245   ftokenID fsmnode_token_array_size; /*total number of tokens allocated in this array*/
    246   ftokenID fsmnode_token_freelist;   /*index to head of fsmnode token freelist*/
    247 
    248   word_token *word_token_array;    /* used for storage of all word tokens -
    249             allocated once at startup time and kept
    250             around.  It's fixed size and the search
    251             pruning must ensure that it is never
    252             exceeded*/
    253   asr_int16_t* word_token_array_flags;   /* bitarray used for flagging */
    254   wtokenID word_token_array_size;  /* total number of tokens allocated in
    255             this array*/
    256   wtokenID word_token_freelist;    /* index to head of word token freelist*/
    257 
    258   altword_token* altword_token_array; /* used to store alternative words before a wb */
    259   wtokenID altword_token_array_size;
    260   altword_token* altword_token_freelist;
    261   wtokenID altword_token_freelist_len;
    262 
    263   frameID max_frames;
    264   costdata* best_model_cost_for_frame;
    265   costdata* cost_offset_for_frame;        /* see multi_srec, below */
    266   bigcostdata* accumulated_cost_offset;   /* see multi_srec, below */
    267 
    268   stokenID best_fsmarc_token;      /* ?? index of best scoring state token
    269            this is used to lookup wtokens on the
    270            top choice path, to make sure they're not
    271            pruned via reprune_word_tokens() */
    272   costdata current_best_ftoken_cost[NODE_INFO_NUMS];
    273   ftokenID current_best_ftoken_index[NODE_INFO_NUMS];
    274 
    275   /*the following elements are to keep track of how big various arrays are*/
    276   nodeID max_fsm_nodes;           /* see multi_srec below */
    277   arcID max_fsm_arcs;             /* see multi_srec below */
    278   asr_int16_t srec_ended;
    279   AstarStack *astar_stack;        /* for backwards word search */
    280   const featdata* avg_state_durations;  /* average state durations (from AMs) */
    281 
    282   srec_eos_detector_state eosd_state;
    283 };
    284 
    285 #define MAX_RECOGNIZERS 2          /* generally, 1x for each acoustic model */
    286 #define MAX_ACOUSTIC_MODELS 2
    287 
    288 /**
    289  * @todo document
    290  */
    291 typedef struct
    292 {
    293   asr_int32_t num_allocated_recs;
    294   asr_int32_t num_activated_recs;
    295   srec* rec;                       /* size num_allocated_recs, one for
    296             each gender */
    297 
    298   frameID max_frames;
    299   costdata* cost_offset_for_frame; /* size max_frames, keeps track of
    300             current_best_costs bookkeeping from
    301             reset_current_best_costs_to_zero() */
    302   bigcostdata *accumulated_cost_offset; /* same as above but cumulative */
    303 
    304 
    305   ftokenID *best_token_for_node;  /* array (size max_fsm_nodes) best path into
    306            fsmnode - kept as an fsmnode_token */
    307   nodeID max_fsm_nodes;
    308   stokenID *best_token_for_arc;   /* array (size max_fsm_arcs) best path into
    309            fsmarc - kept as a fsmarc_token */
    310   arcID max_fsm_arcs;
    311 
    312   /* non owning pointer to compact acoustic models */
    313   asr_int32_t num_swimodels;
    314   const SWIModel    *swimodel[MAX_ACOUSTIC_MODELS];
    315   EOSrc eos_status;
    316 }
    317 multi_srec;
    318 
    319 #ifdef __cplusplus
    320 extern "C"
    321 {
    322 #endif
    323   priority_q* allocate_priority_q(int max_n);
    324   void free_priority_q(priority_q* pq);
    325   void clear_priority_q(priority_q *pq);
    326   wtokenID get_word_token_list(priority_q *pq, word_token *word_token_array);
    327   wtokenID add_word_token_to_priority_q(priority_q *pq, wtokenID token_index_to_add, word_token *word_token_array);
    328   void remove_non_end_word_from_q(srec *rec, priority_q *pq, word_token *word_token_array, nodeID end_node);
    329   costdata get_priority_q_threshold(priority_q *pq, word_token *word_token_array);
    330 
    331   void free_word_token(srec *rec, wtokenID old_token_index);
    332   int srec_begin(srec* rec, int begin_syn_node);
    333   void srec_no_more_frames(srec* rec);
    334   bigcostdata accumulated_cost_offset(costdata *cost_offsets, frameID frame);
    335   void multi_srec_get_speech_bounds(multi_srec* rec, frameID* start_frame, frameID* end_frame);
    336   int multi_srec_get_eos_status(multi_srec* rec);
    337 #ifdef __cplusplus
    338 }
    339 #endif
    340 
    341 /**
    342  * For visualization in the debugger
    343  */
    344 typedef struct
    345 {
    346   asr_uint16_t data[50];
    347 }
    348 us50;
    349 
    350 /**
    351  * @todo document
    352  */
    353 typedef struct
    354 {
    355   asr_uint16_t data[250];
    356 }
    357 us250;
    358 
    359 /**
    360  * @todo document
    361  */
    362 typedef struct
    363 {
    364   asr_uint16_t data[1000];
    365 }
    366 us1000;
    367 
    368 #endif
    369