1 /*---------------------------------------------------------------------------* 2 * srec.h * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 /* this file contains defines needed by the srec search component*/ 21 22 #ifndef _h_srec_ 23 #define _h_srec_ 24 25 #include "swimodel.h" 26 #include "hmm_desc.h" 27 #include "utteranc.h" 28 #include "hmmlib.h" 29 #include "srec_sizes.h" 30 #include "search_network.h" 31 #include "srec_context.h" 32 #include "srec_eosd.h" 33 #include "astar.h" 34 35 #define MAX_HMM 3 /*maximum HMM states in an allophone*/ 36 #define DO_ALLOW_MULTIPLE_MODELS 1 37 38 /*in order to keep data sizes as small as possible, most of the the structure 39 below use indices into one fsmarc_token array and one word_token array. This 40 makes the code a bit confusing (compared to just keeping pointers to these 41 structure around), uses a bit more CPU, but saves memory and gives us more 42 flexibility in the sizes of these data types*/ 43 44 /** 45 * @todo document 46 */ 47 typedef struct altword_token_t 48 { 49 costdata costdelta; /* cost relative to path being propagated */ 50 wordID word; /* alternative word, diff from path b.p. */ 51 wtokenID word_backtrace; /* alternative backtrace, diff from path b.p.*/ 52 struct altword_token_t* next_token; /* todo: change this to indices */ 53 asr_int16_t refcount; 54 costdata costbasis; /* cost of best fsmarc_token host */ 55 } 56 altword_token; 57 #define AWTNULL 0 58 /* fsmarc_tokens and fsmnode_tokens point to a batch of altword_tokens 59 to save memory, many fsmarc_tokens can point to the same altword_token 60 and these are propagated by reference */ 61 62 /** 63 * @todo document 64 */ 65 typedef struct fsmarc_token_t 66 { 67 frameID num_hmm_states; /* number of hmm states */ 68 costdata cost[MAX_HMM]; /* cost so far*/ 69 wtokenID word_backtrace[MAX_HMM]; /* index into word tokens*/ 70 wordID word[MAX_HMM]; /* when the path encounters an output 71 symbol, store it here*/ 72 frameID duration[MAX_HMM]; /* frames observed for this hmm state, todo: pack into char! */ 73 arcID FSMarc_index; /* index into the FSM arc array */ 74 75 stokenID next_token_index; /* for maintaining linked lists of these 76 tokens, both in search and in freelist */ 77 altword_token* aword_backtrace[MAX_HMM]; 78 } 79 fsmarc_token; 80 /* 30 bytes */ 81 82 83 /** 84 * These are used while maximizing into FSM nodes. 85 */ 86 typedef struct fsmnode_token_t 87 { 88 costdata cost; 89 wtokenID word_backtrace; /* index into word tokens*/ 90 wordID word; /* when the path encounters an output*/ 91 nodeID FSMnode_index; 92 ftokenID next_token_index; 93 altword_token* aword_backtrace; 94 frameID silence_duration; 95 } 96 fsmnode_token; 97 /* 10 bytes */ 98 99 /** 100 * @todo document 101 */ 102 typedef struct word_token_t 103 { 104 wordID word; /* the word just observed */ 105 frameID end_time; /* end time of the word just observed, includes trailing silence */ 106 nodeID end_node; /* for backtrace with word graph */ 107 wtokenID backtrace; /* for backtrace */ 108 costdata cost; /* cost for path up to this point*/ 109 wtokenID next_token_index; /* for maintaining linked lists of these tokens 110 (both in the search and in the freelist) */ 111 frameID _word_end_time; /* end time of the word just observed, excl trailing silence */ 112 /* since frameID is 16 bit, and 15bits is plenty 113 (ie 32767 frames * 20ms/frame = 655 sec), we use the high-bit to store 114 whether this word_token represents a homonym, this is used in confidence 115 score fixing! */ 116 #define WORD_TOKEN_GET_HOMONYM(wT) (wT->_word_end_time & 0x8000) // 10000000 117 #define WORD_TOKEN_SET_HOMONYM(wT,hM) (wT->_word_end_time = (wT->_word_end_time&0x7fff)|(hM?0x8000:0)) 118 #define WORD_TOKEN_GET_WD_ETIME(wT) (wT->_word_end_time & 0x7fff) // 01111111 119 #define WORD_TOKEN_SET_WD_ETIME(wT,eT) (wT->_word_end_time = (wT->_word_end_time&0x8000)|(eT)) 120 } 121 word_token; 122 /* 12 bytes */ 123 124 /** 125 * Contains what we need for later backtrace, nbest, etc. 126 */ 127 typedef struct 128 { 129 /* there are various arrays below which frame number long - this is the number allocated */ 130 frameID max_frames; 131 132 /* for each frame, head of a linked list of word tokens for that frame */ 133 wtokenID *words_for_frame; 134 asr_int16_t *whether_sorted; 135 136 } 137 srec_word_lattice; 138 139 /*This is just implemented as a list so far - use Johan's fancy implementation later*/ 140 141 /** 142 * @todo document 143 */ 144 typedef struct priority_q_t 145 { 146 wtokenID word_token_list; /* index of head token in queue - keep worst at end 147 (so we can pop one off) */ 148 costdata max_cost_in_q; 149 miscdata num_in_q; 150 miscdata max_in_q; 151 } 152 priority_q; 153 154 /*------------------------------------------------------------------* 155 * * 156 *------------------------------------------------------------------*/ 157 158 /* notes ... what needs to be acoustic model specific 159 160 (p)ool it 161 (1) single .r but reset 162 (x) specific 163 164 1 context 165 1 word_priority_q 166 x word_lattice 167 1 prune_delta 168 1 current_search_frame 169 170 1.r best_token_for_arc[] max_fsm_arcs 171 1.r best_token_for_node[] max_fsm_nodes 172 1 cost_offset_for_frame MAX_FRAMES 173 1 accumulated_cost_offset_for_frame MAX_FRAMES 174 175 x active_fsmarc_tokens 176 num_new_states ... num in active_fsmarc_tokens 177 max_new_states ... same as fsmarc_token_array_size 178 179 x active_fsm_node_tokens 180 181 ? current_model_scores num_model_slots_allocated 182 183 p fsmarc_token_array _size _freelist 184 p fsmnode_token_array _size _freelist 185 x word_token_array _size _freelist 186 x word_token_array_flags 187 188 ... not used! best_fsmarc_token 189 srec_ended 190 astar_stack 191 */ 192 193 struct srec_t 194 { /*contains everything needed to run the search*/ 195 asr_int16_t id; /*contains an id for this recognizer*/ 196 srec_context *context; /*contains the recognition context (fst, info about models, etc)*/ 197 priority_q *word_priority_q; /*used to keep track of new word in frame*/ 198 srec_word_lattice *word_lattice; /*used to keep track of word lattice in utterance*/ 199 200 costdata prune_delta; /* controls the amount of score-based pruning - should this go in the context instead?*/ 201 costdata current_prune_delta; /* when the above changes in mid-frame */ 202 costdata current_best_cost; /* 0 if single recog */ 203 204 frameID current_search_frame; 205 stokenID *best_token_for_arc; /* non-owning ptr, see multi_srec below */ 206 207 stokenID active_fsmarc_tokens; /*head of list of state tokens for the next frame. Used during 208 the search to keep track of new states for new frame. This 209 is to allow us to efficently do things like prune, free state arrays, etc*/ 210 211 212 nodeID num_new_states; 213 nodeID max_new_states; /*the num allocated in the new_states array - if the search is exceeding this, 214 we need to tighten the pruning*/ 215 216 ftokenID *best_token_for_node; /* non-owning ptr, see multi_srec below */ 217 218 ftokenID active_fsmnode_tokens; /* linked list of all fsmnode token (same as ones in 219 best_state_for_node, just kept as a list)*/ 220 221 costdata *current_model_scores; /* temporary array used by the search to contain model scores - 222 size is max number of models*/ 223 modelID num_model_slots_allocated; /*num allocated in above array - search will only 224 work with models with less than this number of models*/ 225 226 /*the following arrays handle all the state and word tokens. All of them 227 are allocated to a fixed size at startup time, and the search uses elements 228 from the first array in the search. The pruning of the search is used to 229 make sure that the allocated number is not exceeded*/ 230 231 232 fsmarc_token *fsmarc_token_array; /*used for storage of all state tokens 233 - allocated once at startup time and kept 234 around. It's fixed size and the search 235 pruning must ensure that it is never 236 exceeded*/ 237 stokenID fsmarc_token_array_size; /*total number of tokens allocated in this array*/ 238 stokenID fsmarc_token_freelist; /*index to head of state token freelist*/ 239 240 fsmnode_token *fsmnode_token_array; /*used for storage of all fsmnode tokens 241 - allocated once at startup time and kept 242 around. It's fixed size and the search 243 pruning must ensure that it is never 244 exceeded*/ 245 ftokenID fsmnode_token_array_size; /*total number of tokens allocated in this array*/ 246 ftokenID fsmnode_token_freelist; /*index to head of fsmnode token freelist*/ 247 248 word_token *word_token_array; /* used for storage of all word tokens - 249 allocated once at startup time and kept 250 around. It's fixed size and the search 251 pruning must ensure that it is never 252 exceeded*/ 253 asr_int16_t* word_token_array_flags; /* bitarray used for flagging */ 254 wtokenID word_token_array_size; /* total number of tokens allocated in 255 this array*/ 256 wtokenID word_token_freelist; /* index to head of word token freelist*/ 257 258 altword_token* altword_token_array; /* used to store alternative words before a wb */ 259 wtokenID altword_token_array_size; 260 altword_token* altword_token_freelist; 261 wtokenID altword_token_freelist_len; 262 263 frameID max_frames; 264 costdata* best_model_cost_for_frame; 265 costdata* cost_offset_for_frame; /* see multi_srec, below */ 266 bigcostdata* accumulated_cost_offset; /* see multi_srec, below */ 267 268 stokenID best_fsmarc_token; /* ?? index of best scoring state token 269 this is used to lookup wtokens on the 270 top choice path, to make sure they're not 271 pruned via reprune_word_tokens() */ 272 costdata current_best_ftoken_cost[NODE_INFO_NUMS]; 273 ftokenID current_best_ftoken_index[NODE_INFO_NUMS]; 274 275 /*the following elements are to keep track of how big various arrays are*/ 276 nodeID max_fsm_nodes; /* see multi_srec below */ 277 arcID max_fsm_arcs; /* see multi_srec below */ 278 asr_int16_t srec_ended; 279 AstarStack *astar_stack; /* for backwards word search */ 280 const featdata* avg_state_durations; /* average state durations (from AMs) */ 281 282 srec_eos_detector_state eosd_state; 283 }; 284 285 #define MAX_RECOGNIZERS 2 /* generally, 1x for each acoustic model */ 286 #define MAX_ACOUSTIC_MODELS 2 287 288 /** 289 * @todo document 290 */ 291 typedef struct 292 { 293 asr_int32_t num_allocated_recs; 294 asr_int32_t num_activated_recs; 295 srec* rec; /* size num_allocated_recs, one for 296 each gender */ 297 298 frameID max_frames; 299 costdata* cost_offset_for_frame; /* size max_frames, keeps track of 300 current_best_costs bookkeeping from 301 reset_current_best_costs_to_zero() */ 302 bigcostdata *accumulated_cost_offset; /* same as above but cumulative */ 303 304 305 ftokenID *best_token_for_node; /* array (size max_fsm_nodes) best path into 306 fsmnode - kept as an fsmnode_token */ 307 nodeID max_fsm_nodes; 308 stokenID *best_token_for_arc; /* array (size max_fsm_arcs) best path into 309 fsmarc - kept as a fsmarc_token */ 310 arcID max_fsm_arcs; 311 312 /* non owning pointer to compact acoustic models */ 313 asr_int32_t num_swimodels; 314 const SWIModel *swimodel[MAX_ACOUSTIC_MODELS]; 315 EOSrc eos_status; 316 } 317 multi_srec; 318 319 #ifdef __cplusplus 320 extern "C" 321 { 322 #endif 323 priority_q* allocate_priority_q(int max_n); 324 void free_priority_q(priority_q* pq); 325 void clear_priority_q(priority_q *pq); 326 wtokenID get_word_token_list(priority_q *pq, word_token *word_token_array); 327 wtokenID add_word_token_to_priority_q(priority_q *pq, wtokenID token_index_to_add, word_token *word_token_array); 328 void remove_non_end_word_from_q(srec *rec, priority_q *pq, word_token *word_token_array, nodeID end_node); 329 costdata get_priority_q_threshold(priority_q *pq, word_token *word_token_array); 330 331 void free_word_token(srec *rec, wtokenID old_token_index); 332 int srec_begin(srec* rec, int begin_syn_node); 333 void srec_no_more_frames(srec* rec); 334 bigcostdata accumulated_cost_offset(costdata *cost_offsets, frameID frame); 335 void multi_srec_get_speech_bounds(multi_srec* rec, frameID* start_frame, frameID* end_frame); 336 int multi_srec_get_eos_status(multi_srec* rec); 337 #ifdef __cplusplus 338 } 339 #endif 340 341 /** 342 * For visualization in the debugger 343 */ 344 typedef struct 345 { 346 asr_uint16_t data[50]; 347 } 348 us50; 349 350 /** 351 * @todo document 352 */ 353 typedef struct 354 { 355 asr_uint16_t data[250]; 356 } 357 us250; 358 359 /** 360 * @todo document 361 */ 362 typedef struct 363 { 364 asr_uint16_t data[1000]; 365 } 366 us1000; 367 368 #endif 369