1 /*---------------------------------------------------------------------------* 2 * SR_RecognizerImpl.h * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 #ifndef __SR_RECOGNIZERIMPL_H 21 #define __SR_RECOGNIZERIMPL_H 22 23 24 25 #include "ArrayList.h" 26 #include "CircularBuffer.h" 27 #include "ESR_ReturnCode.h" 28 #include "ESR_SessionType.h" 29 #include "HashMap.h" 30 #include "SR_AcousticState.h" 31 #include "SR_Recognizer.h" 32 #include "SR_EventLog.h" 33 #include "ptimestamp.h" 34 #include "SR_Grammar.h" 35 #include "SR_Nametag.h" 36 37 38 #include "frontapi.h" 39 #include "simapi.h" 40 41 /*** 42 * Recognizer timings to be written to OSI logs 43 */ 44 45 typedef struct RecogLogTimings_t 46 { 47 size_t BORT; /* beginning of recognition time (millisec) */ 48 size_t DURS; /* amount of speech processed (millisec) */ 49 size_t EORT; /* end of recognition time (millisec) */ 50 size_t EOSD; /* num of frames of speech before EOSS (frames) */ 51 size_t EOSS; /* frame where end of speech signal occurred (frames) */ 52 size_t BOSS; /* frame where start of speech signal occurred (frames) */ 53 size_t EOST; /* instant where end of speech signal occurred (millisec) */ 54 } 55 RecogLogTimings; 56 57 58 typedef enum 59 { 60 /** 61 * Initial state. 62 */ 63 SR_RECOGNIZER_INTERNAL_BEGIN, 64 /** 65 * Timeout before beginning of speech. 66 */ 67 SR_RECOGNIZER_INTERNAL_BOS_TIMEOUT, 68 /** 69 * Got end of input before beginning of speech. 70 */ 71 SR_RECOGNIZER_INTERNAL_BOS_NO_MATCH, 72 /** 73 * Waiting for beginning of speech. 74 */ 75 SR_RECOGNIZER_INTERNAL_BOS_DETECTION, 76 /** 77 * Waiting for end of speech or input. 78 */ 79 SR_RECOGNIZER_INTERNAL_EOS_DETECTION, 80 /** 81 * Got end of input. 82 */ 83 SR_RECOGNIZER_INTERNAL_EOI, 84 /** 85 * Detected end of speech (not due to end of input). 86 */ 87 SR_RECOGNIZER_INTERNAL_EOS, 88 /** 89 * Final state. 90 */ 91 SR_RECOGNIZER_INTERNAL_END, 92 } SR_RecognizerInternalStatus; 93 94 95 /** 96 * Waveform Buffering stuff (for Nametags) 97 **/ 98 99 #define DEFAULT_WAVEFORM_BUFFER_MAX_SIZE 65 /* kBytes, will not grow */ 100 #define DEFAULT_WAVEFORM_WINDBACK_FRAMES 50 /* will convert frames to bytes, will not grow */ 101 #define DEFAULT_BOS_COMFORT_FRAMES 2 102 #define DEFAULT_EOS_COMFORT_FRAMES 2 103 104 typedef enum 105 { 106 WAVEFORM_BUFFERING_OFF, /* no buffering */ 107 WAVEFORM_BUFFERING_ON_CIRCULAR, /* buffer but, do not grow past a certain upper bound, just loop & overwrite */ 108 WAVEFORM_BUFFERING_ON_LINEAR, /* buffer and report overflow if necessary */ 109 } waveform_buffering_state_t; 110 111 /* audio buffer which supports windback */ 112 113 typedef struct WaveformBuffer_t 114 { 115 void *windback_buffer; /* a temp buffer used for windback functionality (malloc only at init)*/ 116 size_t windback_buffer_sz; /* sizeof buffer */ 117 waveform_buffering_state_t state; /* state of the buffer (considered only when writing to buffer) */ 118 CircularBuffer* cbuffer; /* the actual buffer */ 119 size_t overflow_count; /* indicates the total number of bytes the overflowed */ 120 size_t read_size; 121 size_t eos_comfort_frames; 122 size_t bos_comfort_frames; 123 } 124 WaveformBuffer; 125 126 127 /* create the buffer */ 128 ESR_ReturnCode WaveformBuffer_Create(WaveformBuffer** waveformBuffer, size_t frame_size); 129 130 /* reset the buffer... do not release memeory */ 131 ESR_ReturnCode WaveformBuffer_Reset(WaveformBuffer* waveformBuffer); 132 133 /* get size */ 134 ESR_ReturnCode WaveformBuffer_GetSize(WaveformBuffer* waveformBuffer, size_t* size); 135 136 /* write to buffer. will grow only if buffering state is set to allow it */ 137 ESR_ReturnCode WaveformBuffer_Write(WaveformBuffer* waveformBuffer, void *data, size_t num_bytes); 138 139 /* read the whole buffer (starting from start offset, up to read_size) into a chunk allocated outside */ 140 ESR_ReturnCode WaveformBuffer_Read(WaveformBuffer* waveformBuffer, void *data, size_t* num_bytes); 141 142 /* does the windback after bos detected */ 143 ESR_ReturnCode WaveformBuffer_WindBack(WaveformBuffer* waveformBuffer, const size_t num_bytes); 144 145 /* sets the start offset and read_size at the end of recognition when endpointed transcription is known */ 146 ESR_ReturnCode WaveformBuffer_ParseEndPointedResultAndTrim(WaveformBuffer* waveformBuffer, const LCHAR* end_pointed_result, const size_t bytes_per_frame); 147 148 /* free the memory allocated for blocks and for windback */ 149 ESR_ReturnCode WaveformBuffer_Destroy(WaveformBuffer* waveformBuffer); 150 151 /* sets the state of buffer */ 152 ESR_ReturnCode WaveformBuffer_SetBufferingState(WaveformBuffer* waveformBuffer, waveform_buffering_state_t state); 153 154 /* gets the state of buffer */ 155 ESR_ReturnCode WaveformBuffer_GetBufferingState(WaveformBuffer* waveformBuffer, waveform_buffering_state_t* state); 156 157 /* skip the first few bytes (moves read pointer forward */ 158 ESR_ReturnCode WaveformBuffer_Skip(WaveformBuffer* waveformBuffer, const size_t bytes); 159 160 161 162 /** 163 * Speech recognizer. 164 */ 165 typedef struct SR_RecognizerImpl_t 166 { 167 /** 168 * Interface functions that must be implemented. 169 */ 170 SR_Recognizer Interface; 171 172 /** 173 * Legacy CREC frontend. 174 */ 175 CA_Frontend* frontend; 176 /** 177 * Legacy CREC Input waveform object. 178 */ 179 CA_Wave* wavein; 180 /** 181 * Legacy CREC Utterance object. 182 */ 183 CA_Utterance* utterance; 184 /** 185 * Legacy CREC confidence score calculator. 186 */ 187 CA_ConfidenceScorer* confidenceScorer; 188 /** 189 * Legacy CREC recognizer. 190 */ 191 CA_Recog* recognizer; 192 /** 193 * AcousticModels associated with Recognizer. 194 */ 195 SR_AcousticModels* models; 196 /** 197 * Active Recognizer grammars. 198 */ 199 HashMap* grammars; 200 /** 201 * Recognition result. 202 */ 203 SR_RecognizerResult* result; 204 /** 205 * Recognizer parameters. 206 */ 207 ESR_SessionType* parameters; 208 /** 209 * AcousticState associated with Recognizer. 210 */ 211 SR_AcousticState* acousticState; 212 /** 213 * Total number of frames pushed by SR_RecognizerPutAudio(). 214 */ 215 size_t frames; 216 /** 217 * Number of processed frames. 218 */ 219 size_t processed; 220 /** 221 * The number of frames up until the windback point (where -pau- starts). 222 */ 223 size_t beginningOfSpeechOffset; 224 /** 225 * Internal recognizer state. 226 */ 227 SR_RecognizerInternalStatus internalState; 228 /** 229 * Indicates if SR_RecognizerStart() was called. 230 */ 231 ESR_BOOL isStarted; 232 /** 233 * Indicates if PutAudio() was called with the last audio frame. 234 */ 235 ESR_BOOL gotLastFrame; 236 /** 237 * Audio buffer used by PutAudio(). 238 */ 239 CircularBuffer* buffer; 240 /** 241 * Temporary buffer used to transfer audio data (PutAudio). 242 **/ 243 asr_int16_t *audioBuffer; 244 /** 245 * Recognizer sample rate. 246 */ 247 size_t sampleRate; 248 /** 249 * Whether reconition has begun after begiing of speech detection 250 */ 251 ESR_BOOL isRecognizing; 252 /** 253 * Max number of frames to process before BOS timeout 254 */ 255 size_t utterance_timeout; 256 /** 257 * Locking function associated. 258 */ 259 SR_RecognizerLockFunction lockFunction; 260 /** 261 * Locking function data. 262 */ 263 void* lockData; 264 265 /** 266 * OSI logging level 267 * if bit0 (OSI_LOG_LEVEL_BASIC) is set: do basic logging 268 * if bit1 (OSI_LOG_LEVEL_AUDIO) is set: do audio waveform logging 269 * if bit2 (OSI_LOG_LEVEL_ADDWD) is set: do dynamic grammar addword logging 270 */ 271 size_t osi_log_level; 272 273 /** 274 * EventLog pointer 275 */ 276 SR_EventLog* eventLog; 277 /** 278 * Data that should be logged in OSI 279 */ 280 RecogLogTimings recogLogTimings; 281 /** 282 * Timestamp reference used for calculating timings 283 */ 284 PTimeStamp timestamp; 285 286 /** 287 * Waveform buffer (for nametags) . 288 */ 289 WaveformBuffer* waveformBuffer; 290 291 /** 292 * Reason for eos detected 293 */ 294 LCHAR* eos_reason; 295 296 /** 297 * Indicates if signal quality variables have been initialized. 298 */ 299 ESR_BOOL isSignalQualityInitialized; 300 /** 301 * True if signal is being clipped. 302 */ 303 ESR_BOOL isSignalClipping; 304 /** 305 * True if DCOffset is present in signal. 306 */ 307 ESR_BOOL isSignalDCOffset; 308 /** 309 * True if signal is noisy. 310 */ 311 ESR_BOOL isSignalNoisy; 312 /** 313 * True if signal is too quiet. 314 */ 315 ESR_BOOL isSignalTooQuiet; 316 /** 317 * True if signal contains too few samples. 318 */ 319 ESR_BOOL isSignalTooFewSamples; 320 /** 321 * True if signal contains too many samples. 322 */ 323 ESR_BOOL isSignalTooManySamples; 324 325 /** 326 * Number of bytes in a frame. 327 **/ 328 size_t FRAME_SIZE; 329 330 /** 331 * If TRUE, beginning of speech detection is enabled. 332 */ 333 ESR_BOOL gatedMode; 334 335 /** 336 * The minimum number of frames to sniff before beginning recognition. 337 */ 338 size_t bgsniff; 339 /** 340 * Indicates if we've skipped holdOffPeriod frames at the beginning of the waveform. 341 */ 342 ESR_BOOL holdOffPeriodSkipped; 343 } 344 SR_RecognizerImpl; 345 346 /** 347 * Groups grammar with meta-data. 348 */ 349 typedef struct GrammarBag_t 350 { 351 /** 352 * Grammar object. 353 */ 354 SR_Grammar* grammar; 355 /** 356 * Grammar weight. 357 */ 358 unsigned int weight; 359 /** 360 * Grammar ID. 361 */ 362 LCHAR* grammarID; 363 } 364 GrammarBag; 365 366 367 /** 368 * Default implementation. 369 */ 370 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerStartImpl(SR_Recognizer* self); 371 /** 372 * Default implementation. 373 */ 374 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerStopImpl(SR_Recognizer* self); 375 /** 376 * Default implementation. 377 */ 378 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDestroyImpl(SR_Recognizer* self); 379 /** 380 * Default implementation. 381 */ 382 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetupImpl(SR_Recognizer* self); 383 /** 384 * Default implementation. 385 */ 386 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerUnsetupImpl(SR_Recognizer* self); 387 /** 388 * Default implementation. 389 */ 390 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSetupImpl(SR_Recognizer* self, ESR_BOOL* isSetup); 391 392 /** 393 * Default implementation. 394 */ 395 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetParameterImpl(SR_Recognizer* self, const LCHAR* key, LCHAR* value, size_t* len); 396 /** 397 * Default implementation. 398 */ 399 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetSize_tParameterImpl(SR_Recognizer* self, const LCHAR* key, size_t* value); 400 /** 401 * Default implementation. 402 */ 403 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetBoolParameterImpl(SR_Recognizer* self, const LCHAR* key, ESR_BOOL* value); 404 /** 405 * Default implementation. 406 */ 407 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetParameterImpl(SR_Recognizer* self, const LCHAR* key, LCHAR* value); 408 /** 409 * Default implementation. 410 */ 411 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetSize_tParameterImpl(SR_Recognizer* self, const LCHAR* key, size_t value); 412 /** 413 * Default implementation. 414 */ 415 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetBoolParameterImpl(SR_Recognizer* self, const LCHAR* key, ESR_BOOL value); 416 417 /** 418 * Default implementation. 419 */ 420 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerHasSetupRulesImpl(SR_Recognizer* self, 421 ESR_BOOL* hasSetupRules); 422 /** 423 * Default implementation. 424 */ 425 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerActivateRuleImpl(SR_Recognizer* self, 426 SR_Grammar* grammar, 427 const LCHAR* ruleName, 428 unsigned int weight); 429 /** 430 * Default implementation. 431 */ 432 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDeactivateRuleImpl(SR_Recognizer* self, 433 SR_Grammar* grammar, 434 const LCHAR* ruleName); 435 436 /** 437 * Default implementation. 438 */ 439 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDeactivateAllRulesImpl(SR_Recognizer* self); 440 441 /** 442 * Default implementation. 443 */ 444 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsActiveRuleImpl(SR_Recognizer* self, 445 SR_Grammar* grammar, 446 const LCHAR* ruleName, 447 ESR_BOOL* isActiveRule); 448 /** 449 * Default implementation. 450 */ 451 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetWordAdditionCeilingImpl(SR_Recognizer* self, 452 SR_Grammar* grammar); 453 /** 454 * Default implementation. 455 */ 456 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerCheckGrammarConsistencyImpl(SR_Recognizer* self, 457 SR_Grammar* grammar, 458 ESR_BOOL* isConsistent); 459 /** 460 * Default implementation. 461 */ 462 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetModelsImpl(SR_Recognizer* self, 463 SR_AcousticModels** models); 464 /** 465 * Default implementation. 466 */ 467 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerPutAudioImpl(SR_Recognizer* self, 468 asr_int16_t* buffer, 469 size_t* bufferSize, 470 ESR_BOOL isLast); 471 /** 472 * Default implementation. 473 */ 474 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerAdvanceImpl(SR_Recognizer* self, 475 SR_RecognizerStatus* status, 476 SR_RecognizerResultType* type, 477 SR_RecognizerResult** result); 478 479 /** 480 * Default implementation. 481 */ 482 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerClearAcousticStateImpl(SR_Recognizer* self); 483 /** 484 * Default implementation. 485 */ 486 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadAcousticStateImpl(SR_Recognizer* self, 487 const LCHAR* filename); 488 489 /** 490 * Default implementation. 491 */ 492 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadUtteranceImpl(SR_Recognizer* self, const LCHAR* filename); 493 /** 494 * Default implementation. 495 */ 496 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadWaveFileImpl(SR_Recognizer* self, const LCHAR* filename); 497 498 /** 499 * Default implementation. 500 */ 501 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogTokenImpl(SR_Recognizer* self, const LCHAR* token, const LCHAR* value); 502 /** 503 * Default implementation. 504 */ 505 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogTokenIntImpl(SR_Recognizer* self, const LCHAR* token, int value); 506 /** 507 * Default implementation. 508 */ 509 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogEventImpl(SR_Recognizer* self, const LCHAR* event); 510 /** 511 * Default implementation. 512 */ 513 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogSessionStartImpl(SR_Recognizer* self, const LCHAR* sessionName); 514 /** 515 * Default implementation. 516 */ 517 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogSessionEndImpl(SR_Recognizer* self); 518 /** 519 * Default implementation. 520 */ 521 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogWaveformDataImpl(SR_Recognizer* self, 522 const LCHAR* waveformFilename, 523 const LCHAR* transcription, 524 const double bos, 525 const double eos, 526 ESR_BOOL isInvocab); 527 /** 528 * Default implementation. 529 */ 530 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetLockFunctionImpl(SR_Recognizer *self, SR_RecognizerLockFunction function, void* data); 531 /** 532 * Default implementation. 533 */ 534 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalClippingImpl(SR_Recognizer* self, ESR_BOOL* isClipping); 535 /** 536 * Default implementation. 537 */ 538 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalDCOffsetImpl(SR_Recognizer* self, ESR_BOOL* isDCOffset); 539 /** 540 * Default implementation. 541 */ 542 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalNoisyImpl(SR_Recognizer* self, ESR_BOOL* isNoisy); 543 /** 544 * Default implementation. 545 */ 546 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooQuietImpl(SR_Recognizer* self, ESR_BOOL* isTooQuiet); 547 /** 548 * Default implementation. 549 */ 550 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooFewSamplesImpl(SR_Recognizer* self, ESR_BOOL* isTooFewSamples); 551 /** 552 * Default implementation. 553 */ 554 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooManySamplesImpl(SR_Recognizer* self, ESR_BOOL* isTooManySamples); 555 556 SREC_RECOGNIZER_API ESR_ReturnCode SR_Recognizer_Change_Sample_RateImpl ( SR_Recognizer *self, size_t new_sample_rate ); 557 558 #endif /* __SR_RECOGNIZERIMPL_H */ 559