Home | History | Annotate | Download | only in include
      1 /*---------------------------------------------------------------------------*
      2  *  SR_RecognizerImpl.h  *
      3  *                                                                           *
      4  *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
      5  *                                                                           *
      6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
      7  *  you may not use this file except in compliance with the License.         *
      8  *                                                                           *
      9  *  You may obtain a copy of the License at                                  *
     10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
     11  *                                                                           *
     12  *  Unless required by applicable law or agreed to in writing, software      *
     13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
     14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
     15  *  See the License for the specific language governing permissions and      *
     16  *  limitations under the License.                                           *
     17  *                                                                           *
     18  *---------------------------------------------------------------------------*/
     19 
     20 #ifndef __SR_RECOGNIZERIMPL_H
     21 #define __SR_RECOGNIZERIMPL_H
     22 
     23 
     24 
     25 #include "ArrayList.h"
     26 #include "CircularBuffer.h"
     27 #include "ESR_ReturnCode.h"
     28 #include "ESR_SessionType.h"
     29 #include "HashMap.h"
     30 #include "SR_AcousticState.h"
     31 #include "SR_Recognizer.h"
     32 #include "SR_EventLog.h"
     33 #include "ptimestamp.h"
     34 #include "SR_Grammar.h"
     35 #include "SR_Nametag.h"
     36 
     37 
     38 #include "frontapi.h"
     39 #include "simapi.h"
     40 
     41 /***
     42  * Recognizer timings to be written to OSI logs
     43  */
     44 
     45 typedef struct RecogLogTimings_t
     46 {
     47   size_t BORT;    /* beginning of recognition time (millisec) */
     48   size_t DURS;    /* amount of speech processed (millisec) */
     49   size_t EORT;    /* end of recognition time (millisec) */
     50   size_t EOSD;    /* num of frames of speech before EOSS (frames) */
     51   size_t EOSS;    /* frame where end of speech signal occurred (frames) */
     52   size_t BOSS;    /* frame where start of speech signal occurred (frames) */
     53   size_t EOST;    /* instant where end of speech signal occurred (millisec) */
     54 }
     55 RecogLogTimings;
     56 
     57 
     58 typedef enum
     59 {
     60   /**
     61    * Initial state.
     62    */
     63   SR_RECOGNIZER_INTERNAL_BEGIN,
     64   /**
     65    * Timeout before beginning of speech.
     66    */
     67   SR_RECOGNIZER_INTERNAL_BOS_TIMEOUT,
     68   /**
     69    * Got end of input before beginning of speech.
     70    */
     71   SR_RECOGNIZER_INTERNAL_BOS_NO_MATCH,
     72   /**
     73    * Waiting for beginning of speech.
     74    */
     75   SR_RECOGNIZER_INTERNAL_BOS_DETECTION,
     76   /**
     77    * Waiting for end of speech or input.
     78    */
     79   SR_RECOGNIZER_INTERNAL_EOS_DETECTION,
     80   /**
     81    * Got end of input.
     82    */
     83   SR_RECOGNIZER_INTERNAL_EOI,
     84   /**
     85    * Detected end of speech (not due to end of input).
     86    */
     87   SR_RECOGNIZER_INTERNAL_EOS,
     88   /**
     89    * Final state.
     90    */
     91   SR_RECOGNIZER_INTERNAL_END,
     92 } SR_RecognizerInternalStatus;
     93 
     94 
     95 /**
     96  * Waveform Buffering stuff (for Nametags)
     97  **/
     98 
     99 #define DEFAULT_WAVEFORM_BUFFER_MAX_SIZE       65  /* kBytes, will not grow */
    100 #define DEFAULT_WAVEFORM_WINDBACK_FRAMES       50  /* will convert frames to bytes, will not grow */
    101 #define DEFAULT_BOS_COMFORT_FRAMES              2
    102 #define DEFAULT_EOS_COMFORT_FRAMES              2
    103 
    104 typedef enum
    105 {
    106   WAVEFORM_BUFFERING_OFF,             /* no buffering */
    107   WAVEFORM_BUFFERING_ON_CIRCULAR,     /* buffer but, do not grow past a certain upper bound, just loop & overwrite */
    108   WAVEFORM_BUFFERING_ON_LINEAR,       /* buffer and report overflow if necessary */
    109 } waveform_buffering_state_t;
    110 
    111 /* audio buffer which supports windback */
    112 
    113 typedef struct WaveformBuffer_t
    114 {
    115   void   *windback_buffer;        /* a temp buffer used for windback functionality (malloc only at init)*/
    116   size_t windback_buffer_sz;      /* sizeof buffer */
    117   waveform_buffering_state_t state; /* state of the buffer (considered only when writing to buffer) */
    118   CircularBuffer* cbuffer;        /* the actual buffer */
    119   size_t   overflow_count;        /* indicates the total number of bytes the overflowed */
    120   size_t read_size;
    121   size_t eos_comfort_frames;
    122   size_t bos_comfort_frames;
    123 }
    124 WaveformBuffer;
    125 
    126 
    127 /* create the buffer */
    128 ESR_ReturnCode WaveformBuffer_Create(WaveformBuffer** waveformBuffer, size_t frame_size);
    129 
    130 /* reset the buffer... do not release memeory */
    131 ESR_ReturnCode WaveformBuffer_Reset(WaveformBuffer* waveformBuffer);
    132 
    133 /* get size */
    134 ESR_ReturnCode WaveformBuffer_GetSize(WaveformBuffer* waveformBuffer, size_t* size);
    135 
    136 /* write to buffer. will grow only if buffering state is set to allow it */
    137 ESR_ReturnCode WaveformBuffer_Write(WaveformBuffer* waveformBuffer, void *data, size_t num_bytes);
    138 
    139 /* read the whole buffer (starting from start offset, up to read_size) into a chunk allocated outside */
    140 ESR_ReturnCode WaveformBuffer_Read(WaveformBuffer* waveformBuffer, void *data, size_t* num_bytes);
    141 
    142 /* does the windback after bos detected */
    143 ESR_ReturnCode WaveformBuffer_WindBack(WaveformBuffer* waveformBuffer, const size_t num_bytes);
    144 
    145 /* sets the start offset and read_size at the end of recognition when endpointed transcription is known */
    146 ESR_ReturnCode WaveformBuffer_ParseEndPointedResultAndTrim(WaveformBuffer* waveformBuffer, const LCHAR* end_pointed_result, const size_t bytes_per_frame);
    147 
    148 /* free the memory allocated for blocks and for windback */
    149 ESR_ReturnCode WaveformBuffer_Destroy(WaveformBuffer* waveformBuffer);
    150 
    151 /* sets the state of buffer */
    152 ESR_ReturnCode WaveformBuffer_SetBufferingState(WaveformBuffer* waveformBuffer, waveform_buffering_state_t state);
    153 
    154 /* gets the state of buffer */
    155 ESR_ReturnCode WaveformBuffer_GetBufferingState(WaveformBuffer* waveformBuffer, waveform_buffering_state_t* state);
    156 
    157 /* skip the first few bytes (moves read pointer forward */
    158 ESR_ReturnCode WaveformBuffer_Skip(WaveformBuffer* waveformBuffer, const size_t bytes);
    159 
    160 
    161 
    162 /**
    163  * Speech recognizer.
    164  */
    165 typedef struct SR_RecognizerImpl_t
    166 {
    167   /**
    168    * Interface functions that must be implemented.
    169    */
    170   SR_Recognizer Interface;
    171 
    172   /**
    173    * Legacy CREC frontend.
    174    */
    175   CA_Frontend* frontend;
    176   /**
    177    * Legacy CREC Input waveform object.
    178    */
    179   CA_Wave* wavein;
    180   /**
    181    * Legacy CREC Utterance object.
    182    */
    183   CA_Utterance* utterance;
    184   /**
    185    * Legacy CREC confidence score calculator.
    186    */
    187   CA_ConfidenceScorer* confidenceScorer;
    188   /**
    189    * Legacy CREC recognizer.
    190    */
    191   CA_Recog* recognizer;
    192   /**
    193    * AcousticModels associated with Recognizer.
    194    */
    195   SR_AcousticModels* models;
    196   /**
    197   * Active Recognizer grammars.
    198   */
    199   HashMap* grammars;
    200   /**
    201    * Recognition result.
    202    */
    203   SR_RecognizerResult* result;
    204   /**
    205    * Recognizer parameters.
    206    */
    207   ESR_SessionType* parameters;
    208   /**
    209    * AcousticState associated with Recognizer.
    210    */
    211   SR_AcousticState* acousticState;
    212   /**
    213    * Total number of frames pushed by SR_RecognizerPutAudio().
    214    */
    215   size_t frames;
    216   /**
    217    * Number of processed frames.
    218    */
    219   size_t processed;
    220   /**
    221    * The number of frames up until the windback point (where -pau- starts).
    222    */
    223   size_t beginningOfSpeechOffset;
    224   /**
    225    * Internal recognizer state.
    226    */
    227   SR_RecognizerInternalStatus internalState;
    228   /**
    229    * Indicates if SR_RecognizerStart() was called.
    230    */
    231   ESR_BOOL isStarted;
    232   /**
    233    * Indicates if PutAudio() was called with the last audio frame.
    234    */
    235   ESR_BOOL gotLastFrame;
    236   /**
    237    * Audio buffer used by PutAudio().
    238    */
    239   CircularBuffer* buffer;
    240   /**
    241    * Temporary buffer used to transfer audio data (PutAudio).
    242    **/
    243   asr_int16_t *audioBuffer;
    244   /**
    245    * Recognizer sample rate.
    246    */
    247   size_t sampleRate;
    248   /**
    249    * Whether reconition has begun after begiing of speech detection
    250    */
    251   ESR_BOOL isRecognizing;
    252   /**
    253    * Max number of frames to process before BOS timeout
    254    */
    255   size_t utterance_timeout;
    256   /**
    257    * Locking function associated.
    258    */
    259   SR_RecognizerLockFunction lockFunction;
    260   /**
    261    * Locking function data.
    262    */
    263   void* lockData;
    264 
    265   /**
    266    * OSI logging level
    267    * if bit0 (OSI_LOG_LEVEL_BASIC) is set: do basic logging
    268    * if bit1 (OSI_LOG_LEVEL_AUDIO) is set: do audio waveform logging
    269    * if bit2 (OSI_LOG_LEVEL_ADDWD) is set: do dynamic grammar addword logging
    270    */
    271   size_t osi_log_level;
    272 
    273   /**
    274    * EventLog pointer
    275    */
    276   SR_EventLog* eventLog;
    277   /**
    278    * Data that should be logged in OSI
    279    */
    280   RecogLogTimings recogLogTimings;
    281   /**
    282    * Timestamp reference used for calculating timings
    283    */
    284   PTimeStamp timestamp;
    285 
    286   /**
    287    * Waveform buffer (for nametags) .
    288    */
    289   WaveformBuffer* waveformBuffer;
    290 
    291   /**
    292    * Reason for eos detected
    293    */
    294   LCHAR* eos_reason;
    295 
    296   /**
    297    * Indicates if signal quality variables have been initialized.
    298    */
    299   ESR_BOOL isSignalQualityInitialized;
    300   /**
    301    * True if signal is being clipped.
    302    */
    303   ESR_BOOL isSignalClipping;
    304   /**
    305    * True if DCOffset is present in signal.
    306    */
    307   ESR_BOOL isSignalDCOffset;
    308   /**
    309    * True if signal is noisy.
    310    */
    311   ESR_BOOL isSignalNoisy;
    312   /**
    313    * True if signal is too quiet.
    314    */
    315   ESR_BOOL isSignalTooQuiet;
    316   /**
    317    * True if signal contains too few samples.
    318    */
    319   ESR_BOOL isSignalTooFewSamples;
    320   /**
    321    * True if signal contains too many samples.
    322    */
    323   ESR_BOOL isSignalTooManySamples;
    324 
    325   /**
    326    * Number of bytes in a frame.
    327    **/
    328   size_t FRAME_SIZE;
    329 
    330   /**
    331    * If TRUE, beginning of speech detection is enabled.
    332    */
    333   ESR_BOOL gatedMode;
    334 
    335   /**
    336    * The minimum number of frames to sniff before beginning recognition.
    337    */
    338   size_t bgsniff;
    339   /**
    340    * Indicates if we've skipped holdOffPeriod frames at the beginning of the waveform.
    341    */
    342   ESR_BOOL holdOffPeriodSkipped;
    343 }
    344 SR_RecognizerImpl;
    345 
    346 /**
    347  * Groups grammar with meta-data.
    348  */
    349 typedef struct GrammarBag_t
    350 {
    351   /**
    352    * Grammar object.
    353    */
    354   SR_Grammar* grammar;
    355   /**
    356    * Grammar weight.
    357    */
    358   unsigned int weight;
    359   /**
    360    * Grammar ID.
    361    */
    362   LCHAR* grammarID;
    363 }
    364 GrammarBag;
    365 
    366 
    367 /**
    368  * Default implementation.
    369  */
    370 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerStartImpl(SR_Recognizer* self);
    371 /**
    372  * Default implementation.
    373  */
    374 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerStopImpl(SR_Recognizer* self);
    375 /**
    376  * Default implementation.
    377  */
    378 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDestroyImpl(SR_Recognizer* self);
    379 /**
    380  * Default implementation.
    381  */
    382 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetupImpl(SR_Recognizer* self);
    383 /**
    384  * Default implementation.
    385  */
    386 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerUnsetupImpl(SR_Recognizer* self);
    387 /**
    388  * Default implementation.
    389  */
    390 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSetupImpl(SR_Recognizer* self, ESR_BOOL* isSetup);
    391 
    392 /**
    393  * Default implementation.
    394  */
    395 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetParameterImpl(SR_Recognizer* self, const LCHAR* key, LCHAR* value, size_t* len);
    396 /**
    397  * Default implementation.
    398  */
    399 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetSize_tParameterImpl(SR_Recognizer* self, const LCHAR* key, size_t* value);
    400 /**
    401  * Default implementation.
    402  */
    403 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetBoolParameterImpl(SR_Recognizer* self, const LCHAR* key, ESR_BOOL* value);
    404 /**
    405  * Default implementation.
    406  */
    407 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetParameterImpl(SR_Recognizer* self, const LCHAR* key, LCHAR* value);
    408 /**
    409  * Default implementation.
    410  */
    411 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetSize_tParameterImpl(SR_Recognizer* self, const LCHAR* key, size_t value);
    412 /**
    413  * Default implementation.
    414  */
    415 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetBoolParameterImpl(SR_Recognizer* self, const LCHAR* key, ESR_BOOL value);
    416 
    417 /**
    418  * Default implementation.
    419  */
    420 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerHasSetupRulesImpl(SR_Recognizer* self,
    421     ESR_BOOL* hasSetupRules);
    422 /**
    423  * Default implementation.
    424  */
    425 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerActivateRuleImpl(SR_Recognizer* self,
    426     SR_Grammar* grammar,
    427     const LCHAR* ruleName,
    428     unsigned int weight);
    429 /**
    430  * Default implementation.
    431  */
    432 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDeactivateRuleImpl(SR_Recognizer* self,
    433     SR_Grammar* grammar,
    434     const LCHAR* ruleName);
    435 
    436 /**
    437  * Default implementation.
    438  */
    439 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerDeactivateAllRulesImpl(SR_Recognizer* self);
    440 
    441 /**
    442  * Default implementation.
    443  */
    444 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsActiveRuleImpl(SR_Recognizer* self,
    445     SR_Grammar* grammar,
    446     const LCHAR* ruleName,
    447     ESR_BOOL* isActiveRule);
    448 /**
    449  * Default implementation.
    450  */
    451 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetWordAdditionCeilingImpl(SR_Recognizer* self,
    452     SR_Grammar* grammar);
    453 /**
    454  * Default implementation.
    455  */
    456 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerCheckGrammarConsistencyImpl(SR_Recognizer* self,
    457     SR_Grammar* grammar,
    458     ESR_BOOL* isConsistent);
    459 /**
    460  * Default implementation.
    461  */
    462 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerGetModelsImpl(SR_Recognizer* self,
    463 															  SR_AcousticModels** models);
    464 /**
    465  * Default implementation.
    466  */
    467 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerPutAudioImpl(SR_Recognizer* self,
    468     asr_int16_t* buffer,
    469     size_t* bufferSize,
    470     ESR_BOOL isLast);
    471 /**
    472  * Default implementation.
    473  */
    474 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerAdvanceImpl(SR_Recognizer* self,
    475     SR_RecognizerStatus* status,
    476     SR_RecognizerResultType* type,
    477     SR_RecognizerResult** result);
    478 
    479 /**
    480  * Default implementation.
    481  */
    482 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerClearAcousticStateImpl(SR_Recognizer* self);
    483 /**
    484  * Default implementation.
    485  */
    486 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadAcousticStateImpl(SR_Recognizer* self,
    487     const LCHAR* filename);
    488 
    489 /**
    490  * Default implementation.
    491  */
    492 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadUtteranceImpl(SR_Recognizer* self, const LCHAR* filename);
    493 /**
    494  * Default implementation.
    495  */
    496 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLoadWaveFileImpl(SR_Recognizer* self, const LCHAR* filename);
    497 
    498 /**
    499  * Default implementation.
    500  */
    501 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogTokenImpl(SR_Recognizer* self, const LCHAR* token, const LCHAR* value);
    502 /**
    503  * Default implementation.
    504  */
    505 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogTokenIntImpl(SR_Recognizer* self, const LCHAR* token, int value);
    506 /**
    507  * Default implementation.
    508  */
    509 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogEventImpl(SR_Recognizer* self, const LCHAR* event);
    510 /**
    511  * Default implementation.
    512  */
    513 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogSessionStartImpl(SR_Recognizer* self, const LCHAR* sessionName);
    514 /**
    515  * Default implementation.
    516  */
    517 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogSessionEndImpl(SR_Recognizer* self);
    518 /**
    519  * Default implementation.
    520  */
    521 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerLogWaveformDataImpl(SR_Recognizer* self,
    522     const LCHAR* waveformFilename,
    523     const LCHAR* transcription,
    524     const double bos,
    525     const double eos,
    526     ESR_BOOL isInvocab);
    527 /**
    528  * Default implementation.
    529  */
    530 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerSetLockFunctionImpl(SR_Recognizer *self, SR_RecognizerLockFunction function, void* data);
    531 /**
    532  * Default implementation.
    533  */
    534 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalClippingImpl(SR_Recognizer* self, ESR_BOOL* isClipping);
    535 /**
    536  * Default implementation.
    537  */
    538 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalDCOffsetImpl(SR_Recognizer* self, ESR_BOOL* isDCOffset);
    539 /**
    540  * Default implementation.
    541  */
    542 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalNoisyImpl(SR_Recognizer* self, ESR_BOOL* isNoisy);
    543 /**
    544  * Default implementation.
    545  */
    546 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooQuietImpl(SR_Recognizer* self, ESR_BOOL* isTooQuiet);
    547 /**
    548  * Default implementation.
    549  */
    550 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooFewSamplesImpl(SR_Recognizer* self, ESR_BOOL* isTooFewSamples);
    551 /**
    552  * Default implementation.
    553  */
    554 SREC_RECOGNIZER_API ESR_ReturnCode SR_RecognizerIsSignalTooManySamplesImpl(SR_Recognizer* self, ESR_BOOL* isTooManySamples);
    555 
    556 SREC_RECOGNIZER_API ESR_ReturnCode SR_Recognizer_Change_Sample_RateImpl ( SR_Recognizer *self, size_t new_sample_rate );
    557 
    558 #endif /* __SR_RECOGNIZERIMPL_H */
    559