1 /*---------------------------------------------------------------------------* 2 * utteranc.h * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 21 22 #ifndef _h_utteranc_ 23 #define _h_utteranc_ 24 25 #ifdef SET_RCSID 26 static const char utteranc_h[] = "$Id: utteranc.h,v 1.3.6.7 2007/08/31 17:44:53 dahan Exp $"; 27 #endif 28 29 30 31 #include "all_defs.h" 32 #include "hmm_type.h" 33 #include "fpi_tgt.h" 34 #include "voicing.h" 35 #include "specnorm.h" 36 #include "channorm.h" 37 #include "swicms.h" 38 #ifndef _RTT 39 #include "duk_io.h" 40 #endif 41 42 #define DEFAULT_BUFFER_SIZE 100 /* in frames */ 43 #define KEEP_FRAMES 40 /* in frames, past frames kept */ 44 45 /* Functions supported are 46 ** new, delete (by source) 47 ** open file/device, close file/device 48 ** attach and detach sink 49 ** read/store samples - including the header 50 */ 51 52 /** 53 * @todo document 54 */ 55 typedef struct 56 { /* label structure */ 57 char *label; 58 long begin; 59 long end; 60 char *extra; 61 unsigned char flag; 62 } 63 annotate; 64 65 66 /** 67 * @todo document 68 */ 69 typedef struct 70 { 71 int utt_type; 72 int dim; 73 fepFramePkt *frame; 74 int num_chan; 75 int do_channorm; 76 spect_dist_info **spchchan; /* Mirrored from the Wave object */ 77 norm_info *channorm; /* Mirrored from the Wave object */ 78 swicms_norm_info *swicms; /* copy of wave obj pointer */ 79 spect_dist_info *backchan[MAX_CHAN_DIM]; 80 featdata *last_push; 81 int voice_duration; 82 int quiet_duration; 83 int unsure_duration; 84 int start_windback; 85 } 86 utt_generic_info; 87 88 #ifndef _RTT 89 /** 90 * @todo document 91 */ 92 typedef struct 93 { 94 char typ; /* s (16 bit), c (8 bit), u (newton .utb) */ 95 int endian; /* 0 is little 1 is big */ 96 int do_skip; /* skip every other frame */ 97 unsigned long len; /* length of file/utterance */ 98 PFile* file; /* pointer to file */ 99 char name[MAX_LABEL]; /* file name */ 100 /* int op; read or write */ 101 int num_utts; /* no. of utterances in utb file */ 102 annotate *utb_table; /* utb file header information */ 103 } 104 utt_file_info; 105 106 /** 107 * @todo document 108 */ 109 typedef struct 110 { 111 int utt_type; 112 int dim; 113 fepFramePkt *frame; 114 int num_chan; 115 int do_channorm; 116 spect_dist_info **spchchan; /* Mirrored from the Wave object */ 117 norm_info *channorm; /* Mirrored from the Wave object */ 118 swicms_norm_info *swicms; /* copy of wave obj pointer */ 119 spect_dist_info *backchan[MAX_CHAN_DIM]; 120 featdata *last_push; 121 int voice_duration; 122 int quiet_duration; 123 int unsure_duration; 124 int start_windback; 125 /* voicing_info voice; */ 126 utt_file_info file; 127 } 128 file_utterance_info; 129 #endif 130 131 /** 132 * @todo document 133 */ 134 typedef struct 135 { 136 int utt_type; 137 int dim; 138 fepFramePkt *frame; 139 int num_chan; 140 int do_channorm; 141 spect_dist_info **spchchan; /* Mirrored from the Wave object */ 142 norm_info *channorm; /* Mirrored from the Wave object */ 143 swicms_norm_info *swicms; /* copy of wave obj pointer */ 144 spect_dist_info *backchan[MAX_CHAN_DIM]; 145 featdata *last_push; 146 int voice_duration; 147 int quiet_duration; 148 int unsure_duration; 149 int start_windback; 150 } 151 live_utterance_info; 152 153 /** 154 * @todo document 155 */ 156 typedef union 157 { 158 int utt_type; /* live or from file */ 159 utt_generic_info gen_utt; /* generic one */ 160 #ifndef _RTT 161 file_utterance_info file_utt; 162 #endif 163 live_utterance_info live_utt; 164 } utterance_info; 165 166 167 /* 168 ** Size of the utb file headers and details 169 */ 170 171 #ifndef _RTT 172 #define UTT_VERSION 2 173 #define UTT_HEADER_SIZE 16 /*Size on disk*/ 174 #define UTB_HEADER_SIZE 32 /*Size on disk*/ 175 #define UTB_HEADER_USED 16 /*Size on disk*/ /* SAL */ 176 177 /** 178 * UTB file header. 179 */ 180 typedef struct _UttHeader 181 { 182 /** 183 * The size of the header in bytes. 184 */ 185 unsigned short headerSize; 186 /** 187 * The version of the file format. 188 */ 189 unsigned short version; 190 /** 191 * The size of the payload in bytes. 192 */ 193 unsigned long nBytes; 194 /** 195 * The number of parameters per frame. 196 */ 197 unsigned short nParametersPerFrame; 198 /** 199 * 0=unknown, 1=none, 2=amp-based, 3=harmonicity-based, 4=mrec style 200 */ 201 unsigned short channelNormalization; 202 /** 203 * 0=unknown, 1=no, 2=yes 204 */ 205 unsigned short speakerNormalization; 206 /** 207 * 0=unknown, 1=no, 2=yes 208 */ 209 unsigned short imeldaization; 210 /** 211 * Before imelda truncation. 212 */ 213 unsigned short nOriginalParameters; 214 /** 215 * The number of samples per frame. 216 */ 217 unsigned short samplesPerFrame; 218 /** 219 * The audio sample rate. 220 */ 221 unsigned long sampleRate; 222 /** 223 * not used in version 5. 224 */ 225 unsigned long checksum; 226 } 227 UttHeader; 228 229 int update_utb_header(file_utterance_info *utt, int frames, int samplerate, 230 int framerate); 231 void init_utt_v5_header(UttHeader *uhead, int dim, int samplerate, int framerate); 232 int init_data_file(char *filename, file_utterance_info *utt, int dimen, 233 char typ, int endian, int do_skip); 234 int new_data_file(char *filename, file_utterance_info *utt, int dimen, 235 char typ, int endian); 236 int set_data_frame(file_utterance_info *utt, long begin); 237 int buffer_data_frames(file_utterance_info *utt, long f_begin, long f_end); 238 void more_data_frames(file_utterance_info *utt); 239 int save_data_frames(file_utterance_info *utt); 240 void close_data_stream(file_utterance_info *utt); 241 int init_utb_file(file_utterance_info *utt, annotate **table); 242 int position_utb_file(file_utterance_info *utt, long position, annotate *table); 243 int load_utb_data(file_utterance_info *utt, int num_frames, int do_skip); 244 int load_short_data(file_utterance_info *utt, int num_frames, int do_skip); 245 int save_utb_data(file_utterance_info *utt, int num_frames); 246 int save_short_data(file_utterance_info *utt, int num_frames); 247 int read_utt_head(UttHeader *head, PFile* datafile); 248 int write_utt_head(UttHeader *head, PFile* datafile); 249 int check_for_utb(char* filename); 250 251 /* TCP reading routines 252 */ 253 int read_tcp(char *filename, annotate **tag_base); 254 int read_lst(char *filename, annotate *tag_base, int ntags); 255 int read_utb_table(char *filename, annotate **tag_base); 256 void save_tcp(char *tcpnam, annotate *tag, int ntags); 257 void compose_tcp_name_of_utt(char* uttname , char* tcpname); 258 259 #endif 260 261 void init_utterance(utterance_info *utt, int utt_type, int dimen, 262 int buffer_size, int keep_frames, int num_chan, int do_voicing); 263 void set_voicing_durations(utterance_info *utt, int voice_duration, 264 int quiet_duration, int unsure_duration, 265 int start_windback); 266 void free_utterance(utterance_info *utt); 267 int utterance_started(utterance_info *utt); 268 int utterance_ended(utterance_info *utt); 269 int load_utterance_frame(utterance_info *utt, unsigned char* pUttFrame, int voicing); 270 int copy_utterance_frame(utterance_info *oututt, utterance_info *inutt); 271 272 #endif /* _h_utteranc_ */ 273