1 /*---------------------------------------------------------------------------* 2 * get_fram.c * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 21 #include <stdlib.h> 22 #ifndef _RTT 23 #include "pstdio.h" 24 #endif 25 #include <limits.h> 26 #include <math.h> 27 #include <string.h> 28 #include "passert.h" 29 30 #include "c42mul.h" 31 #include "portable.h" 32 33 #include "../clib/fpi_tgt.inl" 34 35 #define DEBUG 0 36 #define FUDGE_FACTOR 1.2f 37 38 const float root_pi_over_2 = (float) 1.2533141; 39 40 static const char get_fram[] = "$Id: get_fram.c,v 1.7.6.13 2007/10/15 18:06:24 dahan Exp $"; 41 42 static void create_cepstrum_offsets(preprocessed *prep); 43 static void destroy_cepstrum_offsets(preprocessed *prep); 44 static void apply_channel_offset(preprocessed *prep); 45 static int compare_cached_frame(preprocessed *prep, utterance_info *utt); 46 47 void init_utterance(utterance_info *utt, int utt_type, int dimen, 48 int buffer_size, int keep_frames, int num_chan, int do_voicing) 49 /* 50 ** To setup the utterance structure 51 */ 52 { 53 /* Construct frame buffer and voice buffer here 54 */ 55 ASSERT(utt); 56 ASSERT(dimen > 0); 57 if (buffer_size < keep_frames) 58 SERVICE_ERROR(BAD_ARGUMENT); 59 utt->utt_type = utt_type; 60 utt->gen_utt.dim = dimen; 61 utt->gen_utt.frame = createFrameBuffer(buffer_size, 62 dimen, keep_frames, do_voicing); 63 utt->gen_utt.num_chan = num_chan; 64 65 setup_ambient_estimation(utt->gen_utt.backchan, 66 utt->gen_utt.num_chan, 100); 67 return; 68 } 69 70 void set_voicing_durations(utterance_info *utt, int voice_duration, 71 int quiet_duration, int unsure_duration, 72 int start_windback) 73 { 74 utt->gen_utt.voice_duration = voice_duration; 75 utt->gen_utt.quiet_duration = quiet_duration; 76 utt->gen_utt.unsure_duration = unsure_duration; 77 utt->gen_utt.start_windback = start_windback; 78 return; 79 } 80 81 void free_utterance(utterance_info *utt) 82 /* 83 ** To close data file pointers etc. 84 */ 85 { 86 /* Destroy frame buffer 87 */ 88 ASSERT(utt); 89 90 clear_ambient_estimation(utt->gen_utt.backchan, utt->gen_utt.dim); 91 if (utt->gen_utt.frame) 92 { 93 destroyFrameBuffer(utt->gen_utt.frame); 94 utt->gen_utt.frame = NULL; 95 } 96 return; 97 } 98 99 void init_preprocessed(preprocessed *prep, int dimen, float imelda_scale) 100 /* 101 ** To setup the preprocessed structure 102 */ 103 { 104 105 ASSERT(prep); 106 ASSERT(dimen > 0); 107 prep->dim = dimen; 108 prep->seq = (imeldata *) CALLOC(prep->dim, sizeof(imeldata), 109 "srec.prep->seq"); 110 prep->seq_unnorm = (imeldata *) CALLOC(prep->dim, sizeof(imeldata), 111 "srec.prep->seq_unnorm"); 112 prep->last_frame = (featdata *) CALLOC(prep->dim, sizeof(featdata), 113 "srec.prep->last_frame"); 114 115 /* Setup constants for distance calculation 116 */ 117 /* TODO: check numbers for non-zero */ 118 prep->add.scale = (prdata)((2 * imelda_scale * imelda_scale) / MUL_SCALE 119 + 0.5) - (prdata)0.5; 120 prep->add.inv_scale = (prdata)(((float)(0x01 << 12) * MUL_SCALE) / 121 (2 * imelda_scale * imelda_scale) + 0.5) - 122 (prdata)0.5; 123 prep->mul.multable_factor_gaussian = 1; 124 prep->mul.multable_factor = (prdata)(((MUL_SCALE * (0x01 << EUCLID_SHIFT) 125 * prep->uni_score_scale) 126 / (2 * (imelda_scale * imelda_scale 127 * FUDGE_FACTOR * FUDGE_FACTOR))) / 128 + 0.5) 128 - (prdata)0.5; 129 prep->mul.grand_mod_cov = (prdata)((MUL_SCALE * prep->uni_score_scale * 130 prep->whole_dim * 131 log((imelda_scale * FUDGE_FACTOR) / 132 (SIGMA_BIAS * root_pi_over_2))) / 128 + 0.5) 133 - (prdata)0.5 - prep->uni_score_offset; 134 prep->mul.grand_mod_cov_gaussian = (prdata)(2 * imelda_scale * imelda_scale * 135 prep->use_dim * 136 log(imelda_scale / 137 (SIGMA_BIAS * root_pi_over_2)) + 0.5) 138 - (prdata)0.5; 139 #if DEBUG 140 log_report("grand_mod_cov %.1f, grand_mod_cov_gaussian %.1f\n", 141 (float)prep->mul.grand_mod_cov, 142 (float)prep->mul.grand_mod_cov_gaussian); 143 log_report("multable_factor %f, multable_factor_gaussian %f\n", 144 (float)prep->mul.multable_factor, 145 (float)prep->mul.multable_factor_gaussian); 146 #endif 147 148 149 create_cepstrum_offsets(prep); 150 return; 151 } 152 153 void clear_preprocessed(preprocessed *prep) 154 /* 155 ** To setup the preprocessed structure 156 */ 157 { 158 ASSERT(prep); 159 destroy_cepstrum_offsets(prep); 160 prep->dim = 0; 161 FREE((char *)prep->last_frame); 162 FREE((char *)prep->seq); 163 FREE((char *)prep->seq_unnorm); 164 return; 165 } 166 167 int get_data_frame(preprocessed *prep, utterance_info *utt) 168 /* 169 ** To get a frame amount of data and perform preprocessing functions 170 */ 171 { 172 int status_code; 173 174 ASSERT(prep); 175 ASSERT(utt); 176 if (utt->gen_utt.channorm && !utt->gen_utt.channorm->adj_valid) 177 convert_adjustment_to_imelda(utt->gen_utt.channorm, prep); 178 if (utt->gen_utt.dim != prep->dim) 179 SERVICE_ERROR(UTTERANCE_DIMEN_MISMATCH); 180 181 if (prep->post_proc & VFR) 182 { 183 if ((status_code = get_utterance_frame(prep, utt)) <= 0) 184 return (status_code); 185 186 log_report("get_data_frame vfr not supported\n"); 187 SERVICE_ERROR(FEATURE_NOT_SUPPORTED); 188 } 189 else 190 { 191 status_code = get_utterance_frame(prep, utt); 192 if (status_code == 0) return(status_code); 193 else if (status_code == -1) return(1); 194 } 195 196 if (prep->chan_offset) 197 apply_channel_offset(prep); 198 199 /* Apply linear transformation if necessary 200 */ 201 if (prep->post_proc & LIN_TRAN) 202 linear_transform_frame(prep, prep->seq, True); 203 204 memcpy(prep->seq_unnorm, prep->seq, prep->dim * sizeof(imeldata)); 205 if (utt->gen_utt.channorm) 206 apply_channel_normalization_in_imelda(utt->gen_utt.channorm, 207 prep->seq, prep->seq_unnorm, 208 utt->gen_utt.channorm->dim); 209 return (1); 210 } 211 212 int get_utterance_frame(preprocessed *prep, utterance_info *utt) 213 /* 214 ** To get a frame amount of data 215 ** Maintains a single data buffer and passes the pointers to frame of data. 216 ** Post-increments after copying 217 */ 218 { 219 featdata *frame_ptr; 220 int ii; 221 222 ASSERT(prep); 223 ASSERT(utt); 224 225 /* Get the next data frame in 226 */ 227 if (getFrameGap(utt->gen_utt.frame) > 0) 228 { 229 /* is it a cloned object */ 230 if (prep->ref_count > 1 && compare_cached_frame(prep, utt)) 231 return (-1); 232 233 frame_ptr = currentRECframePtr(utt->gen_utt.frame); 234 if (frame_ptr == NULL) 235 return (0); 236 if (prep->ref_count > 1) 237 { 238 ASSERT(prep->last_frame); 239 memcpy(prep->last_frame, frame_ptr, 240 prep->dim* sizeof(featdata)); 241 } 242 for (ii = 0; ii < utt->gen_utt.dim; ii++) 243 prep->seq[ii] = (imeldata)frame_ptr[ii]; 244 /* Apply fast-voice corrections if necessary */ 245 if (utt->gen_utt.frame->haveVoiced) 246 { 247 utterance_detection_fixup(utt->gen_utt.frame, 248 &utt->gen_utt.last_push, utt->gen_utt.voice_duration, 249 utt->gen_utt.quiet_duration, utt->gen_utt.unsure_duration); 250 /* if (isFrameBufferActive (utt->gen_utt.frame) 251 && getFrameGap (utt->gen_utt.frame) <= utt->gen_utt.quiet_duration) 252 SERVICE_ERROR (INTERNAL_ERROR); */ 253 prep->voicing_status = 254 rec_frame_voicing_status(utt->gen_utt.frame); 255 } 256 return (1); 257 } 258 return (0); 259 } 260 261 262 int advance_utterance_frame(utterance_info *utt) 263 /* 264 ** To get a frame amount of data 265 */ 266 { 267 ASSERT(utt); 268 /* if more samples are needed then read from file if the type matched 269 */ 270 /* Get the next data frame in 271 */ 272 if (getFrameGap(utt->gen_utt.frame) > 0) 273 { 274 if (incRECframePtr(utt->gen_utt.frame) != False) 275 return (0); 276 return (1); 277 } 278 return (0); 279 } 280 281 int retreat_utterance_frame(utterance_info *utt) 282 /* 283 ** To get a frame amount of data 284 */ 285 { 286 ASSERT(utt); 287 288 if (getBlockGap(utt->gen_utt.frame) > 0) 289 { 290 if (decRECframePtr(utt->gen_utt.frame) != False) 291 return (0); 292 return (1); 293 } 294 return (0); 295 } 296 297 void prepare_data_frame(preprocessed *prep) 298 { 299 int ii; 300 prdata sum_sq; 301 302 sum_sq = 0; 303 304 for (ii = 0; ii < prep->whole_dim; ii++) 305 sum_sq += (prdata) SQR((prdata)prep->seq[ii]); 306 prep->seq_sq_sum_whole = -sum_sq; 307 308 ASSERT(prep->whole_dim <= prep->use_dim); 309 for (ii = 0; ii < prep->use_dim; ii++) 310 sum_sq += (prdata) SQR((prdata)prep->seq[ii]); 311 prep->seq_sq_sum = -sum_sq; 312 313 sum_sq = 0; 314 315 for (ii = 0; ii < prep->whole_dim; ii++) 316 sum_sq += (prdata) SQR((prdata)prep->seq_unnorm[ii]); 317 prep->seq_unnorm_sq_sum_whole = -sum_sq; 318 319 return; 320 } 321 322 int utterance_started(utterance_info *utt) 323 { 324 ASSERT(utt); 325 if (utt->gen_utt.frame->haveVoiced 326 && utt->gen_utt.frame->voicingDetected) 327 return (True); 328 else 329 return (False); 330 } 331 332 int utterance_ended(utterance_info *utt) 333 { 334 ASSERT(utt); 335 return (utt->gen_utt.frame->utt_ended); 336 } 337 338 int load_utterance_frame(utterance_info *utt, unsigned char* pUttFrame, int voicing) 339 { 340 featdata framdata[MAX_DIMEN]; 341 int ii; 342 343 ASSERT(utt); 344 ASSERT(pUttFrame); 345 346 for (ii = 0; ii < utt->gen_utt.frame->uttDim; ii++) 347 framdata[ii] = (featdata) pUttFrame[ii]; 348 349 if (pushSingleFEPframe(utt->gen_utt.frame, framdata, voicing) != False) 350 return (0); 351 352 return (1); 353 } 354 355 int copy_utterance_frame(utterance_info *oututt, utterance_info *inutt) 356 { 357 int voicedata; 358 featdata *framdata; 359 360 ASSERT(oututt); 361 ASSERT(inutt); 362 363 if ((framdata = currentRECframePtr(inutt->gen_utt.frame)) == NULL) 364 return (0); 365 366 voicedata = getVoicingCode(inutt->gen_utt.frame, framdata); 367 368 if (pushSingleFEPframe(oututt->gen_utt.frame, framdata, voicedata) != False) 369 return (0); 370 371 return (1); 372 } 373 374 int copy_pattern_frame(utterance_info *oututt, preprocessed *prep) 375 { 376 int ii; 377 featdata frame_ptr[MAX_DIMEN]; 378 379 ASSERT(oututt); 380 ASSERT(prep); 381 ASSERT(oututt->gen_utt.dim < MAX_DIMEN); 382 for (ii = 0; ii < oututt->gen_utt.dim; ii++) 383 frame_ptr[ii] = (featdata) RANGE(prep->seq[ii], 0, 255); 384 if (pushSingleFEPframe(oututt->gen_utt.frame, frame_ptr, 385 prep->voicing_status) 386 != False) return(0); 387 return (1); 388 } 389 390 static void create_cepstrum_offsets(preprocessed *prep) 391 { 392 ASSERT(prep); 393 prep->chan_offset = (imeldata *) CALLOC_CLR(prep->dim, 394 sizeof(imeldata), "srec.chan_offset"); 395 return; 396 } 397 398 void set_cepstrum_offset(preprocessed *prep, int index, int value) 399 { 400 ASSERT(prep); 401 ASSERT(prep->chan_offset); 402 ASSERT(index >= 0 && index < prep->dim); 403 prep->chan_offset[index] = (imeldata) value; 404 return; 405 } 406 407 static void destroy_cepstrum_offsets(preprocessed *prep) 408 { 409 ASSERT(prep); 410 FREE((char *)prep->chan_offset); 411 prep->chan_offset = 0; 412 return; 413 } 414 415 static void apply_channel_offset(preprocessed *prep) 416 { 417 int ii; 418 419 for (ii = 0; ii < prep->dim; ii++) 420 prep->seq[ii] += prep->chan_offset[ii]; 421 return; 422 } 423 424 static int compare_cached_frame(preprocessed *prep, utterance_info *utt) 425 { 426 int ii; 427 featdata *frame_ptr; 428 429 frame_ptr = currentRECframePtr(utt->gen_utt.frame); 430 if (frame_ptr == NULL) 431 return (False); 432 for (ii = 0; ii < utt->gen_utt.dim; ii++) 433 if (prep->last_frame[ii] != frame_ptr[ii]) 434 return (False); 435 return (True); 436 } 437 438 void convert_adjustment_to_imelda(norm_info *norm, preprocessed *prep) 439 { 440 int ii; 441 imeldata fram[MAX_DIMEN]; 442 443 ASSERT(prep); 444 ASSERT(norm); 445 for (ii = 0; ii < 12; ii++) /* TODO: fix dimension properly, and sort out rouding/type */ 446 fram[ii] = (imeldata) norm->adjust[ii]; /* TODO: review types */ 447 for (; ii < prep->dim; ii++) 448 fram[ii] = 0; 449 450 linear_transform_frame(prep, fram, False); 451 452 for (ii = 0; ii < prep->dim; ii++) 453 norm->imelda_adjust[ii] = fram[ii]; 454 #if DEBUG 455 log_report("NORM AUX: "); 456 for (ii = 0; ii < norm->dim; ii++) 457 log_report("%d ", (int)norm->imelda_adjust[ii]); 458 log_report("\n"); 459 #endif 460 norm->adj_valid = True; 461 return; 462 } 463