/*---------------------------------------------------------------------------*
 *  srec_context.c                                                            *
 *                                                                            *
 *  Copyright 2007, 2008 Nuance Communications, Inc.                          *
 *                                                                            *
 *  Licensed under the Apache License, Version 2.0 (the 'License');           *
 *  you may not use this file except in compliance with the License.          *
 *                                                                            *
 *  You may obtain a copy of the License at                                   *
 *      http://www.apache.org/licenses/LICENSE-2.0                            *
 *                                                                            *
 *  Unless required by applicable law or agreed to in writing, software       *
 *  distributed under the License is distributed on an 'AS IS' BASIS,         *
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
 *  See the License for the specific language governing permissions and       *
 *  limitations under the License.                                            *
 *                                                                            *
 *---------------------------------------------------------------------------*/


#include "stdlib.h"
#include "string.h"
#include "buildopt.h"
#include "setting.h"
#include "passert.h"
#include "pendian.h"
#include "portable.h"
#include "pstdio.h"
#include "ptypes.h"
#include "srec_context.h"
#include "srec_sizes.h"
#include "search_network.h"
#include "srec_arb.h"
#include "hmm_desc.h"
#if USE_COMP_STATS
#include "comp_stats.h"
#endif

#ifdef SET_RCSID
static const char *rcsid = 0 ? (const char *) &rcsid :
  "$Id: srec_context.c,v 1.84.4.54 2008/05/15 20:06:39 dahan Exp $";
#endif

#include <stdint.h>

/* these are constants for now; we need to figure out how to make
   this more flexible, possible ideas are:
   (1) to specify these via API functions such as CR_LoadSyntax(),
   (2) use defaults that would have been set in the main parfile,
   (3) put values for these fields in an actual grammar
*/

#define AVG_ARCS_PER_WORD 10
#define AVG_NODES_PER_WORD 7
#define AVG_CHARS_PER_WORD 18
#define MAX_PRON_LEN 1024
#define MAX_WORD_LEN 128
#define DO_WEIGHTED_ADDWORD 1
#define SLOTLOOP_OFFSET 64
#define SLOTNAME_INDICATOR "__"
#define RULE_INDICATOR '@'
#define MAX_LINE_LENGTH 512
#define DEFAULT_WB_COST 40
#define DISABLE_ARC_COST 999
#define DO_ARCS_IO_IN_ARC_ORDER 1

#define IS_SILENCE_ILABEL(ilabel,context) (ilabel >= context->hmm_ilabel_offset+EPSILON_OFFSET && ilabel < context->hmm_ilabel_offset+EPSILON_OFFSET+NUM_SILENCE_HMMS)
// looking to match filename.grxml.Names@__Names__
// see also grxmlcompile.cpp where these names are constructed
#define IS_SLOT_OLABEL(wd) (strchr(wd,IMPORTED_RULES_DELIM) < strstr(wd,SLOTNAME_INDICATOR) && strlen(wd) > 4 && !strcmp(wd+strlen(wd)-2,SLOTNAME_INDICATOR))

/*
   SYNTAX : DATA_ALIGN(pointer, data type, bytes_filled)
   EXAMPLE: We need to cast a memory address to a (wordmap*),
   so we call DATA_ALIGN(memptr, wordmap, filled),
   where FILLED contains the number of bytes that were used to align the pointer
*/
#if (CPU & CPU_ARM) || (CPU & CPU_STRONGARM)
/* under IPAQ it appears we need to align the structures
   to certain boundaries.  More attention needed here for other
   platforms! */
#define DATA_ALIGN(x, y, z) if ((int) (x) % sizeof(y)) { \
    (z) = sizeof(y) - ((int) (x) % sizeof(y)); \
    (x) += (z); /* force correct data alignment */ \
  } else z = 0;
#else
#define DATA_ALIGN(x, y, z)
#endif

static const int do_minimize = 1;
#define PTR_TO_IDX(ptr, base) ((asr_uint32_t) (ptr == NULL ? 0xFFFFFFFFu : (asr_uint32_t)(ptr - base)))
#define IDX_TO_PTR(idx, base) (idx == 0xFFFFFFFFu ? NULL : base + idx)
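/* Added note (not in the original source): PTR_TO_IDX/IDX_TO_PTR convert
   between real pointers and 32-bit offsets relative to a base array, with
   0xFFFFFFFFu standing in for NULL.  A minimal illustration, assuming a
   hypothetical FSMarc array "arcs" and element pointer "arc_ptr":

     asr_uint32_t idx = PTR_TO_IDX(arc_ptr, arcs);   // NULL maps to 0xFFFFFFFFu
     FSMarc*      ptr = IDX_TO_PTR(idx, arcs);       // 0xFFFFFFFFu maps back to NULL

   Storing offsets instead of raw pointers is presumably what lets graph
   images be written out and reloaded independently of addresses. */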
/* prototypes */

/*----------------------------------------------------------------------*
 *                                                                      *
 *      internal prototypes                                             *
 *                                                                      *
 *----------------------------------------------------------------------*/

/* general use functions */
char split_line(char* line, char** av);
asr_int32_t atoi_with_check(const char* buf, asr_int32_t max);
int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc);
int printf_arc1(srec_context* fst, char* msg, FSMarc* arc);
int printf_node1(srec_context* fst, FSMnode* node);

/* fst_* functions are internal fst functions */
int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node,
                 wordID olabel, costdata cost,
                 modelID* model_sequence, int num_models);
int fst_push_arc_olabel(srec_context* fst, FSMarc* arc);
int fst_push_arc_cost(srec_context* fst, FSMarc* arc);
int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc);
int fst_free_arc(srec_context* fst, FSMarc* arc);
int fst_free_node(srec_context* fst, FSMnode* node);
int fst_free_arc_net(srec_context* fst, FSMarc* arc);
arcID fst_get_free_arc(srec_context* fst);
nodeID fst_get_free_node(srec_context* fst);
int fst_pack_arc_usage(srec_context* fst);

void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr atok);
void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr atok);
int num_arcs_leaving(srec_context* fst, FSMnode* node);

int num_arcs_arriving(srec_context* fst, FSMnode* node);
int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node);
int fst_fill_node_info(srec_context* context);
int fst_alloc_transit_points(srec_context* context);

/* higher-level functions */
int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add,
                             PFile* fp);
int FST_LoadParams(srec_context* context, PFile* fp);
int FST_AssumeParams(srec_context* context);
int FST_UnloadReverseWordGraph(srec_context* context);
int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree);

static ESR_ReturnCode wordmap_clean ( wordmap *word_map );
static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words );

/*--------------------------------------------------------------------------*
 *                                                                          *
 *                                                                          *
 *                                                                          *
 *--------------------------------------------------------------------------*/

int FST_IsVoiceEnrollment(srec_context* context)
{
  if (context->olabels == NULL) return 0;
  if (context->olabels->num_words < 2) return 0;
  if (strstr(context->olabels->words[1], "enroll")) return 1;
  return 0;
}

int FST_LoadContext(const char* synbase, srec_context** pcontext,
                    int num_words_to_add)
{
  int rc;
  PFile* fp;
  char buffer[MAX_LINE_LENGTH];
  srec_context* context;

  context = (srec_context*)CALLOC_CLR(1, sizeof(srec_context), "srec.graph.base");
  if (!context)
    return FST_FAILED_ON_MEMORY;
  memset(context, 0, sizeof(srec_context));

  context->addWordCaching_lastslot_name = 0;
  context->addWordCaching_lastslot_num = MAXwordID;
  context->addWordCaching_lastslot_needs_post_silence = ESR_FALSE;

  sprintf(buffer, "%s.map", synbase);
  fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE);
  if (!fp) return FST_FAILED_ON_INVALID_ARGS;
  rc = FST_LoadWordMap(&context->olabels, num_words_to_add, fp);
  pfclose(fp);
  if (rc) return rc;

  sprintf(buffer,
"%s.PCLG.txt", synbase); 182 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE); 183 if (!fp) return FST_FAILED_ON_INVALID_ARGS; 184 rc = FST_LoadGraph(context, context->ilabels, context->olabels, num_words_to_add, fp); 185 pfclose(fp); 186 if (rc != FST_SUCCESS) return rc; 187 188 sprintf(buffer, "%s.Grev2.det.txt", synbase); 189 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE); 190 if (!fp) return FST_FAILED_ON_INVALID_ARGS; 191 rc = FST_LoadReverseWordGraph(context, num_words_to_add, fp); 192 pfclose(fp); 193 if (rc != FST_SUCCESS) return rc; 194 195 sprintf(buffer, "%s.params", synbase); 196 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE); 197 if (fp) { 198 rc = FST_LoadParams(context, fp); 199 pfclose(fp); 200 if (rc != FST_SUCCESS) return rc; 201 } else { 202 rc = FST_AssumeParams(context); 203 } 204 205 context->grmtyp = (arcID)FST_GetGrammarType(context); 206 207 *pcontext = context; 208 rc = fst_fill_node_info(context); 209 return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS; 210 } 211 212 void FST_UnloadContext(srec_context* context) 213 { 214 if (!context) return; 215 216 FST_UnloadWordMap(&context->ilabels); 217 FST_UnloadWordMap(&context->olabels); 218 FST_UnloadGraph(context); 219 FST_UnloadReverseWordGraph(context); 220 FREE(context); 221 } 222 223 int FST_PrepareContext(srec_context* context) 224 { 225 nodeID i; 226 int rc = FST_SUCCESS; 227 /* after all word additions, we need to "prepare" the context */ 228 229 /* first check for any changes to optional endnodes etc .. */ 230 for (i = 0; i < context->num_nodes; i++) 231 if (context->FSMnode_info_list[i] == NODE_INFO_UNKNOWN) 232 break; 233 if (i != context->num_nodes) 234 rc = fst_fill_node_info(context); 235 236 context->whether_prepared = 1; 237 return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS; 238 } 239 240 void fst_set_wb_costs( srec_context* context, costdata wbcost) 241 { 242 unsigned int i; 243 for(i=0; i<(unsigned int)context->FSMarc_list_len; i++) { 244 if(context->FSMarc_list[i].ilabel == WORD_BOUNDARY) 245 context->FSMarc_list[i].cost = wbcost; 246 } 247 } 248 249 int FST_LoadParams(srec_context* context, PFile* fp) 250 { 251 char line[MAX_LINE_LENGTH]; 252 costdata wbcost = MAXcostdata; 253 while (pfgets(line, MAX_LINE_LENGTH, fp)) 254 { 255 char* key = strtok(line," \n\r\t"); 256 char* val = key ? strtok(NULL," \n\r\t") : NULL; 257 if(val && !strcmp(val,"=")) 258 val = key ? 
strtok(NULL," \n\r\t") : NULL; 259 if(!key || !key[0]) 260 continue; 261 else if(val && val[0]) { 262 if(!strcmp(key,"word_penalty")) { 263 wbcost = (costdata)atoi_with_check(val, MAXcostdata); 264 if(wbcost == MAXcostdata) { 265 return FST_FAILED_ON_INVALID_ARGS; 266 } 267 } else { 268 PLogError(L("error: unknown parameter %s in .params file"), key); 269 return FST_FAILED_ON_INVALID_ARGS; 270 } 271 } 272 } 273 if(wbcost != MAXcostdata) 274 fst_set_wb_costs( context, wbcost); 275 return FST_SUCCESS; 276 } 277 278 int FST_AssumeParams( srec_context* context) 279 { 280 fst_set_wb_costs( context, (costdata)DEFAULT_WB_COST); 281 return FST_SUCCESS; 282 } 283 284 285 /*--------------------------------------------------------------------------* 286 * * 287 * * 288 * * 289 *--------------------------------------------------------------------------*/ 290 291 modelID hmm_number(const char* hmm_Name, modelID hmm_ilabel_offset) 292 { 293 if (!strcmp(hmm_Name, "eps")) return EPSILON_LABEL; 294 if (!strcmp(hmm_Name, ".wb")) return WORD_BOUNDARY; 295 if (!strcmp(hmm_Name, ".ph")) return PHONE_BOUNDARY; 296 ASSERT(hmm_Name[0] == 'h' && hmm_Name[1] == 'm' && hmm_Name[2] == 'm' && hmm_Name[3]); 297 return (modelID)(hmm_ilabel_offset + (modelID)atoi_with_check(hmm_Name + 3, MAXmodelID)); 298 } 299 300 char* hmm_name(modelID ilabel, modelID hmm_ilabel_offset, char* buf) 301 { 302 if (ilabel == EPSILON_LABEL) 303 sprintf(buf, "eps"); 304 else if (ilabel == WORD_BOUNDARY) 305 sprintf(buf, ".wb"); 306 else if (ilabel == PHONE_BOUNDARY) 307 sprintf(buf, ".ph"); 308 else 309 sprintf(buf, "hmm%03d", ilabel - hmm_ilabel_offset); 310 return buf; 311 } 312 313 /*------------------------------------------------------------------* 314 * * 315 * words * 316 * * 317 *------------------------------------------------------------------*/ 318 int HashCmpWord(const LCHAR *key1, const LCHAR *key2) 319 { 320 return LSTRCMP(key1,key2); 321 } 322 unsigned int HashGetCode(const void *key) 323 { 324 const LCHAR* k = (const LCHAR*)key; 325 unsigned int i, len, h = 0; 326 len = LSTRLEN( k); 327 for ( i = 0; i < len; i++) 328 h = 31*h + (unsigned int)k[i]; 329 return h; 330 } 331 332 int wordmap_create(wordmap** pwmap, int num_chars, int num_words, int num_words_to_add) 333 { 334 wordmap* Interface; 335 PHashTableArgs hashArgs; 336 ESR_ReturnCode rc; 337 338 Interface = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.wordmap.base"); 339 Interface->max_words = (wordID)(num_words + num_words_to_add); 340 Interface->num_words = (wordID)0; 341 /* *pwmap->words = (ptr32*) CALLOC_CLR(wmap->max_words, sizeof(ptr32), "graph.wordmap.words"); */ 342 Interface->words = (char**) CALLOC_CLR(Interface->max_words, sizeof(char*), "srec.graph.wordmap.words"); 343 Interface->max_chars = num_chars + num_words_to_add * AVG_CHARS_PER_WORD; 344 Interface->chars = (char*) CALLOC_CLR(Interface->max_chars, sizeof(char), "srec.graph.wordmap.chars"); 345 Interface->next_chars = Interface->chars; 346 Interface->wordIDForWord = NULL; 347 *pwmap = Interface; 348 349 /* use a hashtable to save mapping between wdID and array index */ 350 if (num_words_to_add >= 0) 351 { 352 hashArgs.capacity = num_words + num_words_to_add + 10; 353 if(hashArgs.capacity%2==0) hashArgs.capacity += 1; 354 hashArgs.compFunction = HashCmpWord; // PHASH_TABLE_DEFAULT_COMP_FUNCTION; 355 hashArgs.hashFunction = HashGetCode; // PHASH_TABLE_DEFAULT_HASH_FUNCTION; 356 hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR; 357 CHKLOG(rc, PHashTableCreate(&hashArgs, 
L("srec.graph.wordmap.wordIDForWord.wordmap_create()"), &Interface->wordIDForWord)); 358 } 359 else 360 { 361 Interface->wordIDForWord = NULL; 362 } 363 364 return FST_SUCCESS; 365 366 CLEANUP: 367 wordmap_destroy(pwmap); 368 return rc; 369 } 370 wordID wordmap_add_word(wordmap* wmap, const char* word); 371 372 int FST_LoadWordMap(wordmap** pwmap, int num_words_to_add, PFile* fp) 373 { 374 wordID my_wID, wm_wID; 375 char *word, *wID_string; 376 int num_words; 377 asr_int32_t num_chars; 378 char buf[MAX_LINE_LENGTH]; 379 long fpos; 380 wordmap* wmap; 381 /* figure out the number of words */ 382 fpos = pftell(fp); 383 for (num_words = 0, num_chars = 0; pfgets(buf, MAX_LINE_LENGTH, fp); num_words++) 384 { 385 word = strtok(buf, " \n\r\t"); 386 num_chars += strlen(word); 387 } 388 num_chars += num_words * 2; /* for null terminators */ 389 pfseek(fp, fpos, SEEK_SET); 390 391 /* Alex added this to reuse this functionality elsewhere 392 destroy function for this create function is actually taken care of by the 393 FST_UnloadWordMap function 394 */ 395 wordmap_create(&wmap, num_chars, num_words, num_words_to_add); 396 397 /* later replace this with wordblocks, linked list of 8 character blocks! */ 398 while (pfgets(buf, MAX_LINE_LENGTH, fp)) 399 { 400 word = strtok(buf, " \n\r\t"); 401 wID_string = strtok(NULL, " \n\r\t"); 402 my_wID = (wordID)atoi_with_check(wID_string, MAXwordID); 403 404 // if(strstr(word,SLOTNAME_INDICATOR)) 405 // word = strstr(word,SLOTNAME_INDICATOR); 406 wm_wID = wordmap_add_word(wmap, word); 407 ASSERT(my_wID == wm_wID); 408 } 409 ASSERT(wmap->num_words == num_words); 410 411 for (my_wID = 1; my_wID < num_words; my_wID++) 412 { 413 if (!IS_SLOT_OLABEL(wmap->words[my_wID])) 414 break; 415 } 416 wmap->num_slots = my_wID; 417 wordmap_setbase(wmap); 418 *pwmap = wmap; 419 if(wmap->num_slots > MAX_NUM_SLOTS) 420 return FST_FAILED_ON_INVALID_ARGS; 421 else 422 return FST_SUCCESS; 423 } 424 425 int wordmap_destroy(wordmap** wmap) 426 { 427 if (wmap && *wmap) 428 { 429 wordmap_clean ( *wmap ); 430 if (((*wmap)->wordIDForWord)) PHashTableDestroy((*wmap)->wordIDForWord); 431 if (((*wmap)->chars)) FREE((*wmap)->chars); 432 if (((*wmap)->words)) FREE((*wmap)->words); 433 if ((*wmap)) FREE((*wmap)); 434 *wmap = 0; 435 } 436 return FST_SUCCESS; 437 } 438 439 int FST_UnloadWordMap(wordmap** wmap) 440 { 441 return wordmap_destroy(wmap); 442 } 443 444 445 int FST_DumpWordMap(PFile* fp, wordmap* wmap) 446 { 447 wordID my_wID; 448 for (my_wID = 0; my_wID < wmap->num_words; my_wID++) 449 { 450 pfprintf(fp, "%s %hu\n", wmap->words[my_wID], my_wID); 451 } 452 return FST_SUCCESS; 453 } 454 455 wordID wordmap_find_index(wordmap* wmap, const char* word) 456 { 457 void *wdID_void; 458 ESR_ReturnCode rc; 459 size_t i; 460 461 if (!word) 462 return MAXwordID; 463 464 if (wmap->num_words == 0) 465 return MAXwordID; 466 467 if (wmap->wordIDForWord) 468 { 469 rc = PHashTableGetValue(wmap->wordIDForWord, word, (void**)&wdID_void); 470 471 if (rc == ESR_SUCCESS) 472 return (wordID)(uintptr_t)wdID_void; 473 } 474 else 475 { 476 for (i = 0; i < wmap->num_words; i++) 477 { 478 if (!strcmp(wmap->words[i], word)) 479 { 480 return (wordID)i; 481 } 482 } 483 } 484 return MAXwordID; 485 } 486 487 wordID wordmap_find_rule_index(wordmap* wmap, const char* rule) 488 { 489 int i; 490 int strlen_rule = strlen(rule); 491 for (i = wmap->num_slots; --i > 0;) 492 { /* > (not >=) because eps is always 0 */ 493 char* p = strstr(wmap->words[i], SLOTNAME_INDICATOR); 494 if(p != NULL){ 495 if (!strcmp(p, rule)) break; 496 
else if(!strncmp(p,rule,strlen_rule) && !strcmp(p+strlen_rule,SLOTNAME_INDICATOR)) break; 497 } 498 } 499 if (i == 0) return MAXwordID; 500 else return(wordID)i; 501 } 502 503 wordID wordmap_find_index_in_rule(wordmap* wmap, const char* word, wordID rule) 504 { 505 int len = strlen(word); 506 int rule0 = rule + '0'; 507 void *wdID_void; 508 LCHAR word_dot_rule[256]; 509 ESR_ReturnCode rc; 510 511 if (!word) 512 return MAXwordID; 513 LSTRCPY(word_dot_rule, word); 514 word_dot_rule[len++] = IMPORTED_RULES_DELIM; 515 word_dot_rule[len++] = (char)rule0; 516 word_dot_rule[len++] = 0; 517 rc = PHashTableGetValue(wmap->wordIDForWord, word_dot_rule, (void**)&wdID_void); 518 if (rc == ESR_SUCCESS) 519 return (wordID)(uintptr_t)(wdID_void); 520 return MAXwordID; 521 } 522 523 int strlen_with_null(const char* word) 524 { 525 #if 1 /* NT */ 526 int len = strlen(word) + 1; 527 if (len % 2 == 1) len++; 528 return len; 529 #else /* C54 */ 530 #endif 531 } 532 533 int wordmap_whether_in_rule(wordmap* wmap, wordID word, wordID rule) 534 { 535 char* word_chars; 536 int len; 537 if (word > wmap->num_words) return 0; 538 word_chars = wmap->words[word]; 539 len = strlen(word_chars); 540 if (word_chars[len-1] == rule + '0' && word_chars[len-2] == IMPORTED_RULES_DELIM) 541 return 1; 542 else 543 return 0; 544 } 545 546 void wordmap_setbase(wordmap* wmap) 547 { 548 wmap->num_base_words = wmap->num_words; 549 wmap->next_base_chars = wmap->next_chars; 550 } 551 552 void wordmap_ceiling(wordmap* wmap) 553 { 554 /* this is irreversible */ 555 wmap->max_words = wmap->num_words; 556 wmap->max_chars = (wmap->next_chars - wmap->chars); 557 } 558 559 560 static ESR_ReturnCode wordmap_clean ( wordmap *word_map ) 561 { 562 ESR_ReturnCode clean_status; 563 PHashTableEntry *entry; 564 PHashTableEntry *oldEntry; 565 wordID *value; 566 567 clean_status = ESR_SUCCESS; 568 569 if ( word_map->wordIDForWord == NULL ) 570 return clean_status; 571 572 clean_status = PHashTableEntryGetFirst ( word_map->wordIDForWord, &entry); 573 574 while ( ( entry != NULL ) && ( clean_status == ESR_SUCCESS ) ) 575 { 576 clean_status = PHashTableEntryGetKeyValue ( entry, NULL, (void **)&value ); 577 578 if ( clean_status == ESR_SUCCESS ) 579 { 580 oldEntry = entry; 581 clean_status = PHashTableEntryAdvance ( &entry ); 582 583 if ( clean_status == ESR_SUCCESS ) 584 clean_status = PHashTableEntryRemove ( oldEntry ); 585 } 586 } 587 return ( clean_status ); 588 } 589 590 591 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words ) 592 { 593 ESR_ReturnCode populate_status; 594 wordID wdID; 595 596 populate_status = ESR_SUCCESS; 597 wdID = 0; 598 599 if ( word_map->wordIDForWord != NULL ) 600 { 601 while ( ( wdID < num_words ) && ( populate_status == ESR_SUCCESS ) ) 602 { 603 populate_status = PHashTablePutValue ( word_map->wordIDForWord, word_map->words[wdID], 604 (const void *)(uintptr_t)wdID, NULL ); 605 if ( populate_status == ESR_SUCCESS ) 606 wdID++; 607 else { 608 return (populate_status); 609 } 610 } 611 } 612 return ( populate_status ); 613 } 614 615 void wordmap_reset(wordmap* wmap) 616 { 617 char** tmp_words; 618 int i; 619 ESR_ReturnCode reset_status; 620 621 if (wmap->num_base_words < wmap->num_words) 622 { 623 /*wordID i = (wordID)(wmap->num_base_words);*/ 624 char* old_wmap_chars = wmap->chars; 625 int offset = wmap->next_base_chars - wmap->chars; 626 char* tmp_chars = NEW_ARRAY(char, offset, L("srec.g2g.graph.wordmap.chars")); 627 if(!tmp_chars) { 628 passert ( 0 && L("failed to reset the memory for wordmap.chars ") 
); 629 } 630 memcpy(tmp_chars,wmap->chars, offset * sizeof(char)); 631 FREE(wmap->chars); 632 633 wmap->chars = tmp_chars; 634 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap_chars); 635 wmap->max_chars = (wordID) offset; 636 wmap->next_chars = wmap->next_base_chars; 637 638 tmp_words = (char**) CALLOC_CLR(wmap->num_base_words, sizeof(char*), "srec.graph.wordmap.words"); 639 if(!tmp_words) { 640 passert ( 0 && L("failed to reset the memory for wordmap.words ") ); 641 } 642 memcpy( tmp_words, wmap->words, wmap->num_base_words * sizeof(char*)); 643 FREE( wmap->words); 644 645 wmap->words = tmp_words; 646 wmap->max_words = wmap->num_base_words; 647 wmap->num_words = wmap->num_base_words; 648 649 for(i=0; i<wmap->num_words; i++) 650 wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap_chars); 651 } 652 653 reset_status = wordmap_clean ( wmap ); 654 655 if ( reset_status == ESR_SUCCESS ) 656 { 657 reset_status = wordmap_populate ( wmap, wmap->num_base_words ); 658 659 if ( reset_status != ESR_SUCCESS ) 660 { 661 wordmap_clean ( wmap ); 662 passert ( 0 && L("wordmap_reset failed") ); 663 } 664 } 665 else 666 { 667 passert ( 0 && L("wordmap_reset failed") ); 668 } 669 } 670 671 #define FST_GROW_FACTOR 12/10 672 #define FST_GROW_MINCHARS 256 673 #define FST_GROW_MINWORDS 32 674 wordID wordmap_add_word(wordmap* wmap, const char* word) 675 { 676 int len = strlen_with_null(word); 677 wordID wdID =0; 678 679 if (wmap->next_chars + len >= wmap->chars + wmap->max_chars) 680 { 681 #if defined(FST_GROW_FACTOR) 682 uintptr_t i,tmp_max_chars= wmap->max_chars * FST_GROW_FACTOR; 683 char* old_wmap__chars = wmap->chars; 684 if(tmp_max_chars - wmap->max_chars < FST_GROW_MINCHARS) 685 tmp_max_chars +=FST_GROW_MINCHARS; 686 687 char* tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars")); 688 if(!tmp_chars) { 689 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.chars")); 690 return MAXwordID; 691 } 692 memcpy(tmp_chars,wmap->chars,wmap->max_chars * sizeof(char)); 693 FREE(wmap->chars); 694 695 wmap->chars = tmp_chars; 696 wmap->next_chars = wmap->chars + (wmap->next_chars - old_wmap__chars); 697 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap__chars); 698 wmap->max_chars = (wordID)tmp_max_chars; 699 //Remove old keys --- Add WORD 700 wordmap_clean (wmap ); 701 702 // adjust word pointers 703 for(i=0; i<wmap->num_words; i++) 704 { 705 wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap__chars); 706 // adjust hashtable --- 707 if (wmap->wordIDForWord) 708 { 709 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i], 710 (const void *)i, NULL ); 711 if ( rc != ESR_SUCCESS ) { 712 goto CLEANUP; 713 } 714 } 715 } 716 #else // so not defined(FST_GROW_FACTOR) 717 PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars); 718 return MAXwordID; 719 #endif // defined(FST_GROW_FACTOR) 720 } 721 //Add word 722 if (wmap->num_words == wmap->max_words) 723 { 724 #if defined(FST_GROW_FACTOR) 725 char** tmp_words; 726 wordID tmp_max_words; 727 int itmp_max_words = wmap->max_words * FST_GROW_FACTOR; 728 729 if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS) 730 itmp_max_words += FST_GROW_MINWORDS; 731 732 if( itmp_max_words >= MAXwordID) { 733 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words); 734 return MAXwordID; 735 } 736 tmp_max_words = (wordID)itmp_max_words; 737 tmp_words = (char**) 
CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words"); 738 if(!tmp_words) { 739 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.words")); 740 return MAXwordID; 741 } 742 memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*)); 743 FREE( wmap->words); 744 wmap->words = tmp_words; 745 wmap->max_words = tmp_max_words; 746 #else // so not defined(FST_GROW_FACTOR) 747 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words); 748 return MAXwordID; 749 #endif // defined(FST_GROW_FACTOR) 750 } 751 if(1) 752 { 753 strcpy(wmap->next_chars, word); 754 wmap->words[ wmap->num_words++] = wmap->next_chars; 755 wmap->next_chars += len; 756 wdID = (wordID)(wmap->num_words - (wordID)1); 757 if (wmap->wordIDForWord) 758 { 759 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID], 760 (const void *)(uintptr_t)wdID, NULL ); 761 if ( rc != ESR_SUCCESS ) 762 goto CLEANUP; 763 } 764 return wdID; 765 } 766 CLEANUP: 767 PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID); 768 return MAXwordID; 769 } 770 771 wordID wordmap_add_word_in_rule(wordmap* wmap, const char* word, wordID rule) 772 { 773 int len = strlen_with_null(word) + 2; 774 wordID wdID = 0; 775 wordID i; 776 //wordmap_add_word_in_rule 777 if (wmap->next_chars + len >= wmap->chars + wmap->max_chars) 778 { 779 #if defined(FST_GROW_FACTOR) 780 int tmp_max_chars = wmap->max_chars * FST_GROW_FACTOR; 781 char* tmp_chars; 782 char* old_next_chars = wmap->next_chars; 783 char* old_chars = wmap->chars; 784 785 if(tmp_max_chars-wmap->max_chars < FST_GROW_MINCHARS ) 786 tmp_max_chars += FST_GROW_MINCHARS; 787 if( wmap->chars + tmp_max_chars <= wmap->next_chars + len) 788 tmp_max_chars += len; 789 790 tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars")); 791 if(!tmp_chars) { 792 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_in_rule.chars")); 793 return MAXwordID; 794 } 795 memcpy(tmp_chars, wmap->chars, wmap->max_chars * sizeof(char)); 796 FREE(wmap->chars); 797 798 wmap->chars = tmp_chars; 799 wmap->next_chars = wmap->chars + (old_next_chars - old_chars) ; 800 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_chars); 801 wmap->max_chars = (wordID)tmp_max_chars; 802 803 //Remove old keys 804 wordmap_clean ( wmap ); 805 806 // adjust word pointers wordmap_add_word_in_rule 807 for(i=0; i<wmap->num_words; i++) 808 { 809 wmap->words[i] = wmap->chars +(wmap->words[i] - old_chars) ; 810 // adjust hashtable ----- add in rulewordmap_add_word_in_rule ---- 811 if (wmap->wordIDForWord) { 812 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i], 813 (void*)(uintptr_t)(i), NULL ); 814 if ( rc != ESR_SUCCESS ) 815 goto CLEANUP; 816 } 817 } 818 #else 819 PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars); 820 return MAXwordID; 821 #endif 822 }//wordmap_add_word_in_rule 823 if (wmap->num_words == wmap->max_words) 824 { 825 #if defined(FST_GROW_FACTOR) 826 char** tmp_words; 827 wordID tmp_max_words; 828 int itmp_max_words = wmap->max_words * FST_GROW_FACTOR; 829 if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS) 830 itmp_max_words += FST_GROW_MINWORDS; 831 832 if( itmp_max_words >= MAXwordID) { 833 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words); 834 return MAXwordID; 835 } 836 837 tmp_max_words = (wordID)itmp_max_words; 838 
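      /* Added note (not in the original source): as in wordmap_add_word() above,
         the word-pointer array is grown by FST_GROW_FACTOR (plus a minimum step),
         the existing pointers are copied across, and the old array is freed.
         Only the pointer array moves here; the strings themselves stay in
         wmap->chars, so the hash table keys remain valid and no re-hashing
         is needed in this branch. */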
tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words"); 839 if(!tmp_words) { 840 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_rule.words")); 841 return MAXwordID; 842 } 843 memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*)); 844 FREE( wmap->words); 845 wmap->words = tmp_words; 846 wmap->max_words = tmp_max_words; 847 #else 848 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words); 849 return MAXwordID; 850 #endif 851 } 852 if(1) 853 { 854 char *p; 855 const char *q; 856 /* word.1 */ 857 for (p = wmap->next_chars, q = word; (*p = *q) != '\0'; p++, q++) ; /* basic strcpy() */ 858 *p++ = IMPORTED_RULES_DELIM; 859 *p++ = (char)(rule + ((wordID)'0')); 860 *p = '\0'; 861 wmap->words[ wmap->num_words++] = wmap->next_chars; 862 wmap->next_chars += len; 863 wdID = (wordID)(wmap->num_words - (wordID)1); 864 if (wmap->wordIDForWord) 865 { 866 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID], 867 (void*)(uintptr_t)(wdID), NULL ); 868 if ( rc != ESR_SUCCESS ) 869 goto CLEANUP; 870 } 871 return wdID; 872 } 873 CLEANUP: 874 PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID); 875 return MAXwordID; 876 } 877 878 /*----------------------------------------------------------------------* 879 * * 880 * API * 881 * * 882 *----------------------------------------------------------------------*/ 883 884 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree) 885 { 886 int rc = 0; 887 unsigned int allotree__modelid; 888 889 fst->allotree = allophone_tree; 890 if( !allophone_tree) 891 return 1; 892 fst->hmm_info_for_ilabel = allophone_tree->hmm_infos - fst->hmm_ilabel_offset; 893 allotree__modelid = version_arbdata_models(allophone_tree); 894 if (allotree__modelid != 0 && fst->modelid != 0) 895 { 896 if (allotree__modelid != fst->modelid) 897 { 898 PLogError("Error: modelids disagree, sgcbaseline(%u) arbdata(%u)", fst->modelid, allotree__modelid); 899 rc = FST_FAILED_ON_INVALID_ARGS; 900 } 901 } 902 903 return rc; 904 } 905 906 int FST_LoadGraph(srec_context* pfst, wordmap* imap, wordmap* omap, 907 int num_words_to_add, PFile* fp) 908 { 909 int i, rc = 0; 910 char line[MAX_LINE_LENGTH], *args[32], nargs; 911 arcID max_num_FSMarcs; 912 arcID max_num_FSMnodes; 913 nodeID from_node, last_from_node, into_node, num_nodes, max_node_number = 0; 914 FSMarc_ptr atok = (FSMarc_ptr)0; 915 FSMarc *atoken = NULL; 916 FSMnode *fr_node, *to_node; 917 costdata cost = FREEcostdata; 918 arcID num_arcs; 919 920 srec_context* fst = pfst; 921 char *ilabel_str = 0, *olabel_str = 0; 922 long fpos; 923 arcID new_arc_id; 924 asr_int32_t temp; 925 926 927 /* determine number of arcs and nodes to allocate, add 50% for dynamic */ 928 fpos = pftell(fp); 929 max_num_FSMnodes = 0; 930 pfst->modelid = 0; 931 for (max_num_FSMarcs = 0; pfgets(line, sizeof(line) / sizeof(char), fp); max_num_FSMarcs++) 932 { 933 if (strstr(line, "modelid:") == line) 934 { 935 char *p; 936 pfst->modelid = strtoul(line + strlen("modelid:"), &p, 10); 937 } 938 from_node = (nodeID)atoi_with_check(line, MAXnodeID); 939 if (from_node > max_num_FSMnodes) 940 max_num_FSMnodes = from_node; 941 } 942 pfseek(fp, fpos, SEEK_SET); 943 temp = max_num_FSMnodes + 1 /*why+1?*/ + num_words_to_add * AVG_NODES_PER_WORD; 944 if (temp >= MAXnodeID) 945 { 946 max_num_FSMnodes = MAXnodeID - 1; 947 PLogMessage("Warning: using max nodes instead\n"); 948 } 949 else 950 { 951 max_num_FSMnodes 
= (nodeID)temp; 952 } 953 temp = max_num_FSMarcs + num_words_to_add * AVG_ARCS_PER_WORD; 954 if (temp >= MAXarcID) 955 { 956 max_num_FSMarcs = MAXarcID - 1; 957 PLogMessage("Warning: using max arcs instead\n"); 958 } 959 else 960 { 961 max_num_FSMarcs = (arcID)temp; 962 } 963 fst->olabels = omap; 964 if (imap) 965 { 966 /* generally no imap is specified */ 967 fst->ilabels = imap; 968 fst->hmm_ilabel_offset = wordmap_find_index(fst->ilabels, "hmm0"); 969 ASSERT(fst->hmm_ilabel_offset >= 0); 970 } 971 else 972 { 973 fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap"); 974 fst->ilabels->num_words = fst->ilabels->max_words = 0; 975 fst->ilabels->words = 0; 976 /* this is bad hard code, we can get this from the swiarb file, it is 977 equal to the number of phonemes (53 for enu, 39 for jpn) plus the special 978 models (eps,.wb,.ph,h#,#h,iwt,not) minus 1 ('cuz base number is 0) */ 979 fst->hmm_ilabel_offset = 128; /* should match MAX_PHONEMES */ 980 } 981 fst->FSMarc_list = (FSMarc*)CALLOC_CLR(max_num_FSMarcs, sizeof(FSMarc), "srec.graph.arcs"); 982 fst->FSMnode_list = (FSMnode*)CALLOC_CLR(max_num_FSMnodes, sizeof(FSMnode), "srec.graph.nodes"); 983 fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(max_num_FSMnodes, sizeof(char), "srec.graph.nodeinfos"); 984 985 /* setup the arc freelist */ 986 fst->FSMarc_freelist = 0; 987 fst->num_arcs = 0; 988 for (i = 0; i < max_num_FSMarcs - 1; i++) 989 { 990 fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1); 991 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE; 992 } 993 fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL; 994 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE; 995 996 /* initialize the nodes, 'cuz reading is random order */ 997 fst->num_nodes = 0; 998 for (i = 0; i < max_num_FSMnodes; i++) 999 { 1000 fr_node = &fst->FSMnode_list[i]; 1001 fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL; 1002 } 1003 1004 /* 1. first load up all the information */ 1005 IF_DEBUG_WDADD(printf("load graph ... 1\n")); 1006 num_arcs = 0; 1007 last_from_node = MAXnodeID; 1008 while (pfgets(line, sizeof(line) / sizeof(char), fp)) 1009 { 1010 if (strstr(line, "modelid:") == line) continue; 1011 IF_DEBUG_WDADD(printf("read arc %s", line)); 1012 nargs = split_line(line, args); 1013 if (nargs >= 4) 1014 { 1015 from_node = (nodeID)atoi_with_check(args[0], MAXnodeID); 1016 into_node = (nodeID)atoi_with_check(args[1], MAXnodeID); 1017 ilabel_str = args[2]; 1018 olabel_str = args[3]; 1019 cost = FREEcostdata; 1020 if (nargs == 5) 1021 PLogError(L("Warning: too many arguments on line %s"), line); 1022 } 1023 else if (nargs == 1) 1024 { 1025 from_node = (nodeID)atoi_with_check(args[0], MAXnodeID); 1026 into_node = MAXnodeID; 1027 ilabel_str = 0; 1028 olabel_str = 0; 1029 cost = FREEcostdata; 1030 } 1031 else 1032 { 1033 from_node = into_node = 0; 1034 PLogError("can't parse line %s\n", line); 1035 ASSERT(0); 1036 } 1037 1038 if (into_node == MAXnodeID) 1039 { 1040 fst->end_node = from_node; 1041 } 1042 else 1043 { 1044 new_arc_id = fst_get_free_arc(fst); 1045 if (new_arc_id == MAXarcID) 1046 return FST_FAILED_ON_MEMORY; 1047 atok = ARC_ItoX(new_arc_id); 1048 atoken = ARC_XtoP(atok); 1049 num_arcs++; 1050 fr_node = &fst->FSMnode_list[from_node]; 1051 to_node = &fst->FSMnode_list[into_node]; 1052 if (fst->ilabels->num_words == 0) 1053 { 1054 atoken->ilabel = hmm_number(ilabel_str, fst->hmm_ilabel_offset); 1055 /* Xufang: if olabel_str is a slotname, change the input label to a no-silence and no-eps HMM. 
             It is true that the weight of slot arcs is 999, so the hmm will not
             take effect at runtime, but it was found that when the slot is a loop,
             eps and silence can cause the recursive function
             fst_node_has_speech_to_come to never step out. */
          if (strstr(olabel_str, SLOTNAME_INDICATOR) != NULL)
            atoken->ilabel = fst->hmm_ilabel_offset + SLOTLOOP_OFFSET;
        }
        else
        {
          atoken->ilabel = wordmap_find_index(fst->ilabels, ilabel_str);
        }
        atoken->olabel = wordmap_find_index(fst->olabels, olabel_str);
        /* if(g_fst_options.wtw_cost_override>0) {
             if(atoken->ilabel == WORD_BOUNDARY)
               atoken->cost = g_fst_options.wtw_cost_override;
             else
               atoken->cost = g_fst_options.arc_cost_override;
           } else */
        atoken->cost = cost;
#if DEBUG_WDADD
        atoken->ilabel_str = (atoken->ilabel < fst->ilabels->num_words ? fst->ilabels->words[atoken->ilabel] : 0);
        atoken->olabel_str = (atoken->olabel < fst->olabels->num_words ? fst->olabels->words[atoken->olabel] : 0);
#endif
        append_arc_leaving_node(fst, fr_node, atok);
        if (into_node > max_node_number)
          max_node_number = into_node;
        append_arc_arriving_node(fst, to_node, atok);
        atoken->fr_node = NODE_ItoX(from_node);
        atoken->to_node = NODE_ItoX(into_node);
      }
      last_from_node = from_node;
  }
  ASSERT(fst->num_arcs == num_arcs);

  /* setup the node freelist */
  IF_DEBUG_WDADD(printf("load graph ... 6\n"));
  num_nodes = (nodeID)(max_node_number + 1);
  if (max_num_FSMnodes > num_nodes) {
    fst->FSMnode_freelist = num_nodes;
    for (i = num_nodes; i < (max_num_FSMnodes - 1); i++)
    {
      fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1);
      fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
    }
    if (i == (max_num_FSMnodes - 1)) {
      fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL;
      fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE;
    }
  } else
    fst->FSMnode_freelist = FSMNODE_NULL;

  /* some book-keeping stuff */
  IF_DEBUG_WDADD(printf("load graph ... 7\n"));
  fst->num_base_arcs = fst->num_arcs = num_arcs;
  fst->FSMarc_list_len = max_num_FSMarcs;
  fst->FSMnode_list_len = max_num_FSMnodes;
  fst->num_base_nodes = fst->num_nodes = num_nodes;
  fst->start_node = 0;
  /* fst->end_node = 0; this is set up above */

  /* beg and end word labels */
  fst->beg_silence_word = wordmap_find_index(fst->olabels, "-pau-");
  fst->end_silence_word = wordmap_find_index(fst->olabels, "-pau2-");
  fst->hack_silence_word = wordmap_find_index(fst->olabels, "silence");

  /* set the search limitations, real ones will be registered later */
  fst->max_searchable_nodes = 0;
  fst->max_searchable_arcs = 0;

  rc = fst_alloc_transit_points(fst);
  return (rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS);
}
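/* Added note (not in the original source): an illustrative sketch of the
   text FSM lines parsed by FST_LoadGraph() above (node numbers and labels
   are made up, not taken from a real grammar):

     modelid:12345
     0   1   hmm128   -pau-
     1   2   hmm200   hello
     2   3   .wb      eps
     3

   Lines with four fields describe an arc (from-node, to-node, input label,
   output label); when no ilabel wordmap is supplied the input label is an
   hmm/eps/.wb/.ph name decoded by hmm_number().  A line with a single node
   id marks the final node.  A fifth field triggers a warning and the arc
   cost is left at FREEcostdata. */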
int FST_UnloadGraph(srec_context* pfst)
{
  if (pfst->ilabels)
    FREE(pfst->ilabels);
  FREE(pfst->FSMarc_list);
  FREE(pfst->FSMnode_list);
  FREE(pfst->FSMnode_info_list);
  pfst->FSMarc_list = 0;
  pfst->FSMnode_list = 0;
  pfst->FSMnode_info_list = 0;
  return FST_SUCCESS;
}

int FST_DumpGraph(srec_context* fst, PFile* fp)
{
  int rc = 0;
  nodeID i;
  FSMarc_ptr atok;
  FSMarc* atoken;
  nodeID from_node, into_node;
  FSMnode* ntoken;
  char *ilabel, *olabel;

  for (i = 0; i < fst->num_nodes; i++)
  {
    from_node = i;
    ntoken = &fst->FSMnode_list[i];
    if (ntoken->first_prev_arc == FSMARC_FREE)
      continue;
    if (ntoken->un_ptr.first_next_arc != FSMARC_NULL)
    {
      for (atok = ntoken->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc)
      {
        char buf[32];
        atoken = ARC_XtoP(atok);
        into_node = NODE_XtoI(atoken->to_node);
        ilabel = fst->ilabels->num_words == 0 ?
                 hmm_name(atoken->ilabel, fst->hmm_ilabel_offset, buf) :
                 fst->ilabels->words[atoken->ilabel];
        olabel = fst->olabels->words[atoken->olabel];

        if (atoken->cost != FREEcostdata)
        {
          /* regular arc */
          pfprintf(fp, "%hu\t%hu\t%s\t%s\t%hu\n",
                   from_node, into_node, ilabel, olabel, atoken->cost);
        }
        else
        {
          /* regular zero cost arc */
          pfprintf(fp, "%hu\t%hu\t%s\t%s\n",
                   from_node, into_node, ilabel, olabel);
        }
      }
    }
    else
    {
      pfprintf(fp, "%hu\n", from_node);
    }
  }
  return rc;
}

int FST_AddWordToGrammar(srec_context* fst, const char* _slot,
                         const char* word, const char* pron,
                         const int cost)
{
  int num_pron_ampersands = 0, num_prons_added = 0, num_words_added = 0;
  int i, pron_len, model_sequence_len;
  modelID model_sequence[MAX_PRON_LEN];
  char phoneme_sequence[MAX_PRON_LEN];
  wordID olabel = MAXwordID;
  int irc, rc = FST_SUCCESS;
  nodeID start_node = MAXnodeID, end_node = MAXnodeID;
  char veslot[MAX_WORD_LEN];
#if USE_HMM_BASED_ENROLLMENT
  const char* Tpron;
#endif

  /* The addword from voice enrollment still uses '@', and to test voice
     enrollment, the '@' sign is removed and "__" is added at the beginning
     and end of the word.
Xufang */ 1205 if(_slot[0] == '@') { 1206 strcpy(veslot,SLOTNAME_INDICATOR); 1207 strcat(veslot,_slot+1); 1208 strcat(veslot,SLOTNAME_INDICATOR); 1209 } else 1210 strcpy(veslot, _slot); 1211 1212 #if USE_COMP_STATS 1213 if (!comp_stats) 1214 comp_stats = init_comp_stats1(); 1215 start_cs_clock1(&comp_stats->word_addition); 1216 #endif 1217 1218 /* we expect developers to call AddWord with the same slot many times, 1219 so we cache the slot start and end nodes and re-use on subseq calls */ 1220 if( fst->addWordCaching_lastslot_num == MAXwordID ) 1221 fst->addWordCaching_lastslot_name = NULL; 1222 else { 1223 fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num]; 1224 fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR); 1225 ASSERT( fst->addWordCaching_lastslot_name); 1226 } 1227 if( fst->addWordCaching_lastslot_name==NULL || strcmp( fst->addWordCaching_lastslot_name, veslot)) { 1228 1229 fst->addWordCaching_lastslot_num = wordmap_find_rule_index(fst->olabels, veslot); 1230 /* olabel not found is a user error */ 1231 if (fst->addWordCaching_lastslot_num == MAXwordID) 1232 { 1233 size_t i; 1234 1235 pfprintf(PSTDOUT, L("error: slot '%s' not found among ["), veslot); 1236 for (i = 1; i < (size_t) fst->olabels->num_slots; ++i) 1237 pfprintf(PSTDOUT, "%s, ", fst->olabels->words[i]); 1238 pfprintf(PSTDOUT, L("] possible\n")); 1239 rc = FST_FAILED_ON_INVALID_ARGS; 1240 goto RETRC; 1241 } 1242 1243 fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num]; 1244 fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR); 1245 ASSERT(fst->addWordCaching_lastslot_name); 1246 1247 /* now find where in the graph this slot is referenced, note that 1248 there might be more than one place, but here we ignore multiples */ 1249 for (i = fst->num_fsm_exit_points; --i >= 0;) 1250 { 1251 arcID arcid = fst->fsm_exit_points[i].arc_index; 1252 if (fst->FSMarc_list[arcid].olabel == fst->addWordCaching_lastslot_num) 1253 { 1254 FSMarc* arc; 1255 FSMnode* node; 1256 start_node = fst->fsm_exit_points[i].from_node_index; 1257 end_node = fst->fsm_exit_points[i].wbto_node_index; 1258 fst->addWordCaching_lastslot_needs_post_silence = ESR_TRUE; 1259 fst->addWordCaching_lastslot_ifsm_exit_point = i; 1260 node = &fst->FSMnode_list[ end_node]; 1261 arc = &fst->FSMarc_list[node->un_ptr.first_next_arc]; 1262 if (arc->olabel == fst->end_silence_word && arc->linkl_next_arc == MAXarcID) 1263 fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE; 1264 break; 1265 } 1266 } 1267 1268 /* not found in the graph is an internal error, coding error */ 1269 if (i < 0 || start_node>=fst->num_nodes || end_node>=fst->num_nodes) 1270 { 1271 PLogError("error: (internal) finding olabel %d %d %d\n", fst->addWordCaching_lastslot_num, 1272 start_node, end_node); 1273 goto RETRC; 1274 } 1275 } /* cached or not */ 1276 1277 i = fst->addWordCaching_lastslot_ifsm_exit_point; 1278 start_node = fst->fsm_exit_points[i].from_node_index; 1279 end_node = fst->fsm_exit_points[i].wbto_node_index; 1280 1281 /* now start_node and end_node are known */ 1282 if (!word || !*word || !pron || !*pron) 1283 { 1284 rc = FST_FAILED_ON_INVALID_ARGS; 1285 PLogError("error: null word/pron on input to FST_AddWordToGrammar()\n"); 1286 goto RETRC; 1287 } 1288 1289 /* from here */ 1290 IF_DEBUG_WDADD(printf("Adding %s %s\n", word, (const char*)pron)); 1291 1292 /* loop over all prons, we break when we hit the double-null 
(hence the +1) */ 1293 for( ; (*pron)!='\0'; pron+=(pron_len+1)) { 1294 pron_len = strlen((char*)pron); 1295 if (pron_len >= MAX_PRON_LEN) 1296 { 1297 PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron); 1298 rc = FST_FAILED_ON_INVALID_ARGS; 1299 goto RETRC; 1300 } 1301 1302 /* check for searchability after adding, estimating at most pron_len 1303 arcs and nodes will be added */ 1304 if (fst->num_arcs + pron_len > fst->max_searchable_arcs) 1305 { 1306 PLogError("error: wordadd failed on word %s due to %d arc search limit\n", 1307 word, fst->max_searchable_arcs); 1308 rc = FST_FAILED_ON_MEMORY; 1309 goto RETRC; 1310 } 1311 1312 if (fst->num_nodes + pron_len > fst->max_searchable_nodes) 1313 { 1314 PLogError("error: wordadd failed on word %s due to %d node search limit\n", 1315 word, fst->max_searchable_nodes); 1316 rc = FST_FAILED_ON_MEMORY; 1317 goto RETRC; 1318 } 1319 1320 /* add the word if necessary, !FST_SUCCESS is allowed if we're just adding 1321 an alternate pronunciation */ 1322 olabel = wordmap_find_index_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num); 1323 if (olabel == MAXwordID) 1324 { 1325 olabel = wordmap_add_word_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num); 1326 if (olabel != MAXwordID) 1327 num_words_added++; 1328 } 1329 if (olabel == MAXwordID) 1330 { 1331 PLogError("error: wordmap_add_word failed\n"); 1332 rc = FST_FAILED_ON_MEMORY; 1333 goto RETRC; 1334 } 1335 1336 #if USE_HMM_BASED_ENROLLMENT 1337 // pron should be converted to model_sequence, model_sequence_len 1338 #define VETRANS_PREFIX "wd_hmm" 1339 #define VETRANS_PREFIX_LEN 6 1340 if( LSTRSTR(pron, VETRANS_PREFIX)!=NULL) { 1341 // printf("Debug-pron: %d, %s\n",pron_len, pron); 1342 model_sequence_len=0; 1343 for(Tpron=pron; (Tpron=strstr(Tpron, VETRANS_PREFIX))!=NULL; ){ 1344 // Tpron = strstr(Tpron, "wd_hmm"); 1345 Tpron += VETRANS_PREFIX_LEN; // skip over "wd_hmm" 1346 // Copy hmm number (56,132,...) 1347 model_sequence[ model_sequence_len] = atoi_with_check(Tpron, MAXmodelID); 1348 model_sequence_len++; 1349 } 1350 1351 1352 /* we need to append silence at the end since we might be dealing with a slot 1353 that has ensuing speech, formerly we only dealt with ROOT which defacto 1354 was followed by the pau2 silence, so ROOT did not need its own */ 1355 if (fst->addWordCaching_lastslot_needs_post_silence) 1356 model_sequence[model_sequence_len++] = 3; // <<< hmm3_sil !!! 
                                              // ugly hard-code here

      /* append the word boundary */
      model_sequence[model_sequence_len++] = WORD_BOUNDARY;
      /* translate to input label ids */
      for (i = 0; i < model_sequence_len; i++)
      {
        if (model_sequence[i] >= EPSILON_OFFSET)
          model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
      }
    }
    else
#endif
    {
      pron_len = strlen((char*)pron);
      if (pron_len >= MAX_PRON_LEN)
      {
        PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron);
        rc = FST_FAILED_ON_INVALID_ARGS;
        goto RETRC;
      }

      for (i = 0; i < pron_len; i++)
      {
        if (pron[i] == OPTSILENCE_CODE)
        {
          phoneme_sequence[i] = SILENCE_CODE;
          num_pron_ampersands++;
        }
        else
          phoneme_sequence[i] = pron[i];
      }
      /* we need to append silence at the end since we might be dealing with a slot
         that has ensuing speech; formerly we only dealt with ROOT, which de facto
         was followed by the pau2 silence, so ROOT did not need its own */
      if (fst->addWordCaching_lastslot_needs_post_silence)
        phoneme_sequence[i++] = SILENCE_CODE;

      model_sequence_len = i;
      irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
      /* check for bad sequence of phonemes */
      if (irc)
      {
        PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, irc);
        rc = FST_FAILED_ON_INVALID_ARGS;
        goto RETRC;
      }
      IF_DEBUG_WDADD(printf("model_sequence ...\n"));

      /* append the word boundary */
      model_sequence[model_sequence_len++] = WORD_BOUNDARY;
      /* translate to input label ids */
      for (i = 0; i < model_sequence_len; i++)
      {
        if (model_sequence[i] >= EPSILON_OFFSET)
          model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
      }
    } /* end of ph_t ph_r .. type decoding of the model sequence */

    /* now do the actual addition */
    rc = fst_add_arcs(fst, start_node, end_node, olabel, (costdata)cost, model_sequence, model_sequence_len);
    if (rc == FST_SUCCESS)
    {
      num_prons_added++;
    }
    else if (rc == FST_FAILED_ON_HOMONYM)
    {
      /* maybe another pron will work? */
    }
    else
    {
      PLogMessage("error: fst_add_arcs() failed adding word %s pron %s ('&' as 'iwt')\n", word, (char*)pron);
      goto RETRC;
    }

    /* second, add the pron with no silences;
       this only applies to true prons, not wd_hmm333 wd_hmm222 type prons */

    if (num_pron_ampersands > 0)
    {
      for (i = 0, model_sequence_len = 0; i < pron_len; i++)
      {
        if (pron[i] != OPTSILENCE_CODE)
          phoneme_sequence[model_sequence_len++] = pron[i];
      }

      irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence);
      /* check for bad sequence of phonemes */
      if (irc)
      {
        PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, irc);
        rc = FST_FAILED_ON_INVALID_ARGS;
        goto RETRC;
      }
      else
      {
        IF_DEBUG_WDADD(printf("model_sequence ...\n"));

        /* append the word boundary */
        model_sequence[model_sequence_len++] = WORD_BOUNDARY;
        /* translate to input label ids */
        for (i = 0; i < model_sequence_len; i++)
        {
          if (model_sequence[i] >= EPSILON_OFFSET)
            model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset);
        }
        /* now do the actual addition */
        rc = fst_add_arcs(fst, start_node, end_node,
                          olabel, (costdata)cost, model_sequence, model_sequence_len);

        if (rc == FST_SUCCESS)
        {
          num_prons_added++;
        }
        else if (rc == FST_FAILED_ON_HOMONYM)
        {
          /* maybe another pron worked? */
        }
        else
        {
          PLogMessage("Warning: fst_add_arcs() failed while adding "
                      "word %s pron %s (skip '&')\n", word, (char*)pron);
          goto RETRC;
        }
      }
    }
  }


RETRC:
  /* set this to make sure that FST_Prepare gets called after add word */
  fst->whether_prepared = 0;
#if USE_COMP_STATS
  end_cs_clock1(&comp_stats->word_addition, 1);
#endif

  if (rc < 0 && rc != FST_FAILED_ON_HOMONYM)
    return rc;

  if (num_prons_added == 0 && num_words_added > 0)
  {
    return FST_FAILED_ON_HOMONYM;
  }
  else if (num_prons_added == 0 && num_words_added == 0)
  {
    return FST_FAILED_ON_HOMOGRAPH;
  }
  else if (num_prons_added > 0 && num_words_added == 0)
  {
    return FST_SUCCESS_ON_OLD_WORD;
  }
  else
  {
    /* if(num_prons_added>0 && num_words_added>0) */
    return FST_SUCCESS;
  }
}
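/* Added note (not in the original source): a minimal usage sketch for
   FST_AddWordToGrammar(), with a hypothetical slot name, word and
   pronunciation string; pron is a NUL-separated, double-NUL terminated list
   of pronunciations, as consumed by the loop above:

     const char pron[] = "hxlo&\0";   // one pron; the array init supplies the 2nd NUL
     int rc = FST_AddWordToGrammar(fst, "@Names", "hello", pron, 100);
     if (rc == FST_SUCCESS || rc == FST_SUCCESS_ON_OLD_WORD)
       rc = FST_PrepareContext(fst);  // re-prepare the context before searching again

   FST_SUCCESS_ON_OLD_WORD means a new pronunciation was attached to a word
   that was already in the grammar; FST_FAILED_ON_HOMONYM means no new
   pronunciation could be added for the word. */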
/* remove arcs leaving a node; arcs added via word add are just discarded,
   to be cleaned up for re-use by other functions */
void remove_added_arcs_leaving(srec_context* fst, nodeID ni)
{
  FSMnode* node = &fst->FSMnode_list[ni];
  FSMarc *arc = NULL, *arc2;
  arcID *pai, ai, ai2;
  for (pai = &node->un_ptr.first_next_arc, ai = (*pai); ai != MAXarcID; pai = &arc->linkl_next_arc, ai = (*pai))
  {
    if (ai < fst->num_base_arcs)
    {
      arc = &fst->FSMarc_list[ai];
    }
    else
    {
      arc2 = &fst->FSMarc_list[ai];
      for (ai2 = arc2->linkl_next_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
           ai2 = arc2->linkl_next_arc)
      {
        arc2 = &fst->FSMarc_list[ai2];
      }
      *pai = ai2;
    }
  }
}

/* remove arcs arriving at a node; arcs added via word add are just discarded,
   to be cleaned up for re-use by other functions */
void remove_added_arcs_arriving(srec_context* fst, nodeID ni)
{
  FSMnode* node = &fst->FSMnode_list[ni];
  FSMarc *arc = NULL, *arc2;
  arcID *pai, ai, ai2;
  for (pai = &node->first_prev_arc, ai = (*pai); ai != MAXarcID;
       pai = &arc->linkl_prev_arc, ai = (*pai))
  {
    if (ai < fst->num_base_arcs)
    {
      arc = &fst->FSMarc_list[ai];
    }
    else
    {
      arc2 = &fst->FSMarc_list[ai];
      for (ai2 = arc2->linkl_prev_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID;
           ai2 = arc2->linkl_prev_arc)
      {
        arc2 = &fst->FSMarc_list[ai2];
      }
      *pai = ai2;
    }
  }
}

/* reset grammar, by resetting all the arcs, nodes and words added
   via the addword functions */

int FST_ResetGrammar(srec_context* fst)
{
  int i, rc = 0;
  nodeID fst_slot_start_node, fst_slot_end_node;
  wordID fst_slot_slotnum;
  arcID ai;
  nodeID ni2, ni3, ni4, ni3a;
  FSMnode_t *node, *node2, *node3;
  FSMarc_t *arc, *arc2, *arc3;

  /*fst_slot_slotnum = wordmap_find_rule_index(fst->olabels, slot);*/
  for (fst_slot_slotnum = 1; fst_slot_slotnum < fst->olabels->num_slots;
       fst_slot_slotnum++)
  {

    if (fst_slot_slotnum == MAXwordID)
    {
      char *slot = "";
      PLogError("error: slot '%s' not found among [%d,%d] possible\n", slot, 1, fst->olabels->num_slots - 1);
      return (rc = FST_FAILED_ON_INVALID_ARGS);
    }

    /* now find where in the graph this slot is referenced; note that
       there might be more than one place, but here we ignore multiples */
    fst_slot_start_node = MAXnodeID;
    fst_slot_end_node = MAXnodeID;
    for (i = fst->num_fsm_exit_points; --i >= 0;)
    {
      ai = fst->fsm_exit_points[i].arc_index;
      if (fst->FSMarc_list[ai].olabel == fst_slot_slotnum)
      {
        fst_slot_start_node = fst->fsm_exit_points[i].from_node_index;
        fst_slot_end_node = fst->fsm_exit_points[i].wbto_node_index;
      }
    }

    /* this 'slot' could be the root rule, which can't be removed */
    if (fst_slot_start_node == MAXnodeID || fst_slot_end_node == MAXnodeID)
      continue;

    /*
       start                              end
       node      node2        node3       node4
        o---@N--->o----sil---->o----wb---->o
           arc       arc2      |   arc3    ^
                               |           |wb
                               \____sil___>o
                                    arc4   ni3a
    */

    remove_added_arcs_leaving(fst, fst_slot_start_node);
    node = &fst->FSMnode_list[fst_slot_start_node];
    for (ai = node->un_ptr.first_next_arc; ai != MAXarcID; ai = arc->linkl_next_arc)
    {
      arc = &fst->FSMarc_list[ai];
      if (arc->olabel != fst_slot_slotnum)
        continue;

      ni2 = arc->to_node;
      remove_added_arcs_arriving(fst, ni2);
      if (ni2 == fst_slot_end_node)
        continue;
      node2 = &fst->FSMnode_list[ni2];
      arc2 = &fst->FSMarc_list[node2->un_ptr.first_next_arc];


      ni3 = arc2->to_node;
      remove_added_arcs_arriving(fst, ni3);
      if (ni3 == fst_slot_end_node)
        continue;
      node3 = &fst->FSMnode_list[ni3];
      arc3 = &fst->FSMarc_list[node3->un_ptr.first_next_arc];
      while (arc3->linkl_next_arc != MAXarcID)
      {
        arc3 = &fst->FSMarc_list[arc3->linkl_next_arc];
        ni3a = arc3->to_node;
        remove_added_arcs_arriving(fst, ni3a);
      }
      arc3 = &fst->FSMarc_list[node3->un_ptr.first_next_arc];

      ni4 = arc3->to_node;
      remove_added_arcs_arriving(fst, ni4);
      ASSERT(ni4 == fst_slot_end_node);
      if (ni4 == fst_slot_end_node)
        continue;
    }
  }

  /* reset the freelist for nodes */
  if (fst->num_nodes == fst->num_base_nodes)
  {}
  else {
    FSMnode *ntoken;
    FSMnode* tmp_FSMnode_list;
    FSMnode_info* tmp_FSMnode_info_list;

    fst->FSMnode_freelist = MAXnodeID;
    fst->num_nodes = fst->FSMnode_list_len = fst->num_base_nodes;

    tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes");
    if (!tmp_FSMnode_list) {
      PLogError("ERROR: Could NOT reset the memory for srec.graph.nodes");
      return FST_FAILED_ON_MEMORY;
    }
    memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode));

    /* scan to the end of the free list (to be re-used) */
    nodeID* last_free_node = (&fst->FSMnode_freelist);
    ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node];

    for ( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node)
      ntoken = &tmp_FSMnode_list[ *last_free_node];

    FREE( fst->FSMnode_list);
    tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos");
    if (!tmp_FSMnode_info_list) {
      PLogError("ERROR: Could NOT reset the memory for srec.graph.nodeinfos");
      return FST_FAILED_ON_MEMORY;
    }
    // copy in node info
    memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info));


    FREE( fst->FSMnode_info_list);
    fst->FSMnode_info_list = tmp_FSMnode_info_list;
    fst->FSMnode_list = tmp_FSMnode_list;
  }
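  /* Added note (not in the original source): the reset above discards every
     node that was added after the base graph was loaded, by shrinking the
     node (and node-info) arrays back to num_base_nodes; the node free list
     is left empty (MAXnodeID) because no dynamically added nodes survive.
     The arc array receives the same treatment below. */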
  /*ni = fst->FSMnode_freelist = fst->num_base_nodes;
    node = &fst->FSMnode_list[ni];
    for (; ni < fst->FSMnode_list_len - 1; ni++, node++)
      node->un_ptr.next_node = (nodeID)(ni + 1);
    node->un_ptr.next_node = MAXnodeID;
    fst->num_nodes = fst->num_base_nodes;*/

  /* reset the freelist for arcs */
  if (fst->num_arcs == fst->num_base_arcs)
  {}
  else {
    FSMarc* atoken = NULL;
    FSMarc* tmp_FSMarc_list;

    fst->num_arcs = fst->num_base_arcs;
    fst->FSMarc_list_len = fst->num_base_arcs;
    fst->FSMarc_freelist = MAXarcID;
    tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs");
    if (!tmp_FSMarc_list) {
      PLogError("ERROR: Could NOT reset the memory for srec.graph.arcs");
      return FST_FAILED_ON_MEMORY;
    }
    memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc));

    /* scan to the end of the free list (to be re-used) */
    arcID* last_free_arc = &fst->FSMarc_freelist;
    atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc];

    for ( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc)
      atoken = &tmp_FSMarc_list[ *last_free_arc];

    FREE( fst->FSMarc_list);
    fst->FSMarc_list = tmp_FSMarc_list;
  }

  /*ai = fst->FSMarc_freelist = fst->num_base_arcs;
    arc = &fst->FSMarc_list[ai];
    for (; ai < fst->FSMarc_list_len - 1; ai++, arc++)
      arc->linkl_next_arc = (arcID)(ai + 1);
    arc->linkl_next_arc = MAXarcID;
    fst->num_arcs = fst->num_base_arcs;
    fst->FSMarc_list_len = fst->num_base_arcs;*/

  /* now remove all the added words */
  wordmap_reset(fst->olabels);
  return FST_SUCCESS;
}

/* See the description of arc_tokens in the header file.  These
   arcs are nodeless, which makes loading from a file that
   references nodes a little harder.  We need to do it in stages.
   Later, when we load from a binary image it should be much simpler.
1750 */ 1751 1752 arc_token_lnk get_first_arc_leaving_node(arc_token* arc_token_list, 1753 arcID num_arcs, 1754 nodeID node) 1755 { 1756 arcID i; 1757 for (i = 0; i < num_arcs; i++) 1758 { 1759 if ((nodeID)(int)arc_token_list[i].next_token_index == node) 1760 return ARC_TOKEN_LNK(arc_token_list, i); 1761 } 1762 return ARC_TOKEN_NULL; 1763 } 1764 1765 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add, PFile* fp) 1766 { 1767 arcID i; 1768 char line[MAX_LINE_LENGTH]; 1769 char word_label_as_str[128]; 1770 arcID num_alloc_arc_tokens; 1771 nodeID from_node, into_node; 1772 labelID word_label = MAXwordID; 1773 arc_token *atoken, *last_atoken; 1774 costdata cost; 1775 arcID num_arcs; 1776 arc_token *arc_token_list, *tmp; 1777 long fpos; 1778 1779 /* determine number of word arcs to allocate */ 1780 fpos = pftell(fp); 1781 for (num_arcs = 0; pfgets(line, MAX_LINE_LENGTH, fp); num_arcs++) ; 1782 num_alloc_arc_tokens = num_arcs; 1783 num_alloc_arc_tokens = (arcID)(num_arcs + num_words_to_add); 1784 pfseek(fp, fpos, SEEK_SET); 1785 1786 context->arc_token_list = (arc_token*)CALLOC_CLR(num_alloc_arc_tokens, sizeof(arc_token), "srec.graph.wordgraph"); 1787 arc_token_list = context->arc_token_list; 1788 1789 /* 1. first load up all the information */ 1790 i = 0; 1791 while (pfgets(line, MAX_LINE_LENGTH, fp)) 1792 { 1793 if (sscanf(line, "%hu\t%hu\t%s", &from_node, &into_node, word_label_as_str) == 3) 1794 { 1795 word_label = wordmap_find_index(context->olabels, word_label_as_str); 1796 // ASSERT(word_label >= 0); 1797 cost = FREEcostdata; 1798 } 1799 else if (sscanf(line, "%hu", &from_node) == 1) 1800 { 1801 into_node = MAXnodeID; 1802 word_label = MAXwordID; 1803 cost = FREEcostdata; 1804 } 1805 else 1806 { 1807 PLogError("FST_LoadReverseWordGraph() .. can't parse line %s\n", line); 1808 ASSERT(0); 1809 } 1810 atoken = &arc_token_list[i]; 1811 i++; 1812 atoken->ilabel = word_label; 1813 /* atoken->olabel = WORD_EPSILON_LABEL; */ 1814 /*atoken->cost = cost; cost is not used for now */ 1815 #if DEBUG_ASTAR 1816 atoken->label = (word_label == MAXwordID ? 0 : wmap->words[word_label]); 1817 #endif 1818 atoken->first_next_arc = (arc_token_lnk)into_node; 1819 atoken->next_token_index = (arc_token_lnk)from_node; 1820 } 1821 num_arcs = i; 1822 1823 /* 2. now do the internal cross references */ 1824 for (i = 0; i < num_arcs; i++) 1825 { 1826 atoken = &arc_token_list[i]; 1827 into_node = (nodeID)atoken->first_next_arc; 1828 if (into_node == MAXnodeID) 1829 atoken->first_next_arc = ARC_TOKEN_NULL; 1830 else 1831 atoken->first_next_arc = get_first_arc_leaving_node(arc_token_list, num_arcs, (nodeID)atoken->first_next_arc); 1832 } 1833 1834 /* 3. 
now do more internal cross refs */ 1835 last_atoken = &arc_token_list[0]; 1836 for (i = 1; i < num_arcs; i++) 1837 { 1838 atoken = &arc_token_list[i]; 1839 if (atoken->next_token_index != last_atoken->next_token_index) 1840 { 1841 last_atoken->next_token_index = ARC_TOKEN_NULL; 1842 } 1843 else 1844 { 1845 last_atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i); 1846 } 1847 last_atoken = atoken; 1848 } 1849 last_atoken->next_token_index = ARC_TOKEN_NULL; 1850 1851 #if DEBUG_ASTAR 1852 /* under debug, it's nice to be able to see the words leaving the 1853 destination node, they are stored sequentially in the debug ary */ 1854 for (i = 0; i < num_arcs; i++) 1855 { 1856 char * p; 1857 atoken = &arc_token_list[i]; 1858 atoken->debug[0] = 0; 1859 for (tmp = ARC_TOKEN_LNK(arc_token_list, atoken->first_next_arc); tmp != NULL; 1860 tmp = ARC_TOKEN_PTR(arc_token_list, tmp->next_token_index)) 1861 { 1862 if (tmp->first_next_arc == ARC_TOKEN_NULL) 1863 { 1864 p = "END"; 1865 } 1866 else if (!tmp->label) 1867 { 1868 p = "NULL"; 1869 } 1870 else 1871 { 1872 p = tmp->label; 1873 } 1874 if (strlen(atoken->debug) + strlen(p) + 6 < 64) 1875 { 1876 strcat(atoken->debug, p); 1877 strcat(atoken->debug, " "); 1878 } 1879 else 1880 { 1881 strcat(atoken->debug, "..."); 1882 break; 1883 } 1884 } 1885 } 1886 #endif 1887 context->arc_token_list_len = num_alloc_arc_tokens; 1888 if (num_alloc_arc_tokens > num_arcs) 1889 { 1890 atoken = &context->arc_token_list[num_arcs]; 1891 for (i = num_arcs; i < num_alloc_arc_tokens; i++, atoken++) 1892 { 1893 atoken->first_next_arc = ARC_TOKEN_NULL; 1894 atoken->ilabel = MAXwordID; 1895 atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i + 1);/*atoken+1;*/ 1896 } 1897 atoken--; 1898 atoken->next_token_index = ARC_TOKEN_NULL; 1899 context->arc_token_freelist = &context->arc_token_list[num_arcs]; 1900 } 1901 else 1902 { 1903 context->arc_token_freelist = NULL; 1904 } 1905 /* the insert start is the arc that goes to the end */ 1906 atoken = context->arc_token_list; 1907 atoken = ARC_TOKEN_PTR(arc_token_list, atoken->first_next_arc); 1908 for (; atoken; atoken = ARC_TOKEN_PTR(arc_token_list, atoken->next_token_index)) 1909 { 1910 if (atoken->first_next_arc == ARC_TOKEN_NULL) 1911 break; 1912 tmp = ARC_TOKEN_PTR(context->arc_token_list, atoken->first_next_arc); 1913 if (tmp->ilabel == MAXwordID && tmp->first_next_arc == ARC_TOKEN_NULL) 1914 break; 1915 } 1916 context->arc_token_insert_start = atoken; 1917 1918 return FST_SUCCESS; 1919 } 1920 1921 int FST_UnloadReverseWordGraph(srec_context* context) 1922 { 1923 FREE(context->arc_token_list); 1924 context->arc_token_list = 0; 1925 return FST_SUCCESS; 1926 } 1927 1928 /*----------------------------------------------------------------------* 1929 * * 1930 * general * 1931 * * 1932 *----------------------------------------------------------------------*/ 1933 1934 char split_line(char* line, char** av) 1935 { 1936 int i = 0; 1937 av[i] = strtok(line, "\n\r\t "); 1938 for (; av[i];) 1939 { 1940 av[++i] = strtok(NULL, "\n\r\t "); 1941 } 1942 return i; 1943 } 1944 1945 asr_int32_t atoi_with_check(const char* buf, asr_int32_t mymax) 1946 { 1947 asr_int32_t ans = atol(buf); 1948 ASSERT(ans < mymax); 1949 return ans; 1950 } 1951 1952 /*----------------------------------------------------------------------* 1953 * * 1954 * fst * 1955 * * 1956 *----------------------------------------------------------------------*/ 1957 1958 arcID fst_get_free_arc(srec_context* fst) 1959 { 1960 FSMarc* atoken; 1961 arcID atokid = 
fst->FSMarc_freelist; 1962 if (atokid == MAXarcID) 1963 { 1964 PLogError("error: ran out of arcs\n"); 1965 return atokid; 1966 } 1967 atoken = &fst->FSMarc_list[atokid]; 1968 fst->FSMarc_freelist = ARC_XtoI(atoken->linkl_next_arc); 1969 memset(atoken, 0, sizeof(FSMarc)); 1970 atoken->to_node = FSMNODE_NULL; 1971 atoken->fr_node = FSMNODE_NULL; 1972 atoken->linkl_next_arc = FSMARC_NULL; 1973 atoken->linkl_prev_arc = FSMARC_NULL; 1974 atoken->ilabel = 0; 1975 atoken->olabel = 0; 1976 atoken->cost = 0; 1977 fst->num_arcs++; 1978 return atokid; 1979 } 1980 nodeID fst_get_free_node(srec_context* fst) 1981 { 1982 FSMnode* ntoken; 1983 nodeID ntokid = fst->FSMnode_freelist; 1984 if (ntokid == MAXnodeID) 1985 { 1986 PLogError("error: ran out of nodes\n"); 1987 return ntokid; 1988 } 1989 ntoken = &fst->FSMnode_list[ntokid]; 1990 fst->FSMnode_freelist = NODE_XtoI(ntoken->un_ptr.next_node); 1991 ntoken->un_ptr.first_next_arc = FSMARC_NULL; 1992 ntoken->first_prev_arc = FSMARC_NULL; 1993 /* ASSERT( (int)(ntok - fst->FSMnode_list) == fst->num_nodes); */ 1994 fst->num_nodes++; 1995 return ntokid; 1996 } 1997 1998 int fst_free_node(srec_context* fst, FSMnode* node) 1999 { 2000 IF_DEBUG_WDADD(printf_node(fst, "freeing: ", node)); 2001 node->first_prev_arc = FSMARC_NULL; 2002 node->un_ptr.next_node = fst->FSMnode_freelist; 2003 node->first_prev_arc = FSMARC_FREE; 2004 fst->FSMnode_freelist = (nodeID)(node - fst->FSMnode_list); 2005 /* fst->num_nodes--; not allowed unless we compress! */ 2006 return 0; 2007 } 2008 2009 int fst_free_arc(srec_context* fst, FSMarc* arc) 2010 { 2011 IF_DEBUG_WDADD(printf_arc(fst, "freeing: ", arc)); 2012 arc->linkl_prev_arc = FSMARC_FREE; 2013 arc->linkl_next_arc = ARC_ItoX(fst->FSMarc_freelist); 2014 arc->cost = 2; 2015 arc->ilabel = 0; 2016 arc->olabel = 0; 2017 IF_DEBUG_WDADD(arc->ilabel_str = 0); 2018 IF_DEBUG_WDADD(arc->olabel_str = 0); 2019 fst->FSMarc_freelist = (arcID)(arc - fst->FSMarc_list); 2020 fst->num_arcs--; 2021 return 0; 2022 } 2023 2024 int fst_free_arc_net(srec_context* fst, FSMarc* arc) 2025 { 2026 if (!arc) 2027 return 0; 2028 /* need to traverse the arc, get the 'from' nodes, 2029 then traverse the from nodes and kill arcs */ 2030 /* we'll need this after an error condition */ 2031 return 0; 2032 } 2033 2034 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc) 2035 { 2036 FSMarc_ptr atok; 2037 FSMarc* atoken; 2038 FSMnode_ptr ntok = arc->to_node; 2039 2040 /* look for all arcs leaving this arc, and mark them 2041 normally there is only one, except for multipron words */ 2042 for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2043 { 2044 atoken = ARC_XtoP(atok); 2045 /* cannot have multiple olabels before a boundary */ 2046 if (atoken->olabel != WORD_EPSILON_LABEL) 2047 return FST_FAILED_INTERNAL; 2048 atoken->olabel = arc->olabel; 2049 #if DO_WEIGHTED_ADDWORD 2050 atoken->cost = (costdata)(atoken->cost + arc->cost); 2051 #endif 2052 IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str); 2053 } 2054 /* unlabel this arc */ 2055 arc->olabel = WORD_EPSILON_LABEL; 2056 #if DO_WEIGHTED_ADDWORD 2057 arc->cost = FREEcostdata; 2058 #endif 2059 IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]); 2060 return FST_SUCCESS; 2061 } 2062 2063 #if DO_WEIGHTED_ADDWORD 2064 int fst_push_arc_cost(srec_context* fst, FSMarc* arc) 2065 { 2066 FSMarc_ptr atok; 2067 FSMarc* atoken; 2068 FSMnode_ptr ntok = arc->to_node; 2069 /* look for all arcs leaving this arc, and push the cost thereto */ 2070 for (atok = FIRST_NEXT(ntok); atok 
!= FSMARC_NULL; atok = atoken->linkl_next_arc) 2071 { 2072 atoken = ARC_XtoP(atok); 2073 atoken->cost = (costdata)(atoken->cost + arc->cost); 2074 } 2075 arc->cost = FREEcostdata; 2076 return FST_SUCCESS; 2077 } 2078 #endif 2079 2080 2081 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc) 2082 { 2083 FSMarc_ptr atok; 2084 FSMarc* atoken; 2085 FSMnode_ptr fr_node = arc->fr_node; 2086 FSMarc_ptr tmp_atok; 2087 FSMarc *tmp_atoken; 2088 FSMnode *anode; 2089 2090 if (arc->olabel == WORD_EPSILON_LABEL) 2091 return FST_SUCCESS; 2092 2093 /* can the olabel be pulled earlier? */ 2094 for (atok = FIRST_PREV(fr_node); atok != FSMARC_NULL; atok = atoken->linkl_prev_arc) 2095 { 2096 atoken = ARC_XtoP(atok); 2097 anode = NODE_XtoP(atoken->to_node); 2098 2099 tmp_atok = anode->un_ptr.first_next_arc; 2100 if (tmp_atok != FSMARC_NULL) 2101 { 2102 tmp_atoken = ARC_XtoP(tmp_atok); 2103 if (tmp_atoken->linkl_next_arc != FSMARC_NULL) 2104 { 2105 return FST_CONTINUE; 2106 } 2107 } 2108 } 2109 2110 /* look for all arcs arriving at this arc, and mark them */ 2111 for (atok = FIRST_PREV(fr_node);atok != FSMARC_NULL;atok = atoken->linkl_prev_arc) 2112 { 2113 atoken = ARC_XtoP(atok); 2114 /* cannot have multiple olabels! serious internal error!? */ 2115 if (atoken->olabel != WORD_EPSILON_LABEL) 2116 { 2117 PLogError("error: internal error, in fst_pull_arc_olabel()\n"); 2118 return FST_FAILED_INTERNAL; 2119 } 2120 atoken->olabel = arc->olabel; 2121 #if DO_WEIGHTED_ADDWORD 2122 atoken->cost = arc->cost; 2123 #endif 2124 IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str); 2125 } 2126 /* unlabel this arc */ 2127 arc->olabel = WORD_EPSILON_LABEL; 2128 #if DO_WEIGHTED_ADDWORD 2129 arc->cost = FREEcostdata; 2130 #endif 2131 IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]); 2132 return FST_SUCCESS; 2133 } 2134 2135 static FSMarc* find_next_arc_with_ilabel(srec_context* fst, FSMnode* node, labelID ilabel, FSMarc** last) 2136 { 2137 FSMarc_ptr atok; 2138 FSMarc* atoken = NULL; 2139 for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2140 { 2141 atoken = ARC_XtoP(atok); 2142 if (atoken->ilabel == ilabel) 2143 return atoken; 2144 } 2145 *last = atoken; /* to this we may want to append a new arc */ 2146 return NULL; 2147 } 2148 2149 FSMarc* find_prev_arc_with_iolabels(srec_context* fst, FSMnode* node, labelID ilabel, labelID olabel, FSMarc** last) 2150 { 2151 FSMarc_ptr atok; 2152 FSMarc* atoken = NULL; 2153 FSMarc_ptr tmp_atok; 2154 FSMarc* tmp_atoken; 2155 2156 for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc) 2157 { 2158 atoken = ARC_XtoP(atok); 2159 if (atoken->olabel == olabel && atoken->ilabel == ilabel) 2160 { 2161 tmp_atok = NODE_XtoP(atoken->fr_node)->un_ptr.first_next_arc; 2162 if (tmp_atok != FSMARC_NULL) 2163 { 2164 tmp_atoken = ARC_XtoP(tmp_atok); 2165 2166 if (tmp_atoken->linkl_next_arc == FSMARC_NULL) 2167 return atoken; 2168 } 2169 else 2170 return atoken; 2171 } 2172 2173 } 2174 if (last) *last = atoken; 2175 return NULL; 2176 } 2177 2178 int num_arcs_leaving(srec_context* fst, FSMnode* node) 2179 { 2180 int num_arcs = 0; 2181 FSMarc_ptr atok; 2182 FSMarc* atoken; 2183 for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2184 { 2185 atoken = ARC_XtoP(atok); 2186 num_arcs++; 2187 } 2188 return num_arcs; 2189 } 2190 2191 int num_arcs_arriving(srec_context* fst, FSMnode* node) 2192 { 2193 int num_arcs = 0; 2194 FSMarc_ptr atok; 2195 FSMarc* atoken; 2196 for (atok = 
node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc) 2197 { 2198 atoken = ARC_XtoP(atok); 2199 num_arcs++; 2200 } 2201 return num_arcs; 2202 } 2203 2204 PINLINE int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node) 2205 { 2206 FSMarc_ptr atok = node->first_prev_arc; 2207 FSMarc* atoken; 2208 if (atok == FSMARC_NULL) 2209 return 0; 2210 atoken = ARC_XtoP(atok); 2211 return atoken->linkl_prev_arc == FSMARC_NULL ? 0 : 1; 2212 } 2213 2214 static void remove_arc_arriving(srec_context* fst, FSMnode* to_node, FSMarc* arc) 2215 { 2216 FSMarc* atoken; 2217 FSMarc_ptr* atok; 2218 for (atok = &to_node->first_prev_arc;(*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc) 2219 { 2220 atoken = ARC_XtoP(*atok); 2221 if (atoken == arc) 2222 { 2223 (*atok) = arc->linkl_prev_arc; 2224 return; 2225 } 2226 } 2227 ASSERT(0); 2228 } 2229 2230 int split_node_for_arc(srec_context* fst, FSMarc* arc) 2231 { 2232 arcID new_arc_id; 2233 nodeID new_node_id; 2234 FSMnode* new_node; 2235 FSMnode* old_node; 2236 FSMarc_ptr atok, new_next, last_new_next; 2237 FSMarc *atoken, *new_next_p; 2238 2239 IF_DEBUG_WDADD(printf_arc(fst, "spliting ... ", arc)); 2240 2241 new_node_id = fst_get_free_node(fst); 2242 if (new_node_id == MAXnodeID) 2243 return FST_FAILED_ON_MEMORY; 2244 new_node = &fst->FSMnode_list[ new_node_id]; 2245 old_node = NODE_XtoP(arc->to_node); 2246 arc->to_node = NODE_ItoX(new_node_id); 2247 new_node->first_prev_arc = ARC_PtoX(arc); 2248 remove_arc_arriving(fst, old_node, arc); 2249 arc->linkl_prev_arc = FSMARC_NULL; 2250 2251 last_new_next = FSMARC_NULL; 2252 for (atok = old_node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2253 { 2254 atoken = ARC_XtoP(atok); 2255 new_arc_id = fst_get_free_arc(fst); 2256 if (new_arc_id == MAXarcID) 2257 return FST_FAILED_ON_MEMORY; 2258 new_next = ARC_ItoX(new_arc_id); 2259 if (last_new_next != FSMARC_NULL) 2260 LINKL_NEXT(last_new_next) = new_next; 2261 else 2262 new_node->un_ptr.first_next_arc = new_next; 2263 new_next_p = ARC_XtoP(new_next); 2264 new_next_p->ilabel = atoken->ilabel; 2265 new_next_p->to_node = atoken->to_node; 2266 new_next_p->fr_node = arc->to_node; 2267 new_next_p->olabel = atoken->olabel; 2268 IF_DEBUG_WDADD(new_next_p->ilabel_str = atoken->ilabel_str); 2269 IF_DEBUG_WDADD(new_next_p->olabel_str = atoken->olabel_str); 2270 new_next_p->linkl_next_arc = FSMARC_NULL; /* set at next loop */ 2271 append_arc_arriving_node(fst, NODE_XtoP(atoken->to_node), new_next); 2272 last_new_next = new_next; 2273 } 2274 return FST_SUCCESS; 2275 } 2276 2277 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node, 2278 wordID add_tree_olabel, costdata add_tree_cost, 2279 modelID* add_tree_start, int add_tree_len) 2280 { 2281 int rc = FST_SUCCESS; 2282 FSMarc_ptr atok = (FSMarc_ptr)0, last_atok = (FSMarc_ptr)0; 2283 FSMarc* atoken = NULL; 2284 FSMarc *next_atoken, *prev_atoken; 2285 FSMarc *append_to; 2286 FSMnode *late_start_node, *early_end_node; 2287 FSMnode *ntoken, *last_ntoken; 2288 FSMnode_ptr ntok, last_ntok; 2289 modelID *add_tree_end = add_tree_start + add_tree_len - 1; 2290 modelID *add_tree_late_start, *add_tree_early_end; 2291 modelID *add_tree; 2292 arcID new_arc_id, atokid; 2293 nodeID new_node_id, atokid_node; 2294 wordID add_tree_olabel_use = add_tree_olabel; 2295 wordID add_tree_cost_use = add_tree_cost; 2296 2297 append_to = NULL; 2298 add_tree = add_tree_start; 2299 #define FST_GROW_MINARCS 100 2300 #define FST_GROW_MINNODES 100 2301 #if defined(FST_GROW_FACTOR) 2302 2303 /* make 
sure we have enough arcs available */ 2304 if(fst->num_arcs + add_tree_len >= fst->FSMarc_list_len) 2305 { 2306 FSMarc* tmp_FSMarc_list; 2307 arcID tmp_FSMarc_list_len; 2308 int itmp_FSMarc_list_len = fst->FSMarc_list_len*FST_GROW_FACTOR ; 2309 2310 if( itmp_FSMarc_list_len - fst->FSMarc_list_len < FST_GROW_MINARCS) 2311 itmp_FSMarc_list_len += FST_GROW_MINARCS; 2312 2313 if( itmp_FSMarc_list_len - fst->FSMarc_list_len < add_tree_len) 2314 itmp_FSMarc_list_len += add_tree_len; 2315 2316 if(itmp_FSMarc_list_len >= (int)MAXarcID) 2317 return FST_FAILED_ON_MEMORY; 2318 2319 tmp_FSMarc_list_len = (arcID)itmp_FSMarc_list_len; 2320 2321 tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(tmp_FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs"); 2322 if(!tmp_FSMarc_list) { 2323 PLogError("error: Failed to extend the memory for arcs\n"); 2324 return FST_FAILED_INTERNAL; 2325 } 2326 memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc)); 2327 2328 /* initialize the new free list. 2329 head of this new free list is tmp_FSMarc_list[ fst->FSMarc_list_len] */ 2330 for(atokid=fst->FSMarc_list_len; atokid<tmp_FSMarc_list_len-1; atokid++) { 2331 tmp_FSMarc_list[atokid].linkl_next_arc = ARC_ItoX(atokid+1); 2332 tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE; 2333 } 2334 tmp_FSMarc_list[atokid].linkl_next_arc = FSMARC_NULL; 2335 tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE; 2336 2337 /* scan to the end of the old free list (to be re-used) */ 2338 arcID* last_free_arc = &fst->FSMarc_freelist; 2339 atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ; 2340 2341 for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc) 2342 atoken = &tmp_FSMarc_list[ *last_free_arc]; 2343 2344 /* append the new free list to the current free list */ 2345 *last_free_arc = fst->FSMarc_list_len; 2346 2347 FREE( fst->FSMarc_list); 2348 fst->FSMarc_list = tmp_FSMarc_list; 2349 fst->FSMarc_list_len = tmp_FSMarc_list_len; 2350 } 2351 2352 /* make sure we have enough nodes available */ 2353 if(fst->num_nodes + add_tree_len >= fst->FSMnode_list_len) 2354 { 2355 FSMnode* tmp_FSMnode_list; 2356 nodeID tmp_FSMnode_list_len; 2357 FSMnode_info* tmp_FSMnode_info_list; 2358 int itmp_FSMnode_list_len = fst->FSMnode_list_len * FST_GROW_FACTOR ; 2359 2360 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < FST_GROW_MINNODES) 2361 itmp_FSMnode_list_len += FST_GROW_MINNODES; 2362 2363 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < add_tree_len) 2364 itmp_FSMnode_list_len += add_tree_len; 2365 2366 if(itmp_FSMnode_list_len >= (int)MAXnodeID) 2367 return FST_FAILED_ON_MEMORY; 2368 2369 tmp_FSMnode_list_len = (nodeID)itmp_FSMnode_list_len; 2370 2371 tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(tmp_FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes"); 2372 if(!tmp_FSMnode_list) { 2373 PLogError("ERROR: Failed to extend the memory for nodes\n"); 2374 return FST_FAILED_INTERNAL; 2375 } 2376 memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode)); 2377 2378 /* initialize the new free node list. 
2379 head of this new free list is tmp_FSMnode_list[ fst->FSMnode_list_len] */ 2380 for(atokid_node=fst->FSMnode_list_len; atokid_node<tmp_FSMnode_list_len-1; atokid_node++) 2381 { 2382 tmp_FSMnode_list[atokid_node].un_ptr.next_node = NODE_ItoX(atokid_node + 1); 2383 tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE; 2384 } 2385 tmp_FSMnode_list[atokid_node].un_ptr.next_node = FSMNODE_NULL; 2386 tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE; 2387 2388 /* scan to the end of the old free list (to be re-used) */ 2389 nodeID* last_free_node = (&fst->FSMnode_freelist); 2390 ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ; 2391 2392 for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node) 2393 ntoken = &tmp_FSMnode_list[ *last_free_node]; 2394 2395 /* append the new free list to the current free list */ 2396 *last_free_node = fst->FSMnode_list_len; 2397 2398 FREE( fst->FSMnode_list); 2399 2400 tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR( tmp_FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos"); 2401 if(!tmp_FSMnode_info_list) { 2402 PLogError("ERROR: Failed to extend the memory for node infos\n"); 2403 return FST_FAILED_INTERNAL; 2404 } 2405 // copy in old node info 2406 memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info)); 2407 2408 // initialize the new node info 2409 for (atokid_node=fst->FSMnode_list_len; atokid_node < tmp_FSMnode_list_len; atokid_node++) 2410 tmp_FSMnode_info_list[atokid_node] = NODE_INFO_UNKNOWN; 2411 2412 FREE( fst->FSMnode_info_list); 2413 fst->FSMnode_info_list = tmp_FSMnode_info_list; 2414 fst->FSMnode_list = tmp_FSMnode_list; 2415 fst->FSMnode_list_len = tmp_FSMnode_list_len; 2416 } 2417 #endif 2418 late_start_node = &fst->FSMnode_list[start_node]; 2419 2420 while (1) 2421 { 2422 2423 next_atoken = find_next_arc_with_ilabel(fst, late_start_node, 2424 *add_tree /*->ilabel*/, 2425 &append_to); 2426 if (next_atoken != NULL) 2427 { 2428 /* so next_atok->ilabel == add_tree->ilabel */ 2429 2430 if (next_atoken->ilabel == WORD_BOUNDARY && *add_tree/*->ilabel*/ == WORD_BOUNDARY) 2431 { 2432 next_atoken = NULL; 2433 break; /* break as if nothing more can be shared! */ 2434 // return FST_FAILED_ON_HOMONYM; 2435 } 2436 2437 if (num_arcs_arriving_gt_1(fst, NODE_XtoP(next_atoken->to_node))) 2438 { 2439 split_node_for_arc(fst, next_atoken); 2440 /* unfortunate side effect here, that if we later find out this 2441 was a homonym addition, then this expansion was useless and 2442 for now we don't undo it! */ 2443 } 2444 2445 /* we shouldn't really push the olabel if it's the same for both! 
*/ 2446 if (next_atoken->olabel != add_tree_olabel_use) 2447 { 2448 if (next_atoken->olabel != WORD_EPSILON_LABEL) 2449 { 2450 if (fst_push_arc_olabel(fst, next_atoken) != FST_SUCCESS) 2451 { 2452 PLogError("error: internal error fst_push_arc_olabel()\n"); 2453 return FST_FAILED_INTERNAL; 2454 } 2455 } 2456 #if DO_WEIGHTED_ADDWORD 2457 else 2458 fst_push_arc_cost(fst, next_atoken); 2459 #endif 2460 } 2461 else if (add_tree_olabel_use != WORD_EPSILON_LABEL) 2462 { 2463 /* the graph already has this word, so we just re-use the olabel 2464 and disable the setting of olabel down below */ 2465 add_tree_olabel_use = WORD_EPSILON_LABEL; 2466 #if DO_WEIGHTED_ADDWORD 2467 add_tree_cost_use = FREEcostdata; 2468 #endif 2469 } 2470 add_tree++; 2471 late_start_node = NODE_XtoP(next_atoken->to_node); 2472 } 2473 else 2474 { 2475 break; 2476 } 2477 } 2478 2479 add_tree_late_start = add_tree; 2480 early_end_node = &fst->FSMnode_list[end_node]; 2481 2482 if (!do_minimize) 2483 { 2484 last_ntoken = NULL; 2485 last_ntok = NODE_PtoX(late_start_node); 2486 for (; add_tree <= add_tree_end; add_tree++) 2487 { 2488 new_arc_id = fst_get_free_arc(fst); 2489 if (new_arc_id == MAXarcID) 2490 { 2491 PLogError("fst_get_free_arc() failed\n"); 2492 return FST_FAILED_ON_MEMORY; 2493 } 2494 atok = ARC_ItoX(new_arc_id); 2495 atoken = ARC_XtoP(atok); 2496 if (add_tree != add_tree_end) 2497 { 2498 new_node_id = fst_get_free_node(fst); 2499 if (new_node_id == MAXnodeID) 2500 { 2501 PLogError("fst_get_free_node() failed\n"); 2502 return FST_FAILED_ON_MEMORY; 2503 } 2504 ntok = NODE_ItoX(new_node_id); 2505 ntoken = NODE_XtoP(ntok); 2506 ntoken->first_prev_arc = atok; 2507 } 2508 else 2509 { 2510 ntok = FSMNODE_NULL; 2511 ntoken = NULL; 2512 } 2513 atoken->fr_node = last_ntok; /* was NODE_PtoX(late_start_node) */ 2514 atoken->to_node = ntok; 2515 atoken->ilabel = *add_tree; 2516 atoken->linkl_next_arc = FSMARC_NULL; 2517 atoken->linkl_prev_arc = FSMARC_NULL; 2518 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]); 2519 if (!last_ntoken) 2520 { 2521 append_to->linkl_next_arc = atok; 2522 atoken->olabel = add_tree_olabel_use; 2523 #if DO_WEIGHTED_ADDWORD 2524 atoken->cost = add_tree_cost_use; 2525 #endif 2526 IF_DEBUG_WDADD(atok->olabel_str = fst->olabels->words[ atoken->olabel]); 2527 } 2528 else 2529 { 2530 last_ntoken->un_ptr.first_next_arc = atok; 2531 } 2532 last_ntoken = ntoken; 2533 last_ntok = ntok; 2534 } 2535 /* add_tree_end->to_node = early_end_node; 2536 add_tree_end->linkl_next_arc = FSMARC_NULL; 2537 atok = last_arc_arriving( early_end_node); 2538 atok->linkl_prev_arc = add_tree_end; 2539 add_tree_end->linkl_prev_arc = FSMARC_NULL; */ 2540 atoken->to_node = NODE_ItoX(end_node); 2541 append_arc_arriving_node(fst, &fst->FSMnode_list[end_node], atok); 2542 } 2543 2544 else /* ie. 
do_minimize from the rear */ 2545 { 2546 2547 add_tree = add_tree_end; 2548 new_arc_id = fst_get_free_arc(fst); 2549 if (new_arc_id == MAXarcID) 2550 return FST_FAILED_ON_MEMORY; 2551 atok = ARC_ItoX(new_arc_id); 2552 atoken = ARC_XtoP(atok); 2553 atoken->olabel = add_tree_olabel_use; 2554 #if DO_WEIGHTED_ADDWORD 2555 atoken->cost = add_tree_cost_use; 2556 #endif 2557 atoken->ilabel = *add_tree_late_start; 2558 atoken->fr_node = NODE_PtoX(late_start_node); 2559 if (atoken->ilabel == WORD_BOUNDARY) 2560 { 2561 atoken->cost = (costdata)(atoken->cost + fst->wtw_average); 2562 } 2563 else 2564 { 2565 atoken->cost = add_tree_cost_use + fst->wtw_average; 2566 add_tree_cost_use = FREEcostdata; 2567 } 2568 IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[ atoken->olabel]); 2569 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]); 2570 append_arc_leaving_node(fst, late_start_node, atok); 2571 2572 last_atok = atok; 2573 while (1) 2574 { 2575 2576 if (add_tree == add_tree_late_start) 2577 break; 2578 /* if there are other arcs leaving this node then joining 2579 earlier than here will result in over-generation, so don't 2580 if( atok!=end_arc && atok->linkl_next_arc != FSMARC_NULL) 2581 break; 2582 */ 2583 /* 2584 also need boundary conditions checker here 2585 */ 2586 2587 /* */ 2588 IF_DEBUG_WDADD(printf_node(fst, early_end_node)); 2589 2590 /* if( num_arcs_leaving( add_tree_end->fr_node->first_prev_arc->to_node) >1) { 2591 printf("merge stopped due to num_arcs leaving ..\n"); 2592 printf_arc(fst, add_tree_end->fr_node->first_prev_arc->to_node->first_next_arc); 2593 break; 2594 } */ 2595 2596 for (atok = early_end_node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc) 2597 { 2598 atoken = ARC_XtoP(atok); 2599 /* never pull before the slot marker */ 2600 if(atoken->cost == DISABLE_ARC_COST) break; 2601 if(atoken->olabel>0 && atoken->olabel<fst->olabels->num_slots) break; 2602 fst_pull_arc_olabel(fst, atoken); /* fails are ok */ 2603 } 2604 2605 prev_atoken = find_prev_arc_with_iolabels(fst, early_end_node, 2606 *add_tree /*->ilabel*/, 2607 WORD_EPSILON_LABEL, 2608 NULL /*&append_to*/); 2609 if (!prev_atoken) 2610 break; 2611 else 2612 { 2613 /* this check is now inside find_prev_arc_with_iolabels */ 2614 /* if( num_arcs_leaving(prev_atok->fr_node->first_prev_arc->to_node->first_next_arc)>1)*/ 2615 2616 /* now look to merge earlier still */ 2617 early_end_node = NODE_XtoP(prev_atoken->fr_node); 2618 add_tree--; 2619 if (add_tree == add_tree_late_start) 2620 break; 2621 } 2622 } 2623 add_tree_early_end = add_tree; 2624 for (add_tree = add_tree_late_start + 1; add_tree <= add_tree_early_end; add_tree++) 2625 { 2626 new_node_id = fst_get_free_node(fst); 2627 if (new_node_id == MAXnodeID) 2628 return FST_FAILED_ON_MEMORY; 2629 ntok = NODE_ItoX(new_node_id); 2630 ntoken = NODE_XtoP(ntok); 2631 new_arc_id = fst_get_free_arc(fst); 2632 if (new_arc_id == MAXarcID) 2633 return FST_FAILED_ON_MEMORY; 2634 atok = ARC_ItoX(new_arc_id); 2635 atoken = ARC_XtoP(atok); 2636 atoken->ilabel = *add_tree; 2637 #if DO_WEIGHTED_ADDWORD 2638 atoken->cost = FREEcostdata; /* idea: distribute add_tree_cost here;*/ 2639 #endif 2640 atoken->olabel = WORD_EPSILON_LABEL; 2641 atoken->fr_node = ntok; 2642 atoken->to_node = FSMNODE_NULL; /* filled in next loop */ 2643 IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[atoken->olabel]); 2644 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[atoken->ilabel]); 2645 2646 ntoken->un_ptr.first_next_arc = atok; 2647 
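/* splice the freshly allocated node into the chain being built: the arc
   created on the previous pass (last_atok) arrives at it, and the arc just
   created (atok) leaves it */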
ntoken->first_prev_arc = last_atok; 2648 TO_NODE(last_atok) = ntok; 2649 last_atok = atok; 2650 } 2651 TO_NODE(last_atok) = NODE_PtoX(early_end_node); 2652 append_arc_arriving_node(fst, early_end_node, last_atok); 2653 } 2654 return rc; 2655 } 2656 2657 2658 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr arc) 2659 { 2660 FSMarc_ptr* atok = &fr_node->un_ptr.first_next_arc; 2661 FSMarc* atoken = ARC_XtoP(*atok); 2662 for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_next_arc) 2663 { 2664 atoken = ARC_XtoP(*atok); 2665 } 2666 *atok = arc; 2667 LINKL_NEXT(arc) = FSMARC_NULL; 2668 } 2669 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr arc) 2670 { 2671 FSMarc_ptr* atok = &to_node->first_prev_arc; 2672 FSMarc* atoken = ARC_XtoP(*atok); 2673 for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc) 2674 { 2675 atoken = ARC_XtoP(*atok); 2676 } 2677 *atok = arc; 2678 LINKL_PREV(arc) = FSMARC_NULL; 2679 } 2680 2681 2682 int printf_node1(srec_context* fst, FSMnode* node) 2683 { 2684 return 0; 2685 } 2686 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc) 2687 { 2688 char buffer[MAX_LINE_LENGTH]; 2689 sprintf_arc(buffer, fst, arc); 2690 printf("%s%s\n", msg, buffer); 2691 return 0; 2692 } 2693 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc) 2694 { 2695 int rc; 2696 FSMnode* to_node = NODE_XtoP(arc->to_node); 2697 arcID arc_index = (arcID)(arc - fst->FSMarc_list); 2698 if (to_node->un_ptr.first_next_arc == FSMARC_NULL) 2699 rc = sprintf(buf, "arc%hu\n", arc_index); 2700 else 2701 { 2702 rc = sprintf(buf, "arc%hu\t%hu,%hu\t%s\t%s\t%hu\n", 2703 arc_index, 2704 ARC_XtoI(to_node->un_ptr.first_next_arc), 2705 arc->linkl_next_arc != FSMARC_NULL ? ARC_XtoI(arc->linkl_next_arc) : -1, 2706 fst->ilabels->words[arc->ilabel], 2707 fst->olabels->words[arc->olabel], 2708 arc->cost); 2709 } 2710 return rc; 2711 } 2712 2713 /* dumps the recognition context as a binary file, 2714 it will also store any empty space for growing */ 2715 2716 #define ENCODE(SIGN,TYP) { SIGN+=sizeof(TYP); SIGN=SIGN<<3; ASSERT((shifted+=3)<32); } 2717 2718 asr_int32_t FST_sizes_signature() 2719 { 2720 #ifndef NDEBUG 2721 int shifted = 0; 2722 #endif 2723 asr_int32_t signature = 0; 2724 ENCODE(signature, arcID); 2725 ENCODE(signature, nodeID); 2726 ENCODE(signature, wordID); 2727 ENCODE(signature, labelID); 2728 ENCODE(signature, costdata); 2729 return signature; 2730 } 2731 2732 int FST_ContextImageSize(srec_context* context) 2733 { 2734 int size = 0; 2735 size += sizeof(srec_context); 2736 size += sizeof(wordmap); 2737 size += sizeof(char*) * context->olabels->max_words; 2738 size += sizeof(char) * context->olabels->max_chars; 2739 size += sizeof(wordmap); 2740 if (context->ilabels->words != NULL) 2741 size += sizeof(char*) * context->olabels->max_words; 2742 if (context->ilabels->chars != NULL) 2743 size += sizeof(char) * context->olabels->max_chars; 2744 size += sizeof(FSMarc) * context->FSMarc_list_len; 2745 size += sizeof(FSMnode) * context->FSMnode_list_len; 2746 size += sizeof(FSMnode_info) * context->FSMnode_list_len; 2747 size += sizeof(arc_token) * context->arc_token_list_len; 2748 return size; 2749 } 2750 2751 ESR_ReturnCode serializeWordMapV2(wordmap *wordmap, PFile* fp) 2752 { 2753 unsigned int i = 0; 2754 unsigned int nfields; 2755 unsigned int tmp2[32]; 2756 2757 i = 0; 2758 tmp2[i++] = wordmap->num_words; 2759 tmp2[i++] = wordmap->num_slots; 2760 tmp2[i++] = wordmap->max_words; 2761 tmp2[i++] = wordmap->num_base_words; 2762 tmp2[i++] = 
wordmap->max_chars; 2763 tmp2[i++] = (wordmap->next_chars - wordmap->chars); 2764 tmp2[i++] = (wordmap->next_base_chars - wordmap->chars); 2765 nfields = i; 2766 if (pfwrite(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields) 2767 return ESR_WRITE_ERROR; 2768 2769 if (pfwrite(wordmap->chars, sizeof(char), wordmap->max_chars, fp) != (size_t)wordmap->max_chars) 2770 return ESR_WRITE_ERROR; 2771 2772 return ESR_SUCCESS; 2773 } 2774 2775 ESR_ReturnCode deserializeWordMapV2(wordmap **pwordmap, PFile* fp) 2776 { 2777 unsigned int i = 0; 2778 unsigned int nfields; 2779 unsigned int tmp2[32]; 2780 wordmap *awordmap; 2781 char *p; 2782 ESR_ReturnCode rc = ESR_SUCCESS; 2783 unsigned int next_chars_idx, next_base_chars_idx; 2784 2785 awordmap = NEW(wordmap, L("srec.g2g.graph.wordmap.base")); 2786 if (awordmap == NULL) 2787 { 2788 PLogError("NEW failed on srec.g2g.graph.wordmap.base\n"); 2789 rc = ESR_OUT_OF_MEMORY; 2790 goto CLEANUP; 2791 } 2792 awordmap->wordIDForWord = NULL; // early break to cleanup needs this 2793 2794 nfields = 7; 2795 if (pfread(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields) 2796 { 2797 PLogError("pfread failed when reading nfields\n"); 2798 rc = ESR_READ_ERROR; 2799 goto CLEANUP; 2800 } 2801 2802 i = 0; 2803 awordmap->num_words = (wordID)tmp2[i++]; 2804 awordmap->num_slots = (wordID)tmp2[i++]; 2805 awordmap->max_words = (wordID)tmp2[i++]; 2806 awordmap->num_base_words = (wordID)tmp2[i++]; 2807 awordmap->max_chars = tmp2[i++]; 2808 next_chars_idx = tmp2[i++]; 2809 next_base_chars_idx = tmp2[i++]; 2810 ASSERT(nfields == i); 2811 2812 awordmap->words = NEW_ARRAY(char*, awordmap->max_words, L("srec.g2g.graph.wordmap.words")); 2813 if (awordmap->words == NULL) 2814 { 2815 PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.words %d\n", awordmap->max_words); 2816 rc = ESR_OUT_OF_MEMORY; 2817 goto CLEANUP; 2818 } 2819 awordmap->chars = NEW_ARRAY(char, awordmap->max_chars, L("srec.g2g.graph.wordmap.chars")); 2820 if (awordmap->chars == NULL) 2821 { 2822 PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.chars %d\n", awordmap->max_chars); 2823 rc = ESR_OUT_OF_MEMORY; 2824 goto CLEANUP; 2825 } 2826 awordmap->next_chars = awordmap->chars + next_chars_idx; 2827 awordmap->next_base_chars = awordmap->chars + next_base_chars_idx; 2828 2829 if ((i=pfread(awordmap->chars, sizeof(char), awordmap->max_chars, fp)) != (size_t)awordmap->max_chars) 2830 { 2831 PLogError("pfread failed while reading %d chars\n", awordmap->max_chars); 2832 rc = ESR_READ_ERROR; 2833 goto CLEANUP; 2834 } 2835 2836 2837 p = awordmap->chars; 2838 ASSERT((uintptr_t)p % 2 == 0); 2839 nfields = 0; 2840 if (nfields < awordmap->num_words) 2841 awordmap->words[nfields++] = p; 2842 for (; p < awordmap->next_chars; p++) // was next_base_chars 2843 { 2844 if ( ( *p ) == '\0' ) 2845 { 2846 if (nfields == awordmap->num_words) // was num_base_words 2847 break; 2848 if (((uintptr_t)p) % 2 == 0) p++; /* so that words begin on even byte bound */ 2849 awordmap->words[nfields++] = p + 1; 2850 } 2851 } 2852 ASSERT(nfields == awordmap->num_words); // was num_base_words 2853 2854 if (awordmap->max_words >= awordmap->num_base_words) 2855 { 2856 PHashTableArgs hashArgs; 2857 hashArgs.capacity = awordmap->max_words; 2858 if(hashArgs.capacity%2==0) hashArgs.capacity += 1; 2859 hashArgs.compFunction = HashCmpWord; //PHASH_TABLE_DEFAULT_COMP_FUNCTION; 2860 hashArgs.hashFunction = HashGetCode; //PHASH_TABLE_DEFAULT_HASH_FUNCTION; 2861 hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR; 2862 CHKLOG(rc, PHashTableCreate(&hashArgs, 
L("srec.graph.wordmap.wordIDForWord.deserializeWordMap()"), &awordmap->wordIDForWord)); 2863 2864 rc = wordmap_populate ( awordmap, awordmap->num_words ); 2865 2866 if (rc != ESR_SUCCESS) 2867 { 2868 wordmap_clean ( awordmap ); 2869 goto CLEANUP; 2870 } 2871 } 2872 else 2873 { 2874 awordmap->wordIDForWord = NULL; 2875 } 2876 2877 /* success */ 2878 *pwordmap = awordmap; 2879 return ESR_SUCCESS; 2880 2881 CLEANUP: 2882 if (awordmap != NULL) 2883 { 2884 if (awordmap->wordIDForWord != NULL) 2885 PHashTableDestroy(awordmap->wordIDForWord); 2886 if (awordmap->words != NULL) FREE(awordmap->words); 2887 if (awordmap->chars != NULL) FREE(awordmap->chars); 2888 FREE(awordmap); 2889 } 2890 return rc; 2891 } 2892 2893 static ESR_ReturnCode serializeArcToken(arc_token *token_base, 2894 int i, 2895 PFile* fp) 2896 { 2897 arc_token *token = token_base + i; 2898 asr_uint32_t idx; 2899 2900 if (pfwrite(&token->ilabel, 2, 1, fp) != 1) 2901 return ESR_WRITE_ERROR; 2902 2903 if (pfwrite(&token->olabel, 2, 1, fp) != 1) 2904 return ESR_WRITE_ERROR; 2905 2906 /* if (pfwrite(&token->cost, 2, 1, fp) != 1) 2907 return ESR_WRITE_ERROR; */ 2908 idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->first_next_arc), token_base); 2909 2910 if (pfwrite(&idx, 4, 1, fp) != 1) 2911 return ESR_WRITE_ERROR; 2912 2913 idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->next_token_index), token_base); 2914 2915 if (pfwrite(&idx, 4, 1, fp) != 1) 2916 return ESR_WRITE_ERROR; 2917 2918 return ESR_SUCCESS; 2919 } 2920 2921 static ESR_ReturnCode serializeArcTokenInfo(srec_context *context, 2922 PFile* fp) 2923 { 2924 int i; 2925 asr_uint32_t idx; 2926 ESR_ReturnCode rc; 2927 2928 if (pfwrite(&context->arc_token_list_len, 2, 1, fp) != 1) 2929 return ESR_WRITE_ERROR; 2930 2931 idx = PTR_TO_IDX(context->arc_token_freelist, context->arc_token_list); 2932 2933 if (pfwrite(&idx, 4, 1, fp) != 1) 2934 return ESR_WRITE_ERROR; 2935 2936 idx = PTR_TO_IDX(context->arc_token_insert_start, context->arc_token_list); 2937 2938 if (pfwrite(&idx, 4, 1, fp) != 1) 2939 return ESR_WRITE_ERROR; 2940 2941 for (i = 0; i < context->arc_token_list_len; ++i) 2942 { 2943 rc = serializeArcToken(context->arc_token_list, i, fp); 2944 if (rc != ESR_SUCCESS) return rc; 2945 } 2946 return ESR_SUCCESS; 2947 } 2948 2949 static ESR_ReturnCode deserializeArcToken(arc_token *token_base, 2950 int i, 2951 PFile* fp) 2952 { 2953 arc_token *token = token_base + i; 2954 asr_uint32_t idx[2]; 2955 asr_uint16_t labels[2]; 2956 2957 if (pfread(labels, 2, 2, fp) != 2) 2958 return ESR_READ_ERROR; 2959 2960 token->ilabel = labels[0]; 2961 token->olabel = labels[1]; 2962 2963 /* if (pfread(&token->cost, 2, 1, fp) != 1) 2964 return ESR_READ_ERROR; */ 2965 2966 if (pfread(idx, 4, 2, fp) != 2) 2967 return ESR_READ_ERROR; 2968 2969 token->first_next_arc = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[0], token_base)); 2970 token->next_token_index = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[1], token_base)); 2971 2972 return ESR_SUCCESS; 2973 } 2974 2975 static ESR_ReturnCode deserializeArcTokenInfo(srec_context *context, 2976 PFile* fp) 2977 { 2978 ESR_ReturnCode rc; 2979 int i; 2980 asr_uint32_t idx; 2981 2982 if (pfread(&context->arc_token_list_len, 2, 1, fp) != 1) { 2983 PLogError("pfread failed in deserializeArcTokenInfo()\n"); 2984 return ESR_READ_ERROR; 2985 } 2986 2987 context->arc_token_list = NEW_ARRAY(arc_token, context->arc_token_list_len, 2988 L("srec.graph.wordgraph")); 2989 2990 if (context->arc_token_list == NULL) 2991 return ESR_OUT_OF_MEMORY; 2992 2993 if (pfread(&idx, 4, 
1, fp) != 1) 2994 { 2995 rc = ESR_READ_ERROR; 2996 goto CLEANUP; 2997 } 2998 2999 context->arc_token_freelist = 3000 IDX_TO_PTR(idx, context->arc_token_list); 3001 3002 if (pfread(&idx, 4, 1, fp) != 1) 3003 { 3004 rc = ESR_READ_ERROR; 3005 goto CLEANUP; 3006 } 3007 3008 context->arc_token_insert_start = 3009 IDX_TO_PTR(idx, context->arc_token_list); 3010 3011 for (i = 0; i < context->arc_token_list_len; ++i) 3012 { 3013 rc = deserializeArcToken(context->arc_token_list, i, fp); 3014 if (rc != ESR_SUCCESS) goto CLEANUP; 3015 } 3016 return ESR_SUCCESS; 3017 3018 CLEANUP: 3019 FREE(context->arc_token_list); 3020 context->arc_token_list = 3021 context->arc_token_freelist = 3022 context->arc_token_insert_start = NULL; 3023 return rc; 3024 } 3025 3026 int FST_GetGrammarType(srec_context* context) 3027 { 3028 arc_token *t, *u; 3029 wordID expected_wdid; 3030 /* 0 1 0 3031 1 2 4 3032 ... 3033 1 2 1316 3034 2 3035 */ 3036 t = context->arc_token_list; 3037 if (t->ilabel != WORD_EPSILON_LABEL) 3038 return GrammarTypeBNF; 3039 if (t->next_token_index) 3040 return GrammarTypeBNF; 3041 t = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc); 3042 expected_wdid = NUM_ITEMLIST_HDRWDS; 3043 for (; t; t = ARC_TOKEN_PTR(context->arc_token_list, t->next_token_index)) 3044 { 3045 if (t->ilabel != expected_wdid) 3046 return GrammarTypeBNF; 3047 u = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc); 3048 if (u != NULL && u->ilabel != MAXwordID) 3049 return GrammarTypeBNF; 3050 expected_wdid++; 3051 } 3052 if (expected_wdid != context->olabels->num_words) 3053 return GrammarTypeBNF; 3054 return GrammarTypeItemList; 3055 } 3056 3057 int FST_DumpContextAsImageV2(srec_context* context, PFile* fp) 3058 { 3059 asr_uint32_t header[4]; 3060 arcID tmp[32], i, j, nfields; 3061 FSMarc* arc; 3062 ESR_ReturnCode rc; 3063 #if !defined(DO_ARCS_IO_IN_ARC_ORDER) 3064 FSMnode* node; 3065 arcID arcid, num_arcs, num_arcs_written; 3066 #endif 3067 3068 /* Write header information. 
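   header[0] is written as 0 here and patched with the final image size
   (pftell) once the body of the dump is complete; header[1] is the image
   format tag, header[2] the model id and header[3] the grammar type.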
*/ 3069 header[0] = 0; 3070 header[1] = IMAGE_FORMAT_V2; 3071 header[2] = context->modelid; 3072 header[3] = context->grmtyp; 3073 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3074 PLogMessage("FST_DumpContextAsImageV2() fstgrmtyp %d", context->grmtyp); 3075 #endif 3076 3077 if (pfwrite(header, sizeof(header[0]), 4, fp) != 4) 3078 { 3079 PLogError("FST_DumpContextAsImage: Could not write header.\n"); 3080 return FST_FAILED_INTERNAL; 3081 } 3082 3083 /* arcs and nodes */ 3084 i = 0; 3085 tmp[i++] = context->num_arcs; 3086 tmp[i++] = context->FSMarc_list_len; 3087 tmp[i++] = context->num_base_arcs; 3088 tmp[i++] = context->FSMarc_freelist; 3089 tmp[i++] = context->max_searchable_arcs; 3090 3091 tmp[i++] = context->num_nodes; 3092 tmp[i++] = context->FSMnode_list_len; 3093 tmp[i++] = context->num_base_nodes; 3094 tmp[i++] = context->FSMnode_freelist; 3095 tmp[i++] = context->start_node; 3096 tmp[i++] = context->end_node; 3097 tmp[i++] = context->max_searchable_nodes; 3098 3099 tmp[i++] = context->beg_silence_word; 3100 tmp[i++] = context->end_silence_word; 3101 tmp[i++] = context->hack_silence_word; 3102 tmp[i++] = context->hmm_ilabel_offset; 3103 3104 nfields = i; 3105 3106 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3107 return ESR_WRITE_ERROR; 3108 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3109 PLogMessage("G2G done WR hdrs %d", pftell(fp)); 3110 #endif 3111 3112 #if defined(DO_ARCS_IO_IN_ARC_ORDER) 3113 if( 1) { 3114 i = 0; 3115 tmp[i++] = context->end_node; 3116 tmp[i++] = MAXnodeID; 3117 tmp[i++] = MAXmodelID; 3118 tmp[i++] = MAXwordID; 3119 tmp[i++] = FREEcostdata; 3120 nfields = i; 3121 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3122 return ESR_WRITE_ERROR; 3123 } 3124 for(j=0; j<context->num_arcs; j++) { 3125 arc = &context->FSMarc_list[j]; 3126 i = 0; 3127 tmp[i++] = arc->fr_node; 3128 tmp[i++] = arc->to_node; 3129 ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID); 3130 tmp[i++] = arc->ilabel; 3131 tmp[i++] = arc->olabel; 3132 tmp[i++] = arc->cost; 3133 nfields = i; 3134 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3135 return ESR_WRITE_ERROR; 3136 } 3137 #else 3138 num_arcs_written = 0; 3139 for (j = 0; j < context->num_nodes; j++) { 3140 node = &context->FSMnode_list[j]; 3141 num_arcs = (arcID)num_arcs_leaving(context, node); 3142 arcid = node->un_ptr.first_next_arc; 3143 if (arcid == MAXarcID) { 3144 i = 0; 3145 tmp[i++] = (arcID)j; 3146 tmp[i++] = MAXarcID; 3147 tmp[i++] = MAXarcID; 3148 tmp[i++] = MAXarcID; 3149 tmp[i++] = MAXarcID; 3150 nfields = i; 3151 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3152 return ESR_WRITE_ERROR; 3153 /* num_arcs_written++; end node don't have a to-arc */ 3154 } 3155 else { 3156 for (; arcid != MAXarcID; arcid = arc->linkl_next_arc) { 3157 arc = &context->FSMarc_list[arcid]; 3158 i = 0; 3159 tmp[i++] = (arcID)j; 3160 tmp[i++] = arc->to_node; 3161 ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID); 3162 tmp[i++] = arc->ilabel; 3163 tmp[i++] = arc->olabel; 3164 tmp[i++] = arc->cost; 3165 nfields = i; 3166 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3167 return ESR_WRITE_ERROR; 3168 num_arcs_written++; 3169 } 3170 } 3171 } 3172 ASSERT(num_arcs_written == context->num_arcs); 3173 #endif 3174 3175 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3176 PLogMessage("G2G done WR arcs %d", pftell(fp)); 3177 #endif 3178 3179 /* node info .. will now be calculated on line */ 3180 /* wrapup_cost .. 
will now be calculated on line */ 3181 3182 /* exit points for slots */ 3183 if (1) 3184 { 3185 srec_fsm_exit_point *p, *q; 3186 if (pfwrite(&context->num_fsm_exit_points, 2, 1, fp) != 1) 3187 return ESR_WRITE_ERROR; 3188 p = context->fsm_exit_points; 3189 q = p + MAX_NUM_SLOTS; 3190 while (p < q) 3191 { 3192 if (pfwrite(&p->from_node_index, 2, 1, fp) != 1) 3193 return ESR_WRITE_ERROR; 3194 if (pfwrite(&p->arc_index, 2, 1, fp) != 1) 3195 return ESR_WRITE_ERROR; 3196 if (pfwrite(&p->wbto_node_index, 2, 1, fp) != 1) 3197 return ESR_WRITE_ERROR; 3198 ++p; 3199 } 3200 } 3201 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3202 PLogMessage("G2G done WR exits %d", pftell(fp)); 3203 #endif 3204 /* no entry points stuff */ 3205 3206 /* no saving ilabels */ 3207 3208 /* now load the ilabels */ 3209 rc = serializeWordMapV2(context->olabels, fp); 3210 if (rc != ESR_SUCCESS) 3211 return rc; 3212 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3213 PLogMessage("G2G done WR wmap %d", pftell(fp)); 3214 #endif 3215 3216 if (context->grmtyp != GrammarTypeItemList) 3217 { 3218 if (serializeArcTokenInfo(context, fp) != ESR_SUCCESS) 3219 { 3220 PLogError("FST_DumpContextAsImage: could not write arc token data.\n"); 3221 return FST_FAILED_INTERNAL; 3222 } 3223 } 3224 else 3225 { 3226 /* nothing, at decoding time, we'll just construct the graph 3227 from the word list, ie just this .. 3228 0 1 0 3229 1 2 4 3230 ... 3231 1 2 1316 3232 2 3233 ie 3234 0 1 0 3235 for(i=4;i<nwords;i++) { "1 2 i" } 3236 2 3237 because words 0-3 are eps,-pau-,-pau2-,slot (at) grammarname.grxml 3238 */ 3239 } 3240 3241 /* Write header information. */ 3242 header[0] = pftell(fp); 3243 header[1] = IMAGE_FORMAT_V2; 3244 3245 /* goto beginning of file. */ 3246 if (pfseek(fp, 0, SEEK_SET)) 3247 { 3248 PLogError("FST_DumpContextAsImage: could not reposition for header.\n"); 3249 return FST_FAILED_INTERNAL; 3250 } 3251 3252 if (pfwrite(header, 4, 2, fp) != 2) 3253 { 3254 PLogError("FST_DumpContextAsImage: Could not write header.\n"); 3255 return FST_FAILED_INTERNAL; 3256 } 3257 3258 /* reset pointer at end of file. */ 3259 if (pfseek(fp, 0, SEEK_END)) 3260 { 3261 PLogError("FST_DumpContextAsImage: could not reposition file pointer at end.\n"); 3262 return FST_FAILED_INTERNAL; 3263 } 3264 3265 return FST_SUCCESS; 3266 } 3267 3268 3269 int FST_LoadContextFromImageV2(srec_context* fst, PFile* fp) 3270 { 3271 asr_int32_t header[6]; 3272 arcID tmp[32], num_arcs, new_arc_id; 3273 unsigned int i, nfields; 3274 srec_fsm_exit_point *p, *q; 3275 arcID max_num_FSMarcs; 3276 nodeID max_num_FSMnodes; 3277 FSMarc_ptr atok = (FSMarc_ptr)0; 3278 FSMarc *atoken = NULL; 3279 FSMnode *fr_node, *to_node; 3280 int rc = FST_SUCCESS; 3281 ESR_ReturnCode esr_rc; 3282 ESR_BOOL seenFinalNode = ESR_FALSE; 3283 3284 /* read header information. 
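   only the last two header words (model id and grammar type) are read here;
   the leading size and format words were already consumed by
   FST_LoadContextFromImage() before it dispatched to this reader.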
*/ 3285 if (pfread(&header[2], sizeof(header[0]), 2, fp) != 2) 3286 { 3287 PLogError("FST_DumpContextAsImage: Could not read header.\n"); 3288 return FST_FAILED_INTERNAL; 3289 } 3290 fst->modelid = header[2]; 3291 fst->grmtyp = header[3]; 3292 3293 nfields = 16; 3294 if (pfread(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3295 return ESR_READ_ERROR; 3296 3297 /* arcs and nodes */ 3298 i = 0; 3299 fst->num_arcs = max_num_FSMarcs = tmp[i++]; 3300 fst->FSMarc_list_len = tmp[i++]; 3301 fst->num_base_arcs = tmp[i++]; 3302 fst->FSMarc_freelist = tmp[i++]; 3303 fst->max_searchable_arcs = tmp[i++] * 0; /*5*/ 3304 fst->num_nodes = max_num_FSMnodes = tmp[i++]; 3305 fst->FSMnode_list_len = tmp[i++]; 3306 fst->num_base_nodes = tmp[i++]; 3307 fst->FSMnode_freelist = tmp[i++]; 3308 fst->start_node = tmp[i++];/*10*/ 3309 fst->end_node = tmp[i++]; 3310 fst->max_searchable_nodes = tmp[i++] * 0; 3311 fst->beg_silence_word = tmp[i++]; 3312 fst->end_silence_word = tmp[i++]; 3313 fst->hack_silence_word = tmp[i++]; /*15*/ 3314 fst->hmm_ilabel_offset = tmp[i++]; /* 16 */ 3315 3316 fst->addWordCaching_lastslot_name = 0; 3317 fst->addWordCaching_lastslot_num = MAXwordID; 3318 fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE; 3319 3320 ASSERT(i == nfields); 3321 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3322 PLogMessage("G2G done RD hdrs %d", pftell(fp)); 3323 #endif 3324 3325 ASSERT(fst->num_arcs >= fst->num_base_arcs); // was == 3326 ASSERT(fst->num_nodes >= fst->num_base_nodes); // was == 3327 3328 fst->FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs"); 3329 if (!fst->FSMarc_list) 3330 { 3331 rc = FST_FAILED_ON_MEMORY; 3332 PLogError("CALLOC_CLR fst->FSMarc_list \n"); 3333 goto CLEANUP; 3334 } 3335 fst->FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes"); 3336 if (!fst->FSMnode_list) 3337 { 3338 rc = FST_FAILED_ON_MEMORY; 3339 PLogError("CALLOC_CLR fst->FSMnode_list failed\n"); 3340 goto CLEANUP; 3341 } 3342 fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos"); 3343 3344 if (!fst->FSMnode_info_list) 3345 { 3346 rc = FST_FAILED_ON_MEMORY; 3347 PLogError("CALLOC_CLR fst->FSMnode_info_list failed\n"); 3348 goto CLEANUP; 3349 } 3350 3351 /* setup the arc freelist */ 3352 fst->FSMarc_freelist = 0; 3353 fst->num_arcs = 0; 3354 for (i = 0; i < (unsigned)fst->FSMarc_list_len - 1; i++) 3355 { 3356 fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1); 3357 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE; 3358 } 3359 fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL; 3360 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE; 3361 3362 /* initialize the nodes, 'cuz reading is random order */ 3363 for (i = 0; i < max_num_FSMnodes; i++) 3364 { 3365 fr_node = &fst->FSMnode_list[i]; 3366 fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL; 3367 } 3368 3369 /* 1. 
first load up all the information */ 3370 num_arcs = 0; 3371 for (i = 0; i < max_num_FSMarcs || !seenFinalNode; i++) 3372 { 3373 if (i > max_num_FSMarcs && !seenFinalNode) 3374 { 3375 PLogError("Final node never encountered"); 3376 rc = ESR_INVALID_STATE; 3377 goto CLEANUP; 3378 } 3379 nfields = 5; 3380 if (pfread(tmp,sizeof(tmp[0]),nfields,fp) != nfields) 3381 { 3382 PLogError("reading arc"); 3383 rc = ESR_INVALID_STATE; 3384 goto CLEANUP; 3385 } 3386 3387 if (tmp[1] == MAXnodeID) 3388 { 3389 seenFinalNode = ESR_TRUE; 3390 if(fst->end_node != tmp[0]) { 3391 PLogError("error with arc %d->%d ilabel %d olabel %d cost %d\n", tmp[0], tmp[1], tmp[2], tmp[3], tmp[4]); 3392 ASSERT(fst->end_node == tmp[0]); 3393 } 3394 i--; 3395 } 3396 else 3397 { 3398 new_arc_id = fst_get_free_arc(fst); 3399 if (new_arc_id == MAXarcID) 3400 return FST_FAILED_ON_MEMORY; 3401 atok = ARC_ItoX(new_arc_id); 3402 atoken = ARC_XtoP(atok); 3403 num_arcs++; 3404 3405 ASSERT(tmp[0] < fst->num_nodes); 3406 ASSERT(tmp[1] < fst->num_nodes); 3407 fr_node = &fst->FSMnode_list[ tmp[0]]; 3408 to_node = &fst->FSMnode_list[ tmp[1]]; 3409 atoken->ilabel = tmp[2]; 3410 atoken->olabel = tmp[3]; 3411 atoken->cost = tmp[4]; 3412 append_arc_leaving_node(fst, fr_node, atok); 3413 append_arc_arriving_node(fst, to_node, atok); 3414 atoken->fr_node = NODE_ItoX(tmp[0]); 3415 atoken->to_node = NODE_ItoX(tmp[1]); 3416 } 3417 } 3418 ASSERT(fst->num_arcs == num_arcs); 3419 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3420 PLogMessage("G2G done RD arcs %d", pftell(fp)); 3421 #endif 3422 3423 /* setup the node freelist */ 3424 if (fst->num_nodes < fst->FSMnode_list_len) 3425 { 3426 fst->FSMnode_freelist = fst->num_nodes; 3427 for (i = fst->num_nodes; i < (unsigned)fst->FSMnode_list_len - 1; i++) 3428 { 3429 fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1); 3430 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE; 3431 } 3432 fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL; 3433 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE; 3434 } 3435 else 3436 { 3437 fst->FSMnode_freelist = MAXnodeID; 3438 } 3439 3440 /* node info .. will now be calculated on line */ 3441 /* wrapup_cost .. 
will now be calculated on line */ 3442 fst->whether_prepared = 0; 3443 fst_fill_node_info(fst); 3444 for (i = 0; i < fst->num_nodes; i++) 3445 fst->FSMnode_info_list[i] = NODE_INFO_UNKNOWN; 3446 3447 /* exit points for slots */ 3448 if (pfread(&fst->num_fsm_exit_points, 2, 1, fp) != 1) 3449 return ESR_READ_ERROR; 3450 p = fst->fsm_exit_points; 3451 q = p + MAX_NUM_SLOTS; 3452 while (p < q) 3453 { 3454 if (pfread(&p->from_node_index, 2, 1, fp) != 1) 3455 return ESR_WRITE_ERROR; 3456 if (pfread(&p->arc_index, 2, 1, fp) != 1) 3457 return ESR_WRITE_ERROR; 3458 if (pfread(&p->wbto_node_index, 2, 1, fp) != 1) 3459 return ESR_WRITE_ERROR; 3460 ++p; 3461 } 3462 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3463 PLogMessage("G2G done RD exits %d", pftell(fp)); 3464 #endif 3465 3466 /* no entry points stuff */ 3467 3468 /* ilabels were not saved, create them as empty */ 3469 fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap"); 3470 fst->ilabels->num_words = fst->ilabels->max_words = 0; 3471 fst->ilabels->words = 0; 3472 3473 /* now save the olabels */ 3474 esr_rc = deserializeWordMapV2(&fst->olabels, fp); 3475 if (esr_rc != ESR_SUCCESS) 3476 { 3477 PLogError("deserializeWordMapV2() failed\n"); 3478 rc = FST_FAILED_INTERNAL; 3479 goto CLEANUP; 3480 } 3481 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3482 PLogMessage("G2G done RD wmap %d", pftell(fp)); 3483 #endif 3484 3485 /* arctokeninfo */ 3486 if (fst->grmtyp != GrammarTypeItemList) 3487 { 3488 if (deserializeArcTokenInfo(fst, fp) != ESR_SUCCESS) 3489 { 3490 PLogError("FST_DumpContextAsImage: could not write arc token data.\n"); 3491 rc = FST_FAILED_INTERNAL; 3492 goto CLEANUP; 3493 } 3494 } 3495 else 3496 { 3497 /* here we just construct an isolated list, but we should scrap it later */ 3498 wordID wdid; 3499 arc_token *atl, *final, *atli; 3500 fst->arc_token_list_len = fst->olabels->num_words + 2 - NUM_ITEMLIST_HDRWDS; 3501 atl = NEW_ARRAY(arc_token, fst->arc_token_list_len, L("srec.graph.wordgraph")); 3502 if(!atl) { 3503 PLogError("fst->arc_token_list_len failed\n"); 3504 goto CLEANUP; 3505 } 3506 atl->first_next_arc = ARC_TOKEN_LNK(atl, 1); 3507 atl->next_token_index = ARC_TOKEN_NULL; 3508 final = atl + fst->arc_token_list_len - 1; 3509 for (i = 1, wdid = NUM_ITEMLIST_HDRWDS; wdid < fst->olabels->num_words; i++, wdid++) 3510 { 3511 atli = &atl[i]; 3512 atli->ilabel = wdid; 3513 atli->olabel = wdid;/*not used*/ 3514 atli->next_token_index = ARC_TOKEN_LNK(atl, (i + 1)); 3515 atli->first_next_arc = ARC_TOKEN_LNK(atl, (fst->arc_token_list_len - 1)); /*final*/ 3516 } 3517 ASSERT(atl + i == final); 3518 final->next_token_index = ARC_TOKEN_NULL; 3519 final->first_next_arc = ARC_TOKEN_NULL; 3520 final->ilabel = final->olabel = 0; 3521 fst->arc_token_list = atl; 3522 } 3523 return FST_SUCCESS; 3524 CLEANUP: 3525 return FST_FAILED_INTERNAL; 3526 } 3527 3528 3529 int FST_LoadContextFromImageV2(srec_context* context, PFile* fp); 3530 3531 int FST_LoadContextFromImage(srec_context** pcontext, PFile* fp) 3532 { 3533 srec_context* context = NULL; 3534 /* used to force correct data alignment */ 3535 asr_int32_t header[2]; 3536 int rc = FST_SUCCESS; 3537 3538 /*printf("FST_LoadContexFromImage\n");*/ 3539 if (!fp) 3540 { 3541 PLogError("FST_LoadContextImage() failed; bad file pointer\n"); 3542 return FST_FAILED_ON_INVALID_ARGS; 3543 } 3544 3545 context = NEW(srec_context, L("srec.graph.binary")); 3546 if (context == NULL) 3547 { 3548 PLogError("FST_LoadContextFromImage: out of memory while allocating context.\n"); 3549 return FST_FAILED_ON_MEMORY; 3550 } 
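/* clear every field up front so the deserialization below starts from a
   known-zero context */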
3551 memset(context, 0, sizeof(srec_context)); 3552 3553 /* header */ 3554 if (pfread(header, 4, 2, fp) != 2) 3555 { 3556 PLogError("FST_LoadContextFromImage: failed reading header.\n"); 3557 rc = FST_FAILED_ON_INVALID_ARGS; 3558 goto CLEANUP; 3559 } 3560 3561 if (header[1] == IMAGE_FORMAT_V2) 3562 { 3563 rc = FST_LoadContextFromImageV2(context, fp); 3564 if (rc != FST_SUCCESS) 3565 goto CLEANUP; 3566 } 3567 else 3568 { 3569 PLogError("FST_LoadContextFromImage() failed on image_format\n"); 3570 goto CLEANUP; 3571 } 3572 *pcontext = context; 3573 return FST_SUCCESS; 3574 CLEANUP: 3575 if (context) FREE(context); 3576 *pcontext = 0; 3577 return rc; 3578 } 3579 3580 3581 int FST_DumpReverseWordGraph(srec_context* context, PFile* fp) 3582 { 3583 /* not implemented, use FST_DumpSyntaxAsImage() for now */ 3584 return 1; 3585 } 3586 3587 /* this belongs part of the srec_context */ 3588 typedef struct 3589 { 3590 asr_int32_t image_format; 3591 asr_int32_t image_size; 3592 asr_int32_t sizes_signature; 3593 } 3594 context_image_header; 3595 3596 3597 int FST_LoadContextFromStreamImage(srec_context** pcontext, char* buffer, asr_int32_t buffer_len) 3598 { 3599 return FST_FAILED_INTERNAL; 3600 } 3601 3602 typedef struct nodeID_list_t 3603 { 3604 nodeID *nodes, num_nodes, num_alloc; 3605 } 3606 nodeID_list; 3607 3608 int fst_node_has_speech_to_come(srec_context* context, nodeID node_index) 3609 { 3610 /* recursive func ... try to keep stack use low! */ 3611 /* later we should cache this information: 3612 need 2bits: endn opte regl dunno */ 3613 FSMarc* arc; 3614 arcID arc_index = context->FSMnode_list[ node_index].un_ptr.first_next_arc; 3615 for (; arc_index != MAXarcID; arc_index = arc->linkl_next_arc) 3616 { 3617 arc = &context->FSMarc_list[arc_index]; 3618 if (arc->ilabel >= context->hmm_ilabel_offset + EPSILON_OFFSET + NUM_SILENCE_HMMS) 3619 { 3620 return 1; 3621 } 3622 else if (arc->ilabel < EPSILON_OFFSET) 3623 { 3624 if (fst_node_has_speech_to_come(context, arc->to_node)) 3625 { 3626 return 1; 3627 } 3628 } 3629 else if (IS_SILENCE_ILABEL(arc->ilabel, context)) 3630 { 3631 /* another silence! 
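keep scanning past it for real speech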
      if (fst_node_has_speech_to_come(context, arc->to_node))
        return 1;
    }
  }
  return 0;
}

int fst_fill_node_info(srec_context* context)
{
  char* infos = context->FSMnode_info_list;
  nodeID_list optends_obj;
  nodeID_list *optends = &optends_obj;

  FSMnode* node;
  FSMarc* arc;
  nodeID i, j, node_index;
  arcID arc_index;
  costdata wrapup_cost;
  int wrapup_cost_inconsistencies;
  /* int num_near_end_nodes; nodes within EPS or silence to the end, these
     are called NODE_INFO_ENDNODES, so that they can be excluded from the
     comparison with end_node score in the eos detection */

  optends->num_nodes = 1;
  optends->num_alloc = 8192; /* scaled up from 512 to support dynamic grammar add word of 5000 */
  optends->nodes = (nodeID*)CALLOC(optends->num_alloc, sizeof(nodeID), "srec.tmp.optendnodes");
  optends->nodes[0] = context->end_node;
  /* num_near_end_nodes = 0; */

  for (i = 0; i < optends->num_nodes; i++)
  {
    node_index = optends->nodes[i];
    node = &context->FSMnode_list[node_index];
    for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
         arc_index = arc->linkl_prev_arc)
    {
      arc = &context->FSMarc_list[arc_index];
      if (arc->fr_node != node_index)
      {
        if (IS_SILENCE_ILABEL(arc->ilabel, context) || arc->ilabel < EPSILON_OFFSET)
        {
          /* ok, fr_node goes to the end via silence, check for dups */
          for (j = 0; j < optends->num_nodes; j++)
            if (optends->nodes[j] == arc->fr_node) break;
          /* append it to the list */
          if (j == optends->num_nodes)
          {
            if (optends->num_nodes < optends->num_alloc)
              optends->nodes[optends->num_nodes++] = arc->fr_node;
            else
            {
              ASSERT(0 && "allocated too few optend nodes");
              return 0;
            }
          }
        }
      }
    }
  }

  /* now set the info array, default is regular */
  for (i = 0; i < context->num_nodes; i++)
    infos[i] = NODE_INFO_REGULAR;
  /* also set the other nodes, we'll check whether nodes were added
     via these flags */
  for (; i < context->FSMnode_list_len; i++)
    infos[i] = NODE_INFO_UNKNOWN;
  infos[context->end_node] = NODE_INFO_ENDNODE;

  /* get rid of direct-to-end nodes, etc */
  for (i = 0, j = 0; i < optends->num_nodes; i++)
  {
    optends->nodes[j] = optends->nodes[i];
    if (fst_node_has_speech_to_come(context, optends->nodes[i]))
    {
      j++;
      infos[optends->nodes[i]] = NODE_INFO_OPTENDN;
    }
    else
    {
      infos[optends->nodes[i]] = NODE_INFO_ENDNODE;
      /* num_near_end_nodes++; */
    }
  }
  optends->num_nodes = j;

  /* printf("get_oppend_nodes (%d)", optends->num_nodes);
     for (i = 0; i < optends->num_nodes; i++) printf(" %d", optends->nodes[i]);
     printf("\n");
  */

  FREE(optends->nodes);

  /* find the wrapup cost */
  node = &context->FSMnode_list[context->end_node];
  wrapup_cost = MAXcostdata;
  wrapup_cost_inconsistencies = 0;
  for (arc_index = node->first_prev_arc; arc_index != MAXarcID;
       arc_index = arc->linkl_prev_arc)
  {
    arc = &context->FSMarc_list[arc_index];
    if (IS_SILENCE_ILABEL(arc->ilabel, context) &&
        arc->olabel == context->end_silence_word)
    {
      if (wrapup_cost == MAXcostdata)
        wrapup_cost = arc->cost;
      else if (context->wrapup_cost != arc->cost)
      {
        wrapup_cost = arc->cost;
        wrapup_cost_inconsistencies++;
      }
    }
  }

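  /* clamp the derived costs: wtw_average is capped at 200 and wrapup_cost is
     floored at MIN_WRAPUP_COST below */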
#define MIN_WRAPUP_COST 100
  context->wtw_average = wrapup_cost;
  if (context->wtw_average > 200)
    context->wtw_average = 200;
  if (context->wrapup_cost < MIN_WRAPUP_COST)
    context->wrapup_cost = MIN_WRAPUP_COST;
  /* log_report("context->wrapup_cost %d (%d)\n", context->wrapup_cost,
                wrapup_cost_inconsistencies); */
  return 0;
}

int fst_alloc_transit_points(srec_context* context)
{
  arcID i;
  wordID num_slots = context->olabels->num_slots;
  wordID olabel;
  asr_int16_t nfxps = 0;
  FSMarc* arc;
  FSMnode* node;

  context->num_fsm_exit_points = 0;
  /* slot 0 is invalid, it is the "eps", so num_slots==1 means no slots! */
  if (num_slots == 1)
    return 0;

  /* scan through to find arcs that carry rule references */
  for (i = 0; i < context->num_arcs; i++)
  {
    olabel = context->FSMarc_list[i].olabel;
    /* (wordID)(olabel-1) < (num_slots-1) ?? might work */
    if (olabel > 0 && olabel < num_slots)
    {
      context->FSMarc_list[i].cost = DISABLE_ARC_COST;
      if (nfxps >= MAX_NUM_SLOTS)
      {
        PLogError("error: too many fsm exit points in fsm, too many public rules referenced from here\n");
        return 0;
      }
      context->fsm_exit_points[nfxps].arc_index = i;
      context->fsm_exit_points[nfxps].from_node_index = context->FSMarc_list[i].fr_node;
      /* skip over the trailing silence and .wb labels */
      for (arc = &context->FSMarc_list[i]; arc->ilabel != WORD_BOUNDARY;)
      {
        node = &context->FSMnode_list[arc->to_node];
        arc = &context->FSMarc_list[node->un_ptr.first_next_arc];
      }
      context->fsm_exit_points[nfxps].wbto_node_index = arc->to_node;
      nfxps++;

    } /* olabel < num_slots */
  } /* i < num_arcs */
  context->num_fsm_exit_points = nfxps;
  return 0;
}


/*
  cross-word modeling:
  for now we'll assume silence context on either side; later we'll create
  a graph with the right fan-out on the edges of each slot. At that point we
  may have routines like ... if (is_dead_end(arc)) continue;

  word removal:
  we'll use refcounters on arcs; we can search the whole graph for the
  olabel, then remove forward and remove backwards

  word addition:
  glenville ^ hmm183 hmm222 hmm162 hmm246 hmm346 hmm191 hmm219

  set imap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/model.map
  set omap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/namesnnumsSC.map
  wintel/debug/addword.exe -min
  $SWISDK/bin/fsmcompileD -t -i $imap -o $omap -F out.PCLG out.PCLG.txt
  $SWISDK/bin/fsmdrawD -i $imap -o $omap out.PCLG > out.PCLG.dot
  dotty out.PCLG.dot

  still needed:
  - a sanity check
  - remove word
  - multiple pronunciations handling
  - homonyms handling, other boundary-condition checks
  - measure the speed

  alan_adams al~#ad}z
  alan_adams al~ad}z
  paul_adams p{l#ad}z
*/
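/*
  Illustrative only: a minimal sketch of the "sanity check" item listed in the
  notes above.  It walks the arc table the same way fst_alloc_transit_points()
  does and verifies that every arc's endpoints index a valid node.  The
  function name is hypothetical and the block is compiled out with #if 0; it
  is not part of this module's API.
*/
#if 0
static int fst_arc_table_sanity_check_sketch(srec_context* context)
{
  arcID i;
  for (i = 0; i < context->num_arcs; i++)
  {
    FSMarc* arc = &context->FSMarc_list[i];
    /* an out-of-range endpoint indicates a corrupted graph */
    if (arc->fr_node >= context->num_nodes || arc->to_node >= context->num_nodes)
      return FST_FAILED_INTERNAL;
  }
  return FST_SUCCESS;
}
#endif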