1 /*---------------------------------------------------------------------------* 2 * srec_context.c * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 21 #include "stdlib.h" 22 #include "string.h" 23 #include "buildopt.h" 24 #include "setting.h" 25 #include "passert.h" 26 #include "pendian.h" 27 #include "portable.h" 28 #include "pstdio.h" 29 #include "ptypes.h" 30 #include "srec_context.h" 31 #include "srec_sizes.h" 32 #include "search_network.h" 33 #include "srec_arb.h" 34 #include "hmm_desc.h" 35 #if USE_COMP_STATS 36 #include "comp_stats.h" 37 #endif 38 39 #ifdef SET_RCSID 40 static const char *rcsid = 0 ? (const char *) &rcsid : 41 "$Id: srec_context.c,v 1.84.4.54 2008/05/15 20:06:39 dahan Exp $"; 42 #endif 43 44 45 /* these are constants for now, we need to figure out how to make 46 this more flexible, possible ideas are: 47 (1) to specify these via API functions such as CR_LoadSyntax(), 48 (2) use defaults that would have been set the main parfile, 49 (3) put values for these fields in an actual grammar 50 */ 51 52 #define AVG_ARCS_PER_WORD 10 53 #define AVG_NODES_PER_WORD 7 54 #define AVG_CHARS_PER_WORD 18 55 #define MAX_PRON_LEN 1024 56 #define MAX_WORD_LEN 128 57 #define DO_WEIGHTED_ADDWORD 1 58 #define SLOTLOOP_OFFSET 64 59 #define SLOTNAME_INDICATOR "__" 60 #define RULE_INDICATOR '@' 61 #define MAX_LINE_LENGTH 512 62 #define DEFAULT_WB_COST 40 63 #define DISABLE_ARC_COST 999 64 #define DO_ARCS_IO_IN_ARC_ORDER 1 65 66 #define IS_SILENCE_ILABEL(ilabel,context) (ilabel >= context->hmm_ilabel_offset+EPSILON_OFFSET && ilabel<context->hmm_ilabel_offset+EPSILON_OFFSET+NUM_SILENCE_HMMS) 67 // looking to match filename.grxml.Names@__Names__ 68 // see also grxmlcompile.cpp where these names are constructed 69 #define IS_SLOT_OLABEL(wd) (strchr(wd,IMPORTED_RULES_DELIM)<strstr(wd,SLOTNAME_INDICATOR) && strlen(wd)>4 && !strcmp(wd+strlen(wd)-2,SLOTNAME_INDICATOR) ) 70 71 /* 72 SYNTAX : DATA_ALIGN(pointer, data type, bytes_filled) 73 EXAMPLE: We need to cast a memory address to a (wordmap*) 74 so we call DATA_ALIGN(memptr, wordmap, filled), 75 where FILLED contains the number of bytes that were used to align the pointer 76 */ 77 #if (CPU & CPU_ARM)||(CPU & CPU_STRONGARM) 78 /* under IPAQ it appears we need to align the structures 79 to certain boundaries. More attention needed here for other 80 platforms ! */ 81 #define DATA_ALIGN(x, y, z) if ((int) (x) % sizeof(y)) { \ 82 (z)=sizeof(y) - ((int) (x) % sizeof(y)); \ 83 (x) += (z); /* force correct data alignment */ \ 84 } else z=0; 85 #else 86 #define DATA_ALIGN(x, y, z) 87 #endif 88 89 static const int do_minimize = 1; 90 #define PTR_TO_IDX(ptr, base) ((asr_uint32_t) (ptr == NULL ? 0xFFFFFFFFu : (asr_uint32_t)(ptr - base))) 91 #define IDX_TO_PTR(idx, base) (idx == 0xFFFFFFFFu ? NULL : base + idx) 92 93 /* prototypes */ 94 95 /*----------------------------------------------------------------------* 96 * * 97 * internal prototypes * 98 * * 99 *----------------------------------------------------------------------*/ 100 101 /* general use functions */ 102 char split_line(char* line, char** av); 103 asr_int32_t atoi_with_check(const char* buf, asr_int32_t max); 104 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc); 105 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc); 106 int printf_node1(srec_context* fst, FSMnode* node); 107 108 /* fst_* functions are internal fst functions */ 109 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node, 110 wordID olabel, costdata cost, 111 modelID* model_sequence, int num_models); 112 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc); 113 int fst_push_arc_cost(srec_context* fst, FSMarc* arc); 114 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc); 115 int fst_free_arc(srec_context* fst, FSMarc* arc); 116 int fst_free_node(srec_context* fst, FSMnode* node); 117 int fst_free_arc_net(srec_context* fst, FSMarc* arc); 118 arcID fst_get_free_arc(srec_context* fst); 119 nodeID fst_get_free_node(srec_context* fst); 120 int fst_pack_arc_usage(srec_context* fst); 121 122 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr atok); 123 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr atok); 124 int num_arcs_leaving(srec_context* fst, FSMnode* node); 125 126 int num_arcs_arriving(srec_context* fst, FSMnode* node); 127 int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node); 128 int fst_fill_node_info(srec_context* context); 129 int fst_alloc_transit_points(srec_context* context); 130 131 /* higher-level functions */ 132 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add, 133 PFile* fp); 134 int FST_LoadParams(srec_context* context, PFile* fp); 135 int FST_AssumeParams(srec_context* context); 136 int FST_UnloadReverseWordGraph(srec_context* context); 137 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree); 138 139 static ESR_ReturnCode wordmap_clean ( wordmap *word_map ); 140 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words ); 141 142 /*--------------------------------------------------------------------------* 143 * * 144 * * 145 * * 146 *--------------------------------------------------------------------------*/ 147 148 int FST_IsVoiceEnrollment(srec_context* context) 149 { 150 if (context->olabels == NULL) return 0; 151 if (context->olabels->num_words < 2) return 0; 152 if (strstr(context->olabels->words[1], "enroll")) return 1; 153 return 0; 154 } 155 156 int FST_LoadContext(const char* synbase, srec_context** pcontext, 157 int num_words_to_add) 158 { 159 int rc; 160 PFile* fp; 161 char buffer[MAX_LINE_LENGTH]; 162 srec_context* context; 163 164 context = (srec_context*)CALLOC_CLR(1, sizeof(srec_context), "srec.graph.base"); 165 if(!context) 166 return FST_FAILED_ON_MEMORY; 167 memset(context, 0, sizeof(srec_context)); 168 169 context->addWordCaching_lastslot_name = 0; 170 context->addWordCaching_lastslot_num = MAXwordID; 171 context->addWordCaching_lastslot_needs_post_silence = ESR_FALSE; 172 173 sprintf(buffer, "%s.map", synbase); 174 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE); 175 if (!fp) return FST_FAILED_ON_INVALID_ARGS; 176 rc = FST_LoadWordMap(&context->olabels, num_words_to_add, fp); 177 pfclose(fp); 178 if (rc) return rc; 179 180 sprintf(buffer, "%s.PCLG.txt", synbase); 181 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE); 182 if (!fp) return FST_FAILED_ON_INVALID_ARGS; 183 rc = FST_LoadGraph(context, context->ilabels, context->olabels, num_words_to_add, fp); 184 pfclose(fp); 185 if (rc != FST_SUCCESS) return rc; 186 187 sprintf(buffer, "%s.Grev2.det.txt", synbase); 188 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE); 189 if (!fp) return FST_FAILED_ON_INVALID_ARGS; 190 rc = FST_LoadReverseWordGraph(context, num_words_to_add, fp); 191 pfclose(fp); 192 if (rc != FST_SUCCESS) return rc; 193 194 sprintf(buffer, "%s.params", synbase); 195 fp = file_must_open(NULL, buffer, L("r"), ESR_TRUE); 196 if (fp) { 197 rc = FST_LoadParams(context, fp); 198 pfclose(fp); 199 if (rc != FST_SUCCESS) return rc; 200 } else { 201 rc = FST_AssumeParams(context); 202 } 203 204 context->grmtyp = (arcID)FST_GetGrammarType(context); 205 206 *pcontext = context; 207 rc = fst_fill_node_info(context); 208 return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS; 209 } 210 211 void FST_UnloadContext(srec_context* context) 212 { 213 if (!context) return; 214 215 FST_UnloadWordMap(&context->ilabels); 216 FST_UnloadWordMap(&context->olabels); 217 FST_UnloadGraph(context); 218 FST_UnloadReverseWordGraph(context); 219 FREE(context); 220 } 221 222 int FST_PrepareContext(srec_context* context) 223 { 224 nodeID i; 225 int rc = FST_SUCCESS; 226 /* after all word additions, we need to "prepare" the context */ 227 228 /* first check for any changes to optional endnodes etc .. */ 229 for (i = 0; i < context->num_nodes; i++) 230 if (context->FSMnode_info_list[i] == NODE_INFO_UNKNOWN) 231 break; 232 if (i != context->num_nodes) 233 rc = fst_fill_node_info(context); 234 235 context->whether_prepared = 1; 236 return rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS; 237 } 238 239 void fst_set_wb_costs( srec_context* context, costdata wbcost) 240 { 241 unsigned int i; 242 for(i=0; i<(unsigned int)context->FSMarc_list_len; i++) { 243 if(context->FSMarc_list[i].ilabel == WORD_BOUNDARY) 244 context->FSMarc_list[i].cost = wbcost; 245 } 246 } 247 248 int FST_LoadParams(srec_context* context, PFile* fp) 249 { 250 char line[MAX_LINE_LENGTH]; 251 costdata wbcost = MAXcostdata; 252 while (pfgets(line, MAX_LINE_LENGTH, fp)) 253 { 254 char* key = strtok(line," \n\r\t"); 255 char* val = key ? strtok(NULL," \n\r\t") : NULL; 256 if(val && !strcmp(val,"=")) 257 val = key ? strtok(NULL," \n\r\t") : NULL; 258 if(!key || !key[0]) 259 continue; 260 else if(val && val[0]) { 261 if(!strcmp(key,"word_penalty")) { 262 wbcost = (costdata)atoi_with_check(val, MAXcostdata); 263 if(wbcost == MAXcostdata) { 264 return FST_FAILED_ON_INVALID_ARGS; 265 } 266 } else { 267 PLogError(L("error: unknown parameter %s in .params file"), key); 268 return FST_FAILED_ON_INVALID_ARGS; 269 } 270 } 271 } 272 if(wbcost != MAXcostdata) 273 fst_set_wb_costs( context, wbcost); 274 return FST_SUCCESS; 275 } 276 277 int FST_AssumeParams( srec_context* context) 278 { 279 fst_set_wb_costs( context, (costdata)DEFAULT_WB_COST); 280 return FST_SUCCESS; 281 } 282 283 284 /*--------------------------------------------------------------------------* 285 * * 286 * * 287 * * 288 *--------------------------------------------------------------------------*/ 289 290 modelID hmm_number(const char* hmm_Name, modelID hmm_ilabel_offset) 291 { 292 if (!strcmp(hmm_Name, "eps")) return EPSILON_LABEL; 293 if (!strcmp(hmm_Name, ".wb")) return WORD_BOUNDARY; 294 if (!strcmp(hmm_Name, ".ph")) return PHONE_BOUNDARY; 295 ASSERT(hmm_Name[0] == 'h' && hmm_Name[1] == 'm' && hmm_Name[2] == 'm' && hmm_Name[3]); 296 return (modelID)(hmm_ilabel_offset + (modelID)atoi_with_check(hmm_Name + 3, MAXmodelID)); 297 } 298 299 char* hmm_name(modelID ilabel, modelID hmm_ilabel_offset, char* buf) 300 { 301 if (ilabel == EPSILON_LABEL) 302 sprintf(buf, "eps"); 303 else if (ilabel == WORD_BOUNDARY) 304 sprintf(buf, ".wb"); 305 else if (ilabel == PHONE_BOUNDARY) 306 sprintf(buf, ".ph"); 307 else 308 sprintf(buf, "hmm%03d", ilabel - hmm_ilabel_offset); 309 return buf; 310 } 311 312 /*------------------------------------------------------------------* 313 * * 314 * words * 315 * * 316 *------------------------------------------------------------------*/ 317 int HashCmpWord(const LCHAR *key1, const LCHAR *key2) 318 { 319 return LSTRCMP(key1,key2); 320 } 321 unsigned int HashGetCode(const void *key) 322 { 323 const LCHAR* k = (const LCHAR*)key; 324 unsigned int i, len, h = 0; 325 len = LSTRLEN( k); 326 for ( i = 0; i < len; i++) 327 h = 31*h + (unsigned int)k[i]; 328 return h; 329 } 330 331 int wordmap_create(wordmap** pwmap, int num_chars, int num_words, int num_words_to_add) 332 { 333 wordmap* Interface; 334 PHashTableArgs hashArgs; 335 ESR_ReturnCode rc; 336 337 Interface = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.wordmap.base"); 338 Interface->max_words = (wordID)(num_words + num_words_to_add); 339 Interface->num_words = (wordID)0; 340 /* *pwmap->words = (ptr32*) CALLOC_CLR(wmap->max_words, sizeof(ptr32), "graph.wordmap.words"); */ 341 Interface->words = (char**) CALLOC_CLR(Interface->max_words, sizeof(char*), "srec.graph.wordmap.words"); 342 Interface->max_chars = num_chars + num_words_to_add * AVG_CHARS_PER_WORD; 343 Interface->chars = (char*) CALLOC_CLR(Interface->max_chars, sizeof(char), "srec.graph.wordmap.chars"); 344 Interface->next_chars = Interface->chars; 345 Interface->wordIDForWord = NULL; 346 *pwmap = Interface; 347 348 /* use a hashtable to save mapping between wdID and array index */ 349 if (num_words_to_add >= 0) 350 { 351 hashArgs.capacity = num_words + num_words_to_add + 10; 352 if(hashArgs.capacity%2==0) hashArgs.capacity += 1; 353 hashArgs.compFunction = HashCmpWord; // PHASH_TABLE_DEFAULT_COMP_FUNCTION; 354 hashArgs.hashFunction = HashGetCode; // PHASH_TABLE_DEFAULT_HASH_FUNCTION; 355 hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR; 356 CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.wordmap_create()"), &Interface->wordIDForWord)); 357 } 358 else 359 { 360 Interface->wordIDForWord = NULL; 361 } 362 363 return FST_SUCCESS; 364 365 CLEANUP: 366 wordmap_destroy(pwmap); 367 return rc; 368 } 369 wordID wordmap_add_word(wordmap* wmap, const char* word); 370 371 int FST_LoadWordMap(wordmap** pwmap, int num_words_to_add, PFile* fp) 372 { 373 wordID my_wID, wm_wID; 374 char *word, *wID_string; 375 int num_words; 376 asr_int32_t num_chars; 377 char buf[MAX_LINE_LENGTH]; 378 long fpos; 379 wordmap* wmap; 380 /* figure out the number of words */ 381 fpos = pftell(fp); 382 for (num_words = 0, num_chars = 0; pfgets(buf, MAX_LINE_LENGTH, fp); num_words++) 383 { 384 word = strtok(buf, " \n\r\t"); 385 num_chars += strlen(word); 386 } 387 num_chars += num_words * 2; /* for null terminators */ 388 pfseek(fp, fpos, SEEK_SET); 389 390 /* Alex added this to reuse this functionality elsewhere 391 destroy function for this create function is actually taken care of by the 392 FST_UnloadWordMap function 393 */ 394 wordmap_create(&wmap, num_chars, num_words, num_words_to_add); 395 396 /* later replace this with wordblocks, linked list of 8 character blocks! */ 397 while (pfgets(buf, MAX_LINE_LENGTH, fp)) 398 { 399 word = strtok(buf, " \n\r\t"); 400 wID_string = strtok(NULL, " \n\r\t"); 401 my_wID = (wordID)atoi_with_check(wID_string, MAXwordID); 402 403 // if(strstr(word,SLOTNAME_INDICATOR)) 404 // word = strstr(word,SLOTNAME_INDICATOR); 405 wm_wID = wordmap_add_word(wmap, word); 406 ASSERT(my_wID == wm_wID); 407 } 408 ASSERT(wmap->num_words == num_words); 409 410 for (my_wID = 1; my_wID < num_words; my_wID++) 411 { 412 if (!IS_SLOT_OLABEL(wmap->words[my_wID])) 413 break; 414 } 415 wmap->num_slots = my_wID; 416 wordmap_setbase(wmap); 417 *pwmap = wmap; 418 if(wmap->num_slots > MAX_NUM_SLOTS) 419 return FST_FAILED_ON_INVALID_ARGS; 420 else 421 return FST_SUCCESS; 422 } 423 424 int wordmap_destroy(wordmap** wmap) 425 { 426 if (wmap && *wmap) 427 { 428 wordmap_clean ( *wmap ); 429 if (((*wmap)->wordIDForWord)) PHashTableDestroy((*wmap)->wordIDForWord); 430 if (((*wmap)->chars)) FREE((*wmap)->chars); 431 if (((*wmap)->words)) FREE((*wmap)->words); 432 if ((*wmap)) FREE((*wmap)); 433 *wmap = 0; 434 } 435 return FST_SUCCESS; 436 } 437 438 int FST_UnloadWordMap(wordmap** wmap) 439 { 440 return wordmap_destroy(wmap); 441 } 442 443 444 int FST_DumpWordMap(PFile* fp, wordmap* wmap) 445 { 446 wordID my_wID; 447 for (my_wID = 0; my_wID < wmap->num_words; my_wID++) 448 { 449 pfprintf(fp, "%s %hu\n", wmap->words[my_wID], my_wID); 450 } 451 return FST_SUCCESS; 452 } 453 454 wordID wordmap_find_index(wordmap* wmap, const char* word) 455 { 456 void *wdID_void; 457 ESR_ReturnCode rc; 458 size_t i; 459 460 if (!word) 461 return MAXwordID; 462 463 if (wmap->num_words == 0) 464 return MAXwordID; 465 466 if (wmap->wordIDForWord) 467 { 468 rc = PHashTableGetValue(wmap->wordIDForWord, word, (void**)&wdID_void); 469 470 if (rc == ESR_SUCCESS) 471 return (wordID)(int)wdID_void; 472 } 473 else 474 { 475 for (i = 0; i < wmap->num_words; i++) 476 { 477 if (!strcmp(wmap->words[i], word)) 478 { 479 return (wordID)i; 480 } 481 } 482 } 483 return MAXwordID; 484 } 485 486 wordID wordmap_find_rule_index(wordmap* wmap, const char* rule) 487 { 488 int i; 489 int strlen_rule = strlen(rule); 490 for (i = wmap->num_slots; --i > 0;) 491 { /* > (not >=) because eps is always 0 */ 492 char* p = strstr(wmap->words[i], SLOTNAME_INDICATOR); 493 if(p != NULL){ 494 if (!strcmp(p, rule)) break; 495 else if(!strncmp(p,rule,strlen_rule) && !strcmp(p+strlen_rule,SLOTNAME_INDICATOR)) break; 496 } 497 } 498 if (i == 0) return MAXwordID; 499 else return(wordID)i; 500 } 501 502 wordID wordmap_find_index_in_rule(wordmap* wmap, const char* word, wordID rule) 503 { 504 int len = strlen(word); 505 int rule0 = rule + '0'; 506 void *wdID_void; 507 LCHAR word_dot_rule[256]; 508 ESR_ReturnCode rc; 509 510 if (!word) 511 return MAXwordID; 512 LSTRCPY(word_dot_rule, word); 513 word_dot_rule[len++] = IMPORTED_RULES_DELIM; 514 word_dot_rule[len++] = (char)rule0; 515 word_dot_rule[len++] = 0; 516 rc = PHashTableGetValue(wmap->wordIDForWord, word_dot_rule, (void**)&wdID_void); 517 if (rc == ESR_SUCCESS) 518 return (wordID)(int)(wdID_void); 519 return MAXwordID; 520 } 521 522 int strlen_with_null(const char* word) 523 { 524 #if 1 /* NT */ 525 int len = strlen(word) + 1; 526 if (len % 2 == 1) len++; 527 return len; 528 #else /* C54 */ 529 #endif 530 } 531 532 int wordmap_whether_in_rule(wordmap* wmap, wordID word, wordID rule) 533 { 534 char* word_chars; 535 int len; 536 if (word > wmap->num_words) return 0; 537 word_chars = wmap->words[word]; 538 len = strlen(word_chars); 539 if (word_chars[len-1] == rule + '0' && word_chars[len-2] == IMPORTED_RULES_DELIM) 540 return 1; 541 else 542 return 0; 543 } 544 545 void wordmap_setbase(wordmap* wmap) 546 { 547 wmap->num_base_words = wmap->num_words; 548 wmap->next_base_chars = wmap->next_chars; 549 } 550 551 void wordmap_ceiling(wordmap* wmap) 552 { 553 /* this is irreversible */ 554 wmap->max_words = wmap->num_words; 555 wmap->max_chars = (wmap->next_chars - wmap->chars); 556 } 557 558 559 static ESR_ReturnCode wordmap_clean ( wordmap *word_map ) 560 { 561 ESR_ReturnCode clean_status; 562 PHashTableEntry *entry; 563 PHashTableEntry *oldEntry; 564 wordID *value; 565 566 clean_status = ESR_SUCCESS; 567 568 if ( word_map->wordIDForWord == NULL ) 569 return clean_status; 570 571 clean_status = PHashTableEntryGetFirst ( word_map->wordIDForWord, &entry); 572 573 while ( ( entry != NULL ) && ( clean_status == ESR_SUCCESS ) ) 574 { 575 clean_status = PHashTableEntryGetKeyValue ( entry, NULL, (void **)&value ); 576 577 if ( clean_status == ESR_SUCCESS ) 578 { 579 oldEntry = entry; 580 clean_status = PHashTableEntryAdvance ( &entry ); 581 582 if ( clean_status == ESR_SUCCESS ) 583 clean_status = PHashTableEntryRemove ( oldEntry ); 584 } 585 } 586 return ( clean_status ); 587 } 588 589 590 static ESR_ReturnCode wordmap_populate ( wordmap *word_map, wordID num_words ) 591 { 592 ESR_ReturnCode populate_status; 593 wordID wdID; 594 595 populate_status = ESR_SUCCESS; 596 wdID = 0; 597 598 if ( word_map->wordIDForWord != NULL ) 599 { 600 while ( ( wdID < num_words ) && ( populate_status == ESR_SUCCESS ) ) 601 { 602 populate_status = PHashTablePutValue ( word_map->wordIDForWord, word_map->words[wdID], 603 (const void *)(int)wdID, NULL ); 604 if ( populate_status == ESR_SUCCESS ) 605 wdID++; 606 else { 607 return (populate_status); 608 } 609 } 610 } 611 return ( populate_status ); 612 } 613 614 void wordmap_reset(wordmap* wmap) 615 { 616 char** tmp_words; 617 int i; 618 ESR_ReturnCode reset_status; 619 620 if (wmap->num_base_words < wmap->num_words) 621 { 622 /*wordID i = (wordID)(wmap->num_base_words);*/ 623 char* old_wmap_chars = wmap->chars; 624 int offset = wmap->next_base_chars - wmap->chars; 625 char* tmp_chars = NEW_ARRAY(char, offset, L("srec.g2g.graph.wordmap.chars")); 626 if(!tmp_chars) { 627 passert ( 0 && L("failed to reset the memory for wordmap.chars ") ); 628 } 629 memcpy(tmp_chars,wmap->chars, offset * sizeof(char)); 630 FREE(wmap->chars); 631 632 wmap->chars = tmp_chars; 633 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap_chars); 634 wmap->max_chars = (wordID) offset; 635 wmap->next_chars = wmap->next_base_chars; 636 637 tmp_words = (char**) CALLOC_CLR(wmap->num_base_words, sizeof(char*), "srec.graph.wordmap.words"); 638 if(!tmp_words) { 639 passert ( 0 && L("failed to reset the memory for wordmap.words ") ); 640 } 641 memcpy( tmp_words, wmap->words, wmap->num_base_words * sizeof(char*)); 642 FREE( wmap->words); 643 644 wmap->words = tmp_words; 645 wmap->max_words = wmap->num_base_words; 646 wmap->num_words = wmap->num_base_words; 647 648 for(i=0; i<wmap->num_words; i++) 649 wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap_chars); 650 } 651 652 reset_status = wordmap_clean ( wmap ); 653 654 if ( reset_status == ESR_SUCCESS ) 655 { 656 reset_status = wordmap_populate ( wmap, wmap->num_base_words ); 657 658 if ( reset_status != ESR_SUCCESS ) 659 { 660 wordmap_clean ( wmap ); 661 passert ( 0 && L("wordmap_reset failed") ); 662 } 663 } 664 else 665 { 666 passert ( 0 && L("wordmap_reset failed") ); 667 } 668 } 669 670 #define FST_GROW_FACTOR 12/10 671 #define FST_GROW_MINCHARS 256 672 #define FST_GROW_MINWORDS 32 673 wordID wordmap_add_word(wordmap* wmap, const char* word) 674 { 675 int len = strlen_with_null(word); 676 wordID wdID =0; 677 678 if (wmap->next_chars + len >= wmap->chars + wmap->max_chars) 679 { 680 #if defined(FST_GROW_FACTOR) 681 int i,tmp_max_chars= wmap->max_chars * FST_GROW_FACTOR; 682 char* old_wmap__chars = wmap->chars; 683 if(tmp_max_chars - wmap->max_chars < FST_GROW_MINCHARS) 684 tmp_max_chars +=FST_GROW_MINCHARS; 685 686 char* tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars")); 687 if(!tmp_chars) { 688 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.chars")); 689 return MAXwordID; 690 } 691 memcpy(tmp_chars,wmap->chars,wmap->max_chars * sizeof(char)); 692 FREE(wmap->chars); 693 694 wmap->chars = tmp_chars; 695 wmap->next_chars = wmap->chars + (wmap->next_chars - old_wmap__chars); 696 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_wmap__chars); 697 wmap->max_chars = (wordID)tmp_max_chars; 698 //Remove old keys --- Add WORD 699 wordmap_clean (wmap ); 700 701 // adjust word pointers 702 for(i=0; i<wmap->num_words; i++) 703 { 704 wmap->words[i] = wmap->chars + (wmap->words[i] - old_wmap__chars); 705 // adjust hashtable --- 706 if (wmap->wordIDForWord) 707 { 708 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i], 709 (const void *)i, NULL ); 710 if ( rc != ESR_SUCCESS ) { 711 goto CLEANUP; 712 } 713 } 714 } 715 #else // so not defined(FST_GROW_FACTOR) 716 PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars); 717 return MAXwordID; 718 #endif // defined(FST_GROW_FACTOR) 719 } 720 //Add word 721 if (wmap->num_words == wmap->max_words) 722 { 723 #if defined(FST_GROW_FACTOR) 724 char** tmp_words; 725 wordID tmp_max_words; 726 int itmp_max_words = wmap->max_words * FST_GROW_FACTOR; 727 728 if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS) 729 itmp_max_words += FST_GROW_MINWORDS; 730 731 if( itmp_max_words >= MAXwordID) { 732 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words); 733 return MAXwordID; 734 } 735 tmp_max_words = (wordID)itmp_max_words; 736 tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words"); 737 if(!tmp_words) { 738 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap.words")); 739 return MAXwordID; 740 } 741 memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*)); 742 FREE( wmap->words); 743 wmap->words = tmp_words; 744 wmap->max_words = tmp_max_words; 745 #else // so not defined(FST_GROW_FACTOR) 746 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words); 747 return MAXwordID; 748 #endif // defined(FST_GROW_FACTOR) 749 } 750 if(1) 751 { 752 strcpy(wmap->next_chars, word); 753 wmap->words[ wmap->num_words++] = wmap->next_chars; 754 wmap->next_chars += len; 755 wdID = (wordID)(wmap->num_words - (wordID)1); 756 if (wmap->wordIDForWord) 757 { 758 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID], 759 (const void *)(int)wdID, NULL ); 760 if ( rc != ESR_SUCCESS ) 761 goto CLEANUP; 762 } 763 return wdID; 764 } 765 CLEANUP: 766 PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID); 767 return MAXwordID; 768 } 769 770 wordID wordmap_add_word_in_rule(wordmap* wmap, const char* word, wordID rule) 771 { 772 int len = strlen_with_null(word) + 2; 773 wordID wdID = 0; 774 wordID i; 775 //wordmap_add_word_in_rule 776 if (wmap->next_chars + len >= wmap->chars + wmap->max_chars) 777 { 778 #if defined(FST_GROW_FACTOR) 779 int tmp_max_chars = wmap->max_chars * FST_GROW_FACTOR; 780 char* tmp_chars; 781 char* old_next_chars = wmap->next_chars; 782 char* old_chars = wmap->chars; 783 784 if(tmp_max_chars-wmap->max_chars < FST_GROW_MINCHARS ) 785 tmp_max_chars += FST_GROW_MINCHARS; 786 if( wmap->chars + tmp_max_chars <= wmap->next_chars + len) 787 tmp_max_chars += len; 788 789 tmp_chars = NEW_ARRAY(char, tmp_max_chars, L("srec.g2g.graph.wordmap.chars")); 790 if(!tmp_chars) { 791 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_in_rule.chars")); 792 return MAXwordID; 793 } 794 memcpy(tmp_chars, wmap->chars, wmap->max_chars * sizeof(char)); 795 FREE(wmap->chars); 796 797 wmap->chars = tmp_chars; 798 wmap->next_chars = wmap->chars + (old_next_chars - old_chars) ; 799 wmap->next_base_chars = wmap->chars + (wmap->next_base_chars - old_chars); 800 wmap->max_chars = (wordID)tmp_max_chars; 801 802 //Remove old keys 803 wordmap_clean ( wmap ); 804 805 // adjust word pointers wordmap_add_word_in_rule 806 for(i=0; i<wmap->num_words; i++) 807 { 808 wmap->words[i] = wmap->chars +(wmap->words[i] - old_chars) ; 809 // adjust hashtable ----- add in rulewordmap_add_word_in_rule ---- 810 if (wmap->wordIDForWord) { 811 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[i], 812 (void*)(int)(i), NULL ); 813 if ( rc != ESR_SUCCESS ) 814 goto CLEANUP; 815 } 816 } 817 #else 818 PLogError("error: char overflow in wmap %d max %d\n", (int)(wmap->next_chars - wmap->chars), wmap->max_chars); 819 return MAXwordID; 820 #endif 821 }//wordmap_add_word_in_rule 822 if (wmap->num_words == wmap->max_words) 823 { 824 #if defined(FST_GROW_FACTOR) 825 char** tmp_words; 826 wordID tmp_max_words; 827 int itmp_max_words = wmap->max_words * FST_GROW_FACTOR; 828 if(itmp_max_words - wmap->max_words < FST_GROW_MINWORDS) 829 itmp_max_words += FST_GROW_MINWORDS; 830 831 if( itmp_max_words >= MAXwordID) { 832 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words); 833 return MAXwordID; 834 } 835 836 tmp_max_words = (wordID)itmp_max_words; 837 tmp_words = (char**) CALLOC_CLR(tmp_max_words, sizeof(char*), "srec.graph.wordmap.words"); 838 if(!tmp_words) { 839 PLogError(L("ESR_OUT_OF_MEMORY: Could not extend allocation of wordmap_add_rule.words")); 840 return MAXwordID; 841 } 842 memcpy( tmp_words, wmap->words, wmap->num_words * sizeof(char*)); 843 FREE( wmap->words); 844 wmap->words = tmp_words; 845 wmap->max_words = tmp_max_words; 846 #else 847 PLogError("error: word ptr overflow in wmap %d max %d\n", (int)wmap->num_words, wmap->max_words); 848 return MAXwordID; 849 #endif 850 } 851 if(1) 852 { 853 char *p; 854 const char *q; 855 /* word.1 */ 856 for (p = wmap->next_chars, q = word; (*p = *q) != '\0'; p++, q++) ; /* basic strcpy() */ 857 *p++ = IMPORTED_RULES_DELIM; 858 *p++ = (char)(rule + ((wordID)'0')); 859 *p = '\0'; 860 wmap->words[ wmap->num_words++] = wmap->next_chars; 861 wmap->next_chars += len; 862 wdID = (wordID)(wmap->num_words - (wordID)1); 863 if (wmap->wordIDForWord) 864 { 865 ESR_ReturnCode rc = PHashTablePutValue ( wmap->wordIDForWord, wmap->words[wdID], 866 (void*)(int)(wdID), NULL ); 867 if ( rc != ESR_SUCCESS ) 868 goto CLEANUP; 869 } 870 return wdID; 871 } 872 CLEANUP: 873 PLogError("error: could not add word and wordID in wmap hash (%s -> %d)\n", word, wdID); 874 return MAXwordID; 875 } 876 877 /*----------------------------------------------------------------------* 878 * * 879 * API * 880 * * 881 *----------------------------------------------------------------------*/ 882 883 int FST_AttachArbdata(srec_context* fst, srec_arbdata* allophone_tree) 884 { 885 int rc = 0; 886 unsigned int allotree__modelid; 887 888 fst->allotree = allophone_tree; 889 if( !allophone_tree) 890 return 1; 891 fst->hmm_info_for_ilabel = allophone_tree->hmm_infos - fst->hmm_ilabel_offset; 892 allotree__modelid = version_arbdata_models(allophone_tree); 893 if (allotree__modelid != 0 && fst->modelid != 0) 894 { 895 if (allotree__modelid != fst->modelid) 896 { 897 PLogError("Error: modelids disagree, sgcbaseline(%u) arbdata(%u)", fst->modelid, allotree__modelid); 898 rc = FST_FAILED_ON_INVALID_ARGS; 899 } 900 } 901 902 return rc; 903 } 904 905 int FST_LoadGraph(srec_context* pfst, wordmap* imap, wordmap* omap, 906 int num_words_to_add, PFile* fp) 907 { 908 int i, rc = 0; 909 char line[MAX_LINE_LENGTH], *args[32], nargs; 910 arcID max_num_FSMarcs; 911 arcID max_num_FSMnodes; 912 nodeID from_node, last_from_node, into_node, num_nodes, max_node_number = 0; 913 FSMarc_ptr atok = (FSMarc_ptr)0; 914 FSMarc *atoken = NULL; 915 FSMnode *fr_node, *to_node; 916 costdata cost = FREEcostdata; 917 arcID num_arcs; 918 919 srec_context* fst = pfst; 920 char *ilabel_str = 0, *olabel_str = 0; 921 long fpos; 922 arcID new_arc_id; 923 asr_int32_t temp; 924 925 926 /* determine number of arcs and nodes to allocate, add 50% for dynamic */ 927 fpos = pftell(fp); 928 max_num_FSMnodes = 0; 929 pfst->modelid = 0; 930 for (max_num_FSMarcs = 0; pfgets(line, sizeof(line) / sizeof(char), fp); max_num_FSMarcs++) 931 { 932 if (strstr(line, "modelid:") == line) 933 { 934 char *p; 935 pfst->modelid = strtoul(line + strlen("modelid:"), &p, 10); 936 } 937 from_node = (nodeID)atoi_with_check(line, MAXnodeID); 938 if (from_node > max_num_FSMnodes) 939 max_num_FSMnodes = from_node; 940 } 941 pfseek(fp, fpos, SEEK_SET); 942 temp = max_num_FSMnodes + 1 /*why+1?*/ + num_words_to_add * AVG_NODES_PER_WORD; 943 if (temp >= MAXnodeID) 944 { 945 max_num_FSMnodes = MAXnodeID - 1; 946 PLogMessage("Warning: using max nodes instead\n"); 947 } 948 else 949 { 950 max_num_FSMnodes = (nodeID)temp; 951 } 952 temp = max_num_FSMarcs + num_words_to_add * AVG_ARCS_PER_WORD; 953 if (temp >= MAXarcID) 954 { 955 max_num_FSMarcs = MAXarcID - 1; 956 PLogMessage("Warning: using max arcs instead\n"); 957 } 958 else 959 { 960 max_num_FSMarcs = (arcID)temp; 961 } 962 fst->olabels = omap; 963 if (imap) 964 { 965 /* generally no imap is specified */ 966 fst->ilabels = imap; 967 fst->hmm_ilabel_offset = wordmap_find_index(fst->ilabels, "hmm0"); 968 ASSERT(fst->hmm_ilabel_offset >= 0); 969 } 970 else 971 { 972 fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap"); 973 fst->ilabels->num_words = fst->ilabels->max_words = 0; 974 fst->ilabels->words = 0; 975 /* this is bad hard code, we can get this from the swiarb file, it is 976 equal to the number of phonemes (53 for enu, 39 for jpn) plus the special 977 models (eps,.wb,.ph,h#,#h,iwt,not) minus 1 ('cuz base number is 0) */ 978 fst->hmm_ilabel_offset = 128; /* should match MAX_PHONEMES */ 979 } 980 fst->FSMarc_list = (FSMarc*)CALLOC_CLR(max_num_FSMarcs, sizeof(FSMarc), "srec.graph.arcs"); 981 fst->FSMnode_list = (FSMnode*)CALLOC_CLR(max_num_FSMnodes, sizeof(FSMnode), "srec.graph.nodes"); 982 fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(max_num_FSMnodes, sizeof(char), "srec.graph.nodeinfos"); 983 984 /* setup the arc freelist */ 985 fst->FSMarc_freelist = 0; 986 fst->num_arcs = 0; 987 for (i = 0; i < max_num_FSMarcs - 1; i++) 988 { 989 fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1); 990 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE; 991 } 992 fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL; 993 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE; 994 995 /* initialize the nodes, 'cuz reading is random order */ 996 fst->num_nodes = 0; 997 for (i = 0; i < max_num_FSMnodes; i++) 998 { 999 fr_node = &fst->FSMnode_list[i]; 1000 fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL; 1001 } 1002 1003 /* 1. first load up all the information */ 1004 IF_DEBUG_WDADD(printf("load graph ... 1\n")); 1005 num_arcs = 0; 1006 last_from_node = MAXnodeID; 1007 while (pfgets(line, sizeof(line) / sizeof(char), fp)) 1008 { 1009 if (strstr(line, "modelid:") == line) continue; 1010 IF_DEBUG_WDADD(printf("read arc %s", line)); 1011 nargs = split_line(line, args); 1012 if (nargs >= 4) 1013 { 1014 from_node = (nodeID)atoi_with_check(args[0], MAXnodeID); 1015 into_node = (nodeID)atoi_with_check(args[1], MAXnodeID); 1016 ilabel_str = args[2]; 1017 olabel_str = args[3]; 1018 cost = FREEcostdata; 1019 if (nargs == 5) 1020 PLogError(L("Warning: too many arguments on line %s"), line); 1021 } 1022 else if (nargs == 1) 1023 { 1024 from_node = (nodeID)atoi_with_check(args[0], MAXnodeID); 1025 into_node = MAXnodeID; 1026 ilabel_str = 0; 1027 olabel_str = 0; 1028 cost = FREEcostdata; 1029 } 1030 else 1031 { 1032 from_node = into_node = 0; 1033 PLogError("can't parse line %s\n", line); 1034 ASSERT(0); 1035 } 1036 1037 if (into_node == MAXnodeID) 1038 { 1039 fst->end_node = from_node; 1040 } 1041 else 1042 { 1043 new_arc_id = fst_get_free_arc(fst); 1044 if (new_arc_id == MAXarcID) 1045 return FST_FAILED_ON_MEMORY; 1046 atok = ARC_ItoX(new_arc_id); 1047 atoken = ARC_XtoP(atok); 1048 num_arcs++; 1049 fr_node = &fst->FSMnode_list[from_node]; 1050 to_node = &fst->FSMnode_list[into_node]; 1051 if (fst->ilabels->num_words == 0) 1052 { 1053 atoken->ilabel = hmm_number(ilabel_str, fst->hmm_ilabel_offset); 1054 /* Xufang: if olabel_str is a slotname, change the input label to a no-silence and no-eps HMM. It is true that the weight of slot arcs is 999 and hmm will not take effect in the runtime, but it was found that when the slot is a loop, eps and silence will cause the recursive function fst_node_has_speech_to_come into a no-step-out status.*/ 1055 if(strstr(olabel_str, SLOTNAME_INDICATOR)!=NULL) 1056 atoken->ilabel = fst->hmm_ilabel_offset + SLOTLOOP_OFFSET; 1057 } 1058 else 1059 { 1060 atoken->ilabel = wordmap_find_index(fst->ilabels, ilabel_str); 1061 } 1062 atoken->olabel = wordmap_find_index(fst->olabels, olabel_str); 1063 /* if(g_fst_options.wtw_cost_override>0) { 1064 if(atoken->ilabel == WORD_BOUNDARY) 1065 atoken->cost = g_fst_options.wtw_cost_override; 1066 else 1067 atoken->cost = g_fst_options.arc_cost_override; 1068 } else */ 1069 atoken->cost = cost; 1070 #if DEBUG_WDADD 1071 atoken->ilabel_str = (atoken->ilabel < fst->ilabels->num_words ? fst->ilabels->words[atoken->ilabel] : 0); 1072 atoken->olabel_str = (atoken->olabel < fst->olabels->num_words ? fst->olabels->words[atoken->olabel] : 0); 1073 #endif 1074 append_arc_leaving_node(fst, fr_node, atok); 1075 if (into_node > max_node_number) 1076 max_node_number = into_node; 1077 append_arc_arriving_node(fst, to_node, atok); 1078 atoken->fr_node = NODE_ItoX(from_node); 1079 atoken->to_node = NODE_ItoX(into_node); 1080 } 1081 last_from_node = from_node; 1082 } 1083 ASSERT(fst->num_arcs == num_arcs); 1084 1085 /* setup the node freelist */ 1086 IF_DEBUG_WDADD(printf("load graph ... 6\n")); 1087 num_nodes = (nodeID)(max_node_number + 1); 1088 if( max_num_FSMnodes > num_nodes) { 1089 fst->FSMnode_freelist = num_nodes; 1090 for (i = num_nodes; i < (max_num_FSMnodes - 1); i++) 1091 { 1092 fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1); 1093 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE; 1094 } 1095 if (i == (max_num_FSMnodes - 1)) { 1096 fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL; 1097 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE; 1098 } 1099 } else 1100 fst->FSMnode_freelist = FSMNODE_NULL; 1101 1102 /* some book-keeping stuff */ 1103 IF_DEBUG_WDADD(printf("load graph ... 7\n")); 1104 fst->num_base_arcs = fst->num_arcs = num_arcs; 1105 fst->FSMarc_list_len = max_num_FSMarcs; 1106 fst->FSMnode_list_len = max_num_FSMnodes; 1107 fst->num_base_nodes = fst->num_nodes = num_nodes; 1108 fst->start_node = 0; 1109 /* fst->end_node = 0; this is set up above */ 1110 1111 /* beg and end word labels */ 1112 fst->beg_silence_word = wordmap_find_index(fst->olabels, "-pau-"); 1113 fst->end_silence_word = wordmap_find_index(fst->olabels, "-pau2-"); 1114 fst->hack_silence_word = wordmap_find_index(fst->olabels, "silence"); 1115 1116 /* set the search limitations, real ones will be registered later */ 1117 fst->max_searchable_nodes = 0; 1118 fst->max_searchable_arcs = 0; 1119 1120 rc = fst_alloc_transit_points(fst); 1121 return (rc ? FST_FAILED_ON_INVALID_ARGS : FST_SUCCESS); 1122 } 1123 1124 int FST_UnloadGraph(srec_context* pfst) 1125 { 1126 if (pfst->ilabels) 1127 FREE(pfst->ilabels); 1128 FREE(pfst->FSMarc_list); 1129 FREE(pfst->FSMnode_list); 1130 FREE(pfst->FSMnode_info_list); 1131 pfst->FSMarc_list = 0; 1132 pfst->FSMnode_list = 0; 1133 pfst->FSMnode_info_list = 0; 1134 return FST_SUCCESS; 1135 } 1136 1137 int FST_DumpGraph(srec_context* fst, PFile* fp) 1138 { 1139 int rc = 0; 1140 nodeID i; 1141 FSMarc_ptr atok; 1142 FSMarc* atoken; 1143 nodeID from_node, into_node; 1144 FSMnode* ntoken; 1145 char *ilabel, *olabel; 1146 1147 for (i = 0; i < fst->num_nodes; i++) 1148 { 1149 from_node = i; 1150 ntoken = &fst->FSMnode_list[i]; 1151 if (ntoken->first_prev_arc == FSMARC_FREE) 1152 continue; 1153 if (ntoken->un_ptr.first_next_arc != FSMARC_NULL) 1154 { 1155 for (atok = ntoken->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 1156 { 1157 char buf[32]; 1158 atoken = ARC_XtoP(atok); 1159 into_node = NODE_XtoI(atoken->to_node); 1160 ilabel = fst->ilabels->num_words == 0 ? 1161 hmm_name(atoken->ilabel, fst->hmm_ilabel_offset, buf) : 1162 fst->ilabels->words[atoken->ilabel] ; 1163 olabel = fst->olabels->words[atoken->olabel]; 1164 1165 if (atoken->cost != FREEcostdata) 1166 { 1167 /* regular arc */ 1168 pfprintf(fp, "%hu\t%hu\t%s\t%s\t%hu\n", 1169 from_node, into_node, ilabel, olabel, atoken->cost); 1170 } 1171 else 1172 { 1173 /* regular zero cost arc */ 1174 pfprintf(fp, "%hu\t%hu\t%s\t%s\n", 1175 from_node, into_node, ilabel, olabel); 1176 } 1177 } 1178 } 1179 else 1180 { 1181 pfprintf(fp, "%hu\n", from_node); 1182 } 1183 } 1184 return rc; 1185 } 1186 1187 int FST_AddWordToGrammar(srec_context* fst, const char* _slot, 1188 const char* word, const char* pron, 1189 const int cost) 1190 { 1191 int num_pron_ampersands = 0, num_prons_added = 0, num_words_added = 0; 1192 int i, pron_len, model_sequence_len; 1193 modelID model_sequence[MAX_PRON_LEN]; 1194 char phoneme_sequence[MAX_PRON_LEN]; 1195 wordID olabel = MAXwordID; 1196 int irc, rc = FST_SUCCESS; 1197 nodeID start_node = MAXnodeID, end_node = MAXnodeID; 1198 char veslot[MAX_WORD_LEN]; 1199 #if USE_HMM_BASED_ENROLLMENT 1200 const char* Tpron; 1201 #endif 1202 1203 /* The addword from voice enroll still use @, and to test voice enroll, @ sign was removed and __ was added at the begining and ending of a word. Xufang */ 1204 if(_slot[0] == '@') { 1205 strcpy(veslot,SLOTNAME_INDICATOR); 1206 strcat(veslot,_slot+1); 1207 strcat(veslot,SLOTNAME_INDICATOR); 1208 } else 1209 strcpy(veslot, _slot); 1210 1211 #if USE_COMP_STATS 1212 if (!comp_stats) 1213 comp_stats = init_comp_stats1(); 1214 start_cs_clock1(&comp_stats->word_addition); 1215 #endif 1216 1217 /* we expect developers to call AddWord with the same slot many times, 1218 so we cache the slot start and end nodes and re-use on subseq calls */ 1219 if( fst->addWordCaching_lastslot_num == MAXwordID ) 1220 fst->addWordCaching_lastslot_name = NULL; 1221 else { 1222 fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num]; 1223 fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR); 1224 ASSERT( fst->addWordCaching_lastslot_name); 1225 } 1226 if( fst->addWordCaching_lastslot_name==NULL || strcmp( fst->addWordCaching_lastslot_name, veslot)) { 1227 1228 fst->addWordCaching_lastslot_num = wordmap_find_rule_index(fst->olabels, veslot); 1229 /* olabel not found is a user error */ 1230 if (fst->addWordCaching_lastslot_num == MAXwordID) 1231 { 1232 size_t i; 1233 1234 pfprintf(PSTDOUT, L("error: slot '%s' not found among ["), veslot); 1235 for (i = 1; i < (size_t) fst->olabels->num_slots; ++i) 1236 pfprintf(PSTDOUT, "%s, ", fst->olabels->words[i]); 1237 pfprintf(PSTDOUT, L("] possible\n")); 1238 rc = FST_FAILED_ON_INVALID_ARGS; 1239 goto RETRC; 1240 } 1241 1242 fst->addWordCaching_lastslot_name = fst->olabels->words[fst->addWordCaching_lastslot_num]; 1243 fst->addWordCaching_lastslot_name = strstr(fst->addWordCaching_lastslot_name, SLOTNAME_INDICATOR); 1244 ASSERT(fst->addWordCaching_lastslot_name); 1245 1246 /* now find where in the graph this slot is referenced, note that 1247 there might be more than one place, but here we ignore multiples */ 1248 for (i = fst->num_fsm_exit_points; --i >= 0;) 1249 { 1250 arcID arcid = fst->fsm_exit_points[i].arc_index; 1251 if (fst->FSMarc_list[arcid].olabel == fst->addWordCaching_lastslot_num) 1252 { 1253 FSMarc* arc; 1254 FSMnode* node; 1255 start_node = fst->fsm_exit_points[i].from_node_index; 1256 end_node = fst->fsm_exit_points[i].wbto_node_index; 1257 fst->addWordCaching_lastslot_needs_post_silence = ESR_TRUE; 1258 fst->addWordCaching_lastslot_ifsm_exit_point = i; 1259 node = &fst->FSMnode_list[ end_node]; 1260 arc = &fst->FSMarc_list[node->un_ptr.first_next_arc]; 1261 if (arc->olabel == fst->end_silence_word && arc->linkl_next_arc == MAXarcID) 1262 fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE; 1263 break; 1264 } 1265 } 1266 1267 /* not found in the graph is an internal error, coding error */ 1268 if (i < 0 || start_node>=fst->num_nodes || end_node>=fst->num_nodes) 1269 { 1270 PLogError("error: (internal) finding olabel %d %d %d\n", fst->addWordCaching_lastslot_num, 1271 start_node, end_node); 1272 goto RETRC; 1273 } 1274 } /* cached or not */ 1275 1276 i = fst->addWordCaching_lastslot_ifsm_exit_point; 1277 start_node = fst->fsm_exit_points[i].from_node_index; 1278 end_node = fst->fsm_exit_points[i].wbto_node_index; 1279 1280 /* now start_node and end_node are known */ 1281 if (!word || !*word || !pron || !*pron) 1282 { 1283 rc = FST_FAILED_ON_INVALID_ARGS; 1284 PLogError("error: null word/pron on input to FST_AddWordToGrammar()\n"); 1285 goto RETRC; 1286 } 1287 1288 /* from here */ 1289 IF_DEBUG_WDADD(printf("Adding %s %s\n", word, (const char*)pron)); 1290 1291 /* loop over all prons, we break when we hit the double-null (hence the +1) */ 1292 for( ; (*pron)!='\0'; pron+=(pron_len+1)) { 1293 pron_len = strlen((char*)pron); 1294 if (pron_len >= MAX_PRON_LEN) 1295 { 1296 PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron); 1297 rc = FST_FAILED_ON_INVALID_ARGS; 1298 goto RETRC; 1299 } 1300 1301 /* check for searchability after adding, estimating at most pron_len 1302 arcs and nodes will be added */ 1303 if (fst->num_arcs + pron_len > fst->max_searchable_arcs) 1304 { 1305 PLogError("error: wordadd failed on word %s due to %d arc search limit\n", 1306 word, fst->max_searchable_arcs); 1307 rc = FST_FAILED_ON_MEMORY; 1308 goto RETRC; 1309 } 1310 1311 if (fst->num_nodes + pron_len > fst->max_searchable_nodes) 1312 { 1313 PLogError("error: wordadd failed on word %s due to %d node search limit\n", 1314 word, fst->max_searchable_nodes); 1315 rc = FST_FAILED_ON_MEMORY; 1316 goto RETRC; 1317 } 1318 1319 /* add the word if necessary, !FST_SUCCESS is allowed if we're just adding 1320 an alternate pronunciation */ 1321 olabel = wordmap_find_index_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num); 1322 if (olabel == MAXwordID) 1323 { 1324 olabel = wordmap_add_word_in_rule(fst->olabels, word, fst->addWordCaching_lastslot_num); 1325 if (olabel != MAXwordID) 1326 num_words_added++; 1327 } 1328 if (olabel == MAXwordID) 1329 { 1330 PLogError("error: wordmap_add_word failed\n"); 1331 rc = FST_FAILED_ON_MEMORY; 1332 goto RETRC; 1333 } 1334 1335 #if USE_HMM_BASED_ENROLLMENT 1336 // pron should be converted to model_sequence, model_sequence_len 1337 #define VETRANS_PREFIX "wd_hmm" 1338 #define VETRANS_PREFIX_LEN 6 1339 if( LSTRSTR(pron, VETRANS_PREFIX)!=NULL) { 1340 // printf("Debug-pron: %d, %s\n",pron_len, pron); 1341 model_sequence_len=0; 1342 for(Tpron=pron; (Tpron=strstr(Tpron, VETRANS_PREFIX))!=NULL; ){ 1343 // Tpron = strstr(Tpron, "wd_hmm"); 1344 Tpron += VETRANS_PREFIX_LEN; // skip over "wd_hmm" 1345 // Copy hmm number (56,132,...) 1346 model_sequence[ model_sequence_len] = atoi_with_check(Tpron, MAXmodelID); 1347 model_sequence_len++; 1348 } 1349 1350 1351 /* we need to append silence at the end since we might be dealing with a slot 1352 that has ensuing speech, formerly we only dealt with ROOT which defacto 1353 was followed by the pau2 silence, so ROOT did not need its own */ 1354 if (fst->addWordCaching_lastslot_needs_post_silence) 1355 model_sequence[model_sequence_len++] = 3; // <<< hmm3_sil !!! ugly hard-code here 1356 1357 /* append the word boundary */ 1358 model_sequence[model_sequence_len++] = WORD_BOUNDARY; 1359 /* translate to input label ids */ 1360 for (i = 0; i < model_sequence_len; i++) 1361 { 1362 if (model_sequence[i] >= EPSILON_OFFSET) 1363 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset); 1364 } 1365 } 1366 else 1367 #endif 1368 { 1369 pron_len = strlen((char*)pron); 1370 if (pron_len >= MAX_PRON_LEN) 1371 { 1372 PLogError("error: wordadd failed on word %s due to pron [%s] too long\n", word, (char*)pron); 1373 rc = FST_FAILED_ON_INVALID_ARGS; 1374 goto RETRC; 1375 } 1376 1377 for (i = 0; i < pron_len; i++) 1378 { 1379 if (pron[i] == OPTSILENCE_CODE) 1380 { 1381 phoneme_sequence[i] = SILENCE_CODE; 1382 num_pron_ampersands++; 1383 } 1384 else 1385 phoneme_sequence[i] = pron[i]; 1386 } 1387 /* we need to append silence at the end since we might be dealing with a slot 1388 that has ensuing speech, formerly we only dealt with ROOT which defacto 1389 was followed by the pau2 silence, so ROOT did not need its own */ 1390 if (fst->addWordCaching_lastslot_needs_post_silence) 1391 phoneme_sequence[i++] = SILENCE_CODE; 1392 1393 model_sequence_len = i; 1394 irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence); 1395 /* check for bad sequence of phonemes */ 1396 if (irc) 1397 { 1398 PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, irc); 1399 rc = FST_FAILED_ON_INVALID_ARGS; 1400 goto RETRC; 1401 } 1402 IF_DEBUG_WDADD(printf("model_sequence ...\n")); 1403 1404 /* append the word boundary */ 1405 model_sequence[model_sequence_len++] = WORD_BOUNDARY; 1406 /* translate to input label ids */ 1407 for (i = 0; i < model_sequence_len; i++) 1408 { 1409 if (model_sequence[i] >= EPSILON_OFFSET) 1410 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset); 1411 } 1412 } /* end of ph_t ph_r .. type decoding of the model sequence */ 1413 1414 /* now do the actual addition */ 1415 rc = fst_add_arcs(fst, start_node, end_node, olabel, (costdata)cost, model_sequence, model_sequence_len); 1416 if (rc == FST_SUCCESS) 1417 { 1418 num_prons_added++; 1419 } 1420 else if (rc == FST_FAILED_ON_HOMONYM) 1421 { 1422 /* maybe the another pron will work? */ 1423 } 1424 else 1425 { 1426 PLogMessage("error: fst_add_arcs() failed adding word %s pron %s ('&' as 'iwt')\n", word, (char*)pron); 1427 goto RETRC; 1428 } 1429 1430 /* second add the pron with no silences, 1431 this only applies to true prons, not wd_hmm333 wd_hmm222 type prons */ 1432 1433 if (num_pron_ampersands > 0) 1434 { 1435 for (i = 0, model_sequence_len = 0; i < pron_len; i++) 1436 { 1437 if (pron[i] != OPTSILENCE_CODE) 1438 phoneme_sequence[model_sequence_len++] = pron[i]; 1439 } 1440 1441 irc = get_modelids_for_pron(fst->allotree, phoneme_sequence, model_sequence_len, model_sequence); 1442 /* check for bad sequence of phonemes */ 1443 if (irc) 1444 { 1445 PLogError("error: get_modelids_for_pron(%s) returned %d\n", pron, rc); 1446 rc = FST_FAILED_ON_INVALID_ARGS; 1447 goto RETRC; 1448 } 1449 else 1450 { 1451 IF_DEBUG_WDADD(printf("model_sequence ...\n")); 1452 1453 /* append the word boundary */ 1454 model_sequence[model_sequence_len++] = WORD_BOUNDARY; 1455 /* translate to input label ids */ 1456 for (i = 0; i < model_sequence_len; i++) 1457 { 1458 if (model_sequence[i] >= EPSILON_OFFSET) 1459 model_sequence[i] = (modelID)(model_sequence[i] + fst->hmm_ilabel_offset); 1460 } 1461 /* now do the actual addition */ 1462 rc = fst_add_arcs(fst, start_node, end_node, 1463 olabel, (costdata)cost, model_sequence, model_sequence_len); 1464 1465 if (rc == FST_SUCCESS) 1466 { 1467 num_prons_added++; 1468 } 1469 else if (rc == FST_FAILED_ON_HOMONYM) 1470 { 1471 /* maybe another pron worked? */ 1472 } 1473 else 1474 { 1475 PLogMessage("Warning: fst_add_arcs() failed while adding " 1476 "word %s pron %s (skip '&')\n", word, (char*)pron); 1477 goto RETRC; 1478 } 1479 } 1480 } 1481 } 1482 1483 1484 RETRC: 1485 /* set this to make sure that FST_Prepare gets called after add word */ 1486 fst->whether_prepared = 0; 1487 #if USE_COMP_STATS 1488 end_cs_clock1(&comp_stats->word_addition, 1); 1489 #endif 1490 1491 if (rc < 0 && rc != FST_FAILED_ON_HOMONYM) 1492 return rc; 1493 1494 if (num_prons_added == 0 && num_words_added > 0) 1495 { 1496 return FST_FAILED_ON_HOMONYM; 1497 } 1498 else if (num_prons_added == 0 && num_words_added == 0) 1499 { 1500 return FST_FAILED_ON_HOMOGRAPH; 1501 } 1502 else if (num_prons_added > 0 && num_words_added == 0) 1503 { 1504 return FST_SUCCESS_ON_OLD_WORD; 1505 } 1506 else 1507 { 1508 /* if(num_prons_added>0 && num_words_added>0) */ 1509 return FST_SUCCESS; 1510 } 1511 } 1512 1513 /* remove arcs leaving a node, arcs added via word add, are just discarded 1514 to be cleaned up for re-use by other functions */ 1515 void remove_added_arcs_leaving(srec_context* fst, nodeID ni) 1516 { 1517 FSMnode* node = &fst->FSMnode_list[ ni]; 1518 FSMarc *arc = NULL, *arc2; 1519 arcID *pai, ai, ai2; 1520 for (pai = &node->un_ptr.first_next_arc, ai = (*pai); ai != MAXarcID; pai = &arc->linkl_next_arc, ai = (*pai)) 1521 { 1522 if (ai < fst->num_base_arcs) 1523 { 1524 arc = &fst->FSMarc_list[ai]; 1525 } 1526 else 1527 { 1528 arc2 = &fst->FSMarc_list[ai]; 1529 for (ai2 = arc2->linkl_next_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID; 1530 ai2 = arc2->linkl_next_arc) 1531 { 1532 arc2 = &fst->FSMarc_list[ai2]; 1533 } 1534 *pai = ai2; 1535 } 1536 } 1537 } 1538 1539 /* remove arcs arriving at a node, arcs added via word add, are just discarded 1540 to be cleaned up for re-use by other functions */ 1541 void remove_added_arcs_arriving(srec_context* fst, nodeID ni) 1542 { 1543 FSMnode* node = &fst->FSMnode_list[ni]; 1544 FSMarc *arc = NULL, *arc2; 1545 arcID *pai, ai, ai2; 1546 for (pai = &node->first_prev_arc, ai = (*pai); ai != MAXarcID; 1547 pai = &arc->linkl_prev_arc, ai = (*pai)) 1548 { 1549 if (ai < fst->num_base_arcs) 1550 { 1551 arc = &fst->FSMarc_list[ai]; 1552 } 1553 else 1554 { 1555 arc2 = &fst->FSMarc_list[ai]; 1556 for (ai2 = arc2->linkl_prev_arc; ai2 >= fst->num_base_arcs && ai2 != MAXarcID; 1557 ai2 = arc2->linkl_prev_arc) 1558 { 1559 arc2 = &fst->FSMarc_list[ai2]; 1560 } 1561 *pai = ai2; 1562 } 1563 } 1564 } 1565 1566 /* reset greammar, but resetting all the arcs, nodes and words added 1567 via the addword functions */ 1568 1569 int FST_ResetGrammar(srec_context* fst) 1570 { 1571 int i, rc = 0; 1572 nodeID fst_slot_start_node, fst_slot_end_node; 1573 wordID fst_slot_slotnum; 1574 arcID ai; 1575 nodeID ni2, ni3, ni4, ni3a; 1576 FSMnode_t *node, *node2, *node3; 1577 FSMarc_t *arc, *arc2, *arc3; 1578 1579 /*fst_slot_slotnum = wordmap_find_rule_index(fst->olabels, slot);*/ 1580 for (fst_slot_slotnum = 1; fst_slot_slotnum < fst->olabels->num_slots; 1581 fst_slot_slotnum++) 1582 { 1583 1584 if (fst_slot_slotnum == MAXwordID) 1585 { 1586 char *slot = ""; 1587 PLogError("error: slot '%s' not found among [%d,%d] possible\n", slot, 1, fst->olabels->num_slots - 1); 1588 return (rc = FST_FAILED_ON_INVALID_ARGS); 1589 } 1590 1591 /* now find where in the graph this slot is referenced, note that 1592 there might be more than one place, but here we ignore multiples */ 1593 fst_slot_start_node = MAXnodeID; 1594 fst_slot_end_node = MAXnodeID; 1595 for (i = fst->num_fsm_exit_points; --i >= 0;) 1596 { 1597 ai = fst->fsm_exit_points[i].arc_index; 1598 if (fst->FSMarc_list[ai].olabel == fst_slot_slotnum) 1599 { 1600 fst_slot_start_node = fst->fsm_exit_points[i].from_node_index; 1601 fst_slot_end_node = fst->fsm_exit_points[i].wbto_node_index; 1602 } 1603 } 1604 1605 /* this 'slot' could be the root rule, which can't be removed */ 1606 if (fst_slot_start_node == MAXnodeID || fst_slot_end_node == MAXnodeID) 1607 continue; 1608 1609 /* 1610 start end 1611 node node2 node3 node4 1612 o--@N-->o--sil-->o--wb-->o 1613 arc arc2 | arc3 ^ 1614 | |wb 1615 \__sil->o 1616 arc4 ni3a 1617 */ 1618 1619 remove_added_arcs_leaving(fst, fst_slot_start_node); 1620 node = &fst->FSMnode_list[ fst_slot_start_node]; 1621 for (ai = node->un_ptr.first_next_arc; ai != MAXarcID; ai = arc->linkl_next_arc) 1622 { 1623 arc = &fst->FSMarc_list[ai]; 1624 if (arc->olabel != fst_slot_slotnum) 1625 continue; 1626 1627 ni2 = arc->to_node; 1628 remove_added_arcs_arriving(fst, ni2); 1629 if (ni2 == fst_slot_end_node) 1630 continue; 1631 node2 = &fst->FSMnode_list[ni2]; 1632 arc2 = &fst->FSMarc_list[ node2->un_ptr.first_next_arc]; 1633 1634 1635 ni3 = arc2->to_node; 1636 remove_added_arcs_arriving(fst, ni3); 1637 if (ni3 == fst_slot_end_node) 1638 continue; 1639 node3 = &fst->FSMnode_list[ni3]; 1640 arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc]; 1641 while (arc3->linkl_next_arc != MAXarcID) 1642 { 1643 arc3 = &fst->FSMarc_list[arc3->linkl_next_arc]; 1644 ni3a = arc3->to_node; 1645 remove_added_arcs_arriving(fst, ni3a); 1646 } 1647 arc3 = &fst->FSMarc_list[ node3->un_ptr.first_next_arc]; 1648 1649 ni4 = arc3->to_node; 1650 remove_added_arcs_arriving(fst, ni4); 1651 ASSERT(ni4 == fst_slot_end_node); 1652 if (ni4 == fst_slot_end_node) 1653 continue; 1654 } 1655 } 1656 1657 /* reset the freelist for nodes */ 1658 if( fst->num_nodes == fst->num_base_nodes ) 1659 {} 1660 else{ 1661 FSMnode *ntoken; 1662 FSMnode* tmp_FSMnode_list; 1663 FSMnode_info* tmp_FSMnode_info_list; 1664 1665 fst->FSMnode_freelist = MAXnodeID; 1666 fst->num_nodes = fst->FSMnode_list_len = fst->num_base_nodes; 1667 1668 tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes"); 1669 if(!tmp_FSMnode_list){ 1670 PLogError("ERROR: Could NOT reset the memory for srec.graph.nodes"); 1671 return FST_FAILED_ON_MEMORY; 1672 } 1673 memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode)); 1674 1675 /* scan to the end of the free list (to be re-used) */ 1676 nodeID* last_free_node = (&fst->FSMnode_freelist); 1677 ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ; 1678 1679 for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node) 1680 ntoken = &tmp_FSMnode_list[ *last_free_node]; 1681 1682 FREE( fst->FSMnode_list); 1683 tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos"); 1684 if(!tmp_FSMnode_info_list){ 1685 PLogError("ERROR: Could NOT reset the memory for srec.graph.nodeinfos"); 1686 return FST_FAILED_ON_MEMORY; 1687 } 1688 // copy in node info 1689 memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info)); 1690 1691 1692 FREE( fst->FSMnode_info_list); 1693 fst->FSMnode_info_list = tmp_FSMnode_info_list; 1694 fst->FSMnode_list = tmp_FSMnode_list; 1695 } 1696 1697 /*ni = fst->FSMnode_freelist = fst->num_base_nodes; 1698 node = &fst->FSMnode_list[ni]; 1699 for (; ni < fst->FSMnode_list_len - 1; ni++, node++) 1700 node->un_ptr.next_node = (nodeID)(ni + 1); 1701 node->un_ptr.next_node = MAXnodeID; 1702 fst->num_nodes = fst->num_base_nodes;*/ 1703 1704 /* reset the freelist for arcs */ 1705 if( fst->num_arcs == fst->num_base_arcs ) 1706 {} 1707 else { 1708 FSMarc* atoken = NULL; 1709 FSMarc* tmp_FSMarc_list; 1710 1711 fst->num_arcs = fst->num_base_arcs; 1712 fst->FSMarc_list_len = fst->num_base_arcs; 1713 fst->FSMarc_freelist = MAXarcID; 1714 tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs"); 1715 if(!tmp_FSMarc_list){ 1716 PLogError("ERROR: Could NOT reset the memory for srec.graph.arcs"); 1717 return FST_FAILED_ON_MEMORY; 1718 } 1719 memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc)); 1720 1721 /* scan to the end of the free list (to be re-used) */ 1722 arcID* last_free_arc = &fst->FSMarc_freelist; 1723 atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ; 1724 1725 for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc) 1726 atoken = &tmp_FSMarc_list[ *last_free_arc]; 1727 1728 FREE( fst->FSMarc_list); 1729 fst->FSMarc_list = tmp_FSMarc_list; 1730 } 1731 1732 /*ai = fst->FSMarc_freelist = fst->num_base_arcs; 1733 arc = &fst->FSMarc_list[ai]; 1734 for (; ai < fst->FSMarc_list_len - 1; ai++, arc++) 1735 arc->linkl_next_arc = (arcID)(ai + 1); 1736 arc->linkl_next_arc = MAXarcID; 1737 fst->num_arcs = fst->num_base_arcs; 1738 fst->FSMarc_list_len = fst->num_base_arcs;*/ 1739 1740 /* now remove all the added words */ 1741 wordmap_reset(fst->olabels); 1742 return FST_SUCCESS; 1743 } 1744 1745 /* See the description of arc_tokens in the header file. These 1746 arcs are nodeless, which make loading from a file that 1747 references nodes a little harder. We need to do it in stages. 1748 Later, when we load from binary image it should be much simpler. 1749 */ 1750 1751 arc_token_lnk get_first_arc_leaving_node(arc_token* arc_token_list, 1752 arcID num_arcs, 1753 nodeID node) 1754 { 1755 arcID i; 1756 for (i = 0; i < num_arcs; i++) 1757 { 1758 if ((nodeID)(int)arc_token_list[i].next_token_index == node) 1759 return ARC_TOKEN_LNK(arc_token_list, i); 1760 } 1761 return ARC_TOKEN_NULL; 1762 } 1763 1764 int FST_LoadReverseWordGraph(srec_context* context, int num_words_to_add, PFile* fp) 1765 { 1766 arcID i; 1767 char line[MAX_LINE_LENGTH]; 1768 char word_label_as_str[128]; 1769 arcID num_alloc_arc_tokens; 1770 nodeID from_node, into_node; 1771 labelID word_label = MAXwordID; 1772 arc_token *atoken, *last_atoken; 1773 costdata cost; 1774 arcID num_arcs; 1775 arc_token *arc_token_list, *tmp; 1776 long fpos; 1777 1778 /* determine number of word arcs to allocate */ 1779 fpos = pftell(fp); 1780 for (num_arcs = 0; pfgets(line, MAX_LINE_LENGTH, fp); num_arcs++) ; 1781 num_alloc_arc_tokens = num_arcs; 1782 num_alloc_arc_tokens = (arcID)(num_arcs + num_words_to_add); 1783 pfseek(fp, fpos, SEEK_SET); 1784 1785 context->arc_token_list = (arc_token*)CALLOC_CLR(num_alloc_arc_tokens, sizeof(arc_token), "srec.graph.wordgraph"); 1786 arc_token_list = context->arc_token_list; 1787 1788 /* 1. first load up all the information */ 1789 i = 0; 1790 while (pfgets(line, MAX_LINE_LENGTH, fp)) 1791 { 1792 if (sscanf(line, "%hu\t%hu\t%s", &from_node, &into_node, word_label_as_str) == 3) 1793 { 1794 word_label = wordmap_find_index(context->olabels, word_label_as_str); 1795 // ASSERT(word_label >= 0); 1796 cost = FREEcostdata; 1797 } 1798 else if (sscanf(line, "%hu", &from_node) == 1) 1799 { 1800 into_node = MAXnodeID; 1801 word_label = MAXwordID; 1802 cost = FREEcostdata; 1803 } 1804 else 1805 { 1806 PLogError("FST_LoadReverseWordGraph() .. can't parse line %s\n", line); 1807 ASSERT(0); 1808 } 1809 atoken = &arc_token_list[i]; 1810 i++; 1811 atoken->ilabel = word_label; 1812 /* atoken->olabel = WORD_EPSILON_LABEL; */ 1813 /*atoken->cost = cost; cost is not used for now */ 1814 #if DEBUG_ASTAR 1815 atoken->label = (word_label == MAXwordID ? 0 : wmap->words[word_label]); 1816 #endif 1817 atoken->first_next_arc = (arc_token_lnk)into_node; 1818 atoken->next_token_index = (arc_token_lnk)from_node; 1819 } 1820 num_arcs = i; 1821 1822 /* 2. now do the internal cross references */ 1823 for (i = 0; i < num_arcs; i++) 1824 { 1825 atoken = &arc_token_list[i]; 1826 into_node = (nodeID)atoken->first_next_arc; 1827 if (into_node == MAXnodeID) 1828 atoken->first_next_arc = ARC_TOKEN_NULL; 1829 else 1830 atoken->first_next_arc = get_first_arc_leaving_node(arc_token_list, num_arcs, (nodeID)atoken->first_next_arc); 1831 } 1832 1833 /* 3. now do more internal cross refs */ 1834 last_atoken = &arc_token_list[0]; 1835 for (i = 1; i < num_arcs; i++) 1836 { 1837 atoken = &arc_token_list[i]; 1838 if (atoken->next_token_index != last_atoken->next_token_index) 1839 { 1840 last_atoken->next_token_index = ARC_TOKEN_NULL; 1841 } 1842 else 1843 { 1844 last_atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i); 1845 } 1846 last_atoken = atoken; 1847 } 1848 last_atoken->next_token_index = ARC_TOKEN_NULL; 1849 1850 #if DEBUG_ASTAR 1851 /* under debug, it's nice to be able to see the words leaving the 1852 destination node, they are stored sequentially in the debug ary */ 1853 for (i = 0; i < num_arcs; i++) 1854 { 1855 char * p; 1856 atoken = &arc_token_list[i]; 1857 atoken->debug[0] = 0; 1858 for (tmp = ARC_TOKEN_LNK(arc_token_list, atoken->first_next_arc); tmp != NULL; 1859 tmp = ARC_TOKEN_PTR(arc_token_list, tmp->next_token_index)) 1860 { 1861 if (tmp->first_next_arc == ARC_TOKEN_NULL) 1862 { 1863 p = "END"; 1864 } 1865 else if (!tmp->label) 1866 { 1867 p = "NULL"; 1868 } 1869 else 1870 { 1871 p = tmp->label; 1872 } 1873 if (strlen(atoken->debug) + strlen(p) + 6 < 64) 1874 { 1875 strcat(atoken->debug, p); 1876 strcat(atoken->debug, " "); 1877 } 1878 else 1879 { 1880 strcat(atoken->debug, "..."); 1881 break; 1882 } 1883 } 1884 } 1885 #endif 1886 context->arc_token_list_len = num_alloc_arc_tokens; 1887 if (num_alloc_arc_tokens > num_arcs) 1888 { 1889 atoken = &context->arc_token_list[num_arcs]; 1890 for (i = num_arcs; i < num_alloc_arc_tokens; i++, atoken++) 1891 { 1892 atoken->first_next_arc = ARC_TOKEN_NULL; 1893 atoken->ilabel = MAXwordID; 1894 atoken->next_token_index = ARC_TOKEN_LNK(arc_token_list, i + 1);/*atoken+1;*/ 1895 } 1896 atoken--; 1897 atoken->next_token_index = ARC_TOKEN_NULL; 1898 context->arc_token_freelist = &context->arc_token_list[num_arcs]; 1899 } 1900 else 1901 { 1902 context->arc_token_freelist = NULL; 1903 } 1904 /* the insert start is the arc that goes to the end */ 1905 atoken = context->arc_token_list; 1906 atoken = ARC_TOKEN_PTR(arc_token_list, atoken->first_next_arc); 1907 for (; atoken; atoken = ARC_TOKEN_PTR(arc_token_list, atoken->next_token_index)) 1908 { 1909 if (atoken->first_next_arc == ARC_TOKEN_NULL) 1910 break; 1911 tmp = ARC_TOKEN_PTR(context->arc_token_list, atoken->first_next_arc); 1912 if (tmp->ilabel == MAXwordID && tmp->first_next_arc == ARC_TOKEN_NULL) 1913 break; 1914 } 1915 context->arc_token_insert_start = atoken; 1916 1917 return FST_SUCCESS; 1918 } 1919 1920 int FST_UnloadReverseWordGraph(srec_context* context) 1921 { 1922 FREE(context->arc_token_list); 1923 context->arc_token_list = 0; 1924 return FST_SUCCESS; 1925 } 1926 1927 /*----------------------------------------------------------------------* 1928 * * 1929 * general * 1930 * * 1931 *----------------------------------------------------------------------*/ 1932 1933 char split_line(char* line, char** av) 1934 { 1935 int i = 0; 1936 av[i] = strtok(line, "\n\r\t "); 1937 for (; av[i];) 1938 { 1939 av[++i] = strtok(NULL, "\n\r\t "); 1940 } 1941 return i; 1942 } 1943 1944 asr_int32_t atoi_with_check(const char* buf, asr_int32_t mymax) 1945 { 1946 asr_int32_t ans = atol(buf); 1947 ASSERT(ans < mymax); 1948 return ans; 1949 } 1950 1951 /*----------------------------------------------------------------------* 1952 * * 1953 * fst * 1954 * * 1955 *----------------------------------------------------------------------*/ 1956 1957 arcID fst_get_free_arc(srec_context* fst) 1958 { 1959 FSMarc* atoken; 1960 arcID atokid = fst->FSMarc_freelist; 1961 if (atokid == MAXarcID) 1962 { 1963 PLogError("error: ran out of arcs\n"); 1964 return atokid; 1965 } 1966 atoken = &fst->FSMarc_list[atokid]; 1967 fst->FSMarc_freelist = ARC_XtoI(atoken->linkl_next_arc); 1968 memset(atoken, 0, sizeof(FSMarc)); 1969 atoken->to_node = FSMNODE_NULL; 1970 atoken->fr_node = FSMNODE_NULL; 1971 atoken->linkl_next_arc = FSMARC_NULL; 1972 atoken->linkl_prev_arc = FSMARC_NULL; 1973 atoken->ilabel = 0; 1974 atoken->olabel = 0; 1975 atoken->cost = 0; 1976 fst->num_arcs++; 1977 return atokid; 1978 } 1979 nodeID fst_get_free_node(srec_context* fst) 1980 { 1981 FSMnode* ntoken; 1982 nodeID ntokid = fst->FSMnode_freelist; 1983 if (ntokid == MAXnodeID) 1984 { 1985 PLogError("error: ran out of nodes\n"); 1986 return ntokid; 1987 } 1988 ntoken = &fst->FSMnode_list[ntokid]; 1989 fst->FSMnode_freelist = NODE_XtoI(ntoken->un_ptr.next_node); 1990 ntoken->un_ptr.first_next_arc = FSMARC_NULL; 1991 ntoken->first_prev_arc = FSMARC_NULL; 1992 /* ASSERT( (int)(ntok - fst->FSMnode_list) == fst->num_nodes); */ 1993 fst->num_nodes++; 1994 return ntokid; 1995 } 1996 1997 int fst_free_node(srec_context* fst, FSMnode* node) 1998 { 1999 IF_DEBUG_WDADD(printf_node(fst, "freeing: ", node)); 2000 node->first_prev_arc = FSMARC_NULL; 2001 node->un_ptr.next_node = fst->FSMnode_freelist; 2002 node->first_prev_arc = FSMARC_FREE; 2003 fst->FSMnode_freelist = (nodeID)(node - fst->FSMnode_list); 2004 /* fst->num_nodes--; not allowed unless we compress! */ 2005 return 0; 2006 } 2007 2008 int fst_free_arc(srec_context* fst, FSMarc* arc) 2009 { 2010 IF_DEBUG_WDADD(printf_arc(fst, "freeing: ", arc)); 2011 arc->linkl_prev_arc = FSMARC_FREE; 2012 arc->linkl_next_arc = ARC_ItoX(fst->FSMarc_freelist); 2013 arc->cost = 2; 2014 arc->ilabel = 0; 2015 arc->olabel = 0; 2016 IF_DEBUG_WDADD(arc->ilabel_str = 0); 2017 IF_DEBUG_WDADD(arc->olabel_str = 0); 2018 fst->FSMarc_freelist = (arcID)(arc - fst->FSMarc_list); 2019 fst->num_arcs--; 2020 return 0; 2021 } 2022 2023 int fst_free_arc_net(srec_context* fst, FSMarc* arc) 2024 { 2025 if (!arc) 2026 return 0; 2027 /* need to traverse the arc, get the 'from' nodes, 2028 then traverse the from nodes and kill arcs */ 2029 /* we'll need this after an error condition */ 2030 return 0; 2031 } 2032 2033 int fst_push_arc_olabel(srec_context* fst, FSMarc* arc) 2034 { 2035 FSMarc_ptr atok; 2036 FSMarc* atoken; 2037 FSMnode_ptr ntok = arc->to_node; 2038 2039 /* look for all arcs leaving this arc, and mark them 2040 normally there is only one, except for multipron words */ 2041 for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2042 { 2043 atoken = ARC_XtoP(atok); 2044 /* cannot have multiple olabels before a boundary */ 2045 if (atoken->olabel != WORD_EPSILON_LABEL) 2046 return FST_FAILED_INTERNAL; 2047 atoken->olabel = arc->olabel; 2048 #if DO_WEIGHTED_ADDWORD 2049 atoken->cost = (costdata)(atoken->cost + arc->cost); 2050 #endif 2051 IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str); 2052 } 2053 /* unlabel this arc */ 2054 arc->olabel = WORD_EPSILON_LABEL; 2055 #if DO_WEIGHTED_ADDWORD 2056 arc->cost = FREEcostdata; 2057 #endif 2058 IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]); 2059 return FST_SUCCESS; 2060 } 2061 2062 #if DO_WEIGHTED_ADDWORD 2063 int fst_push_arc_cost(srec_context* fst, FSMarc* arc) 2064 { 2065 FSMarc_ptr atok; 2066 FSMarc* atoken; 2067 FSMnode_ptr ntok = arc->to_node; 2068 /* look for all arcs leaving this arc, and push the cost thereto */ 2069 for (atok = FIRST_NEXT(ntok); atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2070 { 2071 atoken = ARC_XtoP(atok); 2072 atoken->cost = (costdata)(atoken->cost + arc->cost); 2073 } 2074 arc->cost = FREEcostdata; 2075 return FST_SUCCESS; 2076 } 2077 #endif 2078 2079 2080 int fst_pull_arc_olabel(srec_context* fst, FSMarc* arc) 2081 { 2082 FSMarc_ptr atok; 2083 FSMarc* atoken; 2084 FSMnode_ptr fr_node = arc->fr_node; 2085 FSMarc_ptr tmp_atok; 2086 FSMarc *tmp_atoken; 2087 FSMnode *anode; 2088 2089 if (arc->olabel == WORD_EPSILON_LABEL) 2090 return FST_SUCCESS; 2091 2092 /* can the olabel be pulled earlier? */ 2093 for (atok = FIRST_PREV(fr_node); atok != FSMARC_NULL; atok = atoken->linkl_prev_arc) 2094 { 2095 atoken = ARC_XtoP(atok); 2096 anode = NODE_XtoP(atoken->to_node); 2097 2098 tmp_atok = anode->un_ptr.first_next_arc; 2099 if (tmp_atok != FSMARC_NULL) 2100 { 2101 tmp_atoken = ARC_XtoP(tmp_atok); 2102 if (tmp_atoken->linkl_next_arc != FSMARC_NULL) 2103 { 2104 return FST_CONTINUE; 2105 } 2106 } 2107 } 2108 2109 /* look for all arcs arriving at this arc, and mark them */ 2110 for (atok = FIRST_PREV(fr_node);atok != FSMARC_NULL;atok = atoken->linkl_prev_arc) 2111 { 2112 atoken = ARC_XtoP(atok); 2113 /* cannot have multiple olabels! serious internal error!? */ 2114 if (atoken->olabel != WORD_EPSILON_LABEL) 2115 { 2116 PLogError("error: internal error, in fst_pull_arc_olabel()\n"); 2117 return FST_FAILED_INTERNAL; 2118 } 2119 atoken->olabel = arc->olabel; 2120 #if DO_WEIGHTED_ADDWORD 2121 atoken->cost = arc->cost; 2122 #endif 2123 IF_DEBUG_WDADD(atoken->olabel_str = arc->olabel_str); 2124 } 2125 /* unlabel this arc */ 2126 arc->olabel = WORD_EPSILON_LABEL; 2127 #if DO_WEIGHTED_ADDWORD 2128 arc->cost = FREEcostdata; 2129 #endif 2130 IF_DEBUG_WDADD(arc->olabel_str = fst->olabels->words[ arc->olabel]); 2131 return FST_SUCCESS; 2132 } 2133 2134 static FSMarc* find_next_arc_with_ilabel(srec_context* fst, FSMnode* node, labelID ilabel, FSMarc** last) 2135 { 2136 FSMarc_ptr atok; 2137 FSMarc* atoken = NULL; 2138 for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2139 { 2140 atoken = ARC_XtoP(atok); 2141 if (atoken->ilabel == ilabel) 2142 return atoken; 2143 } 2144 *last = atoken; /* to this we may want to append a new arc */ 2145 return NULL; 2146 } 2147 2148 FSMarc* find_prev_arc_with_iolabels(srec_context* fst, FSMnode* node, labelID ilabel, labelID olabel, FSMarc** last) 2149 { 2150 FSMarc_ptr atok; 2151 FSMarc* atoken = NULL; 2152 FSMarc_ptr tmp_atok; 2153 FSMarc* tmp_atoken; 2154 2155 for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc) 2156 { 2157 atoken = ARC_XtoP(atok); 2158 if (atoken->olabel == olabel && atoken->ilabel == ilabel) 2159 { 2160 tmp_atok = NODE_XtoP(atoken->fr_node)->un_ptr.first_next_arc; 2161 if (tmp_atok != FSMARC_NULL) 2162 { 2163 tmp_atoken = ARC_XtoP(tmp_atok); 2164 2165 if (tmp_atoken->linkl_next_arc == FSMARC_NULL) 2166 return atoken; 2167 } 2168 else 2169 return atoken; 2170 } 2171 2172 } 2173 if (last) *last = atoken; 2174 return NULL; 2175 } 2176 2177 int num_arcs_leaving(srec_context* fst, FSMnode* node) 2178 { 2179 int num_arcs = 0; 2180 FSMarc_ptr atok; 2181 FSMarc* atoken; 2182 for (atok = node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2183 { 2184 atoken = ARC_XtoP(atok); 2185 num_arcs++; 2186 } 2187 return num_arcs; 2188 } 2189 2190 int num_arcs_arriving(srec_context* fst, FSMnode* node) 2191 { 2192 int num_arcs = 0; 2193 FSMarc_ptr atok; 2194 FSMarc* atoken; 2195 for (atok = node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc) 2196 { 2197 atoken = ARC_XtoP(atok); 2198 num_arcs++; 2199 } 2200 return num_arcs; 2201 } 2202 2203 PINLINE int num_arcs_arriving_gt_1(srec_context* fst, FSMnode* node) 2204 { 2205 FSMarc_ptr atok = node->first_prev_arc; 2206 FSMarc* atoken; 2207 if (atok == FSMARC_NULL) 2208 return 0; 2209 atoken = ARC_XtoP(atok); 2210 return atoken->linkl_prev_arc == FSMARC_NULL ? 0 : 1; 2211 } 2212 2213 static void remove_arc_arriving(srec_context* fst, FSMnode* to_node, FSMarc* arc) 2214 { 2215 FSMarc* atoken; 2216 FSMarc_ptr* atok; 2217 for (atok = &to_node->first_prev_arc;(*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc) 2218 { 2219 atoken = ARC_XtoP(*atok); 2220 if (atoken == arc) 2221 { 2222 (*atok) = arc->linkl_prev_arc; 2223 return; 2224 } 2225 } 2226 ASSERT(0); 2227 } 2228 2229 int split_node_for_arc(srec_context* fst, FSMarc* arc) 2230 { 2231 arcID new_arc_id; 2232 nodeID new_node_id; 2233 FSMnode* new_node; 2234 FSMnode* old_node; 2235 FSMarc_ptr atok, new_next, last_new_next; 2236 FSMarc *atoken, *new_next_p; 2237 2238 IF_DEBUG_WDADD(printf_arc(fst, "spliting ... ", arc)); 2239 2240 new_node_id = fst_get_free_node(fst); 2241 if (new_node_id == MAXnodeID) 2242 return FST_FAILED_ON_MEMORY; 2243 new_node = &fst->FSMnode_list[ new_node_id]; 2244 old_node = NODE_XtoP(arc->to_node); 2245 arc->to_node = NODE_ItoX(new_node_id); 2246 new_node->first_prev_arc = ARC_PtoX(arc); 2247 remove_arc_arriving(fst, old_node, arc); 2248 arc->linkl_prev_arc = FSMARC_NULL; 2249 2250 last_new_next = FSMARC_NULL; 2251 for (atok = old_node->un_ptr.first_next_arc; atok != FSMARC_NULL; atok = atoken->linkl_next_arc) 2252 { 2253 atoken = ARC_XtoP(atok); 2254 new_arc_id = fst_get_free_arc(fst); 2255 if (new_arc_id == MAXarcID) 2256 return FST_FAILED_ON_MEMORY; 2257 new_next = ARC_ItoX(new_arc_id); 2258 if (last_new_next != FSMARC_NULL) 2259 LINKL_NEXT(last_new_next) = new_next; 2260 else 2261 new_node->un_ptr.first_next_arc = new_next; 2262 new_next_p = ARC_XtoP(new_next); 2263 new_next_p->ilabel = atoken->ilabel; 2264 new_next_p->to_node = atoken->to_node; 2265 new_next_p->fr_node = arc->to_node; 2266 new_next_p->olabel = atoken->olabel; 2267 IF_DEBUG_WDADD(new_next_p->ilabel_str = atoken->ilabel_str); 2268 IF_DEBUG_WDADD(new_next_p->olabel_str = atoken->olabel_str); 2269 new_next_p->linkl_next_arc = FSMARC_NULL; /* set at next loop */ 2270 append_arc_arriving_node(fst, NODE_XtoP(atoken->to_node), new_next); 2271 last_new_next = new_next; 2272 } 2273 return FST_SUCCESS; 2274 } 2275 2276 int fst_add_arcs(srec_context* fst, nodeID start_node, nodeID end_node, 2277 wordID add_tree_olabel, costdata add_tree_cost, 2278 modelID* add_tree_start, int add_tree_len) 2279 { 2280 int rc = FST_SUCCESS; 2281 FSMarc_ptr atok = (FSMarc_ptr)0, last_atok = (FSMarc_ptr)0; 2282 FSMarc* atoken = NULL; 2283 FSMarc *next_atoken, *prev_atoken; 2284 FSMarc *append_to; 2285 FSMnode *late_start_node, *early_end_node; 2286 FSMnode *ntoken, *last_ntoken; 2287 FSMnode_ptr ntok, last_ntok; 2288 modelID *add_tree_end = add_tree_start + add_tree_len - 1; 2289 modelID *add_tree_late_start, *add_tree_early_end; 2290 modelID *add_tree; 2291 arcID new_arc_id, atokid; 2292 nodeID new_node_id, atokid_node; 2293 wordID add_tree_olabel_use = add_tree_olabel; 2294 wordID add_tree_cost_use = add_tree_cost; 2295 2296 append_to = NULL; 2297 add_tree = add_tree_start; 2298 #define FST_GROW_MINARCS 100 2299 #define FST_GROW_MINNODES 100 2300 #if defined(FST_GROW_FACTOR) 2301 2302 /* make sure we have enough arcs available */ 2303 if(fst->num_arcs + add_tree_len >= fst->FSMarc_list_len) 2304 { 2305 FSMarc* tmp_FSMarc_list; 2306 arcID tmp_FSMarc_list_len; 2307 int itmp_FSMarc_list_len = fst->FSMarc_list_len*FST_GROW_FACTOR ; 2308 2309 if( itmp_FSMarc_list_len - fst->FSMarc_list_len < FST_GROW_MINARCS) 2310 itmp_FSMarc_list_len += FST_GROW_MINARCS; 2311 2312 if( itmp_FSMarc_list_len - fst->FSMarc_list_len < add_tree_len) 2313 itmp_FSMarc_list_len += add_tree_len; 2314 2315 if(itmp_FSMarc_list_len >= (int)MAXarcID) 2316 return FST_FAILED_ON_MEMORY; 2317 2318 tmp_FSMarc_list_len = (arcID)itmp_FSMarc_list_len; 2319 2320 tmp_FSMarc_list = (FSMarc*)CALLOC_CLR(tmp_FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs"); 2321 if(!tmp_FSMarc_list) { 2322 PLogError("error: Failed to extend the memory for arcs\n"); 2323 return FST_FAILED_INTERNAL; 2324 } 2325 memcpy( tmp_FSMarc_list, fst->FSMarc_list, fst->FSMarc_list_len*sizeof(FSMarc)); 2326 2327 /* initialize the new free list. 2328 head of this new free list is tmp_FSMarc_list[ fst->FSMarc_list_len] */ 2329 for(atokid=fst->FSMarc_list_len; atokid<tmp_FSMarc_list_len-1; atokid++) { 2330 tmp_FSMarc_list[atokid].linkl_next_arc = ARC_ItoX(atokid+1); 2331 tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE; 2332 } 2333 tmp_FSMarc_list[atokid].linkl_next_arc = FSMARC_NULL; 2334 tmp_FSMarc_list[atokid].linkl_prev_arc = FSMARC_FREE; 2335 2336 /* scan to the end of the old free list (to be re-used) */ 2337 arcID* last_free_arc = &fst->FSMarc_freelist; 2338 atoken = (*last_free_arc==MAXarcID) ? NULL : &fst->FSMarc_list[*last_free_arc] ; 2339 2340 for( ; *last_free_arc!=MAXarcID; last_free_arc=&atoken->linkl_next_arc) 2341 atoken = &tmp_FSMarc_list[ *last_free_arc]; 2342 2343 /* append the new free list to the current free list */ 2344 *last_free_arc = fst->FSMarc_list_len; 2345 2346 FREE( fst->FSMarc_list); 2347 fst->FSMarc_list = tmp_FSMarc_list; 2348 fst->FSMarc_list_len = tmp_FSMarc_list_len; 2349 } 2350 2351 /* make sure we have enough nodes available */ 2352 if(fst->num_nodes + add_tree_len >= fst->FSMnode_list_len) 2353 { 2354 FSMnode* tmp_FSMnode_list; 2355 nodeID tmp_FSMnode_list_len; 2356 FSMnode_info* tmp_FSMnode_info_list; 2357 int itmp_FSMnode_list_len = fst->FSMnode_list_len * FST_GROW_FACTOR ; 2358 2359 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < FST_GROW_MINNODES) 2360 itmp_FSMnode_list_len += FST_GROW_MINNODES; 2361 2362 if( itmp_FSMnode_list_len - fst->FSMnode_list_len < add_tree_len) 2363 itmp_FSMnode_list_len += add_tree_len; 2364 2365 if(itmp_FSMnode_list_len >= (int)MAXnodeID) 2366 return FST_FAILED_ON_MEMORY; 2367 2368 tmp_FSMnode_list_len = (nodeID)itmp_FSMnode_list_len; 2369 2370 tmp_FSMnode_list = (FSMnode*)CALLOC_CLR(tmp_FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes"); 2371 if(!tmp_FSMnode_list) { 2372 PLogError("ERROR: Failed to extend the memory for nodes\n"); 2373 return FST_FAILED_INTERNAL; 2374 } 2375 memcpy( tmp_FSMnode_list, fst->FSMnode_list, fst->FSMnode_list_len*sizeof(FSMnode)); 2376 2377 /* initialize the new free node list. 2378 head of this new free list is tmp_FSMnode_list[ fst->FSMnode_list_len] */ 2379 for(atokid_node=fst->FSMnode_list_len; atokid_node<tmp_FSMnode_list_len-1; atokid_node++) 2380 { 2381 tmp_FSMnode_list[atokid_node].un_ptr.next_node = NODE_ItoX(atokid_node + 1); 2382 tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE; 2383 } 2384 tmp_FSMnode_list[atokid_node].un_ptr.next_node = FSMNODE_NULL; 2385 tmp_FSMnode_list[atokid_node].first_prev_arc = FSMARC_FREE; 2386 2387 /* scan to the end of the old free list (to be re-used) */ 2388 nodeID* last_free_node = (&fst->FSMnode_freelist); 2389 ntoken = (*last_free_node==MAXnodeID) ? NULL : &fst->FSMnode_list[*last_free_node] ; 2390 2391 for( ; *last_free_node!=MAXnodeID; last_free_node = &ntoken->un_ptr.next_node) 2392 ntoken = &tmp_FSMnode_list[ *last_free_node]; 2393 2394 /* append the new free list to the current free list */ 2395 *last_free_node = fst->FSMnode_list_len; 2396 2397 FREE( fst->FSMnode_list); 2398 2399 tmp_FSMnode_info_list = (FSMnode_info*)CALLOC_CLR( tmp_FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos"); 2400 if(!tmp_FSMnode_info_list) { 2401 PLogError("ERROR: Failed to extend the memory for node infos\n"); 2402 return FST_FAILED_INTERNAL; 2403 } 2404 // copy in old node info 2405 memcpy( tmp_FSMnode_info_list, fst->FSMnode_info_list, fst->FSMnode_list_len*sizeof(FSMnode_info)); 2406 2407 // initialize the new node info 2408 for (atokid_node=fst->FSMnode_list_len; atokid_node < tmp_FSMnode_list_len; atokid_node++) 2409 tmp_FSMnode_info_list[atokid_node] = NODE_INFO_UNKNOWN; 2410 2411 FREE( fst->FSMnode_info_list); 2412 fst->FSMnode_info_list = tmp_FSMnode_info_list; 2413 fst->FSMnode_list = tmp_FSMnode_list; 2414 fst->FSMnode_list_len = tmp_FSMnode_list_len; 2415 } 2416 #endif 2417 late_start_node = &fst->FSMnode_list[start_node]; 2418 2419 while (1) 2420 { 2421 2422 next_atoken = find_next_arc_with_ilabel(fst, late_start_node, 2423 *add_tree /*->ilabel*/, 2424 &append_to); 2425 if (next_atoken != NULL) 2426 { 2427 /* so next_atok->ilabel == add_tree->ilabel */ 2428 2429 if (next_atoken->ilabel == WORD_BOUNDARY && *add_tree/*->ilabel*/ == WORD_BOUNDARY) 2430 { 2431 next_atoken = NULL; 2432 break; /* break as if nothing more can be shared! */ 2433 // return FST_FAILED_ON_HOMONYM; 2434 } 2435 2436 if (num_arcs_arriving_gt_1(fst, NODE_XtoP(next_atoken->to_node))) 2437 { 2438 split_node_for_arc(fst, next_atoken); 2439 /* unfortunate side effect here, that if we later find out this 2440 was a homonym addition, then this expansion was useless and 2441 for now we don't undo it! */ 2442 } 2443 2444 /* we shouldn't really push the olabel if it's the same for both! */ 2445 if (next_atoken->olabel != add_tree_olabel_use) 2446 { 2447 if (next_atoken->olabel != WORD_EPSILON_LABEL) 2448 { 2449 if (fst_push_arc_olabel(fst, next_atoken) != FST_SUCCESS) 2450 { 2451 PLogError("error: internal error fst_push_arc_olabel()\n"); 2452 return FST_FAILED_INTERNAL; 2453 } 2454 } 2455 #if DO_WEIGHTED_ADDWORD 2456 else 2457 fst_push_arc_cost(fst, next_atoken); 2458 #endif 2459 } 2460 else if (add_tree_olabel_use != WORD_EPSILON_LABEL) 2461 { 2462 /* the graph already has this word, so we just re-use the olabel 2463 and disable the setting of olabel down below */ 2464 add_tree_olabel_use = WORD_EPSILON_LABEL; 2465 #if DO_WEIGHTED_ADDWORD 2466 add_tree_cost_use = FREEcostdata; 2467 #endif 2468 } 2469 add_tree++; 2470 late_start_node = NODE_XtoP(next_atoken->to_node); 2471 } 2472 else 2473 { 2474 break; 2475 } 2476 } 2477 2478 add_tree_late_start = add_tree; 2479 early_end_node = &fst->FSMnode_list[end_node]; 2480 2481 if (!do_minimize) 2482 { 2483 last_ntoken = NULL; 2484 last_ntok = NODE_PtoX(late_start_node); 2485 for (; add_tree <= add_tree_end; add_tree++) 2486 { 2487 new_arc_id = fst_get_free_arc(fst); 2488 if (new_arc_id == MAXarcID) 2489 { 2490 PLogError("fst_get_free_arc() failed\n"); 2491 return FST_FAILED_ON_MEMORY; 2492 } 2493 atok = ARC_ItoX(new_arc_id); 2494 atoken = ARC_XtoP(atok); 2495 if (add_tree != add_tree_end) 2496 { 2497 new_node_id = fst_get_free_node(fst); 2498 if (new_node_id == MAXnodeID) 2499 { 2500 PLogError("fst_get_free_node() failed\n"); 2501 return FST_FAILED_ON_MEMORY; 2502 } 2503 ntok = NODE_ItoX(new_node_id); 2504 ntoken = NODE_XtoP(ntok); 2505 ntoken->first_prev_arc = atok; 2506 } 2507 else 2508 { 2509 ntok = FSMNODE_NULL; 2510 ntoken = NULL; 2511 } 2512 atoken->fr_node = last_ntok; /* was NODE_PtoX(late_start_node) */ 2513 atoken->to_node = ntok; 2514 atoken->ilabel = *add_tree; 2515 atoken->linkl_next_arc = FSMARC_NULL; 2516 atoken->linkl_prev_arc = FSMARC_NULL; 2517 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]); 2518 if (!last_ntoken) 2519 { 2520 append_to->linkl_next_arc = atok; 2521 atoken->olabel = add_tree_olabel_use; 2522 #if DO_WEIGHTED_ADDWORD 2523 atoken->cost = add_tree_cost_use; 2524 #endif 2525 IF_DEBUG_WDADD(atok->olabel_str = fst->olabels->words[ atoken->olabel]); 2526 } 2527 else 2528 { 2529 last_ntoken->un_ptr.first_next_arc = atok; 2530 } 2531 last_ntoken = ntoken; 2532 last_ntok = ntok; 2533 } 2534 /* add_tree_end->to_node = early_end_node; 2535 add_tree_end->linkl_next_arc = FSMARC_NULL; 2536 atok = last_arc_arriving( early_end_node); 2537 atok->linkl_prev_arc = add_tree_end; 2538 add_tree_end->linkl_prev_arc = FSMARC_NULL; */ 2539 atoken->to_node = NODE_ItoX(end_node); 2540 append_arc_arriving_node(fst, &fst->FSMnode_list[end_node], atok); 2541 } 2542 2543 else /* ie. do_minimize from the rear */ 2544 { 2545 2546 add_tree = add_tree_end; 2547 new_arc_id = fst_get_free_arc(fst); 2548 if (new_arc_id == MAXarcID) 2549 return FST_FAILED_ON_MEMORY; 2550 atok = ARC_ItoX(new_arc_id); 2551 atoken = ARC_XtoP(atok); 2552 atoken->olabel = add_tree_olabel_use; 2553 #if DO_WEIGHTED_ADDWORD 2554 atoken->cost = add_tree_cost_use; 2555 #endif 2556 atoken->ilabel = *add_tree_late_start; 2557 atoken->fr_node = NODE_PtoX(late_start_node); 2558 if (atoken->ilabel == WORD_BOUNDARY) 2559 { 2560 atoken->cost = (costdata)(atoken->cost + fst->wtw_average); 2561 } 2562 else 2563 { 2564 atoken->cost = add_tree_cost_use + fst->wtw_average; 2565 add_tree_cost_use = FREEcostdata; 2566 } 2567 IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[ atoken->olabel]); 2568 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[ atoken->ilabel]); 2569 append_arc_leaving_node(fst, late_start_node, atok); 2570 2571 last_atok = atok; 2572 while (1) 2573 { 2574 2575 if (add_tree == add_tree_late_start) 2576 break; 2577 /* if there are other arcs leaving this node then joining 2578 earlier than here will result in over-generation, so don't 2579 if( atok!=end_arc && atok->linkl_next_arc != FSMARC_NULL) 2580 break; 2581 */ 2582 /* 2583 also need boundary conditions checker here 2584 */ 2585 2586 /* */ 2587 IF_DEBUG_WDADD(printf_node(fst, early_end_node)); 2588 2589 /* if( num_arcs_leaving( add_tree_end->fr_node->first_prev_arc->to_node) >1) { 2590 printf("merge stopped due to num_arcs leaving ..\n"); 2591 printf_arc(fst, add_tree_end->fr_node->first_prev_arc->to_node->first_next_arc); 2592 break; 2593 } */ 2594 2595 for (atok = early_end_node->first_prev_arc; atok != FSMARC_NULL; atok = atoken->linkl_prev_arc) 2596 { 2597 atoken = ARC_XtoP(atok); 2598 /* never pull before the slot marker */ 2599 if(atoken->cost == DISABLE_ARC_COST) break; 2600 if(atoken->olabel>0 && atoken->olabel<fst->olabels->num_slots) break; 2601 fst_pull_arc_olabel(fst, atoken); /* fails are ok */ 2602 } 2603 2604 prev_atoken = find_prev_arc_with_iolabels(fst, early_end_node, 2605 *add_tree /*->ilabel*/, 2606 WORD_EPSILON_LABEL, 2607 NULL /*&append_to*/); 2608 if (!prev_atoken) 2609 break; 2610 else 2611 { 2612 /* this check is now inside find_prev_arc_with_iolabels */ 2613 /* if( num_arcs_leaving(prev_atok->fr_node->first_prev_arc->to_node->first_next_arc)>1)*/ 2614 2615 /* now look to merge earlier still */ 2616 early_end_node = NODE_XtoP(prev_atoken->fr_node); 2617 add_tree--; 2618 if (add_tree == add_tree_late_start) 2619 break; 2620 } 2621 } 2622 add_tree_early_end = add_tree; 2623 for (add_tree = add_tree_late_start + 1; add_tree <= add_tree_early_end; add_tree++) 2624 { 2625 new_node_id = fst_get_free_node(fst); 2626 if (new_node_id == MAXnodeID) 2627 return FST_FAILED_ON_MEMORY; 2628 ntok = NODE_ItoX(new_node_id); 2629 ntoken = NODE_XtoP(ntok); 2630 new_arc_id = fst_get_free_arc(fst); 2631 if (new_arc_id == MAXarcID) 2632 return FST_FAILED_ON_MEMORY; 2633 atok = ARC_ItoX(new_arc_id); 2634 atoken = ARC_XtoP(atok); 2635 atoken->ilabel = *add_tree; 2636 #if DO_WEIGHTED_ADDWORD 2637 atoken->cost = FREEcostdata; /* idea: distribute add_tree_cost here;*/ 2638 #endif 2639 atoken->olabel = WORD_EPSILON_LABEL; 2640 atoken->fr_node = ntok; 2641 atoken->to_node = FSMNODE_NULL; /* filled in next loop */ 2642 IF_DEBUG_WDADD(atoken->olabel_str = fst->olabels->words[atoken->olabel]); 2643 IF_DEBUG_WDADD(atoken->ilabel_str = fst->ilabels->words[atoken->ilabel]); 2644 2645 ntoken->un_ptr.first_next_arc = atok; 2646 ntoken->first_prev_arc = last_atok; 2647 TO_NODE(last_atok) = ntok; 2648 last_atok = atok; 2649 } 2650 TO_NODE(last_atok) = NODE_PtoX(early_end_node); 2651 append_arc_arriving_node(fst, early_end_node, last_atok); 2652 } 2653 return rc; 2654 } 2655 2656 2657 void append_arc_leaving_node(srec_context* fst, FSMnode* fr_node, FSMarc_ptr arc) 2658 { 2659 FSMarc_ptr* atok = &fr_node->un_ptr.first_next_arc; 2660 FSMarc* atoken = ARC_XtoP(*atok); 2661 for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_next_arc) 2662 { 2663 atoken = ARC_XtoP(*atok); 2664 } 2665 *atok = arc; 2666 LINKL_NEXT(arc) = FSMARC_NULL; 2667 } 2668 void append_arc_arriving_node(srec_context* fst, FSMnode* to_node, FSMarc_ptr arc) 2669 { 2670 FSMarc_ptr* atok = &to_node->first_prev_arc; 2671 FSMarc* atoken = ARC_XtoP(*atok); 2672 for (; (*atok) != FSMARC_NULL; atok = &atoken->linkl_prev_arc) 2673 { 2674 atoken = ARC_XtoP(*atok); 2675 } 2676 *atok = arc; 2677 LINKL_PREV(arc) = FSMARC_NULL; 2678 } 2679 2680 2681 int printf_node1(srec_context* fst, FSMnode* node) 2682 { 2683 return 0; 2684 } 2685 int printf_arc1(srec_context* fst, char* msg, FSMarc* arc) 2686 { 2687 char buffer[MAX_LINE_LENGTH]; 2688 sprintf_arc(buffer, fst, arc); 2689 printf("%s%s\n", msg, buffer); 2690 return 0; 2691 } 2692 int sprintf_arc(char* buf, srec_context* fst, FSMarc* arc) 2693 { 2694 int rc; 2695 FSMnode* to_node = NODE_XtoP(arc->to_node); 2696 arcID arc_index = (arcID)(arc - fst->FSMarc_list); 2697 if (to_node->un_ptr.first_next_arc == FSMARC_NULL) 2698 rc = sprintf(buf, "arc%hu\n", arc_index); 2699 else 2700 { 2701 rc = sprintf(buf, "arc%hu\t%hu,%hu\t%s\t%s\t%hu\n", 2702 arc_index, 2703 ARC_XtoI(to_node->un_ptr.first_next_arc), 2704 arc->linkl_next_arc != FSMARC_NULL ? ARC_XtoI(arc->linkl_next_arc) : -1, 2705 fst->ilabels->words[arc->ilabel], 2706 fst->olabels->words[arc->olabel], 2707 arc->cost); 2708 } 2709 return rc; 2710 } 2711 2712 /* dumps the recognition context as a binary file, 2713 it will also store any empty space for growing */ 2714 2715 #define ENCODE(SIGN,TYP) { SIGN+=sizeof(TYP); SIGN=SIGN<<3; ASSERT((shifted+=3)<32); } 2716 2717 asr_int32_t FST_sizes_signature() 2718 { 2719 #ifndef NDEBUG 2720 int shifted = 0; 2721 #endif 2722 asr_int32_t signature = 0; 2723 ENCODE(signature, arcID); 2724 ENCODE(signature, nodeID); 2725 ENCODE(signature, wordID); 2726 ENCODE(signature, labelID); 2727 ENCODE(signature, costdata); 2728 return signature; 2729 } 2730 2731 int FST_ContextImageSize(srec_context* context) 2732 { 2733 int size = 0; 2734 size += sizeof(srec_context); 2735 size += sizeof(wordmap); 2736 size += sizeof(char*) * context->olabels->max_words; 2737 size += sizeof(char) * context->olabels->max_chars; 2738 size += sizeof(wordmap); 2739 if (context->ilabels->words != NULL) 2740 size += sizeof(char*) * context->olabels->max_words; 2741 if (context->ilabels->chars != NULL) 2742 size += sizeof(char) * context->olabels->max_chars; 2743 size += sizeof(FSMarc) * context->FSMarc_list_len; 2744 size += sizeof(FSMnode) * context->FSMnode_list_len; 2745 size += sizeof(FSMnode_info) * context->FSMnode_list_len; 2746 size += sizeof(arc_token) * context->arc_token_list_len; 2747 return size; 2748 } 2749 2750 ESR_ReturnCode serializeWordMapV2(wordmap *wordmap, PFile* fp) 2751 { 2752 unsigned int i = 0; 2753 unsigned int nfields; 2754 unsigned int tmp2[32]; 2755 2756 i = 0; 2757 tmp2[i++] = wordmap->num_words; 2758 tmp2[i++] = wordmap->num_slots; 2759 tmp2[i++] = wordmap->max_words; 2760 tmp2[i++] = wordmap->num_base_words; 2761 tmp2[i++] = wordmap->max_chars; 2762 tmp2[i++] = (wordmap->next_chars - wordmap->chars); 2763 tmp2[i++] = (wordmap->next_base_chars - wordmap->chars); 2764 nfields = i; 2765 if (pfwrite(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields) 2766 return ESR_WRITE_ERROR; 2767 2768 if (pfwrite(wordmap->chars, sizeof(char), wordmap->max_chars, fp) != (size_t)wordmap->max_chars) 2769 return ESR_WRITE_ERROR; 2770 2771 return ESR_SUCCESS; 2772 } 2773 2774 ESR_ReturnCode deserializeWordMapV2(wordmap **pwordmap, PFile* fp) 2775 { 2776 unsigned int i = 0; 2777 unsigned int nfields; 2778 unsigned int tmp2[32]; 2779 wordmap *awordmap; 2780 char *p; 2781 ESR_ReturnCode rc = ESR_SUCCESS; 2782 unsigned int next_chars_idx, next_base_chars_idx; 2783 2784 awordmap = NEW(wordmap, L("srec.g2g.graph.wordmap.base")); 2785 if (awordmap == NULL) 2786 { 2787 PLogError("NEW failed on srec.g2g.graph.wordmap.base\n"); 2788 rc = ESR_OUT_OF_MEMORY; 2789 goto CLEANUP; 2790 } 2791 awordmap->wordIDForWord = NULL; // early break to cleanup needs this 2792 2793 nfields = 7; 2794 if (pfread(tmp2, sizeof(tmp2[0]), nfields, fp) != nfields) 2795 { 2796 PLogError("pfread failed when reading nfields\n"); 2797 rc = ESR_READ_ERROR; 2798 goto CLEANUP; 2799 } 2800 2801 i = 0; 2802 awordmap->num_words = (wordID)tmp2[i++]; 2803 awordmap->num_slots = (wordID)tmp2[i++]; 2804 awordmap->max_words = (wordID)tmp2[i++]; 2805 awordmap->num_base_words = (wordID)tmp2[i++]; 2806 awordmap->max_chars = tmp2[i++]; 2807 next_chars_idx = tmp2[i++]; 2808 next_base_chars_idx = tmp2[i++]; 2809 ASSERT(nfields == i); 2810 2811 awordmap->words = NEW_ARRAY(char*, awordmap->max_words, L("srec.g2g.graph.wordmap.words")); 2812 if (awordmap->words == NULL) 2813 { 2814 PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.words %d\n", awordmap->max_words); 2815 rc = ESR_OUT_OF_MEMORY; 2816 goto CLEANUP; 2817 } 2818 awordmap->chars = NEW_ARRAY(char, awordmap->max_chars, L("srec.g2g.graph.wordmap.chars")); 2819 if (awordmap->chars == NULL) 2820 { 2821 PLogError("NEW_ARRAY failed for srec.g2g.graph.wordmap.chars %d\n", awordmap->max_chars); 2822 rc = ESR_OUT_OF_MEMORY; 2823 goto CLEANUP; 2824 } 2825 awordmap->next_chars = awordmap->chars + next_chars_idx; 2826 awordmap->next_base_chars = awordmap->chars + next_base_chars_idx; 2827 2828 if ((i=pfread(awordmap->chars, sizeof(char), awordmap->max_chars, fp)) != (size_t)awordmap->max_chars) 2829 { 2830 PLogError("pfread failed while reading %d chars\n", awordmap->max_chars); 2831 rc = ESR_READ_ERROR; 2832 goto CLEANUP; 2833 } 2834 2835 2836 p = awordmap->chars; 2837 ASSERT((int)p % 2 == 0); 2838 nfields = 0; 2839 if (nfields < awordmap->num_words) 2840 awordmap->words[nfields++] = p; 2841 for (; p < awordmap->next_chars; p++) // was next_base_chars 2842 { 2843 if ( ( *p ) == '\0' ) 2844 { 2845 if (nfields == awordmap->num_words) // was num_base_words 2846 break; 2847 if (((int)p) % 2 == 0) p++; /* so that words begin on even byte bound */ 2848 awordmap->words[nfields++] = p + 1; 2849 } 2850 } 2851 ASSERT(nfields == awordmap->num_words); // was num_base_words 2852 2853 if (awordmap->max_words >= awordmap->num_base_words) 2854 { 2855 PHashTableArgs hashArgs; 2856 hashArgs.capacity = awordmap->max_words; 2857 if(hashArgs.capacity%2==0) hashArgs.capacity += 1; 2858 hashArgs.compFunction = HashCmpWord; //PHASH_TABLE_DEFAULT_COMP_FUNCTION; 2859 hashArgs.hashFunction = HashGetCode; //PHASH_TABLE_DEFAULT_HASH_FUNCTION; 2860 hashArgs.maxLoadFactor = PHASH_TABLE_DEFAULT_MAX_LOAD_FACTOR; 2861 CHKLOG(rc, PHashTableCreate(&hashArgs, L("srec.graph.wordmap.wordIDForWord.deserializeWordMap()"), &awordmap->wordIDForWord)); 2862 2863 rc = wordmap_populate ( awordmap, awordmap->num_words ); 2864 2865 if (rc != ESR_SUCCESS) 2866 { 2867 wordmap_clean ( awordmap ); 2868 goto CLEANUP; 2869 } 2870 } 2871 else 2872 { 2873 awordmap->wordIDForWord = NULL; 2874 } 2875 2876 /* success */ 2877 *pwordmap = awordmap; 2878 return ESR_SUCCESS; 2879 2880 CLEANUP: 2881 if (awordmap != NULL) 2882 { 2883 if (awordmap->wordIDForWord != NULL) 2884 PHashTableDestroy(awordmap->wordIDForWord); 2885 if (awordmap->words != NULL) FREE(awordmap->words); 2886 if (awordmap->chars != NULL) FREE(awordmap->chars); 2887 FREE(awordmap); 2888 } 2889 return rc; 2890 } 2891 2892 static ESR_ReturnCode serializeArcToken(arc_token *token_base, 2893 int i, 2894 PFile* fp) 2895 { 2896 arc_token *token = token_base + i; 2897 asr_uint32_t idx; 2898 2899 if (pfwrite(&token->ilabel, 2, 1, fp) != 1) 2900 return ESR_WRITE_ERROR; 2901 2902 if (pfwrite(&token->olabel, 2, 1, fp) != 1) 2903 return ESR_WRITE_ERROR; 2904 2905 /* if (pfwrite(&token->cost, 2, 1, fp) != 1) 2906 return ESR_WRITE_ERROR; */ 2907 idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->first_next_arc), token_base); 2908 2909 if (pfwrite(&idx, 4, 1, fp) != 1) 2910 return ESR_WRITE_ERROR; 2911 2912 idx = PTR_TO_IDX(ARC_TOKEN_PTR(token_base, token->next_token_index), token_base); 2913 2914 if (pfwrite(&idx, 4, 1, fp) != 1) 2915 return ESR_WRITE_ERROR; 2916 2917 return ESR_SUCCESS; 2918 } 2919 2920 static ESR_ReturnCode serializeArcTokenInfo(srec_context *context, 2921 PFile* fp) 2922 { 2923 int i; 2924 asr_uint32_t idx; 2925 ESR_ReturnCode rc; 2926 2927 if (pfwrite(&context->arc_token_list_len, 2, 1, fp) != 1) 2928 return ESR_WRITE_ERROR; 2929 2930 idx = PTR_TO_IDX(context->arc_token_freelist, context->arc_token_list); 2931 2932 if (pfwrite(&idx, 4, 1, fp) != 1) 2933 return ESR_WRITE_ERROR; 2934 2935 idx = PTR_TO_IDX(context->arc_token_insert_start, context->arc_token_list); 2936 2937 if (pfwrite(&idx, 4, 1, fp) != 1) 2938 return ESR_WRITE_ERROR; 2939 2940 for (i = 0; i < context->arc_token_list_len; ++i) 2941 { 2942 rc = serializeArcToken(context->arc_token_list, i, fp); 2943 if (rc != ESR_SUCCESS) return rc; 2944 } 2945 return ESR_SUCCESS; 2946 } 2947 2948 static ESR_ReturnCode deserializeArcToken(arc_token *token_base, 2949 int i, 2950 PFile* fp) 2951 { 2952 arc_token *token = token_base + i; 2953 asr_uint32_t idx[2]; 2954 asr_uint16_t labels[2]; 2955 2956 if (pfread(labels, 2, 2, fp) != 2) 2957 return ESR_READ_ERROR; 2958 2959 token->ilabel = labels[0]; 2960 token->olabel = labels[1]; 2961 2962 /* if (pfread(&token->cost, 2, 1, fp) != 1) 2963 return ESR_READ_ERROR; */ 2964 2965 if (pfread(idx, 4, 2, fp) != 2) 2966 return ESR_READ_ERROR; 2967 2968 token->first_next_arc = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[0], token_base)); 2969 token->next_token_index = ARC_TOKEN_PTR2LNK(token_base, IDX_TO_PTR(idx[1], token_base)); 2970 2971 return ESR_SUCCESS; 2972 } 2973 2974 static ESR_ReturnCode deserializeArcTokenInfo(srec_context *context, 2975 PFile* fp) 2976 { 2977 ESR_ReturnCode rc; 2978 int i; 2979 asr_uint32_t idx; 2980 2981 if (pfread(&context->arc_token_list_len, 2, 1, fp) != 1) { 2982 PLogError("pfread failed in deserializeArcTokenInfo()\n"); 2983 return ESR_READ_ERROR; 2984 } 2985 2986 context->arc_token_list = NEW_ARRAY(arc_token, context->arc_token_list_len, 2987 L("srec.graph.wordgraph")); 2988 2989 if (context->arc_token_list == NULL) 2990 return ESR_OUT_OF_MEMORY; 2991 2992 if (pfread(&idx, 4, 1, fp) != 1) 2993 { 2994 rc = ESR_READ_ERROR; 2995 goto CLEANUP; 2996 } 2997 2998 context->arc_token_freelist = 2999 IDX_TO_PTR(idx, context->arc_token_list); 3000 3001 if (pfread(&idx, 4, 1, fp) != 1) 3002 { 3003 rc = ESR_READ_ERROR; 3004 goto CLEANUP; 3005 } 3006 3007 context->arc_token_insert_start = 3008 IDX_TO_PTR(idx, context->arc_token_list); 3009 3010 for (i = 0; i < context->arc_token_list_len; ++i) 3011 { 3012 rc = deserializeArcToken(context->arc_token_list, i, fp); 3013 if (rc != ESR_SUCCESS) goto CLEANUP; 3014 } 3015 return ESR_SUCCESS; 3016 3017 CLEANUP: 3018 FREE(context->arc_token_list); 3019 context->arc_token_list = 3020 context->arc_token_freelist = 3021 context->arc_token_insert_start = NULL; 3022 return rc; 3023 } 3024 3025 int FST_GetGrammarType(srec_context* context) 3026 { 3027 arc_token *t, *u; 3028 wordID expected_wdid; 3029 /* 0 1 0 3030 1 2 4 3031 ... 3032 1 2 1316 3033 2 3034 */ 3035 t = context->arc_token_list; 3036 if (t->ilabel != WORD_EPSILON_LABEL) 3037 return GrammarTypeBNF; 3038 if (t->next_token_index) 3039 return GrammarTypeBNF; 3040 t = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc); 3041 expected_wdid = NUM_ITEMLIST_HDRWDS; 3042 for (; t; t = ARC_TOKEN_PTR(context->arc_token_list, t->next_token_index)) 3043 { 3044 if (t->ilabel != expected_wdid) 3045 return GrammarTypeBNF; 3046 u = ARC_TOKEN_PTR(context->arc_token_list, t->first_next_arc); 3047 if (u != NULL && u->ilabel != MAXwordID) 3048 return GrammarTypeBNF; 3049 expected_wdid++; 3050 } 3051 if (expected_wdid != context->olabels->num_words) 3052 return GrammarTypeBNF; 3053 return GrammarTypeItemList; 3054 } 3055 3056 int FST_DumpContextAsImageV2(srec_context* context, PFile* fp) 3057 { 3058 asr_uint32_t header[4]; 3059 arcID tmp[32], i, j, nfields; 3060 FSMarc* arc; 3061 ESR_ReturnCode rc; 3062 #if !defined(DO_ARCS_IO_IN_ARC_ORDER) 3063 FSMnode* node; 3064 arcID arcid, num_arcs, num_arcs_written; 3065 #endif 3066 3067 /* Write header information. */ 3068 header[0] = 0; 3069 header[1] = IMAGE_FORMAT_V2; 3070 header[2] = context->modelid; 3071 header[3] = context->grmtyp; 3072 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3073 PLogMessage("FST_DumpContextAsImageV2() fstgrmtyp %d", context->grmtyp); 3074 #endif 3075 3076 if (pfwrite(header, sizeof(header[0]), 4, fp) != 4) 3077 { 3078 PLogError("FST_DumpContextAsImage: Could not write header.\n"); 3079 return FST_FAILED_INTERNAL; 3080 } 3081 3082 /* arcs and nodes */ 3083 i = 0; 3084 tmp[i++] = context->num_arcs; 3085 tmp[i++] = context->FSMarc_list_len; 3086 tmp[i++] = context->num_base_arcs; 3087 tmp[i++] = context->FSMarc_freelist; 3088 tmp[i++] = context->max_searchable_arcs; 3089 3090 tmp[i++] = context->num_nodes; 3091 tmp[i++] = context->FSMnode_list_len; 3092 tmp[i++] = context->num_base_nodes; 3093 tmp[i++] = context->FSMnode_freelist; 3094 tmp[i++] = context->start_node; 3095 tmp[i++] = context->end_node; 3096 tmp[i++] = context->max_searchable_nodes; 3097 3098 tmp[i++] = context->beg_silence_word; 3099 tmp[i++] = context->end_silence_word; 3100 tmp[i++] = context->hack_silence_word; 3101 tmp[i++] = context->hmm_ilabel_offset; 3102 3103 nfields = i; 3104 3105 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3106 return ESR_WRITE_ERROR; 3107 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3108 PLogMessage("G2G done WR hdrs %d", pftell(fp)); 3109 #endif 3110 3111 #if defined(DO_ARCS_IO_IN_ARC_ORDER) 3112 if( 1) { 3113 i = 0; 3114 tmp[i++] = context->end_node; 3115 tmp[i++] = MAXnodeID; 3116 tmp[i++] = MAXmodelID; 3117 tmp[i++] = MAXwordID; 3118 tmp[i++] = FREEcostdata; 3119 nfields = i; 3120 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3121 return ESR_WRITE_ERROR; 3122 } 3123 for(j=0; j<context->num_arcs; j++) { 3124 arc = &context->FSMarc_list[j]; 3125 i = 0; 3126 tmp[i++] = arc->fr_node; 3127 tmp[i++] = arc->to_node; 3128 ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID); 3129 tmp[i++] = arc->ilabel; 3130 tmp[i++] = arc->olabel; 3131 tmp[i++] = arc->cost; 3132 nfields = i; 3133 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3134 return ESR_WRITE_ERROR; 3135 } 3136 #else 3137 num_arcs_written = 0; 3138 for (j = 0; j < context->num_nodes; j++) { 3139 node = &context->FSMnode_list[j]; 3140 num_arcs = (arcID)num_arcs_leaving(context, node); 3141 arcid = node->un_ptr.first_next_arc; 3142 if (arcid == MAXarcID) { 3143 i = 0; 3144 tmp[i++] = (arcID)j; 3145 tmp[i++] = MAXarcID; 3146 tmp[i++] = MAXarcID; 3147 tmp[i++] = MAXarcID; 3148 tmp[i++] = MAXarcID; 3149 nfields = i; 3150 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3151 return ESR_WRITE_ERROR; 3152 /* num_arcs_written++; end node don't have a to-arc */ 3153 } 3154 else { 3155 for (; arcid != MAXarcID; arcid = arc->linkl_next_arc) { 3156 arc = &context->FSMarc_list[arcid]; 3157 i = 0; 3158 tmp[i++] = (arcID)j; 3159 tmp[i++] = arc->to_node; 3160 ASSERT(arc->to_node == context->end_node || arc->to_node != MAXnodeID); 3161 tmp[i++] = arc->ilabel; 3162 tmp[i++] = arc->olabel; 3163 tmp[i++] = arc->cost; 3164 nfields = i; 3165 if (pfwrite(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3166 return ESR_WRITE_ERROR; 3167 num_arcs_written++; 3168 } 3169 } 3170 } 3171 ASSERT(num_arcs_written == context->num_arcs); 3172 #endif 3173 3174 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3175 PLogMessage("G2G done WR arcs %d", pftell(fp)); 3176 #endif 3177 3178 /* node info .. will now be calculated on line */ 3179 /* wrapup_cost .. will now be calculated on line */ 3180 3181 /* exit points for slots */ 3182 if (1) 3183 { 3184 srec_fsm_exit_point *p, *q; 3185 if (pfwrite(&context->num_fsm_exit_points, 2, 1, fp) != 1) 3186 return ESR_WRITE_ERROR; 3187 p = context->fsm_exit_points; 3188 q = p + MAX_NUM_SLOTS; 3189 while (p < q) 3190 { 3191 if (pfwrite(&p->from_node_index, 2, 1, fp) != 1) 3192 return ESR_WRITE_ERROR; 3193 if (pfwrite(&p->arc_index, 2, 1, fp) != 1) 3194 return ESR_WRITE_ERROR; 3195 if (pfwrite(&p->wbto_node_index, 2, 1, fp) != 1) 3196 return ESR_WRITE_ERROR; 3197 ++p; 3198 } 3199 } 3200 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3201 PLogMessage("G2G done WR exits %d", pftell(fp)); 3202 #endif 3203 /* no entry points stuff */ 3204 3205 /* no saving ilabels */ 3206 3207 /* now load the ilabels */ 3208 rc = serializeWordMapV2(context->olabels, fp); 3209 if (rc != ESR_SUCCESS) 3210 return rc; 3211 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3212 PLogMessage("G2G done WR wmap %d", pftell(fp)); 3213 #endif 3214 3215 if (context->grmtyp != GrammarTypeItemList) 3216 { 3217 if (serializeArcTokenInfo(context, fp) != ESR_SUCCESS) 3218 { 3219 PLogError("FST_DumpContextAsImage: could not write arc token data.\n"); 3220 return FST_FAILED_INTERNAL; 3221 } 3222 } 3223 else 3224 { 3225 /* nothing, at decoding time, we'll just construct the graph 3226 from the word list, ie just this .. 3227 0 1 0 3228 1 2 4 3229 ... 3230 1 2 1316 3231 2 3232 ie 3233 0 1 0 3234 for(i=4;i<nwords;i++) { "1 2 i" } 3235 2 3236 because words 0-3 are eps,-pau-,-pau2-,slot (at) grammarname.grxml 3237 */ 3238 } 3239 3240 /* Write header information. */ 3241 header[0] = pftell(fp); 3242 header[1] = IMAGE_FORMAT_V2; 3243 3244 /* goto beginning of file. */ 3245 if (pfseek(fp, 0, SEEK_SET)) 3246 { 3247 PLogError("FST_DumpContextAsImage: could not reposition for header.\n"); 3248 return FST_FAILED_INTERNAL; 3249 } 3250 3251 if (pfwrite(header, 4, 2, fp) != 2) 3252 { 3253 PLogError("FST_DumpContextAsImage: Could not write header.\n"); 3254 return FST_FAILED_INTERNAL; 3255 } 3256 3257 /* reset pointer at end of file. */ 3258 if (pfseek(fp, 0, SEEK_END)) 3259 { 3260 PLogError("FST_DumpContextAsImage: could not reposition file pointer at end.\n"); 3261 return FST_FAILED_INTERNAL; 3262 } 3263 3264 return FST_SUCCESS; 3265 } 3266 3267 3268 int FST_LoadContextFromImageV2(srec_context* fst, PFile* fp) 3269 { 3270 asr_int32_t header[6]; 3271 arcID tmp[32], num_arcs, new_arc_id; 3272 unsigned int i, nfields; 3273 srec_fsm_exit_point *p, *q; 3274 arcID max_num_FSMarcs; 3275 nodeID max_num_FSMnodes; 3276 FSMarc_ptr atok = (FSMarc_ptr)0; 3277 FSMarc *atoken = NULL; 3278 FSMnode *fr_node, *to_node; 3279 int rc = FST_SUCCESS; 3280 ESR_ReturnCode esr_rc; 3281 ESR_BOOL seenFinalNode = ESR_FALSE; 3282 3283 /* read header information. */ 3284 if (pfread(&header[2], sizeof(header[0]), 2, fp) != 2) 3285 { 3286 PLogError("FST_DumpContextAsImage: Could not read header.\n"); 3287 return FST_FAILED_INTERNAL; 3288 } 3289 fst->modelid = header[2]; 3290 fst->grmtyp = header[3]; 3291 3292 nfields = 16; 3293 if (pfread(tmp, sizeof(tmp[0]), nfields, fp) != nfields) 3294 return ESR_READ_ERROR; 3295 3296 /* arcs and nodes */ 3297 i = 0; 3298 fst->num_arcs = max_num_FSMarcs = tmp[i++]; 3299 fst->FSMarc_list_len = tmp[i++]; 3300 fst->num_base_arcs = tmp[i++]; 3301 fst->FSMarc_freelist = tmp[i++]; 3302 fst->max_searchable_arcs = tmp[i++] * 0; /*5*/ 3303 fst->num_nodes = max_num_FSMnodes = tmp[i++]; 3304 fst->FSMnode_list_len = tmp[i++]; 3305 fst->num_base_nodes = tmp[i++]; 3306 fst->FSMnode_freelist = tmp[i++]; 3307 fst->start_node = tmp[i++];/*10*/ 3308 fst->end_node = tmp[i++]; 3309 fst->max_searchable_nodes = tmp[i++] * 0; 3310 fst->beg_silence_word = tmp[i++]; 3311 fst->end_silence_word = tmp[i++]; 3312 fst->hack_silence_word = tmp[i++]; /*15*/ 3313 fst->hmm_ilabel_offset = tmp[i++]; /* 16 */ 3314 3315 fst->addWordCaching_lastslot_name = 0; 3316 fst->addWordCaching_lastslot_num = MAXwordID; 3317 fst->addWordCaching_lastslot_needs_post_silence = ESR_FALSE; 3318 3319 ASSERT(i == nfields); 3320 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3321 PLogMessage("G2G done RD hdrs %d", pftell(fp)); 3322 #endif 3323 3324 ASSERT(fst->num_arcs >= fst->num_base_arcs); // was == 3325 ASSERT(fst->num_nodes >= fst->num_base_nodes); // was == 3326 3327 fst->FSMarc_list = (FSMarc*)CALLOC_CLR(fst->FSMarc_list_len, sizeof(FSMarc), "srec.graph.arcs"); 3328 if (!fst->FSMarc_list) 3329 { 3330 rc = FST_FAILED_ON_MEMORY; 3331 PLogError("CALLOC_CLR fst->FSMarc_list \n"); 3332 goto CLEANUP; 3333 } 3334 fst->FSMnode_list = (FSMnode*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode), "srec.graph.nodes"); 3335 if (!fst->FSMnode_list) 3336 { 3337 rc = FST_FAILED_ON_MEMORY; 3338 PLogError("CALLOC_CLR fst->FSMnode_list failed\n"); 3339 goto CLEANUP; 3340 } 3341 fst->FSMnode_info_list = (FSMnode_info*)CALLOC_CLR(fst->FSMnode_list_len, sizeof(FSMnode_info), "srec.graph.nodeinfos"); 3342 3343 if (!fst->FSMnode_info_list) 3344 { 3345 rc = FST_FAILED_ON_MEMORY; 3346 PLogError("CALLOC_CLR fst->FSMnode_info_list failed\n"); 3347 goto CLEANUP; 3348 } 3349 3350 /* setup the arc freelist */ 3351 fst->FSMarc_freelist = 0; 3352 fst->num_arcs = 0; 3353 for (i = 0; i < (unsigned)fst->FSMarc_list_len - 1; i++) 3354 { 3355 fst->FSMarc_list[i].linkl_next_arc = ARC_ItoX(i + 1); 3356 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE; 3357 } 3358 fst->FSMarc_list[i].linkl_next_arc = FSMARC_NULL; 3359 fst->FSMarc_list[i].linkl_prev_arc = FSMARC_FREE; 3360 3361 /* initialize the nodes, 'cuz reading is random order */ 3362 for (i = 0; i < max_num_FSMnodes; i++) 3363 { 3364 fr_node = &fst->FSMnode_list[i]; 3365 fr_node->un_ptr.first_next_arc = fr_node->first_prev_arc = FSMARC_NULL; 3366 } 3367 3368 /* 1. first load up all the information */ 3369 num_arcs = 0; 3370 for (i = 0; i < max_num_FSMarcs || !seenFinalNode; i++) 3371 { 3372 if (i > max_num_FSMarcs && !seenFinalNode) 3373 { 3374 PLogError("Final node never encountered"); 3375 rc = ESR_INVALID_STATE; 3376 goto CLEANUP; 3377 } 3378 nfields = 5; 3379 if (pfread(tmp,sizeof(tmp[0]),nfields,fp) != nfields) 3380 { 3381 PLogError("reading arc"); 3382 rc = ESR_INVALID_STATE; 3383 goto CLEANUP; 3384 } 3385 3386 if (tmp[1] == MAXnodeID) 3387 { 3388 seenFinalNode = ESR_TRUE; 3389 if(fst->end_node != tmp[0]) { 3390 PLogError("error with arc %d->%d ilabel %d olabel %d cost %d\n", tmp[0], tmp[1], tmp[2], tmp[3], tmp[4]); 3391 ASSERT(fst->end_node == tmp[0]); 3392 } 3393 i--; 3394 } 3395 else 3396 { 3397 new_arc_id = fst_get_free_arc(fst); 3398 if (new_arc_id == MAXarcID) 3399 return FST_FAILED_ON_MEMORY; 3400 atok = ARC_ItoX(new_arc_id); 3401 atoken = ARC_XtoP(atok); 3402 num_arcs++; 3403 3404 ASSERT(tmp[0] < fst->num_nodes); 3405 ASSERT(tmp[1] < fst->num_nodes); 3406 fr_node = &fst->FSMnode_list[ tmp[0]]; 3407 to_node = &fst->FSMnode_list[ tmp[1]]; 3408 atoken->ilabel = tmp[2]; 3409 atoken->olabel = tmp[3]; 3410 atoken->cost = tmp[4]; 3411 append_arc_leaving_node(fst, fr_node, atok); 3412 append_arc_arriving_node(fst, to_node, atok); 3413 atoken->fr_node = NODE_ItoX(tmp[0]); 3414 atoken->to_node = NODE_ItoX(tmp[1]); 3415 } 3416 } 3417 ASSERT(fst->num_arcs == num_arcs); 3418 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3419 PLogMessage("G2G done RD arcs %d", pftell(fp)); 3420 #endif 3421 3422 /* setup the node freelist */ 3423 if (fst->num_nodes < fst->FSMnode_list_len) 3424 { 3425 fst->FSMnode_freelist = fst->num_nodes; 3426 for (i = fst->num_nodes; i < (unsigned)fst->FSMnode_list_len - 1; i++) 3427 { 3428 fst->FSMnode_list[i].un_ptr.next_node = NODE_ItoX(i + 1); 3429 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE; 3430 } 3431 fst->FSMnode_list[i].un_ptr.next_node = FSMNODE_NULL; 3432 fst->FSMnode_list[i].first_prev_arc = FSMARC_FREE; 3433 } 3434 else 3435 { 3436 fst->FSMnode_freelist = MAXnodeID; 3437 } 3438 3439 /* node info .. will now be calculated on line */ 3440 /* wrapup_cost .. will now be calculated on line */ 3441 fst->whether_prepared = 0; 3442 fst_fill_node_info(fst); 3443 for (i = 0; i < fst->num_nodes; i++) 3444 fst->FSMnode_info_list[i] = NODE_INFO_UNKNOWN; 3445 3446 /* exit points for slots */ 3447 if (pfread(&fst->num_fsm_exit_points, 2, 1, fp) != 1) 3448 return ESR_READ_ERROR; 3449 p = fst->fsm_exit_points; 3450 q = p + MAX_NUM_SLOTS; 3451 while (p < q) 3452 { 3453 if (pfread(&p->from_node_index, 2, 1, fp) != 1) 3454 return ESR_WRITE_ERROR; 3455 if (pfread(&p->arc_index, 2, 1, fp) != 1) 3456 return ESR_WRITE_ERROR; 3457 if (pfread(&p->wbto_node_index, 2, 1, fp) != 1) 3458 return ESR_WRITE_ERROR; 3459 ++p; 3460 } 3461 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3462 PLogMessage("G2G done RD exits %d", pftell(fp)); 3463 #endif 3464 3465 /* no entry points stuff */ 3466 3467 /* ilabels were not saved, create them as empty */ 3468 fst->ilabels = (wordmap*)CALLOC_CLR(1, sizeof(wordmap), "srec.graph.imap"); 3469 fst->ilabels->num_words = fst->ilabels->max_words = 0; 3470 fst->ilabels->words = 0; 3471 3472 /* now save the olabels */ 3473 esr_rc = deserializeWordMapV2(&fst->olabels, fp); 3474 if (esr_rc != ESR_SUCCESS) 3475 { 3476 PLogError("deserializeWordMapV2() failed\n"); 3477 rc = FST_FAILED_INTERNAL; 3478 goto CLEANUP; 3479 } 3480 #ifdef SREC_ENGINE_VERBOSE_LOGGING 3481 PLogMessage("G2G done RD wmap %d", pftell(fp)); 3482 #endif 3483 3484 /* arctokeninfo */ 3485 if (fst->grmtyp != GrammarTypeItemList) 3486 { 3487 if (deserializeArcTokenInfo(fst, fp) != ESR_SUCCESS) 3488 { 3489 PLogError("FST_DumpContextAsImage: could not write arc token data.\n"); 3490 rc = FST_FAILED_INTERNAL; 3491 goto CLEANUP; 3492 } 3493 } 3494 else 3495 { 3496 /* here we just construct an isolated list, but we should scrap it later */ 3497 wordID wdid; 3498 arc_token *atl, *final, *atli; 3499 fst->arc_token_list_len = fst->olabels->num_words + 2 - NUM_ITEMLIST_HDRWDS; 3500 atl = NEW_ARRAY(arc_token, fst->arc_token_list_len, L("srec.graph.wordgraph")); 3501 if(!atl) { 3502 PLogError("fst->arc_token_list_len failed\n"); 3503 goto CLEANUP; 3504 } 3505 atl->first_next_arc = ARC_TOKEN_LNK(atl, 1); 3506 atl->next_token_index = ARC_TOKEN_NULL; 3507 final = atl + fst->arc_token_list_len - 1; 3508 for (i = 1, wdid = NUM_ITEMLIST_HDRWDS; wdid < fst->olabels->num_words; i++, wdid++) 3509 { 3510 atli = &atl[i]; 3511 atli->ilabel = wdid; 3512 atli->olabel = wdid;/*not used*/ 3513 atli->next_token_index = ARC_TOKEN_LNK(atl, (i + 1)); 3514 atli->first_next_arc = ARC_TOKEN_LNK(atl, (fst->arc_token_list_len - 1)); /*final*/ 3515 } 3516 ASSERT(atl + i == final); 3517 final->next_token_index = ARC_TOKEN_NULL; 3518 final->first_next_arc = ARC_TOKEN_NULL; 3519 final->ilabel = final->olabel = 0; 3520 fst->arc_token_list = atl; 3521 } 3522 return FST_SUCCESS; 3523 CLEANUP: 3524 return FST_FAILED_INTERNAL; 3525 } 3526 3527 3528 int FST_LoadContextFromImageV2(srec_context* context, PFile* fp); 3529 3530 int FST_LoadContextFromImage(srec_context** pcontext, PFile* fp) 3531 { 3532 srec_context* context = NULL; 3533 /* used to force correct data alignment */ 3534 asr_int32_t header[2]; 3535 int rc = FST_SUCCESS; 3536 3537 /*printf("FST_LoadContexFromImage\n");*/ 3538 if (!fp) 3539 { 3540 PLogError("FST_LoadContextImage() failed; bad file pointer\n"); 3541 return FST_FAILED_ON_INVALID_ARGS; 3542 } 3543 3544 context = NEW(srec_context, L("srec.graph.binary")); 3545 if (context == NULL) 3546 { 3547 PLogError("FST_LoadContextFromImage: out of memory while allocating context.\n"); 3548 return FST_FAILED_ON_MEMORY; 3549 } 3550 memset(context, 0, sizeof(srec_context)); 3551 3552 /* header */ 3553 if (pfread(header, 4, 2, fp) != 2) 3554 { 3555 PLogError("FST_LoadContextFromImage: failed reading header.\n"); 3556 rc = FST_FAILED_ON_INVALID_ARGS; 3557 goto CLEANUP; 3558 } 3559 3560 if (header[1] == IMAGE_FORMAT_V2) 3561 { 3562 rc = FST_LoadContextFromImageV2(context, fp); 3563 if (rc != FST_SUCCESS) 3564 goto CLEANUP; 3565 } 3566 else 3567 { 3568 PLogError("FST_LoadContextFromImage() failed on image_format\n"); 3569 goto CLEANUP; 3570 } 3571 *pcontext = context; 3572 return FST_SUCCESS; 3573 CLEANUP: 3574 if (context) FREE(context); 3575 *pcontext = 0; 3576 return rc; 3577 } 3578 3579 3580 int FST_DumpReverseWordGraph(srec_context* context, PFile* fp) 3581 { 3582 /* not implemented, use FST_DumpSyntaxAsImage() for now */ 3583 return 1; 3584 } 3585 3586 /* this belongs part of the srec_context */ 3587 typedef struct 3588 { 3589 asr_int32_t image_format; 3590 asr_int32_t image_size; 3591 asr_int32_t sizes_signature; 3592 } 3593 context_image_header; 3594 3595 3596 int FST_LoadContextFromStreamImage(srec_context** pcontext, char* buffer, asr_int32_t buffer_len) 3597 { 3598 return FST_FAILED_INTERNAL; 3599 } 3600 3601 typedef struct nodeID_list_t 3602 { 3603 nodeID *nodes, num_nodes, num_alloc; 3604 } 3605 nodeID_list; 3606 3607 int fst_node_has_speech_to_come(srec_context* context, nodeID node_index) 3608 { 3609 /* recursive func ... try to keep stack use low! */ 3610 /* later we should cache this information: 3611 need 2bits: endn opte regl dunno */ 3612 FSMarc* arc; 3613 arcID arc_index = context->FSMnode_list[ node_index].un_ptr.first_next_arc; 3614 for (; arc_index != MAXarcID; arc_index = arc->linkl_next_arc) 3615 { 3616 arc = &context->FSMarc_list[arc_index]; 3617 if (arc->ilabel >= context->hmm_ilabel_offset + EPSILON_OFFSET + NUM_SILENCE_HMMS) 3618 { 3619 return 1; 3620 } 3621 else if (arc->ilabel < EPSILON_OFFSET) 3622 { 3623 if (fst_node_has_speech_to_come(context, arc->to_node)) 3624 { 3625 return 1; 3626 } 3627 } 3628 else if (IS_SILENCE_ILABEL(arc->ilabel, context)) 3629 { 3630 /* another silence! */ 3631 if (fst_node_has_speech_to_come(context, arc->to_node)) 3632 return 1; 3633 } 3634 } 3635 return 0; 3636 } 3637 3638 int fst_fill_node_info(srec_context* context) 3639 { 3640 char* infos = context->FSMnode_info_list; 3641 nodeID_list optends_obj; 3642 nodeID_list *optends = &optends_obj; 3643 3644 FSMnode* node; 3645 FSMarc* arc; 3646 nodeID i, j, node_index; 3647 arcID arc_index; 3648 costdata wrapup_cost; 3649 int wrapup_cost_inconsistencies; 3650 /* int num_near_end_nodes; nodes within EPS or silence to the end, these 3651 are called NODE_INFO_ENDNODES, so that they can be excluded from the 3652 comparison with end_node score in the eos detection */ 3653 3654 optends->num_nodes = 1; 3655 optends->num_alloc = 8192; /* scaled up from 512 to support dynamic grammar add word of 5000 */ 3656 optends->nodes = (nodeID*)CALLOC(optends->num_alloc, sizeof(nodeID), "srec.tmp.optendnodes"); 3657 optends->nodes[0] = context->end_node; 3658 /* num_near_end_nodes = 0; */ 3659 3660 for (i = 0; i < optends->num_nodes; i++) 3661 { 3662 node_index = optends->nodes[i]; 3663 node = &context->FSMnode_list[ node_index]; 3664 for (arc_index = node->first_prev_arc; arc_index != MAXarcID; 3665 arc_index = arc->linkl_prev_arc) 3666 { 3667 arc = &context->FSMarc_list[arc_index]; 3668 if (arc->fr_node != node_index) 3669 { 3670 3671 if (IS_SILENCE_ILABEL(arc->ilabel, context) || arc->ilabel < EPSILON_OFFSET) 3672 { 3673 /* ok, fr_node goes to the end via silence, check for dups */ 3674 for (j = 0; j < optends->num_nodes; j++) 3675 if (optends->nodes[j] == arc->fr_node) break; 3676 /* append it to the list */ 3677 if (j == optends->num_nodes) 3678 { 3679 if (optends->num_nodes < optends->num_alloc) 3680 optends->nodes[ optends->num_nodes++] = arc->fr_node; 3681 else 3682 { 3683 ASSERT(0 && "allocated too few optend nodes"); 3684 return 0; 3685 } 3686 } 3687 } 3688 } 3689 } 3690 } 3691 3692 /* now set the info array, default is regular */ 3693 for (i = 0; i < context->num_nodes; i++) 3694 infos[i] = NODE_INFO_REGULAR; 3695 /* also set the other nodes, we'll check whether nodes were added 3696 via these flags */ 3697 for (; i < context->FSMnode_list_len; i++) 3698 infos[i] = NODE_INFO_UNKNOWN; 3699 infos[ context->end_node] = NODE_INFO_ENDNODE; 3700 3701 /* get rid of direct to end nodes, etc */ 3702 for (i = 0, j = 0; i < optends->num_nodes; i++) 3703 { 3704 optends->nodes[j] = optends->nodes[i]; 3705 if (fst_node_has_speech_to_come(context, optends->nodes[i])) 3706 { 3707 j++; 3708 infos[ optends->nodes[i]] = NODE_INFO_OPTENDN; 3709 } 3710 else 3711 { 3712 infos[ optends->nodes[i]] = NODE_INFO_ENDNODE; 3713 /* num_near_end_nodes++; */ 3714 } 3715 } 3716 optends->num_nodes = j; 3717 3718 /* printf("get_oppend_nodes (%d)", optends->num_nodes); 3719 for(i=0; i<optends->num_nodes; i++) printf(" %d", optends->nodes[i]); 3720 printf("\n"); 3721 */ 3722 3723 FREE(optends->nodes); 3724 3725 /* find the wrapup cost */ 3726 node = &context->FSMnode_list[ context->end_node]; 3727 wrapup_cost = MAXcostdata; 3728 wrapup_cost_inconsistencies = 0; 3729 for (arc_index = node->first_prev_arc; arc_index != MAXarcID; 3730 arc_index = arc->linkl_prev_arc) 3731 { 3732 arc = &context->FSMarc_list[ arc_index]; 3733 if (IS_SILENCE_ILABEL(arc->ilabel, context) && 3734 arc->olabel == context->end_silence_word) 3735 { 3736 if (wrapup_cost == MAXcostdata) 3737 wrapup_cost = arc->cost; 3738 else if (context->wrapup_cost != arc->cost) 3739 { 3740 wrapup_cost = arc->cost; 3741 wrapup_cost_inconsistencies++; 3742 } 3743 } 3744 } 3745 #define MIN_WRAPUP_COST 100 3746 context->wtw_average = wrapup_cost; 3747 if (context->wtw_average > 200) 3748 context->wtw_average = 200; 3749 if (context->wrapup_cost < MIN_WRAPUP_COST) 3750 context->wrapup_cost = MIN_WRAPUP_COST; 3751 /*log_report("context->wrapup_cost %d (%d)\n", context->wrapup_cost, 3752 wrapup_cost_inconsistencies);*/ 3753 return 0; 3754 } 3755 3756 int fst_alloc_transit_points(srec_context* context) 3757 { 3758 arcID i; 3759 wordID num_slots = context->olabels->num_slots; 3760 wordID olabel; 3761 asr_int16_t nfxps = 0; 3762 FSMarc* arc; 3763 FSMnode* node; 3764 3765 context->num_fsm_exit_points = 0; 3766 /* slot 0 is invalid, it is the "eps", so num_slots==1 means no slots! */ 3767 if (num_slots == 1) 3768 return 0; 3769 3770 /* scan through to finds arc that carry rule references */ 3771 for (i = 0; i < context->num_arcs; i++) 3772 { 3773 olabel = context->FSMarc_list[i].olabel; 3774 /* (wordID)(olabel-1) < (num_slots-1) ?? might work */ 3775 if (olabel > 0 && olabel < num_slots) 3776 { 3777 context->FSMarc_list[i].cost = DISABLE_ARC_COST; 3778 if (nfxps >= MAX_NUM_SLOTS) 3779 { 3780 PLogError("error: too many fsm exit points in fsm, too many public rules referenced from here\n"); 3781 return 0; 3782 } 3783 context->fsm_exit_points[nfxps].arc_index = i; 3784 context->fsm_exit_points[nfxps].from_node_index = context->FSMarc_list[i].fr_node; 3785 /* skip over the trailing silence and .wb labels */ 3786 for (arc = &context->FSMarc_list[i]; arc->ilabel != WORD_BOUNDARY;) 3787 { 3788 node = &context->FSMnode_list[arc->to_node]; 3789 arc = &context->FSMarc_list[node->un_ptr.first_next_arc]; 3790 } 3791 context->fsm_exit_points[nfxps].wbto_node_index = arc->to_node; 3792 nfxps++; 3793 3794 } /* olabel<num_slots */ 3795 } /* i<num_arcs */ 3796 context->num_fsm_exit_points = nfxps; 3797 return 0; 3798 } 3799 3800 3801 /* 3802 cross-word modeling: 3803 for now we'll assume silence context on either side, later we'll create 3804 a graph with the right fan out on the edges of each slot. At that point we 3805 may have routines like ... if(is_dead_end(arc)) continue; 3806 3807 word removal: 3808 we'll use refcounters on arcs, we can search the whole graph for the 3809 olabel, then remove forward and remove backwards 3810 3811 word addition: 3812 glenville ^ hmm183 hmm222 hmm162 hmm246 hmm346 hmm191 hmm219 3813 3814 set imap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/model.map 3815 set omap=C:/users/dahan/speech2go/fst/enu_d2f_fray_g/namesnnumsSC.map 3816 wintel/debug/addword.exe -min 3817 $SWISDK/bin/fsmcompileD -t -i $imap -o $omap -F out.PCLG out.PCLG.txt 3818 $SWISDK/bin/fsmdrawD -i $imap -o $omap out.PCLG > out.PCLG.dot 3819 dotty out.PCLG.dot 3820 3821 need 3822 - a sanity check 3823 - remove word 3824 - multiple pronunciations handling 3825 - homonyms handling, other boundary conditions checking 3826 - measure the speed 3827 3828 alan_adams al~#ad}z 3829 alan_adams al~ad}z 3830 paul_adams p{l#ad}z 3831 3832 3833 3834 */ 3835