1 /*---------------------------------------------------------------------------* 2 * grxmldoc.cpp * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 #include <assert.h> 21 #include <stdlib.h> 22 #include <fstream> 23 #include <sstream> 24 #include <iostream> 25 #include <algorithm> // for std::sort 26 #include "tinyxml.h" 27 #include "grph.h" // The word graph object and interface 28 #include "sub_grph.h" // The sub-graph object and interface 29 #include "hashmap.h" 30 #include "grxmldoc.h" 31 #include "ESR_Session.h" 32 //#include "LCHAR.h" 33 34 #define GRXML_DEBUG 0 35 #define MAX_PATH_NAME 512 36 37 #define FATAL_ERROR(x,y) { std::cout << (x) << std::endl; exit ((y)); } 38 #define WARNING(x) std::cout << (x) << std::endl; 39 40 #if GRXML_DEBUG 41 //#define DEBUG_PRINT(x) // 42 #define DEBUG_PRINT(x) std::cout << (x) << std::endl; 43 #define PRINT_EXPRESSION(x) 44 //#define PRINT_EXPRESSION(x) std::cout << (x) << std::endl; 45 #else 46 #define DEBUG_PRINT(x) // 47 #define PRINT_EXPRESSION(x) // 48 49 #endif 50 51 using namespace std; 52 53 #define CHECK_NOT_EMPTY(s, t) { if (s.empty()) \ 54 { \ 55 std::cout << "ERROR: Empty string of type " << t <<std::endl; \ 56 } \ 57 } 58 59 int get_range(const std::string& s, int* minCnt, int* maxCnt) 60 { 61 std::string sval; 62 unsigned int p1 =s.find("-"); 63 if ( p1 !=string::npos ) { 64 sval.assign( s, 0, p1 ); 65 if(strspn(sval.c_str(),"0123456789")<1) return 1; 66 *minCnt = atoi( sval.c_str() ); 67 sval.assign( s, p1+1, s.size() ); 68 *maxCnt = -1; // 0== any? 69 // If max is given then use BeginCount otherwise use BeginItemRepeat 70 if (!sval.empty() ) { 71 if(strspn(sval.c_str(),"0123456789")<1) return 1; 72 *maxCnt = atoi( sval.c_str() ); 73 } 74 return 0; 75 } 76 p1 = s.find("+"); 77 if( p1 != string::npos) { 78 sval.assign( s, 0, p1 ); 79 if(strspn(sval.c_str(),"0123456789")<1) return 1; 80 *minCnt = atoi( sval.c_str() ); 81 *maxCnt = -1; 82 return 0; 83 } 84 if(strspn(s.c_str(),"0123456789")<1) return 1; 85 *minCnt = *maxCnt = atoi( s.c_str()); 86 return 0; 87 } 88 89 GRXMLDoc::GRXMLDoc() 90 { 91 m_NodeKeyWords.insert(make_pair("grammar", NodeTypeGrammar)); 92 m_NodeKeyWords.insert(make_pair("rule", NodeTypeRule)); 93 m_NodeKeyWords.insert(make_pair("ruleref", NodeTypeRuleReference)); 94 m_NodeKeyWords.insert(make_pair("one-of", NodeTypeOneOf)); 95 m_NodeKeyWords.insert(make_pair("item", NodeTypeItem)); 96 m_NodeKeyWords.insert(make_pair("tag", NodeTypeTag)); 97 m_NodeKeyWords.insert(make_pair("count", NodeTypeCount)); 98 m_NodeKeyWords.insert(make_pair("meta", NodeTypeMeta)); 99 m_pGraph = 0; 100 m_RuleAutoIndex = 0; 101 m_TagAutoIndex = 0; 102 m_LabelAutoIndex = 0; 103 m_ExpandedRulesAutoIndex = 0; 104 m_XMLFileName = "dummy.xml"; 105 } 106 107 108 GRXMLDoc::~GRXMLDoc() 109 { 110 deleteRules(); 111 if (m_pGraph) { 112 delete m_pGraph; 113 } 114 } 115 116 117 bool GRXMLDoc::parseGrammar( XMLNode &node, std::string & xMLFileName ) 118 { 119 m_XMLFileName = xMLFileName; 120 // Set up the internally defined rules, etc. 121 initializeLists(); 122 // The top level "document" node is given to this fn 123 // Create the container for the word graph. 124 if (m_pGraph) { 125 delete m_pGraph; 126 } 127 m_pGraph = new Graph("XML grammar"); 128 SubGraph *p_SubGraph; 129 130 parseNode( node, p_SubGraph, 1 ); // NB Subgraph pointed to will change in recursive fn. 131 132 if (findSubGraph( m_RootRule, p_SubGraph )) { 133 m_pGraph->ExpandRules (p_SubGraph); 134 p_SubGraph->RemoveInternalConnections (); 135 //Print the root rule. 136 //printSubgraph( *p_SubGraph ); 137 } 138 return true; 139 } 140 141 142 bool GRXMLDoc::parseNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level ) 143 { 144 // We will create a new subgraph for each rule node. 145 // The "current" subgraph is substituted with the new subgraph for all ops on child nodes. 146 // After processing child nodes the original subgraph is reinstated 147 // for final operations in the endNode() fn. 148 149 // Initial processing of the current node before processing children 150 #if 0 && GRXML_DEBUG 151 if(node.Type() == TiXmlNode::ELEMENT) 152 node.ToElement()->Print( stdout, level); 153 else if(node.Type() == TiXmlNode::DOCUMENT) 154 node.ToDocument()->Print( stdout, level); 155 else if(node.Type() == TiXmlNode::TEXT) 156 node.ToText()->Print( stdout, level); 157 else if(node.Type() == TiXmlNode::DECLARATION) 158 node.ToDeclaration()->Print( stdout, level); 159 else { 160 const char* text = node.Value(); 161 if(!text) text = "__NULL__"; 162 printf("processing node type %d text %s\n", node.Type(), text); 163 } 164 #endif 165 beginNode( node, p_SubGraph, level ); 166 167 SubGraph *p_LocalSubGraph; 168 p_LocalSubGraph = p_SubGraph; 169 TiXmlNode* child; 170 for( child = node.FirstChild(); child; child = child->NextSibling() ) 171 { 172 parseNode ( *child, p_SubGraph, level+1 ); 173 } 174 // Revert current node 175 p_SubGraph = p_LocalSubGraph; 176 177 // Finish processing current node 178 endNode( node, p_SubGraph, level ); 179 180 return true; 181 } // parseNode 182 183 184 bool GRXMLDoc::beginNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level ) 185 { 186 std::string name = node.Value(); 187 DEBUG_PRINT("Element = " + name); 188 189 // XMLNode::Type type = node.getType(); 190 if ( node.Type() == TiXmlNode::TEXT) // isCData() 191 { 192 const char* cc_name = node.Parent()->Value(); 193 std::string str_name(cc_name); 194 DEBUG_PRINT (std::string("CDATA ") + name); 195 DEBUG_PRINT (std::string("CDATA ") + str_name); 196 197 processCDATA( node, p_SubGraph ); 198 } 199 else if ( node.Type()== TiXmlNode::ELEMENT /*isNode()*/ || node.NoChildren() /*isLeaf()*/) 200 { 201 //printNode(node, level); 202 // Use enum value 203 KEYWDPAIR::iterator pos; 204 pos = m_NodeKeyWords.find( name ); 205 KeywordValues nodeType = NodeTypeBadValue; 206 if ( pos != m_NodeKeyWords.end() ) 207 { 208 nodeType = (*pos).second; 209 DEBUG_PRINT("nodeType=" + nodeType); 210 } else if(node.Type() == TiXmlNode::COMMENT) { 211 return true; 212 } else if(node.Type() == TiXmlNode::DECLARATION && name.length()==0) { 213 return true; 214 } else { 215 FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT); 216 } 217 218 switch ( nodeType ) 219 { 220 case NodeTypeGrammar: 221 { 222 beginParseGrammarNode( node ); 223 } 224 break; 225 case NodeTypeRule: 226 { 227 // NB This fn creates a new subgraph. 228 beginParseRuleNode( node, p_SubGraph ); 229 } 230 break; 231 case NodeTypeRuleReference: 232 { 233 // NB This fn creates a new subgraph. 234 beginRuleRef( node, p_SubGraph ); 235 } 236 break; 237 case NodeTypeOneOf: 238 { 239 beginOneOf( node, p_SubGraph ); 240 } 241 break; 242 case NodeTypeItem: 243 { 244 beginItem( node, p_SubGraph ); 245 } 246 break; 247 case NodeTypeTag: 248 { 249 beginTag( node, p_SubGraph ); 250 } 251 break; 252 case NodeTypeCount: 253 { 254 beginCount( node, p_SubGraph ); 255 } 256 break; 257 case NodeTypeMeta: 258 { 259 beginParseMetaNode( node ); 260 } 261 break; 262 case NodeTypeBadValue: 263 default: 264 DEBUG_PRINT( "UNKNOWN node name: " + name ); 265 break; 266 }; // switch 267 } //is a Node or Leaf 268 else if ( node.Type() == TiXmlNode::TEXT) // isCData() 269 { 270 DEBUG_PRINT (std::string("CDATA ") + name); 271 processCDATA( node, p_SubGraph ); 272 } 273 return true; 274 } // beginNode() 275 276 277 bool GRXMLDoc::endNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level ) 278 { 279 std::string name = node.Value(); 280 //XMLNode::Type type = node.getType(); 281 282 if ( node.Type()== TiXmlNode::ELEMENT /*isNode()*/ || node.NoChildren() ) 283 { 284 KEYWDPAIR::iterator pos; 285 pos = m_NodeKeyWords.find( name ); 286 KeywordValues nodeType = NodeTypeBadValue; 287 if ( pos != m_NodeKeyWords.end() ) 288 { 289 nodeType = (*pos).second; 290 } else if(node.Type() == TiXmlNode::COMMENT) { 291 return true; 292 } else if(node.Type() == TiXmlNode::DECLARATION && name.length()==0) { 293 return true; 294 } else if(node.Type() == TiXmlNode::TEXT) { 295 296 } else { 297 FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT ); 298 } 299 300 switch ( nodeType ) 301 { 302 case NodeTypeGrammar: 303 { 304 endParseGrammarNode( node ); 305 } 306 break; 307 case NodeTypeRule: 308 { 309 endParseRuleNode( node, p_SubGraph ); 310 } 311 break; 312 case NodeTypeRuleReference: 313 { 314 endRuleRef( node, p_SubGraph ); 315 } 316 break; 317 case NodeTypeOneOf: 318 { 319 endOneOf( node, p_SubGraph ); 320 } 321 break; 322 case NodeTypeItem: 323 { 324 endItem(node, p_SubGraph ); 325 } 326 break; 327 case NodeTypeTag: 328 { 329 endTag( node, p_SubGraph ); 330 } 331 break; 332 case NodeTypeCount: 333 { 334 endCount( node, p_SubGraph ); 335 } 336 break; 337 case NodeTypeMeta: 338 { 339 endParseMetaNode( node ); 340 } 341 break; 342 case NodeTypeBadValue: 343 default: 344 DEBUG_PRINT( "UNKNOWN node name: "); 345 DEBUG_PRINT( name.c_str() ); 346 //Extend the 347 break; 348 }; // switch 349 } //isNode() or isLeaf() 350 else 351 { 352 // Do nothing? 353 } 354 return true; 355 } // endNode() 356 357 358 bool GRXMLDoc::beginParseGrammarNode(XMLNode &node) 359 { 360 const char* attr; 361 #define GETATTR(nAmE) ((attr=node.ToElement()->Attribute(nAmE))!=NULL) ? attr:"" 362 m_XMLMode = GETATTR("mode"); 363 m_XMLLanguage = GETATTR("xml:lang"); 364 m_RootRule = GETATTR("root"); // The root rule name 365 366 DEBUG_PRINT("Root rule = " + m_RootRule); 367 368 m_XMLTagFormat = GETATTR("tag-format"); 369 m_XMLVersion = GETATTR("version"); 370 m_XMLBase = GETATTR("xml:base"); 371 return true; 372 } 373 374 bool GRXMLDoc::beginParseMetaNode(XMLNode &node) 375 { 376 const char* attr; 377 std::string meta_name = GETATTR("name"); 378 std::string meta_value = GETATTR("content"); 379 380 if(meta_name == "word_penalty") { 381 m_MetaKeyValPairs.insert(meta_name,meta_value); 382 // m_MetaKeyValPairs.print(); 383 } else if(meta_name == "do_skip_interword_silence") { 384 for(int j = 0; j<(int)meta_value.size(); j++){ 385 meta_value[j] = tolower(meta_value[j]); //lower(); 386 } 387 if(meta_value!="true" && meta_value!="false") 388 printf ("\nWarning: %s must be set to 'true' or 'false'; defaulting to 'false'\n", meta_name.c_str()); 389 else 390 m_MetaKeyValPairs.insert(meta_name,meta_value); 391 } else if(meta_name == "userdict_name") { 392 printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str()); 393 } else { 394 printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str()); 395 } 396 return true; 397 } 398 399 400 bool GRXMLDoc::endParseGrammarNode(XMLNode &node) 401 { 402 // End parse operations 403 return true; 404 } 405 406 407 bool GRXMLDoc::beginParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph) 408 { 409 const char* attr; 410 // Note: The subGraph may change if there are forward references. This 411 // is fine as we revert to the previous one when finished parsing the current node. 412 DEBUG_PRINT ( "---- Rule\n" ); 413 std::string ruleName = GETATTR("id" ); 414 std::string s_tag = GETATTR("tag" ); 415 if( s_tag.length()>0) { 416 FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1) 417 } 418 CHECK_NOT_EMPTY( ruleName, "id" ); 419 // Rule name must be unique within scope of entire grammar. 420 // Put rule on stack - for context 421 m_RuleListStack.push( ruleName ); 422 423 // Check whether a ruleref placeholder exists for this rule. 424 int index; 425 bool foundRule = findRuleIndex( ruleName, index ); 426 if (foundRule) { 427 // Rule is already declared; it must have been forward referenced 428 // so swap the placeholder subgraph in. 429 // NB subgraph and rule name are already known to lists. 430 SubGraph *p_ExistingSubgraph; 431 if ( findSubGraph( ruleName, p_ExistingSubgraph ) ) { 432 p_SubGraph = p_ExistingSubgraph; 433 } 434 else { 435 FATAL_ERROR("ERROR! Subgraph without rule name entry found!", -1); 436 } 437 } 438 else { 439 // Create a Word Graph node for each rule node 440 SubGraph *newGraph; 441 addRuleToList( ruleName, newGraph ); 442 p_SubGraph = newGraph; 443 } 444 445 // Make a note of the scope or rules; public, etc - used in map file. 446 findRuleIndex( ruleName, index ); 447 std::string ruleScope = GETATTR("scope" ); 448 if ( !ruleScope.empty() ) { 449 m_RuleScope.insert(index, ruleScope); 450 } 451 452 // We must accommodate Rules that have CDATA without an <item> element. 453 // We need to infer this element for all rules. 454 m_pGraph->BeginItem( p_SubGraph ); 455 456 PRINT_EXPRESSION( ruleName + " = { " ); 457 return true; 458 } // beginParseRuleNode() 459 460 461 bool GRXMLDoc::endParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph ) 462 { 463 // The rule expression has been built as a subgraph and ID added to the rule list. 464 // Finished editing subgraph 465 DEBUG_PRINT ( "---- /Rule\n" ); 466 //m_pGraph->EndRule(&p_SubGraph); 467 // Tell the world 468 //std::string ruleName = attr.get( "id" ); 469 std::string ruleName = m_RuleListStack.top(); 470 m_RuleListStack.pop(); 471 //CHECK_NOT_EMPTY( ruleName, "id" ); 472 // Must be unique rule name within scope of entire grammar. 473 // Check whether a ruleref placeholder exists for this rule. 474 m_pGraph->addSubGraph ( p_SubGraph ); 475 476 // We must accommodate Rules that have CDATA without an <item> element. 477 // We need to infer this element for all rules. 478 m_pGraph->EndItem( p_SubGraph ); 479 480 PRINT_EXPRESSION( " }\n" ); 481 return true; 482 } 483 484 bool GRXMLDoc::processCDATA( XMLNode &node, SubGraph *&p_SubGraph ) 485 { 486 // Note the Item's CDATA 487 // Strip leading and trailing whitespace 488 const char* cc_name = node.Parent()->Value(); 489 std::string str_name(cc_name); // = node.Parent()->ValueStr(); // getName 490 // std::string name = node.Parent()->Value(); // getName 491 //if ( name == "item" ) { 492 if ( str_name != "tag" ) { 493 494 const char* const whitespace = " \t\r\n\v\f"; 495 std::string cdata = node.Value(); // getCData() 496 std::string word; // Words are whitespace separated 497 498 cdata.erase(0, cdata.find_first_not_of(whitespace) ); 499 cdata.erase(cdata.find_last_not_of(whitespace) + 1); 500 #if GRXML_DEBUG 501 std::cout << "/--" << cdata << "--/\n"; 502 #endif 503 504 std::string::size_type begIdx, endIdx; 505 506 //search beginning of the first word 507 begIdx = cdata.find_first_not_of(whitespace); 508 509 //while beginning of a word found 510 while (begIdx != std::string::npos) { 511 //search end of the actual word 512 endIdx = cdata.find_first_of (whitespace, begIdx); 513 if (endIdx == string::npos) { 514 //end of word is end of line 515 endIdx = cdata.length(); 516 } 517 word.clear(); 518 // word.assign(cdata,begIdx,endIdx); 519 word.append (cdata, begIdx, endIdx - begIdx); 520 if ( !word.empty() ) 521 { 522 #if GRXML_DEBUG 523 std::cout << " -->" << word << "<--\n"; 524 #endif 525 int index; 526 // If a slot then take note of rule name 527 if ( IsSlot( word ) ) { 528 const char* xmlBasename; 529 std::string ruleName = m_RuleListStack.top(); 530 m_SlotList.insert(index, ruleName); 531 xmlBasename = strrchr(m_XMLFileName.c_str(),'/'); 532 xmlBasename = xmlBasename ? xmlBasename+1 : m_XMLFileName.c_str(); 533 word = (std::string)xmlBasename + "." + ruleName + "@" + word; 534 addLabelToList( word ); 535 findLabelIndex( word, index ); 536 } else { 537 addLabelToList( word ); 538 findLabelIndex( word, index ); 539 } 540 m_pGraph->AddLabel( p_SubGraph, index ); 541 } 542 begIdx = cdata.find_first_not_of (whitespace, endIdx); 543 544 } 545 } //tag 546 else { 547 // Do nothing with CDATA for elements that are not items. 548 // In particular, do not strip whitespace from tag cdata. 549 // However, CPPDOM appears to remove linefeeds. May need to tidy up. 550 551 } 552 return true; 553 } // cdata 554 555 bool GRXMLDoc::beginItem( XMLNode &node, SubGraph *&p_SubGraph ) 556 { 557 const char* attr; 558 DEBUG_PRINT ("---- Item:\n"); 559 // First check whethere there is a count/repeat 560 std::string s = GETATTR("repeat" ); 561 int minCnt=0,maxCnt=0; 562 std::string s_tag = GETATTR("tag" ); 563 if( s_tag.length()>0) { 564 FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1) 565 } 566 if( s.length()>0 && get_range( s, &minCnt, &maxCnt) ) { 567 FATAL_ERROR(std::string("error: while parsing range ") + s,1); 568 } 569 if ( !s.empty() ) { 570 // RED FLAG: max should not be 0! A +ve number should have been given. 571 if( maxCnt>0) { 572 m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt ); 573 } 574 else { 575 // NB: BeginItemRepeat can only use min of 0 or 1! 576 m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1); 577 } 578 } 579 else { 580 m_pGraph->BeginItem( p_SubGraph ); 581 } 582 return true; 583 } 584 585 586 bool GRXMLDoc::endItem( XMLNode &node, SubGraph *&p_SubGraph ) 587 { 588 DEBUG_PRINT ( "---- /Item\n" ); 589 590 // What TODO if no tag for an item? 591 592 m_pGraph->EndItem( p_SubGraph ); 593 return true; 594 } 595 596 597 bool GRXMLDoc::beginRuleRef( XMLNode &node, SubGraph *&p_SubGraph ) 598 { 599 // Extend word FST node with an entire FST subgraph. 600 // Forward referencing of rules is supported. 601 // NB Remove the leading # from the ruleref name! 602 DEBUG_PRINT ( "---- Ruleref\n" ); 603 604 const char* attr; 605 std::string s_tag = GETATTR("tag" ); 606 if( s_tag.length()>0) { 607 FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1) 608 } 609 std::string s = GETATTR("uri" ); 610 if (s.empty()) 611 { 612 // 613 FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 ); 614 } 615 // Remove the #: 616 int p1 = s.find("#"); 617 if ( p1 !=0 ) { 618 FATAL_ERROR( "ERROR! bad ruleref name: '" + s + "'" + ". Rule reference must start with a '#'. External references are not supported.", -1 ); 619 } 620 string ruleName; 621 getRuleRefName( node, ruleName ); 622 623 //std::string parentRuleName = m_RuleListStack.top(); 624 //addRuleDependency( parentRuleName, ruleName ); 625 626 int index; 627 bool foundRule = findRuleIndex( ruleName, index ); 628 if (!foundRule) { 629 // Forward reference; create a placeholder subgraph ptr. 630 //SubGraph *newGraph = new SubGraph( (char *) ruleName.c_str() ); 631 // RED FLAG: Remember to check fwd ref rule was filled in at end. 632 SubGraph *newGraph; 633 addRuleToList( ruleName, newGraph ); 634 findRuleIndex( ruleName, index ); 635 } 636 // We can now treat a forward-referenced graph as if it was defined. 637 // We will add the subgraph when we have the tag - see endItem(). 638 m_pGraph->BeginRule( p_SubGraph ); 639 m_pGraph->AddRuleRef( p_SubGraph, index ); 640 m_pGraph->EndRule( p_SubGraph ); 641 642 return true; 643 } 644 645 646 bool GRXMLDoc::endRuleRef(XMLNode &grmNode, SubGraph *&p_SubGraph ) 647 { 648 DEBUG_PRINT ( "---- /Ruleref\n" ); 649 // Does nothing 650 // NB The tag is not under the ruleref element - it is in the current item element. 651 // We now add the tag of the AddRuleRef as we see the tag element. See EndTag(). 652 653 return true; 654 } 655 656 657 bool GRXMLDoc::beginOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph) 658 { 659 DEBUG_PRINT ( "----OneOf\n" ); 660 m_pGraph->BeginOneOf (p_SubGraph); 661 return true; 662 } 663 664 665 bool GRXMLDoc::endOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph) 666 { 667 DEBUG_PRINT ( "----/OneOf\n" ); 668 m_pGraph->EndOneOf (p_SubGraph); 669 return true; 670 } 671 672 673 bool GRXMLDoc::beginTag( XMLNode &node, SubGraph *&p_SubGraph ) 674 { 675 DEBUG_PRINT ("---- Tag\n"); 676 std::string s = node.ToElement()->GetText(); // getCdata(); 677 #if GRXML_DEBUG 678 std::cout << s; // debug 679 #endif 680 // Store the semantic tag info. 681 // NB Do not strip whitespace from tag cdata 682 if ( !s.empty() ) 683 { 684 int index; 685 addTagToList( s ); 686 findTagIndex( s, index ); 687 m_pGraph->AddTag ( p_SubGraph, index ); 688 } 689 690 return true; 691 } 692 693 694 bool GRXMLDoc::endTag( XMLNode &node, SubGraph *&p_SubGraph ) 695 { 696 DEBUG_PRINT ("---- /Tag\n"); 697 return true; 698 } 699 700 701 bool GRXMLDoc::beginCount( XMLNode &node, SubGraph *&p_SubGraph ) 702 { 703 const char* attr; 704 // Count of reps applies to the text elements in this count node 705 DEBUG_PRINT ("---- Count\n"); 706 // Get number attr 707 std::string s = GETATTR("number"); 708 std::string s_tag = GETATTR("tag" ); 709 if( s_tag.length()>0) { 710 FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1) 711 } 712 if (s.empty()) { 713 return false; 714 } 715 // not in subgraph but in graph?! 716 //graph.BeginCount(n); 717 718 int minCnt=-1, maxCnt=-1; 719 if( get_range( s, &minCnt, &maxCnt) ) { 720 FATAL_ERROR(std::string("error: while parsing range ") + s,1); 721 } 722 if ( s.c_str() == std::string("optional") ) 723 { 724 m_pGraph->BeginOptional( p_SubGraph ); 725 } 726 else if ( minCnt>0 && maxCnt>0) 727 { 728 m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt ); 729 } 730 else if( minCnt>0 ) 731 { 732 m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1); 733 } 734 else { // 735 m_pGraph->BeginOptional ( p_SubGraph ); 736 } 737 738 return true; 739 } 740 741 742 bool GRXMLDoc::endCount( XMLNode &node, SubGraph *&p_SubGraph ) 743 { 744 DEBUG_PRINT ("---- /Count\n"); 745 m_pGraph->EndCount( p_SubGraph ); 746 return true; 747 } 748 749 bool GRXMLDoc::endParseMetaNode(XMLNode &node) 750 { 751 // End parse operations 752 return true; 753 } 754 755 void GRXMLDoc::printNode(XMLNode &node, int level) 756 { 757 std::string name = node.Value(); 758 int type = node.Type(); 759 std::string c_data; 760 761 for(int i=0;i<level;i++) std::cout << " "; 762 763 char c = ' '; 764 switch(type) 765 { 766 case TiXmlNode::ELEMENT: 767 // case XMLNode::xml_nt_node: // grammar, rule, one-of, item, count 768 c = '+'; 769 break; 770 /* case TiXmlNode::TEXT: 771 // case XMLNode::xml_nt_leaf: 772 c = '-'; 773 break; */ 774 case TiXmlNode::DOCUMENT: 775 // case XMLNode::xml_nt_document: 776 c = '\\'; 777 break; 778 case TiXmlNode::TEXT: 779 // case XMLNode::xml_nt_cdata: 780 c = '#'; 781 c_data = node.Value(); // getCdata(); 782 break; 783 case TiXmlNode::UNKNOWN: 784 case TiXmlNode::COMMENT: 785 case TiXmlNode::TYPECOUNT: 786 case TiXmlNode::DECLARATION: 787 default: 788 std::cout << "Error: not sure what to do here" << std::endl; 789 break; 790 } 791 if(node.Type() == TiXmlNode::TEXT) // isCData() 792 std::cout << c << name.c_str() << "[" << c_data << "]" << std::endl; 793 //Extend the tag hashtable 794 else 795 std::cout << c << name.c_str() << std::endl; 796 797 if( node.Type() == TiXmlNode::ELEMENT) { 798 799 for(TiXmlAttribute* attr=node.ToElement()->FirstAttribute(); 800 attr; attr=attr->Next() ) { 801 802 // guru: added output of attributes 803 for (int i=0; i<level; i++) 804 std::cout << " "; 805 std::cout << " "; 806 std::cout << attr->Name() << ": " << attr->Value() << std::endl; 807 } 808 } 809 810 } 811 812 /** Function: addRuleToList 813 Extends list of SubGraphs with given subGraph 814 and extends list of rule names too. 815 TODO: Can we use one hash and use internal numeric index for rule IDs? 816 */ 817 818 819 bool GRXMLDoc::addRuleToList(std::string const & ruleName, SubGraph *&p_SubGraph) 820 { 821 int index; 822 if ( findRuleIndex ( ruleName, index ) ) { 823 FATAL_ERROR("ERROR! Rule name " + ruleName + " is already defined!", -1 ); 824 } 825 826 addLabelToList( m_XMLFileName + "@" + ruleName); 827 findLabelIndex( m_XMLFileName + "@" + ruleName, index ); 828 #if GRXML_DEBUG 829 std::cout << "Rule " << ruleName << std::endl; 830 #endif 831 // Create the new subgraph and update lists 832 m_RuleList.insert( ruleName, index ); 833 p_SubGraph = new SubGraph( (char *) ruleName.c_str(), index ); 834 835 bool success = m_SubgraphList.insert( ruleName, p_SubGraph ); 836 if (!success) { 837 FATAL_ERROR("ERROR! subgraph for " + ruleName + " is already defined!", -1 ); 838 } 839 #if ADD_BRACES 840 addLabelToList( "{" ); 841 std::stringstream ss; 842 ss << "}(" << index << ")"; 843 addLabelToList( ss.str()); 844 #endif 845 return success; 846 } 847 848 849 bool GRXMLDoc::deleteRules() 850 { 851 // Delete all allocated subgraphs. 852 // The rule strings are part of the hashtables and get deleted by them. 853 int index; 854 SubGraph *p_SubGraph; 855 std::string ruleName; 856 while ( !m_RuleList.isEmpty() ) { 857 m_RuleList.getFirst( &ruleName, &index ); 858 m_RuleList.remove( ruleName ); 859 if (m_SubgraphList.getValue( ruleName, &p_SubGraph ) ) { 860 delete p_SubGraph; 861 } 862 else { 863 FATAL_ERROR("No subgraph for rule " + ruleName + "! Mismatched rules and subgraph hashtables!", -1); 864 } 865 } 866 m_SubgraphList.clear(); 867 m_RuleList.clear(); 868 m_LabelList.clear(); 869 m_TagList.clear(); 870 return true; 871 } 872 873 bool GRXMLDoc::findSubGraph(std::string & s, SubGraph *&p_SubGraph) 874 { 875 return m_SubgraphList.getValue(s, &p_SubGraph); 876 } 877 878 bool GRXMLDoc::findRule(int i, std::string &s ) 879 { 880 return m_RuleList.getIndex( i, &s ); 881 } 882 883 bool GRXMLDoc::findTag(int i, std::string &s ) 884 { 885 return m_TagList.getValue( i, &s ); 886 } 887 888 bool GRXMLDoc::findLabel(int i, std::string &s ) 889 { 890 return m_LabelList.getValue( i, &s ); 891 } 892 893 bool GRXMLDoc::findSubGraphIndex( SubGraph *p_SubGraph, std::string &s ) 894 { 895 return m_SubgraphList.getIndex( p_SubGraph, &s ); 896 } 897 898 bool GRXMLDoc::findRuleIndex( std::string s, int &i ) 899 { 900 return m_RuleList.getValue( s, &i ); 901 } 902 bool GRXMLDoc::findTagIndex( std::string s, int &i ) 903 { 904 return m_TagList.getIndex( s, &i ); 905 } 906 bool GRXMLDoc::findLabelIndex( std::string s, int &i ) 907 { 908 return m_LabelList.getIndex( s, &i ); 909 } 910 bool GRXMLDoc::findMeta(const std::string & sn, std::string &s) 911 { 912 return m_MetaKeyValPairs.getValue( sn, &s ); 913 } 914 bool GRXMLDoc::setMeta(const std::string & sn, const std::string &s) 915 { 916 std::string tmp; 917 if(findMeta(sn,tmp)) 918 m_MetaKeyValPairs.remove(sn); 919 return m_MetaKeyValPairs.insert(sn,s); 920 } 921 922 bool GRXMLDoc::addTagToList( std::string const& s ) 923 { 924 bool success = true; 925 // Make values unique 926 int index; 927 if ( !findTagIndex( s, index ) ) 928 success = m_TagList.insert( m_TagAutoIndex++, s ); 929 return success; 930 } 931 932 933 bool GRXMLDoc::addLabelToList( std::string const& s ) 934 { 935 // TODO: Labels should be unique. Change key. 936 int index; 937 bool bRes = m_LabelList.getIndex( s, &index ); 938 if(bRes == true) { 939 return false; // exists 940 } 941 bRes = m_LabelList.insert( m_LabelAutoIndex++, s ); 942 return bRes; 943 } 944 945 void GRXMLDoc::printLists() 946 { 947 m_SubgraphList.print(); 948 m_RuleList.print(); 949 m_TagList.print(); 950 m_LabelList.print(); 951 } 952 953 954 void GRXMLDoc::printSubgraphs() 955 { 956 SubGraph *p_SubGraph; 957 std::string rule; 958 int index; 959 if ( m_RuleList.getFirst( &rule, &index) ) { 960 if ( findSubGraph( rule, p_SubGraph ) ) { 961 DEBUG_PRINT("============ Rule: " + rule + "============"); 962 printSubgraph( *p_SubGraph ); 963 while ( m_RuleList.getNext( &rule, &index) ) { 964 if ( findSubGraph( rule, p_SubGraph ) ) { 965 printSubgraph( *p_SubGraph ); 966 } 967 } 968 } 969 } 970 } 971 972 973 void GRXMLDoc::printSubgraph( SubGraph &p_SubGraph ) 974 { 975 p_SubGraph.PrintWithLabels( *this ); 976 } 977 978 979 bool GRXMLDoc::getRuleRefName(XMLNode &node, std::string &ruleName) 980 { 981 const char* attr; 982 std::string s = GETATTR("uri" ); 983 if (s.empty()) { 984 FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 ); 985 } 986 // Remove the #: 987 int p1 = s.find("#"); 988 if ( p1 !=0 ) { 989 FATAL_ERROR( "ERROR! bad ruleref name: '" + s + "'", -1 ); 990 } 991 ruleName.assign( s, 1, s.size() ); 992 return true; 993 } 994 995 void GRXMLDoc::initializeLists() 996 { 997 m_SubgraphList.setName("Subgraphs"); 998 m_RuleList.setName("Rules"); 999 m_TagList.setName("Tags"); 1000 m_LabelList.setName("Labels"); 1001 1002 /* Predefined rules. NB Labels are also created for each rule added. 1003 // The required order for these labels in the .map output file is: 1004 // 0 eps 1005 // next come slots 1006 // pau and pau2 1007 // everything else 1008 // We will add all these now in case they are referenced and we will 1009 // reindex after we have parsed the grammar -- when we have the list 1010 // of slots. This re-indexing is for the output files .map and .P.txt. 1011 // 1012 */ 1013 addLabelToList( "eps" ); 1014 1015 addLabelToList( "-pau-" ); 1016 addLabelToList( "-pau2-" ); 1017 } 1018 1019 void GRXMLDoc::writeMapFile( std::string & fileName ) 1020 { 1021 // We need to re-index in order to put the labels in correct order: 1022 // 1. eps 1023 // 2. all slots 1024 // 3. all rules 1025 // 4. -pau- words 1026 // 5. remaining labels 1027 ofstream outfile; 1028 int index, origIndex; 1029 std::string label; 1030 std::string slotRuleName; 1031 std::string scope; // For rules 1032 HashMap<int,std::string> orderedList; 1033 int orderedIndex=0; 1034 // 1. eps 1035 orderedList.insert( orderedIndex++, "eps" ); 1036 1037 // 2. slots 1038 if ( m_LabelList.getFirst( &origIndex, &label ) ) { 1039 if ( IsSlot( label ) ) { 1040 orderedList.insert( orderedIndex++, label ); 1041 } 1042 while (m_LabelList.getNext( &origIndex, &label ) ) { 1043 if ( IsSlot( label ) ) { 1044 orderedList.insert( orderedIndex++, label ); 1045 } 1046 } 1047 } 1048 1049 // 3. Now rules, or anything with @ 1050 if ( m_LabelList.getFirst( &origIndex, &label ) ) { 1051 do { 1052 #if GRXML_DEBUG 1053 std::cout << label << " "<< label.find_first_of ("@") << std::endl; 1054 #endif 1055 if (!IsSlot(label) && label.find_first_of ("@") != string::npos) { 1056 #if GRXML_DEBUG 1057 std::cout << " Adding " << label << std::endl; 1058 #endif 1059 orderedList.insert( orderedIndex++, label ); 1060 } 1061 } while (m_LabelList.getNext( &origIndex, &label ) ); 1062 } 1063 1064 // 4. pau 1065 orderedList.insert( orderedIndex++, "-pau-" ); 1066 orderedList.insert( orderedIndex++, "-pau2-" ); 1067 1068 // 5. Remaining stuff. NB We depend upon the label not 1069 // being added twice. 1070 if ( m_LabelList.getFirst( &origIndex, &label ) ) { 1071 if ( !orderedList.getIndex( label, &index ) ) { 1072 orderedList.insert( orderedIndex++, label ); 1073 } 1074 while (m_LabelList.getNext( &origIndex, &label ) ) { 1075 if ( !orderedList.getIndex( label, &index ) ) { 1076 orderedList.insert( orderedIndex++, label ); 1077 } 1078 } 1079 } 1080 outfile.open ( fileName.c_str() ); 1081 1082 bool bRes = orderedList.getFirst( &index, &label ); 1083 do { 1084 if(!bRes) break; 1085 // Look up scope using original index 1086 m_LabelList.getIndex( label, &origIndex ); 1087 if (m_RuleScope.getValue(origIndex, &scope) ) 1088 label = scope + ":" + label; 1089 outfile << label << " " << index << std::endl; 1090 bRes = orderedList.getNext( &index, &label ); 1091 } while(bRes); 1092 1093 outfile.close(); 1094 } 1095 1096 1097 void GRXMLDoc::writeScriptFile( std::string & fileName ) 1098 { 1099 ofstream outfile; 1100 int index; 1101 std::string label; 1102 outfile.open ( fileName.c_str() ); 1103 if ( m_TagList.getFirst( &index, &label ) ) { 1104 outfile << index << " " << label << std::endl; 1105 } 1106 while (m_TagList.getNext( &index, &label ) ) { 1107 outfile << index << " " << label << std::endl; 1108 } 1109 outfile.close(); 1110 1111 //m_LabelList.writeFile( fileName ); 1112 } 1113 1114 void GRXMLDoc::writeParamsFile( std::string & fileName ) 1115 { 1116 std::string wtw; 1117 ofstream outfile; 1118 bool bRes; 1119 1120 outfile.open(fileName.c_str()); 1121 1122 std::string metaname = "word_penalty"; 1123 bRes = findMeta(metaname, wtw); 1124 if(bRes) 1125 outfile << metaname.c_str() << "\t=\t" << wtw.c_str() << std::endl; 1126 1127 // outfile << "locale" << "\t=\t" << m_XMLLanguage << std::endl; 1128 outfile.close(); 1129 } 1130 1131 void GRXMLDoc::writeGraphFiles( std::string& prefix, bool bDoWriteRecogGraphs) 1132 { 1133 SubGraph *p_SubGraph; 1134 SubGraph *p_SemGraph; 1135 std::string fileName; 1136 if ( !findSubGraph( m_RootRule, p_SubGraph ) ) { 1137 FATAL_ERROR ("ERROR: writeGraphFiles - no root rule "+ m_RootRule + " defined. No file created", -1 ); 1138 } 1139 1140 // Create .P.txt 1141 printf ("\nCreating semantic graph file\n"); 1142 p_SemGraph = new SubGraph( (char *) "Main", -1); 1143 m_pGraph->BeginRule( p_SemGraph ); 1144 m_pGraph->AddRuleRef( p_SemGraph, p_SubGraph->getRuleId()); 1145 m_pGraph->EndRule( p_SemGraph ); 1146 m_pGraph->ExpandRules (p_SemGraph); 1147 p_SemGraph->RemoveInternalConnections (); 1148 1149 p_SemGraph->AddTerminalConnections (); 1150 p_SemGraph->ReduceArcsByEquivalence(); 1151 p_SemGraph->RemoveUnreachedConnections (-1, -1); 1152 p_SemGraph->DeterminizeArcs(); 1153 p_SemGraph->RemoveUnreachedConnections (-1, -1); 1154 p_SemGraph->ReduceArcsByEquivalence(); 1155 p_SemGraph->RemoveUnreachedConnections (-1, -1); 1156 fileName = prefix + ".P.txt"; 1157 p_SemGraph->WriteForwardGraphWithSemantic( fileName, *this ); 1158 delete p_SemGraph; 1159 1160 fileName = prefix + ".omap"; 1161 this->WriteOLabels(fileName); 1162 } 1163 1164 void GRXMLDoc::sortLabels() 1165 { 1166 // We need to re-index in order to put the labels in correct order: 1167 int index=0, origIndex; 1168 std::string label; 1169 std::string slotRuleName; 1170 std::string scope; // For rules 1171 std::vector <std::string> orderedList; 1172 if ( m_LabelList.getFirst( &origIndex, &label ) ) { 1173 // Look up scope using original index 1174 orderedList.push_back( label ); 1175 while (m_LabelList.getNext( &origIndex, &label ) ) { 1176 orderedList.push_back( label ); 1177 } 1178 } 1179 std::sort(orderedList.begin(), orderedList.end() ); 1180 m_SortedLabelList.clear(); 1181 index=0; 1182 for (std::vector<std::string>::const_iterator citer = orderedList.begin(); 1183 citer != orderedList.end(); ++citer) { 1184 label = *citer; 1185 m_LabelList.getIndex( label, &origIndex ); 1186 m_SortedLabelList.insert( index, label ); 1187 index++; 1188 // std::cout <<"Sorted: " << index <<" " << label <<std::endl; 1189 } 1190 return; 1191 } 1192 1193 bool GRXMLDoc::findSortedLabel(int i, std::string &s ) 1194 { 1195 if (m_SortedLabelList.isEmpty() ) { 1196 sortLabels(); // Create the sorted label list. 1197 } 1198 return m_SortedLabelList.getValue( i, &s ); 1199 } 1200 1201 bool GRXMLDoc::findSortedLabelIndex( int i, int &sortedIndex ) 1202 { 1203 std::string s; 1204 if (m_SortedLabelList.isEmpty() ) { 1205 sortLabels(); // Create the sorted label list. 1206 } 1207 if ( m_LabelList.getValue( i, &s ) ) { 1208 if ( m_SortedLabelList.getIndex(s, &sortedIndex )) { 1209 return true; 1210 } 1211 } 1212 return false; 1213 } 1214 1215 void GRXMLDoc::addOLabelToOList( std::string &s) 1216 { 1217 m_OutputPtxtLabels.insert( s, 0); 1218 } 1219 1220 bool GRXMLDoc::WriteOLabels(const std::string& fileName) 1221 { 1222 HashMap<int,std::string> invMap; 1223 int count = 0; 1224 int max_script_label = 0; 1225 int scriptID = 0; 1226 std::map<std::string, int>::iterator iter; 1227 bool bFound; 1228 int tmp; 1229 1230 std::string strIndex = "eps"; 1231 bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp); 1232 if(bFound) 1233 m_OutputPtxtLabels.remove(strIndex); 1234 m_OutputPtxtLabels.insert(strIndex, count); 1235 invMap.insert( count, strIndex); 1236 count++; 1237 1238 strIndex = "{"; 1239 bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp); 1240 if(bFound) 1241 m_OutputPtxtLabels.remove(strIndex); 1242 m_OutputPtxtLabels.insert(strIndex, count); 1243 invMap.insert( count, strIndex); 1244 count++; 1245 1246 iter = m_OutputPtxtLabels.begin(); 1247 for( ; iter!=m_OutputPtxtLabels.end(); iter++) { 1248 const char* label = iter->first.c_str(); 1249 if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN) 1250 && strspn(label+SCRIPT_LABEL_PREFIX_LEN,"0123456789")==strlen(label+SCRIPT_LABEL_PREFIX_LEN) ) { 1251 scriptID = atoi(label+SCRIPT_LABEL_PREFIX_LEN); 1252 if(max_script_label < scriptID) 1253 max_script_label = scriptID; 1254 }/* else if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)) { 1255 invMap.insert(count, iter->first); 1256 iter->second = count; 1257 count++; 1258 }*/ 1259 else if(!invMap.getIndex((iter->first), &tmp)){ 1260 invMap.insert(count, iter->first); 1261 iter->second = count; 1262 count++; 1263 } 1264 } 1265 1266 cout << "found max_script_label " << max_script_label << endl; 1267 for(int j=0; j<=max_script_label; j++) { 1268 std::stringstream ss; 1269 ss << SCRIPT_LABEL_PREFIX << j; 1270 if(!invMap.getIndex( ss.str(), &tmp)) { 1271 invMap.insert( count++, ss.str()); 1272 } 1273 } 1274 1275 std::ofstream outfile(fileName.c_str()); 1276 std::string outscript; 1277 if(!outfile) { 1278 FATAL_ERROR( "Error: opening the omap file for output", 1); 1279 WARNING( "Error: opening the omap file for output"); 1280 return 1; 1281 } 1282 for(int i=0; i<count; i++) { 1283 outscript = ""; 1284 invMap.getValue(i,&outscript); 1285 if(outscript.length() == 0) { 1286 cout << "error: internal error while making .omap " << i << endl; 1287 FATAL_ERROR("error",1); 1288 } 1289 outfile << outscript.c_str() << " " << i << std::endl; 1290 } 1291 outfile.close(); 1292 return 0; 1293 } 1294