/*---------------------------------------------------------------------------* * grxmldoc.cpp * * * * Copyright 2007, 2008 Nuance Communciations, Inc. * * * * Licensed under the Apache License, Version 2.0 (the 'License'); * * you may not use this file except in compliance with the License. * * * * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, software * * distributed under the License is distributed on an 'AS IS' BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * * See the License for the specific language governing permissions and * * limitations under the License. * * * *---------------------------------------------------------------------------*/ #include <assert.h> #include <stdlib.h> #include <fstream> #include <sstream> #include <iostream> #include <algorithm> // for std::sort #include "tinyxml.h" #include "grph.h" // The word graph object and interface #include "sub_grph.h" // The sub-graph object and interface #include "hashmap.h" #include "grxmldoc.h" #include "ESR_Session.h" //#include "LCHAR.h" #define GRXML_DEBUG 0 #define MAX_PATH_NAME 512 #define FATAL_ERROR(x,y) { std::cout << (x) << std::endl; exit ((y)); } #define WARNING(x) std::cout << (x) << std::endl; #if GRXML_DEBUG //#define DEBUG_PRINT(x) // #define DEBUG_PRINT(x) std::cout << (x) << std::endl; #define PRINT_EXPRESSION(x) //#define PRINT_EXPRESSION(x) std::cout << (x) << std::endl; #else #define DEBUG_PRINT(x) // #define PRINT_EXPRESSION(x) // #endif using namespace std; #define CHECK_NOT_EMPTY(s, t) { if (s.empty()) \ { \ std::cout << "ERROR: Empty string of type " << t <<std::endl; \ } \ } int get_range(const std::string& s, int* minCnt, int* maxCnt) { std::string sval; size_t p1 =s.find("-"); if ( p1 !=string::npos ) { sval.assign( s, 0, p1 ); if(strspn(sval.c_str(),"0123456789")<1) return 1; *minCnt = atoi( sval.c_str() ); sval.assign( s, p1+1, s.size() ); *maxCnt = -1; // 0== any? // If max is given then use BeginCount otherwise use BeginItemRepeat if (!sval.empty() ) { if(strspn(sval.c_str(),"0123456789")<1) return 1; *maxCnt = atoi( sval.c_str() ); } return 0; } p1 = s.find("+"); if( p1 != string::npos) { sval.assign( s, 0, p1 ); if(strspn(sval.c_str(),"0123456789")<1) return 1; *minCnt = atoi( sval.c_str() ); *maxCnt = -1; return 0; } if(strspn(s.c_str(),"0123456789")<1) return 1; *minCnt = *maxCnt = atoi( s.c_str()); return 0; } GRXMLDoc::GRXMLDoc() { m_NodeKeyWords.insert(make_pair("grammar", NodeTypeGrammar)); m_NodeKeyWords.insert(make_pair("rule", NodeTypeRule)); m_NodeKeyWords.insert(make_pair("ruleref", NodeTypeRuleReference)); m_NodeKeyWords.insert(make_pair("one-of", NodeTypeOneOf)); m_NodeKeyWords.insert(make_pair("item", NodeTypeItem)); m_NodeKeyWords.insert(make_pair("tag", NodeTypeTag)); m_NodeKeyWords.insert(make_pair("count", NodeTypeCount)); m_NodeKeyWords.insert(make_pair("meta", NodeTypeMeta)); m_pGraph = 0; m_RuleAutoIndex = 0; m_TagAutoIndex = 0; m_LabelAutoIndex = 0; m_ExpandedRulesAutoIndex = 0; m_XMLFileName = "dummy.xml"; } GRXMLDoc::~GRXMLDoc() { deleteRules(); if (m_pGraph) { delete m_pGraph; } } bool GRXMLDoc::parseGrammar( XMLNode &node, std::string & xMLFileName ) { m_XMLFileName = xMLFileName; // Set up the internally defined rules, etc. initializeLists(); // The top level "document" node is given to this fn // Create the container for the word graph. if (m_pGraph) { delete m_pGraph; } m_pGraph = new Graph("XML grammar"); SubGraph *p_SubGraph; parseNode( node, p_SubGraph, 1 ); // NB Subgraph pointed to will change in recursive fn. if (findSubGraph( m_RootRule, p_SubGraph )) { m_pGraph->ExpandRules (p_SubGraph); p_SubGraph->RemoveInternalConnections (); //Print the root rule. //printSubgraph( *p_SubGraph ); } return true; } bool GRXMLDoc::parseNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level ) { // We will create a new subgraph for each rule node. // The "current" subgraph is substituted with the new subgraph for all ops on child nodes. // After processing child nodes the original subgraph is reinstated // for final operations in the endNode() fn. // Initial processing of the current node before processing children #if 0 && GRXML_DEBUG if(node.Type() == TiXmlNode::ELEMENT) node.ToElement()->Print( stdout, level); else if(node.Type() == TiXmlNode::DOCUMENT) node.ToDocument()->Print( stdout, level); else if(node.Type() == TiXmlNode::TEXT) node.ToText()->Print( stdout, level); else if(node.Type() == TiXmlNode::DECLARATION) node.ToDeclaration()->Print( stdout, level); else { const char* text = node.Value(); if(!text) text = "__NULL__"; printf("processing node type %d text %s\n", node.Type(), text); } #endif beginNode( node, p_SubGraph, level ); SubGraph *p_LocalSubGraph; p_LocalSubGraph = p_SubGraph; TiXmlNode* child; for( child = node.FirstChild(); child; child = child->NextSibling() ) { parseNode ( *child, p_SubGraph, level+1 ); } // Revert current node p_SubGraph = p_LocalSubGraph; // Finish processing current node endNode( node, p_SubGraph, level ); return true; } // parseNode bool GRXMLDoc::beginNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level ) { std::string name = node.Value(); DEBUG_PRINT("Element = " + name); // XMLNode::Type type = node.getType(); if ( node.Type() == TiXmlNode::TEXT) // isCData() { const char* cc_name = node.Parent()->Value(); std::string str_name(cc_name); DEBUG_PRINT (std::string("CDATA ") + name); DEBUG_PRINT (std::string("CDATA ") + str_name); processCDATA( node, p_SubGraph ); } else if ( node.Type()== TiXmlNode::ELEMENT /*isNode()*/ || node.NoChildren() /*isLeaf()*/) { //printNode(node, level); // Use enum value KEYWDPAIR::iterator pos; pos = m_NodeKeyWords.find( name ); KeywordValues nodeType = NodeTypeBadValue; if ( pos != m_NodeKeyWords.end() ) { nodeType = (*pos).second; DEBUG_PRINT("nodeType=" + nodeType); } else if(node.Type() == TiXmlNode::COMMENT) { return true; } else if(node.Type() == TiXmlNode::DECLARATION && name.length()==0) { return true; } else { FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT); } switch ( nodeType ) { case NodeTypeGrammar: { beginParseGrammarNode( node ); } break; case NodeTypeRule: { // NB This fn creates a new subgraph. beginParseRuleNode( node, p_SubGraph ); } break; case NodeTypeRuleReference: { // NB This fn creates a new subgraph. beginRuleRef( node, p_SubGraph ); } break; case NodeTypeOneOf: { beginOneOf( node, p_SubGraph ); } break; case NodeTypeItem: { beginItem( node, p_SubGraph ); } break; case NodeTypeTag: { beginTag( node, p_SubGraph ); } break; case NodeTypeCount: { beginCount( node, p_SubGraph ); } break; case NodeTypeMeta: { beginParseMetaNode( node ); } break; case NodeTypeBadValue: default: DEBUG_PRINT( "UNKNOWN node name: " + name ); break; }; // switch } //is a Node or Leaf else if ( node.Type() == TiXmlNode::TEXT) // isCData() { DEBUG_PRINT (std::string("CDATA ") + name); processCDATA( node, p_SubGraph ); } return true; } // beginNode() bool GRXMLDoc::endNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level ) { std::string name = node.Value(); //XMLNode::Type type = node.getType(); if ( node.Type()== TiXmlNode::ELEMENT /*isNode()*/ || node.NoChildren() ) { KEYWDPAIR::iterator pos; pos = m_NodeKeyWords.find( name ); KeywordValues nodeType = NodeTypeBadValue; if ( pos != m_NodeKeyWords.end() ) { nodeType = (*pos).second; } else if(node.Type() == TiXmlNode::COMMENT) { return true; } else if(node.Type() == TiXmlNode::DECLARATION && name.length()==0) { return true; } else if(node.Type() == TiXmlNode::TEXT) { } else { FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT ); } switch ( nodeType ) { case NodeTypeGrammar: { endParseGrammarNode( node ); } break; case NodeTypeRule: { endParseRuleNode( node, p_SubGraph ); } break; case NodeTypeRuleReference: { endRuleRef( node, p_SubGraph ); } break; case NodeTypeOneOf: { endOneOf( node, p_SubGraph ); } break; case NodeTypeItem: { endItem(node, p_SubGraph ); } break; case NodeTypeTag: { endTag( node, p_SubGraph ); } break; case NodeTypeCount: { endCount( node, p_SubGraph ); } break; case NodeTypeMeta: { endParseMetaNode( node ); } break; case NodeTypeBadValue: default: DEBUG_PRINT( "UNKNOWN node name: "); DEBUG_PRINT( name.c_str() ); //Extend the break; }; // switch } //isNode() or isLeaf() else { // Do nothing? } return true; } // endNode() bool GRXMLDoc::beginParseGrammarNode(XMLNode &node) { const char* attr; #define GETATTR(nAmE) ((attr=node.ToElement()->Attribute(nAmE))!=NULL) ? attr:"" m_XMLMode = GETATTR("mode"); m_XMLLanguage = GETATTR("xml:lang"); m_RootRule = GETATTR("root"); // The root rule name DEBUG_PRINT("Root rule = " + m_RootRule); m_XMLTagFormat = GETATTR("tag-format"); m_XMLVersion = GETATTR("version"); m_XMLBase = GETATTR("xml:base"); return true; } bool GRXMLDoc::beginParseMetaNode(XMLNode &node) { const char* attr; std::string meta_name = GETATTR("name"); std::string meta_value = GETATTR("content"); if(meta_name == "word_penalty") { m_MetaKeyValPairs.insert(meta_name,meta_value); // m_MetaKeyValPairs.print(); } else if(meta_name == "do_skip_interword_silence") { for(int j = 0; j<(int)meta_value.size(); j++){ meta_value[j] = tolower(meta_value[j]); //lower(); } if(meta_value!="true" && meta_value!="false") printf ("\nWarning: %s must be set to 'true' or 'false'; defaulting to 'false'\n", meta_name.c_str()); else m_MetaKeyValPairs.insert(meta_name,meta_value); } else if(meta_name == "userdict_name") { printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str()); } else { printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str()); } return true; } bool GRXMLDoc::endParseGrammarNode(XMLNode &node) { // End parse operations return true; } bool GRXMLDoc::beginParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph) { const char* attr; // Note: The subGraph may change if there are forward references. This // is fine as we revert to the previous one when finished parsing the current node. DEBUG_PRINT ( "---- Rule\n" ); std::string ruleName = GETATTR("id" ); std::string s_tag = GETATTR("tag" ); if( s_tag.length()>0) { FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1) } CHECK_NOT_EMPTY( ruleName, "id" ); // Rule name must be unique within scope of entire grammar. // Put rule on stack - for context m_RuleListStack.push( ruleName ); // Check whether a ruleref placeholder exists for this rule. int index; bool foundRule = findRuleIndex( ruleName, index ); if (foundRule) { // Rule is already declared; it must have been forward referenced // so swap the placeholder subgraph in. // NB subgraph and rule name are already known to lists. SubGraph *p_ExistingSubgraph; if ( findSubGraph( ruleName, p_ExistingSubgraph ) ) { p_SubGraph = p_ExistingSubgraph; } else { FATAL_ERROR("ERROR! Subgraph without rule name entry found!", -1); } } else { // Create a Word Graph node for each rule node SubGraph *newGraph; addRuleToList( ruleName, newGraph ); p_SubGraph = newGraph; } // Make a note of the scope or rules; public, etc - used in map file. findRuleIndex( ruleName, index ); std::string ruleScope = GETATTR("scope" ); if ( !ruleScope.empty() ) { m_RuleScope.insert(index, ruleScope); } // We must accommodate Rules that have CDATA without an <item> element. // We need to infer this element for all rules. m_pGraph->BeginItem( p_SubGraph ); PRINT_EXPRESSION( ruleName + " = { " ); return true; } // beginParseRuleNode() bool GRXMLDoc::endParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph ) { // The rule expression has been built as a subgraph and ID added to the rule list. // Finished editing subgraph DEBUG_PRINT ( "---- /Rule\n" ); //m_pGraph->EndRule(&p_SubGraph); // Tell the world //std::string ruleName = attr.get( "id" ); std::string ruleName = m_RuleListStack.top(); m_RuleListStack.pop(); //CHECK_NOT_EMPTY( ruleName, "id" ); // Must be unique rule name within scope of entire grammar. // Check whether a ruleref placeholder exists for this rule. m_pGraph->addSubGraph ( p_SubGraph ); // We must accommodate Rules that have CDATA without an <item> element. // We need to infer this element for all rules. m_pGraph->EndItem( p_SubGraph ); PRINT_EXPRESSION( " }\n" ); return true; } bool GRXMLDoc::processCDATA( XMLNode &node, SubGraph *&p_SubGraph ) { // Note the Item's CDATA // Strip leading and trailing whitespace const char* cc_name = node.Parent()->Value(); std::string str_name(cc_name); // = node.Parent()->ValueStr(); // getName // std::string name = node.Parent()->Value(); // getName //if ( name == "item" ) { if ( str_name != "tag" ) { const char* const whitespace = " \t\r\n\v\f"; std::string cdata = node.Value(); // getCData() std::string word; // Words are whitespace separated cdata.erase(0, cdata.find_first_not_of(whitespace) ); cdata.erase(cdata.find_last_not_of(whitespace) + 1); #if GRXML_DEBUG std::cout << "/--" << cdata << "--/\n"; #endif std::string::size_type begIdx, endIdx; //search beginning of the first word begIdx = cdata.find_first_not_of(whitespace); //while beginning of a word found while (begIdx != std::string::npos) { //search end of the actual word endIdx = cdata.find_first_of (whitespace, begIdx); if (endIdx == string::npos) { //end of word is end of line endIdx = cdata.length(); } word.clear(); // word.assign(cdata,begIdx,endIdx); word.append (cdata, begIdx, endIdx - begIdx); if ( !word.empty() ) { #if GRXML_DEBUG std::cout << " -->" << word << "<--\n"; #endif int index; // If a slot then take note of rule name if ( IsSlot( word ) ) { const char* xmlBasename; std::string ruleName = m_RuleListStack.top(); m_SlotList.insert(index, ruleName); xmlBasename = strrchr(m_XMLFileName.c_str(),'/'); xmlBasename = xmlBasename ? xmlBasename+1 : m_XMLFileName.c_str(); word = (std::string)xmlBasename + "." + ruleName + "@" + word; addLabelToList( word ); findLabelIndex( word, index ); } else { addLabelToList( word ); findLabelIndex( word, index ); } m_pGraph->AddLabel( p_SubGraph, index ); } begIdx = cdata.find_first_not_of (whitespace, endIdx); } } //tag else { // Do nothing with CDATA for elements that are not items. // In particular, do not strip whitespace from tag cdata. // However, CPPDOM appears to remove linefeeds. May need to tidy up. } return true; } // cdata bool GRXMLDoc::beginItem( XMLNode &node, SubGraph *&p_SubGraph ) { const char* attr; DEBUG_PRINT ("---- Item:\n"); // First check whethere there is a count/repeat std::string s = GETATTR("repeat" ); int minCnt=0,maxCnt=0; std::string s_tag = GETATTR("tag" ); if( s_tag.length()>0) { FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1) } if( s.length()>0 && get_range( s, &minCnt, &maxCnt) ) { FATAL_ERROR(std::string("error: while parsing range ") + s,1); } if ( !s.empty() ) { // RED FLAG: max should not be 0! A +ve number should have been given. if( maxCnt>0) { m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt ); } else { // NB: BeginItemRepeat can only use min of 0 or 1! m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1); } } else { m_pGraph->BeginItem( p_SubGraph ); } return true; } bool GRXMLDoc::endItem( XMLNode &node, SubGraph *&p_SubGraph ) { DEBUG_PRINT ( "---- /Item\n" ); // What TODO if no tag for an item? m_pGraph->EndItem( p_SubGraph ); return true; } bool GRXMLDoc::beginRuleRef( XMLNode &node, SubGraph *&p_SubGraph ) { // Extend word FST node with an entire FST subgraph. // Forward referencing of rules is supported. // NB Remove the leading # from the ruleref name! DEBUG_PRINT ( "---- Ruleref\n" ); const char* attr; std::string s_tag = GETATTR("tag" ); if( s_tag.length()>0) { FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1) } std::string s = GETATTR("uri" ); if (s.empty()) { // FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 ); } // Remove the #: int p1 = s.find("#"); if ( p1 !=0 ) { FATAL_ERROR( "ERROR! bad ruleref name: '" + s + "'" + ". Rule reference must start with a '#'. External references are not supported.", -1 ); } string ruleName; getRuleRefName( node, ruleName ); //std::string parentRuleName = m_RuleListStack.top(); //addRuleDependency( parentRuleName, ruleName ); int index; bool foundRule = findRuleIndex( ruleName, index ); if (!foundRule) { // Forward reference; create a placeholder subgraph ptr. //SubGraph *newGraph = new SubGraph( (char *) ruleName.c_str() ); // RED FLAG: Remember to check fwd ref rule was filled in at end. SubGraph *newGraph; addRuleToList( ruleName, newGraph ); findRuleIndex( ruleName, index ); } // We can now treat a forward-referenced graph as if it was defined. // We will add the subgraph when we have the tag - see endItem(). m_pGraph->BeginRule( p_SubGraph ); m_pGraph->AddRuleRef( p_SubGraph, index ); m_pGraph->EndRule( p_SubGraph ); return true; } bool GRXMLDoc::endRuleRef(XMLNode &grmNode, SubGraph *&p_SubGraph ) { DEBUG_PRINT ( "---- /Ruleref\n" ); // Does nothing // NB The tag is not under the ruleref element - it is in the current item element. // We now add the tag of the AddRuleRef as we see the tag element. See EndTag(). return true; } bool GRXMLDoc::beginOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph) { DEBUG_PRINT ( "----OneOf\n" ); m_pGraph->BeginOneOf (p_SubGraph); return true; } bool GRXMLDoc::endOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph) { DEBUG_PRINT ( "----/OneOf\n" ); m_pGraph->EndOneOf (p_SubGraph); return true; } bool GRXMLDoc::beginTag( XMLNode &node, SubGraph *&p_SubGraph ) { DEBUG_PRINT ("---- Tag\n"); std::string s = node.ToElement()->GetText(); // getCdata(); #if GRXML_DEBUG std::cout << s; // debug #endif // Store the semantic tag info. // NB Do not strip whitespace from tag cdata if ( !s.empty() ) { int index; addTagToList( s ); findTagIndex( s, index ); m_pGraph->AddTag ( p_SubGraph, index ); } return true; } bool GRXMLDoc::endTag( XMLNode &node, SubGraph *&p_SubGraph ) { DEBUG_PRINT ("---- /Tag\n"); return true; } bool GRXMLDoc::beginCount( XMLNode &node, SubGraph *&p_SubGraph ) { const char* attr; // Count of reps applies to the text elements in this count node DEBUG_PRINT ("---- Count\n"); // Get number attr std::string s = GETATTR("number"); std::string s_tag = GETATTR("tag" ); if( s_tag.length()>0) { FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1) } if (s.empty()) { return false; } // not in subgraph but in graph?! //graph.BeginCount(n); int minCnt=-1, maxCnt=-1; if( get_range( s, &minCnt, &maxCnt) ) { FATAL_ERROR(std::string("error: while parsing range ") + s,1); } if ( s.c_str() == std::string("optional") ) { m_pGraph->BeginOptional( p_SubGraph ); } else if ( minCnt>0 && maxCnt>0) { m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt ); } else if( minCnt>0 ) { m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1); } else { // m_pGraph->BeginOptional ( p_SubGraph ); } return true; } bool GRXMLDoc::endCount( XMLNode &node, SubGraph *&p_SubGraph ) { DEBUG_PRINT ("---- /Count\n"); m_pGraph->EndCount( p_SubGraph ); return true; } bool GRXMLDoc::endParseMetaNode(XMLNode &node) { // End parse operations return true; } void GRXMLDoc::printNode(XMLNode &node, int level) { std::string name = node.Value(); int type = node.Type(); std::string c_data; for(int i=0;i<level;i++) std::cout << " "; char c = ' '; switch(type) { case TiXmlNode::ELEMENT: // case XMLNode::xml_nt_node: // grammar, rule, one-of, item, count c = '+'; break; /* case TiXmlNode::TEXT: // case XMLNode::xml_nt_leaf: c = '-'; break; */ case TiXmlNode::DOCUMENT: // case XMLNode::xml_nt_document: c = '\\'; break; case TiXmlNode::TEXT: // case XMLNode::xml_nt_cdata: c = '#'; c_data = node.Value(); // getCdata(); break; case TiXmlNode::UNKNOWN: case TiXmlNode::COMMENT: case TiXmlNode::TYPECOUNT: case TiXmlNode::DECLARATION: default: std::cout << "Error: not sure what to do here" << std::endl; break; } if(node.Type() == TiXmlNode::TEXT) // isCData() std::cout << c << name.c_str() << "[" << c_data << "]" << std::endl; //Extend the tag hashtable else std::cout << c << name.c_str() << std::endl; if( node.Type() == TiXmlNode::ELEMENT) { for(TiXmlAttribute* attr=node.ToElement()->FirstAttribute(); attr; attr=attr->Next() ) { // guru: added output of attributes for (int i=0; i<level; i++) std::cout << " "; std::cout << " "; std::cout << attr->Name() << ": " << attr->Value() << std::endl; } } } /** Function: addRuleToList Extends list of SubGraphs with given subGraph and extends list of rule names too. TODO: Can we use one hash and use internal numeric index for rule IDs? */ bool GRXMLDoc::addRuleToList(std::string const & ruleName, SubGraph *&p_SubGraph) { int index; if ( findRuleIndex ( ruleName, index ) ) { FATAL_ERROR("ERROR! Rule name " + ruleName + " is already defined!", -1 ); } addLabelToList( m_XMLFileName + "@" + ruleName); findLabelIndex( m_XMLFileName + "@" + ruleName, index ); #if GRXML_DEBUG std::cout << "Rule " << ruleName << std::endl; #endif // Create the new subgraph and update lists m_RuleList.insert( ruleName, index ); p_SubGraph = new SubGraph( (char *) ruleName.c_str(), index ); bool success = m_SubgraphList.insert( ruleName, p_SubGraph ); if (!success) { FATAL_ERROR("ERROR! subgraph for " + ruleName + " is already defined!", -1 ); } #if ADD_BRACES addLabelToList( "{" ); std::stringstream ss; ss << "}(" << index << ")"; addLabelToList( ss.str()); #endif return success; } bool GRXMLDoc::deleteRules() { // Delete all allocated subgraphs. // The rule strings are part of the hashtables and get deleted by them. int index; SubGraph *p_SubGraph; std::string ruleName; while ( !m_RuleList.isEmpty() ) { m_RuleList.getFirst( &ruleName, &index ); m_RuleList.remove( ruleName ); if (m_SubgraphList.getValue( ruleName, &p_SubGraph ) ) { delete p_SubGraph; } else { FATAL_ERROR("No subgraph for rule " + ruleName + "! Mismatched rules and subgraph hashtables!", -1); } } m_SubgraphList.clear(); m_RuleList.clear(); m_LabelList.clear(); m_TagList.clear(); return true; } bool GRXMLDoc::findSubGraph(std::string & s, SubGraph *&p_SubGraph) { return m_SubgraphList.getValue(s, &p_SubGraph); } bool GRXMLDoc::findRule(int i, std::string &s ) { return m_RuleList.getIndex( i, &s ); } bool GRXMLDoc::findTag(int i, std::string &s ) { return m_TagList.getValue( i, &s ); } bool GRXMLDoc::findLabel(int i, std::string &s ) { return m_LabelList.getValue( i, &s ); } bool GRXMLDoc::findSubGraphIndex( SubGraph *p_SubGraph, std::string &s ) { return m_SubgraphList.getIndex( p_SubGraph, &s ); } bool GRXMLDoc::findRuleIndex( std::string s, int &i ) { return m_RuleList.getValue( s, &i ); } bool GRXMLDoc::findTagIndex( std::string s, int &i ) { return m_TagList.getIndex( s, &i ); } bool GRXMLDoc::findLabelIndex( std::string s, int &i ) { return m_LabelList.getIndex( s, &i ); } bool GRXMLDoc::findMeta(const std::string & sn, std::string &s) { return m_MetaKeyValPairs.getValue( sn, &s ); } bool GRXMLDoc::setMeta(const std::string & sn, const std::string &s) { std::string tmp; if(findMeta(sn,tmp)) m_MetaKeyValPairs.remove(sn); return m_MetaKeyValPairs.insert(sn,s); } bool GRXMLDoc::addTagToList( std::string const& s ) { bool success = true; // Make values unique int index; if ( !findTagIndex( s, index ) ) success = m_TagList.insert( m_TagAutoIndex++, s ); return success; } bool GRXMLDoc::addLabelToList( std::string const& s ) { // TODO: Labels should be unique. Change key. int index; bool bRes = m_LabelList.getIndex( s, &index ); if(bRes == true) { return false; // exists } bRes = m_LabelList.insert( m_LabelAutoIndex++, s ); return bRes; } void GRXMLDoc::printLists() { m_SubgraphList.print(); m_RuleList.print(); m_TagList.print(); m_LabelList.print(); } void GRXMLDoc::printSubgraphs() { SubGraph *p_SubGraph; std::string rule; int index; if ( m_RuleList.getFirst( &rule, &index) ) { if ( findSubGraph( rule, p_SubGraph ) ) { DEBUG_PRINT("============ Rule: " + rule + "============"); printSubgraph( *p_SubGraph ); while ( m_RuleList.getNext( &rule, &index) ) { if ( findSubGraph( rule, p_SubGraph ) ) { printSubgraph( *p_SubGraph ); } } } } } void GRXMLDoc::printSubgraph( SubGraph &p_SubGraph ) { p_SubGraph.PrintWithLabels( *this ); } bool GRXMLDoc::getRuleRefName(XMLNode &node, std::string &ruleName) { const char* attr; std::string s = GETATTR("uri" ); if (s.empty()) { FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 ); } // Remove the #: int p1 = s.find("#"); if ( p1 !=0 ) { FATAL_ERROR( "ERROR! bad ruleref name: '" + s + "'", -1 ); } ruleName.assign( s, 1, s.size() ); return true; } void GRXMLDoc::initializeLists() { m_SubgraphList.setName("Subgraphs"); m_RuleList.setName("Rules"); m_TagList.setName("Tags"); m_LabelList.setName("Labels"); /* Predefined rules. NB Labels are also created for each rule added. // The required order for these labels in the .map output file is: // 0 eps // next come slots // pau and pau2 // everything else // We will add all these now in case they are referenced and we will // reindex after we have parsed the grammar -- when we have the list // of slots. This re-indexing is for the output files .map and .P.txt. // */ addLabelToList( "eps" ); addLabelToList( "-pau-" ); addLabelToList( "-pau2-" ); } void GRXMLDoc::writeMapFile( std::string & fileName ) { // We need to re-index in order to put the labels in correct order: // 1. eps // 2. all slots // 3. all rules // 4. -pau- words // 5. remaining labels ofstream outfile; int index, origIndex; std::string label; std::string slotRuleName; std::string scope; // For rules HashMap<int,std::string> orderedList; int orderedIndex=0; // 1. eps orderedList.insert( orderedIndex++, "eps" ); // 2. slots if ( m_LabelList.getFirst( &origIndex, &label ) ) { if ( IsSlot( label ) ) { orderedList.insert( orderedIndex++, label ); } while (m_LabelList.getNext( &origIndex, &label ) ) { if ( IsSlot( label ) ) { orderedList.insert( orderedIndex++, label ); } } } // 3. Now rules, or anything with @ if ( m_LabelList.getFirst( &origIndex, &label ) ) { do { #if GRXML_DEBUG std::cout << label << " "<< label.find_first_of ("@") << std::endl; #endif if (!IsSlot(label) && label.find_first_of ("@") != string::npos) { #if GRXML_DEBUG std::cout << " Adding " << label << std::endl; #endif orderedList.insert( orderedIndex++, label ); } } while (m_LabelList.getNext( &origIndex, &label ) ); } // 4. pau orderedList.insert( orderedIndex++, "-pau-" ); orderedList.insert( orderedIndex++, "-pau2-" ); // 5. Remaining stuff. NB We depend upon the label not // being added twice. if ( m_LabelList.getFirst( &origIndex, &label ) ) { if ( !orderedList.getIndex( label, &index ) ) { orderedList.insert( orderedIndex++, label ); } while (m_LabelList.getNext( &origIndex, &label ) ) { if ( !orderedList.getIndex( label, &index ) ) { orderedList.insert( orderedIndex++, label ); } } } outfile.open ( fileName.c_str() ); bool bRes = orderedList.getFirst( &index, &label ); do { if(!bRes) break; // Look up scope using original index m_LabelList.getIndex( label, &origIndex ); if (m_RuleScope.getValue(origIndex, &scope) ) label = scope + ":" + label; outfile << label << " " << index << std::endl; bRes = orderedList.getNext( &index, &label ); } while(bRes); outfile.close(); } void GRXMLDoc::writeScriptFile( std::string & fileName ) { ofstream outfile; int index; std::string label; outfile.open ( fileName.c_str() ); if ( m_TagList.getFirst( &index, &label ) ) { outfile << index << " " << label << std::endl; } while (m_TagList.getNext( &index, &label ) ) { outfile << index << " " << label << std::endl; } outfile.close(); //m_LabelList.writeFile( fileName ); } void GRXMLDoc::writeParamsFile( std::string & fileName ) { std::string wtw; ofstream outfile; bool bRes; outfile.open(fileName.c_str()); std::string metaname = "word_penalty"; bRes = findMeta(metaname, wtw); if(bRes) outfile << metaname.c_str() << "\t=\t" << wtw.c_str() << std::endl; // outfile << "locale" << "\t=\t" << m_XMLLanguage << std::endl; outfile.close(); } void GRXMLDoc::writeGraphFiles( std::string& prefix, bool bDoWriteRecogGraphs) { SubGraph *p_SubGraph; SubGraph *p_SemGraph; std::string fileName; if ( !findSubGraph( m_RootRule, p_SubGraph ) ) { FATAL_ERROR ("ERROR: writeGraphFiles - no root rule "+ m_RootRule + " defined. No file created", -1 ); } // Create .P.txt printf ("\nCreating semantic graph file\n"); p_SemGraph = new SubGraph( (char *) "Main", -1); m_pGraph->BeginRule( p_SemGraph ); m_pGraph->AddRuleRef( p_SemGraph, p_SubGraph->getRuleId()); m_pGraph->EndRule( p_SemGraph ); m_pGraph->ExpandRules (p_SemGraph); p_SemGraph->RemoveInternalConnections (); p_SemGraph->AddTerminalConnections (); p_SemGraph->ReduceArcsByEquivalence(); p_SemGraph->RemoveUnreachedConnections (-1, -1); p_SemGraph->DeterminizeArcs(); p_SemGraph->RemoveUnreachedConnections (-1, -1); p_SemGraph->ReduceArcsByEquivalence(); p_SemGraph->RemoveUnreachedConnections (-1, -1); fileName = prefix + ".P.txt"; p_SemGraph->WriteForwardGraphWithSemantic( fileName, *this ); delete p_SemGraph; fileName = prefix + ".omap"; this->WriteOLabels(fileName); } void GRXMLDoc::sortLabels() { // We need to re-index in order to put the labels in correct order: int index=0, origIndex; std::string label; std::string slotRuleName; std::string scope; // For rules std::vector <std::string> orderedList; if ( m_LabelList.getFirst( &origIndex, &label ) ) { // Look up scope using original index orderedList.push_back( label ); while (m_LabelList.getNext( &origIndex, &label ) ) { orderedList.push_back( label ); } } std::sort(orderedList.begin(), orderedList.end() ); m_SortedLabelList.clear(); index=0; for (std::vector<std::string>::const_iterator citer = orderedList.begin(); citer != orderedList.end(); ++citer) { label = *citer; m_LabelList.getIndex( label, &origIndex ); m_SortedLabelList.insert( index, label ); index++; // std::cout <<"Sorted: " << index <<" " << label <<std::endl; } return; } bool GRXMLDoc::findSortedLabel(int i, std::string &s ) { if (m_SortedLabelList.isEmpty() ) { sortLabels(); // Create the sorted label list. } return m_SortedLabelList.getValue( i, &s ); } bool GRXMLDoc::findSortedLabelIndex( int i, int &sortedIndex ) { std::string s; if (m_SortedLabelList.isEmpty() ) { sortLabels(); // Create the sorted label list. } if ( m_LabelList.getValue( i, &s ) ) { if ( m_SortedLabelList.getIndex(s, &sortedIndex )) { return true; } } return false; } void GRXMLDoc::addOLabelToOList( std::string &s) { m_OutputPtxtLabels.insert( s, 0); } bool GRXMLDoc::WriteOLabels(const std::string& fileName) { HashMap<int,std::string> invMap; int count = 0; int max_script_label = 0; int scriptID = 0; std::map<std::string, int>::iterator iter; bool bFound; int tmp; std::string strIndex = "eps"; bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp); if(bFound) m_OutputPtxtLabels.remove(strIndex); m_OutputPtxtLabels.insert(strIndex, count); invMap.insert( count, strIndex); count++; strIndex = "{"; bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp); if(bFound) m_OutputPtxtLabels.remove(strIndex); m_OutputPtxtLabels.insert(strIndex, count); invMap.insert( count, strIndex); count++; iter = m_OutputPtxtLabels.begin(); for( ; iter!=m_OutputPtxtLabels.end(); iter++) { const char* label = iter->first.c_str(); if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN) && strspn(label+SCRIPT_LABEL_PREFIX_LEN,"0123456789")==strlen(label+SCRIPT_LABEL_PREFIX_LEN) ) { scriptID = atoi(label+SCRIPT_LABEL_PREFIX_LEN); if(max_script_label < scriptID) max_script_label = scriptID; }/* else if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)) { invMap.insert(count, iter->first); iter->second = count; count++; }*/ else if(!invMap.getIndex((iter->first), &tmp)){ invMap.insert(count, iter->first); iter->second = count; count++; } } cout << "found max_script_label " << max_script_label << endl; for(int j=0; j<=max_script_label; j++) { std::stringstream ss; ss << SCRIPT_LABEL_PREFIX << j; if(!invMap.getIndex( ss.str(), &tmp)) { invMap.insert( count++, ss.str()); } } std::ofstream outfile(fileName.c_str()); std::string outscript; if(!outfile) { FATAL_ERROR( "Error: opening the omap file for output", 1); WARNING( "Error: opening the omap file for output"); return 1; } for(int i=0; i<count; i++) { outscript = ""; invMap.getValue(i,&outscript); if(outscript.length() == 0) { cout << "error: internal error while making .omap " << i << endl; FATAL_ERROR("error",1); } outfile << outscript.c_str() << " " << i << std::endl; } outfile.close(); return 0; }