// C++ source  |  1294 lines  |  35.67 KB

/*---------------------------------------------------------------------------*
 *  grxmldoc.cpp  *
 *                                                                           *
 *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
 *                                                                           *
 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
 *  you may not use this file except in compliance with the License.         *
 *                                                                           *
 *  You may obtain a copy of the License at                                  *
 *      http://www.apache.org/licenses/LICENSE-2.0                           *
 *                                                                           *
 *  Unless required by applicable law or agreed to in writing, software      *
 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
 *  See the License for the specific language governing permissions and      *
 *  limitations under the License.                                           *
 *                                                                           *
 *---------------------------------------------------------------------------*/

#include <assert.h>
#include <stdlib.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <algorithm> // for std::sort
#include "tinyxml.h"
#include "grph.h"       // The word graph object and interface
#include "sub_grph.h"	// The sub-graph object and interface
#include "hashmap.h"
#include "grxmldoc.h"
#include "ESR_Session.h"
//#include "LCHAR.h"

// Set to 1 to enable DEBUG_PRINT tracing and the `#if GRXML_DEBUG` sections
// used throughout this file.
#define GRXML_DEBUG 0
// NOTE(review): not referenced in the part of the file reviewed here.
#define MAX_PATH_NAME 512

// Prints x to stdout and terminates the process with exit status y.
// Expands to a braced block, so call sites compile with or without a
// trailing ';'.
#define FATAL_ERROR(x,y) { std::cout << (x) << std::endl; exit ((y)); }
// Prints x to stdout. The expansion already ends in ';'.
#define WARNING(x) std::cout << (x) << std::endl;

#if GRXML_DEBUG
//#define DEBUG_PRINT(x) //
// Debug build: DEBUG_PRINT writes x to stdout; PRINT_EXPRESSION stays disabled.
#define DEBUG_PRINT(x) std::cout << (x) << std::endl;
#define PRINT_EXPRESSION(x)
//#define PRINT_EXPRESSION(x) std::cout << (x) << std::endl;
#else
// Non-debug build: both macros expand to nothing (the trailing // is a
// comment and is stripped before the macro body is recorded).
#define DEBUG_PRINT(x) //
#define PRINT_EXPRESSION(x) //

#endif

using namespace std;

// Prints an error line to stdout when string `s` is empty; `t` is a
// description of the value, used only in the message. This only reports the
// problem: execution continues after the macro.
#define CHECK_NOT_EMPTY(s, t) { if (s.empty()) \
				{ \
				std::cout << "ERROR: Empty string of type "  << t <<std::endl; \
				} \
			     }

// Returns true when `text` is a non-empty run of decimal digits only.
static bool is_unsigned_decimal(const std::string& text)
{
    return !text.empty()
        && text.find_first_not_of("0123456789") == std::string::npos;
}

/**
 * Parse a repeat/count specification.
 *
 * Accepted forms:
 *   "N-M"  -> *minCnt = N, *maxCnt = M
 *   "N-"   -> *minCnt = N, *maxCnt = -1 (no upper bound given)
 *   "N+"   -> *minCnt = N, *maxCnt = -1
 *   "N"    -> *minCnt = *maxCnt = N
 *
 * Every number must consist solely of decimal digits; anything else
 * (including trailing garbage such as "2x-5", "7z" or "1+x") is rejected.
 * Previously only the first character of each number was checked.
 *
 * @param s       the specification text
 * @param minCnt  receives the lower bound (written only on success)
 * @param maxCnt  receives the upper bound or -1 (written only on success)
 * @return 0 on success, 1 when the text is malformed
 */
int get_range(const std::string& s, int* minCnt, int* maxCnt)
{
    const std::string::size_type dash = s.find('-');
    if (dash != std::string::npos) {
        const std::string lower(s, 0, dash);
        const std::string upper(s, dash + 1);
        if (!is_unsigned_decimal(lower)) return 1;
        // An empty upper bound is allowed and means "unbounded".
        if (!upper.empty() && !is_unsigned_decimal(upper)) return 1;
        *minCnt = atoi(lower.c_str());
        *maxCnt = upper.empty() ? -1 : atoi(upper.c_str());
        return 0;
    }

    const std::string::size_type plus = s.find('+');
    if (plus != std::string::npos) {
        const std::string lower(s, 0, plus);
        // The '+' must terminate the specification.
        if (plus + 1 != s.size() || !is_unsigned_decimal(lower)) return 1;
        *minCnt = atoi(lower.c_str());
        *maxCnt = -1;
        return 0;
    }

    if (!is_unsigned_decimal(s)) return 1;
    *minCnt = *maxCnt = atoi(s.c_str());
    return 0;
}

/**
 * Build an empty document: register the element names the parser
 * understands and reset all counters. No graph exists until
 * parseGrammar() is called.
 */
GRXMLDoc::GRXMLDoc()
{
    // Element name -> node type table consulted by beginNode()/endNode().
    static const struct {
        const char    *name;
        KeywordValues  type;
    } kKeywords[] = {
        { "grammar", NodeTypeGrammar       },
        { "rule",    NodeTypeRule          },
        { "ruleref", NodeTypeRuleReference },
        { "one-of",  NodeTypeOneOf         },
        { "item",    NodeTypeItem          },
        { "tag",     NodeTypeTag           },
        { "count",   NodeTypeCount         },
        { "meta",    NodeTypeMeta          },
    };
    const size_t keywordCount = sizeof(kKeywords) / sizeof(kKeywords[0]);
    for (size_t k = 0; k < keywordCount; ++k) {
        m_NodeKeyWords.insert(std::make_pair(kKeywords[k].name, kKeywords[k].type));
    }

    m_XMLFileName = "dummy.xml";
    m_pGraph = 0;

    // Auto-increment counters all start from zero.
    m_RuleAutoIndex = 0;
    m_TagAutoIndex = 0;
    m_LabelAutoIndex = 0;
    m_ExpandedRulesAutoIndex = 0;
}


/** Release every rule subgraph and then the word graph itself. */
GRXMLDoc::~GRXMLDoc()
{
    deleteRules();
    // Deleting a null pointer is a no-op, so no guard is required.
    delete m_pGraph;
}


/**
 * Parse a whole grammar document into the word graph.
 *
 * @param node         top-level document node of the parsed XML
 * @param xMLFileName  name of the XML file; remembered and used when
 *                     building rule and slot labels
 * @return always true
 *
 * Any graph left over from a previous call is discarded. After the tree
 * walk, if a subgraph exists for the root rule named by the <grammar>
 * element, its rule references are expanded and its internal connections
 * removed. A missing root rule is not reported here.
 */
bool GRXMLDoc::parseGrammar( XMLNode &node, std::string & xMLFileName )
{
    m_XMLFileName = xMLFileName;
    // Set up the internally defined labels, etc.
    initializeLists();

    // Create a fresh container for the word graph.
    delete m_pGraph;
    m_pGraph = new Graph("XML grammar");

    // Start from a well-defined "no current subgraph" value: parseNode()
    // copies this pointer before any rule node has assigned it, and reading
    // an uninitialised pointer is undefined behaviour.
    SubGraph *p_SubGraph = NULL;

    parseNode( node, p_SubGraph, 1 );     // NB Subgraph pointed to will change in recursive fn.

    if (findSubGraph( m_RootRule, p_SubGraph )) {
        m_pGraph->ExpandRules (p_SubGraph);
        p_SubGraph->RemoveInternalConnections ();
    }
    return true;
}


/**
 * Recursively walk the XML tree rooted at `node`.
 *
 * beginNode() is called first and may replace p_SubGraph (a rule node
 * switches to its own subgraph). The value left by beginNode() is
 * remembered, the children are visited, and that remembered value is put
 * back before endNode() runs, so changes made by descendants do not leak
 * into this node's end processing.
 *
 * @param node        node to process
 * @param p_SubGraph  current subgraph (in/out)
 * @param level       nesting depth; children are visited with level + 1
 * @return always true
 */
bool GRXMLDoc::parseNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
{
#if 0 && GRXML_DEBUG
    if(node.Type() == TiXmlNode::ELEMENT)
        node.ToElement()->Print( stdout, level);
    else if(node.Type() == TiXmlNode::DOCUMENT)
        node.ToDocument()->Print( stdout, level);
    else if(node.Type() == TiXmlNode::TEXT)
        node.ToText()->Print( stdout, level);
    else if(node.Type() == TiXmlNode::DECLARATION)
        node.ToDeclaration()->Print( stdout, level);
    else {
        const char* text = node.Value();
        if(!text) text = "__NULL__";
        printf("processing node type %d text %s\n", node.Type(), text);
    }
#endif
    // Opening work for this node.
    beginNode( node, p_SubGraph, level );

    // Subgraph in effect for this node once beginNode() has run.
    SubGraph * const enclosing = p_SubGraph;

    const unsigned int childLevel = level + 1;
    TiXmlNode *child = node.FirstChild();
    while ( child ) {
        parseNode( *child, p_SubGraph, childLevel );
        child = child->NextSibling();
    }

    // Undo whatever the children did to the current subgraph.
    p_SubGraph = enclosing;

    // Closing work for this node.
    endNode( node, p_SubGraph, level );

    return true;
} // parseNode


/**
 * Opening-side processing for one XML node.
 *
 * - Text nodes are handed to processCDATA().
 * - Comment nodes are ignored. They are filtered out *before* the keyword
 *   lookup: a comment's value is its text, so a comment such as
 *   <!--item--> used to match a keyword and be dispatched to an element
 *   handler, which then dereferenced a null ToElement().
 * - Element nodes, and other childless nodes, are looked up in the keyword
 *   table and dispatched to the matching begin*() handler. An unknown name
 *   is a fatal error, except for a declaration node with an empty name.
 * - Any other node that has children (e.g. the document) is left alone.
 *
 * @param node        node being opened
 * @param p_SubGraph  current subgraph; rule nodes replace it
 * @param level       nesting depth (unused here)
 * @return always true
 */
bool GRXMLDoc::beginNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
{
    std::string name = node.Value();
    DEBUG_PRINT("Element = " + name);

    const int type = node.Type();

    if ( type == TiXmlNode::TEXT ) {
        DEBUG_PRINT (std::string("CDATA ") + name);
        DEBUG_PRINT (std::string("CDATA ") + node.Parent()->Value());
        processCDATA( node, p_SubGraph );
        return true;
    }
    if ( type == TiXmlNode::COMMENT ) {
        return true;
    }
    if ( type != TiXmlNode::ELEMENT && !node.NoChildren() ) {
        return true;
    }

    //printNode(node, level);
    KeywordValues nodeType = NodeTypeBadValue;
    KEYWDPAIR::iterator pos = m_NodeKeyWords.find( name );
    if ( pos != m_NodeKeyWords.end() ) {
        nodeType = (*pos).second;
#if GRXML_DEBUG
        std::cout << "nodeType=" << nodeType << std::endl;
#endif
    } else if ( type == TiXmlNode::DECLARATION && name.length() == 0 ) {
        return true;
    } else {
        FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT);
    }

    switch ( nodeType )
    {
    case NodeTypeGrammar:
        beginParseGrammarNode( node );
        break;
    case NodeTypeRule:
        // NB This fn creates a new subgraph.
        beginParseRuleNode( node, p_SubGraph );
        break;
    case NodeTypeRuleReference:
        // NB This fn may create a placeholder subgraph.
        beginRuleRef( node, p_SubGraph );
        break;
    case NodeTypeOneOf:
        beginOneOf( node, p_SubGraph );
        break;
    case NodeTypeItem:
        beginItem( node, p_SubGraph );
        break;
    case NodeTypeTag:
        beginTag( node, p_SubGraph );
        break;
    case NodeTypeCount:
        beginCount( node, p_SubGraph );
        break;
    case NodeTypeMeta:
        beginParseMetaNode( node );
        break;
    case NodeTypeBadValue:
    default:
        DEBUG_PRINT( "UNKNOWN node name: " + name );
        break;
    } // switch

    return true;
} // beginNode()


/**
 * Closing-side processing for one XML node.
 *
 * Text and comment nodes have no closing work and are skipped up front.
 * They must not reach the keyword lookup: their value is their text, so a
 * grammar word or comment that happens to read "item", "count", "rule",
 * ... used to trigger a spurious end*() call on the current subgraph.
 *
 * Element nodes, and other childless nodes, are looked up in the keyword
 * table and dispatched to the matching end*() handler. An unknown name is
 * a fatal error, except for a declaration node with an empty name.
 *
 * @param node        node being closed
 * @param p_SubGraph  current subgraph
 * @param level       nesting depth (unused here)
 * @return always true
 */
bool GRXMLDoc::endNode( XMLNode &node, SubGraph *&p_SubGraph, const unsigned int level )
{
    const int type = node.Type();

    if ( type == TiXmlNode::TEXT || type == TiXmlNode::COMMENT ) {
        return true;
    }
    if ( type != TiXmlNode::ELEMENT && !node.NoChildren() ) {
        return true;
    }

    std::string name = node.Value();
    KeywordValues nodeType = NodeTypeBadValue;
    KEYWDPAIR::iterator pos = m_NodeKeyWords.find( name );
    if ( pos != m_NodeKeyWords.end() ) {
        nodeType = (*pos).second;
    } else if ( type == TiXmlNode::DECLARATION && name.length() == 0 ) {
        return true;
    } else {
        FATAL_ERROR( std::string("Error: unknown tag ") + name, ESR_INVALID_ARGUMENT );
    }

    switch ( nodeType )
    {
    case NodeTypeGrammar:
        endParseGrammarNode( node );
        break;
    case NodeTypeRule:
        endParseRuleNode( node, p_SubGraph );
        break;
    case NodeTypeRuleReference:
        endRuleRef( node, p_SubGraph );
        break;
    case NodeTypeOneOf:
        endOneOf( node, p_SubGraph );
        break;
    case NodeTypeItem:
        endItem( node, p_SubGraph );
        break;
    case NodeTypeTag:
        endTag( node, p_SubGraph );
        break;
    case NodeTypeCount:
        endCount( node, p_SubGraph );
        break;
    case NodeTypeMeta:
        endParseMetaNode( node );
        break;
    case NodeTypeBadValue:
    default:
        DEBUG_PRINT( "UNKNOWN node name: ");
        DEBUG_PRINT( name.c_str() );
        break;
    } // switch

    return true;
} // endNode()


/**
 * Record the attributes of the <grammar> element: root rule, mode,
 * language, tag format, version and base. Missing attributes are stored
 * as empty strings.
 *
 * This function also defines the GETATTR helper macro, which later
 * functions in this file reuse. GETATTR needs a local `const char* attr`
 * and a `node` in scope.
 *
 * @return always true
 */
bool GRXMLDoc::beginParseGrammarNode(XMLNode &node)
{
    // Scratch pointer written by GETATTR; the macro requires this exact name.
    const char* attr;
#define GETATTR(nAmE) ((attr=node.ToElement()->Attribute(nAmE))!=NULL) ? attr:""

    // The root rule name.
    m_RootRule     = GETATTR("root");
    DEBUG_PRINT("Root rule = " + m_RootRule);

    m_XMLMode      = GETATTR("mode");
    m_XMLLanguage  = GETATTR("xml:lang");
    m_XMLTagFormat = GETATTR("tag-format");
    m_XMLVersion   = GETATTR("version");
    m_XMLBase      = GETATTR("xml:base");

    return true;
}

/**
 * Handle a <meta name="..." content="..."/> element.
 *
 * - "word_penalty" is stored as given.
 * - "do_skip_interword_silence" is lower-cased and stored only when it is
 *   "true" or "false"; otherwise a warning is printed and nothing stored.
 * - Every other name (including "userdict_name") is reported as
 *   unsupported and ignored.
 *
 * @return always true
 */
bool GRXMLDoc::beginParseMetaNode(XMLNode &node)
{
  const char* attr;   // scratch pointer required by GETATTR
  std::string meta_name  = GETATTR("name");
  std::string meta_value = GETATTR("content");

  if (meta_name == "word_penalty") {
    m_MetaKeyValPairs.insert(meta_name, meta_value);
    return true;
  }

  if (meta_name == "do_skip_interword_silence") {
    for (std::string::size_type j = 0; j < meta_value.size(); ++j) {
      // tolower() requires a value representable as unsigned char (or EOF);
      // passing a negative plain char is undefined behaviour.
      meta_value[j] = static_cast<char>(
          tolower(static_cast<unsigned char>(meta_value[j])));
    }
    if (meta_value != "true" && meta_value != "false")
      printf ("\nWarning: %s must be set to 'true' or 'false'; defaulting to 'false'\n", meta_name.c_str());
    else
      m_MetaKeyValPairs.insert(meta_name, meta_value);
    return true;
  }

  printf ("\nWarning: ignoring unsupported meta %s %s\n", meta_name.c_str(), meta_value.c_str());
  return true;
}


/** Closing hook for the <grammar> element; nothing to do. @return always true */
bool GRXMLDoc::endParseGrammarNode(XMLNode &node)
{
    (void)node;   // intentionally unused
    return true;
}


/**
 * Open a <rule> element and make its subgraph the current one.
 *
 * If the rule name is already known (it was referenced by an earlier
 * <ruleref>), the existing placeholder subgraph is reused; otherwise a new
 * rule/subgraph pair is created. The rule name is pushed on the rule stack
 * for the duration of the element, the "scope" attribute (if any) is
 * recorded against the rule index, and an implicit item is opened.
 *
 * A "tag" attribute is a fatal error. An empty "id" is only reported.
 *
 * @param node        the <rule> element
 * @param p_SubGraph  set to the rule's subgraph
 * @return always true
 */
bool GRXMLDoc::beginParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph)
{
    const char* attr;   // scratch pointer required by GETATTR
    DEBUG_PRINT ( "---- Rule\n" );

    std::string ruleName = GETATTR("id" );
    std::string s_tag    = GETATTR("tag" );
    if ( !s_tag.empty() ) {
        FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1);
    }
    CHECK_NOT_EMPTY( ruleName, "id" );

    // Remember which rule we are inside of - for context.
    m_RuleListStack.push( ruleName );

    int ruleIndex;
    if ( !findRuleIndex( ruleName, ruleIndex ) ) {
        // First sighting of this rule: create its subgraph now.
        SubGraph *created;
        addRuleToList( ruleName, created );
        p_SubGraph = created;
    }
    else {
        // Already declared, so it must have been forward referenced.
        // Swap the placeholder subgraph in.
        SubGraph *placeholder;
        if ( !findSubGraph( ruleName, placeholder ) ) {
            FATAL_ERROR("ERROR! Subgraph without rule name entry found!", -1);
        }
        p_SubGraph = placeholder;
    }

    // Note the scope of the rule (public, etc.) - used in the map file.
    findRuleIndex( ruleName, ruleIndex );
    std::string ruleScope = GETATTR("scope" );
    if ( !ruleScope.empty() ) {
        m_RuleScope.insert( ruleIndex, ruleScope );
    }

    // Rules may carry text without an <item> element, so an item is
    // inferred for every rule.
    m_pGraph->BeginItem( p_SubGraph );

    PRINT_EXPRESSION( ruleName + " = { " );
    return true;
} // beginParseRuleNode()


/**
 * Close a <rule> element: pop the rule from the rule stack, register the
 * subgraph with the graph and close the implicit item opened by
 * beginParseRuleNode().
 *
 * @return always true
 */
bool GRXMLDoc::endParseRuleNode( XMLNode &node, SubGraph *&p_SubGraph )
{
    DEBUG_PRINT ( "---- /Rule\n" );

    // Leave the rule's context.
    std::string finishedRule = m_RuleListStack.top();
    m_RuleListStack.pop();
    (void)finishedRule;

    m_pGraph->addSubGraph ( p_SubGraph );

    // Matches the BeginItem() inferred for every rule.
    m_pGraph->EndItem( p_SubGraph );

    PRINT_EXPRESSION( " }\n" );
    return true;
}

/**
 * Turn the text of a text node into word labels on the current subgraph.
 *
 * Text whose parent element is <tag> is left untouched (tag text is read
 * by beginTag()). Any other text is split on whitespace; each word is
 * added to the label list and attached to the subgraph.
 *
 * A word for which IsSlot() is true is rewritten to
 * "<xml basename>.<current rule>@<word>" and the resulting label index is
 * recorded in the slot list together with the rule name. The slot list
 * entry used to be written with an index variable that had not been
 * assigned yet; it is now written after the label index is known.
 *
 * @param node        the text node
 * @param p_SubGraph  subgraph receiving the labels
 * @return always true
 */
bool GRXMLDoc::processCDATA( XMLNode &node, SubGraph *&p_SubGraph )
{
    const std::string parentName( node.Parent()->Value() );
    if ( parentName == "tag" ) {
        // Do not touch (or strip whitespace from) tag text.
        return true;
    }

    const char* const whitespace = " \t\r\n\v\f";
    std::string cdata = node.Value();

    // Strip leading and trailing whitespace.
    cdata.erase(0, cdata.find_first_not_of(whitespace) );
    cdata.erase(cdata.find_last_not_of(whitespace) + 1);
#if GRXML_DEBUG
    std::cout << "/--" << cdata << "--/\n";
#endif

    // Walk the whitespace-separated words.
    std::string::size_type begIdx = cdata.find_first_not_of(whitespace);
    while (begIdx != std::string::npos) {
        std::string::size_type endIdx = cdata.find_first_of(whitespace, begIdx);
        if (endIdx == std::string::npos) {
            endIdx = cdata.length();
        }

        std::string word(cdata, begIdx, endIdx - begIdx);
        if ( !word.empty() ) {
#if GRXML_DEBUG
            std::cout << " -->" << word << "<--\n";
#endif
            std::string ruleName;
            const bool isSlot = IsSlot( word );
            if ( isSlot ) {
                // Qualify the slot with the file's basename and the rule name.
                ruleName = m_RuleListStack.top();
                const char* slash = strrchr(m_XMLFileName.c_str(), '/');
                const char* xmlBasename = slash ? slash + 1 : m_XMLFileName.c_str();
                word = std::string(xmlBasename) + "." + ruleName + "@" + word;
            }

            int index = -1;
            addLabelToList( word );
            findLabelIndex( word, index );
            if ( isSlot ) {
                // Record which rule the slot label belongs to.
                m_SlotList.insert(index, ruleName);
            }
            m_pGraph->AddLabel( p_SubGraph, index );
        }
        begIdx = cdata.find_first_not_of (whitespace, endIdx);
    }
    return true;
} // cdata

/**
 * Open an <item> element.
 *
 * Without a "repeat" attribute a plain item is opened. With one, the
 * range is parsed by get_range(): a positive maximum opens a counted
 * repetition, anything else opens an item-repeat with no upper bound.
 * A "tag" attribute or an unparsable range is a fatal error.
 *
 * @return always true
 */
bool GRXMLDoc::beginItem( XMLNode &node, SubGraph *&p_SubGraph )
{
    const char* attr;   // scratch pointer required by GETATTR
    DEBUG_PRINT ("---- Item:\n");

    std::string repeatSpec = GETATTR("repeat" );
    std::string s_tag      = GETATTR("tag" );
    if ( !s_tag.empty() ) {
        FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1);
    }

    // No repeat requested: ordinary item.
    if ( repeatSpec.empty() ) {
        m_pGraph->BeginItem( p_SubGraph );
        return true;
    }

    int minCnt = 0;
    int maxCnt = 0;
    if ( get_range( repeatSpec, &minCnt, &maxCnt ) ) {
        FATAL_ERROR(std::string("error: while parsing range ") + repeatSpec,1);
    }

    // RED FLAG: max should not be 0! A +ve number should have been given.
    if ( maxCnt > 0 ) {
        m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt );
    }
    else {
        // NB: BeginItemRepeat can only use min of 0 or 1!
        m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1);
    }
    return true;
}


/** Close an <item> element on the current subgraph. @return always true */
bool GRXMLDoc::endItem( XMLNode &node, SubGraph *&p_SubGraph )
{
    (void)node;   // intentionally unused
    DEBUG_PRINT ( "---- /Item\n" );

    // What TODO if no tag for an item?
    m_pGraph->EndItem( p_SubGraph );

    return true;
}


/**
 * Handle a <ruleref uri="#name"/> element by adding a reference to the
 * named rule to the current subgraph.
 *
 * Forward references are supported: if the rule is not known yet, a
 * placeholder rule/subgraph is created so that an index exists.
 * A "tag" attribute, a missing uri, or a uri that does not start with '#'
 * is a fatal error.
 *
 * @return always true
 */
bool GRXMLDoc::beginRuleRef( XMLNode &node, SubGraph *&p_SubGraph )
{
    DEBUG_PRINT ( "---- Ruleref\n" );

    const char* attr;   // scratch pointer required by GETATTR
    std::string s_tag = GETATTR("tag" );
    if ( !s_tag.empty() ) {
        FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1);
    }

    std::string uri = GETATTR("uri" );
    if ( uri.empty() ) {
        FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 );
    }
    // Only local references ("#rule") are accepted.
    if ( uri.find("#") != 0 ) {
        FATAL_ERROR( "ERROR! bad ruleref name: '" + uri + "'" + ". Rule reference must start with a '#'. External references are not supported.", -1 );
    }

    // Rule name without the leading '#'.
    string ruleName;
    getRuleRefName( node, ruleName );

    int ruleIndex;
    if ( !findRuleIndex( ruleName, ruleIndex ) ) {
        // Forward reference; create a placeholder subgraph.
        // RED FLAG:  Remember to check fwd ref rule was filled in at end.
        SubGraph *placeholder;
        addRuleToList( ruleName, placeholder );
        findRuleIndex( ruleName, ruleIndex );
    }

    // A forward-referenced rule can now be treated as if it was defined.
    m_pGraph->BeginRule( p_SubGraph );
    m_pGraph->AddRuleRef( p_SubGraph, ruleIndex );
    m_pGraph->EndRule( p_SubGraph );

    return true;
}


/**
 * Closing hook for <ruleref>; nothing to do. The tag belonging to a rule
 * reference lives in the enclosing item, not under the ruleref element.
 *
 * @return always true
 */
bool GRXMLDoc::endRuleRef(XMLNode &grmNode, SubGraph *&p_SubGraph )
{
    (void)grmNode;      // intentionally unused
    (void)p_SubGraph;   // intentionally unused
    DEBUG_PRINT ( "---- /Ruleref\n" );
    return true;
}


/** Open a <one-of> element: forwards to Graph::BeginOneOf. @return always true */
bool GRXMLDoc::beginOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph)
{
    (void)grmNode;   // intentionally unused
    DEBUG_PRINT ( "----OneOf\n" );
    m_pGraph->BeginOneOf( p_SubGraph );
    return true;
}


/** Close a <one-of> element: forwards to Graph::EndOneOf. @return always true */
bool GRXMLDoc::endOneOf(XMLNode &grmNode, SubGraph *&p_SubGraph)
{
    (void)grmNode;   // intentionally unused
    DEBUG_PRINT ( "----/OneOf\n" );
    m_pGraph->EndOneOf( p_SubGraph );
    return true;
}


/**
 * Handle a <tag> element: store its text in the tag list and attach the
 * tag index to the current subgraph. Whitespace in the text is preserved.
 *
 * An empty element (<tag/> or <tag></tag>) is ignored. GetText() returns a
 * null pointer in that case, which must not be used to construct a
 * std::string.
 *
 * @return always true
 */
bool GRXMLDoc::beginTag( XMLNode &node, SubGraph *&p_SubGraph )
{
    DEBUG_PRINT ("---- Tag\n");

    const char* text = node.ToElement()->GetText();
    if ( text == NULL ) {
        return true;   // no text content - nothing to record
    }

    std::string s( text );
#if GRXML_DEBUG
    std::cout << s;     // debug
#endif
    // Store the semantic tag info.
    // NB Do not strip whitespace from tag text.
    if ( !s.empty() )
    {
        int index;
        addTagToList( s );
        findTagIndex( s, index );
        m_pGraph->AddTag ( p_SubGraph, index );
    }

    return true;
}


/** Closing hook for <tag>; nothing to do. @return always true */
bool GRXMLDoc::endTag( XMLNode &node, SubGraph *&p_SubGraph )
{
    (void)node;         // intentionally unused
    (void)p_SubGraph;   // intentionally unused
    DEBUG_PRINT ("---- /Tag\n");
    return true;
}


/**
 * Open a <count number="..."> element.
 *
 * - number="optional" opens an optional section. This is tested *before*
 *   the numeric parse: get_range() rejects non-numeric text, so the
 *   keyword previously always ended in a fatal error.
 * - Otherwise the value is parsed by get_range():
 *     min > 0 and max > 0  -> counted repetition
 *     min > 0, no max      -> item-repeat without upper bound
 *     anything else        -> optional section
 * A "tag" attribute or an unparsable number is a fatal error.
 *
 * @return false when the "number" attribute is missing or empty,
 *         true otherwise
 */
bool GRXMLDoc::beginCount( XMLNode &node, SubGraph *&p_SubGraph )
{
    const char* attr;   // scratch pointer required by GETATTR
    // Count of reps applies to the text elements in this count node.
    DEBUG_PRINT ("---- Count\n");

    std::string s     = GETATTR("number");
    std::string s_tag = GETATTR("tag" );
    if ( s_tag.length() > 0 ) {
        FATAL_ERROR("Error: unsupported tag= syntax, use <tag> ... </tag>", 1);
    }
    if ( s.empty() ) {
        return false;
    }

    if ( s == "optional" ) {
        m_pGraph->BeginOptional( p_SubGraph );
        return true;
    }

    int minCnt = -1, maxCnt = -1;
    if ( get_range( s, &minCnt, &maxCnt ) ) {
        FATAL_ERROR(std::string("error: while parsing range ") + s,1);
    }

    if ( minCnt > 0 && maxCnt > 0 ) {
        m_pGraph->BeginCount( p_SubGraph, minCnt, maxCnt );
    }
    else if ( minCnt > 0 ) {
        m_pGraph->BeginItemRepeat ( p_SubGraph, minCnt, -1);
    }
    else {
        m_pGraph->BeginOptional ( p_SubGraph );
    }

    return true;
}


/** Close a <count> element: forwards to Graph::EndCount. @return always true */
bool GRXMLDoc::endCount( XMLNode &node, SubGraph *&p_SubGraph )
{
    (void)node;   // intentionally unused
    DEBUG_PRINT ("---- /Count\n");
    m_pGraph->EndCount( p_SubGraph );
    return true;
}

/** Closing hook for <meta>; nothing to do. @return always true */
bool GRXMLDoc::endParseMetaNode(XMLNode &node)
{
  (void)node;   // intentionally unused
  return true;
}

/**
 * Debug helper: print one node, indented by `level` spaces.
 *
 * The line starts with a marker character: '+' for an element, '\' for
 * the document, '#' for text (followed by the text in brackets). For any
 * other node type an error line is printed first and the marker is a
 * blank. For elements, each attribute is printed on its own line as
 * "name: value" with three extra spaces of indentation.
 */
void GRXMLDoc::printNode(XMLNode &node, int level)
{
    const std::string name = node.Value();
    const int type = node.Type();
    const bool isText = ( type == TiXmlNode::TEXT );

    for (int pad = level; pad > 0; --pad) {
        std::cout << " ";
    }

    char marker = ' ';
    std::string c_data;
    if ( type == TiXmlNode::ELEMENT ) {
        marker = '+';
    }
    else if ( type == TiXmlNode::DOCUMENT ) {
        marker = '\\';
    }
    else if ( isText ) {
        marker = '#';
        c_data = node.Value();
    }
    else {
        // Unknown, comment, declaration, ...
        std::cout << "Error: not sure what to do here" << std::endl;
    }

    std::cout << marker << name.c_str();
    if ( isText ) {
        std::cout << "[" << c_data << "]";
    }
    std::cout << std::endl;

    if ( type != TiXmlNode::ELEMENT ) {
        return;
    }

    // Attribute lines: same indentation as the node plus three spaces.
    for (TiXmlAttribute* attribute = node.ToElement()->FirstAttribute();
         attribute != NULL;
         attribute = attribute->Next() ) {
        for (int pad = level; pad > 0; --pad) {
            std::cout << " ";
        }
        std::cout << "   ";
        std::cout << attribute->Name() << ": " << attribute->Value() << std::endl;
    }
}

/** Function: addRuleToList
    Extends list of SubGraphs with given subGraph
    and extends list of rule names too.
    TODO: Can we use one hash and use internal numeric index for rule IDs?
*/


bool GRXMLDoc::addRuleToList(std::string const & ruleName, SubGraph *&p_SubGraph)
{
    int index;
    if ( findRuleIndex ( ruleName, index ) ) {
	FATAL_ERROR("ERROR! Rule name " + ruleName + " is already defined!", -1 );
    }

    addLabelToList( m_XMLFileName + "@" + ruleName);
    findLabelIndex( m_XMLFileName + "@" + ruleName, index );
#if GRXML_DEBUG
    std::cout << "Rule " << ruleName << std::endl;
#endif
    // Create the new subgraph and update lists
    m_RuleList.insert( ruleName, index );
    p_SubGraph = new SubGraph( (char *) ruleName.c_str(), index );

    bool success = m_SubgraphList.insert( ruleName, p_SubGraph );
    if (!success) {
	FATAL_ERROR("ERROR! subgraph for " + ruleName + " is already defined!", -1 );
    }
#if ADD_BRACES
    addLabelToList( "{" );
    std::stringstream  ss;
    ss << "}(" << index << ")";
    addLabelToList( ss.str());
#endif
    return success;
}


bool GRXMLDoc::deleteRules()
{
    // Delete all allocated subgraphs.
    // The rule strings are part of the hashtables and get deleted by them.
    int index;
    SubGraph *p_SubGraph;
    std::string ruleName;
    while ( !m_RuleList.isEmpty() ) {
	m_RuleList.getFirst( &ruleName, &index );
	m_RuleList.remove( ruleName );
	if (m_SubgraphList.getValue( ruleName, &p_SubGraph ) ) {
	    delete p_SubGraph;
	}
	else {
	    FATAL_ERROR("No subgraph for rule " + ruleName + "! Mismatched rules and subgraph hashtables!", -1);
	}
    }
    m_SubgraphList.clear();
    m_RuleList.clear();
    m_LabelList.clear();
    m_TagList.clear();
    return true;
}

// Looks up the subgraph stored under rule name `s`; the result is written
// to p_SubGraph. Returns whether the lookup succeeded.
bool GRXMLDoc::findSubGraph(std::string & s, SubGraph *&p_SubGraph)
{
    const bool found = m_SubgraphList.getValue(s, &p_SubGraph);
    return found;
}

// Reverse lookup in the rule list: finds the rule name stored with index
// `i` and writes it to `s`. Returns whether the lookup succeeded.
bool GRXMLDoc::findRule(int i, std::string &s )
{
    const bool found = m_RuleList.getIndex( i, &s );
    return found;
}

// Fetches the tag text stored under index `i` into `s`.
// Returns whether the lookup succeeded.
bool GRXMLDoc::findTag(int i, std::string &s )
{
    const bool found = m_TagList.getValue( i, &s );
    return found;
}

// Fetches the label text stored under index `i` into `s`.
// Returns whether the lookup succeeded.
bool GRXMLDoc::findLabel(int i, std::string &s )
{
    const bool found = m_LabelList.getValue( i, &s );
    return found;
}

// Reverse lookup in the subgraph list: finds the rule name under which
// `p_SubGraph` is stored and writes it to `s`.
// Returns whether the lookup succeeded.
bool GRXMLDoc::findSubGraphIndex( SubGraph *p_SubGraph, std::string &s )
{
    const bool found = m_SubgraphList.getIndex( p_SubGraph, &s );
    return found;
}

// Fetches the index stored for rule name `s` into `i`.
// Returns whether the rule is known.
bool GRXMLDoc::findRuleIndex( std::string s, int &i )
{
    const bool found = m_RuleList.getValue( s, &i );
    return found;
}
// Reverse lookup in the tag list: finds the index under which tag text `s`
// is stored and writes it to `i`. Returns whether the tag is known.
bool GRXMLDoc::findTagIndex( std::string s, int &i )
{
    const bool found = m_TagList.getIndex( s, &i );
    return found;
}
// Reverse lookup in the label list: finds the index under which label `s`
// is stored and writes it to `i`. Returns whether the label is known.
bool GRXMLDoc::findLabelIndex( std::string s, int &i )
{
    const bool found = m_LabelList.getIndex( s, &i );
    return found;
}
// Fetches the value stored for meta name `sn` into `s`.
// Returns whether such a meta entry exists.
bool GRXMLDoc::findMeta(const std::string & sn, std::string &s)
{
    const bool found = m_MetaKeyValPairs.getValue( sn, &s );
    return found;
}
// Sets meta entry `sn` to `s`, replacing any existing entry of that name.
// Returns the result of the final insert.
bool GRXMLDoc::setMeta(const std::string & sn, const std::string &s)
{
  std::string previous;
  const bool alreadySet = findMeta(sn, previous);
  if (alreadySet) {
    // Drop the old entry first so the insert below stores the new value.
    m_MetaKeyValPairs.remove(sn);
  }
  return m_MetaKeyValPairs.insert(sn, s);
}

// Adds tag text `s` to the tag list under the next automatic index unless
// it is already present. Returns true when the tag already existed,
// otherwise the result of the insert.
bool GRXMLDoc::addTagToList( std::string const& s )
{
    int existingIndex;
    if ( findTagIndex( s, existingIndex ) ) {
        return true;   // keep values unique
    }
    return m_TagList.insert( m_TagAutoIndex++, s );
}


// Adds label `s` to the label list under the next automatic index.
// Returns false when the label already exists (nothing is added),
// otherwise the result of the insert.
bool GRXMLDoc::addLabelToList( std::string const& s )
{
  // TODO: Labels should be unique. Change key.
  int existingIndex;
  if ( m_LabelList.getIndex( s, &existingIndex ) ) {
    return false; // exists
  }
  return m_LabelList.insert( m_LabelAutoIndex++, s );
}

// Debug helper: prints the subgraph, rule, tag and label lists, in that order.
void GRXMLDoc::printLists()
{
    m_SubgraphList.print();   // subgraphs
    m_RuleList.print();       // rules
    m_TagList.print();        // tags
    m_LabelList.print();      // labels
}


/**
 * Debug helper: print the subgraph of every rule in the rule list.
 *
 * Each rule is handled independently. Previously the loop over the
 * remaining rules was nested inside the success branch for the first
 * rule, so a missing subgraph for the first rule suppressed all output,
 * and the debug header was only emitted for the first rule.
 */
void GRXMLDoc::printSubgraphs()
{
    std::string rule;
    int index;
    if ( !m_RuleList.getFirst( &rule, &index ) ) {
        return;   // no rules at all
    }
    do {
        SubGraph *p_SubGraph = NULL;
        if ( findSubGraph( rule, p_SubGraph ) ) {
            DEBUG_PRINT("============ Rule: " + rule + "============");
            printSubgraph( *p_SubGraph );
        }
    } while ( m_RuleList.getNext( &rule, &index ) );
}


// Debug helper: prints one subgraph, passing this document to
// SubGraph::PrintWithLabels.
void GRXMLDoc::printSubgraph( SubGraph &p_SubGraph )
{
    GRXMLDoc &self = *this;
    p_SubGraph.PrintWithLabels( self );
}


/**
 * Extract the referenced rule name from a <ruleref> element: the "uri"
 * attribute with its leading '#' removed.
 *
 * A missing uri, or one that does not start with '#', is a fatal error.
 *
 * @param node      the <ruleref> element
 * @param ruleName  receives the rule name
 * @return always true
 */
bool GRXMLDoc::getRuleRefName(XMLNode &node, std::string &ruleName)
{
  const char* attr;   // scratch pointer required by GETATTR
  std::string uri = GETATTR("uri" );
  if ( uri.empty() ) {
    FATAL_ERROR( "ERROR! Ruleref specifies no uri name!", -1 );
  }
  if ( uri.find("#") != 0 ) {
    FATAL_ERROR( "ERROR! bad ruleref name: '" + uri + "'", -1 );
  }
  // Everything after the '#'.
  ruleName = uri.substr( 1 );
  return true;
}

/**
 * Name the internal lists and register the predefined labels.
 *
 * The order required for labels in the .map output file is: eps first,
 * then slots, then the pause labels, then everything else. The predefined
 * labels are added here in case they are referenced; the re-ordering for
 * the output files happens later, once the slots are known.
 */
void GRXMLDoc::initializeLists()
{
  m_SubgraphList.setName("Subgraphs");
  m_RuleList.setName("Rules");
  m_TagList.setName("Tags");
  m_LabelList.setName("Labels");

  static const char* const kPredefinedLabels[] = { "eps", "-pau-", "-pau2-" };
  const size_t labelCount = sizeof(kPredefinedLabels) / sizeof(kPredefinedLabels[0]);
  for (size_t k = 0; k < labelCount; ++k) {
    addLabelToList( kPredefinedLabels[k] );
  }
}

void GRXMLDoc::writeMapFile( std::string & fileName )
{
    // We need to re-index in order to put the labels in correct order:
    // 1. eps
    // 2. all slots
    // 3. all rules
    // 4. -pau- words
    // 5. remaining labels
    ofstream outfile;
    int index, origIndex;
    std::string label;
    std::string slotRuleName;
    std::string scope; // For rules
    HashMap<int,std::string> orderedList;
    int orderedIndex=0;
    // 1. eps
    orderedList.insert( orderedIndex++, "eps" );

    // 2. slots
    if ( m_LabelList.getFirst( &origIndex, &label ) ) {
	if ( IsSlot( label ) ) {
	    orderedList.insert( orderedIndex++, label );
	}
	while (m_LabelList.getNext( &origIndex, &label ) ) {
	    if ( IsSlot( label ) ) {
		orderedList.insert( orderedIndex++, label );
	    }
	}
    }

    // 3.  Now rules, or anything with @
    if ( m_LabelList.getFirst( &origIndex, &label ) ) {
	do {
#if GRXML_DEBUG
	    std::cout << label << " "<< label.find_first_of ("@") << std::endl;
#endif
            if (!IsSlot(label) && label.find_first_of ("@") != string::npos) {
#if GRXML_DEBUG
		std::cout << "    Adding " << label << std::endl;
#endif
		orderedList.insert( orderedIndex++, label );
	    }
	} while (m_LabelList.getNext( &origIndex, &label ) );
    }

    // 4. pau
    orderedList.insert( orderedIndex++, "-pau-" );
    orderedList.insert( orderedIndex++, "-pau2-" );

    // 5. Remaining stuff. NB We depend upon the label not
    //    being added twice.
    if ( m_LabelList.getFirst( &origIndex, &label ) ) {
	if ( !orderedList.getIndex( label, &index ) ) {
	  orderedList.insert( orderedIndex++, label );
	}
	while (m_LabelList.getNext( &origIndex, &label ) ) {
	    if ( !orderedList.getIndex( label, &index ) ) {
	      orderedList.insert( orderedIndex++, label );
	    }
	}
    }
    outfile.open ( fileName.c_str() );

    bool bRes = orderedList.getFirst( &index, &label );
    do {
      if(!bRes) break;
      // Look up scope using original index
      m_LabelList.getIndex( label, &origIndex );
      if (m_RuleScope.getValue(origIndex, &scope) ) 
	label = scope + ":" + label;
      outfile << label << " " << index << std::endl;
      bRes = orderedList.getNext( &index, &label );
    } while(bRes);

    outfile.close();
}


void GRXMLDoc::writeScriptFile( std::string & fileName )
{
    ofstream outfile;
    int index;
    std::string label;
    outfile.open ( fileName.c_str() );
    if ( m_TagList.getFirst( &index, &label ) ) {
    	outfile << index << " " << label << std::endl;
    }
    while (m_TagList.getNext( &index, &label ) ) {
    	outfile << index << " " << label << std::endl;
    }
    outfile.close();

    //m_LabelList.writeFile( fileName );
}

void GRXMLDoc::writeParamsFile( std::string & fileName )
{
  std::string wtw;
  ofstream outfile;
  bool bRes;
  
  outfile.open(fileName.c_str());

  std::string metaname = "word_penalty";
  bRes = findMeta(metaname, wtw);
  if(bRes)
    outfile << metaname.c_str() << "\t=\t" << wtw.c_str() << std::endl;

  // outfile << "locale"  << "\t=\t" << m_XMLLanguage << std::endl;
  outfile.close();
}

void GRXMLDoc::writeGraphFiles( std::string& prefix, bool bDoWriteRecogGraphs)
{
    SubGraph *p_SubGraph;
    SubGraph *p_SemGraph;
    std::string fileName;
    if ( !findSubGraph( m_RootRule, p_SubGraph ) ) {
	FATAL_ERROR ("ERROR: writeGraphFiles - no root rule "+ m_RootRule + " defined. No file created", -1 );
    }

    //  Create .P.txt
    printf ("\nCreating semantic graph file\n");
    p_SemGraph = new SubGraph( (char *) "Main", -1);
    m_pGraph->BeginRule( p_SemGraph );
    m_pGraph->AddRuleRef( p_SemGraph, p_SubGraph->getRuleId());
    m_pGraph->EndRule( p_SemGraph );
    m_pGraph->ExpandRules (p_SemGraph);
    p_SemGraph->RemoveInternalConnections ();

    p_SemGraph->AddTerminalConnections ();
    p_SemGraph->ReduceArcsByEquivalence();
    p_SemGraph->RemoveUnreachedConnections (-1, -1);
    p_SemGraph->DeterminizeArcs();
    p_SemGraph->RemoveUnreachedConnections (-1, -1);
    p_SemGraph->ReduceArcsByEquivalence();
    p_SemGraph->RemoveUnreachedConnections (-1, -1);
    fileName = prefix + ".P.txt";
    p_SemGraph->WriteForwardGraphWithSemantic( fileName, *this );
    delete p_SemGraph;

    fileName = prefix + ".omap";
    this->WriteOLabels(fileName);
}

// Rebuilds m_SortedLabelList from m_LabelList: the labels are collected,
// ordered with std::sort (std::string operator<), and stored under the
// consecutive indices 0, 1, 2, ... in that order.
void GRXMLDoc::sortLabels()
{
    int origIndex;
    std::string label;
    std::vector<std::string> labels;

    // Gather every label currently held by m_LabelList.
    for ( bool more = m_LabelList.getFirst( &origIndex, &label );
          more;
          more = m_LabelList.getNext( &origIndex, &label ) ) {
        labels.push_back( label );
    }

    std::sort( labels.begin(), labels.end() );

    // Re-index: position in the sorted sequence becomes the new index.
    m_SortedLabelList.clear();
    const int total = static_cast<int>( labels.size() );
    for ( int pos = 0; pos < total; ++pos ) {
        // NOTE(review): the original index fetched here is never used;
        // the lookup is kept only to leave the call sequence untouched.
        m_LabelList.getIndex( labels[pos], &origIndex );
        m_SortedLabelList.insert( pos, labels[pos] );
    }
}

// Fetches the label stored at sorted index i.
//
// i  Index into m_SortedLabelList.
// s  Receives the label when the lookup succeeds.
//
// Returns the result of m_SortedLabelList.getValue(). The sorted list is
// built on demand whenever it is found empty.
bool GRXMLDoc::findSortedLabel(int i, std::string &s )
{
    const bool needsBuild = m_SortedLabelList.isEmpty();
    if ( needsBuild )
        sortLabels();

    const bool found = m_SortedLabelList.getValue( i, &s );
    return found;
}

// Translates an original label index into its index in the sorted list.
//
// i            Index into m_LabelList.
// sortedIndex  Receives the matching index in m_SortedLabelList.
//
// Returns true only when index i exists in m_LabelList and its label is
// present in m_SortedLabelList. The sorted list is built on demand whenever
// it is found empty.
bool GRXMLDoc::findSortedLabelIndex( int i, int &sortedIndex )
{
    if ( m_SortedLabelList.isEmpty() )
        sortLabels();

    std::string labelText;
    if ( !m_LabelList.getValue( i, &labelText ) )
        return false;
    if ( !m_SortedLabelList.getIndex( labelText, &sortedIndex ) )
        return false;
    return true;
}

// Registers output label s in m_OutputPtxtLabels with a placeholder id of 0.
// WriteOLabels() assigns the real ids when the .omap file is produced.
void GRXMLDoc::addOLabelToOList( std::string &s)
{
    const int placeholderId = 0;
    m_OutputPtxtLabels.insert( s, placeholderId );
}

bool GRXMLDoc::WriteOLabels(const std::string& fileName)
{
  HashMap<int,std::string> invMap;
  int count = 0;
  int max_script_label = 0;
  int scriptID = 0;
  std::map<std::string, int>::iterator iter;
  bool bFound;
  int tmp;

  std::string strIndex = "eps";
  bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp);
  if(bFound) 
    m_OutputPtxtLabels.remove(strIndex);
  m_OutputPtxtLabels.insert(strIndex, count); 
  invMap.insert( count, strIndex);
  count++;

  strIndex = "{";
  bFound = m_OutputPtxtLabels.getValue(strIndex, &tmp);
  if(bFound) 
    m_OutputPtxtLabels.remove(strIndex);
  m_OutputPtxtLabels.insert(strIndex, count); 
  invMap.insert( count, strIndex);
  count++;

  iter = m_OutputPtxtLabels.begin(); 
  for( ; iter!=m_OutputPtxtLabels.end(); iter++) {
    const char* label = iter->first.c_str();
    if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)
	&& strspn(label+SCRIPT_LABEL_PREFIX_LEN,"0123456789")==strlen(label+SCRIPT_LABEL_PREFIX_LEN) ) {
      scriptID = atoi(label+SCRIPT_LABEL_PREFIX_LEN);
      if(max_script_label < scriptID)
	max_script_label = scriptID;
    }/* else if( !strncmp(label,SCRIPT_LABEL_PREFIX, SCRIPT_LABEL_PREFIX_LEN)) {
      invMap.insert(count, iter->first);
      iter->second = count;
      count++;
      }*/
    else if(!invMap.getIndex((iter->first), &tmp)){
      invMap.insert(count, iter->first);
      iter->second = count;
      count++;
    }
  }

  cout << "found max_script_label " << max_script_label << endl;
  for(int j=0; j<=max_script_label; j++) {
    std::stringstream ss;
    ss << SCRIPT_LABEL_PREFIX << j;
    if(!invMap.getIndex( ss.str(), &tmp)) {
      invMap.insert( count++, ss.str());
    }
  }

  std::ofstream outfile(fileName.c_str());
  std::string outscript;
  if(!outfile) {
    FATAL_ERROR( "Error: opening the omap file for output", 1);
    WARNING( "Error: opening the omap file for output");
    return 1;
  } 
  for(int i=0; i<count; i++) {
    outscript = "";
    invMap.getValue(i,&outscript);
    if(outscript.length() == 0) {
      cout << "error: internal error while making .omap " << i << endl;
      FATAL_ERROR("error",1);
    }
    outfile << outscript.c_str() << " " << i << std::endl;
  }
  outfile.close();
  return 0;
}