// Lexer-only XML recognizer for the Python3 target.
//
// DOCUMENT is the only rule that is not marked `fragment`; every other rule
// is reachable only through it. While matching, embedded actions append
// one human-readable line per recognized construct to `self.outbuf`
// (a StringIO created in the init action below).
//
// Label references inside actions consistently use the `$label.text`
// attribute syntax so that they go through ANTLR's action translation
// instead of depending on the names of generated local variables.
lexer grammar t012lexerXMLLexer;

options {
    language = Python3;
}

@header {
from io import StringIO
}

// Runs when the lexer is constructed: buffer that collects the trace lines.
@lexer::init {
self.outbuf = StringIO()
}

// output(line): append `line` plus a newline to the trace buffer.
@lexer::members {
def output(self, line):
    self.outbuf.write(line + "\n")
}

// Whole document: optional XML declaration, optional DOCTYPE, then exactly
// one root element, with optional whitespace between the parts.
DOCUMENT
    :   XMLDECL? WS? DOCTYPE? WS? ELEMENT WS?
    ;

// <!DOCTYPE root [SYSTEM "sys" | PUBLIC "pub" "sys"] [internal subset]>
fragment DOCTYPE
    :   '<!DOCTYPE' WS rootElementName=GENERIC_ID
        {self.output("ROOTELEMENT: "+$rootElementName.text)}
        WS
        (
            (   'SYSTEM' WS sys1=VALUE
                {self.output("SYSTEM: "+$sys1.text)}
            |   'PUBLIC' WS pub=VALUE WS sys2=VALUE
                {self.output("PUBLIC: "+$pub.text)}
                {self.output("SYSTEM: "+$sys2.text)}
            )
            ( WS )?
        )?
        (   dtd=INTERNAL_DTD
            {self.output("INTERNAL DTD: "+$dtd.text)}
        )?
        '>'
    ;

// Internal DTD subset: non-greedy, so it stops at the first ']'.
fragment INTERNAL_DTD
    :   '[' (options {greedy=false;} : .)* ']'
    ;

// Processing instruction: <?target attr="value" ... ?>
fragment PI
    :   '<?' target=GENERIC_ID WS?
        {self.output("PI: "+$target.text)}
        ( ATTRIBUTE WS? )*
        '?>'
    ;

// XML declaration; the "xml" target is matched case-insensitively.
fragment XMLDECL
    :   '<?' ('x'|'X') ('m'|'M') ('l'|'L') WS?
        {self.output("XML declaration")}
        ( ATTRIBUTE WS? )*
        '?>'
    ;

// Element: start tag, any mix of nested elements / character data /
// CDATA sections / comments / processing instructions, then an end tag;
// or a single empty-element tag.
fragment ELEMENT
    :   (   START_TAG
            (   ELEMENT
            |   t=PCDATA
                {self.output('PCDATA: "{}"'.format($t.text))}
            |   t=CDATA
                {self.output('CDATA: "{}"'.format($t.text))}
            |   t=COMMENT
                {self.output('Comment: "{}"'.format($t.text))}
            |   PI
            )*
            END_TAG
        |   EMPTY_ELEMENT
        )
    ;

fragment START_TAG
    :   '<' WS? name=GENERIC_ID WS?
        {self.output("Start Tag: "+$name.text)}
        ( ATTRIBUTE WS? )*
        '>'
    ;

fragment EMPTY_ELEMENT
    :   '<' WS? name=GENERIC_ID WS?
        {self.output("Empty Element: "+$name.text)}
        ( ATTRIBUTE WS? )*
        '/>'
    ;

// name = "value"  (the reported value text includes its quotes, since
// VALUE matches the delimiters as well as the content).
fragment ATTRIBUTE
    :   name=GENERIC_ID WS? '=' WS? value=VALUE
        {self.output("Attr: {}={}".format($name.text, $value.text))}
    ;

fragment END_TAG
    :   '</' WS? name=GENERIC_ID WS? '>'
        {self.output("End Tag: "+$name.text)}
    ;

// Non-greedy: the comment ends at the first '-->'.
fragment COMMENT
    :   '<!--' (options {greedy=false;} : .)* '-->'
    ;

// Non-greedy: the section ends at the first ']]>'.
fragment CDATA
    :   '<![CDATA[' (options {greedy=false;} : .)* ']]>'
    ;

// Character data: one or more characters other than '<'.
fragment PCDATA
    :   (~'<')+
    ;

// Double- or single-quoted string; the opposite quote may appear inside.
fragment VALUE
    :   (   '\"' (~'\"')* '\"'
        |   '\'' (~'\'')* '\''
        )
    ;

// Name: starts with a letter, '_' or ':'; continues greedily with letters,
// digits, '.', '-', '_' or ':'.
fragment GENERIC_ID
    :   ( LETTER | '_' | ':' )
        ( options {greedy=true;} : LETTER | '0'..'9' | '.' | '-' | '_' | ':' )*
    ;

// ASCII letters only.
fragment LETTER
    :   'a'..'z'
    |   'A'..'Z'
    ;

// One or more spaces, tabs or line breaks (LF, CRLF or CR).
fragment WS
    :   (   ' '
        |   '\t'
        |   ( '\n'
            | '\r\n'
            | '\r'
            )
        )+
    ;