#!/usr/bin/ruby # encoding: utf-8 =begin LICENSE [The "BSD licence"] Copyright (c) 2009-2010 Kyle Yetter All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =end module ANTLR3 =begin rdoc ANTLR3::Token At a minimum, tokens are data structures that bind together a chunk of text and a corresponding type symbol, which categorizes/characterizes the content of the text. Tokens also usually carry information about their location in the input, such as absolute character index, line number, and position within the line (or column). Furthermore, ANTLR tokens are assigned a "channel" number, an extra degree of categorization that groups things on a larger scale. Parsers will usually ignore tokens that have channel value 99 (the HIDDEN_CHANNEL), so you can keep things like comment and white space huddled together with neighboring tokens, effectively ignoring them without discarding them. ANTLR tokens also keep a reference to the source stream from which they originated. Token streams will also provide an index value for the token, which indicates the position of the token relative to other tokens in the stream, starting at zero. For example, the 22nd token pulled from a lexer by CommonTokenStream will have index value 21. == Token as an Interface This library provides a token implementation (see CommonToken). Additionally, you may write your own token class as long as you provide methods that give access to the attributes expected by a token. Even though most of the ANTLR library tries to use duck-typing techniques instead of pure object-oriented type checking, it's a good idea to include this ANTLR3::Token into your customized token class. =end module Token include ANTLR3::Constants include Comparable # the token's associated chunk of text attr_accessor :text # the integer value associated with the token's type attr_accessor :type # the text's starting line number within the source (indexed starting at 1) attr_accessor :line # the text's starting position in the line within the source (indexed starting at 0) attr_accessor :column # the integer value of the channel to which the token is assigned attr_accessor :channel # the index of the token with respect to other the other tokens produced during lexing attr_accessor :index # a reference to the input stream from which the token was extracted attr_accessor :input # the absolute character index in the input at which the text starts attr_accessor :start # the absolute character index in the input at which the text ends attr_accessor :stop alias :input_stream :input alias :input_stream= :input= alias :token_index :index alias :token_index= :index= # # The match operator has been implemented to match against several different # attributes of a token for convenience in quick scripts # # @example Match against an integer token type constant # token =~ VARIABLE_NAME => true/false # @example Match against a token type name as a Symbol # token =~ :FLOAT => true/false # @example Match the token text against a Regular Expression # token =~ /^@[a-z_]\w*$/i # @example Compare the token's text to a string # token =~ "class" # def =~ obj case obj when Integer then type == obj when Symbol then name == obj.to_s when Regexp then obj =~ text when String then text == obj else super end end # # Tokens are comparable by their stream index values # def <=> tk2 index <=> tk2.index end def initialize_copy( orig ) self.index = -1 self.type = orig.type self.channel = orig.channel self.text = orig.text.clone if orig.text self.start = orig.start self.stop = orig.stop self.line = orig.line self.column = orig.column self.input = orig.input end def concrete? input && start && stop ? true : false end def imaginary? input && start && stop ? false : true end def name token_name( type ) end def source_name i = input and i.source_name end def hidden? channel == HIDDEN_CHANNEL end def source_text concrete? ? input.substring( start, stop ) : text end # # Sets the token's channel value to HIDDEN_CHANNEL # def hide! self.channel = HIDDEN_CHANNEL end def inspect text_inspect = text ? "[#{ text.inspect }] " : ' ' text_position = line > 0 ? "@ line #{ line } col #{ column } " : '' stream_position = start ? "(#{ range.inspect })" : '' front = index >= 0 ? "#{ index } " : '' rep = front << name << text_inspect << text_position << stream_position rep.strip! channel == DEFAULT_CHANNEL or rep << " (#{ channel.to_s })" return( rep ) end def pretty_print( printer ) printer.text( inspect ) end def range start..stop rescue nil end def to_i index.to_i end def to_s text.to_s end private def token_name( type ) BUILT_IN_TOKEN_NAMES[ type ] end end CommonToken = Struct.new( :type, :channel, :text, :input, :start, :stop, :index, :line, :column ) =begin rdoc ANTLR3::CommonToken The base class for the standard implementation of Token. It is implemented as a simple Struct as tokens are basically simple data structures binding together a bunch of different information and Structs are slightly faster than a standard Object with accessor methods implementation. By default, ANTLR generated ruby code will provide a customized subclass of CommonToken to track token-type names efficiently for debugging, inspection, and general utility. Thus code generated for a standard combo lexer-parser grammar named XYZ will have a base module named XYZ and a customized CommonToken subclass named XYZ::Token. Here is the token structure attribute list in order: * <tt>type</tt> * <tt>channel</tt> * <tt>text</tt> * <tt>input</tt> * <tt>start</tt> * <tt>stop</tt> * <tt>index</tt> * <tt>line</tt> * <tt>column</tt> =end class CommonToken include Token DEFAULT_VALUES = { :channel => DEFAULT_CHANNEL, :index => -1, :line => 0, :column => -1 }.freeze def self.token_name( type ) BUILT_IN_TOKEN_NAMES[ type ] end def self.create( fields = {} ) fields = DEFAULT_VALUES.merge( fields ) args = members.map { |name| fields[ name.to_sym ] } new( *args ) end # allows you to make a copy of a token with a different class def self.from_token( token ) new( token.type, token.channel, token.text ? token.text.clone : nil, token.input, token.start, token.stop, -1, token.line, token.column ) end def initialize( type = nil, channel = DEFAULT_CHANNEL, text = nil, input = nil, start = nil, stop = nil, index = -1, line = 0, column = -1 ) super block_given? and yield( self ) self.text.nil? && self.start && self.stop and self.text = self.input.substring( self.start, self.stop ) end alias :input_stream :input alias :input_stream= :input= alias :token_index :index alias :token_index= :index= end module Constants # End of File / End of Input character and token type EOF_TOKEN = CommonToken.new( EOF ).freeze INVALID_TOKEN = CommonToken.new( INVALID_TOKEN_TYPE ).freeze SKIP_TOKEN = CommonToken.new( INVALID_TOKEN_TYPE ).freeze end =begin rdoc ANTLR3::TokenSource TokenSource is a simple mixin module that demands an implementation of the method #next_token. In return, it defines methods #next and #each, which provide basic iterator methods for token generators. Furthermore, it includes Enumerable to provide the standard Ruby iteration methods to token generators, like lexers. =end module TokenSource include Constants include Enumerable extend ClassMacros abstract :next_token def next token = next_token() raise StopIteration if token.nil? || token.type == EOF return token end def each block_given? or return enum_for( :each ) while token = next_token and token.type != EOF yield( token ) end return self end def to_stream( options = {} ) if block_given? CommonTokenStream.new( self, options ) { | t, stream | yield( t, stream ) } else CommonTokenStream.new( self, options ) end end end =begin rdoc ANTLR3::TokenFactory There are a variety of different entities throughout the ANTLR runtime library that need to create token objects This module serves as a mixin that provides methods for constructing tokens. Including this module provides a +token_class+ attribute. Instance of the including class can create tokens using the token class (which defaults to ANTLR3::CommonToken). Token classes are presumed to have an #initialize method that can be called without any parameters and the token objects are expected to have the standard token attributes (see ANTLR3::Token). =end module TokenFactory attr_writer :token_class def token_class @token_class ||= begin self.class.token_class rescue self::Token rescue ANTLR3::CommonToken end end def create_token( *args ) if block_given? token_class.new( *args ) do |*targs| yield( *targs ) end else token_class.new( *args ) end end end =begin rdoc ANTLR3::TokenScheme TokenSchemes exist to handle the problem of defining token types as integer values while maintaining meaningful text names for the types. They are dynamically defined modules that map integer values to constants with token-type names. --- Fundamentally, tokens exist to take a chunk of text and identify it as belonging to some category, like "VARIABLE" or "INTEGER". In code, the category is represented by an integer -- some arbitrary value that ANTLR will decide to use as it is creating the recognizer. The purpose of using an integer (instead of say, a ruby symbol) is that ANTLR's decision logic often needs to test whether a token's type falls within a range, which is not possible with symbols. The downside of token types being represented as integers is that a developer needs to be able to reference the unknown type value by name in action code. Furthermore, code that references the type by name and tokens that can be inspected with names in place of type values are more meaningful to a developer. Since ANTLR requires token type names to follow capital-letter naming conventions, defining types as named constants of the recognizer class resolves the problem of referencing type values by name. Thus, a token type like ``VARIABLE'' can be represented by a number like 5 and referenced within code by +VARIABLE+. However, when a recognizer creates tokens, the name of the token's type cannot be seen without using the data defined in the recognizer. Of course, tokens could be defined with a name attribute that could be specified when tokens are created. However, doing so would make tokens take up more space than necessary, as well as making it difficult to change the type of a token while maintaining a correct name value. TokenSchemes exist as a technique to manage token type referencing and name extraction. They: 1. keep token type references clear and understandable in recognizer code 2. permit access to a token's type-name independently of recognizer objects 3. allow multiple classes to share the same token information == Building Token Schemes TokenScheme is a subclass of Module. Thus, it has the method <tt>TokenScheme.new(tk_class = nil) { ... module-level code ...}</tt>, which will evaluate the block in the context of the scheme (module), similarly to Module#module_eval. Before evaluating the block, <tt>.new</tt> will setup the module with the following actions: 1. define a customized token class (more on that below) 2. add a new constant, TOKEN_NAMES, which is a hash that maps types to names 3. dynamically populate the new scheme module with a couple instance methods 4. include ANTLR3::Constants in the new scheme module As TokenScheme the class functions as a metaclass, figuring out some of the scoping behavior can be mildly confusing if you're trying to get a handle of the entity for your own purposes. Remember that all of the instance methods of TokenScheme function as module-level methods of TokenScheme instances, ala +attr_accessor+ and friends. <tt>TokenScheme#define_token(name_symbol, int_value)</tt> adds a constant definition <tt>name_symbol</tt> with the value <tt>int_value</tt>. It is essentially like <tt>Module#const_set</tt>, except it forbids constant overwriting (which would mess up recognizer code fairly badly) and adds an inverse type-to-name map to its own <tt>TOKEN_NAMES</tt> table. <tt>TokenScheme#define_tokens</tt> is a convenience method for defining many types with a hash pairing names to values. <tt>TokenScheme#register_name(value, name_string)</tt> specifies a custom type-to-name definition. This is particularly useful for the anonymous tokens that ANTLR generates for literal strings in the grammar specification. For example, if you refer to the literal <tt>'='</tt> in some parser rule in your grammar, ANTLR will add a lexer rule for the literal and give the token a name like <tt>T__<i>x</i></tt>, where <tt><i>x</i></tt> is the type's integer value. Since this is pretty meaningless to a developer, generated code should add a special name definition for type value <tt><i>x</i></tt> with the string <tt>"'='"</tt>. === Sample TokenScheme Construction TokenData = ANTLR3::TokenScheme.new do define_tokens( :INT => 4, :ID => 6, :T__5 => 5, :WS => 7 ) # note the self:: scoping below is due to the fact that # ruby lexically-scopes constant names instead of # looking up in the current scope register_name(self::T__5, "'='") end TokenData::ID # => 6 TokenData::T__5 # => 5 TokenData.token_name(4) # => 'INT' TokenData.token_name(5) # => "'='" class ARecognizerOrSuch < ANTLR3::Parser include TokenData ID # => 6 end == Custom Token Classes and Relationship with Tokens When a TokenScheme is created, it will define a subclass of ANTLR3::CommonToken and assigned it to the constant name +Token+. This token class will both include and extend the scheme module. Since token schemes define the private instance method <tt>token_name(type)</tt>, instances of the token class are now able to provide their type names. The Token method <tt>name</tt> uses the <tt>token_name</tt> method to provide the type name as if it were a simple attribute without storing the name itself. When a TokenScheme is included in a recognizer class, the class will now have the token types as named constants, a type-to-name map constant +TOKEN_NAMES+, and a grammar-specific subclass of ANTLR3::CommonToken assigned to the constant Token. Thus, when recognizers need to manufacture tokens, instead of using the generic CommonToken class, they can create tokens using the customized Token class provided by the token scheme. If you need to use a token class other than CommonToken, you can pass the class as a parameter to TokenScheme.new, which will be used in place of the dynamically-created CommonToken subclass. =end class TokenScheme < ::Module include TokenFactory def self.new( tk_class = nil, &body ) super() do tk_class ||= Class.new( ::ANTLR3::CommonToken ) self.token_class = tk_class const_set( :TOKEN_NAMES, ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.clone ) @types = ::ANTLR3::Constants::BUILT_IN_TOKEN_NAMES.invert @unused = ::ANTLR3::Constants::MIN_TOKEN_TYPE scheme = self define_method( :token_scheme ) { scheme } define_method( :token_names ) { scheme::TOKEN_NAMES } define_method( :token_name ) do |type| begin token_names[ type ] or super rescue NoMethodError ::ANTLR3::CommonToken.token_name( type ) end end module_function :token_name, :token_names include ANTLR3::Constants body and module_eval( &body ) end end def self.build( *token_names ) token_names = [ token_names ].flatten! token_names.compact! token_names.uniq! tk_class = Class === token_names.first ? token_names.shift : nil value_maps, names = token_names.partition { |i| Hash === i } new( tk_class ) do for value_map in value_maps define_tokens( value_map ) end for name in names define_token( name ) end end end def included( mod ) super mod.extend( self ) end private :included attr_reader :unused, :types def define_tokens( token_map = {} ) for token_name, token_value in token_map define_token( token_name, token_value ) end return self end def define_token( name, value = nil ) name = name.to_s if current_value = @types[ name ] # token type has already been defined # raise an error unless value is the same as the current value value ||= current_value unless current_value == value raise NameError.new( "new token type definition ``#{ name } = #{ value }'' conflicts " << "with existing type definition ``#{ name } = #{ current_value }''", name ) end else value ||= @unused if name =~ /^[A-Z]\w*$/ const_set( name, @types[ name ] = value ) else constant = "T__#{ value }" const_set( constant, @types[ constant ] = value ) @types[ name ] = value end register_name( value, name ) unless built_in_type?( value ) end value >= @unused and @unused = value + 1 return self end def register_names( *names ) if names.length == 1 and Hash === names.first names.first.each do |value, name| register_name( value, name ) end else names.each_with_index do |name, i| type_value = Constants::MIN_TOKEN_TYPE + i register_name( type_value, name ) end end end def register_name( type_value, name ) name = name.to_s.freeze if token_names.has_key?( type_value ) current_name = token_names[ type_value ] current_name == name and return name if current_name == "T__#{ type_value }" # only an anonymous name is registered -- upgrade the name to the full literal name token_names[ type_value ] = name elsif name == "T__#{ type_value }" # ignore name downgrade from literal to anonymous constant return current_name else error = NameError.new( "attempted assignment of token type #{ type_value }" << " to name #{ name } conflicts with existing name #{ current_name }", name ) raise error end else token_names[ type_value ] = name.to_s.freeze end end def built_in_type?( type_value ) Constants::BUILT_IN_TOKEN_NAMES.fetch( type_value, false ) and true end def token_defined?( name_or_value ) case value when Integer then token_names.has_key?( name_or_value ) else const_defined?( name_or_value.to_s ) end end def []( name_or_value ) case name_or_value when Integer then token_names.fetch( name_or_value, nil ) else const_get( name_or_value.to_s ) rescue token_names.index( name_or_value ) end end def token_class self::Token end def token_class=( klass ) Class === klass or raise( TypeError, "token_class must be a Class" ) Util.silence_warnings do klass < self or klass.send( :include, self ) const_set( :Token, klass ) end end end end