// Listing header: Java program  |  476 lines  |  15.5 KB

package java_cup;

import java.util.Hashtable;

import java_cup.runtime.str_token;
import java_cup.runtime.token;

/** This class implements a small scanner (aka lexical analyzer or lexer) for
 *  the JavaCup specification.  This scanner reads characters from standard 
 *  input (System.in) and returns token objects built from the terminal 
 *  number of the next token.  Once end of input is reached the EOF token is 
 *  returned on every subsequent call.<p>
 *  Tokens currently returned include: <pre>
 *    Symbol        Constant Returned     Symbol        Constant Returned
 *    ------        -----------------     ------        -----------------
 *    "package"     PACKAGE               "import"      IMPORT 
 *    "code"        CODE                  "action"      ACTION 
 *    "parser"      PARSER                "terminal"    TERMINAL
 *    "non"         NON                   "init"        INIT 
 *    "scan"        SCAN                  "with"        WITH
 *    "start"       START                   ;           SEMI 
 *      ,           COMMA                   *           STAR 
 *      .           DOT                     :           COLON
 *      ::=         COLON_COLON_EQUALS      |           BAR
 *    identifier    ID                    {:...:}       CODE_STRING
 *    "debug"       DEBUG
 *  </pre>
 *  All symbol constants are defined in sym.java which is generated by 
 *  JavaCup from parser.cup.<p>
 * 
 *  In addition to the scanner proper (called first via init() then with
 *  next_token() to get each token) this class provides simple error and 
 *  warning routines and keeps a count of errors and warnings that is 
 *  publicly accessible.<p>
 *  
 *  This class is "static" (i.e., it has only static members and methods).
 *
 * @version last updated: 11/25/95
 * @author  Scott Hudson
 */
public class lexer {

  /*-----------------------------------------------------------*/
  /*--- Constructor(s) ----------------------------------------*/
  /*-----------------------------------------------------------*/

  /** The only constructor is private, so no instances can be created. */
  private lexer() { }

  /*-----------------------------------------------------------*/
  /*--- Static (Class) Variables ------------------------------*/
  /*-----------------------------------------------------------*/

  /** First character of lookahead (the character currently being scanned). */
  protected static int next_char;

  /** Second character of lookahead (the character following next_char). */
  protected static int next_char2;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** EOF constant; this is the value System.in.read() yields at end of input. */
  protected static final int EOF_CHAR = -1;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Table of keywords.  Keywords are initially scanned as identifiers.
   *  Just before an identifier is returned it is looked up in this table to
   *  see whether it is really a keyword.  The keyword text (a String) is the
   *  key; the value is an Integer holding the symbol number.
   */
  protected static Hashtable keywords = new Hashtable(23);

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Table of single character symbols.  All unambiguous single character
   *  tokens are stored here as Integer symbol numbers, keyed by an Integer
   *  holding the numerical value of the character.
   */
  protected static Hashtable char_symbols = new Hashtable(11);

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Current line number for use in error messages. */
  protected static int current_line = 1;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Character position in current line. */
  protected static int current_position = 1;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Count of total errors detected so far. */
  public static int error_count = 0;

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Count of warnings issued so far. */
  public static int warning_count = 0;

  /*-----------------------------------------------------------*/
  /*--- Static Methods ----------------------------------------*/
  /*-----------------------------------------------------------*/

  /** Initialize the scanner.  This fills the keywords and char_symbols
   *  tables and reads the first two characters of lookahead from System.in.
   * @throws java.io.IOException if reading System.in fails.
   */
  public static void init() throws java.io.IOException
    {
      /* set up the keyword table */
      keywords.put("package",  new Integer(sym.PACKAGE));
      keywords.put("import",   new Integer(sym.IMPORT));
      keywords.put("code",     new Integer(sym.CODE));
      keywords.put("action",   new Integer(sym.ACTION));
      keywords.put("parser",   new Integer(sym.PARSER));
      keywords.put("terminal", new Integer(sym.TERMINAL));
      keywords.put("non",      new Integer(sym.NON));
      keywords.put("init",     new Integer(sym.INIT));
      keywords.put("scan",     new Integer(sym.SCAN));
      keywords.put("with",     new Integer(sym.WITH));
      keywords.put("start",    new Integer(sym.START));
      keywords.put("debug",    new Integer(sym.DEBUG));

      /* set up the table of single character symbols */
      char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
      char_symbols.put(new Integer(','), new Integer(sym.COMMA));
      char_symbols.put(new Integer('*'), new Integer(sym.STAR));
      char_symbols.put(new Integer('.'), new Integer(sym.DOT));
      char_symbols.put(new Integer('|'), new Integer(sym.BAR));

      /* read two characters of lookahead; never read again after EOF */
      next_char  = System.in.read();
      next_char2 = (next_char == EOF_CHAR) ? EOF_CHAR : System.in.read();
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Advance the scanner one character in the input stream.  This moves
   *  next_char2 to next_char, reads a new next_char2 (unless EOF has been
   *  seen), and updates the line/position counters for the character that
   *  was consumed.  Once next_char is EOF_CHAR there is nothing left to
   *  consume, so the call does nothing; in particular the position counter
   *  no longer drifts when callers advance at end of input.
   * @throws java.io.IOException if reading System.in fails.
   */
  protected static void advance() throws java.io.IOException
    {
      /* at EOF both lookahead slots already hold EOF_CHAR */
      if (next_char == EOF_CHAR) return;

      int consumed = next_char;

      /* shift the lookahead window, but never read past EOF */
      next_char  = next_char2;
      next_char2 = (next_char == EOF_CHAR) ? EOF_CHAR : System.in.read();

      /* account for the consumed character */
      if (consumed == '\n')
        {
          current_line++;
          current_position = 1;
        }
      else
        current_position++;
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Print one diagnostic line on System.err, tagged with the current line
   *  number and the position in the line.
   * @param label   leading word of the message ("Error" or "Warning").
   * @param message the message to print.
   */
  private static void report(String label, String message)
    {
      System.err.println(label + " at " + current_line + "(" +
                         current_position + "): " + message);
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Emit an error message.  The message will be marked with both the
   *  current line number and the position in the line.  Error messages
   *  are printed on standard error (System.err) and counted in error_count.
   * @param message the message to print.
   */
  public static void emit_error(String message)
    {
      report("Error", message);
      error_count++;
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Emit a warning message.  The message will be marked with both the
   *  current line number and the position in the line.  Messages are
   *  printed on standard error (System.err) and counted in warning_count.
   * @param message the message to print.
   */
  public static void emit_warn(String message)
    {
      report("Warning", message);
      warning_count++;
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Determine if a character is ok to start an id: an ASCII letter or an
   *  underscore.
   * @param ch the character in question.
   * @return true if ch may begin an identifier.
   */
  protected static boolean id_start_char(int ch)
    {
      return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
             (ch == '_');

      // later need to deal with non-8-bit chars here
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Determine if a character is ok for the middle of an id: anything
   *  id_start_char() accepts, or an ASCII digit.
   * @param ch the character in question.
   * @return true if ch may continue an identifier.
   */
  protected static boolean id_char(int ch)
    {
      return id_start_char(ch) || (ch >= '0' && ch <= '9');
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Try to look up a single character symbol.
   * @param ch the character in question.
   * @return the symbol number registered in char_symbols, or -1 if ch is
   *         not a single character symbol (this includes EOF_CHAR).
   */
  protected static int find_single_char(int ch)
    {
      /* key on the int value itself; narrowing to char first would alias
         out-of-range values onto real symbols */
      Integer found = (Integer)char_symbols.get(new Integer(ch));
      return (found == null) ? -1 : found.intValue();
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Handle swallowing up a comment.  Both old style C comments and new
   *  style C++ comments are handled.  The caller must have next_char
   *  positioned on the opening '/'.  An unterminated C comment is reported
   *  through emit_error().
   * @throws java.io.IOException if reading System.in fails.
   */
  protected static void swallow_comment() throws java.io.IOException
    {
      /* next_char == '/' at this point */

      /* traditional comment */
      if (next_char2 == '*')
        {
          /* swallow the opener */
          advance(); advance();

          /* swallow the body until the closer is visible */
          while (next_char != EOF_CHAR)
            {
              if (next_char == '*' && next_char2 == '/')
                {
                  /* swallow the closer and we are done */
                  advance(); advance();
                  return;
                }
              advance();
            }

          /* ran out of input before seeing the closer */
          emit_error("Specification file ends inside a comment");
          return;
        }

      /* new style comment */
      if (next_char2 == '/')
        {
          /* swallow the opener */
          advance(); advance();

          /* swallow to '\n', '\r', '\f', or EOF; the terminator itself is
             left for the main scan loop, which skips it as white space */
          while (next_char != '\n' && next_char != '\r' &&
                 next_char != '\f' && next_char != EOF_CHAR)
            advance();

          return;
        }

      /* shouldn't get here, but... if we get here we have an error */
      emit_error("Malformed comment in specification -- ignored");
      advance();
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Swallow up a code string.  Code strings begin with "{:" and include
   *  all characters up to the first occurrence of ":}" (there is no way to
   *  include ":}" inside a code string).  If input ends before the closer
   *  an error is emitted and the text collected so far is returned.
   * @return a str_token tagged sym.CODE_STRING that carries the text found
   *         between the delimiters.
   * @throws java.io.IOException if reading System.in fails.
   */
  protected static token do_code_string() throws java.io.IOException
    {
      StringBuffer text = new StringBuffer();

      /* at this point we have lookahead of "{:" -- swallow that */
      advance(); advance();

      /* save chars until we see ":}" */
      while (!(next_char == ':' && next_char2 == '}'))
        {
          /* ran off the end: report it and return without trying to skip
             a closer that is not there */
          if (next_char == EOF_CHAR)
            {
              emit_error("Specification file ends inside a code string");
              return new str_token(sym.CODE_STRING, text.toString());
            }

          /* otherwise record the char and move on */
          text.append((char)next_char);
          advance();
        }

      /* advance past the closer and build a return token */
      advance(); advance();
      return new str_token(sym.CODE_STRING, text.toString());
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Process an identifier.  Identifiers begin with a letter or underscore
   *  (see id_start_char()), followed by zero or more letters, digits, or
   *  underscores (see id_char()).  Dollar signs are not accepted.  The
   *  caller must have next_char positioned on the first character.
   * @return a plain token for the keyword if the text is found in the
   *         keywords table, otherwise a str_token tagged sym.ID carrying
   *         the identifier text.
   * @throws java.io.IOException if reading System.in fails.
   */
  protected static token do_id() throws java.io.IOException
    {
      StringBuffer name = new StringBuffer();

      /* next_char holds the first character of the id; keep collecting
         for as long as the lookahead still fits in an id */
      do
        {
          name.append((char)next_char);
          advance();
        }
      while (id_char(next_char));

      /* extract a string and try to look it up as a keyword */
      String  id_text     = name.toString();
      Integer keyword_num = (Integer)keywords.get(id_text);

      /* if we found something, return that keyword */
      if (keyword_num != null)
        return new token(keyword_num.intValue());

      /* otherwise build and return an id token with an attached string */
      return new str_token(sym.ID, id_text);
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Return one token.  This is the main external interface to the scanner.
   *  It consumes sufficient characters to determine the next input token
   *  and returns it.  To help with debugging, this routine actually calls
   *  real_next_token() which does the work.  If you need to debug the
   *  parser, this can be changed to call debug_next_token() which prints
   *  a debugging message before returning the token.
   * @return the next token of the input.
   * @throws java.io.IOException if reading System.in fails.
   */
  public static token next_token() throws java.io.IOException
    {
      return real_next_token();
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Debugging version of next_token().  This routine calls the real scanning
   *  routine, prints a message on System.out showing the sym field of the
   *  token, then returns it.
   * @return the next token of the input.
   * @throws java.io.IOException if reading System.in fails.
   */
  public static token debug_next_token() throws java.io.IOException
    {
      token result = real_next_token();
      System.out.println("# next_token() => " + result.sym);
      return result;
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Determine if a character is white space that the scanner skips
   *  between tokens (space, tab, newline, form feed, carriage return).
   * @param ch the character in question.
   */
  private static boolean is_white_space(int ch)
    {
      return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r';
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** Scan a token that starts with ':'.  The caller must have next_char
   *  positioned on the ':'.  The input "::=" is consumed whole; in every
   *  other case exactly one ':' is consumed.
   * @return a COLON_COLON_EQUALS token for "::=", otherwise a COLON token.
   * @throws java.io.IOException if reading System.in fails.
   */
  private static token scan_colon() throws java.io.IOException
    {
      /* remember whether a second ':' follows, then consume the first */
      boolean second_colon = (next_char2 == ':');
      advance();

      /* after the advance next_char2 is the third character of the input */
      if (second_colon && next_char2 == '=')
        {
          advance(); advance();
          return new token(sym.COLON_COLON_EQUALS);
        }

      /* just the one colon (already consumed) */
      return new token(sym.COLON);
    }

  /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/

  /** The actual routine to return one token.  This is normally called from
   *  next_token(), but for debugging purposes can be called indirectly from
   *  debug_next_token().  White space and comments are skipped; an
   *  unrecognized character produces a warning and is skipped as well.  At
   *  end of input a sym.EOF token is returned, and no input is consumed, so
   *  later calls return it again.
   * @return the next token of the input.
   * @throws java.io.IOException if reading System.in fails.
   */
  protected static token real_next_token() throws java.io.IOException
    {
      int sym_num;

      for (;;)
        {
          /* look for white space: advance past it and try again */
          if (is_white_space(next_char))
            {
              advance();
              continue;
            }

          /* look for a single character symbol */
          sym_num = find_single_char(next_char);
          if (sym_num != -1)
            {
              /* found one -- advance past it and return a token for it */
              advance();
              return new token(sym_num);
            }

          /* look for : or ::= */
          if (next_char == ':')
            return scan_colon();

          /* look for a comment: swallow it, then continue the scan */
          if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
            {
              swallow_comment();
              continue;
            }

          /* look for start of code string */
          if (next_char == '{' && next_char2 == ':')
            return do_code_string();

          /* look for an id or keyword */
          if (id_start_char(next_char))
            return do_id();

          /* look for EOF */
          if (next_char == EOF_CHAR)
            return new token(sym.EOF);

          /* if we get here, we have an unrecognized character */
          emit_warn("Unrecognized character '" +
                    (char)next_char + "'(" + next_char +
                    ") -- ignored");

          /* advance past it */
          advance();
        }
    }

  /*-----------------------------------------------------------*/
}