/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.impl; import com.google.common.base.Preconditions; import com.google.streamhtmlparser.ExternalState; import com.google.streamhtmlparser.Parser; import com.google.streamhtmlparser.ParseException; import com.google.streamhtmlparser.util.HtmlUtils; import java.util.Map; /** * An implementation of the {@code Parser} interface that is common to both * {@code HtmlParser} and {@code JavascriptParser}. * * <p>Provides methods for parsing input and ensuring that all in-state, * entering-a-state and exiting-a-state callbacks are invoked as appropriate. * * <p>This class started as abstract but it was found better for testing to * make it instantiatable so that the parsing logic can be tested with dummy * state transitions. */ public class GenericParser implements Parser { protected final ParserStateTable parserStateTable; protected final Map<InternalState, ExternalState> intToExtStateTable; protected final InternalState initialState; protected InternalState currentState; protected int lineNumber; protected int columnNumber; protected GenericParser(ParserStateTable parserStateTable, Map<InternalState, ExternalState> intToExtStateTable, InternalState initialState) { this.parserStateTable = parserStateTable; this.intToExtStateTable = intToExtStateTable; this.initialState = initialState; this.currentState = initialState; this.lineNumber = 1; this.columnNumber = 1; } /** * Constructs a generic parser that is an exact copy of the * one given. Note that here too, data structures that do not * change are shallow-copied (parser state table and state mappings). * * @param aGenericParser the {@code GenericParser} to copy */ protected GenericParser(GenericParser aGenericParser) { parserStateTable = aGenericParser.parserStateTable; intToExtStateTable = aGenericParser.intToExtStateTable; initialState = aGenericParser.initialState; currentState = aGenericParser.currentState; lineNumber = aGenericParser.lineNumber; columnNumber = aGenericParser.columnNumber; } /** * Tell the parser to process the provided {@code String}. This is just a * convenience method that wraps over {@link Parser#parse(char)}. * @param input the {@code String} to parse * @throws ParseException if an unrecoverable error occurred during parsing */ @Override public void parse(String input) throws ParseException { for (int i = 0; i < input.length(); i++) parse(input.charAt(i)); } /** * Main loop for parsing of input. * * <p>Absent any callbacks defined, this function simply determines the * next state to switch to based on the <code>ParserStateTable</code> which is * derived from a state-machine configuration file in the original C++ parser. * * <p>However some states have specific callbacks defined which when * receiving specific characters may decide to overwrite the next state to * go to. Hence the next state is a function both of the main state table * in {@code ParserStateTable} as well as specific run-time information * from the callback functions. * * <p>Also note that the callbacks are called in a proper sequence, * first the exit-state one then the enter-state one and finally the * in-state one. Changing the order may result in a functional change. * * @param input the input character to parse (process) * @throws ParseException if an unrecoverable error occurred during parsing */ @Override public void parse(char input) throws ParseException { InternalState nextState = parserStateTable.getNextState(currentState, input); if (nextState == InternalState.INTERNAL_ERROR_STATE) { String errorMsg = String.format("Unexpected character '%s' in int_state '%s' " + "(ext_state '%s')", HtmlUtils.encodeCharForAscii(input), currentState.getName(), getState().getName()); currentState = InternalState.INTERNAL_ERROR_STATE; throw new ParseException(this, errorMsg); } if (currentState != nextState) { nextState = handleExitState(currentState, nextState, input); } if (currentState != nextState) { nextState = handleEnterState(nextState, nextState, input); } nextState = handleInState(nextState, input); currentState = nextState; record(input); columnNumber++; if (input == '\n') { lineNumber++; columnNumber = 1; } } /** * Return the current state of the parser. */ @Override public ExternalState getState() { if (!intToExtStateTable.containsKey(currentState)) { throw new NullPointerException("Did not find external state mapping " + "For internal state: " + currentState); } return intToExtStateTable.get(currentState); } /** * Reset the parser back to its initial default state. */ @Override public void reset() { currentState = initialState; lineNumber = 1; columnNumber = 1; } /** * Sets the current line number which is returned during error messages. */ @Override public void setLineNumber(int lineNumber) { this.lineNumber = lineNumber; } /** * Returns the current line number. */ @Override public int getLineNumber() { return lineNumber; } /** * Sets the current column number which is returned during error messages. */ @Override public void setColumnNumber(int columnNumber) { this.columnNumber = columnNumber; } /** * Returns the current column number. */ @Override public int getColumnNumber() { return columnNumber; } InternalState getCurrentInternalState() { return currentState; } protected void setNextState(InternalState nextState) throws ParseException { Preconditions.checkNotNull(nextState); // Developer error if it triggers. /* We are not actually parsing hence providing * a null char to the event handlers. */ // TODO: Complicated logic to follow in C++ but clean it up. final char nullChar = '\0'; if (currentState != nextState) { nextState = handleExitState(currentState, nextState, nullChar); } if (currentState != nextState) { handleEnterState(nextState, nextState, nullChar); } currentState = nextState; } /** * Invoked when the parser enters a new state. * * @param currentState the current state of the parser * @param expectedNextState the next state according to the * state table definition * @param input the last character parsed * @return the state to change to, could be the same as the * {@code expectedNextState} provided * @throws ParseException if an unrecoverable error occurred during parsing */ protected InternalState handleEnterState(InternalState currentState, InternalState expectedNextState, char input) throws ParseException { return expectedNextState; } /** * Invoked when the parser exits a state. * * @param currentState the current state of the parser * @param expectedNextState the next state according to the * state table definition * @param input the last character parsed * @return the state to change to, could be the same as the * {@code expectedNextState} provided * @throws ParseException if an unrecoverable error occurred during parsing */ protected InternalState handleExitState(InternalState currentState, InternalState expectedNextState, char input) throws ParseException { return expectedNextState; } /** * Invoked for each character read when no state change occured. * * @param currentState the current state of the parser * @param input the last character parsed * @return the state to change to, could be the same as the * {@code expectedNextState} provided * @throws ParseException if an unrecoverable error occurred during parsing */ protected InternalState handleInState(InternalState currentState, char input) throws ParseException { return currentState; } /** * Perform some processing on the given character. Derived classes * may override this method in order to perform additional logic * on every processed character beyond the logic defined in * state transitions. * * @param input the input character to operate on */ protected void record(char input) { } }