Java程序  |  187行  |  6.79 KB

/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.streamhtmlparser.impl;

import com.google.common.base.Preconditions;

/**
 * Holds a state table which is defined as the set of all
 * recognized state transitions and the set of characters that
 * trigger them.
 *
 * <p>The logic of what character causes what state transition is derived from
 * a base definition written as a Python configuration file in the original
 * C++ parser.
 *
 * <p>This class provides methods to initially build the state table and then
 * methods at parsing time to determine the transitions to subsequent states.
 *
 * <p>Note on characters outside the extended ASCII range: Currently, all state
 * transitions in the Python configuration file trigger only on extended
 * ASCII characters, that is characters in the Unicode space of [U+0000 to
 * U+00FF]. We use that property to design a more efficient state transition
 * representation. When receiving characters outside that ASCII range, we
 * simply apply the DEFAULT transition for the given state - as we do for any
 * character that is not a hot character for that state. If no default
 * transition exists, we switch to the Internal Error state.
 *
 * <p>Technical note: In Java, a {@code char} is a code unit and in some cases
 * may not represent a complete Unicode code point. However, when that happens,
 * the code units that follow for that code point are all in the surrogate area
 * [U+D800 - U+DFFF] and hence outside of the ASCII range and will not trigger
 * any incorrect state transitions.
 *
 * <p>This class is storage-inefficient but it is static at least
 * and not generated for each Parser instance.
 */
class ParserStateTable {

  /**
   * A limit on how many different states we can have in one state table.
   * Can be increased should it no longer be sufficient.
   */
  private static final int MAX_STATES = 256;

  /**
   * We only check transitions for (extended) ASCII characters, hence
   * characters in the range 0 to MAX_CHARS -1.
   */
  private static final int MAX_CHARS = 256;

  /**
   * Records all state transitions except those identified as DEFAULT
   * transitions. It is two dimensional: Stores a target {@code InternalState}
   * given a source state (referenced by its numeric ID) and the current
   * character.
   */
  private final InternalState[][] stateTable;

  /**
   * Records all DEFAULT state transitions. These are transitions provided
   * using the {@code "[:default:]"} syntax in the Python configuration file.
   * There can be only one such transition for any given source state, hence
   * the array is one dimensional.
   */
  private final InternalState[] defaultStateTable;

  public ParserStateTable() {
    stateTable = new InternalState[MAX_STATES][MAX_CHARS];
    defaultStateTable = new InternalState[MAX_STATES];
  }

  /**
   * Returns the state to go to when receiving the current {@code char}
   * in the {@code from} state.
   * Returns {@code InternalState.INTERNAL_ERROR_STATE} if there is no
   * state transition for that character and no default state transition
   * for that state.
   *
   * <p>For ASCII characters, first look-up an explicit state transition for
   * the current character. If none is found, look-up a default transition. For
   * non-ASCII characters, look-up a default transition only.
   *
   * @param from the source state
   * @param currentChar the character received
   * @return the state to move to or {@code InternalState.INTERNAL_ERROR_STATE}
   */
  InternalState getNextState(InternalState from, int currentChar) {
    // TODO: Consider throwing run-time error here.
    if (from == null || currentChar < 0)
      return InternalState.INTERNAL_ERROR_STATE;

    int id = from.getId();
    if (id < 0 || id >= MAX_STATES) {
      return InternalState.INTERNAL_ERROR_STATE;
    }

    InternalState result = null;
    if (currentChar < MAX_CHARS) {
      result = stateTable[id][currentChar];
    }
    if (result == null) {
        result = defaultStateTable[from.getId()];
    }
    return result != null ? result : InternalState.INTERNAL_ERROR_STATE;
  }

  void setExpression(String expr, InternalState from, InternalState to) {
    if ((expr == null) || (from == null) || (to == null)) {
      return;
    }

    // This special string indicates a default state transition.
    if ("[:default:]".equals(expr)) {
      setDefaultDestination(from, to);
      return;
    }
    int i = 0;
    while (i < expr.length()) {
      // If next char is a '-' which is not the last character of the expr
      if (i < (expr.length() - 2) && expr.charAt(i + 1) == '-') {
        setRange(from, expr.charAt(i), expr.charAt(i + 2), to);
        i += 2;
      } else {
        setDestination(from, expr.charAt(i), to);
        i++;
      }
    }
  }

  private void fill(InternalState from, InternalState to) {
    char c;
    for (c = 0; c < MAX_CHARS; c++) {
      setDestination(from, c, to);
    }
  }

  private void setDefaultDestination(InternalState from, InternalState to) {
    Preconditions.checkNotNull(from);   // Developer error if it triggers
    Preconditions.checkNotNull(to);     // Developer error if it triggers
    int id = from.getId();
    if ((id < 0) || (id >= MAX_STATES)) {
      return;
    }
    // TODO: Consider asserting if there was a state transition defined.
    defaultStateTable[from.getId()] = to;
  }

  private void setDestination(InternalState from, char chr, InternalState to) {
    Preconditions.checkNotNull(from);   // Developer error if it triggers
    Preconditions.checkNotNull(to);     // Developer error if it triggers
    Preconditions.checkArgument(chr >= 0 && chr < MAX_CHARS,
                                "char must be in ASCII set: %c", chr);
    int id = from.getId();
    if ((id < 0) || (id >= MAX_STATES)) {
      return;
    }
    stateTable[from.getId()][chr] = to;
  }

  private void setRange(InternalState from, char start, char end,
                        InternalState to) {
    // Developer error if either trigger.
    Preconditions.checkArgument(start >= 0 && start < MAX_CHARS,
                                "char must be in ASCII set: %c", start);
    Preconditions.checkArgument(end >= 0 && end < MAX_CHARS,
                                "char must be in ASCII set: %c", end);
    char c;
    for (c = start; c <= end; c++) {
      setDestination(from, c, to);
    }
  }
}