/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.util; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; import java.util.Map; /** * <p>Decodes (unescapes) HTML entities with the complication that these * are received one character at a time hence must be stored temporarily. * Also, we may receive some "junk" characters before the actual * entity which we will discard. * * <p>This class is designed to be 100% compatible with the corresponding * logic in the C-version of the * {@link com.google.security.streamhtmlparser.HtmlParser}, found * in <code>htmlparser.c</code>. There are however a few intentional * differences outlines below: * <ul> * <li>We accept lower and upper-case hex NCRs, the C-version * accepts only lower-case ones. * <li>The output on some invalid inputs may be different. This is * currently in the process of consolidation with Filipe. * <li>The API is a bit different, I find this one better suited * for Java. In particular, the C method <code>processChar</code> * returns the output {@code String} whereas in Java, we return * a status code and then provide the {@code String} in a separate * method <code>getEntity</code>. It is cleaner as it avoids the * need to return empty {@code String}s during incomplete processing. * </ul> * * <p>Valid HTML entities have one of the following three forms: * <ul> * <li><code>&dd;</code> where dd is a number in decimal (base 10) form. * <li><code>&x|Xyy;</code> where yy is a hex-number (base 16). * <li><code>&<html-entity>;</code> where * <code><html-entity></code> is one of <code>lt</code>, * <code>gt</code>, <code>amp</code>, <code>quot</code> or * <code>apos</code>. * </ul> * * <p>A <code>reset</code> method is provided to facilitate object re-use. */ public class EntityResolver { /** * Returned in <code>processChar</code> method. * <p> * <ul> * <li><code>NOT_STARTED</code> indicates we are still processing * trailing characters before the start of an entity. * The caller may want to save the characters it provided us. * <li><code>IN_PROGRESS</code> indicates we are currently processing * characters part of an entity. * <li><code>COMPLETED</code> indicates we have finished processing * an entity. The caller can then invoke <code>getEntity</code> * then re-set the object for future re-use. * </ul> */ public enum Status { NOT_STARTED("Not Started"), IN_PROGRESS("In Progress"), COMPLETED("Completed"); private final String message; private Status(String message) { this.message = message; } /** * Returns a brief description of the {@code Status} for * debugging purposes. The format of the returned {@code String} * is not fully specified nor guaranteed to remain the same. */ @Override public String toString() { return message; } } /** * How many characters to store as we are processing an entity. Once we * reach that size, we know the entity is definitely invalid. The size * is higher than needed but keeping it as-is for compatibility with * the C-version. */ private static final int MAX_ENTITY_SIZE = 10; /** * Map containing the recognized HTML entities and their decoded values. * The trailing ';' is not included in the key but it is accounted for. */ private static final Map<String, String> HTML_ENTITIES_MAP = new ImmutableMap.Builder<String, String>() .put("<", "<") .put(">", ">") .put("&", "&") .put("&apos", "'") .build(); /** Storage for received until characters until an HTML entity is complete. */ private final StringBuilder sb; /** * Indicates the state we are in. see {@link EntityResolver.Status}. */ private Status status; private String entity; /** * Constructs an entity resolver that is initially empty and * with status {@code NOT_STARTED}, see {@link EntityResolver.Status}. * */ public EntityResolver() { sb = new StringBuilder(); status = Status.NOT_STARTED; entity = ""; } /** * Constructs an entity resolver that is an exact copy of * the one provided. In particular it has the same contents * and status. * * @param aEntityResolver the entity resolver to copy */ public EntityResolver(EntityResolver aEntityResolver) { sb = new StringBuilder(); sb.replace(0, sb.length(), aEntityResolver.sb.toString()); entity = aEntityResolver.entity; status = aEntityResolver.status; } /** * Returns the object to its original state for re-use, deleting any * stored characters that may be present. */ public void reset() { status = Status.NOT_STARTED; sb.setLength(0); entity = ""; } /** * Returns the full state of the <code>StreamEntityResolver</code> * in a human readable form. The format of the returned <code>String</code> * is not specified and is subject to change. * * @return full state of this object */ @Override public String toString() { return String.format("Status: %s; Contents (%d): %s", status.toString(), sb.length(), sb.toString()); } /** * Returns the decoded HTML Entity. Should only be called * after {@code processChar} returned status {@code COMPLETED}. * * @return the decoded HTML Entity or an empty {@code String} if * we were called with any status other than {@code COMPLETED} */ public String getEntity() { return entity; } /** * Processes a character from the input stream and decodes any html entities * from that processed input stream. * * @param input the {@code char} to process * @return the processed {@code String}. Typically returns an empty * {@code String} while awaiting for more characters to complete * processing of the entity. */ public Status processChar(char input) { // Developer error if the precondition fails. Preconditions.checkState(status != Status.NOT_STARTED || sb.length() == 0); if (status == Status.NOT_STARTED) { if (input == '&') { sb.append(input); status = Status.IN_PROGRESS; } } else if (status == Status.IN_PROGRESS) { if ((input == ';') || (HtmlUtils.isHtmlSpace(input))) { status = Status.COMPLETED; entity = convertEntity(input); } else { if (sb.length() < MAX_ENTITY_SIZE) { sb.append(input); } else { status = Status.COMPLETED; entity = uncovertedInput(input); } } } else { // Status.COMPLETED, ignore character, do nothing. } return status; } /** * Performs the decoding of a complete HTML entity and saves the * result back into the buffer. * <a href="http://www.w3.org/TR/REC-html40/charset.html#h-5.3.1"> * Numeric Character References</a> * * @param terminator the last character read, unused on successful * conversions since it is the end delimiter of the entity * @return The decoded entity or the original input if we could not decode it. */ private String convertEntity(char terminator) { // Developer error if the buffer was empty or does not start with '&'. Preconditions.checkArgument(sb.length() > 0); Preconditions.checkArgument(sb.charAt(0) == '&'); if (sb.length() > 1) { if (sb.charAt(1) == '#') { if (sb.length() <= 2) { // Error => return content as-is. return uncovertedInput(terminator); } try { if ((sb.charAt(2) == 'x') || (sb.charAt(2) == 'X')) { // Hex NCR return new String(Character.toChars( Integer.parseInt(sb.substring(3), 16))); } else { // Decimal NCR return new String(Character.toChars( Integer.parseInt(sb.substring(2)))); } } catch (NumberFormatException e) { return uncovertedInput(terminator); } } // See if it matches any of the few recognized entities. String key = sb.toString(); if (HTML_ENTITIES_MAP.containsKey(key)) { return HTML_ENTITIES_MAP.get(key); } } // Covers the case of a lonely '&' given or valid/invalid unknown entities. return uncovertedInput(terminator); } private String uncovertedInput(char terminator) { return String.format("%s%c", sb.toString(), terminator); } }