/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.streamhtmlparser;

/**
 * Methods exposed for HTML parsing of text to facilitate implementation
 * of Automatic context-aware escaping. The HTML parser also embeds a
 * Javascript parser for processing Javascript fragments. In the future,
 * it will also embed other specific parsers and hence most likely remain
 * the main interface to callers of this package.
 *
 * <p>The {@code STATE_*} constants declared here are all the states in which
 * the parser can be. These are external states. The parser has many more
 * internal states that are not exposed and which are instead mapped to one
 * of these external ones. All these states map exactly to those exposed in
 * the C++ (original) version of the HtmlParser.
 *
 * <p>Note: These are the exact methods exposed in the original C++ Parser. The
 * names are simply modified to conform to Java.
 */
public interface HtmlParser extends Parser {

  // Members of an interface are implicitly public (and fields implicitly
  // static final), so those modifiers are not repeated on the declarations.

  /**
   * The Parser Mode requested for parsing a given template.
   * Currently we support:
   * <ul>
   *   <li>{@code HTML} for HTML templates.
   *   <li>{@code JS} for javascript templates.
   *   <li>{@code CSS} for Cascading Style-Sheets templates.
   *   <li>{@code HTML_IN_TAG} for HTML templates that consist only of
   *       HTML attribute name and value pairs. This is typically the case for
   *       a template that is being included from a parent template where the
   *       parent template contains the start and the closing of the HTML tag.
   *       This is a special mode, for standard HTML templates please use
   *       {@link #HTML}.
   *       <p>An example of such a template is:
   *       <p>{@code class="someClass" target="_blank"}
   *       <p>Which could be included from a parent template that contains
   *       an anchor tag, say:
   *       <p>{@code <a href="/bla" ["INCLUDED_TEMPLATE"]>}
   * </ul>
   */
  enum Mode {
    HTML,
    JS,
    CSS,
    HTML_IN_TAG
  }

  /**
   * Indicates the type of HTML attribute that the parser is currently in or
   * {@code NONE} if the parser is not currently in an attribute.
   * {@code URI} is for attributes taking a URI such as "href" and "src".
   * {@code JS} is for attributes taking javascript such as "onclick".
   * {@code STYLE} is for the "style" attribute.
   * All other attributes fall under {@code REGULAR}.
   *
   * <p>Returned by {@link HtmlParser#getAttributeType()}.
   */
  enum ATTR_TYPE {
    NONE,
    REGULAR,
    URI,
    JS,
    STYLE
  }

  /** The parser is in HTML proper. */
  ExternalState STATE_TEXT = new ExternalState("STATE_TEXT");

  /** The parser is inside an HTML tag name. */
  ExternalState STATE_TAG = new ExternalState("STATE_TAG");

  /** The parser is inside an HTML comment. */
  ExternalState STATE_COMMENT = new ExternalState("STATE_COMMENT");

  /** The parser is inside an HTML attribute name. */
  ExternalState STATE_ATTR = new ExternalState("STATE_ATTR");

  /** The parser is inside an HTML attribute value. */
  ExternalState STATE_VALUE = new ExternalState("STATE_VALUE");

  /** The parser is inside javascript code. */
  ExternalState STATE_JS_FILE = new ExternalState("STATE_JS_FILE");

  /** The parser is inside CSS code. */
  ExternalState STATE_CSS_FILE = new ExternalState("STATE_CSS_FILE");

  /**
   * Returns {@code true} if the parser is currently processing Javascript.
   * Such is the case if and only if the parser is processing an attribute
   * that takes Javascript, a Javascript script block or the parser
   * is (re)set with {@link Mode#JS}.
   *
   * @return {@code true} if the parser is processing Javascript,
   *         {@code false} otherwise
   */
  boolean inJavascript();

  /**
   * Returns {@code true} if the parser is currently processing
   * a Javascript literal that is quoted. The caller will typically
   * invoke this method after determining that the parser is processing
   * Javascript. Knowing whether the element is quoted or not helps
   * determine which escaping to apply to it when needed.
   *
   * @return {@code true} if and only if the parser is inside a quoted
   *         Javascript literal
   */
  boolean isJavascriptQuoted();

  /**
   * Returns {@code true} if and only if the parser is currently within
   * an attribute, be it within the attribute name or the attribute value.
   *
   * @return {@code true} if and only if inside an attribute
   */
  boolean inAttribute();

  /**
   * Returns {@code true} if and only if the parser is currently within
   * a CSS context. A CSS context is one of the below:
   * <ul>
   *   <li>Inside a STYLE tag.
   *   <li>Inside a STYLE attribute.
   *   <li>Inside a CSS file when the parser was reset in the CSS mode.
   * </ul>
   *
   * @return {@code true} if and only if the parser is inside CSS
   */
  boolean inCss();

  /**
   * Returns the type of the attribute that the parser is in
   * or {@code ATTR_TYPE.NONE} if we are not parsing an attribute.
   * The caller will typically invoke this method after determining
   * that the parser is processing an attribute.
   *
   * <p>This is useful to determine which escaping to apply based
   * on the type of value this attribute expects.
   *
   * @return type of the attribute
   * @see HtmlParser.ATTR_TYPE
   */
  ATTR_TYPE getAttributeType();

  /**
   * Returns {@code true} if and only if the parser is currently within
   * an attribute value and that attribute value is quoted.
   *
   * @return {@code true} if and only if the attribute value is quoted
   */
  boolean isAttributeQuoted();

  /**
   * Returns the name of the HTML tag if the parser is currently within one.
   * Note that the name may be incomplete if the parser is currently still
   * parsing the name. Returns an empty {@code String} if the parser is not
   * in a tag as determined by {@link #getState()}.
   *
   * @return the name of the HTML tag or an empty {@code String} if we are
   *         not within an HTML tag
   */
  String getTag();

  /**
   * Returns the name of the HTML attribute the parser is currently processing.
   * If the parser is still parsing the name, then the returned name
   * may be incomplete. Returns an empty {@code String} if the parser is not
   * in an attribute as determined by {@link #getState()}.
   *
   * @return the name of the HTML attribute or an empty {@code String}
   *         if we are not within an HTML attribute
   */
  String getAttribute();

  /**
   * Returns the value of an HTML attribute if the parser is currently
   * within one. If the parser is currently parsing the value, the returned
   * value may be incomplete. The caller will typically first determine
   * that the parser is processing a value by calling {@link #getState()}.
   *
   * @return the value, could be an empty {@code String} if the parser is not
   *         in an HTML attribute value
   */
  String getValue();

  /**
   * Returns the current position of the parser within the HTML attribute
   * value, zero being the position of the first character in the value.
   * The caller will typically first determine that the parser is
   * processing a value by calling {@link #getState()}.
   *
   * @return the index or zero if the parser is not processing a value
   */
  int getValueIndex();

  /**
   * Returns {@code true} if and only if the current position of the parser is
   * at the start of a URL HTML attribute value. This is the case when the
   * following three conditions are all met:
   * <ol>
   *   <li>The parser is in an HTML attribute value.
   *   <li>The HTML attribute expects a URL, as determined by
   *       {@link #getAttributeType()} returning {@link ATTR_TYPE#URI}.
   *   <li>The parser has not yet seen any characters from that URL.
   * </ol>
   *
   * <p>This method may be used by an Html Sanitizer or an Auto-Escape system
   * to determine whether to validate the URL for well-formedness and validate
   * the scheme of the URL (e.g. {@code HTTP}, {@code HTTPS}) is safe.
   * In particular, it is recommended to use this method instead of
   * checking that {@link #getValueIndex()} is {@code 0} to support attribute
   * types where the URL does not start at index zero, such as the
   * {@code content} attribute of the {@code meta} HTML tag.
   *
   * @return {@code true} if and only if the parser is at the start of the URL
   */
  boolean isUrlStart();

  /**
   * Resets the state of the parser, allowing for reuse of the
   * {@code HtmlParser} object.
   *
   * <p>See the {@link HtmlParser.Mode} enum for information on all
   * the valid modes.
   *
   * @param mode is an enum representing the high-level state of the parser
   */
  void resetMode(Mode mode);

  /**
   * A specialized directive to tell the parser there is some content
   * that will be inserted here but that it will not get to parse. Used
   * by the template system that may not be able to give some content
   * to the parser but wants it to know there typically will be content
   * inserted at that point. This is a hint used in corner cases within
   * parsing of HTML attribute names and values where content we do not
   * get to see could affect our parsing and alter our current state.
   *
   * <p>Note: This differs from the C++ Parser, whose equivalent always
   * returns {@code true}. This method returns nothing; a fatal error which
   * prevents the parser from continuing further parsing is reported by
   * throwing {@link ParseException} instead.
   *
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  void insertText() throws ParseException;

  /**
   * Returns the state the Javascript parser is in.
   *
   * <p>See {@link JavascriptParser} for more information on the valid
   * external states. The caller will typically first determine that the
   * parser is processing Javascript and then invoke this method to
   * obtain more fine-grained state information.
   *
   * @return external state of the javascript parser
   */
  ExternalState getJavascriptState();
}