/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser; import com.google.streamhtmlparser.impl.HtmlParserImpl; import java.util.Set; import java.util.logging.Logger; /** * A factory class to obtain instances of an {@link HtmlParser}. * Currently each instance is a new object given these are fairly * light-weight. * * <p>In the unlikely case that this class fails to initialize properly * (a developer error), an error is emitted to the error console and the logs * and the specialized parser creation methods will throw * an {@link AssertionError} on all invokations. */ public class HtmlParserFactory { private static final Logger logger = Logger.getLogger(HtmlParserFactory.class.getName()); /** * To provide additional options when creating an {@code HtmlParser} using * {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE, * boolean, Set)} */ public enum AttributeOptions { /** * Indicates that the attribute value is Javascript-quoted. Only takes * effect for Javascript-accepting attributes - as identified by * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also * HTML quoted. */ JS_QUOTED, /** * Indicates the attribute value is only a part of a URL as opposed to a * full URL. In particular, the value is not at the start of a URL and * hence does not necessitate validation of the URL scheme. * Only valid for URI-accepting attributes - as identified by * {@link HtmlParser.ATTR_TYPE#URI}. */ URL_PARTIAL, } /** * To provide additional options when creating an {@code HtmlParser} using * {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)} */ public enum ModeOptions { /** * Indicates that the parser is inside a quoted {@code String}. Only * valid in the {@link HtmlParser.Mode#JS} mode. */ JS_QUOTED } private static final HtmlParser parserInDefaultAttr = createParser(); private static final HtmlParser parserInDefaultAttrQ = createParser(); private static final HtmlParser parserInUriAttrComplete = createParser(); private static final HtmlParser parserInUriAttrQComplete = createParser(); private static final HtmlParser parserInUriAttrPartial = createParser(); private static final HtmlParser parserInUriAttrQPartial = createParser(); private static final HtmlParser parserInJsAttr = createParser(); private static final HtmlParser parserInJsAttrQ = createParser(); private static final HtmlParser parserInQJsAttr = createParser(); private static final HtmlParser parserInStyleAttr = createParser(); private static final HtmlParser parserInStyleAttrQ = createParser(); private static final HtmlParser parserInJsQ = createParser(); /** * Protects all the createParserXXX methods by throwing a run-time exception * if this class failed to initialize properly. */ private static boolean initSuccess = false; static { try { initializeParsers(); initSuccess = true; } catch (ParseException e) { // Log a severe error and print it to stderr along with a stack trace. String error = HtmlParserFactory.class.getName() + " Failed initialization: " + e.getMessage(); logger.severe(error); System.err.println(error); e.printStackTrace(); } } // Static class. private HtmlParserFactory() { } // COV_NF_LINE /** * Returns an {@code HtmlParser} object ready to parse HTML input. * * @return an {@code HtmlParser} in the provided mode */ public static HtmlParser createParser() { return new HtmlParserImpl(); } /** * Returns an {@code HtmlParser} object initialized with the * requested Mode. Provide non {@code null} options to provide * a more precise initialization with the desired Mode. * * @param mode the mode to reset the parser with * @param options additional options or {@code null} for none * @return an {@code HtmlParser} in the provided mode * @throws AssertionError when this class failed to initialize */ public static HtmlParser createParserInMode(HtmlParser.Mode mode, Set<ModeOptions> options) { requireInitialized(); if (options != null && options.contains(ModeOptions.JS_QUOTED)) return createParser(parserInJsQ); // With no options given, this method is just a convenience wrapper for // the two calls below. HtmlParser parser = new HtmlParserImpl(); parser.resetMode(mode); return parser; } /** * Returns an {@code HtmlParser} that is a copy of the one * supplied. It holds the same internal state and hence can * proceed with parsing in-lieu of the supplied parser. * * @param aHtmlParser a {@code HtmlParser} to copy from * @return an {@code HtmlParser} that is a copy of the provided one * @throws AssertionError when this class failed to initialize */ public static HtmlParser createParser(HtmlParser aHtmlParser) { requireInitialized(); // Should never get a ClassCastException since there is only one // implementation of the HtmlParser interface. return new HtmlParserImpl((HtmlParserImpl) aHtmlParser); } /** * A very specialized {@code HtmlParser} accessor that returns a parser * in a state where it expects to read the value of an attribute * of an HTML tag. This is only useful when the parser has not seen a * certain HTML tag and an attribute name and needs to continue parsing * from a state as though it has. * * <p>For example, to create a parser in a state akin to that * after the parser has parsed "<a href=\"", invoke: * <pre> * createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true)} * </pre> * * <p>You must provide the proper value of quoting or the parser * will go into an unexpected state. * As a special-case, when called with the {@code HtmlParser.ATTR_TYPE} * of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state * inside an HTML tag where it expects an attribute name not an attribute * value. It becomes equivalent to a parser initialized in the * {@code HTML_IN_TAG} mode. * * @param attrtype the attribute type which the parser should be in * @param quoted whether the attribute value is enclosed in double quotes * @param options additional options or {@code null} for none * @return an {@code HtmlParser} initialized in the given attribute type * and quoting * @throws AssertionError when this class failed to initialize */ public static HtmlParser createParserInAttribute( HtmlParser.ATTR_TYPE attrtype, boolean quoted, Set<AttributeOptions> options) { requireInitialized(); HtmlParser parser; switch (attrtype) { case REGULAR: parser = createParser( quoted ? parserInDefaultAttrQ : parserInDefaultAttr); break; case URI: if (options != null && options.contains(AttributeOptions.URL_PARTIAL)) parser = createParser( quoted ? parserInUriAttrQPartial : parserInUriAttrPartial); else parser = createParser( quoted ? parserInUriAttrQComplete : parserInUriAttrComplete); break; case JS: // Note: We currently do not support the case of the value being // inside a Javascript quoted string that is in an unquoted HTML // attribute, such as <a href=bla onmouseover=alert('[VALUE')>. // It would be simple to add but currently we assume Javascript // quoted attribute values are always HTML quoted. if (quoted) { if (options != null && options.contains(AttributeOptions.JS_QUOTED)) parser = createParser(parserInQJsAttr); else parser = createParser(parserInJsAttrQ); } else { parser = createParser(parserInJsAttr); } break; case STYLE: parser = createParser( quoted ? parserInStyleAttrQ : parserInStyleAttr); break; case NONE: parser = createParserInMode(HtmlParser.Mode.HTML_IN_TAG, null); break; default: throw new IllegalArgumentException( "Did not recognize ATTR_TYPE given: " + attrtype); } return parser; } /** * Initializes a set of static parsers to be subsequently used * by the various createParserXXX methods. * The parsers are set to their proper states by making them parse * an appropriate HTML input fragment. This approach is the most likely * to ensure all their internal state is consistent. * * <p>In the very unexpected case of the parsing failing (developer error), * this class will fail to initialize properly. * * <p>In addition: * <ul> * <li>The HTML tag is set to a fictitious name {@code xparsertag}. * <li>The attribute name is chosen to match the required attribute type. * When several possibilities exist, one is chosen arbitrarily. * <li>If quoting is required, a double quote is provided after the '='. * </ul> * * @throws ParseException if parsing failed. */ private static void initializeParsers() throws ParseException { parserInDefaultAttr.parse("<xparsertag htmlparser="); parserInDefaultAttrQ.parse("<xparsertag htmlparser=\""); // Chosing the "src" attribute, one of several possible names here parserInUriAttrComplete.parse("<xparsertag src="); parserInUriAttrQComplete.parse("<xparsertag src=\""); // To support a parser that is initialized within a URL parameter // rather than at the beginning of a URL. We use a fake domain // (example.com from RFC 2606 <http://www.rfc-editor.org/rfc/rfc2606.txt>) // and a fake query parameter. final String fakeUrlPrefix = "http://example.com/fakequeryparam="; parserInUriAttrPartial.parse("<xparsertag src=" + fakeUrlPrefix); parserInUriAttrQPartial.parse("<xparsertag src=\"" + fakeUrlPrefix); // Using onmouse= which is a fictitious attribute name that the parser // understands as being a valid javascript-enabled attribute. Chosing fake // names may help during debugging. parserInJsAttr.parse("<xparsertag onmouse="); parserInJsAttrQ.parse("<xparsertag onmouse=\""); // Single quote added as the Javascript is itself quoted. parserInQJsAttr.parse("<xparsertag onmouse=\"'"); // A parser in the Javascript context within a (single) quoted string. parserInJsQ.resetMode(HtmlParser.Mode.JS); parserInJsQ.parse("var fakeparservar='"); // Chosing the "style" attribute as it is the only option parserInStyleAttr.parse("<xparsertag style="); parserInStyleAttrQ.parse("<xparsertag style=\""); } /** * Throws an {@link AssertionError} if the class was not initialized * correctly, otherwise simply returns. This is to protect against the * possibility the needed parsers were not created successfully during * static initialized, which can only happen due to an error during * development of this library. * * @throws AssertionError when this class failed to initialize */ private static void requireInitialized() { if (!initSuccess) throw new AssertionError("HtmlParserFactory failed initialization."); } }