/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.streamhtmlparser;

import com.google.streamhtmlparser.impl.HtmlParserImpl;

import java.util.Set;
import java.util.logging.Logger;

/**
 * A factory class to obtain instances of an {@link HtmlParser}.
 * Currently each instance is a new object given these are fairly
 * light-weight.
 *
 * <p>In the unlikely case that this class fails to initialize properly
 * (a developer error), an error is emitted to the error console and the logs
 * and the specialized parser creation methods will throw
 * an {@link AssertionError} on all invocations. The original
 * {@link ParseException} is attached as the cause of that error.
 */
public final class HtmlParserFactory {

  private static final Logger logger =
      Logger.getLogger(HtmlParserFactory.class.getName());

  /**
   * To provide additional options when creating an {@code HtmlParser} using
   * {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE,
   * boolean, Set)}
   */
  public enum AttributeOptions {

    /**
     * Indicates that the attribute value is Javascript-quoted. Only takes
     * effect for Javascript-accepting attributes - as identified by
     * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also
     * HTML quoted.
     */
    JS_QUOTED,

    /**
     * Indicates the attribute value is only a part of a URL as opposed to a
     * full URL. In particular, the value is not at the start of a URL and
     * hence does not necessitate validation of the URL scheme.
     * Only valid for URI-accepting attributes - as identified by
     * {@link HtmlParser.ATTR_TYPE#URI}.
     */
    URL_PARTIAL,
  }

  /**
   * To provide additional options when creating an {@code HtmlParser} using
   * {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)}
   */
  public enum ModeOptions {

    /**
     * Indicates that the parser is inside a quoted {@code String}. Only
     * valid in the {@link HtmlParser.Mode#JS} mode.
     */
    JS_QUOTED
  }

  // Template parsers. Each one is driven into its target state once, by
  // initializeParsers(), and is only ever copied afterwards.
  private static final HtmlParser parserInDefaultAttr = createParser();
  private static final HtmlParser parserInDefaultAttrQ = createParser();
  private static final HtmlParser parserInUriAttrComplete = createParser();
  private static final HtmlParser parserInUriAttrQComplete = createParser();
  private static final HtmlParser parserInUriAttrPartial = createParser();
  private static final HtmlParser parserInUriAttrQPartial = createParser();
  private static final HtmlParser parserInJsAttr = createParser();
  private static final HtmlParser parserInJsAttrQ = createParser();
  private static final HtmlParser parserInQJsAttr = createParser();
  private static final HtmlParser parserInStyleAttr = createParser();
  private static final HtmlParser parserInStyleAttrQ = createParser();
  private static final HtmlParser parserInJsQ = createParser();

  /**
   * The exception that made static initialization fail, or {@code null} when
   * the template parsers were set up successfully. Protects the specialized
   * createParserXXX methods, which throw a run-time error when this is set.
   */
  private static final ParseException initFailure;

  static {
    ParseException failure = null;
    try {
      initializeParsers();
    } catch (ParseException e) {
      failure = e;
      // Log a severe error and print it to stderr along with a stack trace.
      String error = HtmlParserFactory.class.getName() +
          " Failed initialization: " + e.getMessage();
      logger.severe(error);
      System.err.println(error);
      e.printStackTrace();
    }
    initFailure = failure;
  }

  // Static class.
  private HtmlParserFactory() {
  }  // COV_NF_LINE

  /**
   * Returns a new {@code HtmlParser} object ready to parse HTML input.
   *
   * <p>This method deliberately does not check whether the class initialized
   * properly: it is used to build the template parsers while the class is
   * still being initialized.
   *
   * @return a newly constructed {@code HtmlParser}
   */
  public static HtmlParser createParser() {
    return new HtmlParserImpl();
  }

  /**
   * Returns an {@code HtmlParser} object initialized with the
   * requested Mode. Provide non {@code null} options to provide
   * a more precise initialization with the desired Mode.
   *
   * <p>When {@code options} contains {@link ModeOptions#JS_QUOTED}, the
   * returned parser is a copy of a Javascript-mode parser that is inside a
   * single-quoted string and the {@code mode} argument is not consulted.
   * That option should therefore only be combined with
   * {@link HtmlParser.Mode#JS}.
   *
   * @param mode the mode to reset the parser with
   * @param options additional options or {@code null} for none
   * @return an {@code HtmlParser} in the provided mode
   * @throws AssertionError when this class failed to initialize
   */
  public static HtmlParser createParserInMode(HtmlParser.Mode mode,
                                              Set<ModeOptions> options) {
    requireInitialized();

    if (options != null && options.contains(ModeOptions.JS_QUOTED)) {
      return createParser(parserInJsQ);
    }

    // With no options given, this method is just a convenience wrapper for
    // the two calls below.
    HtmlParser parser = new HtmlParserImpl();
    parser.resetMode(mode);
    return parser;
  }

  /**
   * Returns an {@code HtmlParser} that is a copy of the one
   * supplied. It holds the same internal state and hence can
   * proceed with parsing in-lieu of the supplied parser.
   *
   * @param aHtmlParser a {@code HtmlParser} to copy from
   * @return an {@code HtmlParser} that is a copy of the provided one
   * @throws AssertionError when this class failed to initialize
   * @throws ClassCastException if {@code aHtmlParser} is not an
   *         {@code HtmlParserImpl}
   */
  public static HtmlParser createParser(HtmlParser aHtmlParser) {
    requireInitialized();

    // Should never get a ClassCastException since there is only one
    // implementation of the HtmlParser interface.
    return new HtmlParserImpl((HtmlParserImpl) aHtmlParser);
  }

  /**
   * A very specialized {@code HtmlParser} accessor that returns a parser
   * in a state where it expects to read the value of an attribute
   * of an HTML tag. This is only useful when the parser has not seen a
   * certain HTML tag and an attribute name and needs to continue parsing
   * from a state as though it has.
   *
   * <p>For example, to create a parser in a state akin to that
   * after the parser has parsed {@code <a href="}, invoke:
   * <pre>
   *   createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true, null)
   * </pre>
   *
   * <p>You must provide the proper value of quoting or the parser
   * will go into an unexpected state.
   * As a special-case, when called with the {@code HtmlParser.ATTR_TYPE}
   * of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state
   * inside an HTML tag where it expects an attribute name not an attribute
   * value. It becomes equivalent to a parser initialized in the
   * {@code HTML_IN_TAG} mode; {@code quoted} and {@code options} are
   * ignored in that case.
   *
   * <p>{@link AttributeOptions#URL_PARTIAL} is only consulted for
   * {@code ATTR_TYPE.URI}. {@link AttributeOptions#JS_QUOTED} is only
   * consulted for {@code ATTR_TYPE.JS} and only when {@code quoted} is
   * {@code true}.
   *
   * @param attrtype the attribute type which the parser should be in
   * @param quoted whether the attribute value is enclosed in double quotes
   * @param options additional options or {@code null} for none
   * @return an {@code HtmlParser} initialized in the given attribute type
   *         and quoting
   * @throws AssertionError when this class failed to initialize
   * @throws IllegalArgumentException if {@code attrtype} is not one of the
   *         attribute types handled by this method
   */
  public static HtmlParser createParserInAttribute(
      HtmlParser.ATTR_TYPE attrtype,
      boolean quoted, Set<AttributeOptions> options) {
    requireInitialized();

    switch (attrtype) {
      case REGULAR:
        return createParser(
            quoted ? parserInDefaultAttrQ : parserInDefaultAttr);
      case URI:
        if (options != null && options.contains(AttributeOptions.URL_PARTIAL)) {
          return createParser(
              quoted ? parserInUriAttrQPartial : parserInUriAttrPartial);
        }
        return createParser(
            quoted ? parserInUriAttrQComplete : parserInUriAttrComplete);
      case JS:
        // Note: We currently do not support the case of the value being
        // inside a Javascript quoted string that is in an unquoted HTML
        // attribute, such as <a href=bla onmouseover=alert('[VALUE')>.
        // It would be simple to add but currently we assume Javascript
        // quoted attribute values are always HTML quoted.
        if (!quoted) {
          return createParser(parserInJsAttr);
        }
        if (options != null && options.contains(AttributeOptions.JS_QUOTED)) {
          return createParser(parserInQJsAttr);
        }
        return createParser(parserInJsAttrQ);
      case STYLE:
        return createParser(
            quoted ? parserInStyleAttrQ : parserInStyleAttr);
      case NONE:
        return createParserInMode(HtmlParser.Mode.HTML_IN_TAG, null);
      default:
        throw new IllegalArgumentException(
            "Did not recognize ATTR_TYPE given: " + attrtype);
    }
  }

  /**
   * Initializes a set of static parsers to be subsequently used
   * by the various createParserXXX methods.
   * The parsers are set to their proper states by making them parse
   * an appropriate HTML input fragment. This approach is the most likely
   * to ensure all their internal state is consistent.
   *
   * <p>In the very unexpected case of the parsing failing (developer error),
   * this class will fail to initialize properly.
   *
   * <p>In addition:
   * <ul>
   * <li>The HTML tag is set to a fictitious name {@code xparsertag}.
   * <li>The attribute name is chosen to match the required attribute type.
   *     When several possibilities exist, one is chosen arbitrarily.
   * <li>If quoting is required, a double quote is provided after the '='.
   * </ul>
   *
   * @throws ParseException if parsing failed.
   */
  private static void initializeParsers() throws ParseException {
    parserInDefaultAttr.parse("<xparsertag htmlparser=");
    parserInDefaultAttrQ.parse("<xparsertag htmlparser=\"");

    // Choosing the "src" attribute, one of several possible names here
    parserInUriAttrComplete.parse("<xparsertag src=");
    parserInUriAttrQComplete.parse("<xparsertag src=\"");

    // To support a parser that is initialized within a URL parameter
    // rather than at the beginning of a URL. We use a fake domain
    // (example.com from RFC 2606 <http://www.rfc-editor.org/rfc/rfc2606.txt>)
    // and a fake query parameter.
    final String fakeUrlPrefix = "http://example.com/fakequeryparam=";
    parserInUriAttrPartial.parse("<xparsertag src=" + fakeUrlPrefix);
    parserInUriAttrQPartial.parse("<xparsertag src=\"" + fakeUrlPrefix);

    // Using onmouse= which is a fictitious attribute name that the parser
    // understands as being a valid javascript-enabled attribute. Choosing
    // fake names may help during debugging.
    parserInJsAttr.parse("<xparsertag onmouse=");
    parserInJsAttrQ.parse("<xparsertag onmouse=\"");
    // Single quote added as the Javascript is itself quoted.
    parserInQJsAttr.parse("<xparsertag onmouse=\"'");

    // A parser in the Javascript context within a (single) quoted string.
    parserInJsQ.resetMode(HtmlParser.Mode.JS);
    parserInJsQ.parse("var fakeparservar='");

    // Choosing the "style" attribute as it is the only option
    parserInStyleAttr.parse("<xparsertag style=");
    parserInStyleAttrQ.parse("<xparsertag style=\"");
  }

  /**
   * Throws an {@link AssertionError} if the class was not initialized
   * correctly, otherwise simply returns. This is to protect against the
   * possibility the needed parsers were not created successfully during
   * static initialization, which can only happen due to an error during
   * development of this library.
   *
   * <p>The {@link ParseException} caught during static initialization is
   * attached as the cause of the thrown error.
   *
   * @throws AssertionError when this class failed to initialize
   */
  private static void requireInitialized() {
    if (initFailure != null) {
      AssertionError error =
          new AssertionError("HtmlParserFactory failed initialization.");
      // initCause() rather than the (String, Throwable) constructor so the
      // code does not depend on a constructor added in later Java releases.
      error.initCause(initFailure);
      throw error;
    }
  }
}