1 package org.unicode.cldr.util; 2 3 import java.io.IOException; 4 import java.io.Reader; 5 6 /** 7 * Extremely simple class for parsing HTML. Extremely lenient. Call next() until 8 * DONE is returned. 9 * <p> 10 * Element content will be returned in the following sequence: 11 * 12 * <pre> 13 * ELEMENT_START 14 * ELEMENT strong 15 * ELEMENT_END 16 * ELEMENT_CONTENT Alphabetic code 17 * ELEMENT_START 18 * ELEMENT_POP 19 * ELEMENT strong 20 * ELEMENT_END 21 * </pre> 22 * 23 * while attributes will be returned as: 24 * 25 * <pre> 26 * ELEMENT_START 27 * ELEMENT div 28 * ATTRIBUTE id 29 * ATTRIBUTE_CONTENT mainContent 30 * ELEMENT_END 31 * </pre> 32 * 33 * 34 * @author markdavis 35 * 36 */ 37 public class SimpleHtmlParser { 38 public enum Type { 39 DONE, 40 /** 41 * No contents, set when we hit < 42 */ 43 ELEMENT_START, 44 /** 45 * '<' contents/b 46 */ 47 ELEMENT, 48 /** 49 * '<element/bcontents(=...) 50 */ 51 ATTRIBUTE, 52 /** 53 * attribute=['"]contents['"] 54 */ 55 ATTRIBUTE_CONTENT, 56 /** 57 * No contents, set when we hit '>' 58 */ 59 ELEMENT_END, 60 /** 61 * No contents, set when we hit '/' after '<' 62 */ 63 ELEMENT_POP, 64 /** 65 * '<!--' contents '-->' 66 */ 67 QUOTE, 68 /** 69 * '<element>' contents '</element>' 70 */ 71 ELEMENT_CONTENT 72 }; 73 74 private enum State { 75 BASE, IN_ELEMENT, AFTER_ELEMENT, IN_CONTENT, IN_ATTRIBUTE, IN_ATTRIBUTE_CONTENT, IN_ATTRIBUTE_CONTENT1, IN_ATTRIBUTE_CONTENT2, ELEMENT_STOP, IN_QUOTE 76 }; 77 78 private Reader input; 79 80 private State state; 81 82 private Type bufferedReturn; 83 84 private int lineCount; 85 86 public SimpleHtmlParser setReader(Reader input) { 87 this.input = input; 88 state = State.IN_CONTENT; 89 bufferedReturn = null; 90 lineCount = 0; 91 return this; 92 } 93 94 public int getLineCount() { 95 return lineCount; 96 } 97 98 public Type next(StringBuilder result) throws IOException { 99 result.setLength(0); 100 if (bufferedReturn != null) { 101 if (bufferedReturn == Type.DONE) { // once DONE, stay DONE 102 return Type.DONE; 103 } 104 Type temp = bufferedReturn; 105 bufferedReturn = null; 106 return temp; 107 } 108 while (true) { 109 char ch; 110 { 111 int chi = input.read(); 112 if (chi < 0) { 113 bufferedReturn = Type.DONE; 114 chi = 0; 115 } 116 ch = (char) chi; 117 if (ch == '\n') { 118 ++lineCount; 119 } 120 } 121 122 switch (state) { 123 case BASE: 124 if (ch == 0xFEFF) 125 break; 126 // fall through! 127 128 case IN_CONTENT: 129 if (ch == '<') { 130 state = State.IN_ELEMENT; 131 bufferedReturn = Type.ELEMENT_START; 132 return Type.ELEMENT_CONTENT; 133 } 134 if (ch == 0) { 135 return Type.ELEMENT_CONTENT; 136 } 137 result.append(ch); 138 break; 139 140 case IN_ELEMENT: 141 if (ch <= ' ') { 142 if (equals(result, "!--")) { 143 state = State.IN_QUOTE; 144 result.setLength(0); 145 break; 146 } 147 state = State.AFTER_ELEMENT; 148 return Type.ELEMENT; 149 } 150 if (ch == '>') { 151 state = State.IN_CONTENT; 152 bufferedReturn = Type.ELEMENT_END; 153 return Type.ELEMENT; 154 } 155 if (ch == '/') { 156 return Type.ELEMENT_POP; 157 } 158 result.append(ch); 159 break; 160 161 case AFTER_ELEMENT: 162 if (ch <= ' ') 163 break; 164 if (ch == '>') { 165 state = State.IN_CONTENT; 166 return Type.ELEMENT_END; 167 } 168 result.append(ch); 169 state = State.IN_ATTRIBUTE; 170 break; 171 172 case IN_ATTRIBUTE: 173 if (ch <= ' ') { 174 state = State.AFTER_ELEMENT; 175 return Type.ATTRIBUTE; 176 } 177 if (ch == '>') { 178 state = State.IN_CONTENT; 179 bufferedReturn = Type.ELEMENT_END; 180 return Type.ATTRIBUTE; 181 } 182 if (ch == '=') { 183 state = State.IN_ATTRIBUTE_CONTENT; 184 return Type.ATTRIBUTE; 185 } 186 result.append(ch); 187 break; 188 189 case IN_ATTRIBUTE_CONTENT: 190 if (ch <= ' ') { 191 break; 192 } 193 if (ch == '>') { 194 state = State.IN_CONTENT; 195 bufferedReturn = Type.ELEMENT_END; 196 return Type.ATTRIBUTE_CONTENT; 197 } 198 if (ch == '\'') { 199 state = State.IN_ATTRIBUTE_CONTENT1; 200 break; 201 } 202 if (ch == '"') { 203 state = State.IN_ATTRIBUTE_CONTENT2; 204 break; 205 } 206 result.append(ch); 207 break; 208 209 case IN_ATTRIBUTE_CONTENT1: 210 if (ch == 0 || ch == '\'') { 211 state = State.AFTER_ELEMENT; 212 return Type.ATTRIBUTE_CONTENT; 213 } 214 result.append(ch); 215 break; 216 217 case IN_ATTRIBUTE_CONTENT2: 218 if (ch == 0 || ch == '"') { 219 state = State.AFTER_ELEMENT; 220 return Type.ATTRIBUTE_CONTENT; 221 } 222 result.append(ch); 223 break; 224 225 case IN_QUOTE: 226 if (ch == 0) { 227 state = State.IN_CONTENT; 228 return Type.QUOTE; 229 } 230 if (ch == '>' && endsWith(result, "--")) { 231 result.setLength(result.length() - 2); 232 state = State.IN_CONTENT; 233 return Type.QUOTE; 234 } 235 result.append(ch); 236 break; 237 default: 238 } 239 } 240 } 241 242 public static final boolean endsWith(CharSequence a, CharSequence b) { 243 int aStart = a.length() - b.length(); 244 if (aStart < 0) { 245 return false; 246 } 247 return regionEquals(a, aStart, b, 0, b.length()); 248 } 249 250 public static final boolean equals(CharSequence a, CharSequence b) { 251 int len = a.length(); 252 if (len != b.length()) { 253 return false; 254 } 255 return regionEquals(a, 0, b, 0, len); 256 } 257 258 public static boolean regionEquals(CharSequence a, int i, CharSequence b, int j, int len) { 259 for (; --len >= 0; ++i, ++j) { 260 if (a.charAt(i) != b.charAt(j)) { 261 return false; 262 } 263 } 264 return true; 265 } 266 267 public static void writeResult(Type type, StringBuilder result, Appendable writer) throws IOException { 268 switch (type) { 269 case ELEMENT: 270 writer.append(result); 271 break; 272 case ELEMENT_START: 273 writer.append('<'); 274 break; 275 case ELEMENT_END: 276 writer.append('>'); 277 break; 278 case ATTRIBUTE: 279 writer.append(' ').append(result); 280 break; 281 case ATTRIBUTE_CONTENT: 282 writer.append("=\"").append(result).append('"'); 283 break; 284 case ELEMENT_CONTENT: 285 writer.append(result); 286 break; 287 case ELEMENT_POP: 288 writer.append('/'); 289 break; 290 case QUOTE: 291 writer.append(result); 292 break; 293 case DONE: 294 break; 295 default: 296 throw new IllegalArgumentException("Missing case: " + type); 297 } 298 } 299 } 300