1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. 2 // 3 // TagSoup is licensed under the Apache License, 4 // Version 2.0. You may obtain a copy of this license at 5 // http://www.apache.org/licenses/LICENSE-2.0 . You may also have 6 // additional legal rights not granted by this license. 7 // 8 // TagSoup is distributed in the hope that it will be useful, but 9 // unless required by applicable law or agreed to in writing, TagSoup 10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 11 // OF ANY KIND, either express or implied; not even the implied warranty 12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 // 14 // 15 package org.ccil.cowan.tagsoup; 16 import java.io.*; 17 import org.xml.sax.SAXException; 18 import org.xml.sax.Locator; 19 20 /** 21 This class implements a table-driven scanner for HTML, allowing for lots of 22 defects. It implements the Scanner interface, which accepts a Reader 23 object to fetch characters from and a ScanHandler object to report lexical 24 events to. 25 */ 26 27 public class HTMLScanner implements Scanner, Locator { 28 29 // Start of state table 30 @@STATE_TABLE@@ 31 // End of state table 32 33 private String thePublicid; // Locator state 34 private String theSystemid; 35 private int theLastLine; 36 private int theLastColumn; 37 private int theCurrentLine; 38 private int theCurrentColumn; 39 40 int theState; // Current state 41 int theNextState; // Next state 42 char[] theOutputBuffer = new char[200]; // Output buffer 43 int theSize; // Current buffer size 44 int[] theWinMap = { // Windows chars map 45 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 46 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, 47 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 48 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178}; 49 50 /** 51 * Index into the state table for [state][input character - 2]. 52 * The state table consists of 4-entry runs on the form 53 * { current state, input character, action, next state }. 54 * We precompute the index into the state table for all possible 55 * { current state, input character } and store the result in 56 * the statetableIndex array. Since only some input characters 57 * are present in the state table, we only do the computation for 58 * characters 0 to the highest character value in the state table. 59 * An input character of -2 is used to cover all other characters 60 * as -2 is guaranteed not to match any input character entry 61 * in the state table. 62 * 63 * <p>When doing lookups, the input character should first be tested 64 * to be in the range [-1 (inclusive), statetableIndexMaxChar (exclusive)]. 65 * if it isn't use -2 as the input character. 66 * 67 * <p>Finally, add 2 to the input character to cover for the fact that 68 * Java doesn't support negative array indexes. Then look up 69 * the value in the statetableIndex. If the value is -1, then 70 * no action or next state was found for the { state, input } that 71 * you had. If it isn't -1, then action = statetable[value + 2] and 72 * next state = statetable[value + 3]. That is, the value points 73 * to the start of the answer 4-tuple in the statetable. 74 */ 75 static short[][] statetableIndex; 76 /** 77 * The highest character value seen in the statetable. 78 * See the doc comment for statetableIndex to see how this 79 * is used. 80 */ 81 static int statetableIndexMaxChar; 82 static { 83 int maxState = -1; 84 int maxChar = -1; 85 for (int i = 0; i < statetable.length; i += 4) { 86 if (statetable[i] > maxState) { 87 maxState = statetable[i]; 88 } 89 if (statetable[i + 1] > maxChar) { 90 maxChar = statetable[i + 1]; 91 } 92 } 93 statetableIndexMaxChar = maxChar + 1; 94 95 statetableIndex = new short[maxState + 1][maxChar + 3]; 96 for (int theState = 0; theState <= maxState; ++theState) { 97 for (int ch = -2; ch <= maxChar; ++ch) { 98 int hit = -1; 99 int action = 0; 100 for (int i = 0; i < statetable.length; i += 4) { 101 if (theState != statetable[i]) { 102 if (action != 0) break; 103 continue; 104 } 105 if (statetable[i+1] == 0) { 106 hit = i; 107 action = statetable[i+2]; 108 } 109 else if (statetable[i+1] == ch) { 110 hit = i; 111 action = statetable[i+2]; 112 break; 113 } 114 } 115 statetableIndex[theState][ch + 2] = (short) hit; 116 } 117 } 118 } 119 120 // Compensate for bug in PushbackReader that allows 121 // pushing back EOF. 122 private void unread(PushbackReader r, int c) throws IOException { 123 if (c != -1) r.unread(c); 124 } 125 126 // Locator implementation 127 128 public int getLineNumber() { 129 return theLastLine; 130 } 131 public int getColumnNumber() { 132 return theLastColumn; 133 } 134 public String getPublicId() { 135 return thePublicid; 136 } 137 public String getSystemId() { 138 return theSystemid; 139 } 140 141 142 // Scanner implementation 143 144 /** 145 Reset document locator, supplying systemid and publicid. 146 @param systemid System id 147 @param publicid Public id 148 */ 149 150 public void resetDocumentLocator(String publicid, String systemid) { 151 thePublicid = publicid; 152 theSystemid = systemid; 153 theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0; 154 } 155 156 /** 157 Scan HTML source, reporting lexical events. 158 @param r0 Reader that provides characters 159 @param h ScanHandler that accepts lexical events. 160 */ 161 162 public void scan(Reader r0, ScanHandler h) throws IOException, SAXException { 163 theState = S_PCDATA; 164 PushbackReader r; 165 if (r0 instanceof BufferedReader) { 166 r = new PushbackReader(r0, 5); 167 } 168 else { 169 r = new PushbackReader(new BufferedReader(r0), 5); 170 } 171 172 int firstChar = r.read(); // Remove any leading BOM 173 if (firstChar != '\uFEFF') unread(r, firstChar); 174 175 while (theState != S_DONE) { 176 int ch = r.read(); 177 178 // Process control characters 179 if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80]; 180 181 if (ch == '\r') { 182 ch = r.read(); // expect LF next 183 if (ch != '\n') { 184 unread(r, ch); // nope 185 ch = '\n'; 186 } 187 } 188 189 if (ch == '\n') { 190 theCurrentLine++; 191 theCurrentColumn = 0; 192 } 193 else { 194 theCurrentColumn++; 195 } 196 197 if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue; 198 199 // Search state table 200 int adjCh = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2; 201 int statetableRow = statetableIndex[theState][adjCh + 2]; 202 int action = 0; 203 if (statetableRow != -1) { 204 action = statetable[statetableRow + 2]; 205 theNextState = statetable[statetableRow + 3]; 206 } 207 208 // System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]); 209 switch (action) { 210 case 0: 211 throw new Error( 212 "HTMLScanner can't cope with " + Integer.toString(ch) + " in state " + 213 Integer.toString(theState)); 214 case A_ADUP: 215 h.adup(theOutputBuffer, 0, theSize); 216 theSize = 0; 217 break; 218 case A_ADUP_SAVE: 219 h.adup(theOutputBuffer, 0, theSize); 220 theSize = 0; 221 save(ch, h); 222 break; 223 case A_ADUP_STAGC: 224 h.adup(theOutputBuffer, 0, theSize); 225 theSize = 0; 226 h.stagc(theOutputBuffer, 0, theSize); 227 break; 228 case A_ANAME: 229 h.aname(theOutputBuffer, 0, theSize); 230 theSize = 0; 231 break; 232 case A_ANAME_ADUP: 233 h.aname(theOutputBuffer, 0, theSize); 234 theSize = 0; 235 h.adup(theOutputBuffer, 0, theSize); 236 break; 237 case A_ANAME_ADUP_STAGC: 238 h.aname(theOutputBuffer, 0, theSize); 239 theSize = 0; 240 h.adup(theOutputBuffer, 0, theSize); 241 h.stagc(theOutputBuffer, 0, theSize); 242 break; 243 case A_AVAL: 244 h.aval(theOutputBuffer, 0, theSize); 245 theSize = 0; 246 break; 247 case A_AVAL_STAGC: 248 h.aval(theOutputBuffer, 0, theSize); 249 theSize = 0; 250 h.stagc(theOutputBuffer, 0, theSize); 251 break; 252 case A_CDATA: 253 mark(); 254 // suppress the final "]]" in the buffer 255 if (theSize > 1) theSize -= 2; 256 h.pcdata(theOutputBuffer, 0, theSize); 257 theSize = 0; 258 break; 259 case A_ENTITY_START: 260 h.pcdata(theOutputBuffer, 0, theSize); 261 theSize = 0; 262 save(ch, h); 263 break; 264 case A_ENTITY: 265 mark(); 266 char ch1 = (char)ch; 267 // System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK"))); 268 if (theState == S_ENT && ch1 == '#') { 269 theNextState = S_NCR; 270 save(ch, h); 271 break; 272 } 273 else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) { 274 theNextState = S_XNCR; 275 save(ch, h); 276 break; 277 } 278 else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) { 279 save(ch, h); 280 break; 281 } 282 else if (theState == S_NCR && Character.isDigit(ch1)) { 283 save(ch, h); 284 break; 285 } 286 else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) { 287 save(ch, h); 288 break; 289 } 290 291 // The whole entity reference has been collected 292 // System.err.println("%%" + new String(theOutputBuffer, 0, theSize)); 293 h.entity(theOutputBuffer, 1, theSize - 1); 294 int ent = h.getEntity(); 295 // System.err.println("%% value = " + ent); 296 if (ent != 0) { 297 theSize = 0; 298 if (ent >= 0x80 && ent <= 0x9F) { 299 ent = theWinMap[ent-0x80]; 300 } 301 if (ent < 0x20) { 302 // Control becomes space 303 ent = 0x20; 304 } 305 else if (ent >= 0xD800 && ent <= 0xDFFF) { 306 // Surrogates get dropped 307 ent = 0; 308 } 309 else if (ent <= 0xFFFF) { 310 // BMP character 311 save(ent, h); 312 } 313 else { 314 // Astral converted to two surrogates 315 ent -= 0x10000; 316 save((ent>>10) + 0xD800, h); 317 save((ent&0x3FF) + 0xDC00, h); 318 } 319 if (ch != ';') { 320 unread(r, ch); 321 theCurrentColumn--; 322 } 323 } 324 else { 325 unread(r, ch); 326 theCurrentColumn--; 327 } 328 theNextState = S_PCDATA; 329 break; 330 case A_ETAG: 331 h.etag(theOutputBuffer, 0, theSize); 332 theSize = 0; 333 break; 334 case A_DECL: 335 h.decl(theOutputBuffer, 0, theSize); 336 theSize = 0; 337 break; 338 case A_GI: 339 h.gi(theOutputBuffer, 0, theSize); 340 theSize = 0; 341 break; 342 case A_GI_STAGC: 343 h.gi(theOutputBuffer, 0, theSize); 344 theSize = 0; 345 h.stagc(theOutputBuffer, 0, theSize); 346 break; 347 case A_LT: 348 mark(); 349 save('<', h); 350 save(ch, h); 351 break; 352 case A_LT_PCDATA: 353 mark(); 354 save('<', h); 355 h.pcdata(theOutputBuffer, 0, theSize); 356 theSize = 0; 357 break; 358 case A_PCDATA: 359 mark(); 360 h.pcdata(theOutputBuffer, 0, theSize); 361 theSize = 0; 362 break; 363 case A_CMNT: 364 mark(); 365 h.cmnt(theOutputBuffer, 0, theSize); 366 theSize = 0; 367 break; 368 case A_MINUS3: 369 save('-', h); 370 save(' ', h); 371 break; 372 case A_MINUS2: 373 save('-', h); 374 save(' ', h); 375 // fall through into A_MINUS 376 case A_MINUS: 377 save('-', h); 378 save(ch, h); 379 break; 380 case A_PI: 381 mark(); 382 h.pi(theOutputBuffer, 0, theSize); 383 theSize = 0; 384 break; 385 case A_PITARGET: 386 h.pitarget(theOutputBuffer, 0, theSize); 387 theSize = 0; 388 break; 389 case A_PITARGET_PI: 390 h.pitarget(theOutputBuffer, 0, theSize); 391 theSize = 0; 392 h.pi(theOutputBuffer, 0, theSize); 393 break; 394 case A_SAVE: 395 save(ch, h); 396 break; 397 case A_SKIP: 398 break; 399 case A_SP: 400 save(' ', h); 401 break; 402 case A_STAGC: 403 h.stagc(theOutputBuffer, 0, theSize); 404 theSize = 0; 405 break; 406 case A_EMPTYTAG: 407 mark(); 408 // System.err.println("%%% Empty tag seen"); 409 if (theSize > 0) h.gi(theOutputBuffer, 0, theSize); 410 theSize = 0; 411 h.stage(theOutputBuffer, 0, theSize); 412 break; 413 case A_UNGET: 414 unread(r, ch); 415 theCurrentColumn--; 416 break; 417 case A_UNSAVE_PCDATA: 418 if (theSize > 0) theSize--; 419 h.pcdata(theOutputBuffer, 0, theSize); 420 theSize = 0; 421 break; 422 default: 423 throw new Error("Can't process state " + action); 424 } 425 theState = theNextState; 426 } 427 h.eof(theOutputBuffer, 0, 0); 428 } 429 430 /** 431 * Mark the current scan position as a "point of interest" - start of a tag, 432 * cdata, processing instruction etc. 433 */ 434 435 private void mark() { 436 theLastColumn = theCurrentColumn; 437 theLastLine = theCurrentLine; 438 } 439 440 /** 441 A callback for the ScanHandler that allows it to force 442 the lexer state to CDATA content (no markup is recognized except 443 the end of element. 444 */ 445 446 public void startCDATA() { theNextState = S_CDATA; } 447 448 private void save(int ch, ScanHandler h) throws IOException, SAXException { 449 if (theSize >= theOutputBuffer.length - 20) { 450 if (theState == S_PCDATA || theState == S_CDATA) { 451 // Return a buffer-sized chunk of PCDATA 452 h.pcdata(theOutputBuffer, 0, theSize); 453 theSize = 0; 454 } 455 else { 456 // Grow the buffer size 457 char[] newOutputBuffer = new char[theOutputBuffer.length * 2]; 458 System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1); 459 theOutputBuffer = newOutputBuffer; 460 } 461 } 462 theOutputBuffer[theSize++] = (char)ch; 463 } 464 465 /** 466 Test procedure. Reads HTML from the standard input and writes 467 PYX to the standard output. 468 */ 469 470 public static void main(String[] argv) throws IOException, SAXException { 471 Scanner s = new HTMLScanner(); 472 Reader r = new InputStreamReader(System.in, "UTF-8"); 473 Writer w = new OutputStreamWriter(System.out, "UTF-8"); 474 PYXWriter pw = new PYXWriter(w); 475 s.scan(r, pw); 476 w.close(); 477 } 478 479 480 private static String nicechar(int in) { 481 if (in == '\n') return "\\n"; 482 if (in < 32) return "0x"+Integer.toHexString(in); 483 return "'"+((char)in)+"'"; 484 } 485 486 } 487