Home | History | Annotate | Download | only in tagsoup
      1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
      2 //
      3 // TagSoup is licensed under the Apache License,
      4 // Version 2.0.  You may obtain a copy of this license at
      5 // http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
      6 // additional legal rights not granted by this license.
      7 //
      8 // TagSoup is distributed in the hope that it will be useful, but
      9 // unless required by applicable law or agreed to in writing, TagSoup
     10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
     11 // OF ANY KIND, either express or implied; not even the implied warranty
     12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     13 //
     14 //
     15 package org.ccil.cowan.tagsoup;
     16 import java.io.*;
     17 import org.xml.sax.SAXException;
     18 import org.xml.sax.Locator;
     19 
     20 /**
     21 This class implements a table-driven scanner for HTML, allowing for lots of
     22 defects.  It implements the Scanner interface, which accepts a Reader
     23 object to fetch characters from and a ScanHandler object to report lexical
     24 events to.
     25 */
     26 
     27 public class HTMLScanner implements Scanner, Locator {
     28 
     29 	// Start of state table
     30 	@@STATE_TABLE@@
     31 	// End of state table
     32 
     33 	private String thePublicid;			// Locator state
     34 	private String theSystemid;
     35 	private int theLastLine;
     36 	private int theLastColumn;
     37 	private int theCurrentLine;
     38 	private int theCurrentColumn;
     39 
     40 	int theState;					// Current state
     41 	int theNextState;				// Next state
     42 	char[] theOutputBuffer = new char[200];	// Output buffer
     43 	int theSize;					// Current buffer size
     44 	int[] theWinMap = {				// Windows chars map
     45 		0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
     46 		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
     47 		0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
     48 		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};
     49 
     50 	/**
     51 	 * Index into the state table for [state][input character - 2].
     52 	 * The state table consists of 4-entry runs on the form
     53 	 * { current state, input character, action, next state }.
     54 	 * We precompute the index into the state table for all possible
     55 	 * { current state, input character } and store the result in
     56 	 * the statetableIndex array. Since only some input characters
     57 	 * are present in the state table, we only do the computation for
     58 	 * characters 0 to the highest character value in the state table.
     59 	 * An input character of -2 is used to cover all other characters
     60 	 * as -2 is guaranteed not to match any input character entry
     61 	 * in the state table.
     62 	 *
     63 	 * <p>When doing lookups, the input character should first be tested
     64 	 * to be in the range [-1 (inclusive), statetableIndexMaxChar (exclusive)].
     65 	 * if it isn't use -2 as the input character.
     66 	 *
     67 	 * <p>Finally, add 2 to the input character to cover for the fact that
     68 	 * Java doesn't support negative array indexes. Then look up
     69 	 * the value in the statetableIndex. If the value is -1, then
     70 	 * no action or next state was found for the { state, input } that
     71 	 * you had. If it isn't -1, then action = statetable[value + 2] and
     72 	 * next state = statetable[value + 3]. That is, the value points
     73 	 * to the start of the answer 4-tuple in the statetable.
     74 	 */
     75 	static short[][] statetableIndex;
     76 	/**
     77 	 * The highest character value seen in the statetable.
     78 	 * See the doc comment for statetableIndex to see how this
     79 	 * is used.
     80 	 */
     81 	static int statetableIndexMaxChar;
     82 	static {
     83 		int maxState = -1;
     84 		int maxChar = -1;
     85 		for (int i = 0; i < statetable.length; i += 4) {
     86 			if (statetable[i] > maxState) {
     87 				maxState = statetable[i];
     88 				}
     89 			if (statetable[i + 1] > maxChar) {
     90 				maxChar = statetable[i + 1];
     91 				}
     92 			}
     93 		statetableIndexMaxChar = maxChar + 1;
     94 
     95 		statetableIndex = new short[maxState + 1][maxChar + 3];
     96 		for (int theState = 0; theState <= maxState; ++theState) {
     97 			for (int ch = -2; ch <= maxChar; ++ch) {
     98 				int hit = -1;
     99 				int action = 0;
    100 				for (int i = 0; i < statetable.length; i += 4) {
    101 					if (theState != statetable[i]) {
    102 						if (action != 0) break;
    103 						continue;
    104 						}
    105 					if (statetable[i+1] == 0) {
    106 						hit = i;
    107 						action = statetable[i+2];
    108 						}
    109 					else if (statetable[i+1] == ch) {
    110 						hit = i;
    111 						action = statetable[i+2];
    112 						break;
    113 						}
    114 					}
    115 				statetableIndex[theState][ch + 2] = (short) hit;
    116 				}
    117 			}
    118 		}
    119 
    120 	// Compensate for bug in PushbackReader that allows
    121 	// pushing back EOF.
    122 	private void unread(PushbackReader r, int c) throws IOException {
    123 		if (c != -1) r.unread(c);
    124 		}
    125 
    126 	// Locator implementation
    127 
    128 	public int getLineNumber() {
    129 		return theLastLine;
    130 		}
    131 	public int getColumnNumber() {
    132 		return theLastColumn;
    133 		}
    134 	public String getPublicId() {
    135 		return thePublicid;
    136 		}
    137 	public String getSystemId() {
    138 		return theSystemid;
    139 		}
    140 
    141 
    142 	// Scanner implementation
    143 
    144 	/**
    145 	Reset document locator, supplying systemid and publicid.
    146 	@param systemid System id
    147 	@param publicid Public id
    148 	*/
    149 
    150 	public void resetDocumentLocator(String publicid, String systemid) {
    151 		thePublicid = publicid;
    152 		theSystemid = systemid;
    153 		theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
    154 		}
    155 
    156 	/**
    157 	Scan HTML source, reporting lexical events.
    158 	@param r0 Reader that provides characters
    159 	@param h ScanHandler that accepts lexical events.
    160 	*/
    161 
    162 	public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
    163 		theState = S_PCDATA;
    164 		PushbackReader r;
    165 		if (r0 instanceof BufferedReader) {
    166 			r = new PushbackReader(r0, 5);
    167 			}
    168 		else {
    169 			r = new PushbackReader(new BufferedReader(r0), 5);
    170 			}
    171 
    172 		int firstChar = r.read();	// Remove any leading BOM
    173 		if (firstChar != '\uFEFF') unread(r, firstChar);
    174 
    175 		while (theState != S_DONE) {
    176 			int ch = r.read();
    177 
    178 			// Process control characters
    179 			if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];
    180 
    181 			if (ch == '\r') {
    182 				ch = r.read();		// expect LF next
    183 				if (ch != '\n') {
    184 					unread(r, ch);	// nope
    185 					ch = '\n';
    186 					}
    187 				}
    188 
    189 			if (ch == '\n') {
    190 				theCurrentLine++;
    191 				theCurrentColumn = 0;
    192 				}
    193 			else {
    194 				theCurrentColumn++;
    195 				}
    196 
    197 			if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue;
    198 
    199 			// Search state table
    200 			int adjCh = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2;
    201 			int statetableRow = statetableIndex[theState][adjCh + 2];
    202 			int action = 0;
    203 			if (statetableRow != -1) {
    204 				action = statetable[statetableRow + 2];
    205 				theNextState = statetable[statetableRow + 3];
    206 				}
    207 
    208 //			System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
    209 			switch (action) {
    210 			case 0:
    211 				throw new Error(
    212 					"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
    213 					Integer.toString(theState));
    214 			case A_ADUP:
    215 				h.adup(theOutputBuffer, 0, theSize);
    216 				theSize = 0;
    217 				break;
    218 			case A_ADUP_SAVE:
    219 				h.adup(theOutputBuffer, 0, theSize);
    220 				theSize = 0;
    221 				save(ch, h);
    222 				break;
    223 			case A_ADUP_STAGC:
    224 				h.adup(theOutputBuffer, 0, theSize);
    225 				theSize = 0;
    226 				h.stagc(theOutputBuffer, 0, theSize);
    227 				break;
    228 			case A_ANAME:
    229 				h.aname(theOutputBuffer, 0, theSize);
    230 				theSize = 0;
    231 				break;
    232 			case A_ANAME_ADUP:
    233 				h.aname(theOutputBuffer, 0, theSize);
    234 				theSize = 0;
    235 				h.adup(theOutputBuffer, 0, theSize);
    236 				break;
    237 			case A_ANAME_ADUP_STAGC:
    238 				h.aname(theOutputBuffer, 0, theSize);
    239 				theSize = 0;
    240 				h.adup(theOutputBuffer, 0, theSize);
    241 				h.stagc(theOutputBuffer, 0, theSize);
    242 				break;
    243 			case A_AVAL:
    244 				h.aval(theOutputBuffer, 0, theSize);
    245 				theSize = 0;
    246 				break;
    247 			case A_AVAL_STAGC:
    248 				h.aval(theOutputBuffer, 0, theSize);
    249 				theSize = 0;
    250 				h.stagc(theOutputBuffer, 0, theSize);
    251 				break;
    252 			case A_CDATA:
    253 				mark();
    254 				// suppress the final "]]" in the buffer
    255 				if (theSize > 1) theSize -= 2;
    256 				h.pcdata(theOutputBuffer, 0, theSize);
    257 				theSize = 0;
    258 				break;
    259 			case A_ENTITY_START:
    260 				h.pcdata(theOutputBuffer, 0, theSize);
    261 				theSize = 0;
    262 				save(ch, h);
    263 				break;
    264 			case A_ENTITY:
    265 				mark();
    266 				char ch1 = (char)ch;
    267 //				System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
    268 				if (theState == S_ENT && ch1 == '#') {
    269 					theNextState = S_NCR;
    270 					save(ch, h);
    271 					break;
    272 					}
    273 				else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) {
    274 					theNextState = S_XNCR;
    275 					save(ch, h);
    276 					break;
    277 					}
    278 				else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
    279 					save(ch, h);
    280 					break;
    281 					}
    282 				else if (theState == S_NCR && Character.isDigit(ch1)) {
    283 					save(ch, h);
    284 					break;
    285 					}
    286 				else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
    287 					save(ch, h);
    288 					break;
    289 					}
    290 
    291 				// The whole entity reference has been collected
    292 //				System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
    293 				h.entity(theOutputBuffer, 1, theSize - 1);
    294 				int ent = h.getEntity();
    295 //				System.err.println("%% value = " + ent);
    296 				if (ent != 0) {
    297 					theSize = 0;
    298 					if (ent >= 0x80 && ent <= 0x9F) {
    299 						ent = theWinMap[ent-0x80];
    300 						}
    301 					if (ent < 0x20) {
    302 						// Control becomes space
    303 						ent = 0x20;
    304 						}
    305 					else if (ent >= 0xD800 && ent <= 0xDFFF) {
    306 						// Surrogates get dropped
    307 						ent = 0;
    308 						}
    309 					else if (ent <= 0xFFFF) {
    310 						// BMP character
    311 						save(ent, h);
    312 						}
    313 					else {
    314 						// Astral converted to two surrogates
    315 						ent -= 0x10000;
    316 						save((ent>>10) + 0xD800, h);
    317 						save((ent&0x3FF) + 0xDC00, h);
    318 						}
    319 					if (ch != ';') {
    320 						unread(r, ch);
    321 						theCurrentColumn--;
    322 						}
    323 					}
    324 				else {
    325 					unread(r, ch);
    326 					theCurrentColumn--;
    327 					}
    328 				theNextState = S_PCDATA;
    329 				break;
    330 			case A_ETAG:
    331 				h.etag(theOutputBuffer, 0, theSize);
    332 				theSize = 0;
    333 				break;
    334 			case A_DECL:
    335 				h.decl(theOutputBuffer, 0, theSize);
    336 				theSize = 0;
    337 				break;
    338 			case A_GI:
    339 				h.gi(theOutputBuffer, 0, theSize);
    340 				theSize = 0;
    341 				break;
    342 			case A_GI_STAGC:
    343 				h.gi(theOutputBuffer, 0, theSize);
    344 				theSize = 0;
    345 				h.stagc(theOutputBuffer, 0, theSize);
    346 				break;
    347 			case A_LT:
    348 				mark();
    349 				save('<', h);
    350 				save(ch, h);
    351 				break;
    352 			case A_LT_PCDATA:
    353 				mark();
    354 				save('<', h);
    355 				h.pcdata(theOutputBuffer, 0, theSize);
    356 				theSize = 0;
    357 				break;
    358 			case A_PCDATA:
    359 				mark();
    360 				h.pcdata(theOutputBuffer, 0, theSize);
    361 				theSize = 0;
    362 				break;
    363 			case A_CMNT:
    364 				mark();
    365 				h.cmnt(theOutputBuffer, 0, theSize);
    366 				theSize = 0;
    367 				break;
    368 			case A_MINUS3:
    369 				save('-', h);
    370 				save(' ', h);
    371 				break;
    372 			case A_MINUS2:
    373 				save('-', h);
    374 				save(' ', h);
    375 				// fall through into A_MINUS
    376 			case A_MINUS:
    377 				save('-', h);
    378 				save(ch, h);
    379 				break;
    380 			case A_PI:
    381 				mark();
    382 				h.pi(theOutputBuffer, 0, theSize);
    383 				theSize = 0;
    384 				break;
    385 			case A_PITARGET:
    386 				h.pitarget(theOutputBuffer, 0, theSize);
    387 				theSize = 0;
    388 				break;
    389 			case A_PITARGET_PI:
    390 				h.pitarget(theOutputBuffer, 0, theSize);
    391 				theSize = 0;
    392 				h.pi(theOutputBuffer, 0, theSize);
    393 				break;
    394 			case A_SAVE:
    395 				save(ch, h);
    396 				break;
    397 			case A_SKIP:
    398 				break;
    399 			case A_SP:
    400 				save(' ', h);
    401 				break;
    402 			case A_STAGC:
    403 				h.stagc(theOutputBuffer, 0, theSize);
    404 				theSize = 0;
    405 				break;
    406 			case A_EMPTYTAG:
    407 				mark();
    408 //				System.err.println("%%% Empty tag seen");
    409 				if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
    410 				theSize = 0;
    411 				h.stage(theOutputBuffer, 0, theSize);
    412 				break;
    413 			case A_UNGET:
    414 				unread(r, ch);
    415 				theCurrentColumn--;
    416 				break;
    417 			case A_UNSAVE_PCDATA:
    418 				if (theSize > 0) theSize--;
    419 				h.pcdata(theOutputBuffer, 0, theSize);
    420 				theSize = 0;
    421 				break;
    422 			default:
    423 				throw new Error("Can't process state " + action);
    424 				}
    425 			theState = theNextState;
    426 			}
    427 		h.eof(theOutputBuffer, 0, 0);
    428 		}
    429 
    430 	/**
    431 	* Mark the current scan position as a "point of interest" - start of a tag,
    432 	* cdata, processing instruction etc.
    433 	*/
    434 
    435 	private void mark() {
    436 		theLastColumn = theCurrentColumn;
    437 		theLastLine = theCurrentLine;
    438 		}
    439 
    440 	/**
    441 	A callback for the ScanHandler that allows it to force
    442 	the lexer state to CDATA content (no markup is recognized except
    443 	the end of element.
    444 	*/
    445 
    446 	public void startCDATA() { theNextState = S_CDATA; }
    447 
    448 	private void save(int ch, ScanHandler h) throws IOException, SAXException {
    449 		if (theSize >= theOutputBuffer.length - 20) {
    450 			if (theState == S_PCDATA || theState == S_CDATA) {
    451 				// Return a buffer-sized chunk of PCDATA
    452 				h.pcdata(theOutputBuffer, 0, theSize);
    453 				theSize = 0;
    454 				}
    455 			else {
    456 				// Grow the buffer size
    457 				char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
    458 				System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1);
    459 				theOutputBuffer = newOutputBuffer;
    460 				}
    461 			}
    462 		theOutputBuffer[theSize++] = (char)ch;
    463 		}
    464 
    465 	/**
    466 	Test procedure.  Reads HTML from the standard input and writes
    467 	PYX to the standard output.
    468 	*/
    469 
    470 	public static void main(String[] argv) throws IOException, SAXException {
    471 		Scanner s = new HTMLScanner();
    472 		Reader r = new InputStreamReader(System.in, "UTF-8");
    473 		Writer w = new OutputStreamWriter(System.out, "UTF-8");
    474 		PYXWriter pw = new PYXWriter(w);
    475 		s.scan(r, pw);
    476 		w.close();
    477 		}
    478 
    479 
    480 	private static String nicechar(int in) {
    481 		if (in == '\n') return "\\n";
    482 		if (in < 32) return "0x"+Integer.toHexString(in);
    483 		return "'"+((char)in)+"'";
    484 		}
    485 
    486 	}
    487