Home | History | Annotate | Download | only in tagsoup
      1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
      2 //
      3 // TagSoup is licensed under the Apache License,
      4 // Version 2.0.  You may obtain a copy of this license at
      5 // http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
      6 // additional legal rights not granted by this license.
      7 //
      8 // TagSoup is distributed in the hope that it will be useful, but
      9 // unless required by applicable law or agreed to in writing, TagSoup
     10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
     11 // OF ANY KIND, either express or implied; not even the implied warranty
     12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     13 //
     14 //
     15 package org.ccil.cowan.tagsoup;
     16 import java.io.*;
     17 import org.xml.sax.SAXException;
     18 import org.xml.sax.Locator;
     19 
     20 /**
     21 This class implements a table-driven scanner for HTML, allowing for lots of
     22 defects.  It implements the Scanner interface, which accepts a Reader
     23 object to fetch characters from and a ScanHandler object to report lexical
     24 events to.
     25 */
     26 
     27 public class HTMLScanner implements Scanner, Locator {
     28 
     29 	// Start of state table
     30 		private static final int S_ANAME = 1;
     31 	private static final int S_APOS = 2;
     32 	private static final int S_AVAL = 3;
     33 	private static final int S_BB = 4;
     34 	private static final int S_BBC = 5;
     35 	private static final int S_BBCD = 6;
     36 	private static final int S_BBCDA = 7;
     37 	private static final int S_BBCDAT = 8;
     38 	private static final int S_BBCDATA = 9;
     39 	private static final int S_CDATA = 10;
     40 	private static final int S_CDATA2 = 11;
     41 	private static final int S_CDSECT = 12;
     42 	private static final int S_CDSECT1 = 13;
     43 	private static final int S_CDSECT2 = 14;
     44 	private static final int S_COM = 15;
     45 	private static final int S_COM2 = 16;
     46 	private static final int S_COM3 = 17;
     47 	private static final int S_COM4 = 18;
     48 	private static final int S_DECL = 19;
     49 	private static final int S_DECL2 = 20;
     50 	private static final int S_DONE = 21;
     51 	private static final int S_EMPTYTAG = 22;
     52 	private static final int S_ENT = 23;
     53 	private static final int S_EQ = 24;
     54 	private static final int S_ETAG = 25;
     55 	private static final int S_GI = 26;
     56 	private static final int S_NCR = 27;
     57 	private static final int S_PCDATA = 28;
     58 	private static final int S_PI = 29;
     59 	private static final int S_PITARGET = 30;
     60 	private static final int S_QUOT = 31;
     61 	private static final int S_STAGC = 32;
     62 	private static final int S_TAG = 33;
     63 	private static final int S_TAGWS = 34;
     64 	private static final int S_XNCR = 35;
     65 	private static final int A_ADUP = 1;
     66 	private static final int A_ADUP_SAVE = 2;
     67 	private static final int A_ADUP_STAGC = 3;
     68 	private static final int A_ANAME = 4;
     69 	private static final int A_ANAME_ADUP = 5;
     70 	private static final int A_ANAME_ADUP_STAGC = 6;
     71 	private static final int A_AVAL = 7;
     72 	private static final int A_AVAL_STAGC = 8;
     73 	private static final int A_CDATA = 9;
     74 	private static final int A_CMNT = 10;
     75 	private static final int A_DECL = 11;
     76 	private static final int A_EMPTYTAG = 12;
     77 	private static final int A_ENTITY = 13;
     78 	private static final int A_ENTITY_START = 14;
     79 	private static final int A_ETAG = 15;
     80 	private static final int A_GI = 16;
     81 	private static final int A_GI_STAGC = 17;
     82 	private static final int A_LT = 18;
     83 	private static final int A_LT_PCDATA = 19;
     84 	private static final int A_MINUS = 20;
     85 	private static final int A_MINUS2 = 21;
     86 	private static final int A_MINUS3 = 22;
     87 	private static final int A_PCDATA = 23;
     88 	private static final int A_PI = 24;
     89 	private static final int A_PITARGET = 25;
     90 	private static final int A_PITARGET_PI = 26;
     91 	private static final int A_SAVE = 27;
     92 	private static final int A_SKIP = 28;
     93 	private static final int A_SP = 29;
     94 	private static final int A_STAGC = 30;
     95 	private static final int A_UNGET = 31;
     96 	private static final int A_UNSAVE_PCDATA = 32;
     97 	private static int[] statetable = {
     98 		S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
     99 		S_ANAME, '=', A_ANAME, S_AVAL,
    100 		S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
    101 		S_ANAME, 0, A_SAVE, S_ANAME,
    102 		S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
    103 		S_ANAME, ' ', A_ANAME, S_EQ,
    104 		S_ANAME, '\n', A_ANAME, S_EQ,
    105 		S_ANAME, '\t', A_ANAME, S_EQ,
    106 		S_APOS, '\'', A_AVAL, S_TAGWS,
    107 		S_APOS, 0, A_SAVE, S_APOS,
    108 		S_APOS, -1, A_AVAL_STAGC, S_DONE,
    109 		S_APOS, ' ', A_SP, S_APOS,
    110 		S_APOS, '\n', A_SP, S_APOS,
    111 		S_APOS, '\t', A_SP, S_APOS,
    112 		S_AVAL, '"', A_SKIP, S_QUOT,
    113 		S_AVAL, '\'', A_SKIP, S_APOS,
    114 		S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
    115 		S_AVAL, 0, A_SAVE, S_STAGC,
    116 		S_AVAL, -1, A_AVAL_STAGC, S_DONE,
    117 		S_AVAL, ' ', A_SKIP, S_AVAL,
    118 		S_AVAL, '\n', A_SKIP, S_AVAL,
    119 		S_AVAL, '\t', A_SKIP, S_AVAL,
    120 		S_BB, 'C', A_SKIP, S_BBC,
    121 		S_BB, 0, A_SKIP, S_DECL,
    122 		S_BB, -1, A_SKIP, S_DONE,
    123 		S_BBC, 'D', A_SKIP, S_BBCD,
    124 		S_BBC, 0, A_SKIP, S_DECL,
    125 		S_BBC, -1, A_SKIP, S_DONE,
    126 		S_BBCD, 'A', A_SKIP, S_BBCDA,
    127 		S_BBCD, 0, A_SKIP, S_DECL,
    128 		S_BBCD, -1, A_SKIP, S_DONE,
    129 		S_BBCDA, 'T', A_SKIP, S_BBCDAT,
    130 		S_BBCDA, 0, A_SKIP, S_DECL,
    131 		S_BBCDA, -1, A_SKIP, S_DONE,
    132 		S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
    133 		S_BBCDAT, 0, A_SKIP, S_DECL,
    134 		S_BBCDAT, -1, A_SKIP, S_DONE,
    135 		S_BBCDATA, '[', A_SKIP, S_CDSECT,
    136 		S_BBCDATA, 0, A_SKIP, S_DECL,
    137 		S_BBCDATA, -1, A_SKIP, S_DONE,
    138 		S_CDATA, '<', A_SAVE, S_CDATA2,
    139 		S_CDATA, 0, A_SAVE, S_CDATA,
    140 		S_CDATA, -1, A_PCDATA, S_DONE,
    141 		S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
    142 		S_CDATA2, 0, A_SAVE, S_CDATA,
    143 		S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
    144 		S_CDSECT, ']', A_SAVE, S_CDSECT1,
    145 		S_CDSECT, 0, A_SAVE, S_CDSECT,
    146 		S_CDSECT, -1, A_SKIP, S_DONE,
    147 		S_CDSECT1, ']', A_SAVE, S_CDSECT2,
    148 		S_CDSECT1, 0, A_SAVE, S_CDSECT,
    149 		S_CDSECT1, -1, A_SKIP, S_DONE,
    150 		S_CDSECT2, '>', A_CDATA, S_PCDATA,
    151 		S_CDSECT2, 0, A_SAVE, S_CDSECT,
    152 		S_CDSECT2, -1, A_SKIP, S_DONE,
    153 		S_COM, '-', A_SKIP, S_COM2,
    154 		S_COM, 0, A_SAVE, S_COM2,
    155 		S_COM, -1, A_CMNT, S_DONE,
    156 		S_COM2, '-', A_SKIP, S_COM3,
    157 		S_COM2, 0, A_SAVE, S_COM2,
    158 		S_COM2, -1, A_CMNT, S_DONE,
    159 		S_COM3, '-', A_SKIP, S_COM4,
    160 		S_COM3, 0, A_MINUS, S_COM2,
    161 		S_COM3, -1, A_CMNT, S_DONE,
    162 		S_COM4, '-', A_MINUS3, S_COM4,
    163 		S_COM4, '>', A_CMNT, S_PCDATA,
    164 		S_COM4, 0, A_MINUS2, S_COM2,
    165 		S_COM4, -1, A_CMNT, S_DONE,
    166 		S_DECL, '-', A_SKIP, S_COM,
    167 		S_DECL, '>', A_SKIP, S_PCDATA,
    168 		S_DECL, '[', A_SKIP, S_BB,
    169 		S_DECL, 0, A_SAVE, S_DECL2,
    170 		S_DECL, -1, A_SKIP, S_DONE,
    171 		S_DECL2, '>', A_DECL, S_PCDATA,
    172 		S_DECL2, 0, A_SAVE, S_DECL2,
    173 		S_DECL2, -1, A_SKIP, S_DONE,
    174 		S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
    175 		S_EMPTYTAG, 0, A_SAVE, S_ANAME,
    176 		S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
    177 		S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
    178 		S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
    179 		S_ENT, 0, A_ENTITY, S_ENT,
    180 		S_ENT, -1, A_ENTITY, S_DONE,
    181 		S_EQ, '=', A_SKIP, S_AVAL,
    182 		S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
    183 		S_EQ, 0, A_ADUP_SAVE, S_ANAME,
    184 		S_EQ, -1, A_ADUP_STAGC, S_DONE,
    185 		S_EQ, ' ', A_SKIP, S_EQ,
    186 		S_EQ, '\n', A_SKIP, S_EQ,
    187 		S_EQ, '\t', A_SKIP, S_EQ,
    188 		S_ETAG, '>', A_ETAG, S_PCDATA,
    189 		S_ETAG, 0, A_SAVE, S_ETAG,
    190 		S_ETAG, -1, A_ETAG, S_DONE,
    191 		S_ETAG, ' ', A_SKIP, S_ETAG,
    192 		S_ETAG, '\n', A_SKIP, S_ETAG,
    193 		S_ETAG, '\t', A_SKIP, S_ETAG,
    194 		S_GI, '/', A_SKIP, S_EMPTYTAG,
    195 		S_GI, '>', A_GI_STAGC, S_PCDATA,
    196 		S_GI, 0, A_SAVE, S_GI,
    197 		S_GI, -1, A_SKIP, S_DONE,
    198 		S_GI, ' ', A_GI, S_TAGWS,
    199 		S_GI, '\n', A_GI, S_TAGWS,
    200 		S_GI, '\t', A_GI, S_TAGWS,
    201 		S_NCR, 0, A_ENTITY, S_NCR,
    202 		S_NCR, -1, A_ENTITY, S_DONE,
    203 		S_PCDATA, '&', A_ENTITY_START, S_ENT,
    204 		S_PCDATA, '<', A_PCDATA, S_TAG,
    205 		S_PCDATA, 0, A_SAVE, S_PCDATA,
    206 		S_PCDATA, -1, A_PCDATA, S_DONE,
    207 		S_PI, '>', A_PI, S_PCDATA,
    208 		S_PI, 0, A_SAVE, S_PI,
    209 		S_PI, -1, A_PI, S_DONE,
    210 		S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
    211 		S_PITARGET, 0, A_SAVE, S_PITARGET,
    212 		S_PITARGET, -1, A_PITARGET_PI, S_DONE,
    213 		S_PITARGET, ' ', A_PITARGET, S_PI,
    214 		S_PITARGET, '\n', A_PITARGET, S_PI,
    215 		S_PITARGET, '\t', A_PITARGET, S_PI,
    216 		S_QUOT, '"', A_AVAL, S_TAGWS,
    217 		S_QUOT, 0, A_SAVE, S_QUOT,
    218 		S_QUOT, -1, A_AVAL_STAGC, S_DONE,
    219 		S_QUOT, ' ', A_SP, S_QUOT,
    220 		S_QUOT, '\n', A_SP, S_QUOT,
    221 		S_QUOT, '\t', A_SP, S_QUOT,
    222 		S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
    223 		S_STAGC, 0, A_SAVE, S_STAGC,
    224 		S_STAGC, -1, A_AVAL_STAGC, S_DONE,
    225 		S_STAGC, ' ', A_AVAL, S_TAGWS,
    226 		S_STAGC, '\n', A_AVAL, S_TAGWS,
    227 		S_STAGC, '\t', A_AVAL, S_TAGWS,
    228 		S_TAG, '!', A_SKIP, S_DECL,
    229 		S_TAG, '/', A_SKIP, S_ETAG,
    230 		S_TAG, '<', A_SAVE, S_TAG,
    231 		S_TAG, '?', A_SKIP, S_PITARGET,
    232 		S_TAG, 0, A_SAVE, S_GI,
    233 		S_TAG, -1, A_LT_PCDATA, S_DONE,
    234 		S_TAG, ' ', A_LT, S_PCDATA,
    235 		S_TAG, '\n', A_LT, S_PCDATA,
    236 		S_TAG, '\t', A_LT, S_PCDATA,
    237 		S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
    238 		S_TAGWS, '>', A_STAGC, S_PCDATA,
    239 		S_TAGWS, 0, A_SAVE, S_ANAME,
    240 		S_TAGWS, -1, A_STAGC, S_DONE,
    241 		S_TAGWS, ' ', A_SKIP, S_TAGWS,
    242 		S_TAGWS, '\n', A_SKIP, S_TAGWS,
    243 		S_TAGWS, '\t', A_SKIP, S_TAGWS,
    244 		S_XNCR, 0, A_ENTITY, S_XNCR,
    245 		S_XNCR, -1, A_ENTITY, S_DONE,
    246 
    247 	};
    248 	private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
    249 	private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};
    250 
    251 
    252 	// End of state table
    253 
    254 	private String thePublicid;			// Locator state
    255 	private String theSystemid;
    256 	private int theLastLine;
    257 	private int theLastColumn;
    258 	private int theCurrentLine;
    259 	private int theCurrentColumn;
    260 
    261 	int theState;					// Current state
    262 	int theNextState;				// Next state
    263 	char[] theOutputBuffer = new char[200];	// Output buffer
    264 	int theSize;					// Current buffer size
    265 	int[] theWinMap = {				// Windows chars map
    266 		0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    267 		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
    268 		0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    269 		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};
    270 
    271 	/**
    272 	 * Index into the state table for [state][input character - 2].
    273 	 * The state table consists of 4-entry runs on the form
    274 	 * { current state, input character, action, next state }.
    275 	 * We precompute the index into the state table for all possible
    276 	 * { current state, input character } and store the result in
    277 	 * the statetableIndex array. Since only some input characters
    278 	 * are present in the state table, we only do the computation for
    279 	 * characters 0 to the highest character value in the state table.
    280 	 * An input character of -2 is used to cover all other characters
    281 	 * as -2 is guaranteed not to match any input character entry
    282 	 * in the state table.
    283 	 *
    284 	 * <p>When doing lookups, the input character should first be tested
    285 	 * to be in the range [-1 (inclusive), statetableIndexMaxChar (exclusive)].
    286 	 * if it isn't use -2 as the input character.
    287 	 *
    288 	 * <p>Finally, add 2 to the input character to cover for the fact that
    289 	 * Java doesn't support negative array indexes. Then look up
    290 	 * the value in the statetableIndex. If the value is -1, then
    291 	 * no action or next state was found for the { state, input } that
    292 	 * you had. If it isn't -1, then action = statetable[value + 2] and
    293 	 * next state = statetable[value + 3]. That is, the value points
    294 	 * to the start of the answer 4-tuple in the statetable.
    295 	 */
    296 	static short[][] statetableIndex;
    297 	/**
    298 	 * The highest character value seen in the statetable.
    299 	 * See the doc comment for statetableIndex to see how this
    300 	 * is used.
    301 	 */
    302 	static int statetableIndexMaxChar;
    303 	static {
    304 		int maxState = -1;
    305 		int maxChar = -1;
    306 		for (int i = 0; i < statetable.length; i += 4) {
    307 			if (statetable[i] > maxState) {
    308 				maxState = statetable[i];
    309 				}
    310 			if (statetable[i + 1] > maxChar) {
    311 				maxChar = statetable[i + 1];
    312 				}
    313 			}
    314 		statetableIndexMaxChar = maxChar + 1;
    315 
    316 		statetableIndex = new short[maxState + 1][maxChar + 3];
    317 		for (int theState = 0; theState <= maxState; ++theState) {
    318 			for (int ch = -2; ch <= maxChar; ++ch) {
    319 				int hit = -1;
    320 				int action = 0;
    321 				for (int i = 0; i < statetable.length; i += 4) {
    322 					if (theState != statetable[i]) {
    323 						if (action != 0) break;
    324 						continue;
    325 						}
    326 					if (statetable[i+1] == 0) {
    327 						hit = i;
    328 						action = statetable[i+2];
    329 						}
    330 					else if (statetable[i+1] == ch) {
    331 						hit = i;
    332 						action = statetable[i+2];
    333 						break;
    334 						}
    335 					}
    336 				statetableIndex[theState][ch + 2] = (short) hit;
    337 				}
    338 			}
    339 		}
    340 
    341 	// Compensate for bug in PushbackReader that allows
    342 	// pushing back EOF.
    343 	private void unread(PushbackReader r, int c) throws IOException {
    344 		if (c != -1) r.unread(c);
    345 		}
    346 
    347 	// Locator implementation
    348 
    349 	public int getLineNumber() {
    350 		return theLastLine;
    351 		}
    352 	public int getColumnNumber() {
    353 		return theLastColumn;
    354 		}
    355 	public String getPublicId() {
    356 		return thePublicid;
    357 		}
    358 	public String getSystemId() {
    359 		return theSystemid;
    360 		}
    361 
    362 
    363 	// Scanner implementation
    364 
    365 	/**
    366 	Reset document locator, supplying systemid and publicid.
    367 	@param systemid System id
    368 	@param publicid Public id
    369 	*/
    370 
    371 	public void resetDocumentLocator(String publicid, String systemid) {
    372 		thePublicid = publicid;
    373 		theSystemid = systemid;
    374 		theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
    375 		}
    376 
    377 	/**
    378 	Scan HTML source, reporting lexical events.
    379 	@param r0 Reader that provides characters
    380 	@param h ScanHandler that accepts lexical events.
    381 	*/
    382 
    383 	public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
    384 		theState = S_PCDATA;
    385 		PushbackReader r;
    386 		if (r0 instanceof BufferedReader) {
    387 			r = new PushbackReader(r0, 5);
    388 			}
    389 		else {
    390 			r = new PushbackReader(new BufferedReader(r0), 5);
    391 			}
    392 
    393 		int firstChar = r.read();	// Remove any leading BOM
    394 		if (firstChar != '\uFEFF') unread(r, firstChar);
    395 
    396 		while (theState != S_DONE) {
    397 			int ch = r.read();
    398 
    399 			// Process control characters
    400 			if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];
    401 
    402 			if (ch == '\r') {
    403 				ch = r.read();		// expect LF next
    404 				if (ch != '\n') {
    405 					unread(r, ch);	// nope
    406 					ch = '\n';
    407 					}
    408 				}
    409 
    410 			if (ch == '\n') {
    411 				theCurrentLine++;
    412 				theCurrentColumn = 0;
    413 				}
    414 			else {
    415 				theCurrentColumn++;
    416 				}
    417 
    418 			if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue;
    419 
    420 			// Search state table
    421 			int adjCh = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2;
    422 			int statetableRow = statetableIndex[theState][adjCh + 2];
    423 			int action = 0;
    424 			if (statetableRow != -1) {
    425 				action = statetable[statetableRow + 2];
    426 				theNextState = statetable[statetableRow + 3];
    427 				}
    428 
    429 //			System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
    430 			switch (action) {
    431 			case 0:
    432 				throw new Error(
    433 					"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
    434 					Integer.toString(theState));
    435 			case A_ADUP:
    436 				h.adup(theOutputBuffer, 0, theSize);
    437 				theSize = 0;
    438 				break;
    439 			case A_ADUP_SAVE:
    440 				h.adup(theOutputBuffer, 0, theSize);
    441 				theSize = 0;
    442 				save(ch, h);
    443 				break;
    444 			case A_ADUP_STAGC:
    445 				h.adup(theOutputBuffer, 0, theSize);
    446 				theSize = 0;
    447 				h.stagc(theOutputBuffer, 0, theSize);
    448 				break;
    449 			case A_ANAME:
    450 				h.aname(theOutputBuffer, 0, theSize);
    451 				theSize = 0;
    452 				break;
    453 			case A_ANAME_ADUP:
    454 				h.aname(theOutputBuffer, 0, theSize);
    455 				theSize = 0;
    456 				h.adup(theOutputBuffer, 0, theSize);
    457 				break;
    458 			case A_ANAME_ADUP_STAGC:
    459 				h.aname(theOutputBuffer, 0, theSize);
    460 				theSize = 0;
    461 				h.adup(theOutputBuffer, 0, theSize);
    462 				h.stagc(theOutputBuffer, 0, theSize);
    463 				break;
    464 			case A_AVAL:
    465 				h.aval(theOutputBuffer, 0, theSize);
    466 				theSize = 0;
    467 				break;
    468 			case A_AVAL_STAGC:
    469 				h.aval(theOutputBuffer, 0, theSize);
    470 				theSize = 0;
    471 				h.stagc(theOutputBuffer, 0, theSize);
    472 				break;
    473 			case A_CDATA:
    474 				mark();
    475 				// suppress the final "]]" in the buffer
    476 				if (theSize > 1) theSize -= 2;
    477 				h.pcdata(theOutputBuffer, 0, theSize);
    478 				theSize = 0;
    479 				break;
    480 			case A_ENTITY_START:
    481 				h.pcdata(theOutputBuffer, 0, theSize);
    482 				theSize = 0;
    483 				save(ch, h);
    484 				break;
    485 			case A_ENTITY:
    486 				mark();
    487 				char ch1 = (char)ch;
    488 //				System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
    489 				if (theState == S_ENT && ch1 == '#') {
    490 					theNextState = S_NCR;
    491 					save(ch, h);
    492 					break;
    493 					}
    494 				else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) {
    495 					theNextState = S_XNCR;
    496 					save(ch, h);
    497 					break;
    498 					}
    499 				else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
    500 					save(ch, h);
    501 					break;
    502 					}
    503 				else if (theState == S_NCR && Character.isDigit(ch1)) {
    504 					save(ch, h);
    505 					break;
    506 					}
    507 				else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
    508 					save(ch, h);
    509 					break;
    510 					}
    511 
    512 				// The whole entity reference has been collected
    513 //				System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
    514 				h.entity(theOutputBuffer, 1, theSize - 1);
    515 				int ent = h.getEntity();
    516 //				System.err.println("%% value = " + ent);
    517 				if (ent != 0) {
    518 					theSize = 0;
    519 					if (ent >= 0x80 && ent <= 0x9F) {
    520 						ent = theWinMap[ent-0x80];
    521 						}
    522 					if (ent < 0x20) {
    523 						// Control becomes space
    524 						ent = 0x20;
    525 						}
    526 					else if (ent >= 0xD800 && ent <= 0xDFFF) {
    527 						// Surrogates get dropped
    528 						ent = 0;
    529 						}
    530 					else if (ent <= 0xFFFF) {
    531 						// BMP character
    532 						save(ent, h);
    533 						}
    534 					else {
    535 						// Astral converted to two surrogates
    536 						ent -= 0x10000;
    537 						save((ent>>10) + 0xD800, h);
    538 						save((ent&0x3FF) + 0xDC00, h);
    539 						}
    540 					if (ch != ';') {
    541 						unread(r, ch);
    542 						theCurrentColumn--;
    543 						}
    544 					}
    545 				else {
    546 					unread(r, ch);
    547 					theCurrentColumn--;
    548 					}
    549 				theNextState = S_PCDATA;
    550 				break;
    551 			case A_ETAG:
    552 				h.etag(theOutputBuffer, 0, theSize);
    553 				theSize = 0;
    554 				break;
    555 			case A_DECL:
    556 				h.decl(theOutputBuffer, 0, theSize);
    557 				theSize = 0;
    558 				break;
    559 			case A_GI:
    560 				h.gi(theOutputBuffer, 0, theSize);
    561 				theSize = 0;
    562 				break;
    563 			case A_GI_STAGC:
    564 				h.gi(theOutputBuffer, 0, theSize);
    565 				theSize = 0;
    566 				h.stagc(theOutputBuffer, 0, theSize);
    567 				break;
    568 			case A_LT:
    569 				mark();
    570 				save('<', h);
    571 				save(ch, h);
    572 				break;
    573 			case A_LT_PCDATA:
    574 				mark();
    575 				save('<', h);
    576 				h.pcdata(theOutputBuffer, 0, theSize);
    577 				theSize = 0;
    578 				break;
    579 			case A_PCDATA:
    580 				mark();
    581 				h.pcdata(theOutputBuffer, 0, theSize);
    582 				theSize = 0;
    583 				break;
    584 			case A_CMNT:
    585 				mark();
    586 				h.cmnt(theOutputBuffer, 0, theSize);
    587 				theSize = 0;
    588 				break;
    589 			case A_MINUS3:
    590 				save('-', h);
    591 				save(' ', h);
    592 				break;
    593 			case A_MINUS2:
    594 				save('-', h);
    595 				save(' ', h);
    596 				// fall through into A_MINUS
    597 			case A_MINUS:
    598 				save('-', h);
    599 				save(ch, h);
    600 				break;
    601 			case A_PI:
    602 				mark();
    603 				h.pi(theOutputBuffer, 0, theSize);
    604 				theSize = 0;
    605 				break;
    606 			case A_PITARGET:
    607 				h.pitarget(theOutputBuffer, 0, theSize);
    608 				theSize = 0;
    609 				break;
    610 			case A_PITARGET_PI:
    611 				h.pitarget(theOutputBuffer, 0, theSize);
    612 				theSize = 0;
    613 				h.pi(theOutputBuffer, 0, theSize);
    614 				break;
    615 			case A_SAVE:
    616 				save(ch, h);
    617 				break;
    618 			case A_SKIP:
    619 				break;
    620 			case A_SP:
    621 				save(' ', h);
    622 				break;
    623 			case A_STAGC:
    624 				h.stagc(theOutputBuffer, 0, theSize);
    625 				theSize = 0;
    626 				break;
    627 			case A_EMPTYTAG:
    628 				mark();
    629 //				System.err.println("%%% Empty tag seen");
    630 				if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
    631 				theSize = 0;
    632 				h.stage(theOutputBuffer, 0, theSize);
    633 				break;
    634 			case A_UNGET:
    635 				unread(r, ch);
    636 				theCurrentColumn--;
    637 				break;
    638 			case A_UNSAVE_PCDATA:
    639 				if (theSize > 0) theSize--;
    640 				h.pcdata(theOutputBuffer, 0, theSize);
    641 				theSize = 0;
    642 				break;
    643 			default:
    644 				throw new Error("Can't process state " + action);
    645 				}
    646 			theState = theNextState;
    647 			}
    648 		h.eof(theOutputBuffer, 0, 0);
    649 		}
    650 
    651 	/**
    652 	* Mark the current scan position as a "point of interest" - start of a tag,
    653 	* cdata, processing instruction etc.
    654 	*/
    655 
    656 	private void mark() {
    657 		theLastColumn = theCurrentColumn;
    658 		theLastLine = theCurrentLine;
    659 		}
    660 
    661 	/**
    662 	A callback for the ScanHandler that allows it to force
    663 	the lexer state to CDATA content (no markup is recognized except
    664 	the end of element.
    665 	*/
    666 
    667 	public void startCDATA() { theNextState = S_CDATA; }
    668 
    669 	private void save(int ch, ScanHandler h) throws IOException, SAXException {
    670 		if (theSize >= theOutputBuffer.length - 20) {
    671 			if (theState == S_PCDATA || theState == S_CDATA) {
    672 				// Return a buffer-sized chunk of PCDATA
    673 				h.pcdata(theOutputBuffer, 0, theSize);
    674 				theSize = 0;
    675 				}
    676 			else {
    677 				// Grow the buffer size
    678 				char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
    679 				System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1);
    680 				theOutputBuffer = newOutputBuffer;
    681 				}
    682 			}
    683 		theOutputBuffer[theSize++] = (char)ch;
    684 		}
    685 
    686 	/**
    687 	Test procedure.  Reads HTML from the standard input and writes
    688 	PYX to the standard output.
    689 	*/
    690 
    691 	public static void main(String[] argv) throws IOException, SAXException {
    692 		Scanner s = new HTMLScanner();
    693 		Reader r = new InputStreamReader(System.in, "UTF-8");
    694 		Writer w = new OutputStreamWriter(System.out, "UTF-8");
    695 		PYXWriter pw = new PYXWriter(w);
    696 		s.scan(r, pw);
    697 		w.close();
    698 		}
    699 
    700 
    701 	private static String nicechar(int in) {
    702 		if (in == '\n') return "\\n";
    703 		if (in < 32) return "0x"+Integer.toHexString(in);
    704 		return "'"+((char)in)+"'";
    705 		}
    706 
    707 	}
    708