Home | History | Annotate | Download | only in tagsoup
      1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
      2 //
      3 // TagSoup is licensed under the Apache License,
      4 // Version 2.0.  You may obtain a copy of this license at
      5 // http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
      6 // additional legal rights not granted by this license.
      7 //
      8 // TagSoup is distributed in the hope that it will be useful, but
      9 // unless required by applicable law or agreed to in writing, TagSoup
     10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
     11 // OF ANY KIND, either express or implied; not even the implied warranty
     12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     13 //
     14 //
     15 package org.ccil.cowan.tagsoup;
     16 import java.io.*;
     17 import org.xml.sax.SAXException;
     18 import org.xml.sax.Locator;
     19 
     20 /**
     21 This class implements a table-driven scanner for HTML, allowing for lots of
     22 defects.  It implements the Scanner interface, which accepts a Reader
     23 object to fetch characters from and a ScanHandler object to report lexical
     24 events to.
     25 */
     26 
     27 public class HTMLScanner implements Scanner, Locator {
     28 
     29 	// Start of state table
     30 		private static final int S_ANAME = 1;
     31 	private static final int S_APOS = 2;
     32 	private static final int S_AVAL = 3;
     33 	private static final int S_BB = 4;
     34 	private static final int S_BBC = 5;
     35 	private static final int S_BBCD = 6;
     36 	private static final int S_BBCDA = 7;
     37 	private static final int S_BBCDAT = 8;
     38 	private static final int S_BBCDATA = 9;
     39 	private static final int S_CDATA = 10;
     40 	private static final int S_CDATA2 = 11;
     41 	private static final int S_CDSECT = 12;
     42 	private static final int S_CDSECT1 = 13;
     43 	private static final int S_CDSECT2 = 14;
     44 	private static final int S_COM = 15;
     45 	private static final int S_COM2 = 16;
     46 	private static final int S_COM3 = 17;
     47 	private static final int S_COM4 = 18;
     48 	private static final int S_DECL = 19;
     49 	private static final int S_DECL2 = 20;
     50 	private static final int S_DONE = 21;
     51 	private static final int S_EMPTYTAG = 22;
     52 	private static final int S_ENT = 23;
     53 	private static final int S_EQ = 24;
     54 	private static final int S_ETAG = 25;
     55 	private static final int S_GI = 26;
     56 	private static final int S_NCR = 27;
     57 	private static final int S_PCDATA = 28;
     58 	private static final int S_PI = 29;
     59 	private static final int S_PITARGET = 30;
     60 	private static final int S_QUOT = 31;
     61 	private static final int S_STAGC = 32;
     62 	private static final int S_TAG = 33;
     63 	private static final int S_TAGWS = 34;
     64 	private static final int S_XNCR = 35;
     65 	private static final int A_ADUP = 1;
     66 	private static final int A_ADUP_SAVE = 2;
     67 	private static final int A_ADUP_STAGC = 3;
     68 	private static final int A_ANAME = 4;
     69 	private static final int A_ANAME_ADUP = 5;
     70 	private static final int A_ANAME_ADUP_STAGC = 6;
     71 	private static final int A_AVAL = 7;
     72 	private static final int A_AVAL_STAGC = 8;
     73 	private static final int A_CDATA = 9;
     74 	private static final int A_CMNT = 10;
     75 	private static final int A_DECL = 11;
     76 	private static final int A_EMPTYTAG = 12;
     77 	private static final int A_ENTITY = 13;
     78 	private static final int A_ENTITY_START = 14;
     79 	private static final int A_ETAG = 15;
     80 	private static final int A_GI = 16;
     81 	private static final int A_GI_STAGC = 17;
     82 	private static final int A_LT = 18;
     83 	private static final int A_LT_PCDATA = 19;
     84 	private static final int A_MINUS = 20;
     85 	private static final int A_MINUS2 = 21;
     86 	private static final int A_MINUS3 = 22;
     87 	private static final int A_PCDATA = 23;
     88 	private static final int A_PI = 24;
     89 	private static final int A_PITARGET = 25;
     90 	private static final int A_PITARGET_PI = 26;
     91 	private static final int A_SAVE = 27;
     92 	private static final int A_SKIP = 28;
     93 	private static final int A_SP = 29;
     94 	private static final int A_STAGC = 30;
     95 	private static final int A_UNGET = 31;
     96 	private static final int A_UNSAVE_PCDATA = 32;
     97 	private static int[] statetable = {
     98 		S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
     99 		S_ANAME, '=', A_ANAME, S_AVAL,
    100 		S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
    101 		S_ANAME, 0, A_SAVE, S_ANAME,
    102 		S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
    103 		S_ANAME, ' ', A_ANAME, S_EQ,
    104 		S_ANAME, '\n', A_ANAME, S_EQ,
    105 		S_ANAME, '\t', A_ANAME, S_EQ,
    106 		S_APOS, '\'', A_AVAL, S_TAGWS,
    107 		S_APOS, 0, A_SAVE, S_APOS,
    108 		S_APOS, -1, A_AVAL_STAGC, S_DONE,
    109 		S_APOS, ' ', A_SP, S_APOS,
    110 		S_APOS, '\n', A_SP, S_APOS,
    111 		S_APOS, '\t', A_SP, S_APOS,
    112 		S_AVAL, '\'', A_SKIP, S_APOS,
    113 		S_AVAL, '"', A_SKIP, S_QUOT,
    114 		S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
    115 		S_AVAL, 0, A_SAVE, S_STAGC,
    116 		S_AVAL, -1, A_AVAL_STAGC, S_DONE,
    117 		S_AVAL, ' ', A_SKIP, S_AVAL,
    118 		S_AVAL, '\n', A_SKIP, S_AVAL,
    119 		S_AVAL, '\t', A_SKIP, S_AVAL,
    120 		S_BB, 'C', A_SKIP, S_BBC,
    121 		S_BB, 0, A_SKIP, S_DECL,
    122 		S_BB, -1, A_SKIP, S_DONE,
    123 		S_BBC, 'D', A_SKIP, S_BBCD,
    124 		S_BBC, 0, A_SKIP, S_DECL,
    125 		S_BBC, -1, A_SKIP, S_DONE,
    126 		S_BBCD, 'A', A_SKIP, S_BBCDA,
    127 		S_BBCD, 0, A_SKIP, S_DECL,
    128 		S_BBCD, -1, A_SKIP, S_DONE,
    129 		S_BBCDA, 'T', A_SKIP, S_BBCDAT,
    130 		S_BBCDA, 0, A_SKIP, S_DECL,
    131 		S_BBCDA, -1, A_SKIP, S_DONE,
    132 		S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
    133 		S_BBCDAT, 0, A_SKIP, S_DECL,
    134 		S_BBCDAT, -1, A_SKIP, S_DONE,
    135 		S_BBCDATA, '[', A_SKIP, S_CDSECT,
    136 		S_BBCDATA, 0, A_SKIP, S_DECL,
    137 		S_BBCDATA, -1, A_SKIP, S_DONE,
    138 		S_CDATA, '<', A_SAVE, S_CDATA2,
    139 		S_CDATA, 0, A_SAVE, S_CDATA,
    140 		S_CDATA, -1, A_PCDATA, S_DONE,
    141 		S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
    142 		S_CDATA2, 0, A_SAVE, S_CDATA,
    143 		S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
    144 		S_CDSECT, ']', A_SAVE, S_CDSECT1,
    145 		S_CDSECT, 0, A_SAVE, S_CDSECT,
    146 		S_CDSECT, -1, A_SKIP, S_DONE,
    147 		S_CDSECT1, ']', A_SAVE, S_CDSECT2,
    148 		S_CDSECT1, 0, A_SAVE, S_CDSECT,
    149 		S_CDSECT1, -1, A_SKIP, S_DONE,
    150 		S_CDSECT2, '>', A_CDATA, S_PCDATA,
    151 		S_CDSECT2, 0, A_SAVE, S_CDSECT,
    152 		S_CDSECT2, -1, A_SKIP, S_DONE,
    153 		S_COM, '-', A_SKIP, S_COM2,
    154 		S_COM, 0, A_SAVE, S_COM2,
    155 		S_COM, -1, A_CMNT, S_DONE,
    156 		S_COM2, '-', A_SKIP, S_COM3,
    157 		S_COM2, 0, A_SAVE, S_COM2,
    158 		S_COM2, -1, A_CMNT, S_DONE,
    159 		S_COM3, '-', A_SKIP, S_COM4,
    160 		S_COM3, 0, A_MINUS, S_COM2,
    161 		S_COM3, -1, A_CMNT, S_DONE,
    162 		S_COM4, '-', A_MINUS3, S_COM4,
    163 		S_COM4, '>', A_CMNT, S_PCDATA,
    164 		S_COM4, 0, A_MINUS2, S_COM2,
    165 		S_COM4, -1, A_CMNT, S_DONE,
    166 		S_DECL, '-', A_SKIP, S_COM,
    167 		S_DECL, '[', A_SKIP, S_BB,
    168 		S_DECL, '>', A_SKIP, S_PCDATA,
    169 		S_DECL, 0, A_SAVE, S_DECL2,
    170 		S_DECL, -1, A_SKIP, S_DONE,
    171 		S_DECL2, '>', A_DECL, S_PCDATA,
    172 		S_DECL2, 0, A_SAVE, S_DECL2,
    173 		S_DECL2, -1, A_SKIP, S_DONE,
    174 		S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
    175 		S_EMPTYTAG, 0, A_SAVE, S_ANAME,
    176 		S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
    177 		S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
    178 		S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
    179 		S_ENT, 0, A_ENTITY, S_ENT,
    180 		S_ENT, -1, A_ENTITY, S_DONE,
    181 		S_EQ, '=', A_SKIP, S_AVAL,
    182 		S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
    183 		S_EQ, 0, A_ADUP_SAVE, S_ANAME,
    184 		S_EQ, -1, A_ADUP_STAGC, S_DONE,
    185 		S_EQ, ' ', A_SKIP, S_EQ,
    186 		S_EQ, '\n', A_SKIP, S_EQ,
    187 		S_EQ, '\t', A_SKIP, S_EQ,
    188 		S_ETAG, '>', A_ETAG, S_PCDATA,
    189 		S_ETAG, 0, A_SAVE, S_ETAG,
    190 		S_ETAG, -1, A_ETAG, S_DONE,
    191 		S_ETAG, ' ', A_SKIP, S_ETAG,
    192 		S_ETAG, '\n', A_SKIP, S_ETAG,
    193 		S_ETAG, '\t', A_SKIP, S_ETAG,
    194 		S_GI, '/', A_SKIP, S_EMPTYTAG,
    195 		S_GI, '>', A_GI_STAGC, S_PCDATA,
    196 		S_GI, 0, A_SAVE, S_GI,
    197 		S_GI, -1, A_SKIP, S_DONE,
    198 		S_GI, ' ', A_GI, S_TAGWS,
    199 		S_GI, '\n', A_GI, S_TAGWS,
    200 		S_GI, '\t', A_GI, S_TAGWS,
    201 		S_NCR, 0, A_ENTITY, S_NCR,
    202 		S_NCR, -1, A_ENTITY, S_DONE,
    203 		S_PCDATA, '&', A_ENTITY_START, S_ENT,
    204 		S_PCDATA, '<', A_PCDATA, S_TAG,
    205 		S_PCDATA, 0, A_SAVE, S_PCDATA,
    206 		S_PCDATA, -1, A_PCDATA, S_DONE,
    207 		S_PI, '>', A_PI, S_PCDATA,
    208 		S_PI, 0, A_SAVE, S_PI,
    209 		S_PI, -1, A_PI, S_DONE,
    210 		S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
    211 		S_PITARGET, 0, A_SAVE, S_PITARGET,
    212 		S_PITARGET, -1, A_PITARGET_PI, S_DONE,
    213 		S_PITARGET, ' ', A_PITARGET, S_PI,
    214 		S_PITARGET, '\n', A_PITARGET, S_PI,
    215 		S_PITARGET, '\t', A_PITARGET, S_PI,
    216 		S_QUOT, '"', A_AVAL, S_TAGWS,
    217 		S_QUOT, 0, A_SAVE, S_QUOT,
    218 		S_QUOT, -1, A_AVAL_STAGC, S_DONE,
    219 		S_QUOT, ' ', A_SP, S_QUOT,
    220 		S_QUOT, '\n', A_SP, S_QUOT,
    221 		S_QUOT, '\t', A_SP, S_QUOT,
    222 		S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
    223 		S_STAGC, 0, A_SAVE, S_STAGC,
    224 		S_STAGC, -1, A_AVAL_STAGC, S_DONE,
    225 		S_STAGC, ' ', A_AVAL, S_TAGWS,
    226 		S_STAGC, '\n', A_AVAL, S_TAGWS,
    227 		S_STAGC, '\t', A_AVAL, S_TAGWS,
    228 		S_TAG, '!', A_SKIP, S_DECL,
    229 		S_TAG, '?', A_SKIP, S_PITARGET,
    230 		S_TAG, '/', A_SKIP, S_ETAG,
    231 		S_TAG, '<', A_SAVE, S_TAG,
    232 		S_TAG, 0, A_SAVE, S_GI,
    233 		S_TAG, -1, A_LT_PCDATA, S_DONE,
    234 		S_TAG, ' ', A_LT, S_PCDATA,
    235 		S_TAG, '\n', A_LT, S_PCDATA,
    236 		S_TAG, '\t', A_LT, S_PCDATA,
    237 		S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
    238 		S_TAGWS, '>', A_STAGC, S_PCDATA,
    239 		S_TAGWS, 0, A_SAVE, S_ANAME,
    240 		S_TAGWS, -1, A_STAGC, S_DONE,
    241 		S_TAGWS, ' ', A_SKIP, S_TAGWS,
    242 		S_TAGWS, '\n', A_SKIP, S_TAGWS,
    243 		S_TAGWS, '\t', A_SKIP, S_TAGWS,
    244 		S_XNCR, 0, A_ENTITY, S_XNCR,
    245 		S_XNCR, -1, A_ENTITY, S_DONE,
    246 
    247 	};
    248 	private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
    249 	private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};
    250 
    251 
    252 	// End of state table
    253 
    254 	private String thePublicid;			// Locator state
    255 	private String theSystemid;
    256 	private int theLastLine;
    257 	private int theLastColumn;
    258 	private int theCurrentLine;
    259 	private int theCurrentColumn;
    260 
    261 	int theState;					// Current state
    262 	int theNextState;				// Next state
    263 	char[] theOutputBuffer = new char[200];	// Output buffer
    264 	int theSize;					// Current buffer size
    265 	int[] theWinMap = {				// Windows chars map
    266 		0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    267 		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
    268 		0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    269 		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};
    270 
    271 	// Compensate for bug in PushbackReader that allows
    272 	// pushing back EOF.
    273 	private void unread(PushbackReader r, int c) throws IOException {
    274 		if (c != -1) r.unread(c);
    275 		}
    276 
    277 	// Locator implementation
    278 
    279 	public int getLineNumber() {
    280 		return theLastLine;
    281 		}
    282 	public int getColumnNumber() {
    283 		return theLastColumn;
    284 		}
    285 	public String getPublicId() {
    286 		return thePublicid;
    287 		}
    288 	public String getSystemId() {
    289 		return theSystemid;
    290 		}
    291 
    292 
    293 	// Scanner implementation
    294 
    295 	/**
    296 	Reset document locator, supplying systemid and publicid.
    297 	@param systemid System id
    298 	@param publicid Public id
    299 	*/
    300 
    301 	public void resetDocumentLocator(String publicid, String systemid) {
    302 		thePublicid = publicid;
    303 		theSystemid = systemid;
    304 		theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
    305 		}
    306 
    307 	/**
    308 	Scan HTML source, reporting lexical events.
    309 	@param r0 Reader that provides characters
    310 	@param h ScanHandler that accepts lexical events.
    311 	*/
    312 
    313 	public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
    314 		theState = S_PCDATA;
    315 		PushbackReader r;
    316 		if (r0 instanceof PushbackReader) {
    317 			r = (PushbackReader)r0;
    318 			}
    319 		else if (r0 instanceof BufferedReader) {
    320 			r = new PushbackReader(r0);
    321 			}
    322 		else {
    323 			r = new PushbackReader(new BufferedReader(r0, 200));
    324 			}
    325 
    326 		int firstChar = r.read();	// Remove any leading BOM
    327 		if (firstChar != '\uFEFF') unread(r, firstChar);
    328 
    329 		while (theState != S_DONE) {
    330 			int ch = r.read();
    331 
    332 			// Process control characters
    333 			if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];
    334 
    335 			if (ch == '\r') {
    336 				ch = r.read();		// expect LF next
    337 				if (ch != '\n') {
    338 					unread(r, ch);	// nope
    339 					ch = '\n';
    340 					}
    341 				}
    342 
    343 			if (ch == '\n') {
    344 				theCurrentLine++;
    345 				theCurrentColumn = 0;
    346 				}
    347 			else {
    348 				theCurrentColumn++;
    349 				}
    350 
    351 			if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue;
    352 
    353 			// Search state table
    354 			int action = 0;
    355 			for (int i = 0; i < statetable.length; i += 4) {
    356 				if (theState != statetable[i]) {
    357 					if (action != 0) break;
    358 					continue;
    359 					}
    360 				if (statetable[i+1] == 0) {
    361 					action = statetable[i+2];
    362 					theNextState = statetable[i+3];
    363 					}
    364 				else if (statetable[i+1] == ch) {
    365 					action = statetable[i+2];
    366 					theNextState = statetable[i+3];
    367 					break;
    368 					}
    369 				}
    370 //			System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
    371 			switch (action) {
    372 			case 0:
    373 				throw new Error(
    374 "HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
    375 Integer.toString(theState));
    376         		case A_ADUP:
    377 				h.adup(theOutputBuffer, 0, theSize);
    378 				theSize = 0;
    379 				break;
    380         		case A_ADUP_SAVE:
    381 				h.adup(theOutputBuffer, 0, theSize);
    382 				theSize = 0;
    383 				save(ch, h);
    384 				break;
    385         		case A_ADUP_STAGC:
    386 				h.adup(theOutputBuffer, 0, theSize);
    387 				theSize = 0;
    388 				h.stagc(theOutputBuffer, 0, theSize);
    389 				break;
    390         		case A_ANAME:
    391 				h.aname(theOutputBuffer, 0, theSize);
    392 				theSize = 0;
    393 				break;
    394         		case A_ANAME_ADUP:
    395 				h.aname(theOutputBuffer, 0, theSize);
    396 				theSize = 0;
    397 				h.adup(theOutputBuffer, 0, theSize);
    398 				break;
    399         		case A_ANAME_ADUP_STAGC:
    400 				h.aname(theOutputBuffer, 0, theSize);
    401 				theSize = 0;
    402 				h.adup(theOutputBuffer, 0, theSize);
    403 				h.stagc(theOutputBuffer, 0, theSize);
    404 				break;
    405         		case A_AVAL:
    406 				h.aval(theOutputBuffer, 0, theSize);
    407 				theSize = 0;
    408 				break;
    409         		case A_AVAL_STAGC:
    410 				h.aval(theOutputBuffer, 0, theSize);
    411 				theSize = 0;
    412 				h.stagc(theOutputBuffer, 0, theSize);
    413 				break;
    414 			case A_CDATA:
    415 				mark();
    416 				// suppress the final "]]" in the buffer
    417 				if (theSize > 1) theSize -= 2;
    418 				h.pcdata(theOutputBuffer, 0, theSize);
    419 				theSize = 0;
    420 				break;
    421 			case A_ENTITY_START:
    422 				h.pcdata(theOutputBuffer, 0, theSize);
    423 				theSize = 0;
    424 				save(ch, h);
    425 				break;
    426 			case A_ENTITY:
    427 				mark();
    428 				char ch1 = (char)ch;
    429 //				System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
    430 				if (theState == S_ENT && ch1 == '#') {
    431 					theNextState = S_NCR;
    432 					save(ch, h);
    433 					break;
    434 					}
    435 				else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) {
    436 					theNextState = S_XNCR;
    437 					save(ch, h);
    438 					break;
    439 					}
    440 				else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
    441 					save(ch, h);
    442 					break;
    443 					}
    444 				else if (theState == S_NCR && Character.isDigit(ch1)) {
    445 					save(ch, h);
    446 					break;
    447 					}
    448 				else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
    449 					save(ch, h);
    450 					break;
    451 					}
    452 
    453 				// The whole entity reference has been collected
    454 //				System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
    455 				h.entity(theOutputBuffer, 1, theSize - 1);
    456 				int ent = h.getEntity();
    457 //				System.err.println("%% value = " + ent);
    458 				if (ent != 0) {
    459 					theSize = 0;
    460 					if (ent >= 0x80 && ent <= 0x9F) {
    461 						ent = theWinMap[ent-0x80];
    462 						}
    463 					if (ent < 0x20) {
    464 						// Control becomes space
    465 						ent = 0x20;
    466 						}
    467 					else if (ent >= 0xD800 && ent <= 0xDFFF) {
    468 						// Surrogates get dropped
    469 						ent = 0;
    470 						}
    471 					else if (ent <= 0xFFFF) {
    472 						// BMP character
    473 						save(ent, h);
    474 						}
    475 					else {
    476 						// Astral converted to two surrogates
    477 						ent -= 0x10000;
    478 						save((ent>>10) + 0xD800, h);
    479 						save((ent&0x3FF) + 0xDC00, h);
    480 						}
    481 					if (ch != ';') {
    482 						unread(r, ch);
    483 						theCurrentColumn--;
    484 						}
    485 					}
    486 				else {
    487 					unread(r, ch);
    488 					theCurrentColumn--;
    489 					}
    490 				theNextState = S_PCDATA;
    491 				break;
    492         		case A_ETAG:
    493 				h.etag(theOutputBuffer, 0, theSize);
    494 				theSize = 0;
    495 				break;
    496         		case A_DECL:
    497 				h.decl(theOutputBuffer, 0, theSize);
    498 				theSize = 0;
    499 				break;
    500         		case A_GI:
    501 				h.gi(theOutputBuffer, 0, theSize);
    502 				theSize = 0;
    503 				break;
    504 			case A_GI_STAGC:
    505 				h.gi(theOutputBuffer, 0, theSize);
    506 				theSize = 0;
    507 				h.stagc(theOutputBuffer, 0, theSize);
    508 				break;
    509         		case A_LT:
    510 				mark();
    511 				save('<', h);
    512 				save(ch, h);
    513 				break;
    514 			case A_LT_PCDATA:
    515 				mark();
    516 				save('<', h);
    517 				h.pcdata(theOutputBuffer, 0, theSize);
    518 				theSize = 0;
    519 				break;
    520         		case A_PCDATA:
    521 				mark();
    522 				h.pcdata(theOutputBuffer, 0, theSize);
    523 				theSize = 0;
    524 				break;
    525 			case A_CMNT:
    526 				mark();
    527 				h.cmnt(theOutputBuffer, 0, theSize);
    528 				theSize = 0;
    529 				break;
    530 			case A_MINUS3:
    531 				save('-', h);
    532 				save(' ', h);
    533 				break;
    534 			case A_MINUS2:
    535 				save('-', h);
    536 				save(' ', h);
    537 				// fall through into A_MINUS
    538 			case A_MINUS:
    539 				save('-', h);
    540 				save(ch, h);
    541 				break;
    542         		case A_PI:
    543 				mark();
    544 				h.pi(theOutputBuffer, 0, theSize);
    545 				theSize = 0;
    546 				break;
    547         		case A_PITARGET:
    548 				h.pitarget(theOutputBuffer, 0, theSize);
    549 				theSize = 0;
    550 				break;
    551         		case A_PITARGET_PI:
    552 				h.pitarget(theOutputBuffer, 0, theSize);
    553 				theSize = 0;
    554 				h.pi(theOutputBuffer, 0, theSize);
    555 				break;
    556         		case A_SAVE:
    557 				save(ch, h);
    558 				break;
    559         		case A_SKIP:
    560 				break;
    561         		case A_SP:
    562 				save(' ', h);
    563 				break;
    564         		case A_STAGC:
    565 				h.stagc(theOutputBuffer, 0, theSize);
    566 				theSize = 0;
    567 				break;
    568 			case A_EMPTYTAG:
    569 				mark();
    570 //				System.err.println("%%% Empty tag seen");
    571 				if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
    572 				theSize = 0;
    573 				h.stage(theOutputBuffer, 0, theSize);
    574 				break;
    575 			case A_UNGET:
    576 				unread(r, ch);
    577 				theCurrentColumn--;
    578 				break;
    579         		case A_UNSAVE_PCDATA:
    580 				if (theSize > 0) theSize--;
    581 				h.pcdata(theOutputBuffer, 0, theSize);
    582 				theSize = 0;
    583 				break;
    584 			default:
    585 				throw new Error("Can't process state " + action);
    586 				}
    587 			theState = theNextState;
    588 			}
    589 		h.eof(theOutputBuffer, 0, 0);
    590 		}
    591 
    592 	/**
    593 	* Mark the current scan position as a "point of interest" - start of a tag,
    594 	* cdata, processing instruction etc.
    595 	*/
    596 
    597 	private void mark() {
    598 		theLastColumn = theCurrentColumn;
    599 		theLastLine = theCurrentLine;
    600 		}
    601 
    602 	/**
    603 	A callback for the ScanHandler that allows it to force
    604 	the lexer state to CDATA content (no markup is recognized except
    605 	the end of element.
    606 	*/
    607 
    608 	public void startCDATA() { theNextState = S_CDATA; }
    609 
    610 	private void save(int ch, ScanHandler h) throws IOException, SAXException {
    611 		if (theSize >= theOutputBuffer.length - 20) {
    612 			if (theState == S_PCDATA || theState == S_CDATA) {
    613 				// Return a buffer-sized chunk of PCDATA
    614 				h.pcdata(theOutputBuffer, 0, theSize);
    615 				theSize = 0;
    616 				}
    617 			else {
    618 				// Grow the buffer size
    619 				char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
    620                                 System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1);
    621 				theOutputBuffer = newOutputBuffer;
    622 				}
    623 			}
    624 		theOutputBuffer[theSize++] = (char)ch;
    625 		}
    626 
    627 	/**
    628 	Test procedure.  Reads HTML from the standard input and writes
    629 	PYX to the standard output.
    630 	*/
    631 
    632 	public static void main(String[] argv) throws IOException, SAXException {
    633 		Scanner s = new HTMLScanner();
    634 		Reader r = new InputStreamReader(System.in, "UTF-8");
    635 		Writer w = new OutputStreamWriter(System.out, "UTF-8");
    636 		PYXWriter pw = new PYXWriter(w);
    637 		s.scan(r, pw);
    638 		w.close();
    639 		}
    640 
    641 
    642 	private static String nicechar(int in) {
    643 		if (in == '\n') return "\\n";
    644 		if (in < 32) return "0x"+Integer.toHexString(in);
    645 		return "'"+((char)in)+"'";
    646 		}
    647 
    648 	}
    649