Home | History | Annotate | Download | only in tagsoup
      1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
      2 //
      3 // TagSoup is licensed under the Apache License,
      4 // Version 2.0.  You may obtain a copy of this license at
      5 // http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
      6 // additional legal rights not granted by this license.
      7 //
      8 // TagSoup is distributed in the hope that it will be useful, but
      9 // unless required by applicable law or agreed to in writing, TagSoup
     10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
     11 // OF ANY KIND, either express or implied; not even the implied warranty
     12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     13 //
     14 //
     15 // The TagSoup parser
     16 
     17 package org.ccil.cowan.tagsoup;
     18 import java.util.HashMap;
     19 import java.util.ArrayList;
     20 import java.io.*;
     21 import java.net.URL;
     22 import java.net.URLConnection;
     23 import org.xml.sax.*;
     24 import org.xml.sax.helpers.DefaultHandler;
     25 import org.xml.sax.ext.LexicalHandler;
     26 
     27 
     28 /**
     29 The SAX parser class.
     30 **/
     31 public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler {
     32 
     33 	// XMLReader implementation
     34 
     35 	private ContentHandler theContentHandler = this;
     36 	private LexicalHandler theLexicalHandler = this;
     37 	private DTDHandler theDTDHandler = this;
     38 	private ErrorHandler theErrorHandler = this;
     39 	private EntityResolver theEntityResolver = this;
     40 	private Schema theSchema;
     41 	private Scanner theScanner;
     42 	private AutoDetector theAutoDetector;
     43 
     44 	// Default values for feature flags
     45 
     46 	private static boolean DEFAULT_NAMESPACES = true;
     47 	private static boolean DEFAULT_IGNORE_BOGONS = false;
     48 	private static boolean DEFAULT_BOGONS_EMPTY = false;
     49         private static boolean DEFAULT_ROOT_BOGONS = true;
     50 	private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true;
     51 	private static boolean DEFAULT_TRANSLATE_COLONS = false;
     52 	private static boolean DEFAULT_RESTART_ELEMENTS = true;
     53 	private static boolean DEFAULT_IGNORABLE_WHITESPACE = false;
     54 	private static boolean DEFAULT_CDATA_ELEMENTS = true;
     55 
     56 	// Feature flags.
     57 
     58 	private boolean namespaces = DEFAULT_NAMESPACES;
     59 	private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS;
     60 	private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY;
     61         private boolean rootBogons = DEFAULT_ROOT_BOGONS;
     62 	private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES;
     63 	private boolean translateColons = DEFAULT_TRANSLATE_COLONS;
     64 	private boolean restartElements = DEFAULT_RESTART_ELEMENTS;
     65 	private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE;
     66 	private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS;
     67 
     68 	/**
     69 	A value of "true" indicates namespace URIs and unprefixed local
     70 	names for element and attribute names will be available.
     71 	**/
     72 	public final static String namespacesFeature =
     73 		"http://xml.org/sax/features/namespaces";
     74 
     75 	/**
     76 	A value of "true" indicates that XML qualified names (with prefixes)
     77 	and attributes (including xmlns* attributes) will be available.
     78 	We don't support this value.
     79 	**/
     80 	public final static String namespacePrefixesFeature =
     81 		"http://xml.org/sax/features/namespace-prefixes";
     82 
     83 	/**
     84 	Reports whether this parser processes external general entities
     85 	(it doesn't).
     86 	**/
     87 	public final static String externalGeneralEntitiesFeature =
     88 		"http://xml.org/sax/features/external-general-entities";
     89 
     90 	/**
     91 	Reports whether this parser processes external parameter entities
     92 	(it doesn't).
     93 	**/
     94 	public final static String externalParameterEntitiesFeature =
     95 		"http://xml.org/sax/features/external-parameter-entities";
     96 
     97 	/**
     98 	May be examined only during a parse, after the startDocument()
     99 	callback has been completed; read-only. The value is true if
    100 	the document specified standalone="yes" in its XML declaration,
    101 	and otherwise is false.  (It's always false.)
    102 	**/
    103 	public final static String isStandaloneFeature =
    104 		"http://xml.org/sax/features/is-standalone";
    105 
    106 	/**
    107 	A value of "true" indicates that the LexicalHandler will report
    108 	the beginning and end of parameter entities (it won't).
    109 	**/
    110 	public final static String lexicalHandlerParameterEntitiesFeature =
    111 		"http://xml.org/sax/features/lexical-handler/parameter-entities";
    112 
    113 	/**
    114 	A value of "true" indicates that system IDs in declarations will
    115 	be absolutized (relative to their base URIs) before reporting.
    116 	(This returns true but doesn't actually do anything.)
    117 	**/
    118 	public final static String resolveDTDURIsFeature =
    119 		"http://xml.org/sax/features/resolve-dtd-uris";
    120 
    121 	/**
    122 	Has a value of "true" if all XML names (for elements,
    123 	prefixes, attributes, entities, notations, and local
    124 	names), as well as Namespace URIs, will have been interned
    125 	using java.lang.String.intern. This supports fast testing of
    126 	equality/inequality against string constants, rather than forcing
    127 	slower calls to String.equals().  (We always intern.)
    128 	**/
    129 	public final static String stringInterningFeature =
    130 		"http://xml.org/sax/features/string-interning";
    131 
    132 	/**
    133 	Returns "true" if the Attributes objects passed by this
    134 	parser in ContentHandler.startElement() implement the
    135 	org.xml.sax.ext.Attributes2 interface.	(They don't.)
    136 	**/
    137 
    138 	public final static String useAttributes2Feature =
    139 		"http://xml.org/sax/features/use-attributes2";
    140 
    141 	/**
    142 	Returns "true" if the Locator objects passed by this parser
    143 	in ContentHandler.setDocumentLocator() implement the
    144 	org.xml.sax.ext.Locator2 interface.  (They don't.)
    145 	**/
    146 	public final static String useLocator2Feature =
    147 		"http://xml.org/sax/features/use-locator2";
    148 
    149 	/**
    150 	Returns "true" if, when setEntityResolver is given an object
    151 	implementing the org.xml.sax.ext.EntityResolver2 interface,
    152 	those new methods will be used.  (They won't be.)
    153 	**/
    154 	public final static String useEntityResolver2Feature =
    155 		"http://xml.org/sax/features/use-entity-resolver2";
    156 
    157 	/**
    158 	Controls whether the parser is reporting all validity errors
    159 	(We don't report any validity errors.)
    160 	**/
    161 	public final static String validationFeature =
    162 		"http://xml.org/sax/features/validation";
    163 
    164 	/**
    165 	Controls whether the parser reports Unicode normalization
    166 	errors as described in section 2.13 and Appendix B of the XML
    167 	1.1 Recommendation.  (We don't normalize.)
    168 	**/
    169 	public final static String unicodeNormalizationCheckingFeature =
    170 "http://xml.org/sax/features/unicode-normalization-checking";
    171 
    172 	/**
    173 	Controls whether, when the namespace-prefixes feature is set,
    174 	the parser treats namespace declaration attributes as being in
    175 	the http://www.w3.org/2000/xmlns/ namespace.  (It doesn't.)
    176 	**/
    177 	public final static String xmlnsURIsFeature =
    178 		"http://xml.org/sax/features/xmlns-uris";
    179 
    180 	/**
    181 	Returns "true" if the parser supports both XML 1.1 and XML 1.0.
    182 	(Always false.)
    183 	**/
    184 	public final static String XML11Feature =
    185 		"http://xml.org/sax/features/xml-1.1";
    186 
    187 	/**
    188 	A value of "true" indicates that the parser will ignore
    189 	unknown elements.
    190 	**/
    191 	public final static String ignoreBogonsFeature =
    192 		"http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons";
    193 
    194 	/**
    195 	A value of "true" indicates that the parser will give unknown
    196 	elements a content model of EMPTY; a value of "false", a
    197 	content model of ANY.
    198 	**/
    199 	public final static String bogonsEmptyFeature =
    200 		"http://www.ccil.org/~cowan/tagsoup/features/bogons-empty";
    201 
    202 	/**
    203 	A value of "true" indicates that the parser will allow unknown
    204 	elements to be the root element.
    205 	**/
    206 	public final static String rootBogonsFeature =
    207 		"http://www.ccil.org/~cowan/tagsoup/features/root-bogons";
    208 
    209 	/**
    210 	A value of "true" indicates that the parser will return default
    211 	attribute values for missing attributes that have default values.
    212 	**/
    213 	public final static String defaultAttributesFeature =
    214 		"http://www.ccil.org/~cowan/tagsoup/features/default-attributes";
    215 
    216 	/**
    217 	A value of "true" indicates that the parser will
    218 	translate colons into underscores in names.
    219 	**/
    220 	public final static String translateColonsFeature =
    221 		"http://www.ccil.org/~cowan/tagsoup/features/translate-colons";
    222 
    223 	/**
    224 	A value of "true" indicates that the parser will
    225 	attempt to restart the restartable elements.
    226 	**/
    227 	public final static String restartElementsFeature =
    228 		"http://www.ccil.org/~cowan/tagsoup/features/restart-elements";
    229 
    230 	/**
    231 	A value of "true" indicates that the parser will
    232 	transmit whitespace in element-only content via the SAX
    233 	ignorableWhitespace callback.  Normally this is not done,
    234 	because HTML is an SGML application and SGML suppresses
    235 	such whitespace.
    236 	**/
    237 	public final static String ignorableWhitespaceFeature =
    238 		"http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace";
    239 
    240 	/**
    241 	A value of "true" indicates that the parser will treat CDATA
    242 	elements specially.  Normally true, since the input is by
    243 	default HTML.
    244 	**/
    245 	public final static String CDATAElementsFeature =
    246 		"http://www.ccil.org/~cowan/tagsoup/features/cdata-elements";
    247 
    248 	/**
    249 	Used to see some syntax events that are essential in some
    250 	applications: comments, CDATA delimiters, selected general
    251 	entity inclusions, and the start and end of the DTD (and
    252 	declaration of document element name). The Object must implement
    253 	org.xml.sax.ext.LexicalHandler.
    254 	**/
    255 	public final static String lexicalHandlerProperty =
    256 		"http://xml.org/sax/properties/lexical-handler";
    257 
    258 	/**
    259 	Specifies the Scanner object this Parser uses.
    260 	**/
    261 	public final static String scannerProperty =
    262 		"http://www.ccil.org/~cowan/tagsoup/properties/scanner";
    263 
    264 	/**
    265 	Specifies the Schema object this Parser uses.
    266 	**/
    267 	public final static String schemaProperty =
    268 		"http://www.ccil.org/~cowan/tagsoup/properties/schema";
    269 
    270 	/**
    271 	Specifies the AutoDetector (for encoding detection) this Parser uses.
    272 	**/
    273 	public final static String autoDetectorProperty =
    274 		"http://www.ccil.org/~cowan/tagsoup/properties/auto-detector";
    275 
    276 	// Due to sucky Java order of initialization issues, these
    277 	// entries are maintained separately from the initial values of
    278 	// the corresponding instance variables, but care must be taken
    279 	// to keep them in sync.
    280 
    281 	private HashMap theFeatures = new HashMap();
    282 	{
    283 		theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES));
    284 		theFeatures.put(namespacePrefixesFeature, Boolean.FALSE);
    285 		theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE);
    286 		theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE);
    287 		theFeatures.put(isStandaloneFeature, Boolean.FALSE);
    288 		theFeatures.put(lexicalHandlerParameterEntitiesFeature,
    289 			Boolean.FALSE);
    290 		theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE);
    291 		theFeatures.put(stringInterningFeature, Boolean.TRUE);
    292 		theFeatures.put(useAttributes2Feature, Boolean.FALSE);
    293 		theFeatures.put(useLocator2Feature, Boolean.FALSE);
    294 		theFeatures.put(useEntityResolver2Feature, Boolean.FALSE);
    295 		theFeatures.put(validationFeature, Boolean.FALSE);
    296 		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
    297 		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
    298 		theFeatures.put(XML11Feature, Boolean.FALSE);
    299 		theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS));
    300 		theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY));
    301 		theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS));
    302 		theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES));
    303 		theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS));
    304 		theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS));
    305 		theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE));
    306 		theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS));
    307 		}
    308 
    309 	// Private clone of Boolean.valueOf that is guaranteed to return
    310 	// Boolean.TRUE or Boolean.FALSE
    311 	private static Boolean truthValue(boolean b) {
    312 		return b ? Boolean.TRUE : Boolean.FALSE;
    313 		}
    314 
    315 
    316 	public boolean getFeature (String name)
    317 		throws SAXNotRecognizedException, SAXNotSupportedException {
    318 		Boolean b = (Boolean)theFeatures.get(name);
    319 		if (b == null) {
    320 			throw new SAXNotRecognizedException("Unknown feature " + name);
    321 			}
    322 		return b.booleanValue();
    323 		}
    324 
    325 	public void setFeature (String name, boolean value)
    326 	throws SAXNotRecognizedException, SAXNotSupportedException {
    327 		Boolean b = (Boolean)theFeatures.get(name);
    328 		if (b == null) {
    329 			throw new SAXNotRecognizedException("Unknown feature " + name);
    330 			}
    331 		if (value) theFeatures.put(name, Boolean.TRUE);
    332 		else theFeatures.put(name, Boolean.FALSE);
    333 
    334 		if (name.equals(namespacesFeature)) namespaces = value;
    335 		else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value;
    336 		else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value;
    337 		else if (name.equals(rootBogonsFeature)) rootBogons = value;
    338 		else if (name.equals(defaultAttributesFeature)) defaultAttributes = value;
    339 		else if (name.equals(translateColonsFeature)) translateColons = value;
    340 		else if (name.equals(restartElementsFeature)) restartElements = value;
    341 		else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value;
    342 		else if (name.equals(CDATAElementsFeature)) CDATAElements = value;
    343 		}
    344 
    345 	public Object getProperty (String name)
    346 	throws SAXNotRecognizedException, SAXNotSupportedException {
    347 		if (name.equals(lexicalHandlerProperty)) {
    348 			return theLexicalHandler == this ? null : theLexicalHandler;
    349 			}
    350 		else if (name.equals(scannerProperty)) {
    351 			return theScanner;
    352 			}
    353 		else if (name.equals(schemaProperty)) {
    354 			return theSchema;
    355 			}
    356 		else if (name.equals(autoDetectorProperty)) {
    357 			return theAutoDetector;
    358 			}
    359 		else {
    360 			throw new SAXNotRecognizedException("Unknown property " + name);
    361 			}
    362 		}
    363 
    364 	public void setProperty (String name, Object value)
    365 	throws SAXNotRecognizedException, SAXNotSupportedException {
    366 		if (name.equals(lexicalHandlerProperty)) {
    367 			if (value == null) {
    368 				theLexicalHandler = this;
    369 				}
    370 			else if (value instanceof LexicalHandler) {
    371 				theLexicalHandler = (LexicalHandler)value;
    372 				}
    373 			else {
    374 				throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler");
    375 				}
    376 			}
    377 		else if (name.equals(scannerProperty)) {
    378 			if (value instanceof Scanner) {
    379 				theScanner = (Scanner)value;
    380 				}
    381 			else {
    382 				throw new SAXNotSupportedException("Your scanner is not a Scanner");
    383 				}
    384 			}
    385 		else if (name.equals(schemaProperty)) {
    386 			if (value instanceof Schema) {
    387 				theSchema = (Schema)value;
    388 				}
    389 			else {
    390 				 throw new SAXNotSupportedException("Your schema is not a Schema");
    391 				}
    392 			}
    393 		else if (name.equals(autoDetectorProperty)) {
    394 			if (value instanceof AutoDetector) {
    395 				theAutoDetector = (AutoDetector)value;
    396 				}
    397 			else {
    398 				throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector");
    399 				}
    400 			}
    401 		else {
    402 			throw new SAXNotRecognizedException("Unknown property " + name);
    403 			}
    404 		}
    405 
    406 	public void setEntityResolver (EntityResolver resolver) {
    407 		theEntityResolver = (resolver == null) ? this : resolver;
    408 		}
    409 
    410 	public EntityResolver getEntityResolver () {
    411 		return (theEntityResolver == this) ? null : theEntityResolver;
    412 		}
    413 
    414 	public void setDTDHandler (DTDHandler handler) {
    415 		theDTDHandler = (handler == null) ? this : handler;
    416 		}
    417 
    418 	public DTDHandler getDTDHandler () {
    419 		return (theDTDHandler == this) ? null : theDTDHandler;
    420 		}
    421 
    422 	public void setContentHandler (ContentHandler handler) {
    423 		theContentHandler = (handler == null) ? this : handler;
    424 		}
    425 
    426 	public ContentHandler getContentHandler () {
    427 		return (theContentHandler == this) ? null : theContentHandler;
    428 		}
    429 
    430 	public void setErrorHandler (ErrorHandler handler) {
    431 		theErrorHandler = (handler == null) ? this : handler;
    432 		}
    433 
    434 	public ErrorHandler getErrorHandler () {
    435 		return (theErrorHandler == this) ? null : theErrorHandler;
    436 		}
    437 
    438 	public void parse (InputSource input) throws IOException, SAXException {
    439 		setup();
    440 		Reader r = getReader(input);
    441 		theContentHandler.startDocument();
    442 		theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId());
    443 		if (theScanner instanceof Locator) {
    444 			theContentHandler.setDocumentLocator((Locator)theScanner);
    445 			}
    446 		if (!(theSchema.getURI().equals("")))
    447 			theContentHandler.startPrefixMapping(theSchema.getPrefix(),
    448 				theSchema.getURI());
    449 		theScanner.scan(r, this);
    450 		}
    451 
    452 	public void parse (String systemid) throws IOException, SAXException {
    453 		parse(new InputSource(systemid));
    454 		}
    455 
    456 	// Sets up instance variables that haven't been set by setFeature
    457 	private void setup() {
    458 		if (theSchema == null) theSchema = new HTMLSchema();
    459 		if (theScanner == null) theScanner = new HTMLScanner();
    460 		if (theAutoDetector == null) {
    461 			theAutoDetector = new AutoDetector() {
    462 				public Reader autoDetectingReader(InputStream i) {
    463 					return new InputStreamReader(i);
    464 					}
    465 				};
    466 			}
    467 		theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes);
    468 		thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes);
    469 		theNewElement = null;
    470 		theAttributeName = null;
    471 		thePITarget = null;
    472 		theSaved = null;
    473 		theEntity = 0;
    474 		virginStack = true;
    475                 theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null;
    476 		}
    477 
    478 	// Return a Reader based on the contents of an InputSource
    479 	// Buffer both the InputStream and the Reader
    480 	private Reader getReader(InputSource s) throws SAXException, IOException {
    481 		Reader r = s.getCharacterStream();
    482 		InputStream i = s.getByteStream();
    483 		String encoding = s.getEncoding();
    484 		String publicid = s.getPublicId();
    485 		String systemid = s.getSystemId();
    486 		if (r == null) {
    487 			if (i == null) i = getInputStream(publicid, systemid);
    488 //			i = new BufferedInputStream(i);
    489 			if (encoding == null) {
    490 				r = theAutoDetector.autoDetectingReader(i);
    491 				}
    492 			else {
    493 				try {
    494 					r = new InputStreamReader(i, encoding);
    495 					}
    496 				catch (UnsupportedEncodingException e) {
    497 					r = new InputStreamReader(i);
    498 					}
    499 				}
    500 			}
    501 //		r = new BufferedReader(r);
    502 		return r;
    503 		}
    504 
    505 	// Get an InputStream based on a publicid and a systemid
    506 	private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException {
    507 		URL basis = new URL("file", "", System.getProperty("user.dir") + "/.");
    508 		URL url = new URL(basis, systemid);
    509 		URLConnection c = url.openConnection();
    510 		return c.getInputStream();
    511 		}
    512 		// We don't process publicids (who uses them anyhow?)
    513 
    514 	// ScanHandler implementation
    515 
    516 	private Element theNewElement = null;
    517 	private String theAttributeName = null;
    518 	private boolean theDoctypeIsPresent = false;
    519 	private String theDoctypePublicId = null;
    520 	private String theDoctypeSystemId = null;
    521 	private String theDoctypeName = null;
    522 	private String thePITarget = null;
    523 	private Element theStack = null;
    524 	private Element theSaved = null;
    525 	private Element thePCDATA = null;
    526 	private int theEntity = 0;	// needs to support chars past U+FFFF
    527 
    528 	public void adup(char[] buff, int offset, int length) throws SAXException {
    529 		if (theNewElement == null || theAttributeName == null) return;
    530 		theNewElement.setAttribute(theAttributeName, null, theAttributeName);
    531 		theAttributeName = null;
    532 		}
    533 
    534 	public void aname(char[] buff, int offset, int length) throws SAXException {
    535 		if (theNewElement == null) return;
    536 		// Currently we don't rely on Schema to canonicalize
    537 		// attribute names.
    538 		theAttributeName = makeName(buff, offset, length).toLowerCase();
    539 //		System.err.println("%% Attribute name " + theAttributeName);
    540 		}
    541 
    542 	public void aval(char[] buff, int offset, int length) throws SAXException {
    543 		if (theNewElement == null || theAttributeName == null) return;
    544 		String value = new String(buff, offset, length);
    545 //		System.err.println("%% Attribute value [" + value + "]");
    546 		value = expandEntities(value);
    547 		theNewElement.setAttribute(theAttributeName, null, value);
    548 		theAttributeName = null;
    549 //		System.err.println("%% Aval done");
    550 		}
    551 
    552 	// Expand entity references in attribute values selectively.
    553 	// Currently we expand a reference iff it is properly terminated
    554 	// with a semicolon.
    555 	private String expandEntities(String src) {
    556 		int refStart = -1;
    557 		int len = src.length();
    558 		char[] dst = new char[len];
    559 		int dstlen = 0;
    560 		for (int i = 0; i < len; i++) {
    561 			char ch = src.charAt(i);
    562 			dst[dstlen++] = ch;
    563 //			System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] ");
    564 			if (ch == '&' && refStart == -1) {
    565 				// start of a ref excluding &
    566 				refStart = dstlen;
    567 //				System.err.println("start of ref");
    568 				}
    569 			else if (refStart == -1) {
    570 				// not in a ref
    571 //				System.err.println("not in ref");
    572 				}
    573 			else if (Character.isLetter(ch) ||
    574 					Character.isDigit(ch) ||
    575 					ch == '#') {
    576 				// valid entity char
    577 //				System.err.println("valid");
    578 				}
    579 			else if (ch == ';') {
    580 				// properly terminated ref
    581 //				System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]");
    582 				int ent = lookupEntity(dst, refStart, dstlen - refStart - 1);
    583 //				System.err.println(" = " + ent);
    584 				if (ent > 0xFFFF) {
    585 					ent -= 0x10000;
    586 					dst[refStart - 1] = (char)((ent>>10) + 0xD800);
    587 					dst[refStart] = (char)((ent&0x3FF) + 0xDC00);
    588 					dstlen = refStart + 1;
    589 					}
    590 				else if (ent != 0) {
    591 					dst[refStart - 1] = (char)ent;
    592 					dstlen = refStart;
    593 					}
    594 				refStart = -1;
    595 				}
    596 			else {
    597 				// improperly terminated ref
    598 //				System.err.println("end of ref");
    599 				refStart = -1;
    600 				}
    601 			}
    602 		return new String(dst, 0, dstlen);
    603 		}
    604 
    605 	public void entity(char[] buff, int offset, int length) throws SAXException {
    606 		theEntity = lookupEntity(buff, offset, length);
    607 		}
    608 
    609 	// Process numeric character references,
    610 	// deferring to the schema for named ones.
    611 	private int lookupEntity(char[] buff, int offset, int length) {
    612 		int result = 0;
    613 		if (length < 1) return result;
    614 //		System.err.println("%% Entity at " + offset + " " + length);
    615 //		System.err.println("%% Got entity [" + new String(buff, offset, length) + "]");
    616 		if (buff[offset] == '#') {
    617                         if (length > 1 && (buff[offset+1] == 'x'
    618                                         || buff[offset+1] == 'X')) {
    619                                 try {
    620                                         return Integer.parseInt(new String(buff, offset + 2, length - 2), 16);
    621                                         }
    622                                 catch (NumberFormatException e) { return 0; }
    623                                 }
    624                         try {
    625                                 return Integer.parseInt(new String(buff, offset + 1, length - 1), 10);
    626                                 }
    627                         catch (NumberFormatException e) { return 0; }
    628                         }
    629 		return theSchema.getEntity(new String(buff, offset, length));
    630 		}
    631 
    632 	public void eof(char[] buff, int offset, int length) throws SAXException {
    633 		if (virginStack) rectify(thePCDATA);
    634 		while (theStack.next() != null) {
    635 			pop();
    636 			}
    637 		if (!(theSchema.getURI().equals("")))
    638 			theContentHandler.endPrefixMapping(theSchema.getPrefix());
    639 		theContentHandler.endDocument();
    640 		}
    641 
    642 	public void etag(char[] buff, int offset, int length) throws SAXException {
    643 		if (etag_cdata(buff, offset, length)) return;
    644 		etag_basic(buff, offset, length);
    645 		}
    646 
    647 	private static char[] etagchars = {'<', '/', '>'};
    648 	public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException {
    649 		String currentName = theStack.name();
    650 		// If this is a CDATA element and the tag doesn't match,
    651 		// or isn't properly formed (junk after the name),
    652 		// restart CDATA mode and process the tag as characters.
    653 		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
    654 			boolean realTag = (length == currentName.length());
    655 			if (realTag) {
    656 				for (int i = 0; i < length; i++) {
    657 					if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) {
    658 						realTag = false;
    659 						break;
    660 						}
    661 					}
    662 				}
    663 			if (!realTag) {
    664 				theContentHandler.characters(etagchars, 0, 2);
    665 				theContentHandler.characters(buff, offset, length);
    666 				theContentHandler.characters(etagchars, 2, 1);
    667 				theScanner.startCDATA();
    668 				return true;
    669 				}
    670 			}
    671 		return false;
    672 		}
    673 
    674 	public void etag_basic(char[] buff, int offset, int length) throws SAXException {
    675 		theNewElement = null;
    676 		String name;
    677 		if (length != 0) {
    678 			// Canonicalize case of name
    679 			name = makeName(buff, offset, length);
    680 //			System.err.println("got etag [" + name + "]");
    681 			ElementType type = theSchema.getElementType(name);
    682 			if (type == null) return;	// mysterious end-tag
    683 			name = type.name();
    684 			}
    685 		else {
    686 			name = theStack.name();
    687 			}
    688 //		System.err.println("%% Got end of " + name);
    689 
    690 		Element sp;
    691 		boolean inNoforce = false;
    692 		for (sp = theStack; sp != null; sp = sp.next()) {
    693 			if (sp.name().equals(name)) break;
    694 			if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true;
    695 			}
    696 
    697 		if (sp == null) return;		// Ignore unknown etags
    698 		if (sp.next() == null || sp.next().next() == null) return;
    699 		if (inNoforce) {		// inside an F_NOFORCE element?
    700 			sp.preclose();		// preclose the matching element
    701 			}
    702 		else {			// restartably pop everything above us
    703 			while (theStack != sp) {
    704 				restartablyPop();
    705 				}
    706 			pop();
    707 			}
    708 		// pop any preclosed elements now at the top
    709 		while (theStack.isPreclosed()) {
    710 			pop();
    711 			}
    712 		restart(null);
    713 		}
    714 
    715 	// Push restartables on the stack if possible
    716 	// e is the next element to be started, if we know what it is
    717 	private void restart(Element e) throws SAXException {
    718 		while (theSaved != null && theStack.canContain(theSaved) &&
    719 				(e == null || theSaved.canContain(e))) {
    720 			Element next = theSaved.next();
    721 			push(theSaved);
    722 			theSaved = next;
    723 			}
    724 		}
    725 
    726 	// Pop the stack irrevocably
    727 	private void pop() throws SAXException {
    728 		if (theStack == null) return;		// empty stack
    729 		String name = theStack.name();
    730 		String localName = theStack.localName();
    731 		String namespace = theStack.namespace();
    732 		String prefix = prefixOf(name);
    733 
    734 //		System.err.println("%% Popping " + name);
    735 		if (!namespaces) namespace = localName = "";
    736 		theContentHandler.endElement(namespace, localName, name);
    737 		if (foreign(prefix, namespace)) {
    738 			theContentHandler.endPrefixMapping(prefix);
    739 //			System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace);
    740 			}
    741 		Attributes atts = theStack.atts();
    742 		for (int i = atts.getLength() - 1; i >= 0; i--) {
    743 			String attNamespace = atts.getURI(i);
    744 			String attPrefix = prefixOf(atts.getQName(i));
    745 			if (foreign(attPrefix, attNamespace)) {
    746 				theContentHandler.endPrefixMapping(attPrefix);
    747 //			System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace);
    748 				}
    749 			}
    750 		theStack = theStack.next();
    751 		}
    752 
    753 	// Pop the stack restartably
    754 	private void restartablyPop() throws SAXException {
    755 		Element popped = theStack;
    756 		pop();
    757 		if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) {
    758 			popped.anonymize();
    759 			popped.setNext(theSaved);
    760 			theSaved = popped;
    761 			}
    762 		}
    763 
    764 	// Push element onto stack
    765 	private boolean virginStack = true;
    766 	private void push(Element e) throws SAXException {
    767 		String name = e.name();
    768 		String localName = e.localName();
    769 		String namespace = e.namespace();
    770 		String prefix = prefixOf(name);
    771 
    772 //		System.err.println("%% Pushing " + name);
    773 		e.clean();
    774 		if (!namespaces) namespace = localName = "";
    775                 if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) {
    776                     try {
    777                         theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId);
    778                     } catch (IOException ew) { }   // Can't be thrown for root I believe.
    779                 }
    780 		if (foreign(prefix, namespace)) {
    781 			theContentHandler.startPrefixMapping(prefix, namespace);
    782 //			System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace);
    783 			}
    784 		Attributes atts = e.atts();
    785 		int len = atts.getLength();
    786 		for (int i = 0; i < len; i++) {
    787 			String attNamespace = atts.getURI(i);
    788 			String attPrefix = prefixOf(atts.getQName(i));
    789 			if (foreign(attPrefix, attNamespace)) {
    790 				theContentHandler.startPrefixMapping(attPrefix, attNamespace);
    791 //				System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace);
    792 				}
    793 			}
    794 		theContentHandler.startElement(namespace, localName, name, e.atts());
    795 		e.setNext(theStack);
    796 		theStack = e;
    797 		virginStack = false;
    798 		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
    799 			theScanner.startCDATA();
    800 			}
    801 		}
    802 
    803 	// Get the prefix from a QName
    804 	private String prefixOf(String name) {
    805 		int i = name.indexOf(':');
    806 		String prefix = "";
    807 		if (i != -1) prefix = name.substring(0, i);
    808 //		System.err.println("%% " + prefix + " is prefix of " + name);
    809 		return prefix;
    810 		}
    811 
    812 	// Return true if we have a foreign name
    813 	private boolean foreign(String prefix, String namespace) {
    814 //		System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- ");
    815 		boolean foreign = !(prefix.equals("") || namespace.equals("") ||
    816 			namespace.equals(theSchema.getURI()));
    817 //		System.err.println(foreign);
    818 		return foreign;
    819 		}
    820 
    821         /**
    822          * Parsing the complete XML Document Type Definition is way too complex,
    823          * but for many simple cases we can extract something useful from it.
    824          *
    825          * doctypedecl  ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
    826          *  DeclSep     ::= PEReference | S
    827          *  intSubset   ::= (markupdecl | DeclSep)*
    828          *  markupdecl  ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
    829          *  ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
    830          */
    831 	public void decl(char[] buff, int offset, int length) throws SAXException {
    832 		String s = new String(buff, offset, length);
    833 		String name = null;
    834 		String systemid = null;
    835 		String publicid = null;
    836 		String[] v = split(s);
    837 		if (v.length > 0 && "DOCTYPE".equals(v[0])) {
    838 			if (theDoctypeIsPresent) return;		// one doctype only!
    839 			theDoctypeIsPresent = true;
    840 			if (v.length > 1) {
    841 				name = v[1];
    842 				if (v.length>3 && "SYSTEM".equals(v[2])) {
    843 				systemid = v[3];
    844 				}
    845 			else if (v.length > 3 && "PUBLIC".equals(v[2])) {
    846 				publicid = v[3];
    847 				if (v.length > 4) {
    848 					systemid = v[4];
    849 					}
    850 				else {
    851 					systemid = "";
    852 					}
    853                     }
    854                 }
    855             }
    856 		publicid = trimquotes(publicid);
    857 		systemid = trimquotes(systemid);
    858 		if (name != null) {
    859 			publicid = cleanPublicid(publicid);
    860 			theLexicalHandler.startDTD(name, publicid, systemid);
    861 			theLexicalHandler.endDTD();
    862 			theDoctypeName = name;
    863 			theDoctypePublicId = publicid;
    864 		if (theScanner instanceof Locator) {    // Must resolve systemid
    865                     theDoctypeSystemId  = ((Locator)theScanner).getSystemId();
    866                     try {
    867                         theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString();
    868                     } catch (Exception e) {}
    869                 }
    870             }
    871         }
    872 
    873 	// If the String is quoted, trim the quotes.
    874 	private static String trimquotes(String in) {
    875 		if (in == null) return in;
    876 		int length = in.length();
    877 		if (length == 0) return in;
    878 		char s = in.charAt(0);
    879 		char e = in.charAt(length - 1);
    880 		if (s == e && (s == '\'' || s == '"')) {
    881 			in = in.substring(1, in.length() - 1);
    882 			}
    883 		return in;
    884 		}
    885 
    886 	// Split the supplied String into words or phrases seperated by spaces.
    887 	// Recognises quotes around a phrase and doesn't split it.
    888 	private static String[] split(String val) throws IllegalArgumentException {
    889 		val = val.trim();
    890 		if (val.length() == 0) {
    891 			return new String[0];
    892 			}
    893 		else {
    894 			ArrayList l = new ArrayList();
    895 			int s = 0;
    896 			int e = 0;
    897 			boolean sq = false;	// single quote
    898 			boolean dq = false;	// double quote
    899 			char lastc = 0;
    900 			int len = val.length();
    901 			for (e=0; e < len; e++) {
    902 				char c = val.charAt(e);
    903 				if (!dq && c == '\'' && lastc != '\\') {
    904 				sq = !sq;
    905 				if (s < 0) s = e;
    906 				}
    907 			else if (!sq && c == '\"' && lastc != '\\') {
    908 				dq = !dq;
    909 				if (s < 0) s = e;
    910 				}
    911 			else if (!sq && !dq) {
    912 				if (Character.isWhitespace(c)) {
    913 					if (s >= 0) l.add(val.substring(s, e));
    914 					s = -1;
    915 					}
    916 				else if (s < 0 && c != ' ') {
    917 					s = e;
    918 					}
    919 				}
    920 			lastc = c;
    921 			}
    922 		l.add(val.substring(s, e));
    923 		return (String[])l.toArray(new String[0]);
    924 		}
    925         }
    926 
    927 	// Replace junk in publicids with spaces
    928 	private static String legal =
    929 		"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%";
    930 
    931 	private String cleanPublicid(String src) {
    932 		if (src == null) return null;
    933 		int len = src.length();
    934 		StringBuffer dst = new StringBuffer(len);
    935 		boolean suppressSpace = true;
    936 		for (int i = 0; i < len; i++) {
    937 			char ch = src.charAt(i);
    938 			if (legal.indexOf(ch) != -1) { 	// legal but not whitespace
    939 				dst.append(ch);
    940 				suppressSpace = false;
    941 				}
    942 			else if (suppressSpace) {	// normalizable whitespace or junk
    943 				;
    944 				}
    945 			else {
    946 				dst.append(' ');
    947 				suppressSpace = true;
    948 				}
    949 			}
    950 //		System.err.println("%% Publicid [" + dst.toString().trim() + "]");
    951 		return dst.toString().trim();	// trim any final junk whitespace
    952 		}
    953 
    954 
    955 	public void gi(char[] buff, int offset, int length) throws SAXException {
    956 		if (theNewElement != null) return;
    957 		String name = makeName(buff, offset, length);
    958 		if (name == null) return;
    959 		ElementType type = theSchema.getElementType(name);
    960 		if (type == null) {
    961 			// Suppress unknown elements if ignore-bogons is on
    962 			if (ignoreBogons) return;
    963 			int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY;
    964 			int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT);
    965 			theSchema.elementType(name, bogonModel, bogonMemberOf, 0);
    966 			if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name());
    967 			type = theSchema.getElementType(name);
    968 			}
    969 
    970 		theNewElement = new Element(type, defaultAttributes);
    971 //		System.err.println("%% Got GI " + theNewElement.name());
    972 		}
    973 
	/**
	 * Reports a CDATA section: the text is delivered through pcdata(),
	 * bracketed by startCDATA and endCDATA events on the lexical handler.
	 */
	public void cdsect(char[] buff, int offset, int length) throws SAXException {
		theLexicalHandler.startCDATA();
		pcdata(buff, offset, length);
		theLexicalHandler.endCDATA();
		}
    979 	public void pcdata(char[] buff, int offset, int length) throws SAXException {
    980 		if (length == 0) return;
    981 		boolean allWhite = true;
    982 		for (int i = 0; i < length; i++) {
    983 			if (!Character.isWhitespace(buff[offset+i])) {
    984 				allWhite = false;
    985 				}
    986 			}
    987 		if (allWhite && !theStack.canContain(thePCDATA)) {
    988 			if (ignorableWhitespace) {
    989 				theContentHandler.ignorableWhitespace(buff, offset, length);
    990 				}
    991 			}
    992 		else {
    993 			rectify(thePCDATA);
    994 			theContentHandler.characters(buff, offset, length);
    995 			}
    996 		}
    997 
    998 	public void pitarget(char[] buff, int offset, int length) throws SAXException {
    999 		if (theNewElement != null) return;
   1000 		thePITarget = makeName(buff, offset, length).replace(':', '_');
   1001 		}
   1002 
   1003 	public void pi(char[] buff, int offset, int length) throws SAXException {
   1004 		if (theNewElement != null || thePITarget == null) return;
   1005 		if ("xml".equalsIgnoreCase(thePITarget)) return;
   1006 //		if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI");
   1007 		if (length > 0 && buff[length - 1] == '?') length--;	// remove trailing ?
   1008 		theContentHandler.processingInstruction(thePITarget,
   1009 			new String(buff, offset, length));
   1010 		thePITarget = null;
   1011 		}
   1012 
   1013 	public void stagc(char[] buff, int offset, int length) throws SAXException {
   1014 //		System.err.println("%% Start-tag");
   1015 		if (theNewElement == null) return;
   1016 		rectify(theNewElement);
   1017 		if (theStack.model() == Schema.M_EMPTY) {
   1018 			// Force an immediate end tag
   1019 			etag_basic(buff, offset, length);
   1020 			}
   1021 		}
   1022 
   1023 	public void stage(char[] buff, int offset, int length) throws SAXException {
   1024 //		System.err.println("%% Empty-tag");
   1025 		if (theNewElement == null) return;
   1026 		rectify(theNewElement);
   1027 		// Force an immediate end tag
   1028 		etag_basic(buff, offset, length);
   1029 		}
   1030 
	// Comment buffer is twice the size of the output buffer
	// NOTE(review): theCommentBuffer is not referenced by cmnt() below;
	// whether anything else still uses it cannot be seen from this section.
	private char[] theCommentBuffer = new char[2000];
	// Forward a comment to the lexical handler, passing the buffer
	// range through untouched.
	public void cmnt(char[] buff, int offset, int length) throws SAXException {
		theLexicalHandler.comment(buff, offset, length);
		}
   1036 
   1037 	// Rectify the stack, pushing and popping as needed
   1038 	// so that the argument can be safely pushed
   1039 	private void rectify(Element e) throws SAXException {
   1040 		Element sp;
   1041 		while (true) {
   1042 			for (sp = theStack; sp != null; sp = sp.next()) {
   1043 				if (sp.canContain(e)) break;
   1044 				}
   1045 			if (sp != null) break;
   1046 			ElementType parentType = e.parent();
   1047 			if (parentType == null) break;
   1048 			Element parent = new Element(parentType, defaultAttributes);
   1049 //			System.err.println("%% Ascending from " + e.name() + " to " + parent.name());
   1050 			parent.setNext(e);
   1051 			e = parent;
   1052 			}
   1053 		if (sp == null) return;		// don't know what to do
   1054 		while (theStack != sp) {
   1055 			if (theStack == null || theStack.next() == null ||
   1056 				theStack.next().next() == null) break;
   1057 			restartablyPop();
   1058 			}
   1059 		while (e != null) {
   1060 			Element nexte = e.next();
   1061 			if (!e.name().equals("<pcdata>")) push(e);
   1062 			e = nexte;
   1063 			restart(e);
   1064 			}
   1065 		theNewElement = null;
   1066 		}
   1067 
	/**
	 * Returns the current value of the theEntity field.
	 * NOTE(review): what the value stands for is decided where theEntity
	 * is assigned, which is outside this section.
	 */
	public int getEntity() {
		return theEntity;
		}
   1071 
   1072 	// Return the argument as a valid XML name
   1073 	// This no longer lowercases the result: we depend on Schema to
   1074 	// canonicalize case.
   1075 	private String makeName(char[] buff, int offset, int length) {
   1076 		StringBuffer dst = new StringBuffer(length + 2);
   1077 		boolean seenColon = false;
   1078 		boolean start = true;
   1079 //		String src = new String(buff, offset, length); // DEBUG
   1080 		for (; length-- > 0; offset++) {
   1081 			char ch = buff[offset];
   1082 			if (Character.isLetter(ch) || ch == '_') {
   1083 				start = false;
   1084 				dst.append(ch);
   1085 				}
   1086 			else if (Character.isDigit(ch) || ch == '-' || ch == '.') {
   1087 				if (start) dst.append('_');
   1088 				start = false;
   1089 				dst.append(ch);
   1090 				}
   1091 			else if (ch == ':' && !seenColon) {
   1092 				seenColon = true;
   1093 				if (start) dst.append('_');
   1094 				start = true;
   1095 				dst.append(translateColons ? '_' : ch);
   1096 				}
   1097 			}
   1098 		int dstLength = dst.length();
   1099 		if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_');
   1100 //		System.err.println("Made name \"" + dst + "\" from \"" + src + "\"");
   1101 		return dst.toString().intern();
   1102 		}
   1103 
	// Default LexicalHandler implementation.
	// Every callback below is an intentional no-op.  The parser's
	// theLexicalHandler field is initialised to the parser itself, so
	// these are the methods that receive lexical events for as long as
	// that field still refers to the parser.

	public void comment(char[] ch, int start, int length) throws SAXException { }
	public void endCDATA() throws SAXException { }
	public void endDTD() throws SAXException { }
	public void endEntity(String name) throws SAXException { }
	public void startCDATA() throws SAXException { }
	public void startDTD(String name, String publicid, String systemid) throws SAXException { }
	public void startEntity(String name) throws SAXException { }
   1113 
   1114 	}
   1115