1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan. 2 // 3 // TagSoup is licensed under the Apache License, 4 // Version 2.0. You may obtain a copy of this license at 5 // http://www.apache.org/licenses/LICENSE-2.0 . You may also have 6 // additional legal rights not granted by this license. 7 // 8 // TagSoup is distributed in the hope that it will be useful, but 9 // unless required by applicable law or agreed to in writing, TagSoup 10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 11 // OF ANY KIND, either express or implied; not even the implied warranty 12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 // 14 // 15 // The TagSoup parser 16 17 package org.ccil.cowan.tagsoup; 18 import java.util.HashMap; 19 import java.util.ArrayList; 20 import java.io.*; 21 import java.net.URL; 22 import java.net.URLConnection; 23 import org.xml.sax.*; 24 import org.xml.sax.helpers.DefaultHandler; 25 import org.xml.sax.ext.LexicalHandler; 26 27 28 /** 29 The SAX parser class. 30 **/ 31 public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler { 32 33 // XMLReader implementation 34 35 private ContentHandler theContentHandler = this; 36 private LexicalHandler theLexicalHandler = this; 37 private DTDHandler theDTDHandler = this; 38 private ErrorHandler theErrorHandler = this; 39 private EntityResolver theEntityResolver = this; 40 private Schema theSchema; 41 private Scanner theScanner; 42 private AutoDetector theAutoDetector; 43 44 // Default values for feature flags 45 46 private static boolean DEFAULT_NAMESPACES = true; 47 private static boolean DEFAULT_IGNORE_BOGONS = false; 48 private static boolean DEFAULT_BOGONS_EMPTY = false; 49 private static boolean DEFAULT_ROOT_BOGONS = true; 50 private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true; 51 private static boolean DEFAULT_TRANSLATE_COLONS = false; 52 private static boolean DEFAULT_RESTART_ELEMENTS = true; 53 private static boolean DEFAULT_IGNORABLE_WHITESPACE = false; 54 private static boolean DEFAULT_CDATA_ELEMENTS = true; 55 56 // Feature flags. 57 58 private boolean namespaces = DEFAULT_NAMESPACES; 59 private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS; 60 private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY; 61 private boolean rootBogons = DEFAULT_ROOT_BOGONS; 62 private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES; 63 private boolean translateColons = DEFAULT_TRANSLATE_COLONS; 64 private boolean restartElements = DEFAULT_RESTART_ELEMENTS; 65 private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE; 66 private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS; 67 68 /** 69 A value of "true" indicates namespace URIs and unprefixed local 70 names for element and attribute names will be available. 71 **/ 72 public final static String namespacesFeature = 73 "http://xml.org/sax/features/namespaces"; 74 75 /** 76 A value of "true" indicates that XML qualified names (with prefixes) 77 and attributes (including xmlns* attributes) will be available. 78 We don't support this value. 79 **/ 80 public final static String namespacePrefixesFeature = 81 "http://xml.org/sax/features/namespace-prefixes"; 82 83 /** 84 Reports whether this parser processes external general entities 85 (it doesn't). 86 **/ 87 public final static String externalGeneralEntitiesFeature = 88 "http://xml.org/sax/features/external-general-entities"; 89 90 /** 91 Reports whether this parser processes external parameter entities 92 (it doesn't). 93 **/ 94 public final static String externalParameterEntitiesFeature = 95 "http://xml.org/sax/features/external-parameter-entities"; 96 97 /** 98 May be examined only during a parse, after the startDocument() 99 callback has been completed; read-only. The value is true if 100 the document specified standalone="yes" in its XML declaration, 101 and otherwise is false. (It's always false.) 102 **/ 103 public final static String isStandaloneFeature = 104 "http://xml.org/sax/features/is-standalone"; 105 106 /** 107 A value of "true" indicates that the LexicalHandler will report 108 the beginning and end of parameter entities (it won't). 109 **/ 110 public final static String lexicalHandlerParameterEntitiesFeature = 111 "http://xml.org/sax/features/lexical-handler/parameter-entities"; 112 113 /** 114 A value of "true" indicates that system IDs in declarations will 115 be absolutized (relative to their base URIs) before reporting. 116 (This returns true but doesn't actually do anything.) 117 **/ 118 public final static String resolveDTDURIsFeature = 119 "http://xml.org/sax/features/resolve-dtd-uris"; 120 121 /** 122 Has a value of "true" if all XML names (for elements, 123 prefixes, attributes, entities, notations, and local 124 names), as well as Namespace URIs, will have been interned 125 using java.lang.String.intern. This supports fast testing of 126 equality/inequality against string constants, rather than forcing 127 slower calls to String.equals(). (We always intern.) 128 **/ 129 public final static String stringInterningFeature = 130 "http://xml.org/sax/features/string-interning"; 131 132 /** 133 Returns "true" if the Attributes objects passed by this 134 parser in ContentHandler.startElement() implement the 135 org.xml.sax.ext.Attributes2 interface. (They don't.) 136 **/ 137 138 public final static String useAttributes2Feature = 139 "http://xml.org/sax/features/use-attributes2"; 140 141 /** 142 Returns "true" if the Locator objects passed by this parser 143 in ContentHandler.setDocumentLocator() implement the 144 org.xml.sax.ext.Locator2 interface. (They don't.) 145 **/ 146 public final static String useLocator2Feature = 147 "http://xml.org/sax/features/use-locator2"; 148 149 /** 150 Returns "true" if, when setEntityResolver is given an object 151 implementing the org.xml.sax.ext.EntityResolver2 interface, 152 those new methods will be used. (They won't be.) 153 **/ 154 public final static String useEntityResolver2Feature = 155 "http://xml.org/sax/features/use-entity-resolver2"; 156 157 /** 158 Controls whether the parser is reporting all validity errors 159 (We don't report any validity errors.) 160 **/ 161 public final static String validationFeature = 162 "http://xml.org/sax/features/validation"; 163 164 /** 165 Controls whether the parser reports Unicode normalization 166 errors as described in section 2.13 and Appendix B of the XML 167 1.1 Recommendation. (We don't normalize.) 168 **/ 169 public final static String unicodeNormalizationCheckingFeature = 170 "http://xml.org/sax/features/unicode-normalization-checking"; 171 172 /** 173 Controls whether, when the namespace-prefixes feature is set, 174 the parser treats namespace declaration attributes as being in 175 the http://www.w3.org/2000/xmlns/ namespace. (It doesn't.) 176 **/ 177 public final static String xmlnsURIsFeature = 178 "http://xml.org/sax/features/xmlns-uris"; 179 180 /** 181 Returns "true" if the parser supports both XML 1.1 and XML 1.0. 182 (Always false.) 183 **/ 184 public final static String XML11Feature = 185 "http://xml.org/sax/features/xml-1.1"; 186 187 /** 188 A value of "true" indicates that the parser will ignore 189 unknown elements. 190 **/ 191 public final static String ignoreBogonsFeature = 192 "http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons"; 193 194 /** 195 A value of "true" indicates that the parser will give unknown 196 elements a content model of EMPTY; a value of "false", a 197 content model of ANY. 198 **/ 199 public final static String bogonsEmptyFeature = 200 "http://www.ccil.org/~cowan/tagsoup/features/bogons-empty"; 201 202 /** 203 A value of "true" indicates that the parser will allow unknown 204 elements to be the root element. 205 **/ 206 public final static String rootBogonsFeature = 207 "http://www.ccil.org/~cowan/tagsoup/features/root-bogons"; 208 209 /** 210 A value of "true" indicates that the parser will return default 211 attribute values for missing attributes that have default values. 212 **/ 213 public final static String defaultAttributesFeature = 214 "http://www.ccil.org/~cowan/tagsoup/features/default-attributes"; 215 216 /** 217 A value of "true" indicates that the parser will 218 translate colons into underscores in names. 219 **/ 220 public final static String translateColonsFeature = 221 "http://www.ccil.org/~cowan/tagsoup/features/translate-colons"; 222 223 /** 224 A value of "true" indicates that the parser will 225 attempt to restart the restartable elements. 226 **/ 227 public final static String restartElementsFeature = 228 "http://www.ccil.org/~cowan/tagsoup/features/restart-elements"; 229 230 /** 231 A value of "true" indicates that the parser will 232 transmit whitespace in element-only content via the SAX 233 ignorableWhitespace callback. Normally this is not done, 234 because HTML is an SGML application and SGML suppresses 235 such whitespace. 236 **/ 237 public final static String ignorableWhitespaceFeature = 238 "http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace"; 239 240 /** 241 A value of "true" indicates that the parser will treat CDATA 242 elements specially. Normally true, since the input is by 243 default HTML. 244 **/ 245 public final static String CDATAElementsFeature = 246 "http://www.ccil.org/~cowan/tagsoup/features/cdata-elements"; 247 248 /** 249 Used to see some syntax events that are essential in some 250 applications: comments, CDATA delimiters, selected general 251 entity inclusions, and the start and end of the DTD (and 252 declaration of document element name). The Object must implement 253 org.xml.sax.ext.LexicalHandler. 254 **/ 255 public final static String lexicalHandlerProperty = 256 "http://xml.org/sax/properties/lexical-handler"; 257 258 /** 259 Specifies the Scanner object this Parser uses. 260 **/ 261 public final static String scannerProperty = 262 "http://www.ccil.org/~cowan/tagsoup/properties/scanner"; 263 264 /** 265 Specifies the Schema object this Parser uses. 266 **/ 267 public final static String schemaProperty = 268 "http://www.ccil.org/~cowan/tagsoup/properties/schema"; 269 270 /** 271 Specifies the AutoDetector (for encoding detection) this Parser uses. 272 **/ 273 public final static String autoDetectorProperty = 274 "http://www.ccil.org/~cowan/tagsoup/properties/auto-detector"; 275 276 // Due to sucky Java order of initialization issues, these 277 // entries are maintained separately from the initial values of 278 // the corresponding instance variables, but care must be taken 279 // to keep them in sync. 280 281 private HashMap theFeatures = new HashMap(); 282 { 283 theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES)); 284 theFeatures.put(namespacePrefixesFeature, Boolean.FALSE); 285 theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE); 286 theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE); 287 theFeatures.put(isStandaloneFeature, Boolean.FALSE); 288 theFeatures.put(lexicalHandlerParameterEntitiesFeature, 289 Boolean.FALSE); 290 theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE); 291 theFeatures.put(stringInterningFeature, Boolean.TRUE); 292 theFeatures.put(useAttributes2Feature, Boolean.FALSE); 293 theFeatures.put(useLocator2Feature, Boolean.FALSE); 294 theFeatures.put(useEntityResolver2Feature, Boolean.FALSE); 295 theFeatures.put(validationFeature, Boolean.FALSE); 296 theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); 297 theFeatures.put(xmlnsURIsFeature, Boolean.FALSE); 298 theFeatures.put(XML11Feature, Boolean.FALSE); 299 theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS)); 300 theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY)); 301 theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS)); 302 theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES)); 303 theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS)); 304 theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS)); 305 theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE)); 306 theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS)); 307 } 308 309 // Private clone of Boolean.valueOf that is guaranteed to return 310 // Boolean.TRUE or Boolean.FALSE 311 private static Boolean truthValue(boolean b) { 312 return b ? Boolean.TRUE : Boolean.FALSE; 313 } 314 315 316 public boolean getFeature (String name) 317 throws SAXNotRecognizedException, SAXNotSupportedException { 318 Boolean b = (Boolean)theFeatures.get(name); 319 if (b == null) { 320 throw new SAXNotRecognizedException("Unknown feature " + name); 321 } 322 return b.booleanValue(); 323 } 324 325 public void setFeature (String name, boolean value) 326 throws SAXNotRecognizedException, SAXNotSupportedException { 327 Boolean b = (Boolean)theFeatures.get(name); 328 if (b == null) { 329 throw new SAXNotRecognizedException("Unknown feature " + name); 330 } 331 if (value) theFeatures.put(name, Boolean.TRUE); 332 else theFeatures.put(name, Boolean.FALSE); 333 334 if (name.equals(namespacesFeature)) namespaces = value; 335 else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value; 336 else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value; 337 else if (name.equals(rootBogonsFeature)) rootBogons = value; 338 else if (name.equals(defaultAttributesFeature)) defaultAttributes = value; 339 else if (name.equals(translateColonsFeature)) translateColons = value; 340 else if (name.equals(restartElementsFeature)) restartElements = value; 341 else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value; 342 else if (name.equals(CDATAElementsFeature)) CDATAElements = value; 343 } 344 345 public Object getProperty (String name) 346 throws SAXNotRecognizedException, SAXNotSupportedException { 347 if (name.equals(lexicalHandlerProperty)) { 348 return theLexicalHandler == this ? null : theLexicalHandler; 349 } 350 else if (name.equals(scannerProperty)) { 351 return theScanner; 352 } 353 else if (name.equals(schemaProperty)) { 354 return theSchema; 355 } 356 else if (name.equals(autoDetectorProperty)) { 357 return theAutoDetector; 358 } 359 else { 360 throw new SAXNotRecognizedException("Unknown property " + name); 361 } 362 } 363 364 public void setProperty (String name, Object value) 365 throws SAXNotRecognizedException, SAXNotSupportedException { 366 if (name.equals(lexicalHandlerProperty)) { 367 if (value == null) { 368 theLexicalHandler = this; 369 } 370 else if (value instanceof LexicalHandler) { 371 theLexicalHandler = (LexicalHandler)value; 372 } 373 else { 374 throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler"); 375 } 376 } 377 else if (name.equals(scannerProperty)) { 378 if (value instanceof Scanner) { 379 theScanner = (Scanner)value; 380 } 381 else { 382 throw new SAXNotSupportedException("Your scanner is not a Scanner"); 383 } 384 } 385 else if (name.equals(schemaProperty)) { 386 if (value instanceof Schema) { 387 theSchema = (Schema)value; 388 } 389 else { 390 throw new SAXNotSupportedException("Your schema is not a Schema"); 391 } 392 } 393 else if (name.equals(autoDetectorProperty)) { 394 if (value instanceof AutoDetector) { 395 theAutoDetector = (AutoDetector)value; 396 } 397 else { 398 throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector"); 399 } 400 } 401 else { 402 throw new SAXNotRecognizedException("Unknown property " + name); 403 } 404 } 405 406 public void setEntityResolver (EntityResolver resolver) { 407 theEntityResolver = (resolver == null) ? this : resolver; 408 } 409 410 public EntityResolver getEntityResolver () { 411 return (theEntityResolver == this) ? null : theEntityResolver; 412 } 413 414 public void setDTDHandler (DTDHandler handler) { 415 theDTDHandler = (handler == null) ? this : handler; 416 } 417 418 public DTDHandler getDTDHandler () { 419 return (theDTDHandler == this) ? null : theDTDHandler; 420 } 421 422 public void setContentHandler (ContentHandler handler) { 423 theContentHandler = (handler == null) ? this : handler; 424 } 425 426 public ContentHandler getContentHandler () { 427 return (theContentHandler == this) ? null : theContentHandler; 428 } 429 430 public void setErrorHandler (ErrorHandler handler) { 431 theErrorHandler = (handler == null) ? this : handler; 432 } 433 434 public ErrorHandler getErrorHandler () { 435 return (theErrorHandler == this) ? null : theErrorHandler; 436 } 437 438 public void parse (InputSource input) throws IOException, SAXException { 439 setup(); 440 Reader r = getReader(input); 441 theContentHandler.startDocument(); 442 theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId()); 443 if (theScanner instanceof Locator) { 444 theContentHandler.setDocumentLocator((Locator)theScanner); 445 } 446 if (!(theSchema.getURI().equals(""))) 447 theContentHandler.startPrefixMapping(theSchema.getPrefix(), 448 theSchema.getURI()); 449 theScanner.scan(r, this); 450 } 451 452 public void parse (String systemid) throws IOException, SAXException { 453 parse(new InputSource(systemid)); 454 } 455 456 // Sets up instance variables that haven't been set by setFeature 457 private void setup() { 458 if (theSchema == null) theSchema = new HTMLSchema(); 459 if (theScanner == null) theScanner = new HTMLScanner(); 460 if (theAutoDetector == null) { 461 theAutoDetector = new AutoDetector() { 462 public Reader autoDetectingReader(InputStream i) { 463 return new InputStreamReader(i); 464 } 465 }; 466 } 467 theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes); 468 thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes); 469 theNewElement = null; 470 theAttributeName = null; 471 thePITarget = null; 472 theSaved = null; 473 theEntity = 0; 474 virginStack = true; 475 theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null; 476 } 477 478 // Return a Reader based on the contents of an InputSource 479 // Buffer both the InputStream and the Reader 480 private Reader getReader(InputSource s) throws SAXException, IOException { 481 Reader r = s.getCharacterStream(); 482 InputStream i = s.getByteStream(); 483 String encoding = s.getEncoding(); 484 String publicid = s.getPublicId(); 485 String systemid = s.getSystemId(); 486 if (r == null) { 487 if (i == null) i = getInputStream(publicid, systemid); 488 // i = new BufferedInputStream(i); 489 if (encoding == null) { 490 r = theAutoDetector.autoDetectingReader(i); 491 } 492 else { 493 try { 494 r = new InputStreamReader(i, encoding); 495 } 496 catch (UnsupportedEncodingException e) { 497 r = new InputStreamReader(i); 498 } 499 } 500 } 501 // r = new BufferedReader(r); 502 return r; 503 } 504 505 // Get an InputStream based on a publicid and a systemid 506 private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException { 507 URL basis = new URL("file", "", System.getProperty("user.dir") + "/."); 508 URL url = new URL(basis, systemid); 509 URLConnection c = url.openConnection(); 510 return c.getInputStream(); 511 } 512 // We don't process publicids (who uses them anyhow?) 513 514 // ScanHandler implementation 515 516 private Element theNewElement = null; 517 private String theAttributeName = null; 518 private boolean theDoctypeIsPresent = false; 519 private String theDoctypePublicId = null; 520 private String theDoctypeSystemId = null; 521 private String theDoctypeName = null; 522 private String thePITarget = null; 523 private Element theStack = null; 524 private Element theSaved = null; 525 private Element thePCDATA = null; 526 private int theEntity = 0; // needs to support chars past U+FFFF 527 528 public void adup(char[] buff, int offset, int length) throws SAXException { 529 if (theNewElement == null || theAttributeName == null) return; 530 theNewElement.setAttribute(theAttributeName, null, theAttributeName); 531 theAttributeName = null; 532 } 533 534 public void aname(char[] buff, int offset, int length) throws SAXException { 535 if (theNewElement == null) return; 536 // Currently we don't rely on Schema to canonicalize 537 // attribute names. 538 theAttributeName = makeName(buff, offset, length).toLowerCase(); 539 // System.err.println("%% Attribute name " + theAttributeName); 540 } 541 542 public void aval(char[] buff, int offset, int length) throws SAXException { 543 if (theNewElement == null || theAttributeName == null) return; 544 String value = new String(buff, offset, length); 545 // System.err.println("%% Attribute value [" + value + "]"); 546 value = expandEntities(value); 547 theNewElement.setAttribute(theAttributeName, null, value); 548 theAttributeName = null; 549 // System.err.println("%% Aval done"); 550 } 551 552 // Expand entity references in attribute values selectively. 553 // Currently we expand a reference iff it is properly terminated 554 // with a semicolon. 555 private String expandEntities(String src) { 556 int refStart = -1; 557 int len = src.length(); 558 char[] dst = new char[len]; 559 int dstlen = 0; 560 for (int i = 0; i < len; i++) { 561 char ch = src.charAt(i); 562 dst[dstlen++] = ch; 563 // System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] "); 564 if (ch == '&' && refStart == -1) { 565 // start of a ref excluding & 566 refStart = dstlen; 567 // System.err.println("start of ref"); 568 } 569 else if (refStart == -1) { 570 // not in a ref 571 // System.err.println("not in ref"); 572 } 573 else if (Character.isLetter(ch) || 574 Character.isDigit(ch) || 575 ch == '#') { 576 // valid entity char 577 // System.err.println("valid"); 578 } 579 else if (ch == ';') { 580 // properly terminated ref 581 // System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]"); 582 int ent = lookupEntity(dst, refStart, dstlen - refStart - 1); 583 // System.err.println(" = " + ent); 584 if (ent > 0xFFFF) { 585 ent -= 0x10000; 586 dst[refStart - 1] = (char)((ent>>10) + 0xD800); 587 dst[refStart] = (char)((ent&0x3FF) + 0xDC00); 588 dstlen = refStart + 1; 589 } 590 else if (ent != 0) { 591 dst[refStart - 1] = (char)ent; 592 dstlen = refStart; 593 } 594 refStart = -1; 595 } 596 else { 597 // improperly terminated ref 598 // System.err.println("end of ref"); 599 refStart = -1; 600 } 601 } 602 return new String(dst, 0, dstlen); 603 } 604 605 public void entity(char[] buff, int offset, int length) throws SAXException { 606 theEntity = lookupEntity(buff, offset, length); 607 } 608 609 // Process numeric character references, 610 // deferring to the schema for named ones. 611 private int lookupEntity(char[] buff, int offset, int length) { 612 int result = 0; 613 if (length < 1) return result; 614 // System.err.println("%% Entity at " + offset + " " + length); 615 // System.err.println("%% Got entity [" + new String(buff, offset, length) + "]"); 616 if (buff[offset] == '#') { 617 if (length > 1 && (buff[offset+1] == 'x' 618 || buff[offset+1] == 'X')) { 619 try { 620 return Integer.parseInt(new String(buff, offset + 2, length - 2), 16); 621 } 622 catch (NumberFormatException e) { return 0; } 623 } 624 try { 625 return Integer.parseInt(new String(buff, offset + 1, length - 1), 10); 626 } 627 catch (NumberFormatException e) { return 0; } 628 } 629 return theSchema.getEntity(new String(buff, offset, length)); 630 } 631 632 public void eof(char[] buff, int offset, int length) throws SAXException { 633 if (virginStack) rectify(thePCDATA); 634 while (theStack.next() != null) { 635 pop(); 636 } 637 if (!(theSchema.getURI().equals(""))) 638 theContentHandler.endPrefixMapping(theSchema.getPrefix()); 639 theContentHandler.endDocument(); 640 } 641 642 public void etag(char[] buff, int offset, int length) throws SAXException { 643 if (etag_cdata(buff, offset, length)) return; 644 etag_basic(buff, offset, length); 645 } 646 647 private static char[] etagchars = {'<', '/', '>'}; 648 public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException { 649 String currentName = theStack.name(); 650 // If this is a CDATA element and the tag doesn't match, 651 // or isn't properly formed (junk after the name), 652 // restart CDATA mode and process the tag as characters. 653 if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { 654 boolean realTag = (length == currentName.length()); 655 if (realTag) { 656 for (int i = 0; i < length; i++) { 657 if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) { 658 realTag = false; 659 break; 660 } 661 } 662 } 663 if (!realTag) { 664 theContentHandler.characters(etagchars, 0, 2); 665 theContentHandler.characters(buff, offset, length); 666 theContentHandler.characters(etagchars, 2, 1); 667 theScanner.startCDATA(); 668 return true; 669 } 670 } 671 return false; 672 } 673 674 public void etag_basic(char[] buff, int offset, int length) throws SAXException { 675 theNewElement = null; 676 String name; 677 if (length != 0) { 678 // Canonicalize case of name 679 name = makeName(buff, offset, length); 680 // System.err.println("got etag [" + name + "]"); 681 ElementType type = theSchema.getElementType(name); 682 if (type == null) return; // mysterious end-tag 683 name = type.name(); 684 } 685 else { 686 name = theStack.name(); 687 } 688 // System.err.println("%% Got end of " + name); 689 690 Element sp; 691 boolean inNoforce = false; 692 for (sp = theStack; sp != null; sp = sp.next()) { 693 if (sp.name().equals(name)) break; 694 if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true; 695 } 696 697 if (sp == null) return; // Ignore unknown etags 698 if (sp.next() == null || sp.next().next() == null) return; 699 if (inNoforce) { // inside an F_NOFORCE element? 700 sp.preclose(); // preclose the matching element 701 } 702 else { // restartably pop everything above us 703 while (theStack != sp) { 704 restartablyPop(); 705 } 706 pop(); 707 } 708 // pop any preclosed elements now at the top 709 while (theStack.isPreclosed()) { 710 pop(); 711 } 712 restart(null); 713 } 714 715 // Push restartables on the stack if possible 716 // e is the next element to be started, if we know what it is 717 private void restart(Element e) throws SAXException { 718 while (theSaved != null && theStack.canContain(theSaved) && 719 (e == null || theSaved.canContain(e))) { 720 Element next = theSaved.next(); 721 push(theSaved); 722 theSaved = next; 723 } 724 } 725 726 // Pop the stack irrevocably 727 private void pop() throws SAXException { 728 if (theStack == null) return; // empty stack 729 String name = theStack.name(); 730 String localName = theStack.localName(); 731 String namespace = theStack.namespace(); 732 String prefix = prefixOf(name); 733 734 // System.err.println("%% Popping " + name); 735 if (!namespaces) namespace = localName = ""; 736 theContentHandler.endElement(namespace, localName, name); 737 if (foreign(prefix, namespace)) { 738 theContentHandler.endPrefixMapping(prefix); 739 // System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace); 740 } 741 Attributes atts = theStack.atts(); 742 for (int i = atts.getLength() - 1; i >= 0; i--) { 743 String attNamespace = atts.getURI(i); 744 String attPrefix = prefixOf(atts.getQName(i)); 745 if (foreign(attPrefix, attNamespace)) { 746 theContentHandler.endPrefixMapping(attPrefix); 747 // System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace); 748 } 749 } 750 theStack = theStack.next(); 751 } 752 753 // Pop the stack restartably 754 private void restartablyPop() throws SAXException { 755 Element popped = theStack; 756 pop(); 757 if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) { 758 popped.anonymize(); 759 popped.setNext(theSaved); 760 theSaved = popped; 761 } 762 } 763 764 // Push element onto stack 765 private boolean virginStack = true; 766 private void push(Element e) throws SAXException { 767 String name = e.name(); 768 String localName = e.localName(); 769 String namespace = e.namespace(); 770 String prefix = prefixOf(name); 771 772 // System.err.println("%% Pushing " + name); 773 e.clean(); 774 if (!namespaces) namespace = localName = ""; 775 if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) { 776 try { 777 theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId); 778 } catch (IOException ew) { } // Can't be thrown for root I believe. 779 } 780 if (foreign(prefix, namespace)) { 781 theContentHandler.startPrefixMapping(prefix, namespace); 782 // System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace); 783 } 784 Attributes atts = e.atts(); 785 int len = atts.getLength(); 786 for (int i = 0; i < len; i++) { 787 String attNamespace = atts.getURI(i); 788 String attPrefix = prefixOf(atts.getQName(i)); 789 if (foreign(attPrefix, attNamespace)) { 790 theContentHandler.startPrefixMapping(attPrefix, attNamespace); 791 // System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace); 792 } 793 } 794 theContentHandler.startElement(namespace, localName, name, e.atts()); 795 e.setNext(theStack); 796 theStack = e; 797 virginStack = false; 798 if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) { 799 theScanner.startCDATA(); 800 } 801 } 802 803 // Get the prefix from a QName 804 private String prefixOf(String name) { 805 int i = name.indexOf(':'); 806 String prefix = ""; 807 if (i != -1) prefix = name.substring(0, i); 808 // System.err.println("%% " + prefix + " is prefix of " + name); 809 return prefix; 810 } 811 812 // Return true if we have a foreign name 813 private boolean foreign(String prefix, String namespace) { 814 // System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- "); 815 boolean foreign = !(prefix.equals("") || namespace.equals("") || 816 namespace.equals(theSchema.getURI())); 817 // System.err.println(foreign); 818 return foreign; 819 } 820 821 /** 822 * Parsing the complete XML Document Type Definition is way too complex, 823 * but for many simple cases we can extract something useful from it. 824 * 825 * doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' 826 * DeclSep ::= PEReference | S 827 * intSubset ::= (markupdecl | DeclSep)* 828 * markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment 829 * ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral 830 */ 831 public void decl(char[] buff, int offset, int length) throws SAXException { 832 String s = new String(buff, offset, length); 833 String name = null; 834 String systemid = null; 835 String publicid = null; 836 String[] v = split(s); 837 if (v.length > 0 && "DOCTYPE".equals(v[0])) { 838 if (theDoctypeIsPresent) return; // one doctype only! 839 theDoctypeIsPresent = true; 840 if (v.length > 1) { 841 name = v[1]; 842 if (v.length>3 && "SYSTEM".equals(v[2])) { 843 systemid = v[3]; 844 } 845 else if (v.length > 3 && "PUBLIC".equals(v[2])) { 846 publicid = v[3]; 847 if (v.length > 4) { 848 systemid = v[4]; 849 } 850 else { 851 systemid = ""; 852 } 853 } 854 } 855 } 856 publicid = trimquotes(publicid); 857 systemid = trimquotes(systemid); 858 if (name != null) { 859 publicid = cleanPublicid(publicid); 860 theLexicalHandler.startDTD(name, publicid, systemid); 861 theLexicalHandler.endDTD(); 862 theDoctypeName = name; 863 theDoctypePublicId = publicid; 864 if (theScanner instanceof Locator) { // Must resolve systemid 865 theDoctypeSystemId = ((Locator)theScanner).getSystemId(); 866 try { 867 theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString(); 868 } catch (Exception e) {} 869 } 870 } 871 } 872 873 // If the String is quoted, trim the quotes. 874 private static String trimquotes(String in) { 875 if (in == null) return in; 876 int length = in.length(); 877 if (length == 0) return in; 878 char s = in.charAt(0); 879 char e = in.charAt(length - 1); 880 if (s == e && (s == '\'' || s == '"')) { 881 in = in.substring(1, in.length() - 1); 882 } 883 return in; 884 } 885 886 // Split the supplied String into words or phrases seperated by spaces. 887 // Recognises quotes around a phrase and doesn't split it. 888 private static String[] split(String val) throws IllegalArgumentException { 889 val = val.trim(); 890 if (val.length() == 0) { 891 return new String[0]; 892 } 893 else { 894 ArrayList l = new ArrayList(); 895 int s = 0; 896 int e = 0; 897 boolean sq = false; // single quote 898 boolean dq = false; // double quote 899 char lastc = 0; 900 int len = val.length(); 901 for (e=0; e < len; e++) { 902 char c = val.charAt(e); 903 if (!dq && c == '\'' && lastc != '\\') { 904 sq = !sq; 905 if (s < 0) s = e; 906 } 907 else if (!sq && c == '\"' && lastc != '\\') { 908 dq = !dq; 909 if (s < 0) s = e; 910 } 911 else if (!sq && !dq) { 912 if (Character.isWhitespace(c)) { 913 if (s >= 0) l.add(val.substring(s, e)); 914 s = -1; 915 } 916 else if (s < 0 && c != ' ') { 917 s = e; 918 } 919 } 920 lastc = c; 921 } 922 l.add(val.substring(s, e)); 923 return (String[])l.toArray(new String[0]); 924 } 925 } 926 927 // Replace junk in publicids with spaces 928 private static String legal = 929 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%"; 930 931 private String cleanPublicid(String src) { 932 if (src == null) return null; 933 int len = src.length(); 934 StringBuffer dst = new StringBuffer(len); 935 boolean suppressSpace = true; 936 for (int i = 0; i < len; i++) { 937 char ch = src.charAt(i); 938 if (legal.indexOf(ch) != -1) { // legal but not whitespace 939 dst.append(ch); 940 suppressSpace = false; 941 } 942 else if (suppressSpace) { // normalizable whitespace or junk 943 ; 944 } 945 else { 946 dst.append(' '); 947 suppressSpace = true; 948 } 949 } 950 // System.err.println("%% Publicid [" + dst.toString().trim() + "]"); 951 return dst.toString().trim(); // trim any final junk whitespace 952 } 953 954 955 public void gi(char[] buff, int offset, int length) throws SAXException { 956 if (theNewElement != null) return; 957 String name = makeName(buff, offset, length); 958 if (name == null) return; 959 ElementType type = theSchema.getElementType(name); 960 if (type == null) { 961 // Suppress unknown elements if ignore-bogons is on 962 if (ignoreBogons) return; 963 int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY; 964 int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT); 965 theSchema.elementType(name, bogonModel, bogonMemberOf, 0); 966 if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name()); 967 type = theSchema.getElementType(name); 968 } 969 970 theNewElement = new Element(type, defaultAttributes); 971 // System.err.println("%% Got GI " + theNewElement.name()); 972 } 973 974 public void cdsect(char[] buff, int offset, int length) throws SAXException { 975 theLexicalHandler.startCDATA(); 976 pcdata(buff, offset, length); 977 theLexicalHandler.endCDATA(); 978 } 979 public void pcdata(char[] buff, int offset, int length) throws SAXException { 980 if (length == 0) return; 981 boolean allWhite = true; 982 for (int i = 0; i < length; i++) { 983 if (!Character.isWhitespace(buff[offset+i])) { 984 allWhite = false; 985 } 986 } 987 if (allWhite && !theStack.canContain(thePCDATA)) { 988 if (ignorableWhitespace) { 989 theContentHandler.ignorableWhitespace(buff, offset, length); 990 } 991 } 992 else { 993 rectify(thePCDATA); 994 theContentHandler.characters(buff, offset, length); 995 } 996 } 997 998 public void pitarget(char[] buff, int offset, int length) throws SAXException { 999 if (theNewElement != null) return; 1000 thePITarget = makeName(buff, offset, length).replace(':', '_'); 1001 } 1002 1003 public void pi(char[] buff, int offset, int length) throws SAXException { 1004 if (theNewElement != null || thePITarget == null) return; 1005 if ("xml".equalsIgnoreCase(thePITarget)) return; 1006 // if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI"); 1007 if (length > 0 && buff[length - 1] == '?') length--; // remove trailing ? 1008 theContentHandler.processingInstruction(thePITarget, 1009 new String(buff, offset, length)); 1010 thePITarget = null; 1011 } 1012 1013 public void stagc(char[] buff, int offset, int length) throws SAXException { 1014 // System.err.println("%% Start-tag"); 1015 if (theNewElement == null) return; 1016 rectify(theNewElement); 1017 if (theStack.model() == Schema.M_EMPTY) { 1018 // Force an immediate end tag 1019 etag_basic(buff, offset, length); 1020 } 1021 } 1022 1023 public void stage(char[] buff, int offset, int length) throws SAXException { 1024 // System.err.println("%% Empty-tag"); 1025 if (theNewElement == null) return; 1026 rectify(theNewElement); 1027 // Force an immediate end tag 1028 etag_basic(buff, offset, length); 1029 } 1030 1031 // Comment buffer is twice the size of the output buffer 1032 private char[] theCommentBuffer = new char[2000]; 1033 public void cmnt(char[] buff, int offset, int length) throws SAXException { 1034 theLexicalHandler.comment(buff, offset, length); 1035 } 1036 1037 // Rectify the stack, pushing and popping as needed 1038 // so that the argument can be safely pushed 1039 private void rectify(Element e) throws SAXException { 1040 Element sp; 1041 while (true) { 1042 for (sp = theStack; sp != null; sp = sp.next()) { 1043 if (sp.canContain(e)) break; 1044 } 1045 if (sp != null) break; 1046 ElementType parentType = e.parent(); 1047 if (parentType == null) break; 1048 Element parent = new Element(parentType, defaultAttributes); 1049 // System.err.println("%% Ascending from " + e.name() + " to " + parent.name()); 1050 parent.setNext(e); 1051 e = parent; 1052 } 1053 if (sp == null) return; // don't know what to do 1054 while (theStack != sp) { 1055 if (theStack == null || theStack.next() == null || 1056 theStack.next().next() == null) break; 1057 restartablyPop(); 1058 } 1059 while (e != null) { 1060 Element nexte = e.next(); 1061 if (!e.name().equals("<pcdata>")) push(e); 1062 e = nexte; 1063 restart(e); 1064 } 1065 theNewElement = null; 1066 } 1067 1068 public int getEntity() { 1069 return theEntity; 1070 } 1071 1072 // Return the argument as a valid XML name 1073 // This no longer lowercases the result: we depend on Schema to 1074 // canonicalize case. 1075 private String makeName(char[] buff, int offset, int length) { 1076 StringBuffer dst = new StringBuffer(length + 2); 1077 boolean seenColon = false; 1078 boolean start = true; 1079 // String src = new String(buff, offset, length); // DEBUG 1080 for (; length-- > 0; offset++) { 1081 char ch = buff[offset]; 1082 if (Character.isLetter(ch) || ch == '_') { 1083 start = false; 1084 dst.append(ch); 1085 } 1086 else if (Character.isDigit(ch) || ch == '-' || ch == '.') { 1087 if (start) dst.append('_'); 1088 start = false; 1089 dst.append(ch); 1090 } 1091 else if (ch == ':' && !seenColon) { 1092 seenColon = true; 1093 if (start) dst.append('_'); 1094 start = true; 1095 dst.append(translateColons ? '_' : ch); 1096 } 1097 } 1098 int dstLength = dst.length(); 1099 if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_'); 1100 // System.err.println("Made name \"" + dst + "\" from \"" + src + "\""); 1101 return dst.toString().intern(); 1102 } 1103 1104 // Default LexicalHandler implementation 1105 1106 public void comment(char[] ch, int start, int length) throws SAXException { } 1107 public void endCDATA() throws SAXException { } 1108 public void endDTD() throws SAXException { } 1109 public void endEntity(String name) throws SAXException { } 1110 public void startCDATA() throws SAXException { } 1111 public void startDTD(String name, String publicid, String systemid) throws SAXException { } 1112 public void startEntity(String name) throws SAXException { } 1113 1114 } 1115